# python3 -m espnet2.bin.asr_train --use_preprocessor true --bpemodel data/fr_token_list/bpe_unigram350/bpe.model --token_type bpe --token_list data/fr_token_list/bpe_unigram350/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_fr/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev_fr/text,text,text --valid_shape_file exp/asr_stats_raw_fr_bpe350_sp/valid/speech_shape --valid_shape_file exp/asr_stats_raw_fr_bpe350_sp/valid/text_shape.bpe --resume true --init_param --ignore_init_mismatch false --fold_length 80000 --fold_length 150 --output_dir exp/asr_oxford_French_config_raw_fr_bpe350_sp --config conf/tuning/oxford_French_config.yaml --frontend_conf fs=16k --train_data_path_and_name_and_type dump/raw/train_fr_sp/wav.scp,speech,sound --train_data_path_and_name_and_type dump/raw/train_fr_sp/text,text,text --train_shape_file exp/asr_stats_raw_fr_bpe350_sp/train/speech_shape --train_shape_file exp/asr_stats_raw_fr_bpe350_sp/train/text_shape.bpe --ngpu 3 --multiprocessing_distributed True
# Started at Sat Jun 11 00:34:40 EDT 2022
# /usr/bin/python3 /project/ocean/junweih/espnet/espnet2/bin/asr_train.py --use_preprocessor true --bpemodel data/fr_token_list/bpe_unigram350/bpe.model --token_type bpe --token_list data/fr_token_list/bpe_unigram350/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_fr/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev_fr/text,text,text --valid_shape_file exp/asr_stats_raw_fr_bpe350_sp/valid/speech_shape --valid_shape_file exp/asr_stats_raw_fr_bpe350_sp/valid/text_shape.bpe --resume true --init_param --ignore_init_mismatch false --fold_length 80000 --fold_length 150 --output_dir exp/asr_oxford_French_config_raw_fr_bpe350_sp --config conf/tuning/oxford_French_config.yaml --frontend_conf fs=16k --train_data_path_and_name_and_type dump/raw/train_fr_sp/wav.scp,speech,sound --train_data_path_and_name_and_type dump/raw/train_fr_sp/text,text,text --train_shape_file exp/asr_stats_raw_fr_bpe350_sp/train/speech_shape --train_shape_file exp/asr_stats_raw_fr_bpe350_sp/train/text_shape.bpe --ngpu 3 --multiprocessing_distributed True
[islpc50:0/3] 2022-06-11 00:35:07,179 (distributed_c10d:217) INFO: Added key: store_based_barrier_key:1 to store for rank: 0
[islpc50:0/3] 2022-06-11 00:35:07,179 (distributed_c10d:251) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 3 nodes.
[islpc50:0/3] 2022-06-11 00:35:07,229 (asr:411) INFO: Vocabulary size: 350
[islpc50:0/3] 2022-06-11 00:35:07,809 (filelock:274) INFO: Lock 139839044598944 acquired on ./hub/s3prl_cache/1c76d6e88090f01736036b28dc995fef583f47f42662d55286332557f957609f.lock
[islpc50:0/3] 2022-06-11 00:35:07,810 (filelock:318) INFO: Lock 139839044598944 released on ./hub/s3prl_cache/1c76d6e88090f01736036b28dc995fef583f47f42662d55286332557f957609f.lock
[Featurizer] - The selected feature last_hidden_state's downsample rate is 320
[islpc50:0/3] 2022-06-11 00:35:30,797 (s3prl:159) INFO: Pretrained S3PRL frontend model parameters reloaded!
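# ---- Editor's note (not part of the log): the Featurizer's downsample rate of 320 reported
# above is just the product of the Conv1d strides in the wav2vec 2.0 feature extractor shown
# in the model dump below: (5, 2, 2, 2, 2, 2, 2) -> 320 samples per frame, i.e. one frame
# every 20 ms at the 16 kHz rate set by --frontend_conf fs=16k. A minimal sanity check:
from functools import reduce

strides = [5, 2, 2, 2, 2, 2, 2]  # Conv1d strides in ConvFeatureExtractionModel (see dump below)
downsample = reduce(lambda a, b: a * b, strides)
assert downsample == 320  # matches "downsample rate is 320"
print(f"one feature frame per {1000 * downsample / 16000:.0f} ms")  # -> 20 ms
# ----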
[islpc50:0/3] 2022-06-11 00:36:06,362 (abs_task:1157) INFO: pytorch.version=1.10.1+cu111, cuda.available=True, cudnn.version=8005, cudnn.benchmark=False, cudnn.deterministic=True
[islpc50:0/3] 2022-06-11 00:36:06,367 (abs_task:1158) INFO: Model structure: ESPnetASRModel(
  (frontend): S3prlFrontend(
    (upstream): UpstreamExpert(
      (model): Wav2Vec2Model(
        (feature_extractor): ConvFeatureExtractionModel(
          (conv_layers): ModuleList(
            (0): Sequential(
              (0): Conv1d(1, 512, kernel_size=(10,), stride=(5,))
              (1): Dropout(p=0.0, inplace=False)
              (2): Sequential(
                (0): TransposeLast()
                (1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True)
                (2): TransposeLast()
              )
              (3): GELU()
            )
            (1): Sequential(
              (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
              (1): Dropout(p=0.0, inplace=False)
              (2): Sequential(
                (0): TransposeLast()
                (1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True)
                (2): TransposeLast()
              )
              (3): GELU()
            )
            (2): Sequential(
              (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
              (1): Dropout(p=0.0, inplace=False)
              (2): Sequential(
                (0): TransposeLast()
                (1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True)
                (2): TransposeLast()
              )
              (3): GELU()
            )
            (3): Sequential(
              (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
              (1): Dropout(p=0.0, inplace=False)
              (2): Sequential(
                (0): TransposeLast()
                (1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True)
                (2): TransposeLast()
              )
              (3): GELU()
            )
            (4): Sequential(
              (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,))
              (1): Dropout(p=0.0, inplace=False)
              (2): Sequential(
                (0): TransposeLast()
                (1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True)
                (2): TransposeLast()
              )
              (3): GELU()
            )
            (5): Sequential(
              (0): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
              (1): Dropout(p=0.0, inplace=False)
              (2): Sequential(
                (0): TransposeLast()
                (1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True)
                (2): TransposeLast()
              )
              (3): GELU()
            )
            (6): Sequential(
              (0): Conv1d(512, 512, kernel_size=(2,), stride=(2,))
              (1): Dropout(p=0.0, inplace=False)
              (2): Sequential(
                (0): TransposeLast()
                (1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True)
                (2): TransposeLast()
              )
              (3): GELU()
            )
          )
        )
        (post_extract_proj): Linear(in_features=512, out_features=1024, bias=True)
        (dropout_input): Dropout(p=0.1, inplace=False)
        (dropout_features): Dropout(p=0.1, inplace=False)
        (quantizer): GumbelVectorQuantizer(
          (weight_proj): Linear(in_features=512, out_features=640, bias=True)
        )
        (project_q): Linear(in_features=768, out_features=768, bias=True)
        (encoder): TransformerEncoder(
          (pos_conv): Sequential(
            (0): Conv1d(1024, 1024, kernel_size=(128,), stride=(1,), padding=(64,), groups=16)
            (1): SamePad()
            (2): GELU()
          )
          (layers): ModuleList(
            (0): AdapterTransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (adapter1): Adapter(
                (down_projection): Linear(in_features=1024, out_features=192, bias=True)
                (up_projection): Linear(in_features=192, out_features=1024, bias=True)
              )
              (adapter2): Adapter(
                (down_projection): Linear(in_features=1024, out_features=192, bias=True)
                (up_projection): Linear(in_features=192, out_features=1024, bias=True)
              )
            )
            (1): AdapterTransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (adapter1): Adapter(
                (down_projection): Linear(in_features=1024, out_features=192, bias=True)
                (up_projection): Linear(in_features=192, out_features=1024, bias=True)
              )
              (adapter2): Adapter(
                (down_projection): Linear(in_features=1024, out_features=192, bias=True)
                (up_projection): Linear(in_features=192, out_features=1024, bias=True)
              )
            )
            (2): AdapterTransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (adapter1): Adapter(
                (down_projection): Linear(in_features=1024, out_features=192, bias=True)
                (up_projection): Linear(in_features=192, out_features=1024, bias=True)
              )
              (adapter2): Adapter(
                (down_projection): Linear(in_features=1024, out_features=192, bias=True)
                (up_projection): Linear(in_features=192, out_features=1024, bias=True)
              )
            )
            (3): TransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
            (4): TransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
            (5): TransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
            (6): TransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
            (7): TransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
            (8): TransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
            (9): TransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
            (10): TransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
            (11): TransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
            (12): TransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
            (13): TransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
            (14): TransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
            (15): TransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
            (16): TransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
            (17): TransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
            (18): TransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
            (19): TransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
            (20): TransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
            (21): TransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
            (22): TransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
            (23): TransformerSentenceEncoderLayer(
              (self_attn): MultiheadAttention(
                (dropout_module): FairseqDropout()
                (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
                (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
              )
              (dropout1): Dropout(p=0.0, inplace=False)
              (dropout2): Dropout(p=0.0, inplace=False)
              (dropout3): Dropout(p=0.0, inplace=False)
              (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
              (fc1): Linear(in_features=1024, out_features=4096, bias=True)
              (fc2): Linear(in_features=4096, out_features=1024, bias=True)
              (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            )
          )
          (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (final_proj): Linear(in_features=1024, out_features=768, bias=True)
      )
    )
    (featurizer): Featurizer()
  )
  (normalize): UtteranceMVN(norm_means=True, norm_vars=False)
  (encoder): RNNEncoder(
    (enc): ModuleList(
      (0): RNNP(
        (birnn0): LSTM(1024, 320, batch_first=True, bidirectional=True)
        (bt0): Linear(in_features=640, out_features=320, bias=True)
        (birnn1): LSTM(320, 320, batch_first=True, bidirectional=True)
        (bt1): Linear(in_features=640, out_features=320, bias=True)
        (birnn2): LSTM(320, 320, batch_first=True, bidirectional=True)
        (bt2): Linear(in_features=640, out_features=320, bias=True)
        (birnn3): LSTM(320, 320, batch_first=True, bidirectional=True)
        (bt3): Linear(in_features=640, out_features=320, bias=True)
      )
    )
  )
  (criterion_att): LabelSmoothingLoss(
    (criterion): KLDivLoss()
  )
  (ctc): CTC(
    (ctc_lo): Linear(in_features=320, out_features=350, bias=True)
    (ctc_loss): CTCLoss()
  )
)

Model summary:
    Class Name: ESPnetASRModel
    Total Number of model parameters: 329.07 M
    Number of trainable parameters: 11.68 M (3.5%)
    Size: 46.7 MB
    Type: torch.float32
[islpc50:0/3] 2022-06-11 00:36:06,367 (abs_task:1161) INFO: Optimizer: Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    initial_lr: 0.0002
    lr: 5e-09
    weight_decay: 0
)
[islpc50:0/3] 2022-06-11 00:36:06,367 (abs_task:1162) INFO: Scheduler: WarmupLR(warmup_steps=40000)
[islpc50:0/3] 2022-06-11 00:36:06,380 (abs_task:1171) INFO: Saving the configuration in exp/asr_oxford_French_config_raw_fr_bpe350_sp/config.yaml
[islpc50:0/3] 2022-06-11 00:36:19,907 (abs_task:1525) INFO: [train] dataset: ESPnetDataset(
  speech: {"path": "dump/raw/train_fr_sp/wav.scp", "type": "sound"}
  text: {"path": "dump/raw/train_fr_sp/text", "type": "text"}
  preprocess: )
[islpc50:0/3] 2022-06-11 00:36:19,907 (abs_task:1526) INFO: [train] Batch sampler: FoldedBatchSampler(N-batch=51024, batch_size=32, shape_files=['exp/asr_stats_raw_fr_bpe350_sp/train/speech_shape', 'exp/asr_stats_raw_fr_bpe350_sp/train/text_shape.bpe'], sort_in_batch=descending, sort_batch=descending)
[islpc50:0/3] 2022-06-11 00:36:19,913 (abs_task:1527) INFO: [train] mini-batch sizes summary: N-batch=51024, mean=22.4, min=4, max=32
[islpc50:0/3] 2022-06-11 00:36:22,128 (abs_task:1525) INFO: [valid] dataset: ESPnetDataset(
  speech: {"path": "dump/raw/dev_fr/wav.scp", "type": "sound"}
  text: {"path": "dump/raw/dev_fr/text", "type": "text"}
  preprocess: )
[islpc50:0/3] 2022-06-11 00:36:22,129 (abs_task:1526) INFO: [valid] Batch sampler: FoldedBatchSampler(N-batch=784, batch_size=32, shape_files=['exp/asr_stats_raw_fr_bpe350_sp/valid/speech_shape', 'exp/asr_stats_raw_fr_bpe350_sp/valid/text_shape.bpe'], sort_in_batch=descending, sort_batch=descending)
[islpc50:0/3] 2022-06-11 00:36:22,129 (abs_task:1527) INFO: [valid] mini-batch sizes summary: N-batch=784, mean=19.9, min=5, max=32
[islpc50:0/3] 2022-06-11 00:36:22,404 (abs_task:1525) INFO: [plot_att] dataset: ESPnetDataset(
  speech: {"path": "dump/raw/dev_fr/wav.scp", "type": "sound"}
  text: {"path": "dump/raw/dev_fr/text", "type": "text"}
  preprocess: )
[islpc50:0/3] 2022-06-11 00:36:22,404 (abs_task:1526) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=15621, batch_size=1, key_file=exp/asr_stats_raw_fr_bpe350_sp/valid/speech_shape,
[islpc50:0/3] 2022-06-11 00:36:22,404 (abs_task:1527) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1
islpc50:2248462:2248462 [0] NCCL INFO Bootstrap : Using bond0:128.2.205.9<0>
islpc50:2248462:2248462 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
islpc50:2248462:2248462 [0] NCCL INFO NET/IB : No device found.
islpc50:2248462:2248462 [0] NCCL INFO NET/Socket : Using [0]bond0:128.2.205.9<0>
islpc50:2248462:2248462 [0] NCCL INFO Using network Socket
NCCL version 2.10.3+cuda11.1
islpc50:2248463:2248463 [1] NCCL INFO Bootstrap : Using bond0:128.2.205.9<0>
islpc50:2248464:2248464 [2] NCCL INFO Bootstrap : Using bond0:128.2.205.9<0>
islpc50:2248463:2248463 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
islpc50:2248464:2248464 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
islpc50:2248463:2248463 [1] NCCL INFO NET/IB : No device found.
islpc50:2248464:2248464 [2] NCCL INFO NET/IB : No device found.
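# ---- Editor's note (not part of the log): two sanity checks on the numbers logged above.
# ESPnet's WarmupLR is Noam-style, lr = initial_lr * warmup_steps**0.5 * min(step**-0.5,
# step * warmup_steps**-1.5), which reproduces the logged "lr: 5e-09" at step 1. The 11.68 M
# trainable parameters (3.5% of 329.07 M) are accounted for by the inserted adapters plus the
# RNNP encoder and the CTC output layer, with the rest of the upstream frozen. Plain Python,
# using standard PyTorch parameter-count formulas:
initial_lr, warmup_steps = 2e-4, 40_000
lr_step1 = initial_lr * warmup_steps**0.5 * min(1**-0.5, 1 * warmup_steps**-1.5)
assert abs(lr_step1 - 5e-09) < 1e-15  # matches "lr: 5e-09" under WarmupLR(warmup_steps=40000)

def bilstm_params(inp, hid):  # W_ih, W_hh, b_ih, b_hh, both directions
    return 2 * 4 * hid * (inp + hid + 2)

adapters = 3 * 2 * ((1024 * 192 + 192) + (192 * 1024 + 1024))  # 2,366,592 (see adapter log below)
rnnp = (bilstm_params(1024, 320) + (640 * 320 + 320)           # birnn0 + bt0
        + 3 * (bilstm_params(320, 320) + (640 * 320 + 320)))   # birnn1-3 + bt1-3
ctc_head = 320 * 350 + 350                                      # ctc_lo
print(f"trainable: {(adapters + rnnp + ctc_head) / 1e6:.2f} M")  # -> 11.68 M
# ----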
islpc50:2248464:2248464 [2] NCCL INFO NET/Socket : Using [0]bond0:128.2.205.9<0>
islpc50:2248463:2248463 [1] NCCL INFO NET/Socket : Using [0]bond0:128.2.205.9<0>
islpc50:2248464:2248464 [2] NCCL INFO Using network Socket
islpc50:2248463:2248463 [1] NCCL INFO Using network Socket
islpc50:2248462:2248763 [0] NCCL INFO Could not enable P2P between dev 1(=1a000) and dev 0(=19000)
islpc50:2248462:2248763 [0] NCCL INFO Could not enable P2P between dev 0(=19000) and dev 1(=1a000)
islpc50:2248462:2248763 [0] NCCL INFO Could not enable P2P between dev 1(=1a000) and dev 0(=19000)
islpc50:2248462:2248763 [0] NCCL INFO Could not enable P2P between dev 0(=19000) and dev 1(=1a000)
islpc50:2248463:2248765 [1] NCCL INFO Could not enable P2P between dev 1(=1a000) and dev 0(=19000)
islpc50:2248463:2248765 [1] NCCL INFO Could not enable P2P between dev 0(=19000) and dev 1(=1a000)
islpc50:2248463:2248765 [1] NCCL INFO Could not enable P2P between dev 1(=1a000) and dev 0(=19000)
islpc50:2248463:2248765 [1] NCCL INFO Could not enable P2P between dev 0(=19000) and dev 1(=1a000)
islpc50:2248464:2248764 [2] NCCL INFO Could not enable P2P between dev 1(=1a000) and dev 0(=19000)
islpc50:2248464:2248764 [2] NCCL INFO Could not enable P2P between dev 0(=19000) and dev 1(=1a000)
islpc50:2248464:2248764 [2] NCCL INFO Could not enable P2P between dev 1(=1a000) and dev 0(=19000)
islpc50:2248464:2248764 [2] NCCL INFO Could not enable P2P between dev 0(=19000) and dev 1(=1a000)
islpc50:2248464:2248764 [2] NCCL INFO Trees [0] -1/-1/-1->2->1 [1] -1/-1/-1->2->1
islpc50:2248464:2248764 [2] NCCL INFO Setting affinity for GPU 2 to ffffff
islpc50:2248463:2248765 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0
islpc50:2248462:2248763 [0] NCCL INFO Channel 00/02 : 0 1 2
islpc50:2248462:2248763 [0] NCCL INFO Channel 01/02 : 0 1 2
islpc50:2248463:2248765 [1] NCCL INFO Setting affinity for GPU 1 to ffffff
islpc50:2248462:2248763 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
islpc50:2248462:2248763 [0] NCCL INFO Setting affinity for GPU 0 to ffffff
islpc50:2248463:2248765 [1] NCCL INFO Could not enable P2P between dev 1(=1a000) and dev 0(=19000)
islpc50:2248463:2248765 [1] NCCL INFO Could not enable P2P between dev 1(=1a000) and dev 0(=19000)
islpc50:2248464:2248764 [2] NCCL INFO Channel 00 : 2[67000] -> 0[19000] via direct shared memory
islpc50:2248464:2248764 [2] NCCL INFO Channel 01 : 2[67000] -> 0[19000] via direct shared memory
islpc50:2248463:2248765 [1] NCCL INFO Channel 00 : 1[1a000] -> 2[67000] via direct shared memory
islpc50:2248463:2248765 [1] NCCL INFO Channel 01 : 1[1a000] -> 2[67000] via direct shared memory
islpc50:2248462:2248763 [0] NCCL INFO Could not enable P2P between dev 0(=19000) and dev 1(=1a000)
islpc50:2248462:2248763 [0] NCCL INFO Channel 00 : 0[19000] -> 1[1a000] via direct shared memory
islpc50:2248462:2248763 [0] NCCL INFO Could not enable P2P between dev 0(=19000) and dev 1(=1a000)
islpc50:2248462:2248763 [0] NCCL INFO Channel 01 : 0[19000] -> 1[1a000] via direct shared memory
islpc50:2248463:2248765 [1] NCCL INFO Connected all rings
islpc50:2248464:2248764 [2] NCCL INFO Connected all rings
islpc50:2248464:2248764 [2] NCCL INFO Channel 00 : 2[67000] -> 1[1a000] via direct shared memory
islpc50:2248464:2248764 [2] NCCL INFO Channel 01 : 2[67000] -> 1[1a000] via direct shared memory
islpc50:2248462:2248763 [0] NCCL INFO Connected all rings
islpc50:2248462:2248763 [0] NCCL INFO Could not enable P2P between dev 0(=19000) and dev 1(=1a000)
islpc50:2248462:2248763 [0] NCCL INFO Could not enable P2P between dev 0(=19000) and dev 1(=1a000)
islpc50:2248463:2248765 [1] NCCL INFO Could not enable P2P between dev 1(=1a000) and dev 0(=19000)
islpc50:2248463:2248765 [1] NCCL INFO Channel 00 : 1[1a000] -> 0[19000] via direct shared memory
islpc50:2248463:2248765 [1] NCCL INFO Could not enable P2P between dev 1(=1a000) and dev 0(=19000)
islpc50:2248463:2248765 [1] NCCL INFO Channel 01 : 1[1a000] -> 0[19000] via direct shared memory
islpc50:2248462:2248763 [0] NCCL INFO Connected all trees
islpc50:2248462:2248763 [0] NCCL INFO threadThresholds 8/8/64 | 24/8/64 | 8/8/512
islpc50:2248462:2248763 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
islpc50:2248463:2248765 [1] NCCL INFO Connected all trees
islpc50:2248463:2248765 [1] NCCL INFO threadThresholds 8/8/64 | 24/8/64 | 8/8/512
islpc50:2248463:2248765 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
islpc50:2248464:2248764 [2] NCCL INFO Connected all trees
islpc50:2248464:2248764 [2] NCCL INFO threadThresholds 8/8/64 | 24/8/64 | 8/8/512
islpc50:2248464:2248764 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
islpc50:2248463:2248765 [1] NCCL INFO comm 0x7f3ed8002fb0 rank 1 nranks 3 cudaDev 1 busId 1a000 - Init COMPLETE
islpc50:2248464:2248764 [2] NCCL INFO comm 0x7f8ef8002fb0 rank 2 nranks 3 cudaDev 2 busId 67000 - Init COMPLETE
islpc50:2248462:2248763 [0] NCCL INFO comm 0x7f2d8c002fb0 rank 0 nranks 3 cudaDev 0 busId 19000 - Init COMPLETE
islpc50:2248462:2248462 [0] NCCL INFO Launch mode Parallel
[s3prl.upstream.experts] Warning: can not import s3prl.upstream.byol_a.expert: No module named 'easydict'. Pass.
[s3prl.hub] Warning: can not import s3prl.upstream.byol_a.hubconf: No module named 'easydict'. Please see upstream/byol_a/README.md
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.quesst14_dtw.expert: No module named 'dtw'. Pass.
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.separation_stft.expert: No module named 'asteroid'. Pass.
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.enhancement_stft.expert: No module named 'asteroid'. Pass.
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.speech_commands.expert: No module named 'catalyst'. Pass.
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.a2a-vc-vctk.expert: No module named 'resemblyzer'. Pass.
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.voxceleb2_ge2e.expert: No module named 'sox'. Pass.
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.sv_voxceleb1.expert: No module named 'sox'. Pass.
Using cache found in ./hub/s3prl_cache/1c76d6e88090f01736036b28dc995fef583f47f42662d55286332557f957609f for https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_new.pt
>> inserted adapters to the following layers: 0, 1, 2
 * original model weights: 317,390,592
 * new model weights - all: 319,757,184
 * new model weights - trainable: 2,366,592 ( 0.75% of original model)
[s3prl.upstream.experts] Warning: can not import s3prl.upstream.byol_a.expert: No module named 'easydict'. Pass.
[s3prl.hub] Warning: can not import s3prl.upstream.byol_a.hubconf: No module named 'easydict'. Please see upstream/byol_a/README.md
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.quesst14_dtw.expert: No module named 'dtw'. Pass.
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.separation_stft.expert: No module named 'asteroid'. Pass.
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.enhancement_stft.expert: No module named 'asteroid'. Pass.
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.speech_commands.expert: No module named 'catalyst'. Pass.
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.a2a-vc-vctk.expert: No module named 'resemblyzer'. Pass.
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.voxceleb2_ge2e.expert: No module named 'sox'. Pass.
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.sv_voxceleb1.expert: No module named 'sox'. Pass.
Using cache found in ./hub/s3prl_cache/1c76d6e88090f01736036b28dc995fef583f47f42662d55286332557f957609f for https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_new.pt
>> inserted adapters to the following layers: 0, 1, 2
 * original model weights: 317,390,592
 * new model weights - all: 319,757,184
 * new model weights - trainable: 2,366,592 ( 0.75% of original model)
[islpc50:0/3] 2022-06-11 00:36:27,722 (trainer:280) INFO: 1/30epoch started
[s3prl.upstream.experts] Warning: can not import s3prl.upstream.byol_a.expert: No module named 'easydict'. Pass.
[s3prl.hub] Warning: can not import s3prl.upstream.byol_a.hubconf: No module named 'easydict'. Please see upstream/byol_a/README.md
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.quesst14_dtw.expert: No module named 'dtw'. Pass.
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.separation_stft.expert: No module named 'asteroid'. Pass.
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.enhancement_stft.expert: No module named 'asteroid'. Pass.
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.speech_commands.expert: No module named 'catalyst'. Pass.
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.a2a-vc-vctk.expert: No module named 'resemblyzer'. Pass.
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.voxceleb2_ge2e.expert: No module named 'sox'. Pass.
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.sv_voxceleb1.expert: No module named 'sox'. Pass.
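# ---- Editor's note (not part of the log): the model dump only shows each Adapter's two Linear
# layers (1024 -> 192 -> 1024). A bottleneck of this shape is conventionally applied as a
# Houlsby-style residual block; the GELU nonlinearity and the residual connection in this
# sketch are assumptions for illustration, since the log does not print them. The parameter
# count does match the logged 2,366,592 new trainable weights, i.e. 319,757,184 - 317,390,592,
# or 0.75% of the original model:
import torch
import torch.nn as nn

class Adapter(nn.Module):
    def __init__(self, dim: int = 1024, bottleneck: int = 192):
        super().__init__()
        self.down_projection = nn.Linear(dim, bottleneck)  # names and shapes from the dump
        self.up_projection = nn.Linear(bottleneck, dim)
        self.activation = nn.GELU()  # assumed; not shown in the dump

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Residual bottleneck: x + up(act(down(x))); the residual path is assumed.
        return x + self.up_projection(self.activation(self.down_projection(x)))

per_adapter = sum(p.numel() for p in Adapter().parameters())
assert 3 * 2 * per_adapter == 2_366_592  # 2 adapters in each of layers 0, 1, 2
# ----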
Using cache found in ./hub/s3prl_cache/1c76d6e88090f01736036b28dc995fef583f47f42662d55286332557f957609f for https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_new.pt
>> inserted adapters to the following layers: 0, 1, 2
 * original model weights: 317,390,592
 * new model weights - all: 319,757,184
 * new model weights - trainable: 2,366,592 ( 0.75% of original model)
Process SpawnProcess-1:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/project/ocean/junweih/espnet/espnet2/tasks/abs_task.py", line 1315, in main_worker
    cls.trainer.run(
  File "/project/ocean/junweih/espnet/espnet2/train/trainer.py", line 286, in run
    all_steps_are_invalid = cls.train_one_epoch(
  File "/project/ocean/junweih/espnet/espnet2/train/trainer.py", line 524, in train_one_epoch
    retval = model(**batch)
  File "/project/ocean/junweih/espnet/tools/python_user_base/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/project/ocean/junweih/espnet/tools/python_user_base/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 886, in forward
    output = self.module(*inputs[0], **kwargs[0])
  File "/project/ocean/junweih/espnet/tools/python_user_base/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/project/ocean/junweih/espnet/espnet2/asr/espnet_model.py", line 185, in forward
    encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
  File "/project/ocean/junweih/espnet/espnet2/asr/espnet_model.py", line 313, in encode
    feats, feats_lengths = self._extract_feats(speech, speech_lengths)
  File "/project/ocean/junweih/espnet/espnet2/asr/espnet_model.py", line 374, in _extract_feats
    feats, feats_lengths = self.frontend(speech, speech_lengths)
  File "/project/ocean/junweih/espnet/tools/python_user_base/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/project/ocean/junweih/espnet/espnet2/asr/frontend/s3prl.py", line 143, in forward
    feats = self.upstream(wavs)
  File "/project/ocean/junweih/espnet/tools/s3prl/s3prl/upstream/interfaces.py", line 103, in __call__
    result = super().__call__(wavs, *args, **kwargs) or {}
  File "/project/ocean/junweih/espnet/tools/python_user_base/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/project/ocean/junweih/espnet/tools/s3prl/s3prl/upstream/wav2vec2/expert.py", line 68, in forward
    results = self.model.extract_features(
  File "/project/ocean/junweih/espnet/tools/fairseq/fairseq/models/wav2vec/wav2vec2.py", line 700, in extract_features
    res = self.forward(
  File "/project/ocean/junweih/espnet/tools/fairseq/fairseq/models/wav2vec/wav2vec2.py", line 602, in forward
    x, layer_results = self.encoder(x, padding_mask=padding_mask, layer=layer)
  File "/project/ocean/junweih/espnet/tools/python_user_base/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1120, in _call_impl
    result = forward_call(*input, **kwargs)
  File "/project/ocean/junweih/espnet/tools/fairseq/fairseq/models/wav2vec/wav2vec2.py", line 862, in forward
    x, layer_results = self.extract_features(x, padding_mask, layer)
  File "/project/ocean/junweih/espnet/tools/fairseq/fairseq/models/wav2vec/wav2vec2.py", line 891, in extract_features
    x, z = layer(x, self_attn_padding_mask=padding_mask, need_weights=False)
  File "/project/ocean/junweih/espnet/tools/python_user_base/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1120, in _call_impl
    result = forward_call(*input, **kwargs)
  File "/project/ocean/junweih/espnet/tools/fairseq/fairseq/models/wav2vec/wav2vec2.py", line 978, in forward
    x, attn = self.self_attn(
  File "/project/ocean/junweih/espnet/tools/python_user_base/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/project/ocean/junweih/espnet/tools/fairseq/fairseq/modules/multihead_attention.py", line 170, in forward
    return F.multi_head_attention_forward(
  File "/project/ocean/junweih/espnet/tools/python_user_base/lib/python3.8/site-packages/torch/nn/functional.py", line 5101, in multi_head_attention_forward
    attn_output, attn_output_weights = _scaled_dot_product_attention(q, k, v, attn_mask, dropout_p)
  File "/project/ocean/junweih/espnet/tools/python_user_base/lib/python3.8/site-packages/torch/nn/functional.py", line 4847, in _scaled_dot_product_attention
    attn = softmax(attn, dim=-1)
  File "/project/ocean/junweih/espnet/tools/python_user_base/lib/python3.8/site-packages/torch/nn/functional.py", line 1680, in softmax
    ret = input.softmax(dim)
RuntimeError: CUDA out of memory. Tried to allocate 28.00 MiB (GPU 0; 10.76 GiB total capacity; 2.24 GiB already allocated; 18.56 MiB free; 2.26 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
Traceback (most recent call last):
  File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/project/ocean/junweih/espnet/espnet2/bin/asr_train.py", line 23, in <module>
    main()
  File "/project/ocean/junweih/espnet/espnet2/bin/asr_train.py", line 19, in main
    ASRTask.main(cmd=cmd)
  File "/project/ocean/junweih/espnet/espnet2/tasks/abs_task.py", line 1069, in main
    while not ProcessContext(processes, error_queues).join():
  File "/project/ocean/junweih/espnet/tools/python_user_base/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 139, in join
    raise ProcessExitedException(
torch.multiprocessing.spawn.ProcessExitedException: process 0 terminated with exit code 1
# Accounting: time=141 threads=1
# Ended (code 1) at Sat Jun 11 00:37:01 EDT 2022, elapsed time 141 seconds
/usr/lib/python3.8/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 25 leaked semaphore objects to clean up at shutdown
  warnings.warn('resource_tracker: There appear to be %d '
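# ---- Editor's note (not part of the log): the run dies with CUDA OOM inside the wav2vec 2.0
# self-attention softmax on 10.76 GiB GPUs. The error text itself points at the allocator knob
# PYTORCH_CUDA_ALLOC_CONF / max_split_size_mb; a minimal sketch of setting it before training
# starts (the value 128 is an illustrative choice, not taken from the log):
import os

# Must be set before the first CUDA allocation in the process.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
# In practice the larger lever is peak activation size: lowering the FoldedBatchSampler's
# batch_size (32 here) or the --fold_length values in conf/tuning/oxford_French_config.yaml
# shrinks the attention matrices that triggered the failure.
# ----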