|
# python3 -m espnet2.bin.asr_train --use_preprocessor true --bpemodel data/cy_token_list/bpe_unigram150/bpe.model --token_type bpe --token_list data/cy_token_list/bpe_unigram150/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_cy/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev_cy/text,text,text --valid_shape_file exp/asr_stats_raw_cy_bpe150_sp/valid/speech_shape --valid_shape_file exp/asr_stats_raw_cy_bpe150_sp/valid/text_shape.bpe --resume true --init_param --ignore_init_mismatch false --fold_length 80000 --fold_length 150 --output_dir exp/asr_oxford_frontend_raw_cy_bpe150_sp --config conf/tuning/oxford_frontend.yaml --frontend_conf fs=16k --train_data_path_and_name_and_type dump/raw/train_cy_sp/wav.scp,speech,sound --train_data_path_and_name_and_type dump/raw/train_cy_sp/text,text,text --train_shape_file exp/asr_stats_raw_cy_bpe150_sp/train/speech_shape --train_shape_file exp/asr_stats_raw_cy_bpe150_sp/train/text_shape.bpe --ngpu 3 --multiprocessing_distributed True |
|
# Started at Mon Jun 6 21:31:07 EDT 2022 |
|
# |
|
/usr/bin/python3 /project/ocean/junweih/espnet/espnet2/bin/asr_train.py --use_preprocessor true --bpemodel data/cy_token_list/bpe_unigram150/bpe.model --token_type bpe --token_list data/cy_token_list/bpe_unigram150/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_cy/wav.scp,speech,sound --valid_data_path_and_name_and_type dump/raw/dev_cy/text,text,text --valid_shape_file exp/asr_stats_raw_cy_bpe150_sp/valid/speech_shape --valid_shape_file exp/asr_stats_raw_cy_bpe150_sp/valid/text_shape.bpe --resume true --init_param --ignore_init_mismatch false --fold_length 80000 --fold_length 150 --output_dir exp/asr_oxford_frontend_raw_cy_bpe150_sp --config conf/tuning/oxford_frontend.yaml --frontend_conf fs=16k --train_data_path_and_name_and_type dump/raw/train_cy_sp/wav.scp,speech,sound --train_data_path_and_name_and_type dump/raw/train_cy_sp/text,text,text --train_shape_file exp/asr_stats_raw_cy_bpe150_sp/train/speech_shape --train_shape_file exp/asr_stats_raw_cy_bpe150_sp/train/text_shape.bpe --ngpu 3 --multiprocessing_distributed True |
|
[islpc50:0/3] 2022-06-06 21:31:22,179 (distributed_c10d:217) INFO: Added key: store_based_barrier_key:1 to store for rank: 0 |
|
[islpc50:0/3] 2022-06-06 21:31:22,180 (distributed_c10d:251) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 3 nodes. |
|
[islpc50:0/3] 2022-06-06 21:31:22,249 (asr:411) INFO: Vocabulary size: 150 |
|
[islpc50:0/3] 2022-06-06 21:31:22,968 (filelock:274) INFO: Lock 140511308283184 acquired on ./hub/s3prl_cache/1c76d6e88090f01736036b28dc995fef583f47f42662d55286332557f957609f.lock |
|
[islpc50:0/3] 2022-06-06 21:31:22,969 (filelock:318) INFO: Lock 140511308283184 released on ./hub/s3prl_cache/1c76d6e88090f01736036b28dc995fef583f47f42662d55286332557f957609f.lock |
|
[Featurizer] - The selected feature last_hidden_state's downsample rate is 320 |
|
[islpc50:0/3] 2022-06-06 21:31:34,900 (s3prl:159) INFO: Pretrained S3PRL frontend model parameters reloaded! |
|
[islpc50:0/3] 2022-06-06 21:31:38,773 (abs_task:1157) INFO: pytorch.version=1.10.1+cu111, cuda.available=True, cudnn.version=8005, cudnn.benchmark=False, cudnn.deterministic=True |
|
[islpc50:0/3] 2022-06-06 21:31:38,779 (abs_task:1158) INFO: Model structure: |
|
ESPnetASRModel( |
|
(frontend): S3prlFrontend( |
|
(upstream): UpstreamExpert( |
|
(model): Wav2Vec2Model( |
|
(feature_extractor): ConvFeatureExtractionModel( |
|
(conv_layers): ModuleList( |
|
(0): Sequential( |
|
(0): Conv1d(1, 512, kernel_size=(10,), stride=(5,)) |
|
(1): Dropout(p=0.0, inplace=False) |
|
(2): Sequential( |
|
(0): TransposeLast() |
|
(1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True) |
|
(2): TransposeLast() |
|
) |
|
(3): GELU() |
|
) |
|
(1): Sequential( |
|
(0): Conv1d(512, 512, kernel_size=(3,), stride=(2,)) |
|
(1): Dropout(p=0.0, inplace=False) |
|
(2): Sequential( |
|
(0): TransposeLast() |
|
(1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True) |
|
(2): TransposeLast() |
|
) |
|
(3): GELU() |
|
) |
|
(2): Sequential( |
|
(0): Conv1d(512, 512, kernel_size=(3,), stride=(2,)) |
|
(1): Dropout(p=0.0, inplace=False) |
|
(2): Sequential( |
|
(0): TransposeLast() |
|
(1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True) |
|
(2): TransposeLast() |
|
) |
|
(3): GELU() |
|
) |
|
(3): Sequential( |
|
(0): Conv1d(512, 512, kernel_size=(3,), stride=(2,)) |
|
(1): Dropout(p=0.0, inplace=False) |
|
(2): Sequential( |
|
(0): TransposeLast() |
|
(1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True) |
|
(2): TransposeLast() |
|
) |
|
(3): GELU() |
|
) |
|
(4): Sequential( |
|
(0): Conv1d(512, 512, kernel_size=(3,), stride=(2,)) |
|
(1): Dropout(p=0.0, inplace=False) |
|
(2): Sequential( |
|
(0): TransposeLast() |
|
(1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True) |
|
(2): TransposeLast() |
|
) |
|
(3): GELU() |
|
) |
|
(5): Sequential( |
|
(0): Conv1d(512, 512, kernel_size=(2,), stride=(2,)) |
|
(1): Dropout(p=0.0, inplace=False) |
|
(2): Sequential( |
|
(0): TransposeLast() |
|
(1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True) |
|
(2): TransposeLast() |
|
) |
|
(3): GELU() |
|
) |
|
(6): Sequential( |
|
(0): Conv1d(512, 512, kernel_size=(2,), stride=(2,)) |
|
(1): Dropout(p=0.0, inplace=False) |
|
(2): Sequential( |
|
(0): TransposeLast() |
|
(1): Fp32LayerNorm((512,), eps=1e-05, elementwise_affine=True) |
|
(2): TransposeLast() |
|
) |
|
(3): GELU() |
|
) |
|
) |
|
) |
|
(post_extract_proj): Linear(in_features=512, out_features=1024, bias=True) |
|
(dropout_input): Dropout(p=0.1, inplace=False) |
|
(dropout_features): Dropout(p=0.1, inplace=False) |
|
(quantizer): GumbelVectorQuantizer( |
|
(weight_proj): Linear(in_features=512, out_features=640, bias=True) |
|
) |
|
(project_q): Linear(in_features=768, out_features=768, bias=True) |
|
(encoder): TransformerEncoder( |
|
(pos_conv): Sequential( |
|
(0): Conv1d(1024, 1024, kernel_size=(128,), stride=(1,), padding=(64,), groups=16) |
|
(1): SamePad() |
|
(2): GELU() |
|
) |
|
(layers): ModuleList( |
|
(0): AdapterTransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(adapter1): Adapter( |
|
(down_projection): Linear(in_features=1024, out_features=192, bias=True) |
|
(up_projection): Linear(in_features=192, out_features=1024, bias=True) |
|
) |
|
(adapter2): Adapter( |
|
(down_projection): Linear(in_features=1024, out_features=192, bias=True) |
|
(up_projection): Linear(in_features=192, out_features=1024, bias=True) |
|
) |
|
) |
|
(1): AdapterTransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(adapter1): Adapter( |
|
(down_projection): Linear(in_features=1024, out_features=192, bias=True) |
|
(up_projection): Linear(in_features=192, out_features=1024, bias=True) |
|
) |
|
(adapter2): Adapter( |
|
(down_projection): Linear(in_features=1024, out_features=192, bias=True) |
|
(up_projection): Linear(in_features=192, out_features=1024, bias=True) |
|
) |
|
) |
|
(2): AdapterTransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(adapter1): Adapter( |
|
(down_projection): Linear(in_features=1024, out_features=192, bias=True) |
|
(up_projection): Linear(in_features=192, out_features=1024, bias=True) |
|
) |
|
(adapter2): Adapter( |
|
(down_projection): Linear(in_features=1024, out_features=192, bias=True) |
|
(up_projection): Linear(in_features=192, out_features=1024, bias=True) |
|
) |
|
) |
|
(3): TransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
(4): TransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
(5): TransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
(6): TransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
(7): TransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
(8): TransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
(9): TransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
(10): TransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
(11): TransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
(12): TransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
(13): TransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
(14): TransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
(15): TransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
(16): TransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
(17): TransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
(18): TransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
(19): TransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
(20): TransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
(21): TransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
(22): TransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
(23): TransformerSentenceEncoderLayer( |
|
(self_attn): MultiheadAttention( |
|
(dropout_module): FairseqDropout() |
|
(k_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(v_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(q_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
(out_proj): Linear(in_features=1024, out_features=1024, bias=True) |
|
) |
|
(dropout1): Dropout(p=0.0, inplace=False) |
|
(dropout2): Dropout(p=0.0, inplace=False) |
|
(dropout3): Dropout(p=0.0, inplace=False) |
|
(self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
(fc1): Linear(in_features=1024, out_features=4096, bias=True) |
|
(fc2): Linear(in_features=4096, out_features=1024, bias=True) |
|
(final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
) |
|
(layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True) |
|
) |
|
(layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True) |
|
(final_proj): Linear(in_features=1024, out_features=768, bias=True) |
|
) |
|
) |
|
(featurizer): Featurizer() |
|
) |
|
(normalize): UtteranceMVN(norm_means=True, norm_vars=False) |
|
(encoder): RNNEncoder( |
|
(enc): ModuleList( |
|
(0): RNNP( |
|
(birnn0): LSTM(1024, 320, batch_first=True, bidirectional=True) |
|
(bt0): Linear(in_features=640, out_features=320, bias=True) |
|
(birnn1): LSTM(320, 320, batch_first=True, bidirectional=True) |
|
(bt1): Linear(in_features=640, out_features=320, bias=True) |
|
(birnn2): LSTM(320, 320, batch_first=True, bidirectional=True) |
|
(bt2): Linear(in_features=640, out_features=320, bias=True) |
|
(birnn3): LSTM(320, 320, batch_first=True, bidirectional=True) |
|
(bt3): Linear(in_features=640, out_features=320, bias=True) |
|
) |
|
) |
|
) |
|
(criterion_att): LabelSmoothingLoss( |
|
(criterion): KLDivLoss() |
|
) |
|
(ctc): CTC( |
|
(ctc_lo): Linear(in_features=320, out_features=150, bias=True) |
|
(ctc_loss): CTCLoss() |
|
) |
|
) |
|
|
|
Model summary: |
|
Class Name: ESPnetASRModel |
|
Total Number of model parameters: 329.00 M |
|
Number of trainable parameters: 11.61 M (3.5%) |
|
Size: 46.45 MB |
|
Type: torch.float32 |
|
[islpc50:0/3] 2022-06-06 21:31:38,779 (abs_task:1161) INFO: Optimizer: |
|
Adam ( |
|
Parameter Group 0 |
|
amsgrad: False |
|
betas: (0.9, 0.999) |
|
eps: 1e-08 |
|
initial_lr: 0.00027 |
|
lr: 6.749999999999999e-09 |
|
weight_decay: 0 |
|
) |
|
[islpc50:0/3] 2022-06-06 21:31:38,779 (abs_task:1162) INFO: Scheduler: WarmupLR(warmup_steps=40000) |
|
[islpc50:0/3] 2022-06-06 21:31:38,792 (abs_task:1171) INFO: Saving the configuration in exp/asr_oxford_frontend_raw_cy_bpe150_sp/config.yaml |
|
[islpc50:0/3] 2022-06-06 21:31:41,115 (abs_task:1525) INFO: [train] dataset: |
|
ESPnetDataset( |
|
speech: {"path": "dump/raw/train_cy_sp/wav.scp", "type": "sound"} |
|
text: {"path": "dump/raw/train_cy_sp/text", "type": "text"} |
|
preprocess: <espnet2.train.preprocessor.CommonPreprocessor object at 0x7fca96680220>) |
|
[islpc50:0/3] 2022-06-06 21:31:41,115 (abs_task:1526) INFO: [train] Batch sampler: FoldedBatchSampler(N-batch=24097, batch_size=10, shape_files=['exp/asr_stats_raw_cy_bpe150_sp/train/speech_shape', 'exp/asr_stats_raw_cy_bpe150_sp/train/text_shape.bpe'], sort_in_batch=descending, sort_batch=descending) |
|
[islpc50:0/3] 2022-06-06 21:31:41,118 (abs_task:1527) INFO: [train] mini-batch sizes summary: N-batch=24097, mean=7.3, min=3, max=10 |
|
[islpc50:0/3] 2022-06-06 21:31:41,217 (abs_task:1525) INFO: [valid] dataset: |
|
ESPnetDataset( |
|
speech: {"path": "dump/raw/dev_cy/wav.scp", "type": "sound"} |
|
text: {"path": "dump/raw/dev_cy/text", "type": "text"} |
|
preprocess: <espnet2.train.preprocessor.CommonPreprocessor object at 0x7fca6854fb50>) |
|
[islpc50:0/3] 2022-06-06 21:31:41,217 (abs_task:1526) INFO: [valid] Batch sampler: FoldedBatchSampler(N-batch=464, batch_size=10, shape_files=['exp/asr_stats_raw_cy_bpe150_sp/valid/speech_shape', 'exp/asr_stats_raw_cy_bpe150_sp/valid/text_shape.bpe'], sort_in_batch=descending, sort_batch=descending) |
|
[islpc50:0/3] 2022-06-06 21:31:41,217 (abs_task:1527) INFO: [valid] mini-batch sizes summary: N-batch=464, mean=6.3, min=3, max=10 |
|
[islpc50:0/3] 2022-06-06 21:31:41,255 (abs_task:1525) INFO: [plot_att] dataset: |
|
ESPnetDataset( |
|
speech: {"path": "dump/raw/dev_cy/wav.scp", "type": "sound"} |
|
text: {"path": "dump/raw/dev_cy/text", "type": "text"} |
|
preprocess: <espnet2.train.preprocessor.CommonPreprocessor object at 0x7fca96ccfd60>) |
|
[islpc50:0/3] 2022-06-06 21:31:41,255 (abs_task:1526) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=2933, batch_size=1, key_file=exp/asr_stats_raw_cy_bpe150_sp/valid/speech_shape, |
|
[islpc50:0/3] 2022-06-06 21:31:41,255 (abs_task:1527) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1 |
|
[islpc50:0/3] 2022-06-06 21:31:59,752 (trainer:155) INFO: The training was resumed using exp/asr_oxford_frontend_raw_cy_bpe150_sp/checkpoint.pth |
|
islpc50:1048747:1048747 [0] NCCL INFO Bootstrap : Using bond0:128.2.205.9<0> |
|
islpc50:1048747:1048747 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation |
|
islpc50:1048747:1048747 [0] NCCL INFO NET/IB : No device found. |
|
islpc50:1048747:1048747 [0] NCCL INFO NET/Socket : Using [0]bond0:128.2.205.9<0> |
|
islpc50:1048747:1048747 [0] NCCL INFO Using network Socket |
|
NCCL version 2.10.3+cuda11.1 |
|
islpc50:1048748:1048748 [1] NCCL INFO Bootstrap : Using bond0:128.2.205.9<0> |
|
islpc50:1048748:1048748 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation |
|
islpc50:1048749:1048749 [2] NCCL INFO Bootstrap : Using bond0:128.2.205.9<0> |
|
islpc50:1048749:1048749 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation |
|
islpc50:1048748:1048748 [1] NCCL INFO NET/IB : No device found. |
|
islpc50:1048748:1048748 [1] NCCL INFO NET/Socket : Using [0]bond0:128.2.205.9<0> |
|
islpc50:1048748:1048748 [1] NCCL INFO Using network Socket |
|
islpc50:1048749:1048749 [2] NCCL INFO NET/IB : No device found. |
|
islpc50:1048749:1048749 [2] NCCL INFO NET/Socket : Using [0]bond0:128.2.205.9<0> |
|
islpc50:1048749:1048749 [2] NCCL INFO Using network Socket |
|
islpc50:1048747:1048816 [0] NCCL INFO Could not enable P2P between dev 1(=1a000) and dev 0(=19000) |
|
islpc50:1048747:1048816 [0] NCCL INFO Could not enable P2P between dev 0(=19000) and dev 1(=1a000) |
|
islpc50:1048747:1048816 [0] NCCL INFO Could not enable P2P between dev 1(=1a000) and dev 0(=19000) |
|
islpc50:1048747:1048816 [0] NCCL INFO Could not enable P2P between dev 0(=19000) and dev 1(=1a000) |
|
islpc50:1048748:1048817 [1] NCCL INFO Could not enable P2P between dev 1(=1a000) and dev 0(=19000) |
|
islpc50:1048748:1048817 [1] NCCL INFO Could not enable P2P between dev 0(=19000) and dev 1(=1a000) |
|
islpc50:1048748:1048817 [1] NCCL INFO Could not enable P2P between dev 1(=1a000) and dev 0(=19000) |
|
islpc50:1048749:1048818 [2] NCCL INFO Could not enable P2P between dev 1(=1a000) and dev 0(=19000) |
|
islpc50:1048748:1048817 [1] NCCL INFO Could not enable P2P between dev 0(=19000) and dev 1(=1a000) |
|
islpc50:1048749:1048818 [2] NCCL INFO Could not enable P2P between dev 0(=19000) and dev 1(=1a000) |
|
islpc50:1048749:1048818 [2] NCCL INFO Could not enable P2P between dev 1(=1a000) and dev 0(=19000) |
|
islpc50:1048749:1048818 [2] NCCL INFO Could not enable P2P between dev 0(=19000) and dev 1(=1a000) |
|
islpc50:1048749:1048818 [2] NCCL INFO Trees [0] -1/-1/-1->2->1 [1] -1/-1/-1->2->1 |
|
islpc50:1048748:1048817 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 |
|
islpc50:1048749:1048818 [2] NCCL INFO Setting affinity for GPU 2 to ffffff |
|
islpc50:1048748:1048817 [1] NCCL INFO Setting affinity for GPU 1 to ffffff |
|
islpc50:1048747:1048816 [0] NCCL INFO Channel 00/02 : 0 1 2 |
|
islpc50:1048747:1048816 [0] NCCL INFO Channel 01/02 : 0 1 2 |
|
islpc50:1048747:1048816 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 |
|
islpc50:1048747:1048816 [0] NCCL INFO Setting affinity for GPU 0 to ffffff |
|
islpc50:1048748:1048817 [1] NCCL INFO Could not enable P2P between dev 1(=1a000) and dev 0(=19000) |
|
islpc50:1048748:1048817 [1] NCCL INFO Could not enable P2P between dev 1(=1a000) and dev 0(=19000) |
|
islpc50:1048748:1048817 [1] NCCL INFO Channel 00 : 1[1a000] -> 2[67000] via direct shared memory |
|
islpc50:1048748:1048817 [1] NCCL INFO Channel 01 : 1[1a000] -> 2[67000] via direct shared memory |
|
islpc50:1048749:1048818 [2] NCCL INFO Channel 00 : 2[67000] -> 0[19000] via direct shared memory |
|
islpc50:1048749:1048818 [2] NCCL INFO Channel 01 : 2[67000] -> 0[19000] via direct shared memory |
|
islpc50:1048747:1048816 [0] NCCL INFO Could not enable P2P between dev 0(=19000) and dev 1(=1a000) |
|
islpc50:1048747:1048816 [0] NCCL INFO Channel 00 : 0[19000] -> 1[1a000] via direct shared memory |
|
islpc50:1048747:1048816 [0] NCCL INFO Could not enable P2P between dev 0(=19000) and dev 1(=1a000) |
|
islpc50:1048747:1048816 [0] NCCL INFO Channel 01 : 0[19000] -> 1[1a000] via direct shared memory |
|
islpc50:1048749:1048818 [2] NCCL INFO Connected all rings |
|
islpc50:1048749:1048818 [2] NCCL INFO Channel 00 : 2[67000] -> 1[1a000] via direct shared memory |
|
islpc50:1048749:1048818 [2] NCCL INFO Channel 01 : 2[67000] -> 1[1a000] via direct shared memory |
|
islpc50:1048748:1048817 [1] NCCL INFO Connected all rings |
|
islpc50:1048747:1048816 [0] NCCL INFO Connected all rings |
|
islpc50:1048747:1048816 [0] NCCL INFO Could not enable P2P between dev 0(=19000) and dev 1(=1a000) |
|
islpc50:1048747:1048816 [0] NCCL INFO Could not enable P2P between dev 0(=19000) and dev 1(=1a000) |
|
islpc50:1048748:1048817 [1] NCCL INFO Could not enable P2P between dev 1(=1a000) and dev 0(=19000) |
|
islpc50:1048748:1048817 [1] NCCL INFO Channel 00 : 1[1a000] -> 0[19000] via direct shared memory |
|
islpc50:1048748:1048817 [1] NCCL INFO Could not enable P2P between dev 1(=1a000) and dev 0(=19000) |
|
islpc50:1048748:1048817 [1] NCCL INFO Channel 01 : 1[1a000] -> 0[19000] via direct shared memory |
|
islpc50:1048749:1048818 [2] NCCL INFO Connected all trees |
|
islpc50:1048749:1048818 [2] NCCL INFO threadThresholds 8/8/64 | 24/8/64 | 8/8/512 |
|
islpc50:1048749:1048818 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer |
|
islpc50:1048747:1048816 [0] NCCL INFO Connected all trees |
|
islpc50:1048747:1048816 [0] NCCL INFO threadThresholds 8/8/64 | 24/8/64 | 8/8/512 |
|
islpc50:1048747:1048816 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer |
|
islpc50:1048748:1048817 [1] NCCL INFO Connected all trees |
|
islpc50:1048748:1048817 [1] NCCL INFO threadThresholds 8/8/64 | 24/8/64 | 8/8/512 |
|
islpc50:1048748:1048817 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer |
|
islpc50:1048748:1048817 [1] NCCL INFO comm 0x7fafe8002fb0 rank 1 nranks 3 cudaDev 1 busId 1a000 - Init COMPLETE |
|
islpc50:1048749:1048818 [2] NCCL INFO comm 0x7f5734002fb0 rank 2 nranks 3 cudaDev 2 busId 67000 - Init COMPLETE |
|
islpc50:1048747:1048816 [0] NCCL INFO comm 0x7fc9d0002fb0 rank 0 nranks 3 cudaDev 0 busId 19000 - Init COMPLETE |
|
islpc50:1048747:1048747 [0] NCCL INFO Launch mode Parallel |
|
[islpc50:0/3] 2022-06-06 21:32:00,204 (trainer:280) INFO: 6/6epoch started |
|
[s3prl.upstream.experts] Warning: can not import s3prl.upstream.byol_a.expert: No module named 'easydict'. Pass. |
|
[s3prl.hub] Warning: can not import s3prl.upstream.byol_a.hubconf: No module named 'easydict'. Please see upstream/byol_a/README.md |
|
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.quesst14_dtw.expert: No module named 'dtw'. Pass. |
|
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.separation_stft.expert: No module named 'asteroid'. Pass. |
|
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.enhancement_stft.expert: No module named 'asteroid'. Pass. |
|
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.speech_commands.expert: No module named 'catalyst'. Pass. |
|
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.a2a-vc-vctk.expert: No module named 'resemblyzer'. Pass. |
|
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.voxceleb2_ge2e.expert: No module named 'sox'. Pass. |
|
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.sv_voxceleb1.expert: No module named 'sox'. Pass. |
|
Using cache found in ./hub/s3prl_cache/1c76d6e88090f01736036b28dc995fef583f47f42662d55286332557f957609f |
|
for https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_new.pt |
|
>> inserted adapters to the following layers: 0, 1, 2 |
|
* original model weights: 317,390,592 |
|
* new model weights - all: 319,757,184 |
|
* new model weights - trainable: 2,366,592 ( 0.75% of original model) |
|
[s3prl.upstream.experts] Warning: can not import s3prl.upstream.byol_a.expert: No module named 'easydict'. Pass. |
|
[s3prl.hub] Warning: can not import s3prl.upstream.byol_a.hubconf: No module named 'easydict'. Please see upstream/byol_a/README.md |
|
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.quesst14_dtw.expert: No module named 'dtw'. Pass. |
|
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.separation_stft.expert: No module named 'asteroid'. Pass. |
|
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.enhancement_stft.expert: No module named 'asteroid'. Pass. |
|
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.speech_commands.expert: No module named 'catalyst'. Pass. |
|
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.a2a-vc-vctk.expert: No module named 'resemblyzer'. Pass. |
|
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.voxceleb2_ge2e.expert: No module named 'sox'. Pass. |
|
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.sv_voxceleb1.expert: No module named 'sox'. Pass. |
|
Using cache found in ./hub/s3prl_cache/1c76d6e88090f01736036b28dc995fef583f47f42662d55286332557f957609f |
|
for https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_new.pt |
|
>> inserted adapters to the following layers: 0, 1, 2 |
|
* original model weights: 317,390,592 |
|
* new model weights - all: 319,757,184 |
|
* new model weights - trainable: 2,366,592 ( 0.75% of original model) |
|
[s3prl.upstream.experts] Warning: can not import s3prl.upstream.byol_a.expert: No module named 'easydict'. Pass. |
|
[s3prl.hub] Warning: can not import s3prl.upstream.byol_a.hubconf: No module named 'easydict'. Please see upstream/byol_a/README.md |
|
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.quesst14_dtw.expert: No module named 'dtw'. Pass. |
|
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.separation_stft.expert: No module named 'asteroid'. Pass. |
|
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.enhancement_stft.expert: No module named 'asteroid'. Pass. |
|
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.speech_commands.expert: No module named 'catalyst'. Pass. |
|
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.a2a-vc-vctk.expert: No module named 'resemblyzer'. Pass. |
|
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.voxceleb2_ge2e.expert: No module named 'sox'. Pass. |
|
[s3prl.downstream.experts] Warning: can not import s3prl.downstream.sv_voxceleb1.expert: No module named 'sox'. Pass. |
|
Using cache found in ./hub/s3prl_cache/1c76d6e88090f01736036b28dc995fef583f47f42662d55286332557f957609f |
|
for https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_new.pt |
|
>> inserted adapters to the following layers: 0, 1, 2 |
|
* original model weights: 317,390,592 |
|
* new model weights - all: 319,757,184 |
|
* new model weights - trainable: 2,366,592 ( 0.75% of original model) |
|
[islpc50:0/3] 2022-06-06 21:32:07,213 (distributed:874) INFO: Reducer buckets have been rebuilt in this iteration. |
|
[islpc50:0/3] 2022-06-06 21:36:25,787 (trainer:678) INFO: 6epoch:train:1-1204batch: iter_time=2.363e-04, forward_time=0.086, loss_ctc=4.112, loss=4.112, backward_time=0.051, optim_step_time=0.004, optim0_lr0=1.552e-04, train_time=0.220 |
|
[islpc50:0/3] 2022-06-06 21:40:44,801 (trainer:678) INFO: 6epoch:train:1205-2408batch: iter_time=7.979e-05, forward_time=0.086, loss_ctc=3.838, loss=3.838, backward_time=0.052, optim_step_time=0.003, optim0_lr0=1.544e-04, train_time=0.215 |
|
[islpc50:0/3] 2022-06-06 21:45:04,292 (trainer:678) INFO: 6epoch:train:2409-3612batch: iter_time=8.071e-05, forward_time=0.086, loss_ctc=4.243, loss=4.243, backward_time=0.053, optim_step_time=0.003, optim0_lr0=1.537e-04, train_time=0.215 |
|
[islpc50:0/3] 2022-06-06 21:49:27,543 (trainer:678) INFO: 6epoch:train:3613-4816batch: iter_time=8.183e-05, forward_time=0.088, loss_ctc=4.037, loss=4.037, backward_time=0.052, optim_step_time=0.003, optim0_lr0=1.529e-04, train_time=0.218 |
|
[islpc50:0/3] 2022-06-06 21:53:45,627 (trainer:678) INFO: 6epoch:train:4817-6020batch: iter_time=8.053e-05, forward_time=0.085, loss_ctc=3.920, loss=3.920, backward_time=0.051, optim_step_time=0.003, optim0_lr0=1.522e-04, train_time=0.214 |
|
[islpc50:0/3] 2022-06-06 21:58:07,670 (trainer:678) INFO: 6epoch:train:6021-7224batch: iter_time=8.188e-05, forward_time=0.087, loss_ctc=3.811, loss=3.811, backward_time=0.052, optim_step_time=0.003, optim0_lr0=1.515e-04, train_time=0.217 |
|
[islpc50:0/3] 2022-06-06 22:02:28,898 (trainer:678) INFO: 6epoch:train:7225-8428batch: iter_time=8.150e-05, forward_time=0.086, loss_ctc=3.931, loss=3.931, backward_time=0.052, optim_step_time=0.004, optim0_lr0=1.508e-04, train_time=0.217 |
|
[islpc50:0/3] 2022-06-06 22:06:48,145 (trainer:678) INFO: 6epoch:train:8429-9632batch: iter_time=8.323e-05, forward_time=0.086, loss_ctc=4.285, loss=4.285, backward_time=0.050, optim_step_time=0.004, optim0_lr0=1.500e-04, train_time=0.215 |
|
[islpc50:0/3] 2022-06-06 22:11:09,779 (trainer:678) INFO: 6epoch:train:9633-10836batch: iter_time=8.026e-05, forward_time=0.086, loss_ctc=3.813, loss=3.813, backward_time=0.052, optim_step_time=0.003, optim0_lr0=1.494e-04, train_time=0.217 |
|
[islpc50:0/3] 2022-06-06 22:15:28,851 (trainer:678) INFO: 6epoch:train:10837-12040batch: iter_time=8.056e-05, forward_time=0.086, loss_ctc=3.816, loss=3.816, backward_time=0.052, optim_step_time=0.003, optim0_lr0=1.487e-04, train_time=0.215 |
|
[islpc50:0/3] 2022-06-06 22:19:51,154 (trainer:678) INFO: 6epoch:train:12041-13244batch: iter_time=8.233e-05, forward_time=0.087, loss_ctc=4.035, loss=4.035, backward_time=0.053, optim_step_time=0.003, optim0_lr0=1.480e-04, train_time=0.218 |
|
[islpc50:0/3] 2022-06-06 22:24:11,949 (trainer:678) INFO: 6epoch:train:13245-14448batch: iter_time=8.187e-05, forward_time=0.086, loss_ctc=3.966, loss=3.966, backward_time=0.052, optim_step_time=0.003, optim0_lr0=1.473e-04, train_time=0.216 |
|
[islpc50:0/3] 2022-06-06 22:28:32,228 (trainer:678) INFO: 6epoch:train:14449-15652batch: iter_time=8.346e-05, forward_time=0.086, loss_ctc=4.210, loss=4.210, backward_time=0.051, optim_step_time=0.004, optim0_lr0=1.467e-04, train_time=0.216 |
|
[islpc50:0/3] 2022-06-06 22:32:51,170 (trainer:678) INFO: 6epoch:train:15653-16856batch: iter_time=8.046e-05, forward_time=0.085, loss_ctc=3.757, loss=3.757, backward_time=0.052, optim_step_time=0.003, optim0_lr0=1.460e-04, train_time=0.215 |
|
[islpc50:0/3] 2022-06-06 22:37:11,808 (trainer:678) INFO: 6epoch:train:16857-18060batch: iter_time=7.927e-05, forward_time=0.086, loss_ctc=4.036, loss=4.036, backward_time=0.050, optim_step_time=0.003, optim0_lr0=1.454e-04, train_time=0.216 |
|
[islpc50:0/3] 2022-06-06 22:41:32,075 (trainer:678) INFO: 6epoch:train:18061-19264batch: iter_time=8.539e-05, forward_time=0.086, loss_ctc=3.985, loss=3.985, backward_time=0.052, optim_step_time=0.003, optim0_lr0=1.448e-04, train_time=0.216 |
|
[islpc50:0/3] 2022-06-06 22:45:55,830 (trainer:678) INFO: 6epoch:train:19265-20468batch: iter_time=8.007e-05, forward_time=0.088, loss_ctc=3.995, loss=3.995, backward_time=0.052, optim_step_time=0.003, optim0_lr0=1.441e-04, train_time=0.219 |
|
[islpc50:0/3] 2022-06-06 22:50:14,829 (trainer:678) INFO: 6epoch:train:20469-21672batch: iter_time=8.269e-05, forward_time=0.085, loss_ctc=3.735, loss=3.735, backward_time=0.052, optim_step_time=0.003, optim0_lr0=1.435e-04, train_time=0.215 |
|
[islpc50:0/3] 2022-06-06 22:54:36,835 (trainer:678) INFO: 6epoch:train:21673-22876batch: iter_time=8.108e-05, forward_time=0.087, loss_ctc=3.469, loss=3.469, backward_time=0.052, optim_step_time=0.004, optim0_lr0=1.429e-04, train_time=0.217 |
|
[islpc50:0/3] 2022-06-06 22:58:56,514 (trainer:678) INFO: 6epoch:train:22877-24080batch: iter_time=7.962e-05, forward_time=0.086, loss_ctc=3.565, loss=3.565, backward_time=0.053, optim_step_time=0.003, optim0_lr0=1.423e-04, train_time=0.215 |
|
[islpc50:0/3] 2022-06-06 23:00:18,310 (trainer:334) INFO: 6epoch results: [train] iter_time=8.914e-05, forward_time=0.086, loss_ctc=3.929, loss=3.929, backward_time=0.052, optim_step_time=0.003, optim0_lr0=1.485e-04, train_time=0.216, time=1 hour, 27 minutes and 1.34 seconds, total_count=144582, gpu_max_cached_mem_GB=6.926, [valid] loss_ctc=39.163, cer_ctc=0.184, loss_att=nan, acc=nan, cer=nan, wer=nan, loss=39.163, time=1 minute and 8.76 seconds, total_count=2784, gpu_max_cached_mem_GB=6.926, [att_plot] time=7.98 seconds, total_count=0, gpu_max_cached_mem_GB=6.926 |
|
[islpc50:0/3] 2022-06-06 23:00:48,850 (trainer:382) INFO: The best model has been updated: train.loss |
|
[islpc50:0/3] 2022-06-06 23:00:48,851 (trainer:454) INFO: The training was finished at 6 epochs |
|
/usr/lib/python3.8/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 7 leaked semaphore objects to clean up at shutdown |
|
warnings.warn('resource_tracker: There appear to be %d ' |
|
# Accounting: time=5384 threads=1 |
|
# Ended (code 0) at Mon Jun 6 23:00:51 EDT 2022, elapsed time 5384 seconds |
|
|