diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/47epoch.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/47epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..e426b1e724e35d5231f6e786be526e3833a7b9bc --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/47epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec2e4fcd4f8b304e0b86667cd1868eaa7505bfd6cab4bb8e3ba25a4bec14441b +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/48epoch.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/48epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..d00906518656dd67f7fce263947147e2cbae333f --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/48epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:723f61d6ae38c4dfe06a8c5d0770c057d8b8e17909909b64117beaad445ea733 +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/50epoch.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/50epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..ad75281145b59b48eb47eaff090df5f29b485501 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/50epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72f45ac4204e62012096f8f811c7fed27dab5140166b6d33c50ef0fbcab3439d +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/52epoch.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/52epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..1cf8284e902fa638046cca02c7a5f93a933bc111 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/52epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfac6a2bc79e49b74cd49d3ef954c755f36885277d1dcdc57c84dd497543f6ac +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/54epoch.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/54epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..c8e6f3325aefa8d5392fd8f95c4c6e400da1e3ab --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/54epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0107f21cc4ec3208e81bc4065b1b7c2d6e9ebea3fa0df09518531a6268430783 +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/55epoch.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/55epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..fae1c555b52217609dd7c5439c74858c3cdd133c --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/55epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b7e15c311a51848a22efa5626215db5b3c95a4f0e02ef25d0975eaa715f7d06 +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/57epoch.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/57epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..46bac00230e18899b8c1554255cc8357a2e1fbc6 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/57epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec381eb186b0df466f4334af0870480ee358b3641d0fb26ac8b0e2ed39b51bb0 +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/58epoch.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/58epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..b8d6281cfc2bb17c909976e58e70e56c02890880 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/58epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbbb194b9c615c467052711edbac4d5e8626affe4298ead189570fa309e3c7f7 +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/59epoch.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/59epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..656313a542142e8d15a52481f762b2835693bfe4 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/59epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10dd51d66a258909a8f6aab8ab41d6eaa08dc2dd0b08c1e4845ad4705befd817 +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/60epoch.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/60epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..586c06fc68b637b6f569bcc3a2cf5486b8798763 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/60epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c215ed61c217a50608410e89c77ae5f208ca92c388249d897ad93e33561e3ba +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/RESULTS.md b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/RESULTS.md new file mode 100644 index 0000000000000000000000000000000000000000..5f4ff5679d75f1a7abdf09108602a23fc880e7ab --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/RESULTS.md @@ -0,0 +1,108 @@ + +# RESULTS +## Environments +- date: `Tue Mar 12 04:03:00 CST 2024` +- python version: `3.9.18 (main, Sep 11 2023, 13:41:44) [GCC 11.2.0]` +- espnet version: `espnet 202308` +- pytorch version: `pytorch 1.12.1+cu116` +- Git hash: `884659f9ee95374811015381c976fa3b4f6e01db` + - Commit date: `Thu Nov 23 00:23:29 2023 +0800` + +## exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_sot_asr_model_valid.acc.best/dev_2spk|3315|226216|81.3|7.3|11.4|11.0|29.7|96.8| +|decode_sot_asr_model_valid.acc.best/dev_2spk_spk1|3315|226216|40.0|14.4|45.6|3.1|63.1|100.0| +|decode_sot_asr_model_valid.acc.best/dev_2spk_spk2|3315|226216|38.6|15.5|45.9|3.1|64.5|100.0| +|decode_sot_asr_model_valid.acc.best/dev_2spk_sys7_8khz_spk1_large|1606|135101|41.7|11.2|47.0|2.5|60.8|100.0| +|decode_sot_asr_model_valid.acc.best/dev_2spk_sys7_8khz_spk2_large|1606|135101|39.9|13.0|47.1|2.5|62.6|100.0| +|decode_sot_asr_model_valid.acc.best/dev_3spk|2059|209679|69.9|14.7|15.4|10.0|40.1|100.0| +|decode_sot_asr_model_valid.acc.best/dev_3spk_kaldi_fmt|1004|124462|73.7|11.7|14.5|6.0|32.3|99.9| +|decode_sot_asr_model_valid.acc.best/dev_4spk|1467|200029|57.8|21.1|21.0|8.2|50.3|100.0| +|decode_sot_asr_model_valid.acc.best/dev_4spk_kaldi_fmt|721|119166|61.5|16.5|22.0|5.4|43.9|100.0| +|decode_sot_asr_model_valid.acc.best/dev_kaldi_fmt|605|47659|84.5|10.0|5.5|4.2|19.6|93.6| +|decode_sot_asr_model_valid.acc.best/test-clean_2spk|4570|301042|81.9|6.6|11.5|10.2|28.3|97.6| +|decode_sot_asr_model_valid.acc.best/test-clean_2spk_spk1|4570|301042|38.2|15.0|46.8|3.0|64.9|100.0| +|decode_sot_asr_model_valid.acc.best/test-clean_2spk_spk2|4570|301042|37.4|14.5|48.1|2.3|65.0|100.0| +|decode_sot_asr_model_valid.acc.best/test-clean_3spk|2072|212871|71.0|13.2|15.7|10.9|39.8|99.9| +|decode_sot_asr_model_valid.acc.best/test-clean_4spk|1326|185394|59.8|19.4|20.8|9.1|49.3|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_2spk|4663|336490|81.0|8.8|10.2|10.8|29.8|99.1| +|decode_sot_asr_model_valid.acc.best/test-other_2spk_spk1|4663|336490|34.2|18.6|47.2|3.5|69.3|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_2spk_spk2|4663|336490|40.5|14.8|44.8|3.0|62.5|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_3spk|2453|266074|68.4|17.2|14.4|11.0|42.6|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_4spk|1795|259138|55.9|23.8|20.3|8.7|52.8|100.0| +|decode_sot_asr_model_valid.acc.best/test_clean_2spk_kaldi_fmt|2180|178761|85.1|5.1|9.7|5.9|20.8|95.1| +|decode_sot_asr_model_valid.acc.best/test_clean_2spk_sys7_8khz_spk1_large|2180|178761|39.3|12.4|48.3|2.5|63.2|100.0| +|decode_sot_asr_model_valid.acc.best/test_clean_2spk_sys7_8khz_spk2_large|2180|178761|39.4|12.6|48.0|2.0|62.6|100.0| +|decode_sot_asr_model_valid.acc.best/test_clean_3spk_kaldi_fmt|977|124741|74.1|10.8|15.1|7.1|33.0|99.8| +|decode_sot_asr_model_valid.acc.best/test_clean_4spk_kaldi_fmt|632|109072|63.2|15.2|21.6|6.1|42.9|100.0| +|decode_sot_asr_model_valid.acc.best/test_clean_kaldi_fmt|961|64007|86.8|8.4|4.7|4.0|17.2|94.8| +|decode_sot_asr_model_valid.acc.best/test_other_2spk_kaldi_fmt|2363|205496|83.9|7.3|8.8|6.5|22.6|98.4| +|decode_sot_asr_model_valid.acc.best/test_other_2spk_sys7_8khz_spk1_large|2363|205496|35.2|16.2|48.6|2.9|67.6|100.0| +|decode_sot_asr_model_valid.acc.best/test_other_2spk_sys7_8khz_spk2_large|2363|205496|41.9|12.8|45.3|2.5|60.6|100.0| +|decode_sot_asr_model_valid.acc.best/test_other_3spk_kaldi_fmt|1246|162996|70.6|15.3|14.1|7.6|37.0|100.0| +|decode_sot_asr_model_valid.acc.best/test_other_4spk_kaldi_fmt|901|157123|59.0|19.4|21.7|6.4|47.4|100.0| +|decode_sot_asr_model_valid.acc.best/test_other_kaldi_fmt|992|80370|81.6|12.2|6.2|5.9|24.3|97.5| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_sot_asr_model_valid.acc.best/dev_2spk|3315|1230801|85.8|4.2|10.0|9.8|24.0|96.8| +|decode_sot_asr_model_valid.acc.best/dev_2spk_spk1|3315|1230801|47.9|6.1|46.0|3.4|55.5|100.0| +|decode_sot_asr_model_valid.acc.best/dev_2spk_spk2|3315|1230801|47.2|6.5|46.2|3.4|56.2|100.0| +|decode_sot_asr_model_valid.acc.best/dev_2spk_sys7_8khz_spk1_large|1606|735694|48.0|4.5|47.4|2.6|54.5|100.0| +|decode_sot_asr_model_valid.acc.best/dev_2spk_sys7_8khz_spk2_large|1606|735694|47.3|5.3|47.3|2.6|55.2|100.0| +|decode_sot_asr_model_valid.acc.best/dev_3spk|2059|1140428|77.7|8.0|14.3|8.6|30.9|100.0| +|decode_sot_asr_model_valid.acc.best/dev_3spk_kaldi_fmt|1004|677017|80.1|6.0|13.9|5.0|24.9|99.9| +|decode_sot_asr_model_valid.acc.best/dev_4spk|1467|1087409|68.5|10.4|21.1|7.4|39.0|100.0| +|decode_sot_asr_model_valid.acc.best/dev_4spk_kaldi_fmt|721|647884|70.2|7.5|22.3|4.8|34.6|100.0| +|decode_sot_asr_model_valid.acc.best/dev_kaldi_fmt|605|258151|90.1|4.1|5.8|4.0|13.9|93.6| +|decode_sot_asr_model_valid.acc.best/test-clean_2spk|4570|1550429|85.9|4.2|9.9|8.8|22.9|97.6| +|decode_sot_asr_model_valid.acc.best/test-clean_2spk_spk1|4570|1550429|46.2|6.4|47.4|3.1|56.9|100.0| +|decode_sot_asr_model_valid.acc.best/test-clean_2spk_spk2|4570|1550429|45.8|5.8|48.4|2.7|56.9|100.0| +|decode_sot_asr_model_valid.acc.best/test-clean_3spk|2072|1084475|77.9|8.0|14.1|9.1|31.2|99.9| +|decode_sot_asr_model_valid.acc.best/test-clean_4spk|1326|938467|69.5|10.1|20.4|8.2|38.7|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_2spk|4663|1742136|86.1|4.9|9.0|9.6|23.5|99.1| +|decode_sot_asr_model_valid.acc.best/test-other_2spk_spk1|4663|1742136|44.1|7.9|48.0|3.6|59.5|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_2spk_spk2|4663|1742136|48.7|6.2|45.1|3.3|54.6|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_3spk|2453|1381987|77.3|9.2|13.6|9.6|32.3|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_4spk|1795|1346646|67.6|11.6|20.8|8.0|40.4|100.0| +|decode_sot_asr_model_valid.acc.best/test_clean_2spk_kaldi_fmt|2180|921344|88.2|2.9|8.9|5.0|16.8|95.1| +|decode_sot_asr_model_valid.acc.best/test_clean_2spk_sys7_8khz_spk1_large|2180|921344|46.1|5.0|48.9|2.5|56.4|100.0| +|decode_sot_asr_model_valid.acc.best/test_clean_2spk_sys7_8khz_spk2_large|2180|921344|46.9|4.8|48.2|2.2|55.3|100.0| +|decode_sot_asr_model_valid.acc.best/test_clean_3spk_kaldi_fmt|977|635802|79.7|6.3|14.0|5.7|26.1|99.8| +|decode_sot_asr_model_valid.acc.best/test_clean_4spk_kaldi_fmt|632|552325|71.1|7.2|21.7|5.5|34.4|100.0| +|decode_sot_asr_model_valid.acc.best/test_clean_kaldi_fmt|961|329390|91.8|3.4|4.8|4.0|12.2|94.8| +|decode_sot_asr_model_valid.acc.best/test_other_2spk_kaldi_fmt|2363|1064868|88.0|3.8|8.2|5.6|17.6|98.4| +|decode_sot_asr_model_valid.acc.best/test_other_2spk_sys7_8khz_spk1_large|2363|1064868|44.0|6.7|49.3|2.8|58.7|100.0| +|decode_sot_asr_model_valid.acc.best/test_other_2spk_sys7_8khz_spk2_large|2363|1064868|49.1|5.2|45.7|2.6|53.5|100.0| +|decode_sot_asr_model_valid.acc.best/test_other_3spk_kaldi_fmt|1246|847159|78.5|7.7|13.8|6.6|28.1|100.0| +|decode_sot_asr_model_valid.acc.best/test_other_4spk_kaldi_fmt|901|817228|68.9|8.8|22.3|5.9|37.0|100.0| +|decode_sot_asr_model_valid.acc.best/test_other_kaldi_fmt|992|416899|88.3|5.2|6.5|5.7|17.4|97.5| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +## exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/decode_sot_asr_model_valid.acc.best +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|org/dev_2spk_kaldi_fmt|1606|135101|84.1|5.7|10.2|6.8|22.7|93.5| +|org/tt_mix_clean_reverb_max_16k|3000|3000|0.0|100.0|0.0|3712.3|3812.3|100.0| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|org/dev_2spk_kaldi_fmt|1606|735694|87.5|3.0|9.4|5.9|18.4|93.5| +|org/tt_mix_clean_reverb_max_16k|3000|143026|17.0|82.9|0.1|370.7|453.8|100.0| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/checkpoint.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/checkpoint.pth new file mode 100644 index 0000000000000000000000000000000000000000..a4702c1f7594467010cadd2f21d8713a0e5e2692 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/checkpoint.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c781ea07d1d9c8d8325bbfc5c127a3fe0ffb838d3f2c07a826f804c91a4b339a +size 516979614 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/config.yaml b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..83275f2edeaa3ee668da14989f9fbee7eb4630e8 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/config.yaml @@ -0,0 +1,227 @@ +config: conf/tuning/train_sot_asr_conformer_large.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new +ngpu: 1 +seed: 0 +num_workers: 16 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: 4 +dist_rank: 0 +local_rank: 0 +dist_master_addr: localhost +dist_master_port: 58141 +dist_launcher: null +multiprocessing_distributed: true +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: false +write_collected_feats: false +max_epoch: 60 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 10 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 4 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: +- /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 16000000 +valid_batch_bins: null +train_shape_file: +- exp/asr_stats_raw_en_char/train/speech_shape +- exp/asr_stats_raw_en_char/train/text_shape.char +valid_shape_file: +- exp/asr_stats_raw_en_char/valid/speech_shape +- exp/asr_stats_raw_en_char/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/train_large_kaldi_fmt/wav.scp + - speech + - kaldi_ark +- - dump/raw/train_large_kaldi_fmt/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/raw/dev_2spk_kaldi_fmt/wav.scp + - speech + - kaldi_ark +- - dump/raw/dev_2spk_kaldi_fmt/text + - text + - text +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 20000 +token_list: +- +- +- +- +- E +- T +- A +- O +- N +- I +- H +- S +- R +- D +- L +- U +- M +- C +- W +- F +- G +- Y +- P +- B +- V +- K +- '''' +- X +- J +- Q +- Z +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true +joint_net_conf: null +use_preprocessor: true +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + fs: 16k +specaug: null +specaug_conf: {} +normalize: global_mvn +normalize_conf: + stats_file: exp/asr_stats_raw_en_char/train/feats_stats.npz +model: espnet +model_conf: + ctc_weight: 0.0 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: conformer +encoder_conf: + output_size: 256 + attention_heads: 4 + linear_units: 2048 + num_blocks: 12 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d + normalize_before: true + macaron_style: true + rel_pos_type: latest + pos_enc_layer_type: rel_pos + selfattention_layer_type: rel_selfattn + activation_type: swish + use_cnn_module: true + cnn_module_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 +preprocessor: multi +preprocessor_conf: + speaker_change_symbol: + - +required: +- output_dir +- token_list +version: '202308' +distributed: true diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/acc.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/acc.png new file mode 100644 index 0000000000000000000000000000000000000000..eaa202c9a7263e1b5f5cece9b748ef1eda2f6f2a Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/acc.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/backward_time.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/backward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..565755131d4df37c66a0565147896f02120add59 Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/backward_time.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/cer.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/cer.png new file mode 100644 index 0000000000000000000000000000000000000000..615fb2ab82e85c8eb17c5c06c86bf4511799c27a Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/cer.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/clip.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/clip.png new file mode 100644 index 0000000000000000000000000000000000000000..8dcd44eeb8e61cefd50db5514c608663bad42973 Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/clip.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/forward_time.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/forward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..1fa6334ab2dfa3ab542119aada164fb6ee0516a9 Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/forward_time.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/gpu_max_cached_mem_GB.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/gpu_max_cached_mem_GB.png new file mode 100644 index 0000000000000000000000000000000000000000..16696d2cdf58aee8ac2bfda000faabee644cd50e Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/gpu_max_cached_mem_GB.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/grad_norm.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..5aa3069a11b1fef2aa10854ee56187560ca01afd Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/grad_norm.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/iter_time.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/iter_time.png new file mode 100644 index 0000000000000000000000000000000000000000..a1d84024156ebc8ad64550fa10e6a070f6cc4901 Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/iter_time.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/loss.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/loss.png new file mode 100644 index 0000000000000000000000000000000000000000..70bba12c2cff94360c9c17dc0a8628cebdb588c9 Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/loss.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/loss_att.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/loss_att.png new file mode 100644 index 0000000000000000000000000000000000000000..6d864cc34d45c0f0644580380a8639faf07ed8a9 Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/loss_att.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/loss_scale.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/loss_scale.png new file mode 100644 index 0000000000000000000000000000000000000000..460bfdb13d9e33389b68b962751b15221d3e531f Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/loss_scale.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/optim0_lr0.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/optim0_lr0.png new file mode 100644 index 0000000000000000000000000000000000000000..aa60fb980c720ad00b302295b4aa5dc8ffc2ec0a Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/optim0_lr0.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/optim_step_time.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/optim_step_time.png new file mode 100644 index 0000000000000000000000000000000000000000..957891746d71c89f1f5e5b022e82d38552a4cd94 Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/optim_step_time.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/train_time.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/train_time.png new file mode 100644 index 0000000000000000000000000000000000000000..d2f5116afaa5ae73132142e8bde948fbdc7cdc83 Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/train_time.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/wer.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/wer.png new file mode 100644 index 0000000000000000000000000000000000000000..87968cf96760302c071bbb71924e63a782bf4eab Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/images/wer.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/latest.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/latest.pth new file mode 100644 index 0000000000000000000000000000000000000000..586c06fc68b637b6f569bcc3a2cf5486b8798763 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/latest.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c215ed61c217a50608410e89c77ae5f208ca92c388249d897ad93e33561e3ba +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/run.sh b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..1b29454affdc7cb4d6531b0096ee99180ee85602 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/run.sh @@ -0,0 +1 @@ +./asr.sh --lang en --audio_format flac.ark --stage 2 --feats_type raw --token_type char --sot_asr true --max_wav_duration 50 --feats_normalize global_mvn --use_lm false --pretrained_model /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --asr_config conf/tuning/train_sot_asr_conformer_large.yaml --lm_config conf/tuning/train_lm_transformer.yaml --inference_config conf/tuning/decode_sot.yaml --train_set train_large_kaldi_fmt --valid_set dev_2spk_kaldi_fmt --test_sets 'dev_kaldi_fmt test_clean_kaldi_fmt test_other_kaldi_fmt' --ngpu 4 --asr_tag train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new --lm_train_text data/local/other_text/text --bpe_train_text data/train_large_kaldi_fmt/text --stage 10 --stage 11 "$@"; exit $? diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/train/events.out.tfevents.1704228297.de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr.1813079.0 b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/train/events.out.tfevents.1704228297.de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr.1813079.0 new file mode 100644 index 0000000000000000000000000000000000000000..17ec6fa8a2ee6ce7e3ee7df6b913849fc4ae7044 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/train/events.out.tfevents.1704228297.de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr.1813079.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8f651320400a9314034267172216336b3e21b9e518a65553fa75cbfa69d5955 +size 667008502 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/train/events.out.tfevents.1705380444.de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr.3520216.0 b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/train/events.out.tfevents.1705380444.de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr.3520216.0 new file mode 100644 index 0000000000000000000000000000000000000000..49fc02d8377555d72ed6bbbeb708a77b386ee1f0 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/train/events.out.tfevents.1705380444.de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr.3520216.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86beae4916acd5f20539b2daeb461e38342baf385c48af42d18f8b0e979504e7 +size 193681524 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/valid/events.out.tfevents.1704228297.de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr.1813079.1 b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/valid/events.out.tfevents.1704228297.de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr.1813079.1 new file mode 100644 index 0000000000000000000000000000000000000000..aeb7eb32b1afae922301720a87f810efdb4dcbdd --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/valid/events.out.tfevents.1704228297.de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr.1813079.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2c0ffbc3c49d3f0d972dd24e2a750899a4c9c1e7df1ba0b28bcec22aefee522 +size 12778 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/valid/events.out.tfevents.1705380444.de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr.3520216.1 b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/valid/events.out.tfevents.1705380444.de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr.3520216.1 new file mode 100644 index 0000000000000000000000000000000000000000..7f4db58ed9e4ab98496c3a5e393943eee5bfbe92 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/tensorboard/valid/events.out.tfevents.1705380444.de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr.3520216.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd9de87f3d29c3dfcbbfa6762a59e413eee1bfaf26e3327da03f5327430c76f2 +size 4330 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/train.1.log b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/train.1.log new file mode 100644 index 0000000000000000000000000000000000000000..43da5f4594d321214670c45c331f0fda1b691b11 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/train.1.log @@ -0,0 +1,8475 @@ +# python3 -m espnet2.bin.asr_train --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_kaldi_fmt/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new --config conf/tuning/train_sot_asr_conformer_large.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/train_large_kaldi_fmt/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/train_large_kaldi_fmt/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/dev_kaldi_fmt/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 4 --multiprocessing_distributed True +# Started at Wed Jan 3 04:42:54 CST 2024 +# +/star-home/jinzengrui/lib/miniconda3/envs/espnet/bin/python3 /star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_kaldi_fmt/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new --config conf/tuning/train_sot_asr_conformer_large.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/train_large_kaldi_fmt/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/train_large_kaldi_fmt/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/dev_kaldi_fmt/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 4 --multiprocessing_distributed True +[W socket.cpp:558] [c10d] The client socket has failed to connect to [localhost]:46773 (errno: 99 - Cannot assign requested address). +[W socket.cpp:558] [c10d] The client socket has failed to connect to [localhost]:46773 (errno: 99 - Cannot assign requested address). +[W socket.cpp:558] [c10d] The client socket has failed to connect to [localhost]:46773 (errno: 99 - Cannot assign requested address). +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:19,178 (distributed_c10d:228) INFO: Added key: store_based_barrier_key:1 to store for rank: 0 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:19,179 (distributed_c10d:262) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes. +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:19,217 (asr:490) INFO: Vocabulary size: 32 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:23,139 (abs_task:1229) INFO: pytorch.version=1.12.1+cu116, cuda.available=True, cudnn.version=8302, cudnn.benchmark=False, cudnn.deterministic=True +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:23,148 (abs_task:1230) INFO: Model structure: +ESPnetASRModel( + (frontend): DefaultFrontend( + (stft): Stft(n_fft=512, win_length=512, hop_length=128, center=True, normalized=False, onesided=True) + (frontend): Frontend() + (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False) + ) + (normalize): GlobalMVN(stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz, norm_means=True, norm_vars=True) + (encoder): ConformerEncoder( + (embed): Conv2dSubsampling( + (conv): Sequential( + (0): Conv2d(1, 256, kernel_size=(3, 3), stride=(2, 2)) + (1): ReLU() + (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2)) + (3): ReLU() + ) + (out): Sequential( + (0): Linear(in_features=4864, out_features=256, bias=True) + (1): RelPositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (6): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (7): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (8): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (9): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (10): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (11): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (decoder): TransformerDecoder( + (embed): Sequential( + (0): Embedding(32, 256) + (1): PositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (output_layer): Linear(in_features=256, out_features=32, bias=True) + (decoders): MultiSequential( + (0): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (criterion_att): LabelSmoothingLoss( + (criterion): KLDivLoss() + ) +) + +Model summary: + Class Name: ESPnetASRModel + Total Number of model parameters: 43.00 M + Number of trainable parameters: 43.00 M (100.0%) + Size: 172.01 MB + Type: torch.float32 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:23,149 (abs_task:1233) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + eps: 1e-08 + foreach: None + initial_lr: 0.002 + lr: 1e-07 + maximize: False + weight_decay: 1e-06 +) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:23,149 (abs_task:1234) INFO: Scheduler: WarmupLR(warmup_steps=20000) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:23,151 (abs_task:1243) INFO: Saving the configuration in exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/config.yaml +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:23,164 (abs_task:1304) INFO: Loading pretrained params from /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:30,228 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:52,705 (abs_task:1614) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/train_large_kaldi_fmt/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/train_large_kaldi_fmt/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:52,705 (abs_task:1615) INFO: [train] Batch sampler: NumElementsBatchSampler(N-batch=35996, batch_bins=16000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:52,713 (abs_task:1616) INFO: [train] mini-batch sizes summary: N-batch=35996, mean=53.8, min=15, max=258 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:53,001 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:53,008 (abs_task:1614) INFO: [valid] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/dev_kaldi_fmt/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/dev_kaldi_fmt/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:53,008 (abs_task:1615) INFO: [valid] Batch sampler: NumElementsBatchSampler(N-batch=12, batch_bins=16000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:53,008 (abs_task:1616) INFO: [valid] mini-batch sizes summary: N-batch=12, mean=50.4, min=17, max=82 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:53,012 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:53,032 (abs_task:1614) INFO: [plot_att] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/dev_kaldi_fmt/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/dev_kaldi_fmt/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:53,032 (abs_task:1615) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=605, batch_size=1, key_file=exp/asr_stats_raw_en_char/valid/speech_shape, +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:53,032 (abs_task:1616) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813079 [0] NCCL INFO Bootstrap : Using eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813079 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813079 [0] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813079 [0] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813079 [0] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813079 [0] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813079 [0] NCCL INFO NET/IB : No device found. +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813079 [0] NCCL INFO NET/Socket : Using [0]eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813079 [0] NCCL INFO Using network Socket +NCCL version 2.10.3+cuda11.6 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813082 [2] NCCL INFO Bootstrap : Using eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813082 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813082 [2] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813082 [2] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813082 [2] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813082 [2] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813082 [2] NCCL INFO NET/IB : No device found. +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813082 [2] NCCL INFO NET/Socket : Using [0]eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813082 [2] NCCL INFO Using network Socket +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813081 [1] NCCL INFO Bootstrap : Using eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813081 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813081 [1] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813081 [1] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813081 [1] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813081 [1] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813081 [1] NCCL INFO NET/IB : No device found. +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813081 [1] NCCL INFO NET/Socket : Using [0]eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813081 [1] NCCL INFO Using network Socket +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813083 [3] NCCL INFO Bootstrap : Using eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813083 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813083 [3] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813083 [3] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813083 [3] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813083 [3] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813083 [3] NCCL INFO NET/IB : No device found. +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813083 [3] NCCL INFO NET/Socket : Using [0]eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813083 [3] NCCL INFO Using network Socket +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813854 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813858 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813866 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813849 [0] NCCL INFO Channel 00/02 : 0 1 2 3 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813854 [2] NCCL INFO Setting affinity for GPU 6 to ff,ffc0000f,fffc0000 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813858 [1] NCCL INFO Setting affinity for GPU 5 to ff,ffc0000f,fffc0000 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813866 [3] NCCL INFO Setting affinity for GPU 7 to ff,ffc0000f,fffc0000 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813849 [0] NCCL INFO Channel 01/02 : 0 1 2 3 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813849 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813849 [0] NCCL INFO Setting affinity for GPU 4 to ff,ffc0000f,fffc0000 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813854 [2] NCCL INFO Channel 00 : 2[b4000] -> 3[b5000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813858 [1] NCCL INFO Channel 00 : 1[b2000] -> 2[b4000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813849 [0] NCCL INFO Channel 00 : 0[b1000] -> 1[b2000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813866 [3] NCCL INFO Channel 00 : 3[b5000] -> 0[b1000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813854 [2] NCCL INFO Channel 01 : 2[b4000] -> 3[b5000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813858 [1] NCCL INFO Channel 01 : 1[b2000] -> 2[b4000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813849 [0] NCCL INFO Channel 01 : 0[b1000] -> 1[b2000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813866 [3] NCCL INFO Channel 01 : 3[b5000] -> 0[b1000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813866 [3] NCCL INFO Connected all rings +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813866 [3] NCCL INFO Channel 00 : 3[b5000] -> 2[b4000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813866 [3] NCCL INFO Channel 01 : 3[b5000] -> 2[b4000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813854 [2] NCCL INFO Connected all rings +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813858 [1] NCCL INFO Connected all rings +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813849 [0] NCCL INFO Connected all rings +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813854 [2] NCCL INFO Channel 00 : 2[b4000] -> 1[b2000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813858 [1] NCCL INFO Channel 00 : 1[b2000] -> 0[b1000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813854 [2] NCCL INFO Channel 01 : 2[b4000] -> 1[b2000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813858 [1] NCCL INFO Channel 01 : 1[b2000] -> 0[b1000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813866 [3] NCCL INFO Connected all trees +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813866 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813866 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813849 [0] NCCL INFO Connected all trees +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813849 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813849 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813854 [2] NCCL INFO Connected all trees +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813854 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813858 [1] NCCL INFO Connected all trees +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813854 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813858 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813858 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813854 [2] NCCL INFO comm 0x7f503c002f70 rank 2 nranks 4 cudaDev 2 busId b4000 - Init COMPLETE +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813866 [3] NCCL INFO comm 0x7effbc002f70 rank 3 nranks 4 cudaDev 3 busId b5000 - Init COMPLETE +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813849 [0] NCCL INFO comm 0x7f65bc002f70 rank 0 nranks 4 cudaDev 0 busId b1000 - Init COMPLETE +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813858 [1] NCCL INFO comm 0x7f48e4002f70 rank 1 nranks 4 cudaDev 1 busId b2000 - Init COMPLETE +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813079 [0] NCCL INFO Launch mode Parallel +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:44:57,866 (trainer:284) INFO: 1/60epoch started +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 04:50:12,104 (distributed:995) INFO: Reducer buckets have been rebuilt in this iteration. +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 05:10:16,620 (trainer:732) INFO: 1epoch:train:1-1799batch: iter_time=9.371e-04, forward_time=0.207, loss_att=631.025, acc=0.519, loss=631.025, backward_time=0.299, grad_norm=259.749, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=2.260e-05, train_time=3.378 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 05:30:20,346 (trainer:732) INFO: 1epoch:train:1800-3598batch: iter_time=2.359e-04, forward_time=0.201, loss_att=463.436, acc=0.613, loss=463.436, backward_time=0.298, grad_norm=83.133, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.755e-05, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 05:50:24,635 (trainer:732) INFO: 1epoch:train:3599-5397batch: iter_time=2.342e-04, forward_time=0.202, loss_att=421.992, acc=0.644, loss=421.992, backward_time=0.298, grad_norm=74.787, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=1.125e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 06:10:30,258 (trainer:732) INFO: 1epoch:train:5398-7196batch: iter_time=2.302e-04, forward_time=0.202, loss_att=399.091, acc=0.666, loss=399.091, backward_time=0.298, grad_norm=77.015, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=1.576e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 06:30:36,502 (trainer:732) INFO: 1epoch:train:7197-8995batch: iter_time=2.296e-04, forward_time=0.202, loss_att=377.673, acc=0.684, loss=377.673, backward_time=0.299, grad_norm=84.557, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=2.025e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 06:50:43,263 (trainer:732) INFO: 1epoch:train:8996-10794batch: iter_time=2.265e-04, forward_time=0.202, loss_att=360.272, acc=0.699, loss=360.272, backward_time=0.299, grad_norm=89.241, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=2.474e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 07:10:47,784 (trainer:732) INFO: 1epoch:train:10795-12593batch: iter_time=2.210e-04, forward_time=0.202, loss_att=339.724, acc=0.712, loss=339.724, backward_time=0.298, grad_norm=93.701, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=2.924e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 07:30:52,245 (trainer:732) INFO: 1epoch:train:12594-14392batch: iter_time=2.191e-04, forward_time=0.201, loss_att=324.988, acc=0.724, loss=324.988, backward_time=0.298, grad_norm=94.620, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=3.374e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 07:50:53,707 (trainer:732) INFO: 1epoch:train:14393-16191batch: iter_time=2.125e-04, forward_time=0.201, loss_att=313.758, acc=0.733, loss=313.758, backward_time=0.297, grad_norm=94.641, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=3.824e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 08:10:59,305 (trainer:732) INFO: 1epoch:train:16192-17990batch: iter_time=2.111e-04, forward_time=0.202, loss_att=304.230, acc=0.743, loss=304.230, backward_time=0.299, grad_norm=98.492, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.273e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 08:31:02,608 (trainer:732) INFO: 1epoch:train:17991-19789batch: iter_time=2.088e-04, forward_time=0.201, loss_att=290.692, acc=0.751, loss=290.692, backward_time=0.298, grad_norm=99.396, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.724e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 08:51:03,958 (trainer:732) INFO: 1epoch:train:19790-21588batch: iter_time=2.075e-04, forward_time=0.201, loss_att=283.895, acc=0.758, loss=283.895, backward_time=0.298, grad_norm=100.060, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.173e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 09:11:07,751 (trainer:732) INFO: 1epoch:train:21589-23387batch: iter_time=2.113e-04, forward_time=0.202, loss_att=282.315, acc=0.763, loss=282.315, backward_time=0.298, grad_norm=98.835, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.623e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 09:31:09,383 (trainer:732) INFO: 1epoch:train:23388-25186batch: iter_time=2.117e-04, forward_time=0.201, loss_att=268.577, acc=0.770, loss=268.577, backward_time=0.298, grad_norm=96.696, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.072e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 09:51:12,636 (trainer:732) INFO: 1epoch:train:25187-26985batch: iter_time=2.143e-04, forward_time=0.202, loss_att=259.087, acc=0.778, loss=259.087, backward_time=0.298, grad_norm=97.924, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.523e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 10:11:17,413 (trainer:732) INFO: 1epoch:train:26986-28784batch: iter_time=2.210e-04, forward_time=0.202, loss_att=247.842, acc=0.790, loss=247.842, backward_time=0.298, grad_norm=99.830, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.972e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 10:31:23,725 (trainer:732) INFO: 1epoch:train:28785-30583batch: iter_time=2.127e-04, forward_time=0.202, loss_att=236.647, acc=0.800, loss=236.647, backward_time=0.299, grad_norm=103.114, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=7.422e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 10:51:27,525 (trainer:732) INFO: 1epoch:train:30584-32382batch: iter_time=2.083e-04, forward_time=0.202, loss_att=222.378, acc=0.811, loss=222.378, backward_time=0.298, grad_norm=103.620, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.871e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 11:11:31,205 (trainer:732) INFO: 1epoch:train:32383-34181batch: iter_time=2.070e-04, forward_time=0.202, loss_att=212.574, acc=0.819, loss=212.574, backward_time=0.298, grad_norm=103.873, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=8.321e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 11:31:33,694 (trainer:732) INFO: 1epoch:train:34182-35980batch: iter_time=2.080e-04, forward_time=0.202, loss_att=203.580, acc=0.825, loss=203.580, backward_time=0.298, grad_norm=106.686, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=8.771e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 11:39:57,462 (trainer:338) INFO: 1epoch results: [train] iter_time=2.533e-04, forward_time=0.202, loss_att=321.966, acc=0.730, loss=321.966, backward_time=0.298, grad_norm=102.981, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.501e-04, train_time=2.712, time=6 hours, 46 minutes and 55.7 seconds, total_count=35996, gpu_max_cached_mem_GB=30.176, [valid] loss_att=152.373, acc=0.864, cer=0.171, wer=0.430, loss=152.373, time=4 minutes and 28.26 seconds, total_count=12, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 35.6 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 11:40:01,135 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 11:40:01,136 (trainer:272) INFO: 2/60epoch started. Estimated time to finish: 2 weeks, 3 days and 8 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 12:04:27,780 (trainer:732) INFO: 2epoch:train:1-1799batch: iter_time=0.001, forward_time=0.203, loss_att=195.879, acc=0.834, loss=195.879, backward_time=0.298, grad_norm=107.800, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=9.225e-04, train_time=3.262 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 12:24:34,167 (trainer:732) INFO: 2epoch:train:1800-3598batch: iter_time=2.437e-04, forward_time=0.203, loss_att=191.331, acc=0.837, loss=191.331, backward_time=0.298, grad_norm=107.283, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=9.674e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 12:44:39,086 (trainer:732) INFO: 2epoch:train:3599-5397batch: iter_time=2.426e-04, forward_time=0.202, loss_att=184.085, acc=0.842, loss=184.085, backward_time=0.297, grad_norm=108.686, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 13:04:42,843 (trainer:732) INFO: 2epoch:train:5398-7196batch: iter_time=2.417e-04, forward_time=0.202, loss_att=181.295, acc=0.847, loss=181.295, backward_time=0.297, grad_norm=107.876, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 13:24:49,803 (trainer:732) INFO: 2epoch:train:7197-8995batch: iter_time=2.388e-04, forward_time=0.203, loss_att=176.325, acc=0.851, loss=176.325, backward_time=0.298, grad_norm=108.677, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 13:44:54,499 (trainer:732) INFO: 2epoch:train:8996-10794batch: iter_time=2.400e-04, forward_time=0.203, loss_att=171.506, acc=0.854, loss=171.506, backward_time=0.298, grad_norm=110.228, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 14:05:01,460 (trainer:732) INFO: 2epoch:train:10795-12593batch: iter_time=2.385e-04, forward_time=0.203, loss_att=169.178, acc=0.857, loss=169.178, backward_time=0.298, grad_norm=109.652, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 14:25:05,975 (trainer:732) INFO: 2epoch:train:12594-14392batch: iter_time=2.382e-04, forward_time=0.202, loss_att=163.001, acc=0.861, loss=163.001, backward_time=0.297, grad_norm=108.115, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 14:45:12,167 (trainer:732) INFO: 2epoch:train:14393-16191batch: iter_time=2.370e-04, forward_time=0.203, loss_att=163.338, acc=0.862, loss=163.338, backward_time=0.298, grad_norm=110.811, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 15:05:18,217 (trainer:732) INFO: 2epoch:train:16192-17990batch: iter_time=2.411e-04, forward_time=0.202, loss_att=155.676, acc=0.867, loss=155.676, backward_time=0.298, grad_norm=108.726, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 15:25:22,424 (trainer:732) INFO: 2epoch:train:17991-19789batch: iter_time=2.396e-04, forward_time=0.202, loss_att=154.453, acc=0.868, loss=154.453, backward_time=0.297, grad_norm=110.386, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 15:45:27,155 (trainer:732) INFO: 2epoch:train:19790-21588batch: iter_time=2.426e-04, forward_time=0.202, loss_att=151.284, acc=0.871, loss=151.284, backward_time=0.297, grad_norm=109.666, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 16:05:27,174 (trainer:732) INFO: 2epoch:train:21589-23387batch: iter_time=2.407e-04, forward_time=0.201, loss_att=148.905, acc=0.871, loss=148.905, backward_time=0.296, grad_norm=132.383, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 16:25:32,187 (trainer:732) INFO: 2epoch:train:23388-25186batch: iter_time=2.383e-04, forward_time=0.202, loss_att=147.329, acc=0.874, loss=147.329, backward_time=0.298, grad_norm=108.512, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 16:45:38,351 (trainer:732) INFO: 2epoch:train:25187-26985batch: iter_time=2.401e-04, forward_time=0.202, loss_att=144.091, acc=0.877, loss=144.091, backward_time=0.298, grad_norm=111.022, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.002, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 17:05:45,072 (trainer:732) INFO: 2epoch:train:26986-28784batch: iter_time=2.412e-04, forward_time=0.203, loss_att=142.434, acc=0.878, loss=142.434, backward_time=0.298, grad_norm=111.777, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.002, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 17:25:50,336 (trainer:732) INFO: 2epoch:train:28785-30583batch: iter_time=2.378e-04, forward_time=0.202, loss_att=142.343, acc=0.878, loss=142.343, backward_time=0.298, grad_norm=111.222, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.002, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 17:45:59,589 (trainer:732) INFO: 2epoch:train:30584-32382batch: iter_time=2.388e-04, forward_time=0.203, loss_att=142.135, acc=0.879, loss=142.135, backward_time=0.299, grad_norm=119.605, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 18:06:03,578 (trainer:732) INFO: 2epoch:train:32383-34181batch: iter_time=2.380e-04, forward_time=0.202, loss_att=139.436, acc=0.881, loss=139.436, backward_time=0.297, grad_norm=111.174, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.002, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 18:26:10,084 (trainer:732) INFO: 2epoch:train:34182-35980batch: iter_time=2.369e-04, forward_time=0.202, loss_att=136.603, acc=0.884, loss=136.603, backward_time=0.297, grad_norm=112.719, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 18:34:23,359 (trainer:338) INFO: 2epoch results: [train] iter_time=2.797e-04, forward_time=0.202, loss_att=160.010, acc=0.864, loss=160.010, backward_time=0.298, grad_norm=111.306, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.709, time=6 hours, 46 minutes and 30.3 seconds, total_count=71992, gpu_max_cached_mem_GB=30.176, [valid] loss_att=98.960, acc=0.911, cer=0.111, wer=0.307, loss=98.960, time=4 minutes and 21.1 seconds, total_count=24, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 30.82 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 18:34:26,517 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 18:34:26,519 (trainer:272) INFO: 3/60epoch started. Estimated time to finish: 2 weeks, 2 days and 16 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 18:58:58,329 (trainer:732) INFO: 3epoch:train:1-1799batch: iter_time=0.001, forward_time=0.203, loss_att=134.933, acc=0.885, loss=134.933, backward_time=0.298, grad_norm=114.278, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.002, train_time=3.274 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 19:19:03,391 (trainer:732) INFO: 3epoch:train:1800-3598batch: iter_time=2.499e-04, forward_time=0.202, loss_att=133.121, acc=0.886, loss=133.121, backward_time=0.297, grad_norm=108.296, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.002, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 19:39:08,975 (trainer:732) INFO: 3epoch:train:3599-5397batch: iter_time=2.508e-04, forward_time=0.202, loss_att=132.771, acc=0.887, loss=132.771, backward_time=0.297, grad_norm=112.695, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.002, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 19:59:13,409 (trainer:732) INFO: 3epoch:train:5398-7196batch: iter_time=2.497e-04, forward_time=0.202, loss_att=131.368, acc=0.887, loss=131.368, backward_time=0.297, grad_norm=113.035, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.002, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 20:19:20,487 (trainer:732) INFO: 3epoch:train:7197-8995batch: iter_time=2.423e-04, forward_time=0.203, loss_att=131.914, acc=0.888, loss=131.914, backward_time=0.298, grad_norm=112.261, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.002, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 20:39:25,863 (trainer:732) INFO: 3epoch:train:8996-10794batch: iter_time=2.442e-04, forward_time=0.202, loss_att=129.049, acc=0.890, loss=129.049, backward_time=0.297, grad_norm=109.349, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.002, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 20:59:30,234 (trainer:732) INFO: 3epoch:train:10795-12593batch: iter_time=2.439e-04, forward_time=0.202, loss_att=126.212, acc=0.892, loss=126.212, backward_time=0.297, grad_norm=109.914, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.002, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 21:19:33,357 (trainer:732) INFO: 3epoch:train:12594-14392batch: iter_time=2.462e-04, forward_time=0.202, loss_att=122.989, acc=0.893, loss=122.989, backward_time=0.297, grad_norm=108.018, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 21:39:43,069 (trainer:732) INFO: 3epoch:train:14393-16191batch: iter_time=2.434e-04, forward_time=0.203, loss_att=122.356, acc=0.895, loss=122.356, backward_time=0.299, grad_norm=110.535, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.002, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 21:59:48,911 (trainer:732) INFO: 3epoch:train:16192-17990batch: iter_time=2.395e-04, forward_time=0.203, loss_att=122.106, acc=0.896, loss=122.106, backward_time=0.298, grad_norm=107.436, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.002, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 22:19:55,068 (trainer:732) INFO: 3epoch:train:17991-19789batch: iter_time=2.448e-04, forward_time=0.202, loss_att=117.759, acc=0.899, loss=117.759, backward_time=0.297, grad_norm=106.385, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 22:40:01,849 (trainer:732) INFO: 3epoch:train:19790-21588batch: iter_time=2.476e-04, forward_time=0.203, loss_att=118.235, acc=0.899, loss=118.235, backward_time=0.298, grad_norm=103.886, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 23:00:08,437 (trainer:732) INFO: 3epoch:train:21589-23387batch: iter_time=2.384e-04, forward_time=0.203, loss_att=117.565, acc=0.900, loss=117.565, backward_time=0.298, grad_norm=106.593, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 23:20:18,468 (trainer:732) INFO: 3epoch:train:23388-25186batch: iter_time=2.450e-04, forward_time=0.203, loss_att=115.717, acc=0.902, loss=115.717, backward_time=0.299, grad_norm=110.031, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-03 23:40:25,994 (trainer:732) INFO: 3epoch:train:25187-26985batch: iter_time=2.440e-04, forward_time=0.203, loss_att=113.051, acc=0.903, loss=113.051, backward_time=0.298, grad_norm=104.019, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 00:00:35,192 (trainer:732) INFO: 3epoch:train:26986-28784batch: iter_time=2.441e-04, forward_time=0.203, loss_att=112.375, acc=0.904, loss=112.375, backward_time=0.299, grad_norm=106.707, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 00:20:39,988 (trainer:732) INFO: 3epoch:train:28785-30583batch: iter_time=2.486e-04, forward_time=0.202, loss_att=112.387, acc=0.904, loss=112.387, backward_time=0.297, grad_norm=108.973, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 00:40:41,269 (trainer:732) INFO: 3epoch:train:30584-32382batch: iter_time=2.449e-04, forward_time=0.202, loss_att=108.855, acc=0.905, loss=108.855, backward_time=0.296, grad_norm=106.741, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 01:00:43,252 (trainer:732) INFO: 3epoch:train:32383-34181batch: iter_time=2.452e-04, forward_time=0.202, loss_att=107.073, acc=0.907, loss=107.073, backward_time=0.296, grad_norm=105.724, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 01:20:48,595 (trainer:732) INFO: 3epoch:train:34182-35980batch: iter_time=2.405e-04, forward_time=0.202, loss_att=108.309, acc=0.907, loss=108.309, backward_time=0.297, grad_norm=105.323, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 01:29:17,301 (trainer:338) INFO: 3epoch results: [train] iter_time=2.836e-04, forward_time=0.202, loss_att=120.889, acc=0.896, loss=120.889, backward_time=0.298, grad_norm=108.505, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.710, time=6 hours, 46 minutes and 49.69 seconds, total_count=107988, gpu_max_cached_mem_GB=30.176, [valid] loss_att=77.974, acc=0.930, cer=0.087, wer=0.253, loss=77.974, time=4 minutes and 36.01 seconds, total_count=36, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 25.08 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 01:29:21,213 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 01:29:21,215 (trainer:272) INFO: 4/60epoch started. Estimated time to finish: 2 weeks, 2 days and 10 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 01:54:00,498 (trainer:732) INFO: 4epoch:train:1-1799batch: iter_time=0.001, forward_time=0.203, loss_att=106.711, acc=0.909, loss=106.711, backward_time=0.298, grad_norm=105.838, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=3.290 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 02:14:02,567 (trainer:732) INFO: 4epoch:train:1800-3598batch: iter_time=2.396e-04, forward_time=0.202, loss_att=104.034, acc=0.909, loss=104.034, backward_time=0.296, grad_norm=103.325, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 02:34:05,914 (trainer:732) INFO: 4epoch:train:3599-5397batch: iter_time=2.333e-04, forward_time=0.202, loss_att=103.406, acc=0.911, loss=103.406, backward_time=0.297, grad_norm=103.531, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 02:54:12,456 (trainer:732) INFO: 4epoch:train:5398-7196batch: iter_time=2.325e-04, forward_time=0.203, loss_att=102.744, acc=0.912, loss=102.744, backward_time=0.298, grad_norm=105.799, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 03:14:17,661 (trainer:732) INFO: 4epoch:train:7197-8995batch: iter_time=2.346e-04, forward_time=0.202, loss_att=102.165, acc=0.912, loss=102.165, backward_time=0.297, grad_norm=104.209, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 03:34:23,619 (trainer:732) INFO: 4epoch:train:8996-10794batch: iter_time=2.324e-04, forward_time=0.202, loss_att=102.394, acc=0.913, loss=102.394, backward_time=0.298, grad_norm=102.811, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 03:54:34,484 (trainer:732) INFO: 4epoch:train:10795-12593batch: iter_time=2.279e-04, forward_time=0.203, loss_att=102.429, acc=0.913, loss=102.429, backward_time=0.299, grad_norm=105.252, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.002, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 04:14:45,786 (trainer:732) INFO: 4epoch:train:12594-14392batch: iter_time=2.264e-04, forward_time=0.203, loss_att=100.746, acc=0.914, loss=100.746, backward_time=0.299, grad_norm=105.878, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.693 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 04:34:51,472 (trainer:732) INFO: 4epoch:train:14393-16191batch: iter_time=2.267e-04, forward_time=0.202, loss_att=97.669, acc=0.916, loss=97.669, backward_time=0.298, grad_norm=104.813, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 04:55:00,517 (trainer:732) INFO: 4epoch:train:16192-17990batch: iter_time=2.260e-04, forward_time=0.203, loss_att=99.593, acc=0.915, loss=99.593, backward_time=0.298, grad_norm=106.281, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 05:15:03,825 (trainer:732) INFO: 4epoch:train:17991-19789batch: iter_time=2.279e-04, forward_time=0.202, loss_att=97.983, acc=0.915, loss=97.983, backward_time=0.297, grad_norm=104.101, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.002, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 05:35:09,741 (trainer:732) INFO: 4epoch:train:19790-21588batch: iter_time=2.268e-04, forward_time=0.202, loss_att=97.601, acc=0.916, loss=97.601, backward_time=0.298, grad_norm=104.181, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.002, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 05:55:13,920 (trainer:732) INFO: 4epoch:train:21589-23387batch: iter_time=2.293e-04, forward_time=0.202, loss_att=96.626, acc=0.917, loss=96.626, backward_time=0.297, grad_norm=101.403, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 06:15:19,554 (trainer:732) INFO: 4epoch:train:23388-25186batch: iter_time=2.307e-04, forward_time=0.202, loss_att=96.133, acc=0.917, loss=96.133, backward_time=0.298, grad_norm=99.753, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.002, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 06:35:21,880 (trainer:732) INFO: 4epoch:train:25187-26985batch: iter_time=2.258e-04, forward_time=0.202, loss_att=94.156, acc=0.918, loss=94.156, backward_time=0.297, grad_norm=99.817, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.002, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 06:55:28,419 (trainer:732) INFO: 4epoch:train:26986-28784batch: iter_time=2.252e-04, forward_time=0.203, loss_att=95.411, acc=0.918, loss=95.411, backward_time=0.298, grad_norm=104.313, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 07:15:34,648 (trainer:732) INFO: 4epoch:train:28785-30583batch: iter_time=2.228e-04, forward_time=0.202, loss_att=94.434, acc=0.919, loss=94.434, backward_time=0.298, grad_norm=105.781, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.002, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 07:35:37,540 (trainer:732) INFO: 4epoch:train:30584-32382batch: iter_time=2.204e-04, forward_time=0.202, loss_att=92.667, acc=0.920, loss=92.667, backward_time=0.297, grad_norm=103.391, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.002, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 07:55:41,061 (trainer:732) INFO: 4epoch:train:32383-34181batch: iter_time=2.260e-04, forward_time=0.202, loss_att=93.365, acc=0.920, loss=93.365, backward_time=0.297, grad_norm=103.940, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.002, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 08:15:47,795 (trainer:732) INFO: 4epoch:train:34182-35980batch: iter_time=2.171e-04, forward_time=0.203, loss_att=92.966, acc=0.920, loss=92.966, backward_time=0.298, grad_norm=103.532, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 08:23:59,361 (trainer:338) INFO: 4epoch results: [train] iter_time=2.693e-04, forward_time=0.202, loss_att=98.648, acc=0.915, loss=98.648, backward_time=0.298, grad_norm=103.889, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.002, train_time=2.711, time=6 hours, 46 minutes and 47.59 seconds, total_count=143984, gpu_max_cached_mem_GB=30.176, [valid] loss_att=67.858, acc=0.939, cer=0.077, wer=0.222, loss=67.858, time=4 minutes and 20.9 seconds, total_count=48, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 29.66 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 08:24:02,765 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 08:24:02,768 (trainer:272) INFO: 5/60epoch started. Estimated time to finish: 2 weeks, 2 days and 3 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 08:48:34,238 (trainer:732) INFO: 5epoch:train:1-1799batch: iter_time=0.001, forward_time=0.203, loss_att=92.333, acc=0.921, loss=92.333, backward_time=0.298, grad_norm=103.044, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=3.273 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 09:08:37,246 (trainer:732) INFO: 5epoch:train:1800-3598batch: iter_time=2.289e-04, forward_time=0.202, loss_att=90.460, acc=0.922, loss=90.460, backward_time=0.297, grad_norm=106.115, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 09:28:39,781 (trainer:732) INFO: 5epoch:train:3599-5397batch: iter_time=2.259e-04, forward_time=0.202, loss_att=89.615, acc=0.922, loss=89.615, backward_time=0.297, grad_norm=101.601, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 09:48:46,341 (trainer:732) INFO: 5epoch:train:5398-7196batch: iter_time=2.302e-04, forward_time=0.203, loss_att=90.059, acc=0.923, loss=90.059, backward_time=0.298, grad_norm=105.201, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 10:08:52,260 (trainer:732) INFO: 5epoch:train:7197-8995batch: iter_time=2.274e-04, forward_time=0.203, loss_att=88.682, acc=0.924, loss=88.682, backward_time=0.298, grad_norm=103.024, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 10:29:00,012 (trainer:732) INFO: 5epoch:train:8996-10794batch: iter_time=2.288e-04, forward_time=0.203, loss_att=89.200, acc=0.923, loss=89.200, backward_time=0.298, grad_norm=101.427, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 10:49:02,938 (trainer:732) INFO: 5epoch:train:10795-12593batch: iter_time=2.234e-04, forward_time=0.202, loss_att=86.842, acc=0.924, loss=86.842, backward_time=0.297, grad_norm=102.963, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=0.001, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 11:09:06,061 (trainer:732) INFO: 5epoch:train:12594-14392batch: iter_time=2.258e-04, forward_time=0.202, loss_att=88.507, acc=0.923, loss=88.507, backward_time=0.297, grad_norm=99.919, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 11:29:14,599 (trainer:732) INFO: 5epoch:train:14393-16191batch: iter_time=2.284e-04, forward_time=0.203, loss_att=88.798, acc=0.925, loss=88.798, backward_time=0.298, grad_norm=102.413, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 11:49:21,636 (trainer:732) INFO: 5epoch:train:16192-17990batch: iter_time=2.267e-04, forward_time=0.203, loss_att=87.057, acc=0.926, loss=87.057, backward_time=0.298, grad_norm=103.390, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 12:09:29,560 (trainer:732) INFO: 5epoch:train:17991-19789batch: iter_time=2.289e-04, forward_time=0.203, loss_att=87.421, acc=0.925, loss=87.421, backward_time=0.298, grad_norm=103.305, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 12:29:32,166 (trainer:732) INFO: 5epoch:train:19790-21588batch: iter_time=2.359e-04, forward_time=0.202, loss_att=85.716, acc=0.925, loss=85.716, backward_time=0.297, grad_norm=98.140, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 12:49:39,593 (trainer:732) INFO: 5epoch:train:21589-23387batch: iter_time=2.368e-04, forward_time=0.203, loss_att=86.209, acc=0.926, loss=86.209, backward_time=0.298, grad_norm=98.921, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 13:09:46,396 (trainer:732) INFO: 5epoch:train:23388-25186batch: iter_time=2.332e-04, forward_time=0.203, loss_att=86.460, acc=0.926, loss=86.460, backward_time=0.298, grad_norm=97.492, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 13:29:53,252 (trainer:732) INFO: 5epoch:train:25187-26985batch: iter_time=2.402e-04, forward_time=0.203, loss_att=85.025, acc=0.927, loss=85.025, backward_time=0.298, grad_norm=103.244, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 13:49:56,554 (trainer:732) INFO: 5epoch:train:26986-28784batch: iter_time=2.460e-04, forward_time=0.202, loss_att=84.506, acc=0.926, loss=84.506, backward_time=0.297, grad_norm=97.449, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 14:10:04,140 (trainer:732) INFO: 5epoch:train:28785-30583batch: iter_time=2.416e-04, forward_time=0.203, loss_att=84.068, acc=0.927, loss=84.068, backward_time=0.298, grad_norm=101.177, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 14:31:26,886 (trainer:732) INFO: 5epoch:train:30584-32382batch: iter_time=5.461e-04, forward_time=0.219, loss_att=84.799, acc=0.927, loss=84.799, backward_time=0.302, grad_norm=100.245, clip=100.000, loss_scale=1.000, optim_step_time=0.095, optim0_lr0=0.001, train_time=2.851 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 14:53:04,011 (trainer:732) INFO: 5epoch:train:32383-34181batch: iter_time=6.227e-04, forward_time=0.224, loss_att=84.841, acc=0.927, loss=84.841, backward_time=0.304, grad_norm=97.882, clip=100.000, loss_scale=1.000, optim_step_time=0.099, optim0_lr0=0.001, train_time=2.883 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 15:14:32,504 (trainer:732) INFO: 5epoch:train:34182-35980batch: iter_time=5.642e-04, forward_time=0.222, loss_att=84.120, acc=0.928, loss=84.120, backward_time=0.303, grad_norm=99.826, clip=100.000, loss_scale=1.000, optim_step_time=0.097, optim0_lr0=0.001, train_time=2.864 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 15:22:59,523 (trainer:338) INFO: 5epoch results: [train] iter_time=3.231e-04, forward_time=0.205, loss_att=87.224, acc=0.925, loss=87.224, backward_time=0.298, grad_norm=101.325, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=0.001, train_time=2.738, time=6 hours, 50 minutes and 48.42 seconds, total_count=179980, gpu_max_cached_mem_GB=30.176, [valid] loss_att=61.610, acc=0.945, cer=0.071, wer=0.206, loss=61.610, time=4 minutes and 34.81 seconds, total_count=60, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 33.52 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 15:23:03,101 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 15:23:03,104 (trainer:272) INFO: 6/60epoch started. Estimated time to finish: 2 weeks, 1 day and 20 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 15:49:05,080 (trainer:732) INFO: 6epoch:train:1-1799batch: iter_time=0.001, forward_time=0.220, loss_att=83.897, acc=0.928, loss=83.897, backward_time=0.303, grad_norm=102.023, clip=100.000, loss_scale=1.000, optim_step_time=0.098, optim0_lr0=0.001, train_time=3.473 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 16:10:16,634 (trainer:732) INFO: 6epoch:train:1800-3598batch: iter_time=5.303e-04, forward_time=0.218, loss_att=81.881, acc=0.929, loss=81.881, backward_time=0.302, grad_norm=103.896, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.827 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 16:31:30,596 (trainer:732) INFO: 6epoch:train:3599-5397batch: iter_time=5.138e-04, forward_time=0.217, loss_att=82.313, acc=0.930, loss=82.313, backward_time=0.303, grad_norm=105.396, clip=100.000, loss_scale=1.000, optim_step_time=0.088, optim0_lr0=0.001, train_time=2.832 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 16:52:35,791 (trainer:732) INFO: 6epoch:train:5398-7196batch: iter_time=4.836e-04, forward_time=0.216, loss_att=82.681, acc=0.929, loss=82.681, backward_time=0.302, grad_norm=102.913, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.812 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 17:13:59,790 (trainer:732) INFO: 6epoch:train:7197-8995batch: iter_time=5.790e-04, forward_time=0.218, loss_att=81.668, acc=0.929, loss=81.668, backward_time=0.302, grad_norm=98.175, clip=100.000, loss_scale=1.000, optim_step_time=0.099, optim0_lr0=0.001, train_time=2.855 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 17:35:04,389 (trainer:732) INFO: 6epoch:train:8996-10794batch: iter_time=4.652e-04, forward_time=0.215, loss_att=80.951, acc=0.930, loss=80.951, backward_time=0.300, grad_norm=97.234, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=2.810 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 17:56:15,753 (trainer:732) INFO: 6epoch:train:10795-12593batch: iter_time=5.342e-04, forward_time=0.217, loss_att=81.342, acc=0.930, loss=81.342, backward_time=0.302, grad_norm=100.649, clip=100.000, loss_scale=1.000, optim_step_time=0.089, optim0_lr0=0.001, train_time=2.826 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 18:17:27,523 (trainer:732) INFO: 6epoch:train:12594-14392batch: iter_time=5.538e-04, forward_time=0.218, loss_att=80.216, acc=0.931, loss=80.216, backward_time=0.301, grad_norm=98.392, clip=100.000, loss_scale=1.000, optim_step_time=0.092, optim0_lr0=0.001, train_time=2.827 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 18:37:35,469 (trainer:732) INFO: 6epoch:train:14393-16191batch: iter_time=2.389e-04, forward_time=0.203, loss_att=79.667, acc=0.931, loss=79.667, backward_time=0.298, grad_norm=95.680, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 18:57:43,118 (trainer:732) INFO: 6epoch:train:16192-17990batch: iter_time=2.450e-04, forward_time=0.203, loss_att=79.291, acc=0.932, loss=79.291, backward_time=0.298, grad_norm=95.524, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 19:17:49,440 (trainer:732) INFO: 6epoch:train:17991-19789batch: iter_time=2.395e-04, forward_time=0.203, loss_att=80.181, acc=0.931, loss=80.181, backward_time=0.298, grad_norm=100.609, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 19:37:56,966 (trainer:732) INFO: 6epoch:train:19790-21588batch: iter_time=2.430e-04, forward_time=0.203, loss_att=78.661, acc=0.932, loss=78.661, backward_time=0.298, grad_norm=99.540, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 19:58:03,424 (trainer:732) INFO: 6epoch:train:21589-23387batch: iter_time=2.363e-04, forward_time=0.203, loss_att=79.905, acc=0.931, loss=79.905, backward_time=0.298, grad_norm=93.814, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 20:18:07,232 (trainer:732) INFO: 6epoch:train:23388-25186batch: iter_time=2.405e-04, forward_time=0.203, loss_att=78.785, acc=0.932, loss=78.785, backward_time=0.297, grad_norm=95.391, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 20:38:14,667 (trainer:732) INFO: 6epoch:train:25187-26985batch: iter_time=2.394e-04, forward_time=0.203, loss_att=78.830, acc=0.932, loss=78.830, backward_time=0.298, grad_norm=95.597, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 20:58:18,707 (trainer:732) INFO: 6epoch:train:26986-28784batch: iter_time=2.364e-04, forward_time=0.202, loss_att=78.715, acc=0.932, loss=78.715, backward_time=0.297, grad_norm=98.856, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 21:18:26,753 (trainer:732) INFO: 6epoch:train:28785-30583batch: iter_time=2.632e-04, forward_time=0.203, loss_att=78.147, acc=0.932, loss=78.147, backward_time=0.297, grad_norm=98.498, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 21:38:37,436 (trainer:732) INFO: 6epoch:train:30584-32382batch: iter_time=2.585e-04, forward_time=0.203, loss_att=78.128, acc=0.933, loss=78.128, backward_time=0.298, grad_norm=93.640, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 21:58:47,406 (trainer:732) INFO: 6epoch:train:32383-34181batch: iter_time=2.590e-04, forward_time=0.203, loss_att=77.355, acc=0.933, loss=77.355, backward_time=0.298, grad_norm=96.237, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=0.001, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 22:18:51,753 (trainer:732) INFO: 6epoch:train:34182-35980batch: iter_time=2.408e-04, forward_time=0.202, loss_att=77.082, acc=0.933, loss=77.082, backward_time=0.297, grad_norm=95.812, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 22:27:26,115 (trainer:338) INFO: 6epoch results: [train] iter_time=3.866e-04, forward_time=0.209, loss_att=79.978, acc=0.931, loss=79.978, backward_time=0.299, grad_norm=98.397, clip=100.000, loss_scale=1.000, optim_step_time=0.071, optim0_lr0=0.001, train_time=2.773, time=6 hours, 56 minutes and 16.58 seconds, total_count=215976, gpu_max_cached_mem_GB=30.176, [valid] loss_att=57.384, acc=0.949, cer=0.064, wer=0.191, loss=57.384, time=4 minutes and 34.79 seconds, total_count=72, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 31.64 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 22:27:29,902 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 22:27:29,906 (trainer:272) INFO: 7/60epoch started. Estimated time to finish: 2 weeks, 1 day and 15 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 22:52:06,228 (trainer:732) INFO: 7epoch:train:1-1799batch: iter_time=9.661e-04, forward_time=0.203, loss_att=76.251, acc=0.934, loss=76.251, backward_time=0.297, grad_norm=97.175, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=3.283 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 23:12:11,737 (trainer:732) INFO: 7epoch:train:1800-3598batch: iter_time=2.347e-04, forward_time=0.202, loss_att=75.304, acc=0.934, loss=75.304, backward_time=0.297, grad_norm=100.216, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 23:32:17,812 (trainer:732) INFO: 7epoch:train:3599-5397batch: iter_time=2.252e-04, forward_time=0.202, loss_att=76.656, acc=0.934, loss=76.656, backward_time=0.297, grad_norm=97.903, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-04 23:52:19,963 (trainer:732) INFO: 7epoch:train:5398-7196batch: iter_time=2.264e-04, forward_time=0.201, loss_att=75.265, acc=0.934, loss=75.265, backward_time=0.296, grad_norm=97.236, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 00:12:25,134 (trainer:732) INFO: 7epoch:train:7197-8995batch: iter_time=2.300e-04, forward_time=0.202, loss_att=75.986, acc=0.935, loss=75.986, backward_time=0.297, grad_norm=96.110, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 00:32:27,965 (trainer:732) INFO: 7epoch:train:8996-10794batch: iter_time=2.276e-04, forward_time=0.202, loss_att=75.413, acc=0.935, loss=75.413, backward_time=0.297, grad_norm=95.468, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=0.001, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 00:52:34,886 (trainer:732) INFO: 7epoch:train:10795-12593batch: iter_time=2.270e-04, forward_time=0.202, loss_att=76.504, acc=0.934, loss=76.504, backward_time=0.298, grad_norm=98.025, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 01:12:41,012 (trainer:732) INFO: 7epoch:train:12594-14392batch: iter_time=2.260e-04, forward_time=0.202, loss_att=75.635, acc=0.935, loss=75.635, backward_time=0.297, grad_norm=97.256, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 01:32:51,371 (trainer:732) INFO: 7epoch:train:14393-16191batch: iter_time=2.233e-04, forward_time=0.203, loss_att=75.594, acc=0.935, loss=75.594, backward_time=0.299, grad_norm=96.396, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 01:52:55,490 (trainer:732) INFO: 7epoch:train:16192-17990batch: iter_time=2.244e-04, forward_time=0.202, loss_att=73.881, acc=0.936, loss=73.881, backward_time=0.297, grad_norm=95.663, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 02:13:03,856 (trainer:732) INFO: 7epoch:train:17991-19789batch: iter_time=2.268e-04, forward_time=0.203, loss_att=75.110, acc=0.936, loss=75.110, backward_time=0.298, grad_norm=96.937, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 02:33:06,440 (trainer:732) INFO: 7epoch:train:19790-21588batch: iter_time=2.236e-04, forward_time=0.202, loss_att=74.247, acc=0.936, loss=74.247, backward_time=0.296, grad_norm=99.635, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 02:53:07,363 (trainer:732) INFO: 7epoch:train:21589-23387batch: iter_time=2.325e-04, forward_time=0.202, loss_att=74.868, acc=0.935, loss=74.868, backward_time=0.296, grad_norm=94.358, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 03:13:15,200 (trainer:732) INFO: 7epoch:train:23388-25186batch: iter_time=2.232e-04, forward_time=0.202, loss_att=75.018, acc=0.936, loss=75.018, backward_time=0.298, grad_norm=96.603, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 03:33:21,293 (trainer:732) INFO: 7epoch:train:25187-26985batch: iter_time=2.210e-04, forward_time=0.202, loss_att=73.307, acc=0.937, loss=73.307, backward_time=0.298, grad_norm=98.173, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 03:53:25,382 (trainer:732) INFO: 7epoch:train:26986-28784batch: iter_time=2.165e-04, forward_time=0.202, loss_att=73.757, acc=0.936, loss=73.757, backward_time=0.297, grad_norm=94.716, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 04:13:31,350 (trainer:732) INFO: 7epoch:train:28785-30583batch: iter_time=2.210e-04, forward_time=0.202, loss_att=73.914, acc=0.936, loss=73.914, backward_time=0.298, grad_norm=96.193, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 04:33:39,440 (trainer:732) INFO: 7epoch:train:30584-32382batch: iter_time=2.210e-04, forward_time=0.203, loss_att=72.449, acc=0.937, loss=72.449, backward_time=0.298, grad_norm=96.838, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 04:53:48,768 (trainer:732) INFO: 7epoch:train:32383-34181batch: iter_time=2.197e-04, forward_time=0.203, loss_att=74.314, acc=0.937, loss=74.314, backward_time=0.299, grad_norm=98.137, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 05:13:53,778 (trainer:732) INFO: 7epoch:train:34182-35980batch: iter_time=2.203e-04, forward_time=0.202, loss_att=73.559, acc=0.937, loss=73.559, backward_time=0.297, grad_norm=98.780, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 05:22:17,058 (trainer:338) INFO: 7epoch results: [train] iter_time=2.618e-04, forward_time=0.202, loss_att=74.852, acc=0.935, loss=74.852, backward_time=0.297, grad_norm=97.098, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.710, time=6 hours, 46 minutes and 46.92 seconds, total_count=251972, gpu_max_cached_mem_GB=30.176, [valid] loss_att=53.876, acc=0.952, cer=0.061, wer=0.181, loss=53.876, time=4 minutes and 28.77 seconds, total_count=84, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 31.46 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 05:22:20,863 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 05:22:20,866 (trainer:272) INFO: 8/60epoch started. Estimated time to finish: 2 weeks, 1 day and 8 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 05:46:53,463 (trainer:732) INFO: 8epoch:train:1-1799batch: iter_time=0.001, forward_time=0.203, loss_att=72.285, acc=0.938, loss=72.285, backward_time=0.298, grad_norm=96.813, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=3.275 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 06:06:56,842 (trainer:732) INFO: 8epoch:train:1800-3598batch: iter_time=2.239e-04, forward_time=0.202, loss_att=71.288, acc=0.938, loss=71.288, backward_time=0.297, grad_norm=94.432, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 06:27:01,360 (trainer:732) INFO: 8epoch:train:3599-5397batch: iter_time=2.197e-04, forward_time=0.202, loss_att=72.018, acc=0.938, loss=72.018, backward_time=0.297, grad_norm=95.992, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 06:47:06,924 (trainer:732) INFO: 8epoch:train:5398-7196batch: iter_time=2.210e-04, forward_time=0.202, loss_att=70.716, acc=0.939, loss=70.716, backward_time=0.297, grad_norm=101.215, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 07:07:12,300 (trainer:732) INFO: 8epoch:train:7197-8995batch: iter_time=2.173e-04, forward_time=0.202, loss_att=71.718, acc=0.938, loss=71.718, backward_time=0.297, grad_norm=99.608, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=0.001, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 07:27:14,926 (trainer:732) INFO: 8epoch:train:8996-10794batch: iter_time=2.166e-04, forward_time=0.202, loss_att=71.373, acc=0.938, loss=71.373, backward_time=0.297, grad_norm=98.304, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 07:47:16,649 (trainer:732) INFO: 8epoch:train:10795-12593batch: iter_time=2.162e-04, forward_time=0.202, loss_att=72.305, acc=0.938, loss=72.305, backward_time=0.296, grad_norm=100.904, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 08:07:23,042 (trainer:732) INFO: 8epoch:train:12594-14392batch: iter_time=2.176e-04, forward_time=0.202, loss_att=72.231, acc=0.938, loss=72.231, backward_time=0.298, grad_norm=98.008, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=0.001, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 08:27:29,264 (trainer:732) INFO: 8epoch:train:14393-16191batch: iter_time=2.166e-04, forward_time=0.202, loss_att=72.242, acc=0.938, loss=72.242, backward_time=0.297, grad_norm=98.902, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=0.001, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 08:47:35,358 (trainer:732) INFO: 8epoch:train:16192-17990batch: iter_time=2.162e-04, forward_time=0.202, loss_att=71.069, acc=0.939, loss=71.069, backward_time=0.298, grad_norm=96.064, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=0.001, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 09:07:40,532 (trainer:732) INFO: 8epoch:train:17991-19789batch: iter_time=2.128e-04, forward_time=0.202, loss_att=72.290, acc=0.938, loss=72.290, backward_time=0.297, grad_norm=98.206, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 09:27:45,490 (trainer:732) INFO: 8epoch:train:19790-21588batch: iter_time=2.150e-04, forward_time=0.202, loss_att=71.668, acc=0.938, loss=71.668, backward_time=0.298, grad_norm=98.234, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 09:47:51,921 (trainer:732) INFO: 8epoch:train:21589-23387batch: iter_time=2.168e-04, forward_time=0.202, loss_att=70.087, acc=0.939, loss=70.087, backward_time=0.298, grad_norm=97.070, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=0.001, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 10:07:58,597 (trainer:732) INFO: 8epoch:train:23388-25186batch: iter_time=2.184e-04, forward_time=0.202, loss_att=71.617, acc=0.939, loss=71.617, backward_time=0.298, grad_norm=94.351, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 10:28:02,644 (trainer:732) INFO: 8epoch:train:25187-26985batch: iter_time=2.167e-04, forward_time=0.202, loss_att=70.448, acc=0.939, loss=70.448, backward_time=0.297, grad_norm=101.585, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 10:48:05,885 (trainer:732) INFO: 8epoch:train:26986-28784batch: iter_time=2.125e-04, forward_time=0.202, loss_att=70.280, acc=0.939, loss=70.280, backward_time=0.297, grad_norm=95.339, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 11:08:10,163 (trainer:732) INFO: 8epoch:train:28785-30583batch: iter_time=2.125e-04, forward_time=0.202, loss_att=69.613, acc=0.939, loss=69.613, backward_time=0.297, grad_norm=99.267, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=0.001, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 11:28:56,992 (trainer:732) INFO: 8epoch:train:30584-32382batch: iter_time=2.825e-04, forward_time=0.210, loss_att=70.122, acc=0.939, loss=70.122, backward_time=0.300, grad_norm=97.507, clip=100.000, loss_scale=1.000, optim_step_time=0.070, optim0_lr0=0.001, train_time=2.771 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 11:50:54,743 (trainer:732) INFO: 8epoch:train:32383-34181batch: iter_time=2.946e-04, forward_time=0.222, loss_att=70.095, acc=0.940, loss=70.095, backward_time=0.303, grad_norm=97.328, clip=100.000, loss_scale=1.000, optim_step_time=0.086, optim0_lr0=0.001, train_time=2.929 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<42947> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<43057> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<56267> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<44362> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<61450> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<61506> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<24124> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<24239> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<19703> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<19859> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<59892> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<48520> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<51344> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<51346> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<59182> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<38053> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<29078> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<29254> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<33616> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<44677> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<19103> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<44971> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<34111> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<34112> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<52842> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<35693> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<16364> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<45144> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<23195> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<23183> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<58230> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.241<30015> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 153) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 12:12:53,754 (trainer:732) INFO: 8epoch:train:34182-35980batch: iter_time=2.857e-04, forward_time=0.222, loss_att=69.683, acc=0.940, loss=69.683, backward_time=0.303, grad_norm=96.461, clip=100.000, loss_scale=1.000, optim_step_time=0.085, optim0_lr0=0.001, train_time=2.932 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 12:22:33,585 (trainer:338) INFO: 8epoch results: [train] iter_time=2.731e-04, forward_time=0.205, loss_att=71.158, acc=0.939, loss=71.158, backward_time=0.298, grad_norm=97.782, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=0.001, train_time=2.738, time=6 hours, 50 minutes and 53.38 seconds, total_count=287968, gpu_max_cached_mem_GB=30.176, [valid] loss_att=52.225, acc=0.953, cer=0.058, wer=0.176, loss=52.225, time=4 minutes and 33.7 seconds, total_count=96, gpu_max_cached_mem_GB=30.176, [att_plot] time=4 minutes and 45.63 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 12:22:39,541 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 12:22:39,546 (trainer:272) INFO: 9/60epoch started. Estimated time to finish: 2 weeks, 1 day and 1 hour +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 12:49:30,535 (trainer:732) INFO: 9epoch:train:1-1799batch: iter_time=0.001, forward_time=0.216, loss_att=68.810, acc=0.941, loss=68.810, backward_time=0.301, grad_norm=95.660, clip=100.000, loss_scale=1.000, optim_step_time=0.079, optim0_lr0=0.001, train_time=3.583 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 13:10:40,293 (trainer:732) INFO: 9epoch:train:1800-3598batch: iter_time=3.696e-04, forward_time=0.214, loss_att=68.912, acc=0.940, loss=68.912, backward_time=0.300, grad_norm=98.032, clip=100.000, loss_scale=1.000, optim_step_time=0.082, optim0_lr0=0.001, train_time=2.823 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 13:31:49,300 (trainer:732) INFO: 9epoch:train:3599-5397batch: iter_time=3.187e-04, forward_time=0.214, loss_att=68.718, acc=0.941, loss=68.718, backward_time=0.301, grad_norm=102.774, clip=100.000, loss_scale=1.000, optim_step_time=0.080, optim0_lr0=0.001, train_time=2.821 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 13:52:58,863 (trainer:732) INFO: 9epoch:train:5398-7196batch: iter_time=3.033e-04, forward_time=0.214, loss_att=68.136, acc=0.941, loss=68.136, backward_time=0.301, grad_norm=98.016, clip=100.000, loss_scale=1.000, optim_step_time=0.079, optim0_lr0=0.001, train_time=2.822 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 14:14:07,836 (trainer:732) INFO: 9epoch:train:7197-8995batch: iter_time=3.335e-04, forward_time=0.214, loss_att=68.276, acc=0.941, loss=68.276, backward_time=0.300, grad_norm=98.462, clip=100.000, loss_scale=1.000, optim_step_time=0.079, optim0_lr0=0.001, train_time=2.821 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 14:35:18,513 (trainer:732) INFO: 9epoch:train:8996-10794batch: iter_time=3.230e-04, forward_time=0.215, loss_att=70.140, acc=0.940, loss=70.140, backward_time=0.302, grad_norm=99.834, clip=100.000, loss_scale=1.000, optim_step_time=0.078, optim0_lr0=0.001, train_time=2.825 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 14:56:23,301 (trainer:732) INFO: 9epoch:train:10795-12593batch: iter_time=3.472e-04, forward_time=0.214, loss_att=67.763, acc=0.941, loss=67.763, backward_time=0.299, grad_norm=96.866, clip=100.000, loss_scale=1.000, optim_step_time=0.082, optim0_lr0=0.001, train_time=2.812 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 15:17:42,362 (trainer:732) INFO: 9epoch:train:12594-14392batch: iter_time=3.273e-04, forward_time=0.216, loss_att=68.742, acc=0.941, loss=68.742, backward_time=0.302, grad_norm=98.158, clip=100.000, loss_scale=1.000, optim_step_time=0.082, optim0_lr0=0.001, train_time=2.843 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 15:38:57,173 (trainer:732) INFO: 9epoch:train:14393-16191batch: iter_time=3.404e-04, forward_time=0.215, loss_att=68.866, acc=0.941, loss=68.866, backward_time=0.301, grad_norm=95.362, clip=100.000, loss_scale=1.000, optim_step_time=0.083, optim0_lr0=0.001, train_time=2.834 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 16:00:02,615 (trainer:732) INFO: 9epoch:train:16192-17990batch: iter_time=3.256e-04, forward_time=0.214, loss_att=67.791, acc=0.941, loss=67.791, backward_time=0.300, grad_norm=92.944, clip=100.000, loss_scale=1.000, optim_step_time=0.081, optim0_lr0=0.001, train_time=2.813 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 16:21:12,868 (trainer:732) INFO: 9epoch:train:17991-19789batch: iter_time=3.123e-04, forward_time=0.214, loss_att=68.469, acc=0.941, loss=68.469, backward_time=0.301, grad_norm=98.401, clip=100.000, loss_scale=1.000, optim_step_time=0.080, optim0_lr0=0.001, train_time=2.824 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 16:42:32,947 (trainer:732) INFO: 9epoch:train:19790-21588batch: iter_time=3.579e-04, forward_time=0.217, loss_att=67.694, acc=0.941, loss=67.694, backward_time=0.301, grad_norm=97.807, clip=100.000, loss_scale=1.000, optim_step_time=0.084, optim0_lr0=0.001, train_time=2.846 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 17:03:50,444 (trainer:732) INFO: 9epoch:train:21589-23387batch: iter_time=3.531e-04, forward_time=0.216, loss_att=68.270, acc=0.941, loss=68.270, backward_time=0.302, grad_norm=94.284, clip=100.000, loss_scale=1.000, optim_step_time=0.083, optim0_lr0=0.001, train_time=2.840 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 17:25:01,711 (trainer:732) INFO: 9epoch:train:23388-25186batch: iter_time=3.453e-04, forward_time=0.215, loss_att=68.324, acc=0.941, loss=68.324, backward_time=0.301, grad_norm=99.709, clip=100.000, loss_scale=1.000, optim_step_time=0.082, optim0_lr0=0.001, train_time=2.826 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 17:46:10,492 (trainer:732) INFO: 9epoch:train:25187-26985batch: iter_time=3.288e-04, forward_time=0.215, loss_att=67.194, acc=0.941, loss=67.194, backward_time=0.301, grad_norm=97.340, clip=100.000, loss_scale=1.000, optim_step_time=0.079, optim0_lr0=0.001, train_time=2.821 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 18:07:16,973 (trainer:732) INFO: 9epoch:train:26986-28784batch: iter_time=3.406e-04, forward_time=0.214, loss_att=67.538, acc=0.941, loss=67.538, backward_time=0.301, grad_norm=96.502, clip=100.000, loss_scale=1.000, optim_step_time=0.078, optim0_lr0=0.001, train_time=2.816 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 18:28:28,833 (trainer:732) INFO: 9epoch:train:28785-30583batch: iter_time=3.177e-04, forward_time=0.214, loss_att=68.803, acc=0.941, loss=68.803, backward_time=0.301, grad_norm=98.900, clip=100.000, loss_scale=1.000, optim_step_time=0.081, optim0_lr0=0.001, train_time=2.828 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 18:49:38,427 (trainer:732) INFO: 9epoch:train:30584-32382batch: iter_time=3.125e-04, forward_time=0.215, loss_att=67.081, acc=0.942, loss=67.081, backward_time=0.301, grad_norm=93.939, clip=100.000, loss_scale=1.000, optim_step_time=0.080, optim0_lr0=0.001, train_time=2.822 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 19:10:49,790 (trainer:732) INFO: 9epoch:train:32383-34181batch: iter_time=3.156e-04, forward_time=0.215, loss_att=67.816, acc=0.942, loss=67.816, backward_time=0.301, grad_norm=105.546, clip=100.000, loss_scale=1.000, optim_step_time=0.082, optim0_lr0=9.980e-04, train_time=2.827 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 19:32:00,661 (trainer:732) INFO: 9epoch:train:34182-35980batch: iter_time=3.302e-04, forward_time=0.215, loss_att=67.128, acc=0.942, loss=67.128, backward_time=0.302, grad_norm=95.581, clip=100.000, loss_scale=1.000, optim_step_time=0.081, optim0_lr0=9.953e-04, train_time=2.825 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 109) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 109) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 19:41:21,470 (trainer:338) INFO: 9epoch results: [train] iter_time=3.757e-04, forward_time=0.215, loss_att=68.221, acc=0.941, loss=68.221, backward_time=0.301, grad_norm=97.717, clip=100.000, loss_scale=1.000, optim_step_time=0.081, optim0_lr0=0.001, train_time=2.863, time=7 hours, 9 minutes and 44.03 seconds, total_count=323964, gpu_max_cached_mem_GB=30.176, [valid] loss_att=50.760, acc=0.954, cer=0.056, wer=0.170, loss=50.760, time=4 minutes and 31.08 seconds, total_count=108, gpu_max_cached_mem_GB=30.176, [att_plot] time=4 minutes and 26.81 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 19:41:28,617 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 19:41:28,631 (trainer:272) INFO: 10/60epoch started. Estimated time to finish: 2 weeks, 20 hours and 40 minutes + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 20:08:11,977 (trainer:732) INFO: 10epoch:train:1-1799batch: iter_time=0.002, forward_time=0.215, loss_att=65.958, acc=0.943, loss=65.958, backward_time=0.301, grad_norm=100.805, clip=100.000, loss_scale=1.000, optim_step_time=0.082, optim0_lr0=9.925e-04, train_time=3.565 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 20:29:22,000 (trainer:732) INFO: 10epoch:train:1800-3598batch: iter_time=3.448e-04, forward_time=0.215, loss_att=65.937, acc=0.943, loss=65.937, backward_time=0.300, grad_norm=100.290, clip=100.000, loss_scale=1.000, optim_step_time=0.081, optim0_lr0=9.897e-04, train_time=2.824 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 20:50:31,088 (trainer:732) INFO: 10epoch:train:3599-5397batch: iter_time=3.277e-04, forward_time=0.214, loss_att=65.565, acc=0.943, loss=65.565, backward_time=0.300, grad_norm=93.091, clip=100.000, loss_scale=1.000, optim_step_time=0.081, optim0_lr0=9.870e-04, train_time=2.821 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 21:11:42,614 (trainer:732) INFO: 10epoch:train:5398-7196batch: iter_time=3.401e-04, forward_time=0.215, loss_att=66.285, acc=0.943, loss=66.285, backward_time=0.301, grad_norm=96.943, clip=100.000, loss_scale=1.000, optim_step_time=0.080, optim0_lr0=9.843e-04, train_time=2.826 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 21:32:53,313 (trainer:732) INFO: 10epoch:train:7197-8995batch: iter_time=3.267e-04, forward_time=0.215, loss_att=66.436, acc=0.943, loss=66.436, backward_time=0.301, grad_norm=101.532, clip=100.000, loss_scale=1.000, optim_step_time=0.079, optim0_lr0=9.817e-04, train_time=2.825 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 21:54:04,969 (trainer:732) INFO: 10epoch:train:8996-10794batch: iter_time=3.339e-04, forward_time=0.215, loss_att=66.296, acc=0.943, loss=66.296, backward_time=0.301, grad_norm=98.396, clip=100.000, loss_scale=1.000, optim_step_time=0.080, optim0_lr0=9.790e-04, train_time=2.827 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 22:15:17,461 (trainer:732) INFO: 10epoch:train:10795-12593batch: iter_time=3.379e-04, forward_time=0.215, loss_att=65.489, acc=0.944, loss=65.489, backward_time=0.301, grad_norm=97.197, clip=100.000, loss_scale=1.000, optim_step_time=0.082, optim0_lr0=9.764e-04, train_time=2.829 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 22:36:25,950 (trainer:732) INFO: 10epoch:train:12594-14392batch: iter_time=3.267e-04, forward_time=0.215, loss_att=65.625, acc=0.943, loss=65.625, backward_time=0.300, grad_norm=99.583, clip=100.000, loss_scale=1.000, optim_step_time=0.081, optim0_lr0=9.738e-04, train_time=2.820 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 22:57:35,272 (trainer:732) INFO: 10epoch:train:14393-16191batch: iter_time=3.037e-04, forward_time=0.215, loss_att=66.366, acc=0.943, loss=66.366, backward_time=0.301, grad_norm=99.796, clip=100.000, loss_scale=1.000, optim_step_time=0.081, optim0_lr0=9.712e-04, train_time=2.822 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 23:18:46,150 (trainer:732) INFO: 10epoch:train:16192-17990batch: iter_time=3.317e-04, forward_time=0.215, loss_att=65.552, acc=0.943, loss=65.552, backward_time=0.301, grad_norm=99.186, clip=100.000, loss_scale=1.000, optim_step_time=0.082, optim0_lr0=9.686e-04, train_time=2.825 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-05 23:39:52,891 (trainer:732) INFO: 10epoch:train:17991-19789batch: iter_time=3.049e-04, forward_time=0.214, loss_att=66.986, acc=0.942, loss=66.986, backward_time=0.300, grad_norm=99.778, clip=100.000, loss_scale=1.000, optim_step_time=0.080, optim0_lr0=9.661e-04, train_time=2.816 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 00:01:03,116 (trainer:732) INFO: 10epoch:train:19790-21588batch: iter_time=3.209e-04, forward_time=0.215, loss_att=65.327, acc=0.943, loss=65.327, backward_time=0.300, grad_norm=100.914, clip=100.000, loss_scale=1.000, optim_step_time=0.082, optim0_lr0=9.636e-04, train_time=2.823 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 00:22:15,903 (trainer:732) INFO: 10epoch:train:21589-23387batch: iter_time=3.692e-04, forward_time=0.216, loss_att=65.943, acc=0.943, loss=65.943, backward_time=0.302, grad_norm=102.181, clip=100.000, loss_scale=1.000, optim_step_time=0.084, optim0_lr0=9.611e-04, train_time=2.829 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 00:43:22,218 (trainer:732) INFO: 10epoch:train:23388-25186batch: iter_time=3.347e-04, forward_time=0.215, loss_att=64.313, acc=0.944, loss=64.313, backward_time=0.300, grad_norm=98.938, clip=100.000, loss_scale=1.000, optim_step_time=0.080, optim0_lr0=9.586e-04, train_time=2.816 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 01:04:36,380 (trainer:732) INFO: 10epoch:train:25187-26985batch: iter_time=3.600e-04, forward_time=0.216, loss_att=66.605, acc=0.943, loss=66.605, backward_time=0.301, grad_norm=103.345, clip=100.000, loss_scale=1.000, optim_step_time=0.082, optim0_lr0=9.561e-04, train_time=2.832 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 01:25:47,645 (trainer:732) INFO: 10epoch:train:26986-28784batch: iter_time=3.173e-04, forward_time=0.214, loss_att=65.202, acc=0.943, loss=65.202, backward_time=0.301, grad_norm=99.921, clip=100.000, loss_scale=1.000, optim_step_time=0.081, optim0_lr0=9.537e-04, train_time=2.826 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 01:46:53,018 (trainer:732) INFO: 10epoch:train:28785-30583batch: iter_time=3.430e-04, forward_time=0.214, loss_att=65.322, acc=0.943, loss=65.322, backward_time=0.300, grad_norm=97.830, clip=100.000, loss_scale=1.000, optim_step_time=0.081, optim0_lr0=9.512e-04, train_time=2.814 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 02:08:04,369 (trainer:732) INFO: 10epoch:train:30584-32382batch: iter_time=3.563e-04, forward_time=0.215, loss_att=66.719, acc=0.943, loss=66.719, backward_time=0.301, grad_norm=100.522, clip=100.000, loss_scale=1.000, optim_step_time=0.082, optim0_lr0=9.488e-04, train_time=2.826 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 02:29:10,857 (trainer:732) INFO: 10epoch:train:32383-34181batch: iter_time=3.114e-04, forward_time=0.214, loss_att=65.932, acc=0.942, loss=65.932, backward_time=0.300, grad_norm=99.858, clip=100.000, loss_scale=1.000, optim_step_time=0.080, optim0_lr0=9.464e-04, train_time=2.815 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 02:50:17,857 (trainer:732) INFO: 10epoch:train:34182-35980batch: iter_time=3.234e-04, forward_time=0.214, loss_att=65.092, acc=0.944, loss=65.092, backward_time=0.301, grad_norm=102.633, clip=100.000, loss_scale=1.000, optim_step_time=0.079, optim0_lr0=9.441e-04, train_time=2.817 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 02:59:57,226 (trainer:338) INFO: 10epoch results: [train] iter_time=4.028e-04, forward_time=0.215, loss_att=65.847, acc=0.943, loss=65.847, backward_time=0.301, grad_norm=99.626, clip=100.000, loss_scale=1.000, optim_step_time=0.081, optim0_lr0=9.677e-04, train_time=2.860, time=7 hours, 9 minutes and 11.26 seconds, total_count=359960, gpu_max_cached_mem_GB=30.176, [valid] loss_att=49.072, acc=0.956, cer=0.056, wer=0.168, loss=49.072, time=4 minutes and 38.62 seconds, total_count=120, gpu_max_cached_mem_GB=30.176, [att_plot] time=4 minutes and 38.7 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 03:00:04,218 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 03:00:04,226 (trainer:272) INFO: 11/60epoch started. Estimated time to finish: 2 weeks, 15 hours and 15 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 03:26:54,421 (trainer:732) INFO: 11epoch:train:1-1799batch: iter_time=0.001, forward_time=0.214, loss_att=63.049, acc=0.945, loss=63.049, backward_time=0.299, grad_norm=97.489, clip=100.000, loss_scale=1.000, optim_step_time=0.083, optim0_lr0=9.417e-04, train_time=3.581 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 03:48:03,740 (trainer:732) INFO: 11epoch:train:1800-3598batch: iter_time=3.397e-04, forward_time=0.215, loss_att=64.356, acc=0.945, loss=64.356, backward_time=0.300, grad_norm=98.072, clip=100.000, loss_scale=1.000, optim_step_time=0.081, optim0_lr0=9.393e-04, train_time=2.822 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 04:09:11,879 (trainer:732) INFO: 11epoch:train:3599-5397batch: iter_time=3.284e-04, forward_time=0.214, loss_att=64.639, acc=0.944, loss=64.639, backward_time=0.301, grad_norm=99.290, clip=100.000, loss_scale=1.000, optim_step_time=0.079, optim0_lr0=9.370e-04, train_time=2.819 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 04:30:19,868 (trainer:732) INFO: 11epoch:train:5398-7196batch: iter_time=3.291e-04, forward_time=0.214, loss_att=63.733, acc=0.945, loss=63.733, backward_time=0.301, grad_norm=101.774, clip=100.000, loss_scale=1.000, optim_step_time=0.079, optim0_lr0=9.347e-04, train_time=2.818 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 04:51:26,631 (trainer:732) INFO: 11epoch:train:7197-8995batch: iter_time=3.541e-04, forward_time=0.214, loss_att=64.243, acc=0.944, loss=64.243, backward_time=0.300, grad_norm=102.044, clip=100.000, loss_scale=1.000, optim_step_time=0.080, optim0_lr0=9.324e-04, train_time=2.816 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 05:12:37,114 (trainer:732) INFO: 11epoch:train:8996-10794batch: iter_time=3.011e-04, forward_time=0.215, loss_att=65.009, acc=0.944, loss=65.009, backward_time=0.301, grad_norm=96.697, clip=100.000, loss_scale=1.000, optim_step_time=0.079, optim0_lr0=9.302e-04, train_time=2.825 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 05:33:47,819 (trainer:732) INFO: 11epoch:train:10795-12593batch: iter_time=3.350e-04, forward_time=0.215, loss_att=63.943, acc=0.945, loss=63.943, backward_time=0.301, grad_norm=100.694, clip=100.000, loss_scale=1.000, optim_step_time=0.083, optim0_lr0=9.279e-04, train_time=2.825 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 05:54:53,979 (trainer:732) INFO: 11epoch:train:12594-14392batch: iter_time=3.143e-04, forward_time=0.214, loss_att=64.113, acc=0.944, loss=64.113, backward_time=0.300, grad_norm=97.908, clip=100.000, loss_scale=1.000, optim_step_time=0.079, optim0_lr0=9.257e-04, train_time=2.815 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 06:16:02,825 (trainer:732) INFO: 11epoch:train:14393-16191batch: iter_time=3.547e-04, forward_time=0.214, loss_att=63.898, acc=0.945, loss=63.898, backward_time=0.301, grad_norm=105.033, clip=100.000, loss_scale=1.000, optim_step_time=0.080, optim0_lr0=9.234e-04, train_time=2.821 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 06:37:08,043 (trainer:732) INFO: 11epoch:train:16192-17990batch: iter_time=3.438e-04, forward_time=0.214, loss_att=63.647, acc=0.944, loss=63.647, backward_time=0.300, grad_norm=96.529, clip=100.000, loss_scale=1.000, optim_step_time=0.080, optim0_lr0=9.212e-04, train_time=2.813 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 06:58:13,641 (trainer:732) INFO: 11epoch:train:17991-19789batch: iter_time=3.113e-04, forward_time=0.213, loss_att=63.800, acc=0.945, loss=63.800, backward_time=0.301, grad_norm=98.597, clip=100.000, loss_scale=1.000, optim_step_time=0.079, optim0_lr0=9.191e-04, train_time=2.813 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 07:19:21,903 (trainer:732) INFO: 11epoch:train:19790-21588batch: iter_time=3.227e-04, forward_time=0.214, loss_att=63.397, acc=0.945, loss=63.397, backward_time=0.301, grad_norm=99.539, clip=100.000, loss_scale=1.000, optim_step_time=0.079, optim0_lr0=9.169e-04, train_time=2.819 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 07:40:34,641 (trainer:732) INFO: 11epoch:train:21589-23387batch: iter_time=3.512e-04, forward_time=0.215, loss_att=65.467, acc=0.944, loss=65.467, backward_time=0.301, grad_norm=105.135, clip=100.000, loss_scale=1.000, optim_step_time=0.082, optim0_lr0=9.147e-04, train_time=2.830 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 08:01:42,110 (trainer:732) INFO: 11epoch:train:23388-25186batch: iter_time=3.064e-04, forward_time=0.214, loss_att=63.562, acc=0.945, loss=63.562, backward_time=0.300, grad_norm=96.730, clip=100.000, loss_scale=1.000, optim_step_time=0.080, optim0_lr0=9.126e-04, train_time=2.817 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 08:22:46,681 (trainer:732) INFO: 11epoch:train:25187-26985batch: iter_time=3.271e-04, forward_time=0.214, loss_att=62.925, acc=0.945, loss=62.925, backward_time=0.300, grad_norm=96.238, clip=100.000, loss_scale=1.000, optim_step_time=0.079, optim0_lr0=9.104e-04, train_time=2.811 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 08:43:59,849 (trainer:732) INFO: 11epoch:train:26986-28784batch: iter_time=3.367e-04, forward_time=0.215, loss_att=63.607, acc=0.945, loss=63.607, backward_time=0.301, grad_norm=98.439, clip=100.000, loss_scale=1.000, optim_step_time=0.081, optim0_lr0=9.083e-04, train_time=2.830 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 09:05:10,729 (trainer:732) INFO: 11epoch:train:28785-30583batch: iter_time=3.200e-04, forward_time=0.215, loss_att=63.341, acc=0.946, loss=63.341, backward_time=0.302, grad_norm=98.488, clip=100.000, loss_scale=1.000, optim_step_time=0.078, optim0_lr0=9.062e-04, train_time=2.826 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 09:26:20,531 (trainer:732) INFO: 11epoch:train:30584-32382batch: iter_time=3.001e-04, forward_time=0.215, loss_att=63.647, acc=0.945, loss=63.647, backward_time=0.301, grad_norm=95.652, clip=100.000, loss_scale=1.000, optim_step_time=0.079, optim0_lr0=9.041e-04, train_time=2.823 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 09:47:28,597 (trainer:732) INFO: 11epoch:train:32383-34181batch: iter_time=3.164e-04, forward_time=0.214, loss_att=62.962, acc=0.946, loss=62.962, backward_time=0.300, grad_norm=95.211, clip=100.000, loss_scale=1.000, optim_step_time=0.080, optim0_lr0=9.021e-04, train_time=2.819 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 10:08:39,537 (trainer:732) INFO: 11epoch:train:34182-35980batch: iter_time=3.191e-04, forward_time=0.215, loss_att=63.988, acc=0.945, loss=63.988, backward_time=0.302, grad_norm=95.949, clip=100.000, loss_scale=1.000, optim_step_time=0.081, optim0_lr0=9.000e-04, train_time=2.825 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 10:17:57,858 (trainer:338) INFO: 11epoch results: [train] iter_time=3.744e-04, forward_time=0.214, loss_att=63.865, acc=0.945, loss=63.865, backward_time=0.301, grad_norm=98.793, clip=100.000, loss_scale=1.000, optim_step_time=0.080, optim0_lr0=9.204e-04, train_time=2.858, time=7 hours, 8 minutes and 57.57 seconds, total_count=395956, gpu_max_cached_mem_GB=30.176, [valid] loss_att=47.824, acc=0.957, cer=0.053, wer=0.162, loss=47.824, time=4 minutes and 29.79 seconds, total_count=132, gpu_max_cached_mem_GB=30.176, [att_plot] time=4 minutes and 26.26 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 10:18:03,291 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 10:18:03,322 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/1epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 10:18:03,322 (trainer:272) INFO: 12/60epoch started. Estimated time to finish: 2 weeks, 9 hours and 27 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 10:44:57,930 (trainer:732) INFO: 12epoch:train:1-1799batch: iter_time=0.001, forward_time=0.216, loss_att=62.664, acc=0.946, loss=62.664, backward_time=0.302, grad_norm=95.752, clip=100.000, loss_scale=1.000, optim_step_time=0.084, optim0_lr0=8.980e-04, train_time=3.591 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 11:06:10,952 (trainer:732) INFO: 12epoch:train:1800-3598batch: iter_time=3.336e-04, forward_time=0.214, loss_att=61.999, acc=0.946, loss=61.999, backward_time=0.301, grad_norm=93.681, clip=100.000, loss_scale=1.000, optim_step_time=0.081, optim0_lr0=8.959e-04, train_time=2.830 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 11:27:19,437 (trainer:732) INFO: 12epoch:train:3599-5397batch: iter_time=3.555e-04, forward_time=0.214, loss_att=63.161, acc=0.946, loss=63.161, backward_time=0.300, grad_norm=95.967, clip=100.000, loss_scale=1.000, optim_step_time=0.082, optim0_lr0=8.939e-04, train_time=2.820 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 11:48:29,323 (trainer:732) INFO: 12epoch:train:5398-7196batch: iter_time=3.159e-04, forward_time=0.214, loss_att=62.648, acc=0.946, loss=62.648, backward_time=0.301, grad_norm=104.202, clip=100.000, loss_scale=1.000, optim_step_time=0.081, optim0_lr0=8.919e-04, train_time=2.823 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 12:09:41,020 (trainer:732) INFO: 12epoch:train:7197-8995batch: iter_time=3.586e-04, forward_time=0.215, loss_att=61.661, acc=0.946, loss=61.661, backward_time=0.301, grad_norm=99.179, clip=100.000, loss_scale=1.000, optim_step_time=0.081, optim0_lr0=8.899e-04, train_time=2.827 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 12:30:48,233 (trainer:732) INFO: 12epoch:train:8996-10794batch: iter_time=3.034e-04, forward_time=0.214, loss_att=61.388, acc=0.946, loss=61.388, backward_time=0.301, grad_norm=98.807, clip=100.000, loss_scale=1.000, optim_step_time=0.079, optim0_lr0=8.880e-04, train_time=2.817 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 12:51:58,398 (trainer:732) INFO: 12epoch:train:10795-12593batch: iter_time=3.417e-04, forward_time=0.215, loss_att=62.491, acc=0.946, loss=62.491, backward_time=0.301, grad_norm=98.940, clip=100.000, loss_scale=1.000, optim_step_time=0.082, optim0_lr0=8.860e-04, train_time=2.824 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 13:13:03,070 (trainer:732) INFO: 12epoch:train:12594-14392batch: iter_time=3.333e-04, forward_time=0.214, loss_att=61.418, acc=0.946, loss=61.418, backward_time=0.300, grad_norm=91.753, clip=100.000, loss_scale=1.000, optim_step_time=0.079, optim0_lr0=8.840e-04, train_time=2.811 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 13:34:08,873 (trainer:732) INFO: 12epoch:train:14393-16191batch: iter_time=3.261e-04, forward_time=0.214, loss_att=62.362, acc=0.946, loss=62.362, backward_time=0.300, grad_norm=94.581, clip=100.000, loss_scale=1.000, optim_step_time=0.080, optim0_lr0=8.821e-04, train_time=2.814 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 13:55:18,818 (trainer:732) INFO: 12epoch:train:16192-17990batch: iter_time=3.124e-04, forward_time=0.215, loss_att=62.417, acc=0.946, loss=62.417, backward_time=0.301, grad_norm=104.446, clip=100.000, loss_scale=1.000, optim_step_time=0.082, optim0_lr0=8.802e-04, train_time=2.824 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 14:16:23,659 (trainer:732) INFO: 12epoch:train:17991-19789batch: iter_time=3.392e-04, forward_time=0.214, loss_att=61.708, acc=0.946, loss=61.708, backward_time=0.299, grad_norm=98.306, clip=100.000, loss_scale=1.000, optim_step_time=0.079, optim0_lr0=8.783e-04, train_time=2.812 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<41002> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 14:37:33,229 (trainer:732) INFO: 12epoch:train:19790-21588batch: iter_time=3.294e-04, forward_time=0.214, loss_att=61.359, acc=0.947, loss=61.359, backward_time=0.300, grad_norm=96.191, clip=100.000, loss_scale=1.000, optim_step_time=0.079, optim0_lr0=8.764e-04, train_time=2.822 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 14:58:44,547 (trainer:732) INFO: 12epoch:train:21589-23387batch: iter_time=3.053e-04, forward_time=0.215, loss_att=62.236, acc=0.947, loss=62.236, backward_time=0.302, grad_norm=102.899, clip=100.000, loss_scale=1.000, optim_step_time=0.077, optim0_lr0=8.745e-04, train_time=2.827 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 15:19:57,047 (trainer:732) INFO: 12epoch:train:23388-25186batch: iter_time=3.290e-04, forward_time=0.215, loss_att=62.021, acc=0.947, loss=62.021, backward_time=0.302, grad_norm=93.465, clip=100.000, loss_scale=1.000, optim_step_time=0.080, optim0_lr0=8.726e-04, train_time=2.829 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 15:41:06,834 (trainer:732) INFO: 12epoch:train:25187-26985batch: iter_time=3.187e-04, forward_time=0.214, loss_att=62.029, acc=0.946, loss=62.029, backward_time=0.301, grad_norm=99.685, clip=100.000, loss_scale=1.000, optim_step_time=0.079, optim0_lr0=8.708e-04, train_time=2.823 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 16:02:15,222 (trainer:732) INFO: 12epoch:train:26986-28784batch: iter_time=3.215e-04, forward_time=0.214, loss_att=61.712, acc=0.946, loss=61.712, backward_time=0.301, grad_norm=96.383, clip=100.000, loss_scale=1.000, optim_step_time=0.081, optim0_lr0=8.689e-04, train_time=2.819 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 16:23:20,768 (trainer:732) INFO: 12epoch:train:28785-30583batch: iter_time=3.174e-04, forward_time=0.214, loss_att=62.436, acc=0.946, loss=62.436, backward_time=0.300, grad_norm=100.093, clip=100.000, loss_scale=1.000, optim_step_time=0.079, optim0_lr0=8.671e-04, train_time=2.814 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 16:44:30,564 (trainer:732) INFO: 12epoch:train:30584-32382batch: iter_time=3.435e-04, forward_time=0.214, loss_att=61.637, acc=0.947, loss=61.637, backward_time=0.301, grad_norm=98.701, clip=100.000, loss_scale=1.000, optim_step_time=0.077, optim0_lr0=8.652e-04, train_time=2.823 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 17:05:43,273 (trainer:732) INFO: 12epoch:train:32383-34181batch: iter_time=3.031e-04, forward_time=0.214, loss_att=62.138, acc=0.946, loss=62.138, backward_time=0.301, grad_norm=100.883, clip=100.000, loss_scale=1.000, optim_step_time=0.081, optim0_lr0=8.634e-04, train_time=2.829 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 17:26:55,102 (trainer:732) INFO: 12epoch:train:34182-35980batch: iter_time=3.493e-04, forward_time=0.215, loss_att=61.947, acc=0.946, loss=61.947, backward_time=0.301, grad_norm=96.153, clip=100.000, loss_scale=1.000, optim_step_time=0.080, optim0_lr0=8.616e-04, train_time=2.827 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 17:36:11,652 (trainer:338) INFO: 12epoch results: [train] iter_time=3.628e-04, forward_time=0.214, loss_att=62.072, acc=0.946, loss=62.072, backward_time=0.301, grad_norm=98.004, clip=100.000, loss_scale=1.000, optim_step_time=0.080, optim0_lr0=8.794e-04, train_time=2.860, time=7 hours, 9 minutes and 15.43 seconds, total_count=431952, gpu_max_cached_mem_GB=30.176, [valid] loss_att=46.921, acc=0.958, cer=0.051, wer=0.159, loss=46.921, time=4 minutes and 26.13 seconds, total_count=144, gpu_max_cached_mem_GB=30.176, [att_plot] time=4 minutes and 26.77 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 17:36:17,517 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 17:36:17,525 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/2epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 17:36:17,525 (trainer:272) INFO: 13/60epoch started. Estimated time to finish: 2 weeks, 3 hours and 25 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 18:02:59,356 (trainer:732) INFO: 13epoch:train:1-1799batch: iter_time=0.001, forward_time=0.215, loss_att=60.879, acc=0.947, loss=60.879, backward_time=0.300, grad_norm=97.671, clip=100.000, loss_scale=1.000, optim_step_time=0.082, optim0_lr0=8.598e-04, train_time=3.562 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 18:24:12,176 (trainer:732) INFO: 13epoch:train:1800-3598batch: iter_time=3.369e-04, forward_time=0.215, loss_att=60.904, acc=0.947, loss=60.904, backward_time=0.301, grad_norm=97.298, clip=100.000, loss_scale=1.000, optim_step_time=0.080, optim0_lr0=8.580e-04, train_time=2.830 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 18:44:22,427 (trainer:732) INFO: 13epoch:train:3599-5397batch: iter_time=2.307e-04, forward_time=0.203, loss_att=61.157, acc=0.948, loss=61.157, backward_time=0.298, grad_norm=98.049, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=8.563e-04, train_time=2.691 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<53408> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<53652> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<57547> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<64887> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<46110> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<46100> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<20326> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<20390> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<62781> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<62977> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<14783> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<10195> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<10210> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<10226> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<26245> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<28368> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<52385> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<52569> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<39900> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<21904> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<28832> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<38467> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<38453> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<13956> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<14026> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<28926> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<18934> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<16717> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<44999> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<45005> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<19306> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<19514> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 19:04:28,263 (trainer:732) INFO: 13epoch:train:5398-7196batch: iter_time=2.338e-04, forward_time=0.202, loss_att=59.775, acc=0.948, loss=59.775, backward_time=0.297, grad_norm=97.274, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=8.545e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 19:24:31,889 (trainer:732) INFO: 13epoch:train:7197-8995batch: iter_time=2.136e-04, forward_time=0.202, loss_att=60.849, acc=0.947, loss=60.849, backward_time=0.297, grad_norm=96.183, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=8.528e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 19:44:36,530 (trainer:732) INFO: 13epoch:train:8996-10794batch: iter_time=2.128e-04, forward_time=0.202, loss_att=60.908, acc=0.947, loss=60.908, backward_time=0.297, grad_norm=101.962, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=8.510e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 20:04:44,616 (trainer:732) INFO: 13epoch:train:10795-12593batch: iter_time=2.173e-04, forward_time=0.203, loss_att=62.001, acc=0.947, loss=62.001, backward_time=0.298, grad_norm=101.749, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=8.493e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 20:24:51,851 (trainer:732) INFO: 13epoch:train:12594-14392batch: iter_time=2.177e-04, forward_time=0.202, loss_att=60.494, acc=0.948, loss=60.494, backward_time=0.298, grad_norm=99.043, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=8.476e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 20:44:54,825 (trainer:732) INFO: 13epoch:train:14393-16191batch: iter_time=2.130e-04, forward_time=0.202, loss_att=60.281, acc=0.947, loss=60.281, backward_time=0.297, grad_norm=95.380, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=8.459e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 21:04:57,214 (trainer:732) INFO: 13epoch:train:16192-17990batch: iter_time=2.114e-04, forward_time=0.202, loss_att=59.482, acc=0.948, loss=59.482, backward_time=0.297, grad_norm=100.461, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=8.442e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 21:25:04,630 (trainer:732) INFO: 13epoch:train:17991-19789batch: iter_time=2.141e-04, forward_time=0.202, loss_att=60.456, acc=0.948, loss=60.456, backward_time=0.298, grad_norm=101.487, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=8.425e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 21:45:13,071 (trainer:732) INFO: 13epoch:train:19790-21588batch: iter_time=2.141e-04, forward_time=0.202, loss_att=60.575, acc=0.948, loss=60.575, backward_time=0.298, grad_norm=105.777, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=8.408e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 22:05:16,357 (trainer:732) INFO: 13epoch:train:21589-23387batch: iter_time=2.199e-04, forward_time=0.202, loss_att=60.740, acc=0.947, loss=60.740, backward_time=0.297, grad_norm=101.653, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=8.391e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 22:25:22,127 (trainer:732) INFO: 13epoch:train:23388-25186batch: iter_time=2.228e-04, forward_time=0.202, loss_att=60.893, acc=0.947, loss=60.893, backward_time=0.297, grad_norm=97.607, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=8.375e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 22:45:29,186 (trainer:732) INFO: 13epoch:train:25187-26985batch: iter_time=2.236e-04, forward_time=0.202, loss_att=61.086, acc=0.948, loss=61.086, backward_time=0.298, grad_norm=99.982, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=8.358e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 23:05:35,515 (trainer:732) INFO: 13epoch:train:26986-28784batch: iter_time=2.255e-04, forward_time=0.202, loss_att=60.421, acc=0.948, loss=60.421, backward_time=0.298, grad_norm=97.499, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=8.342e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 23:25:40,343 (trainer:732) INFO: 13epoch:train:28785-30583batch: iter_time=2.207e-04, forward_time=0.202, loss_att=60.170, acc=0.948, loss=60.170, backward_time=0.297, grad_norm=100.332, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=8.326e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-06 23:45:44,842 (trainer:732) INFO: 13epoch:train:30584-32382batch: iter_time=2.215e-04, forward_time=0.202, loss_att=59.710, acc=0.948, loss=59.710, backward_time=0.297, grad_norm=97.490, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=8.310e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 00:05:52,080 (trainer:732) INFO: 13epoch:train:32383-34181batch: iter_time=2.279e-04, forward_time=0.202, loss_att=60.025, acc=0.948, loss=60.025, backward_time=0.298, grad_norm=96.875, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=8.293e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 00:25:53,689 (trainer:732) INFO: 13epoch:train:34182-35980batch: iter_time=2.216e-04, forward_time=0.201, loss_att=60.022, acc=0.948, loss=60.022, backward_time=0.296, grad_norm=100.890, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=8.277e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 00:34:28,685 (trainer:338) INFO: 13epoch results: [train] iter_time=2.757e-04, forward_time=0.203, loss_att=60.540, acc=0.948, loss=60.540, backward_time=0.298, grad_norm=99.228, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=8.435e-04, train_time=2.732, time=6 hours, 50 minutes and 0.35 seconds, total_count=467948, gpu_max_cached_mem_GB=30.176, [valid] loss_att=46.345, acc=0.958, cer=0.052, wer=0.158, loss=46.345, time=4 minutes and 42.65 seconds, total_count=156, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 28.1 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 00:34:32,180 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 00:34:32,186 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/3epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 00:34:32,186 (trainer:272) INFO: 14/60epoch started. Estimated time to finish: 1 week, 6 days and 19 hours + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 136) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 136) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 00:59:02,986 (trainer:732) INFO: 14epoch:train:1-1799batch: iter_time=0.001, forward_time=0.203, loss_att=58.501, acc=0.949, loss=58.501, backward_time=0.298, grad_norm=101.550, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=8.261e-04, train_time=3.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 01:19:09,796 (trainer:732) INFO: 14epoch:train:1800-3598batch: iter_time=2.337e-04, forward_time=0.202, loss_att=59.565, acc=0.948, loss=59.565, backward_time=0.297, grad_norm=99.021, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=8.246e-04, train_time=2.683 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 01:39:17,604 (trainer:732) INFO: 14epoch:train:3599-5397batch: iter_time=2.301e-04, forward_time=0.202, loss_att=59.651, acc=0.949, loss=59.651, backward_time=0.298, grad_norm=95.864, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=8.230e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 01:59:24,769 (trainer:732) INFO: 14epoch:train:5398-7196batch: iter_time=2.368e-04, forward_time=0.203, loss_att=59.546, acc=0.949, loss=59.546, backward_time=0.298, grad_norm=95.272, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=8.214e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 02:19:28,268 (trainer:732) INFO: 14epoch:train:7197-8995batch: iter_time=2.303e-04, forward_time=0.202, loss_att=59.476, acc=0.948, loss=59.476, backward_time=0.297, grad_norm=96.566, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=8.199e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 02:39:32,296 (trainer:732) INFO: 14epoch:train:8996-10794batch: iter_time=2.283e-04, forward_time=0.202, loss_att=59.240, acc=0.949, loss=59.240, backward_time=0.297, grad_norm=93.477, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=8.183e-04, train_time=2.677 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 02:59:40,724 (trainer:732) INFO: 14epoch:train:10795-12593batch: iter_time=2.296e-04, forward_time=0.203, loss_att=58.701, acc=0.949, loss=58.701, backward_time=0.298, grad_norm=95.746, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=8.168e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 03:19:45,679 (trainer:732) INFO: 14epoch:train:12594-14392batch: iter_time=2.259e-04, forward_time=0.202, loss_att=59.156, acc=0.949, loss=59.156, backward_time=0.297, grad_norm=96.592, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=8.153e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 03:39:52,368 (trainer:732) INFO: 14epoch:train:14393-16191batch: iter_time=2.277e-04, forward_time=0.202, loss_att=59.443, acc=0.948, loss=59.443, backward_time=0.298, grad_norm=96.039, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=8.138e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 03:59:58,430 (trainer:732) INFO: 14epoch:train:16192-17990batch: iter_time=2.284e-04, forward_time=0.202, loss_att=58.641, acc=0.949, loss=58.641, backward_time=0.297, grad_norm=93.547, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=8.122e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 04:20:04,354 (trainer:732) INFO: 14epoch:train:17991-19789batch: iter_time=2.263e-04, forward_time=0.202, loss_att=59.450, acc=0.949, loss=59.450, backward_time=0.298, grad_norm=100.821, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=8.107e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 04:40:11,627 (trainer:732) INFO: 14epoch:train:19790-21588batch: iter_time=2.307e-04, forward_time=0.202, loss_att=59.026, acc=0.949, loss=59.026, backward_time=0.298, grad_norm=100.589, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=8.092e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 05:00:16,153 (trainer:732) INFO: 14epoch:train:21589-23387batch: iter_time=2.283e-04, forward_time=0.202, loss_att=59.106, acc=0.949, loss=59.106, backward_time=0.297, grad_norm=103.228, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=8.078e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 05:20:19,710 (trainer:732) INFO: 14epoch:train:23388-25186batch: iter_time=2.293e-04, forward_time=0.202, loss_att=58.815, acc=0.949, loss=58.815, backward_time=0.297, grad_norm=95.526, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=8.063e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 05:40:27,620 (trainer:732) INFO: 14epoch:train:25187-26985batch: iter_time=2.275e-04, forward_time=0.202, loss_att=60.167, acc=0.949, loss=60.167, backward_time=0.298, grad_norm=101.329, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=8.048e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 06:00:35,334 (trainer:732) INFO: 14epoch:train:26986-28784batch: iter_time=2.193e-04, forward_time=0.202, loss_att=59.719, acc=0.949, loss=59.719, backward_time=0.298, grad_norm=99.990, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=8.034e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 06:20:42,301 (trainer:732) INFO: 14epoch:train:28785-30583batch: iter_time=2.174e-04, forward_time=0.202, loss_att=59.191, acc=0.949, loss=59.191, backward_time=0.298, grad_norm=102.619, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=8.019e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 06:40:50,059 (trainer:732) INFO: 14epoch:train:30584-32382batch: iter_time=2.155e-04, forward_time=0.202, loss_att=58.746, acc=0.949, loss=58.746, backward_time=0.298, grad_norm=98.630, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=8.005e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 07:00:54,104 (trainer:732) INFO: 14epoch:train:32383-34181batch: iter_time=2.204e-04, forward_time=0.202, loss_att=59.288, acc=0.948, loss=59.288, backward_time=0.297, grad_norm=101.122, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.990e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 07:20:57,441 (trainer:732) INFO: 14epoch:train:34182-35980batch: iter_time=2.198e-04, forward_time=0.201, loss_att=58.655, acc=0.949, loss=58.655, backward_time=0.296, grad_norm=99.406, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.976e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 07:29:06,704 (trainer:338) INFO: 14epoch results: [train] iter_time=2.665e-04, forward_time=0.202, loss_att=59.203, acc=0.949, loss=59.203, backward_time=0.297, grad_norm=98.338, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=8.116e-04, train_time=2.711, time=6 hours, 46 minutes and 47.59 seconds, total_count=503944, gpu_max_cached_mem_GB=30.176, [valid] loss_att=44.103, acc=0.961, cer=0.050, wer=0.151, loss=44.103, time=4 minutes and 23.53 seconds, total_count=168, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 23.33 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 07:29:10,155 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 07:29:10,166 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/4epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 07:29:10,166 (trainer:272) INFO: 15/60epoch started. Estimated time to finish: 1 week, 6 days and 12 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 07:53:32,408 (trainer:732) INFO: 15epoch:train:1-1799batch: iter_time=7.195e-04, forward_time=0.202, loss_att=58.011, acc=0.950, loss=58.011, backward_time=0.298, grad_norm=96.385, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.961e-04, train_time=3.252 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 08:13:38,818 (trainer:732) INFO: 15epoch:train:1800-3598batch: iter_time=2.260e-04, forward_time=0.202, loss_att=58.510, acc=0.949, loss=58.510, backward_time=0.297, grad_norm=99.942, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.947e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 08:33:47,082 (trainer:732) INFO: 15epoch:train:3599-5397batch: iter_time=2.212e-04, forward_time=0.202, loss_att=57.780, acc=0.950, loss=57.780, backward_time=0.298, grad_norm=98.602, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.933e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 08:53:53,254 (trainer:732) INFO: 15epoch:train:5398-7196batch: iter_time=2.265e-04, forward_time=0.202, loss_att=57.842, acc=0.950, loss=57.842, backward_time=0.297, grad_norm=97.053, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.919e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 09:13:59,256 (trainer:732) INFO: 15epoch:train:7197-8995batch: iter_time=2.189e-04, forward_time=0.202, loss_att=58.906, acc=0.949, loss=58.906, backward_time=0.298, grad_norm=100.440, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.905e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 09:34:02,154 (trainer:732) INFO: 15epoch:train:8996-10794batch: iter_time=2.228e-04, forward_time=0.202, loss_att=57.700, acc=0.950, loss=57.700, backward_time=0.297, grad_norm=97.211, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.892e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 09:54:08,367 (trainer:732) INFO: 15epoch:train:10795-12593batch: iter_time=2.228e-04, forward_time=0.202, loss_att=57.945, acc=0.950, loss=57.945, backward_time=0.297, grad_norm=101.017, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.878e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 10:14:14,950 (trainer:732) INFO: 15epoch:train:12594-14392batch: iter_time=2.225e-04, forward_time=0.202, loss_att=57.798, acc=0.950, loss=57.798, backward_time=0.297, grad_norm=101.413, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.864e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 10:34:21,652 (trainer:732) INFO: 15epoch:train:14393-16191batch: iter_time=2.193e-04, forward_time=0.202, loss_att=58.327, acc=0.949, loss=58.327, backward_time=0.298, grad_norm=96.013, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.850e-04, train_time=2.683 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 10:54:28,361 (trainer:732) INFO: 15epoch:train:16192-17990batch: iter_time=2.162e-04, forward_time=0.202, loss_att=58.082, acc=0.950, loss=58.082, backward_time=0.298, grad_norm=93.101, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.837e-04, train_time=2.683 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 11:14:34,203 (trainer:732) INFO: 15epoch:train:17991-19789batch: iter_time=2.197e-04, forward_time=0.202, loss_att=57.793, acc=0.950, loss=57.793, backward_time=0.297, grad_norm=102.402, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.823e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 11:34:40,097 (trainer:732) INFO: 15epoch:train:19790-21588batch: iter_time=2.209e-04, forward_time=0.202, loss_att=58.112, acc=0.949, loss=58.112, backward_time=0.298, grad_norm=97.277, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.810e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 11:54:46,458 (trainer:732) INFO: 15epoch:train:21589-23387batch: iter_time=2.153e-04, forward_time=0.202, loss_att=57.905, acc=0.950, loss=57.905, backward_time=0.298, grad_norm=99.024, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.797e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 12:14:50,985 (trainer:732) INFO: 15epoch:train:23388-25186batch: iter_time=2.197e-04, forward_time=0.202, loss_att=58.617, acc=0.950, loss=58.617, backward_time=0.297, grad_norm=96.623, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.783e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 12:34:57,310 (trainer:732) INFO: 15epoch:train:25187-26985batch: iter_time=2.207e-04, forward_time=0.202, loss_att=57.692, acc=0.950, loss=57.692, backward_time=0.298, grad_norm=100.166, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.770e-04, train_time=2.682 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 12:55:02,941 (trainer:732) INFO: 15epoch:train:26986-28784batch: iter_time=2.182e-04, forward_time=0.202, loss_att=58.051, acc=0.950, loss=58.051, backward_time=0.298, grad_norm=96.330, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.757e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 13:15:08,566 (trainer:732) INFO: 15epoch:train:28785-30583batch: iter_time=2.217e-04, forward_time=0.202, loss_att=57.758, acc=0.950, loss=57.758, backward_time=0.297, grad_norm=99.120, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.744e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 13:35:15,651 (trainer:732) INFO: 15epoch:train:30584-32382batch: iter_time=2.195e-04, forward_time=0.202, loss_att=57.663, acc=0.950, loss=57.663, backward_time=0.298, grad_norm=102.029, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.731e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 13:55:20,967 (trainer:732) INFO: 15epoch:train:32383-34181batch: iter_time=2.195e-04, forward_time=0.202, loss_att=58.006, acc=0.950, loss=58.006, backward_time=0.297, grad_norm=98.294, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.718e-04, train_time=2.679 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 14:15:22,708 (trainer:732) INFO: 15epoch:train:34182-35980batch: iter_time=2.176e-04, forward_time=0.202, loss_att=57.403, acc=0.950, loss=57.403, backward_time=0.297, grad_norm=98.614, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.705e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 14:23:33,280 (trainer:338) INFO: 15epoch results: [train] iter_time=2.454e-04, forward_time=0.202, loss_att=57.995, acc=0.950, loss=57.995, backward_time=0.297, grad_norm=98.551, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.831e-04, train_time=2.709, time=6 hours, 46 minutes and 33.11 seconds, total_count=539940, gpu_max_cached_mem_GB=30.176, [valid] loss_att=44.157, acc=0.960, cer=0.049, wer=0.149, loss=44.157, time=4 minutes and 20.96 seconds, total_count=180, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 29.04 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 14:23:36,634 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 14:23:36,640 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/5epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 14:23:36,640 (trainer:272) INFO: 16/60epoch started. Estimated time to finish: 1 week, 6 days and 4 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 14:48:05,268 (trainer:732) INFO: 16epoch:train:1-1799batch: iter_time=8.766e-04, forward_time=0.202, loss_att=56.414, acc=0.951, loss=56.414, backward_time=0.297, grad_norm=103.794, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.692e-04, train_time=3.266 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 15:08:06,974 (trainer:732) INFO: 16epoch:train:1800-3598batch: iter_time=2.253e-04, forward_time=0.201, loss_att=56.766, acc=0.951, loss=56.766, backward_time=0.296, grad_norm=97.439, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.679e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 15:28:13,171 (trainer:732) INFO: 16epoch:train:3599-5397batch: iter_time=2.260e-04, forward_time=0.202, loss_att=57.351, acc=0.950, loss=57.351, backward_time=0.298, grad_norm=101.405, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.667e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 15:48:18,226 (trainer:732) INFO: 16epoch:train:5398-7196batch: iter_time=2.251e-04, forward_time=0.202, loss_att=57.014, acc=0.951, loss=57.014, backward_time=0.297, grad_norm=98.714, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.654e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 16:08:24,873 (trainer:732) INFO: 16epoch:train:7197-8995batch: iter_time=2.243e-04, forward_time=0.202, loss_att=56.903, acc=0.951, loss=56.903, backward_time=0.298, grad_norm=106.270, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.641e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 16:28:33,554 (trainer:732) INFO: 16epoch:train:8996-10794batch: iter_time=2.229e-04, forward_time=0.203, loss_att=57.756, acc=0.951, loss=57.756, backward_time=0.299, grad_norm=102.362, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.629e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 16:48:38,719 (trainer:732) INFO: 16epoch:train:10795-12593batch: iter_time=2.277e-04, forward_time=0.202, loss_att=56.386, acc=0.951, loss=56.386, backward_time=0.297, grad_norm=96.241, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.616e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 17:08:42,358 (trainer:732) INFO: 16epoch:train:12594-14392batch: iter_time=2.217e-04, forward_time=0.202, loss_att=57.232, acc=0.950, loss=57.232, backward_time=0.297, grad_norm=96.685, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.604e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 17:28:49,706 (trainer:732) INFO: 16epoch:train:14393-16191batch: iter_time=2.269e-04, forward_time=0.203, loss_att=57.429, acc=0.950, loss=57.429, backward_time=0.298, grad_norm=99.403, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.592e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 17:48:56,059 (trainer:732) INFO: 16epoch:train:16192-17990batch: iter_time=2.294e-04, forward_time=0.202, loss_att=57.577, acc=0.951, loss=57.577, backward_time=0.298, grad_norm=102.449, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.579e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 18:09:04,116 (trainer:732) INFO: 16epoch:train:17991-19789batch: iter_time=2.233e-04, forward_time=0.202, loss_att=56.401, acc=0.951, loss=56.401, backward_time=0.298, grad_norm=96.240, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.567e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 18:29:09,949 (trainer:732) INFO: 16epoch:train:19790-21588batch: iter_time=2.232e-04, forward_time=0.202, loss_att=57.362, acc=0.950, loss=57.362, backward_time=0.297, grad_norm=101.134, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.555e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 18:49:16,701 (trainer:732) INFO: 16epoch:train:21589-23387batch: iter_time=2.242e-04, forward_time=0.203, loss_att=56.614, acc=0.951, loss=56.614, backward_time=0.298, grad_norm=101.049, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.543e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 19:09:24,527 (trainer:732) INFO: 16epoch:train:23388-25186batch: iter_time=2.235e-04, forward_time=0.203, loss_att=57.493, acc=0.950, loss=57.493, backward_time=0.298, grad_norm=98.254, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.531e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 19:29:27,391 (trainer:732) INFO: 16epoch:train:25187-26985batch: iter_time=2.310e-04, forward_time=0.202, loss_att=56.819, acc=0.950, loss=56.819, backward_time=0.297, grad_norm=95.472, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.519e-04, train_time=2.674 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 19:49:33,075 (trainer:732) INFO: 16epoch:train:26986-28784batch: iter_time=2.256e-04, forward_time=0.202, loss_att=57.705, acc=0.951, loss=57.705, backward_time=0.298, grad_norm=102.746, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.507e-04, train_time=2.680 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 20:09:43,775 (trainer:732) INFO: 16epoch:train:28785-30583batch: iter_time=2.237e-04, forward_time=0.203, loss_att=56.327, acc=0.952, loss=56.327, backward_time=0.299, grad_norm=99.405, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.495e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 20:29:43,718 (trainer:732) INFO: 16epoch:train:30584-32382batch: iter_time=2.166e-04, forward_time=0.201, loss_att=55.960, acc=0.951, loss=55.960, backward_time=0.296, grad_norm=94.259, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.483e-04, train_time=2.667 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 20:49:50,578 (trainer:732) INFO: 16epoch:train:32383-34181batch: iter_time=2.217e-04, forward_time=0.202, loss_att=57.469, acc=0.950, loss=57.469, backward_time=0.298, grad_norm=96.080, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=7.472e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 21:09:56,348 (trainer:732) INFO: 16epoch:train:34182-35980batch: iter_time=2.218e-04, forward_time=0.202, loss_att=55.626, acc=0.951, loss=55.626, backward_time=0.297, grad_norm=95.825, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.460e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 21:18:08,799 (trainer:338) INFO: 16epoch results: [train] iter_time=2.570e-04, forward_time=0.202, loss_att=56.929, acc=0.951, loss=56.929, backward_time=0.297, grad_norm=99.251, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.574e-04, train_time=2.710, time=6 hours, 46 minutes and 39.4 seconds, total_count=575936, gpu_max_cached_mem_GB=30.176, [valid] loss_att=42.989, acc=0.961, cer=0.048, wer=0.148, loss=42.989, time=4 minutes and 24.73 seconds, total_count=192, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 28.02 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 21:18:12,351 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 21:18:12,378 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/6epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 21:18:12,379 (trainer:272) INFO: 17/60epoch started. Estimated time to finish: 1 week, 5 days and 21 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 21:42:47,324 (trainer:732) INFO: 17epoch:train:1-1799batch: iter_time=0.001, forward_time=0.202, loss_att=56.539, acc=0.952, loss=56.539, backward_time=0.298, grad_norm=102.999, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.448e-04, train_time=3.280 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 22:02:51,098 (trainer:732) INFO: 17epoch:train:1800-3598batch: iter_time=2.276e-04, forward_time=0.202, loss_att=54.808, acc=0.952, loss=54.808, backward_time=0.297, grad_norm=98.691, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.437e-04, train_time=2.676 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 22:22:57,083 (trainer:732) INFO: 17epoch:train:3599-5397batch: iter_time=2.328e-04, forward_time=0.202, loss_att=56.624, acc=0.951, loss=56.624, backward_time=0.298, grad_norm=100.249, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=7.425e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 22:43:05,743 (trainer:732) INFO: 17epoch:train:5398-7196batch: iter_time=2.299e-04, forward_time=0.202, loss_att=55.851, acc=0.952, loss=55.851, backward_time=0.298, grad_norm=100.555, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.414e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 23:03:10,019 (trainer:732) INFO: 17epoch:train:7197-8995batch: iter_time=2.264e-04, forward_time=0.202, loss_att=55.767, acc=0.951, loss=55.767, backward_time=0.297, grad_norm=96.295, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=7.402e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 23:23:20,676 (trainer:732) INFO: 17epoch:train:8996-10794batch: iter_time=2.257e-04, forward_time=0.203, loss_att=55.642, acc=0.952, loss=55.642, backward_time=0.299, grad_norm=98.231, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=7.391e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-07 23:43:29,059 (trainer:732) INFO: 17epoch:train:10795-12593batch: iter_time=2.414e-04, forward_time=0.203, loss_att=56.796, acc=0.951, loss=56.796, backward_time=0.298, grad_norm=97.937, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=7.379e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 00:03:37,700 (trainer:732) INFO: 17epoch:train:12594-14392batch: iter_time=2.298e-04, forward_time=0.202, loss_att=55.266, acc=0.952, loss=55.266, backward_time=0.298, grad_norm=96.305, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=7.368e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 00:23:41,481 (trainer:732) INFO: 17epoch:train:14393-16191batch: iter_time=2.325e-04, forward_time=0.202, loss_att=56.028, acc=0.951, loss=56.028, backward_time=0.297, grad_norm=98.275, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=7.357e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 00:43:50,007 (trainer:732) INFO: 17epoch:train:16192-17990batch: iter_time=2.345e-04, forward_time=0.203, loss_att=56.142, acc=0.951, loss=56.142, backward_time=0.298, grad_norm=93.730, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=7.346e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 01:03:56,263 (trainer:732) INFO: 17epoch:train:17991-19789batch: iter_time=2.292e-04, forward_time=0.202, loss_att=56.054, acc=0.951, loss=56.054, backward_time=0.297, grad_norm=103.809, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.335e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 01:24:01,108 (trainer:732) INFO: 17epoch:train:19790-21588batch: iter_time=2.305e-04, forward_time=0.202, loss_att=55.592, acc=0.952, loss=55.592, backward_time=0.297, grad_norm=101.418, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=7.324e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 01:44:08,072 (trainer:732) INFO: 17epoch:train:21589-23387batch: iter_time=2.401e-04, forward_time=0.203, loss_att=55.942, acc=0.952, loss=55.942, backward_time=0.297, grad_norm=102.832, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=7.313e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 02:04:16,620 (trainer:732) INFO: 17epoch:train:23388-25186batch: iter_time=2.344e-04, forward_time=0.203, loss_att=56.427, acc=0.952, loss=56.427, backward_time=0.298, grad_norm=101.151, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=7.302e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 02:24:24,161 (trainer:732) INFO: 17epoch:train:25187-26985batch: iter_time=2.354e-04, forward_time=0.203, loss_att=55.557, acc=0.952, loss=55.557, backward_time=0.297, grad_norm=103.806, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=7.291e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 02:44:31,384 (trainer:732) INFO: 17epoch:train:26986-28784batch: iter_time=2.384e-04, forward_time=0.203, loss_att=55.738, acc=0.952, loss=55.738, backward_time=0.297, grad_norm=100.101, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=7.280e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 03:04:37,825 (trainer:732) INFO: 17epoch:train:28785-30583batch: iter_time=2.258e-04, forward_time=0.202, loss_att=55.738, acc=0.952, loss=55.738, backward_time=0.298, grad_norm=102.477, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.269e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 03:24:44,353 (trainer:732) INFO: 17epoch:train:30584-32382batch: iter_time=2.226e-04, forward_time=0.202, loss_att=55.990, acc=0.952, loss=55.990, backward_time=0.298, grad_norm=97.411, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.258e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 03:44:51,599 (trainer:732) INFO: 17epoch:train:32383-34181batch: iter_time=2.188e-04, forward_time=0.202, loss_att=56.306, acc=0.952, loss=56.306, backward_time=0.298, grad_norm=99.561, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.247e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 04:04:55,560 (trainer:732) INFO: 17epoch:train:34182-35980batch: iter_time=2.221e-04, forward_time=0.202, loss_att=55.331, acc=0.952, loss=55.331, backward_time=0.297, grad_norm=100.712, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.237e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 04:13:21,635 (trainer:338) INFO: 17epoch results: [train] iter_time=2.704e-04, forward_time=0.202, loss_att=55.906, acc=0.952, loss=55.906, backward_time=0.297, grad_norm=99.824, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=7.341e-04, train_time=2.713, time=6 hours, 47 minutes and 3.16 seconds, total_count=611932, gpu_max_cached_mem_GB=30.176, [valid] loss_att=44.063, acc=0.961, cer=0.047, wer=0.146, loss=44.063, time=4 minutes and 34.33 seconds, total_count=204, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 31.76 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 04:13:25,165 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 04:13:25,172 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/7epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 04:13:25,173 (trainer:272) INFO: 18/60epoch started. Estimated time to finish: 1 week, 5 days and 14 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 04:37:46,041 (trainer:732) INFO: 18epoch:train:1-1799batch: iter_time=0.001, forward_time=0.202, loss_att=54.422, acc=0.953, loss=54.422, backward_time=0.297, grad_norm=99.442, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.226e-04, train_time=3.249 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 04:57:51,057 (trainer:732) INFO: 18epoch:train:1800-3598batch: iter_time=2.278e-04, forward_time=0.202, loss_att=55.485, acc=0.952, loss=55.485, backward_time=0.297, grad_norm=99.271, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.216e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 05:17:55,805 (trainer:732) INFO: 18epoch:train:3599-5397batch: iter_time=2.236e-04, forward_time=0.202, loss_att=54.724, acc=0.952, loss=54.724, backward_time=0.297, grad_norm=97.601, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.205e-04, train_time=2.678 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 05:38:01,670 (trainer:732) INFO: 18epoch:train:5398-7196batch: iter_time=2.247e-04, forward_time=0.202, loss_att=55.228, acc=0.952, loss=55.228, backward_time=0.298, grad_norm=99.628, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.194e-04, train_time=2.681 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 05:58:10,148 (trainer:732) INFO: 18epoch:train:7197-8995batch: iter_time=2.236e-04, forward_time=0.203, loss_att=55.257, acc=0.952, loss=55.257, backward_time=0.298, grad_norm=94.897, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=7.184e-04, train_time=2.687 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 06:18:17,315 (trainer:732) INFO: 18epoch:train:8996-10794batch: iter_time=2.165e-04, forward_time=0.202, loss_att=55.748, acc=0.952, loss=55.748, backward_time=0.298, grad_norm=97.972, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.174e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 06:38:21,058 (trainer:732) INFO: 18epoch:train:10795-12593batch: iter_time=2.163e-04, forward_time=0.202, loss_att=55.025, acc=0.952, loss=55.025, backward_time=0.297, grad_norm=108.099, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.163e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 06:58:24,518 (trainer:732) INFO: 18epoch:train:12594-14392batch: iter_time=2.155e-04, forward_time=0.202, loss_att=55.338, acc=0.952, loss=55.338, backward_time=0.297, grad_norm=101.445, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.153e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 07:18:32,691 (trainer:732) INFO: 18epoch:train:14393-16191batch: iter_time=2.189e-04, forward_time=0.202, loss_att=55.081, acc=0.953, loss=55.081, backward_time=0.298, grad_norm=96.137, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.143e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 07:38:36,751 (trainer:732) INFO: 18epoch:train:16192-17990batch: iter_time=2.136e-04, forward_time=0.202, loss_att=54.980, acc=0.952, loss=54.980, backward_time=0.297, grad_norm=102.996, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.132e-04, train_time=2.677 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 07:58:40,778 (trainer:732) INFO: 18epoch:train:17991-19789batch: iter_time=2.153e-04, forward_time=0.202, loss_att=54.674, acc=0.952, loss=54.674, backward_time=0.297, grad_norm=98.129, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.122e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 08:18:46,030 (trainer:732) INFO: 18epoch:train:19790-21588batch: iter_time=2.140e-04, forward_time=0.202, loss_att=55.121, acc=0.952, loss=55.121, backward_time=0.297, grad_norm=105.497, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.112e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 08:38:50,560 (trainer:732) INFO: 18epoch:train:21589-23387batch: iter_time=2.116e-04, forward_time=0.202, loss_att=54.455, acc=0.952, loss=54.455, backward_time=0.297, grad_norm=97.215, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.102e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 08:58:54,789 (trainer:732) INFO: 18epoch:train:23388-25186batch: iter_time=2.139e-04, forward_time=0.202, loss_att=54.210, acc=0.953, loss=54.210, backward_time=0.297, grad_norm=101.406, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.092e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 09:18:58,116 (trainer:732) INFO: 18epoch:train:25187-26985batch: iter_time=2.102e-04, forward_time=0.202, loss_att=55.444, acc=0.952, loss=55.444, backward_time=0.297, grad_norm=105.350, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.082e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 09:39:05,608 (trainer:732) INFO: 18epoch:train:26986-28784batch: iter_time=2.148e-04, forward_time=0.202, loss_att=55.029, acc=0.952, loss=55.029, backward_time=0.298, grad_norm=102.988, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.072e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 09:59:10,413 (trainer:732) INFO: 18epoch:train:28785-30583batch: iter_time=2.164e-04, forward_time=0.202, loss_att=55.062, acc=0.952, loss=55.062, backward_time=0.297, grad_norm=99.909, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.062e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 10:19:15,307 (trainer:732) INFO: 18epoch:train:30584-32382batch: iter_time=2.117e-04, forward_time=0.202, loss_att=55.083, acc=0.953, loss=55.083, backward_time=0.297, grad_norm=97.534, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.052e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 10:39:20,101 (trainer:732) INFO: 18epoch:train:32383-34181batch: iter_time=2.148e-04, forward_time=0.202, loss_att=55.073, acc=0.952, loss=55.073, backward_time=0.297, grad_norm=112.922, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.042e-04, train_time=2.678 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<63229> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<63353> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<36822> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<20464> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<62600> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<39674> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<39678> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<62858> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<26220> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 156) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<27524> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 157) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 157) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<36173> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<55034> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<35335> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<35341> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<50752> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<20454> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<58471> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<58771> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<48088> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<17048> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<22256> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<22278> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<41575> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<53256> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<54870> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<41877> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<27937> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<38935> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<64416> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<64432> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<45513> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<45771> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 142) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 10:59:29,186 (trainer:732) INFO: 18epoch:train:34182-35980batch: iter_time=2.111e-04, forward_time=0.203, loss_att=55.140, acc=0.953, loss=55.140, backward_time=0.299, grad_norm=101.828, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.033e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 11:07:42,840 (trainer:338) INFO: 18epoch results: [train] iter_time=2.561e-04, forward_time=0.202, loss_att=55.022, acc=0.952, loss=55.022, backward_time=0.297, grad_norm=101.003, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.128e-04, train_time=2.708, time=6 hours, 46 minutes and 21.35 seconds, total_count=647928, gpu_max_cached_mem_GB=30.176, [valid] loss_att=42.312, acc=0.962, cer=0.048, wer=0.145, loss=42.312, time=4 minutes and 24.38 seconds, total_count=216, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 31.93 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 11:07:46,638 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 11:07:46,651 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/8epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 11:07:46,652 (trainer:272) INFO: 19/60epoch started. Estimated time to finish: 1 week, 5 days and 6 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 11:32:16,839 (trainer:732) INFO: 19epoch:train:1-1799batch: iter_time=0.001, forward_time=0.203, loss_att=54.158, acc=0.953, loss=54.158, backward_time=0.298, grad_norm=98.524, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=7.023e-04, train_time=3.270 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 11:52:23,875 (trainer:732) INFO: 19epoch:train:1800-3598batch: iter_time=2.391e-04, forward_time=0.202, loss_att=54.536, acc=0.953, loss=54.536, backward_time=0.298, grad_norm=106.681, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.013e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 12:12:34,230 (trainer:732) INFO: 19epoch:train:3599-5397batch: iter_time=2.350e-04, forward_time=0.203, loss_att=54.295, acc=0.953, loss=54.295, backward_time=0.299, grad_norm=100.208, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=7.003e-04, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 12:32:38,285 (trainer:732) INFO: 19epoch:train:5398-7196batch: iter_time=2.404e-04, forward_time=0.202, loss_att=53.719, acc=0.953, loss=53.719, backward_time=0.297, grad_norm=105.892, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.994e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 12:52:44,709 (trainer:732) INFO: 19epoch:train:7197-8995batch: iter_time=2.303e-04, forward_time=0.202, loss_att=53.935, acc=0.953, loss=53.935, backward_time=0.298, grad_norm=100.981, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.984e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 13:12:48,065 (trainer:732) INFO: 19epoch:train:8996-10794batch: iter_time=2.313e-04, forward_time=0.202, loss_att=54.546, acc=0.953, loss=54.546, backward_time=0.297, grad_norm=98.700, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.975e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 13:32:56,015 (trainer:732) INFO: 19epoch:train:10795-12593batch: iter_time=2.311e-04, forward_time=0.203, loss_att=55.045, acc=0.953, loss=55.045, backward_time=0.298, grad_norm=98.217, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.965e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 13:52:59,839 (trainer:732) INFO: 19epoch:train:12594-14392batch: iter_time=2.307e-04, forward_time=0.202, loss_att=53.983, acc=0.953, loss=53.983, backward_time=0.297, grad_norm=100.810, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.956e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 14:13:05,773 (trainer:732) INFO: 19epoch:train:14393-16191batch: iter_time=2.536e-04, forward_time=0.203, loss_att=54.279, acc=0.953, loss=54.279, backward_time=0.297, grad_norm=98.446, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.946e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 14:33:11,153 (trainer:732) INFO: 19epoch:train:16192-17990batch: iter_time=2.556e-04, forward_time=0.202, loss_att=54.812, acc=0.952, loss=54.812, backward_time=0.297, grad_norm=99.197, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.937e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 14:53:20,703 (trainer:732) INFO: 19epoch:train:17991-19789batch: iter_time=2.620e-04, forward_time=0.203, loss_att=54.677, acc=0.953, loss=54.677, backward_time=0.298, grad_norm=105.669, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.927e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 15:13:29,219 (trainer:732) INFO: 19epoch:train:19790-21588batch: iter_time=2.602e-04, forward_time=0.203, loss_att=53.683, acc=0.953, loss=53.683, backward_time=0.298, grad_norm=100.043, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.918e-04, train_time=2.686 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 15:33:36,496 (trainer:732) INFO: 19epoch:train:21589-23387batch: iter_time=2.591e-04, forward_time=0.203, loss_att=53.870, acc=0.953, loss=53.870, backward_time=0.298, grad_norm=101.526, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.909e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 15:53:43,043 (trainer:732) INFO: 19epoch:train:23388-25186batch: iter_time=2.606e-04, forward_time=0.203, loss_att=53.873, acc=0.953, loss=53.873, backward_time=0.298, grad_norm=100.371, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.900e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 16:13:47,659 (trainer:732) INFO: 19epoch:train:25187-26985batch: iter_time=2.591e-04, forward_time=0.202, loss_att=53.947, acc=0.953, loss=53.947, backward_time=0.297, grad_norm=105.077, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.890e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 16:33:56,227 (trainer:732) INFO: 19epoch:train:26986-28784batch: iter_time=2.600e-04, forward_time=0.203, loss_att=54.060, acc=0.953, loss=54.060, backward_time=0.298, grad_norm=98.778, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.881e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 16:54:06,504 (trainer:732) INFO: 19epoch:train:28785-30583batch: iter_time=2.727e-04, forward_time=0.204, loss_att=54.499, acc=0.953, loss=54.499, backward_time=0.298, grad_norm=95.606, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=6.872e-04, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 17:14:13,614 (trainer:732) INFO: 19epoch:train:30584-32382batch: iter_time=2.524e-04, forward_time=0.203, loss_att=53.934, acc=0.953, loss=53.934, backward_time=0.298, grad_norm=103.626, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.863e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 17:34:21,988 (trainer:732) INFO: 19epoch:train:32383-34181batch: iter_time=2.539e-04, forward_time=0.203, loss_att=54.401, acc=0.953, loss=54.401, backward_time=0.298, grad_norm=98.387, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.854e-04, train_time=2.686 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 17:54:24,593 (trainer:732) INFO: 19epoch:train:34182-35980batch: iter_time=2.529e-04, forward_time=0.202, loss_att=54.109, acc=0.953, loss=54.109, backward_time=0.296, grad_norm=97.306, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.845e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 18:02:41,463 (trainer:338) INFO: 19epoch results: [train] iter_time=3.026e-04, forward_time=0.203, loss_att=54.214, acc=0.953, loss=54.214, backward_time=0.298, grad_norm=100.696, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.933e-04, train_time=2.712, time=6 hours, 46 minutes and 55.92 seconds, total_count=683924, gpu_max_cached_mem_GB=30.176, [valid] loss_att=42.664, acc=0.962, cer=0.047, wer=0.145, loss=42.664, time=4 minutes and 25.48 seconds, total_count=228, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 33.41 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 18:02:45,198 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 18:02:45,225 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/9epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 18:02:45,225 (trainer:272) INFO: 20/60epoch started. Estimated time to finish: 1 week, 4 days and 23 hours + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 18:27:21,431 (trainer:732) INFO: 20epoch:train:1-1799batch: iter_time=9.260e-04, forward_time=0.202, loss_att=52.972, acc=0.954, loss=52.972, backward_time=0.297, grad_norm=99.178, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.836e-04, train_time=3.283 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 18:47:30,135 (trainer:732) INFO: 20epoch:train:1800-3598batch: iter_time=2.408e-04, forward_time=0.203, loss_att=53.060, acc=0.954, loss=53.060, backward_time=0.298, grad_norm=98.987, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.827e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 19:07:36,861 (trainer:732) INFO: 20epoch:train:3599-5397batch: iter_time=2.381e-04, forward_time=0.202, loss_att=53.564, acc=0.954, loss=53.564, backward_time=0.297, grad_norm=98.807, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.818e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 19:27:44,217 (trainer:732) INFO: 20epoch:train:5398-7196batch: iter_time=2.161e-04, forward_time=0.202, loss_att=53.370, acc=0.954, loss=53.370, backward_time=0.298, grad_norm=96.399, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.809e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 19:47:50,849 (trainer:732) INFO: 20epoch:train:7197-8995batch: iter_time=2.100e-04, forward_time=0.202, loss_att=52.751, acc=0.954, loss=52.751, backward_time=0.298, grad_norm=104.414, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.800e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 20:07:55,600 (trainer:732) INFO: 20epoch:train:8996-10794batch: iter_time=2.109e-04, forward_time=0.202, loss_att=54.468, acc=0.953, loss=54.468, backward_time=0.297, grad_norm=101.451, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.791e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 20:28:02,884 (trainer:732) INFO: 20epoch:train:10795-12593batch: iter_time=2.099e-04, forward_time=0.202, loss_att=52.941, acc=0.954, loss=52.941, backward_time=0.298, grad_norm=92.801, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.782e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 20:48:08,233 (trainer:732) INFO: 20epoch:train:12594-14392batch: iter_time=2.104e-04, forward_time=0.202, loss_att=53.669, acc=0.954, loss=53.669, backward_time=0.298, grad_norm=99.167, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.774e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 21:08:11,918 (trainer:732) INFO: 20epoch:train:14393-16191batch: iter_time=2.119e-04, forward_time=0.202, loss_att=53.617, acc=0.954, loss=53.617, backward_time=0.297, grad_norm=95.094, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.765e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 21:28:17,121 (trainer:732) INFO: 20epoch:train:16192-17990batch: iter_time=2.105e-04, forward_time=0.202, loss_att=53.124, acc=0.954, loss=53.124, backward_time=0.297, grad_norm=104.730, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.756e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 21:48:22,596 (trainer:732) INFO: 20epoch:train:17991-19789batch: iter_time=2.090e-04, forward_time=0.202, loss_att=54.317, acc=0.953, loss=54.317, backward_time=0.297, grad_norm=100.032, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.748e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 22:08:26,362 (trainer:732) INFO: 20epoch:train:19790-21588batch: iter_time=2.141e-04, forward_time=0.202, loss_att=53.169, acc=0.954, loss=53.169, backward_time=0.297, grad_norm=106.881, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.739e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 22:28:30,781 (trainer:732) INFO: 20epoch:train:21589-23387batch: iter_time=2.136e-04, forward_time=0.202, loss_att=52.904, acc=0.953, loss=52.904, backward_time=0.297, grad_norm=98.088, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.730e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 22:48:36,802 (trainer:732) INFO: 20epoch:train:23388-25186batch: iter_time=2.169e-04, forward_time=0.202, loss_att=52.949, acc=0.954, loss=52.949, backward_time=0.298, grad_norm=101.015, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.722e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 23:08:42,678 (trainer:732) INFO: 20epoch:train:25187-26985batch: iter_time=2.231e-04, forward_time=0.202, loss_att=53.010, acc=0.954, loss=53.010, backward_time=0.297, grad_norm=103.456, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.713e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 23:28:52,431 (trainer:732) INFO: 20epoch:train:26986-28784batch: iter_time=2.257e-04, forward_time=0.203, loss_att=53.568, acc=0.954, loss=53.568, backward_time=0.298, grad_norm=99.252, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.705e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-08 23:48:59,981 (trainer:732) INFO: 20epoch:train:28785-30583batch: iter_time=2.413e-04, forward_time=0.203, loss_att=54.138, acc=0.954, loss=54.138, backward_time=0.298, grad_norm=106.815, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.696e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 00:09:04,745 (trainer:732) INFO: 20epoch:train:30584-32382batch: iter_time=2.342e-04, forward_time=0.202, loss_att=53.559, acc=0.953, loss=53.559, backward_time=0.297, grad_norm=100.753, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.688e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 00:29:10,668 (trainer:732) INFO: 20epoch:train:32383-34181batch: iter_time=2.397e-04, forward_time=0.202, loss_att=52.761, acc=0.954, loss=52.761, backward_time=0.297, grad_norm=96.672, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.680e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 00:49:18,448 (trainer:732) INFO: 20epoch:train:34182-35980batch: iter_time=2.393e-04, forward_time=0.203, loss_att=53.582, acc=0.954, loss=53.582, backward_time=0.298, grad_norm=99.563, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.671e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 00:57:50,048 (trainer:338) INFO: 20epoch results: [train] iter_time=2.570e-04, forward_time=0.202, loss_att=53.374, acc=0.954, loss=53.374, backward_time=0.297, grad_norm=100.167, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.752e-04, train_time=2.712, time=6 hours, 46 minutes and 51.95 seconds, total_count=719920, gpu_max_cached_mem_GB=30.176, [valid] loss_att=42.373, acc=0.962, cer=0.047, wer=0.145, loss=42.373, time=4 minutes and 42.3 seconds, total_count=240, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 30.57 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 00:57:53,807 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 00:57:53,815 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/10epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 00:57:53,815 (trainer:272) INFO: 21/60epoch started. Estimated time to finish: 1 week, 4 days and 16 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 01:22:24,409 (trainer:732) INFO: 21epoch:train:1-1799batch: iter_time=9.130e-04, forward_time=0.202, loss_att=52.665, acc=0.954, loss=52.665, backward_time=0.297, grad_norm=103.762, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.663e-04, train_time=3.271 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 01:42:30,224 (trainer:732) INFO: 21epoch:train:1800-3598batch: iter_time=2.464e-04, forward_time=0.202, loss_att=51.804, acc=0.955, loss=51.804, backward_time=0.297, grad_norm=99.864, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.655e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 02:02:40,554 (trainer:732) INFO: 21epoch:train:3599-5397batch: iter_time=2.431e-04, forward_time=0.203, loss_att=53.250, acc=0.954, loss=53.250, backward_time=0.298, grad_norm=98.950, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.646e-04, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 02:22:50,420 (trainer:732) INFO: 21epoch:train:5398-7196batch: iter_time=2.382e-04, forward_time=0.203, loss_att=52.658, acc=0.955, loss=52.658, backward_time=0.298, grad_norm=100.552, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.638e-04, train_time=2.689 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 02:42:58,100 (trainer:732) INFO: 21epoch:train:7197-8995batch: iter_time=2.396e-04, forward_time=0.203, loss_att=52.761, acc=0.954, loss=52.761, backward_time=0.298, grad_norm=97.724, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.630e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 03:03:04,754 (trainer:732) INFO: 21epoch:train:8996-10794batch: iter_time=2.353e-04, forward_time=0.202, loss_att=53.312, acc=0.954, loss=53.312, backward_time=0.297, grad_norm=100.948, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.622e-04, train_time=2.683 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 03:23:12,677 (trainer:732) INFO: 21epoch:train:10795-12593batch: iter_time=2.373e-04, forward_time=0.203, loss_att=52.528, acc=0.954, loss=52.528, backward_time=0.298, grad_norm=104.937, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.614e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 03:43:20,920 (trainer:732) INFO: 21epoch:train:12594-14392batch: iter_time=2.417e-04, forward_time=0.203, loss_att=52.414, acc=0.955, loss=52.414, backward_time=0.298, grad_norm=101.611, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=6.605e-04, train_time=2.686 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 04:03:28,131 (trainer:732) INFO: 21epoch:train:14393-16191batch: iter_time=2.379e-04, forward_time=0.203, loss_att=52.726, acc=0.954, loss=52.726, backward_time=0.297, grad_norm=97.494, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.597e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 04:23:37,503 (trainer:732) INFO: 21epoch:train:16192-17990batch: iter_time=2.396e-04, forward_time=0.203, loss_att=52.955, acc=0.954, loss=52.955, backward_time=0.298, grad_norm=102.301, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.589e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 04:43:41,893 (trainer:732) INFO: 21epoch:train:17991-19789batch: iter_time=2.331e-04, forward_time=0.202, loss_att=52.923, acc=0.954, loss=52.923, backward_time=0.297, grad_norm=98.476, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.581e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 05:03:48,181 (trainer:732) INFO: 21epoch:train:19790-21588batch: iter_time=2.230e-04, forward_time=0.202, loss_att=52.241, acc=0.955, loss=52.241, backward_time=0.297, grad_norm=103.989, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.573e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 05:23:59,017 (trainer:732) INFO: 21epoch:train:21589-23387batch: iter_time=2.181e-04, forward_time=0.203, loss_att=52.854, acc=0.955, loss=52.854, backward_time=0.299, grad_norm=95.208, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.565e-04, train_time=2.693 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 05:44:03,093 (trainer:732) INFO: 21epoch:train:23388-25186batch: iter_time=2.203e-04, forward_time=0.202, loss_att=52.192, acc=0.954, loss=52.192, backward_time=0.297, grad_norm=99.855, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.557e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 06:04:07,407 (trainer:732) INFO: 21epoch:train:25187-26985batch: iter_time=2.146e-04, forward_time=0.202, loss_att=52.795, acc=0.954, loss=52.795, backward_time=0.297, grad_norm=100.477, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.549e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 06:24:12,153 (trainer:732) INFO: 21epoch:train:26986-28784batch: iter_time=2.109e-04, forward_time=0.202, loss_att=52.915, acc=0.954, loss=52.915, backward_time=0.297, grad_norm=104.925, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.542e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 06:44:17,746 (trainer:732) INFO: 21epoch:train:28785-30583batch: iter_time=2.124e-04, forward_time=0.202, loss_att=53.174, acc=0.954, loss=53.174, backward_time=0.298, grad_norm=102.242, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.534e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 07:04:20,417 (trainer:732) INFO: 21epoch:train:30584-32382batch: iter_time=2.121e-04, forward_time=0.202, loss_att=52.187, acc=0.954, loss=52.187, backward_time=0.297, grad_norm=98.334, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.526e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 07:24:22,263 (trainer:732) INFO: 21epoch:train:32383-34181batch: iter_time=2.133e-04, forward_time=0.202, loss_att=52.393, acc=0.954, loss=52.393, backward_time=0.297, grad_norm=103.705, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.518e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 07:44:29,748 (trainer:732) INFO: 21epoch:train:34182-35980batch: iter_time=2.110e-04, forward_time=0.202, loss_att=52.317, acc=0.955, loss=52.317, backward_time=0.298, grad_norm=98.808, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.510e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 07:52:44,912 (trainer:338) INFO: 21epoch results: [train] iter_time=2.620e-04, forward_time=0.202, loss_att=52.651, acc=0.954, loss=52.651, backward_time=0.297, grad_norm=100.703, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.586e-04, train_time=2.712, time=6 hours, 46 minutes and 54.68 seconds, total_count=755916, gpu_max_cached_mem_GB=30.176, [valid] loss_att=41.924, acc=0.962, cer=0.046, wer=0.143, loss=41.924, time=4 minutes and 27.71 seconds, total_count=252, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 28.7 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 07:52:48,554 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 07:52:48,581 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/11epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 07:52:48,581 (trainer:272) INFO: 22/60epoch started. Estimated time to finish: 1 week, 4 days and 9 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 08:17:15,891 (trainer:732) INFO: 22epoch:train:1-1799batch: iter_time=8.829e-04, forward_time=0.203, loss_att=51.587, acc=0.955, loss=51.587, backward_time=0.298, grad_norm=100.700, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.502e-04, train_time=3.263 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 08:37:20,991 (trainer:732) INFO: 22epoch:train:1800-3598batch: iter_time=2.173e-04, forward_time=0.202, loss_att=52.253, acc=0.955, loss=52.253, backward_time=0.297, grad_norm=103.846, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.495e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 08:57:26,678 (trainer:732) INFO: 22epoch:train:3599-5397batch: iter_time=2.172e-04, forward_time=0.202, loss_att=52.253, acc=0.954, loss=52.253, backward_time=0.297, grad_norm=100.280, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.487e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 09:17:31,199 (trainer:732) INFO: 22epoch:train:5398-7196batch: iter_time=2.130e-04, forward_time=0.202, loss_att=51.361, acc=0.955, loss=51.361, backward_time=0.297, grad_norm=98.455, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.479e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 09:37:38,035 (trainer:732) INFO: 22epoch:train:7197-8995batch: iter_time=2.165e-04, forward_time=0.202, loss_att=51.294, acc=0.956, loss=51.294, backward_time=0.298, grad_norm=99.583, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.472e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 09:57:45,415 (trainer:732) INFO: 22epoch:train:8996-10794batch: iter_time=2.130e-04, forward_time=0.202, loss_att=52.939, acc=0.954, loss=52.939, backward_time=0.298, grad_norm=104.706, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.464e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 10:17:50,830 (trainer:732) INFO: 22epoch:train:10795-12593batch: iter_time=2.130e-04, forward_time=0.202, loss_att=52.134, acc=0.955, loss=52.134, backward_time=0.298, grad_norm=99.996, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.457e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 10:37:53,390 (trainer:732) INFO: 22epoch:train:12594-14392batch: iter_time=2.124e-04, forward_time=0.202, loss_att=51.739, acc=0.954, loss=51.739, backward_time=0.297, grad_norm=104.342, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.449e-04, train_time=2.673 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 10:57:57,908 (trainer:732) INFO: 22epoch:train:14393-16191batch: iter_time=2.156e-04, forward_time=0.202, loss_att=51.908, acc=0.955, loss=51.908, backward_time=0.297, grad_norm=101.805, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.442e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 11:18:02,191 (trainer:732) INFO: 22epoch:train:16192-17990batch: iter_time=2.154e-04, forward_time=0.202, loss_att=51.464, acc=0.955, loss=51.464, backward_time=0.297, grad_norm=94.940, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.434e-04, train_time=2.677 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 11:38:10,112 (trainer:732) INFO: 22epoch:train:17991-19789batch: iter_time=2.175e-04, forward_time=0.202, loss_att=52.975, acc=0.955, loss=52.975, backward_time=0.298, grad_norm=99.709, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.427e-04, train_time=2.685 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 11:58:15,900 (trainer:732) INFO: 22epoch:train:19790-21588batch: iter_time=2.122e-04, forward_time=0.202, loss_att=52.026, acc=0.955, loss=52.026, backward_time=0.298, grad_norm=94.357, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.419e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 12:18:21,150 (trainer:732) INFO: 22epoch:train:21589-23387batch: iter_time=2.124e-04, forward_time=0.202, loss_att=52.326, acc=0.955, loss=52.326, backward_time=0.297, grad_norm=101.413, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.412e-04, train_time=2.680 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 12:38:26,731 (trainer:732) INFO: 22epoch:train:23388-25186batch: iter_time=2.185e-04, forward_time=0.202, loss_att=51.042, acc=0.955, loss=51.042, backward_time=0.297, grad_norm=100.850, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.404e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 12:58:32,594 (trainer:732) INFO: 22epoch:train:25187-26985batch: iter_time=2.200e-04, forward_time=0.202, loss_att=52.211, acc=0.955, loss=52.211, backward_time=0.298, grad_norm=100.367, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.397e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 13:18:39,294 (trainer:732) INFO: 22epoch:train:26986-28784batch: iter_time=2.176e-04, forward_time=0.202, loss_att=52.340, acc=0.955, loss=52.340, backward_time=0.298, grad_norm=106.646, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.390e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 13:38:43,724 (trainer:732) INFO: 22epoch:train:28785-30583batch: iter_time=2.176e-04, forward_time=0.202, loss_att=51.966, acc=0.955, loss=51.966, backward_time=0.297, grad_norm=99.287, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.382e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 13:58:47,393 (trainer:732) INFO: 22epoch:train:30584-32382batch: iter_time=2.186e-04, forward_time=0.202, loss_att=52.552, acc=0.954, loss=52.552, backward_time=0.297, grad_norm=106.255, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.375e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 14:18:54,013 (trainer:732) INFO: 22epoch:train:32383-34181batch: iter_time=2.183e-04, forward_time=0.202, loss_att=51.839, acc=0.955, loss=51.839, backward_time=0.298, grad_norm=97.231, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.368e-04, train_time=2.682 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 14:38:58,955 (trainer:732) INFO: 22epoch:train:34182-35980batch: iter_time=2.140e-04, forward_time=0.202, loss_att=52.083, acc=0.955, loss=52.083, backward_time=0.297, grad_norm=101.344, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.360e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 14:47:25,077 (trainer:338) INFO: 22epoch results: [train] iter_time=2.491e-04, forward_time=0.202, loss_att=52.013, acc=0.955, loss=52.013, backward_time=0.297, grad_norm=100.805, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.431e-04, train_time=2.709, time=6 hours, 46 minutes and 28.08 seconds, total_count=791912, gpu_max_cached_mem_GB=30.176, [valid] loss_att=40.829, acc=0.963, cer=0.047, wer=0.141, loss=40.829, time=4 minutes and 31.77 seconds, total_count=264, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 36.64 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 14:47:28,490 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 14:47:28,498 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/12epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 14:47:28,498 (trainer:272) INFO: 23/60epoch started. Estimated time to finish: 1 week, 4 days and 2 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 15:11:57,931 (trainer:732) INFO: 23epoch:train:1-1799batch: iter_time=8.223e-04, forward_time=0.202, loss_att=50.974, acc=0.955, loss=50.974, backward_time=0.298, grad_norm=98.343, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.353e-04, train_time=3.268 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 15:32:05,239 (trainer:732) INFO: 23epoch:train:1800-3598batch: iter_time=2.248e-04, forward_time=0.202, loss_att=51.250, acc=0.955, loss=51.250, backward_time=0.298, grad_norm=101.969, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.346e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 15:52:11,977 (trainer:732) INFO: 23epoch:train:3599-5397batch: iter_time=2.250e-04, forward_time=0.202, loss_att=50.971, acc=0.956, loss=50.971, backward_time=0.297, grad_norm=96.990, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.339e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 16:12:18,503 (trainer:732) INFO: 23epoch:train:5398-7196batch: iter_time=2.203e-04, forward_time=0.202, loss_att=51.467, acc=0.955, loss=51.467, backward_time=0.298, grad_norm=97.644, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.332e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 16:32:24,210 (trainer:732) INFO: 23epoch:train:7197-8995batch: iter_time=2.199e-04, forward_time=0.202, loss_att=52.107, acc=0.955, loss=52.107, backward_time=0.297, grad_norm=98.862, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.325e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 16:52:33,081 (trainer:732) INFO: 23epoch:train:8996-10794batch: iter_time=2.160e-04, forward_time=0.203, loss_att=51.426, acc=0.956, loss=51.426, backward_time=0.298, grad_norm=95.482, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.317e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 17:12:40,387 (trainer:732) INFO: 23epoch:train:10795-12593batch: iter_time=2.164e-04, forward_time=0.202, loss_att=50.939, acc=0.956, loss=50.939, backward_time=0.297, grad_norm=96.341, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.310e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 17:32:43,841 (trainer:732) INFO: 23epoch:train:12594-14392batch: iter_time=2.186e-04, forward_time=0.202, loss_att=51.337, acc=0.955, loss=51.337, backward_time=0.297, grad_norm=96.960, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.303e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 17:52:48,253 (trainer:732) INFO: 23epoch:train:14393-16191batch: iter_time=2.181e-04, forward_time=0.202, loss_att=51.080, acc=0.955, loss=51.080, backward_time=0.297, grad_norm=96.596, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.296e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 18:12:52,269 (trainer:732) INFO: 23epoch:train:16192-17990batch: iter_time=2.168e-04, forward_time=0.202, loss_att=51.631, acc=0.955, loss=51.631, backward_time=0.297, grad_norm=99.081, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.289e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 18:32:57,537 (trainer:732) INFO: 23epoch:train:17991-19789batch: iter_time=2.181e-04, forward_time=0.202, loss_att=51.346, acc=0.955, loss=51.346, backward_time=0.297, grad_norm=102.864, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.282e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 18:53:01,375 (trainer:732) INFO: 23epoch:train:19790-21588batch: iter_time=2.199e-04, forward_time=0.202, loss_att=51.120, acc=0.955, loss=51.120, backward_time=0.297, grad_norm=100.634, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.275e-04, train_time=2.676 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 19:13:07,969 (trainer:732) INFO: 23epoch:train:21589-23387batch: iter_time=2.233e-04, forward_time=0.202, loss_att=51.201, acc=0.956, loss=51.201, backward_time=0.298, grad_norm=101.977, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.268e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 19:33:11,876 (trainer:732) INFO: 23epoch:train:23388-25186batch: iter_time=2.179e-04, forward_time=0.202, loss_att=50.825, acc=0.956, loss=50.825, backward_time=0.297, grad_norm=100.070, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.261e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 19:53:16,820 (trainer:732) INFO: 23epoch:train:25187-26985batch: iter_time=2.154e-04, forward_time=0.202, loss_att=51.099, acc=0.956, loss=51.099, backward_time=0.297, grad_norm=102.090, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.255e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 20:13:23,592 (trainer:732) INFO: 23epoch:train:26986-28784batch: iter_time=2.128e-04, forward_time=0.202, loss_att=51.118, acc=0.956, loss=51.118, backward_time=0.298, grad_norm=102.864, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.248e-04, train_time=2.683 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 20:33:27,660 (trainer:732) INFO: 23epoch:train:28785-30583batch: iter_time=2.177e-04, forward_time=0.202, loss_att=51.694, acc=0.955, loss=51.694, backward_time=0.297, grad_norm=97.943, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.241e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 20:53:33,628 (trainer:732) INFO: 23epoch:train:30584-32382batch: iter_time=2.125e-04, forward_time=0.202, loss_att=50.777, acc=0.956, loss=50.777, backward_time=0.297, grad_norm=97.824, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.234e-04, train_time=2.681 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 21:13:39,063 (trainer:732) INFO: 23epoch:train:32383-34181batch: iter_time=2.107e-04, forward_time=0.202, loss_att=51.822, acc=0.955, loss=51.822, backward_time=0.297, grad_norm=106.602, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.227e-04, train_time=2.680 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 21:33:44,146 (trainer:732) INFO: 23epoch:train:34182-35980batch: iter_time=2.144e-04, forward_time=0.202, loss_att=51.125, acc=0.956, loss=51.125, backward_time=0.297, grad_norm=104.601, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.220e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 21:41:54,860 (trainer:338) INFO: 23epoch results: [train] iter_time=2.480e-04, forward_time=0.202, loss_att=51.262, acc=0.956, loss=51.262, backward_time=0.297, grad_norm=99.787, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.286e-04, train_time=2.710, time=6 hours, 46 minutes and 32.83 seconds, total_count=827908, gpu_max_cached_mem_GB=30.176, [valid] loss_att=41.010, acc=0.963, cer=0.046, wer=0.140, loss=41.010, time=4 minutes and 22.73 seconds, total_count=276, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 30.8 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 21:41:58,611 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 21:41:58,619 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/13epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 21:41:58,620 (trainer:272) INFO: 24/60epoch started. Estimated time to finish: 1 week, 3 days and 18 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 22:06:29,357 (trainer:732) INFO: 24epoch:train:1-1799batch: iter_time=9.006e-04, forward_time=0.202, loss_att=50.660, acc=0.956, loss=50.660, backward_time=0.298, grad_norm=100.715, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.214e-04, train_time=3.271 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 22:26:37,012 (trainer:732) INFO: 24epoch:train:1800-3598batch: iter_time=2.332e-04, forward_time=0.203, loss_att=50.308, acc=0.956, loss=50.308, backward_time=0.298, grad_norm=105.874, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.207e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 22:46:42,256 (trainer:732) INFO: 24epoch:train:3599-5397batch: iter_time=2.271e-04, forward_time=0.202, loss_att=50.522, acc=0.956, loss=50.522, backward_time=0.297, grad_norm=98.866, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.200e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 23:06:44,386 (trainer:732) INFO: 24epoch:train:5398-7196batch: iter_time=2.250e-04, forward_time=0.201, loss_att=50.305, acc=0.956, loss=50.305, backward_time=0.296, grad_norm=102.931, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.194e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 23:26:50,878 (trainer:732) INFO: 24epoch:train:7197-8995batch: iter_time=2.263e-04, forward_time=0.202, loss_att=50.939, acc=0.956, loss=50.939, backward_time=0.298, grad_norm=104.782, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.187e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-09 23:46:52,651 (trainer:732) INFO: 24epoch:train:8996-10794batch: iter_time=2.265e-04, forward_time=0.202, loss_att=50.258, acc=0.956, loss=50.258, backward_time=0.296, grad_norm=103.750, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.180e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 00:06:56,096 (trainer:732) INFO: 24epoch:train:10795-12593batch: iter_time=2.274e-04, forward_time=0.202, loss_att=51.233, acc=0.956, loss=51.233, backward_time=0.297, grad_norm=102.679, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.174e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 00:27:05,248 (trainer:732) INFO: 24epoch:train:12594-14392batch: iter_time=2.246e-04, forward_time=0.203, loss_att=50.571, acc=0.956, loss=50.571, backward_time=0.298, grad_norm=99.887, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.167e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 00:47:13,613 (trainer:732) INFO: 24epoch:train:14393-16191batch: iter_time=2.245e-04, forward_time=0.202, loss_att=50.506, acc=0.956, loss=50.506, backward_time=0.298, grad_norm=97.469, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.160e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 01:07:19,042 (trainer:732) INFO: 24epoch:train:16192-17990batch: iter_time=2.256e-04, forward_time=0.202, loss_att=50.475, acc=0.956, loss=50.475, backward_time=0.297, grad_norm=96.754, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.154e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 01:27:23,491 (trainer:732) INFO: 24epoch:train:17991-19789batch: iter_time=2.225e-04, forward_time=0.202, loss_att=51.356, acc=0.955, loss=51.356, backward_time=0.297, grad_norm=107.068, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.147e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 01:47:29,719 (trainer:732) INFO: 24epoch:train:19790-21588batch: iter_time=2.197e-04, forward_time=0.202, loss_att=50.442, acc=0.956, loss=50.442, backward_time=0.297, grad_norm=97.649, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.141e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 02:07:35,794 (trainer:732) INFO: 24epoch:train:21589-23387batch: iter_time=2.230e-04, forward_time=0.202, loss_att=51.815, acc=0.955, loss=51.815, backward_time=0.297, grad_norm=100.307, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.134e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 02:27:42,428 (trainer:732) INFO: 24epoch:train:23388-25186batch: iter_time=2.247e-04, forward_time=0.202, loss_att=49.878, acc=0.956, loss=49.878, backward_time=0.297, grad_norm=97.959, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.128e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 02:47:47,565 (trainer:732) INFO: 24epoch:train:25187-26985batch: iter_time=2.213e-04, forward_time=0.202, loss_att=50.663, acc=0.956, loss=50.663, backward_time=0.297, grad_norm=98.961, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.121e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 03:07:53,514 (trainer:732) INFO: 24epoch:train:26986-28784batch: iter_time=2.193e-04, forward_time=0.202, loss_att=49.933, acc=0.956, loss=49.933, backward_time=0.297, grad_norm=97.528, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.115e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 03:27:57,848 (trainer:732) INFO: 24epoch:train:28785-30583batch: iter_time=2.225e-04, forward_time=0.202, loss_att=50.869, acc=0.956, loss=50.869, backward_time=0.297, grad_norm=101.238, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.108e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 03:48:02,725 (trainer:732) INFO: 24epoch:train:30584-32382batch: iter_time=2.235e-04, forward_time=0.202, loss_att=51.268, acc=0.956, loss=51.268, backward_time=0.297, grad_norm=106.087, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=6.102e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 04:08:08,074 (trainer:732) INFO: 24epoch:train:32383-34181batch: iter_time=2.240e-04, forward_time=0.202, loss_att=50.467, acc=0.957, loss=50.467, backward_time=0.298, grad_norm=98.560, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.096e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 04:28:17,421 (trainer:732) INFO: 24epoch:train:34182-35980batch: iter_time=2.233e-04, forward_time=0.203, loss_att=50.932, acc=0.956, loss=50.932, backward_time=0.298, grad_norm=100.028, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.089e-04, train_time=2.688 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 137) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 137) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 04:36:43,057 (trainer:338) INFO: 24epoch results: [train] iter_time=2.582e-04, forward_time=0.202, loss_att=50.669, acc=0.956, loss=50.669, backward_time=0.297, grad_norm=100.947, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.151e-04, train_time=2.710, time=6 hours, 46 minutes and 37.02 seconds, total_count=863904, gpu_max_cached_mem_GB=30.176, [valid] loss_att=41.376, acc=0.963, cer=0.044, wer=0.136, loss=41.376, time=4 minutes and 38.59 seconds, total_count=288, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 28.83 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 04:36:46,961 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 04:36:46,970 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/15epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 04:36:46,970 (trainer:272) INFO: 25/60epoch started. Estimated time to finish: 1 week, 3 days and 11 hours + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<51525> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<40962> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<51687> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<41212> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<52661> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<38243> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<19702> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<20591> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<58291> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<58341> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<37504> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<41472> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<50368> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<37480> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<15892> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<62681> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<35691> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<35887> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<33538> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<47009> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<64539> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<47181> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<32064> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<46766> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<32074> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<29194> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 159) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 159) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<30328> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 159) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 159) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<18826> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<63630> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<63610> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<35032> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<40190> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 05:01:09,734 (trainer:732) INFO: 25epoch:train:1-1799batch: iter_time=8.819e-04, forward_time=0.202, loss_att=50.282, acc=0.957, loss=50.282, backward_time=0.298, grad_norm=100.721, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.083e-04, train_time=3.253 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 157) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 157) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 05:21:14,621 (trainer:732) INFO: 25epoch:train:1800-3598batch: iter_time=2.313e-04, forward_time=0.202, loss_att=49.789, acc=0.957, loss=49.789, backward_time=0.297, grad_norm=104.253, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.077e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 05:41:23,478 (trainer:732) INFO: 25epoch:train:3599-5397batch: iter_time=2.214e-04, forward_time=0.203, loss_att=50.012, acc=0.957, loss=50.012, backward_time=0.298, grad_norm=103.507, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.070e-04, train_time=2.688 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 06:01:31,590 (trainer:732) INFO: 25epoch:train:5398-7196batch: iter_time=2.253e-04, forward_time=0.202, loss_att=50.156, acc=0.957, loss=50.156, backward_time=0.298, grad_norm=106.747, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.064e-04, train_time=2.685 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 06:21:38,782 (trainer:732) INFO: 25epoch:train:7197-8995batch: iter_time=2.207e-04, forward_time=0.202, loss_att=50.039, acc=0.957, loss=50.039, backward_time=0.298, grad_norm=104.067, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.058e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 06:41:44,452 (trainer:732) INFO: 25epoch:train:8996-10794batch: iter_time=2.168e-04, forward_time=0.202, loss_att=51.085, acc=0.956, loss=51.085, backward_time=0.298, grad_norm=101.598, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.052e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 07:01:47,561 (trainer:732) INFO: 25epoch:train:10795-12593batch: iter_time=2.206e-04, forward_time=0.202, loss_att=49.605, acc=0.957, loss=49.605, backward_time=0.296, grad_norm=95.643, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.045e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 07:21:49,379 (trainer:732) INFO: 25epoch:train:12594-14392batch: iter_time=2.162e-04, forward_time=0.201, loss_att=50.004, acc=0.956, loss=50.004, backward_time=0.296, grad_norm=101.867, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.039e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 07:41:53,026 (trainer:732) INFO: 25epoch:train:14393-16191batch: iter_time=2.162e-04, forward_time=0.202, loss_att=49.815, acc=0.957, loss=49.815, backward_time=0.297, grad_norm=105.240, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.033e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 08:01:57,238 (trainer:732) INFO: 25epoch:train:16192-17990batch: iter_time=2.186e-04, forward_time=0.202, loss_att=49.957, acc=0.956, loss=49.957, backward_time=0.297, grad_norm=99.579, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.027e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 08:22:03,555 (trainer:732) INFO: 25epoch:train:17991-19789batch: iter_time=2.182e-04, forward_time=0.202, loss_att=50.368, acc=0.956, loss=50.368, backward_time=0.298, grad_norm=100.098, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.021e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 08:42:07,082 (trainer:732) INFO: 25epoch:train:19790-21588batch: iter_time=2.204e-04, forward_time=0.202, loss_att=49.856, acc=0.956, loss=49.856, backward_time=0.297, grad_norm=108.574, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.015e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 09:02:14,037 (trainer:732) INFO: 25epoch:train:21589-23387batch: iter_time=2.179e-04, forward_time=0.203, loss_att=50.472, acc=0.957, loss=50.472, backward_time=0.298, grad_norm=102.321, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.008e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 09:22:21,084 (trainer:732) INFO: 25epoch:train:23388-25186batch: iter_time=2.221e-04, forward_time=0.202, loss_att=50.087, acc=0.957, loss=50.087, backward_time=0.298, grad_norm=100.029, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=6.002e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 09:42:26,070 (trainer:732) INFO: 25epoch:train:25187-26985batch: iter_time=2.202e-04, forward_time=0.202, loss_att=50.278, acc=0.956, loss=50.278, backward_time=0.297, grad_norm=99.345, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.996e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 10:02:31,615 (trainer:732) INFO: 25epoch:train:26986-28784batch: iter_time=2.218e-04, forward_time=0.202, loss_att=49.555, acc=0.957, loss=49.555, backward_time=0.297, grad_norm=102.196, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.990e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 10:22:34,344 (trainer:732) INFO: 25epoch:train:28785-30583batch: iter_time=2.254e-04, forward_time=0.202, loss_att=50.086, acc=0.956, loss=50.086, backward_time=0.297, grad_norm=94.373, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.984e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 10:42:38,830 (trainer:732) INFO: 25epoch:train:30584-32382batch: iter_time=2.194e-04, forward_time=0.202, loss_att=50.296, acc=0.956, loss=50.296, backward_time=0.297, grad_norm=99.294, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.978e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 11:02:43,020 (trainer:732) INFO: 25epoch:train:32383-34181batch: iter_time=2.188e-04, forward_time=0.202, loss_att=50.225, acc=0.956, loss=50.225, backward_time=0.297, grad_norm=104.503, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.972e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 11:22:52,110 (trainer:732) INFO: 25epoch:train:34182-35980batch: iter_time=2.229e-04, forward_time=0.203, loss_att=50.239, acc=0.957, loss=50.239, backward_time=0.298, grad_norm=99.503, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.966e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 11:31:02,590 (trainer:338) INFO: 25epoch results: [train] iter_time=2.537e-04, forward_time=0.202, loss_att=50.109, acc=0.956, loss=50.109, backward_time=0.297, grad_norm=101.672, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=6.024e-04, train_time=2.708, time=6 hours, 46 minutes and 25.44 seconds, total_count=899900, gpu_max_cached_mem_GB=30.176, [valid] loss_att=39.791, acc=0.965, cer=0.045, wer=0.136, loss=39.791, time=4 minutes and 21.41 seconds, total_count=300, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 28.77 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 11:31:06,104 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 11:31:06,112 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/14epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 11:31:06,113 (trainer:272) INFO: 26/60epoch started. Estimated time to finish: 1 week, 3 days and 4 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 11:55:34,550 (trainer:732) INFO: 26epoch:train:1-1799batch: iter_time=6.804e-04, forward_time=0.202, loss_att=49.343, acc=0.957, loss=49.343, backward_time=0.298, grad_norm=99.932, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.960e-04, train_time=3.266 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 12:15:38,895 (trainer:732) INFO: 26epoch:train:1800-3598batch: iter_time=2.315e-04, forward_time=0.202, loss_att=49.445, acc=0.957, loss=49.445, backward_time=0.297, grad_norm=100.567, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.954e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 12:35:41,395 (trainer:732) INFO: 26epoch:train:3599-5397batch: iter_time=2.300e-04, forward_time=0.202, loss_att=49.264, acc=0.957, loss=49.264, backward_time=0.297, grad_norm=97.669, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.948e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 12:55:46,724 (trainer:732) INFO: 26epoch:train:5398-7196batch: iter_time=2.340e-04, forward_time=0.202, loss_att=49.157, acc=0.957, loss=49.157, backward_time=0.297, grad_norm=100.690, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.942e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 13:15:52,458 (trainer:732) INFO: 26epoch:train:7197-8995batch: iter_time=2.275e-04, forward_time=0.202, loss_att=49.066, acc=0.957, loss=49.066, backward_time=0.298, grad_norm=102.241, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.937e-04, train_time=2.681 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 13:35:56,817 (trainer:732) INFO: 26epoch:train:8996-10794batch: iter_time=2.337e-04, forward_time=0.202, loss_att=48.915, acc=0.957, loss=48.915, backward_time=0.297, grad_norm=100.591, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.931e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 13:56:00,342 (trainer:732) INFO: 26epoch:train:10795-12593batch: iter_time=2.310e-04, forward_time=0.202, loss_att=49.541, acc=0.957, loss=49.541, backward_time=0.297, grad_norm=100.756, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.925e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 14:16:02,411 (trainer:732) INFO: 26epoch:train:12594-14392batch: iter_time=2.275e-04, forward_time=0.202, loss_att=49.452, acc=0.957, loss=49.452, backward_time=0.297, grad_norm=97.050, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.919e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 14:36:10,066 (trainer:732) INFO: 26epoch:train:14393-16191batch: iter_time=2.274e-04, forward_time=0.203, loss_att=49.971, acc=0.957, loss=49.971, backward_time=0.299, grad_norm=102.561, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.913e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 14:56:17,094 (trainer:732) INFO: 26epoch:train:16192-17990batch: iter_time=2.333e-04, forward_time=0.203, loss_att=49.346, acc=0.957, loss=49.346, backward_time=0.298, grad_norm=100.698, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.907e-04, train_time=2.683 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 15:16:25,344 (trainer:732) INFO: 26epoch:train:17991-19789batch: iter_time=2.343e-04, forward_time=0.203, loss_att=49.538, acc=0.957, loss=49.538, backward_time=0.299, grad_norm=106.844, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.902e-04, train_time=2.686 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 15:36:28,980 (trainer:732) INFO: 26epoch:train:19790-21588batch: iter_time=2.226e-04, forward_time=0.202, loss_att=49.710, acc=0.957, loss=49.710, backward_time=0.297, grad_norm=101.537, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.896e-04, train_time=2.676 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 156) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 156) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 15:56:33,767 (trainer:732) INFO: 26epoch:train:21589-23387batch: iter_time=2.262e-04, forward_time=0.202, loss_att=49.697, acc=0.957, loss=49.697, backward_time=0.297, grad_norm=101.277, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.890e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 16:16:29,927 (trainer:732) INFO: 26epoch:train:23388-25186batch: iter_time=2.273e-04, forward_time=0.201, loss_att=49.421, acc=0.956, loss=49.421, backward_time=0.295, grad_norm=103.364, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.884e-04, train_time=2.659 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 16:36:35,242 (trainer:732) INFO: 26epoch:train:25187-26985batch: iter_time=2.248e-04, forward_time=0.202, loss_att=49.647, acc=0.957, loss=49.647, backward_time=0.298, grad_norm=104.715, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.879e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 16:56:42,670 (trainer:732) INFO: 26epoch:train:26986-28784batch: iter_time=2.221e-04, forward_time=0.203, loss_att=50.339, acc=0.957, loss=50.339, backward_time=0.298, grad_norm=100.475, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.873e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 17:16:47,329 (trainer:732) INFO: 26epoch:train:28785-30583batch: iter_time=2.255e-04, forward_time=0.202, loss_att=50.284, acc=0.957, loss=50.284, backward_time=0.298, grad_norm=103.618, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.867e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 17:36:52,101 (trainer:732) INFO: 26epoch:train:30584-32382batch: iter_time=2.198e-04, forward_time=0.202, loss_att=49.704, acc=0.957, loss=49.704, backward_time=0.297, grad_norm=105.462, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.862e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 17:56:57,238 (trainer:732) INFO: 26epoch:train:32383-34181batch: iter_time=2.222e-04, forward_time=0.202, loss_att=49.996, acc=0.957, loss=49.996, backward_time=0.298, grad_norm=105.006, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.856e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 18:17:00,545 (trainer:732) INFO: 26epoch:train:34182-35980batch: iter_time=2.256e-04, forward_time=0.202, loss_att=48.933, acc=0.957, loss=48.933, backward_time=0.297, grad_norm=105.096, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.850e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 18:25:04,552 (trainer:338) INFO: 26epoch results: [train] iter_time=2.503e-04, forward_time=0.202, loss_att=49.536, acc=0.957, loss=49.536, backward_time=0.297, grad_norm=101.998, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.905e-04, train_time=2.707, time=6 hours, 46 minutes and 12.31 seconds, total_count=935896, gpu_max_cached_mem_GB=30.176, [valid] loss_att=39.067, acc=0.965, cer=0.045, wer=0.136, loss=39.067, time=4 minutes and 21.53 seconds, total_count=312, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 24.6 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 18:25:08,038 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 18:25:08,049 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/17epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 18:25:08,049 (trainer:272) INFO: 27/60epoch started. Estimated time to finish: 1 week, 2 days and 21 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 18:49:35,992 (trainer:732) INFO: 27epoch:train:1-1799batch: iter_time=9.108e-04, forward_time=0.202, loss_att=48.360, acc=0.958, loss=48.360, backward_time=0.297, grad_norm=101.087, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.845e-04, train_time=3.265 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 19:09:39,241 (trainer:732) INFO: 27epoch:train:1800-3598batch: iter_time=2.540e-04, forward_time=0.202, loss_att=48.670, acc=0.957, loss=48.670, backward_time=0.297, grad_norm=94.216, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.839e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 19:29:46,576 (trainer:732) INFO: 27epoch:train:3599-5397batch: iter_time=2.508e-04, forward_time=0.203, loss_att=49.530, acc=0.958, loss=49.530, backward_time=0.298, grad_norm=104.170, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.833e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 19:49:53,212 (trainer:732) INFO: 27epoch:train:5398-7196batch: iter_time=2.518e-04, forward_time=0.203, loss_att=49.320, acc=0.958, loss=49.320, backward_time=0.298, grad_norm=98.507, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.828e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 20:10:00,800 (trainer:732) INFO: 27epoch:train:7197-8995batch: iter_time=2.533e-04, forward_time=0.203, loss_att=49.148, acc=0.957, loss=49.148, backward_time=0.298, grad_norm=96.417, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.822e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 20:30:03,812 (trainer:732) INFO: 27epoch:train:8996-10794batch: iter_time=2.492e-04, forward_time=0.203, loss_att=49.868, acc=0.957, loss=49.868, backward_time=0.298, grad_norm=100.343, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.817e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 20:50:05,801 (trainer:732) INFO: 27epoch:train:10795-12593batch: iter_time=2.477e-04, forward_time=0.202, loss_att=48.538, acc=0.957, loss=48.538, backward_time=0.297, grad_norm=97.076, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.811e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 21:10:08,556 (trainer:732) INFO: 27epoch:train:12594-14392batch: iter_time=2.473e-04, forward_time=0.202, loss_att=49.482, acc=0.957, loss=49.482, backward_time=0.297, grad_norm=104.109, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.806e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 21:30:09,240 (trainer:732) INFO: 27epoch:train:14393-16191batch: iter_time=2.482e-04, forward_time=0.202, loss_att=48.612, acc=0.958, loss=48.612, backward_time=0.296, grad_norm=98.748, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.800e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 21:50:17,724 (trainer:732) INFO: 27epoch:train:16192-17990batch: iter_time=2.459e-04, forward_time=0.203, loss_att=49.580, acc=0.958, loss=49.580, backward_time=0.299, grad_norm=97.545, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.795e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 22:10:21,458 (trainer:732) INFO: 27epoch:train:17991-19789batch: iter_time=2.507e-04, forward_time=0.202, loss_att=49.006, acc=0.957, loss=49.006, backward_time=0.297, grad_norm=101.301, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.789e-04, train_time=2.676 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 22:30:26,409 (trainer:732) INFO: 27epoch:train:19790-21588batch: iter_time=2.509e-04, forward_time=0.202, loss_att=49.197, acc=0.957, loss=49.197, backward_time=0.297, grad_norm=100.471, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.784e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 22:50:31,061 (trainer:732) INFO: 27epoch:train:21589-23387batch: iter_time=2.498e-04, forward_time=0.202, loss_att=48.847, acc=0.957, loss=48.847, backward_time=0.297, grad_norm=103.007, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.778e-04, train_time=2.679 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 23:10:38,876 (trainer:732) INFO: 27epoch:train:23388-25186batch: iter_time=2.535e-04, forward_time=0.203, loss_att=49.173, acc=0.958, loss=49.173, backward_time=0.298, grad_norm=101.532, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.773e-04, train_time=2.685 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 23:30:41,976 (trainer:732) INFO: 27epoch:train:25187-26985batch: iter_time=2.558e-04, forward_time=0.202, loss_att=49.042, acc=0.957, loss=49.042, backward_time=0.297, grad_norm=100.835, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.768e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-10 23:50:45,687 (trainer:732) INFO: 27epoch:train:26986-28784batch: iter_time=2.553e-04, forward_time=0.202, loss_att=49.167, acc=0.957, loss=49.167, backward_time=0.297, grad_norm=104.105, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.762e-04, train_time=2.676 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 156) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 156) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 00:10:52,379 (trainer:732) INFO: 27epoch:train:28785-30583batch: iter_time=2.534e-04, forward_time=0.203, loss_att=48.004, acc=0.958, loss=48.004, backward_time=0.298, grad_norm=104.210, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.757e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 00:30:58,318 (trainer:732) INFO: 27epoch:train:30584-32382batch: iter_time=2.509e-04, forward_time=0.203, loss_att=49.078, acc=0.957, loss=49.078, backward_time=0.298, grad_norm=100.136, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.751e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 00:51:04,358 (trainer:732) INFO: 27epoch:train:32383-34181batch: iter_time=2.533e-04, forward_time=0.203, loss_att=48.720, acc=0.957, loss=48.720, backward_time=0.298, grad_norm=103.491, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.746e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 01:11:10,850 (trainer:732) INFO: 27epoch:train:34182-35980batch: iter_time=2.532e-04, forward_time=0.203, loss_att=49.091, acc=0.958, loss=49.091, backward_time=0.298, grad_norm=106.937, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.741e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 01:19:41,708 (trainer:338) INFO: 27epoch results: [train] iter_time=2.842e-04, forward_time=0.202, loss_att=49.023, acc=0.957, loss=49.023, backward_time=0.298, grad_norm=100.923, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.792e-04, train_time=2.708, time=6 hours, 46 minutes and 23.99 seconds, total_count=971892, gpu_max_cached_mem_GB=30.176, [valid] loss_att=38.917, acc=0.965, cer=0.043, wer=0.135, loss=38.917, time=4 minutes and 41.41 seconds, total_count=324, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 28.26 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 01:19:45,554 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 01:19:45,566 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/16epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 01:19:45,566 (trainer:272) INFO: 28/60epoch started. Estimated time to finish: 1 week, 2 days and 14 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 01:44:15,507 (trainer:732) INFO: 28epoch:train:1-1799batch: iter_time=0.001, forward_time=0.202, loss_att=48.189, acc=0.958, loss=48.189, backward_time=0.297, grad_norm=98.750, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.735e-04, train_time=3.269 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 02:04:19,860 (trainer:732) INFO: 28epoch:train:1800-3598batch: iter_time=2.520e-04, forward_time=0.202, loss_att=48.527, acc=0.958, loss=48.527, backward_time=0.297, grad_norm=103.182, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.730e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 02:24:24,258 (trainer:732) INFO: 28epoch:train:3599-5397batch: iter_time=2.441e-04, forward_time=0.202, loss_att=48.823, acc=0.957, loss=48.823, backward_time=0.297, grad_norm=102.501, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.725e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 02:44:29,218 (trainer:732) INFO: 28epoch:train:5398-7196batch: iter_time=2.422e-04, forward_time=0.202, loss_att=48.045, acc=0.958, loss=48.045, backward_time=0.297, grad_norm=102.194, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.720e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 03:04:35,735 (trainer:732) INFO: 28epoch:train:7197-8995batch: iter_time=2.450e-04, forward_time=0.203, loss_att=48.775, acc=0.958, loss=48.775, backward_time=0.298, grad_norm=102.056, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.714e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 03:24:40,048 (trainer:732) INFO: 28epoch:train:8996-10794batch: iter_time=2.439e-04, forward_time=0.202, loss_att=47.925, acc=0.958, loss=47.925, backward_time=0.297, grad_norm=102.743, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.709e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 03:44:46,975 (trainer:732) INFO: 28epoch:train:10795-12593batch: iter_time=2.419e-04, forward_time=0.203, loss_att=48.500, acc=0.958, loss=48.500, backward_time=0.298, grad_norm=94.903, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.704e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 04:04:51,412 (trainer:732) INFO: 28epoch:train:12594-14392batch: iter_time=2.465e-04, forward_time=0.202, loss_att=48.472, acc=0.958, loss=48.472, backward_time=0.297, grad_norm=101.429, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.699e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 04:24:55,212 (trainer:732) INFO: 28epoch:train:14393-16191batch: iter_time=2.434e-04, forward_time=0.202, loss_att=48.848, acc=0.958, loss=48.848, backward_time=0.297, grad_norm=101.598, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.693e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 04:45:01,872 (trainer:732) INFO: 28epoch:train:16192-17990batch: iter_time=2.445e-04, forward_time=0.202, loss_att=48.626, acc=0.958, loss=48.626, backward_time=0.297, grad_norm=97.623, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.688e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 05:05:04,631 (trainer:732) INFO: 28epoch:train:17991-19789batch: iter_time=2.433e-04, forward_time=0.202, loss_att=48.636, acc=0.958, loss=48.636, backward_time=0.297, grad_norm=101.504, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.683e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 05:25:11,565 (trainer:732) INFO: 28epoch:train:19790-21588batch: iter_time=2.424e-04, forward_time=0.203, loss_att=48.995, acc=0.958, loss=48.995, backward_time=0.298, grad_norm=99.338, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.678e-04, train_time=2.683 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 05:45:15,122 (trainer:732) INFO: 28epoch:train:21589-23387batch: iter_time=2.463e-04, forward_time=0.202, loss_att=48.685, acc=0.957, loss=48.685, backward_time=0.297, grad_norm=102.649, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.673e-04, train_time=2.676 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 156) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 156) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 06:05:21,591 (trainer:732) INFO: 28epoch:train:23388-25186batch: iter_time=2.419e-04, forward_time=0.203, loss_att=48.891, acc=0.958, loss=48.891, backward_time=0.298, grad_norm=104.786, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.668e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 06:25:27,033 (trainer:732) INFO: 28epoch:train:25187-26985batch: iter_time=2.408e-04, forward_time=0.202, loss_att=48.388, acc=0.958, loss=48.388, backward_time=0.297, grad_norm=98.593, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.663e-04, train_time=2.680 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 06:45:30,482 (trainer:732) INFO: 28epoch:train:26986-28784batch: iter_time=2.353e-04, forward_time=0.202, loss_att=47.483, acc=0.958, loss=47.483, backward_time=0.297, grad_norm=100.836, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.657e-04, train_time=2.675 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 07:05:36,252 (trainer:732) INFO: 28epoch:train:28785-30583batch: iter_time=2.411e-04, forward_time=0.203, loss_att=48.866, acc=0.958, loss=48.866, backward_time=0.298, grad_norm=102.634, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.652e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 07:25:38,476 (trainer:732) INFO: 28epoch:train:30584-32382batch: iter_time=2.358e-04, forward_time=0.202, loss_att=48.043, acc=0.958, loss=48.043, backward_time=0.297, grad_norm=101.760, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.647e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 07:45:44,285 (trainer:732) INFO: 28epoch:train:32383-34181batch: iter_time=2.385e-04, forward_time=0.202, loss_att=48.741, acc=0.958, loss=48.741, backward_time=0.298, grad_norm=99.831, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.642e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 08:05:52,756 (trainer:732) INFO: 28epoch:train:34182-35980batch: iter_time=2.333e-04, forward_time=0.203, loss_att=49.489, acc=0.957, loss=49.489, backward_time=0.299, grad_norm=98.016, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.637e-04, train_time=2.686 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<63052> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<63230> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<35127> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<27048> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 121) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<50365> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<28072> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 121) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 121) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 121) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 121) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<26671> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 137) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 137) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<47292> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 121) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 121) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 121) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 121) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 121) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<26679> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 128) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 137) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 137) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 137) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 137) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<51871> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 137) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 137) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<49282> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 137) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 137) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<16955> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<38844> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<38860> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<20617> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<20769> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 126) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 135) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 135) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 135) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 135) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 135) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<42479> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<20486> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<51880> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<40136> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 86) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 86) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 86) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 86) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<48002> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 86) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 86) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<48012> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 81) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<62831> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 86) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 86) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 86) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<63853> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 86) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 86) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 86) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 86) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 86) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 86) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 86) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<16505> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<17105> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<36774> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<62467> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 90) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 90) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 90) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 90) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 90) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 90) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<35169> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 90) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 90) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<35180> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<17501> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 90) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<17571> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 90) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 90) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 90) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 90) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 85) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 08:14:01,458 (trainer:338) INFO: 28epoch results: [train] iter_time=2.884e-04, forward_time=0.202, loss_att=48.544, acc=0.958, loss=48.544, backward_time=0.297, grad_norm=100.840, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.686e-04, train_time=2.709, time=6 hours, 46 minutes and 25.12 seconds, total_count=1007888, gpu_max_cached_mem_GB=30.176, [valid] loss_att=39.898, acc=0.965, cer=0.042, wer=0.132, loss=39.898, time=4 minutes and 27.08 seconds, total_count=336, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 23.69 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 08:14:05,191 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 08:14:05,203 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/19epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 08:14:05,203 (trainer:272) INFO: 29/60epoch started. Estimated time to finish: 1 week, 2 days and 7 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 08:38:27,968 (trainer:732) INFO: 29epoch:train:1-1799batch: iter_time=0.001, forward_time=0.203, loss_att=47.296, acc=0.959, loss=47.296, backward_time=0.298, grad_norm=99.375, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.632e-04, train_time=3.253 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 08:58:30,814 (trainer:732) INFO: 29epoch:train:1800-3598batch: iter_time=2.448e-04, forward_time=0.202, loss_att=46.977, acc=0.959, loss=46.977, backward_time=0.297, grad_norm=102.220, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.627e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 09:18:34,934 (trainer:732) INFO: 29epoch:train:3599-5397batch: iter_time=2.433e-04, forward_time=0.202, loss_att=47.703, acc=0.959, loss=47.703, backward_time=0.297, grad_norm=100.613, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.622e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 09:38:42,359 (trainer:732) INFO: 29epoch:train:5398-7196batch: iter_time=2.390e-04, forward_time=0.203, loss_att=48.473, acc=0.958, loss=48.473, backward_time=0.298, grad_norm=98.931, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.617e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 09:58:49,458 (trainer:732) INFO: 29epoch:train:7197-8995batch: iter_time=2.489e-04, forward_time=0.203, loss_att=47.520, acc=0.959, loss=47.520, backward_time=0.298, grad_norm=101.224, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.612e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 10:18:52,335 (trainer:732) INFO: 29epoch:train:8996-10794batch: iter_time=2.423e-04, forward_time=0.202, loss_att=48.506, acc=0.957, loss=48.506, backward_time=0.297, grad_norm=100.608, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.607e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 10:38:58,506 (trainer:732) INFO: 29epoch:train:10795-12593batch: iter_time=2.385e-04, forward_time=0.203, loss_att=48.092, acc=0.958, loss=48.092, backward_time=0.298, grad_norm=108.742, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.602e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 10:59:04,560 (trainer:732) INFO: 29epoch:train:12594-14392batch: iter_time=2.417e-04, forward_time=0.203, loss_att=48.838, acc=0.958, loss=48.838, backward_time=0.298, grad_norm=98.740, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.597e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 11:19:09,332 (trainer:732) INFO: 29epoch:train:14393-16191batch: iter_time=2.478e-04, forward_time=0.202, loss_att=47.911, acc=0.958, loss=47.911, backward_time=0.297, grad_norm=104.724, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.592e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 11:39:12,877 (trainer:732) INFO: 29epoch:train:16192-17990batch: iter_time=2.440e-04, forward_time=0.202, loss_att=47.649, acc=0.958, loss=47.649, backward_time=0.297, grad_norm=101.961, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.587e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 11:59:16,485 (trainer:732) INFO: 29epoch:train:17991-19789batch: iter_time=2.406e-04, forward_time=0.202, loss_att=48.259, acc=0.958, loss=48.259, backward_time=0.297, grad_norm=102.571, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.583e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 12:19:22,022 (trainer:732) INFO: 29epoch:train:19790-21588batch: iter_time=2.410e-04, forward_time=0.203, loss_att=48.855, acc=0.958, loss=48.855, backward_time=0.298, grad_norm=105.046, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.578e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 12:39:26,681 (trainer:732) INFO: 29epoch:train:21589-23387batch: iter_time=2.346e-04, forward_time=0.202, loss_att=48.470, acc=0.958, loss=48.470, backward_time=0.298, grad_norm=95.927, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.573e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 12:59:32,627 (trainer:732) INFO: 29epoch:train:23388-25186batch: iter_time=2.401e-04, forward_time=0.202, loss_att=48.346, acc=0.958, loss=48.346, backward_time=0.298, grad_norm=100.256, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.568e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 13:19:38,065 (trainer:732) INFO: 29epoch:train:25187-26985batch: iter_time=2.412e-04, forward_time=0.202, loss_att=48.379, acc=0.958, loss=48.379, backward_time=0.297, grad_norm=106.891, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.563e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 13:39:40,561 (trainer:732) INFO: 29epoch:train:26986-28784batch: iter_time=2.391e-04, forward_time=0.202, loss_att=48.110, acc=0.958, loss=48.110, backward_time=0.297, grad_norm=98.582, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.558e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 13:59:49,097 (trainer:732) INFO: 29epoch:train:28785-30583batch: iter_time=2.432e-04, forward_time=0.203, loss_att=48.521, acc=0.958, loss=48.521, backward_time=0.299, grad_norm=106.426, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.553e-04, train_time=2.687 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 157) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 158) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 14:19:53,305 (trainer:732) INFO: 29epoch:train:30584-32382batch: iter_time=2.387e-04, forward_time=0.202, loss_att=47.939, acc=0.958, loss=47.939, backward_time=0.297, grad_norm=100.354, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.549e-04, train_time=2.677 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 144) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 14:39:55,441 (trainer:732) INFO: 29epoch:train:32383-34181batch: iter_time=2.402e-04, forward_time=0.202, loss_att=48.016, acc=0.958, loss=48.016, backward_time=0.297, grad_norm=100.212, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.544e-04, train_time=2.672 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.205<24943> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 157) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 14:59:58,678 (trainer:732) INFO: 29epoch:train:34182-35980batch: iter_time=2.377e-04, forward_time=0.202, loss_att=47.928, acc=0.958, loss=47.928, backward_time=0.297, grad_norm=101.446, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.539e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 15:08:26,995 (trainer:338) INFO: 29epoch results: [train] iter_time=2.851e-04, forward_time=0.202, loss_att=48.087, acc=0.958, loss=48.087, backward_time=0.298, grad_norm=101.741, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.585e-04, train_time=2.707, time=6 hours, 46 minutes and 13.55 seconds, total_count=1043884, gpu_max_cached_mem_GB=30.176, [valid] loss_att=38.947, acc=0.965, cer=0.045, wer=0.136, loss=38.947, time=4 minutes and 37.66 seconds, total_count=348, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 30.57 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 15:08:30,577 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 15:08:30,586 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/20epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 15:08:30,586 (trainer:272) INFO: 30/60epoch started. Estimated time to finish: 1 week, 2 days and 21 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 15:33:20,345 (trainer:732) INFO: 30epoch:train:1-1799batch: iter_time=8.471e-04, forward_time=0.203, loss_att=47.151, acc=0.959, loss=47.151, backward_time=0.298, grad_norm=110.272, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.534e-04, train_time=3.313 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 15:53:27,726 (trainer:732) INFO: 30epoch:train:1800-3598batch: iter_time=2.606e-04, forward_time=0.203, loss_att=47.189, acc=0.959, loss=47.189, backward_time=0.298, grad_norm=96.689, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.530e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 16:13:35,477 (trainer:732) INFO: 30epoch:train:3599-5397batch: iter_time=2.556e-04, forward_time=0.202, loss_att=48.232, acc=0.959, loss=48.232, backward_time=0.298, grad_norm=100.839, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.525e-04, train_time=2.685 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 16:33:39,343 (trainer:732) INFO: 30epoch:train:5398-7196batch: iter_time=2.570e-04, forward_time=0.202, loss_att=47.731, acc=0.959, loss=47.731, backward_time=0.297, grad_norm=106.753, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.520e-04, train_time=2.676 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 157) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 157) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 16:53:45,526 (trainer:732) INFO: 30epoch:train:7197-8995batch: iter_time=2.502e-04, forward_time=0.202, loss_att=46.832, acc=0.959, loss=46.832, backward_time=0.297, grad_norm=101.563, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.515e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 17:13:50,599 (trainer:732) INFO: 30epoch:train:8996-10794batch: iter_time=2.452e-04, forward_time=0.202, loss_att=48.010, acc=0.958, loss=48.010, backward_time=0.297, grad_norm=98.760, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.511e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 17:33:55,485 (trainer:732) INFO: 30epoch:train:10795-12593batch: iter_time=2.526e-04, forward_time=0.202, loss_att=47.678, acc=0.959, loss=47.678, backward_time=0.297, grad_norm=101.129, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.506e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 17:54:01,480 (trainer:732) INFO: 30epoch:train:12594-14392batch: iter_time=2.524e-04, forward_time=0.202, loss_att=47.984, acc=0.958, loss=47.984, backward_time=0.298, grad_norm=102.612, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.501e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 18:14:05,466 (trainer:732) INFO: 30epoch:train:14393-16191batch: iter_time=2.552e-04, forward_time=0.202, loss_att=47.536, acc=0.959, loss=47.536, backward_time=0.297, grad_norm=98.676, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.497e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 18:34:09,131 (trainer:732) INFO: 30epoch:train:16192-17990batch: iter_time=2.503e-04, forward_time=0.202, loss_att=47.534, acc=0.959, loss=47.534, backward_time=0.297, grad_norm=106.622, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.492e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 18:54:14,459 (trainer:732) INFO: 30epoch:train:17991-19789batch: iter_time=2.548e-04, forward_time=0.203, loss_att=47.264, acc=0.959, loss=47.264, backward_time=0.298, grad_norm=100.749, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.487e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 19:14:18,964 (trainer:732) INFO: 30epoch:train:19790-21588batch: iter_time=2.553e-04, forward_time=0.202, loss_att=47.740, acc=0.958, loss=47.740, backward_time=0.297, grad_norm=100.316, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.483e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 19:34:26,289 (trainer:732) INFO: 30epoch:train:21589-23387batch: iter_time=2.500e-04, forward_time=0.203, loss_att=48.092, acc=0.958, loss=48.092, backward_time=0.298, grad_norm=108.110, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.478e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 19:54:31,704 (trainer:732) INFO: 30epoch:train:23388-25186batch: iter_time=2.479e-04, forward_time=0.202, loss_att=47.981, acc=0.958, loss=47.981, backward_time=0.297, grad_norm=106.577, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.473e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 20:14:34,040 (trainer:732) INFO: 30epoch:train:25187-26985batch: iter_time=2.438e-04, forward_time=0.202, loss_att=47.466, acc=0.958, loss=47.466, backward_time=0.297, grad_norm=104.684, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.469e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 20:34:35,385 (trainer:732) INFO: 30epoch:train:26986-28784batch: iter_time=2.484e-04, forward_time=0.202, loss_att=47.211, acc=0.959, loss=47.211, backward_time=0.297, grad_norm=102.799, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.464e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 20:54:41,248 (trainer:732) INFO: 30epoch:train:28785-30583batch: iter_time=2.520e-04, forward_time=0.202, loss_att=48.525, acc=0.958, loss=48.525, backward_time=0.298, grad_norm=100.447, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.460e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 21:14:45,121 (trainer:732) INFO: 30epoch:train:30584-32382batch: iter_time=2.458e-04, forward_time=0.202, loss_att=48.063, acc=0.958, loss=48.063, backward_time=0.297, grad_norm=107.641, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.455e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 21:34:50,151 (trainer:732) INFO: 30epoch:train:32383-34181batch: iter_time=2.470e-04, forward_time=0.202, loss_att=47.538, acc=0.959, loss=47.538, backward_time=0.298, grad_norm=104.894, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.450e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 21:54:56,959 (trainer:732) INFO: 30epoch:train:34182-35980batch: iter_time=2.482e-04, forward_time=0.203, loss_att=47.659, acc=0.959, loss=47.659, backward_time=0.298, grad_norm=106.015, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.446e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 22:03:09,486 (trainer:338) INFO: 30epoch results: [train] iter_time=2.809e-04, forward_time=0.202, loss_att=47.670, acc=0.959, loss=47.670, backward_time=0.297, grad_norm=103.302, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.490e-04, train_time=2.711, time=6 hours, 46 minutes and 47.56 seconds, total_count=1079880, gpu_max_cached_mem_GB=30.176, [valid] loss_att=38.164, acc=0.966, cer=0.042, wer=0.131, loss=38.164, time=4 minutes and 21.09 seconds, total_count=360, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 30.25 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 22:03:13,516 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 22:03:13,526 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/18epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 22:03:13,526 (trainer:272) INFO: 31/60epoch started. Estimated time to finish: 1 week, 1 day and 17 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 22:27:41,664 (trainer:732) INFO: 31epoch:train:1-1799batch: iter_time=8.815e-04, forward_time=0.202, loss_att=46.667, acc=0.960, loss=46.667, backward_time=0.297, grad_norm=101.289, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.441e-04, train_time=3.265 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 22:47:44,358 (trainer:732) INFO: 31epoch:train:1800-3598batch: iter_time=2.560e-04, forward_time=0.202, loss_att=45.845, acc=0.960, loss=45.845, backward_time=0.297, grad_norm=101.152, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.437e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 23:07:49,905 (trainer:732) INFO: 31epoch:train:3599-5397batch: iter_time=2.491e-04, forward_time=0.202, loss_att=47.816, acc=0.958, loss=47.816, backward_time=0.297, grad_norm=97.757, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.432e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 23:27:53,410 (trainer:732) INFO: 31epoch:train:5398-7196batch: iter_time=2.474e-04, forward_time=0.202, loss_att=47.022, acc=0.959, loss=47.022, backward_time=0.297, grad_norm=100.994, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.428e-04, train_time=2.675 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 157) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 157) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-11 23:47:58,719 (trainer:732) INFO: 31epoch:train:7197-8995batch: iter_time=2.457e-04, forward_time=0.202, loss_att=47.045, acc=0.959, loss=47.045, backward_time=0.297, grad_norm=103.736, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.423e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 00:08:08,518 (trainer:732) INFO: 31epoch:train:8996-10794batch: iter_time=2.457e-04, forward_time=0.203, loss_att=47.105, acc=0.959, loss=47.105, backward_time=0.299, grad_norm=107.790, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.419e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 00:28:15,503 (trainer:732) INFO: 31epoch:train:10795-12593batch: iter_time=2.439e-04, forward_time=0.203, loss_att=47.416, acc=0.959, loss=47.416, backward_time=0.298, grad_norm=103.351, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.414e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 00:48:23,404 (trainer:732) INFO: 31epoch:train:12594-14392batch: iter_time=2.452e-04, forward_time=0.203, loss_att=48.062, acc=0.959, loss=48.062, backward_time=0.298, grad_norm=101.316, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.410e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 01:08:28,570 (trainer:732) INFO: 31epoch:train:14393-16191batch: iter_time=2.488e-04, forward_time=0.202, loss_att=47.212, acc=0.959, loss=47.212, backward_time=0.297, grad_norm=102.858, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.405e-04, train_time=2.680 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 01:28:32,057 (trainer:732) INFO: 31epoch:train:16192-17990batch: iter_time=2.465e-04, forward_time=0.202, loss_att=47.694, acc=0.959, loss=47.694, backward_time=0.297, grad_norm=105.646, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.401e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 01:48:40,516 (trainer:732) INFO: 31epoch:train:17991-19789batch: iter_time=2.380e-04, forward_time=0.203, loss_att=47.696, acc=0.959, loss=47.696, backward_time=0.299, grad_norm=100.399, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.397e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 02:08:44,631 (trainer:732) INFO: 31epoch:train:19790-21588batch: iter_time=2.477e-04, forward_time=0.202, loss_att=47.146, acc=0.959, loss=47.146, backward_time=0.297, grad_norm=103.790, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.392e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 02:28:49,887 (trainer:732) INFO: 31epoch:train:21589-23387batch: iter_time=2.429e-04, forward_time=0.202, loss_att=46.626, acc=0.960, loss=46.626, backward_time=0.298, grad_norm=105.931, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.388e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 02:48:52,876 (trainer:732) INFO: 31epoch:train:23388-25186batch: iter_time=2.464e-04, forward_time=0.202, loss_att=47.196, acc=0.959, loss=47.196, backward_time=0.297, grad_norm=102.878, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.383e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 03:08:56,844 (trainer:732) INFO: 31epoch:train:25187-26985batch: iter_time=2.433e-04, forward_time=0.202, loss_att=47.337, acc=0.959, loss=47.337, backward_time=0.297, grad_norm=101.994, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.379e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 03:29:04,123 (trainer:732) INFO: 31epoch:train:26986-28784batch: iter_time=2.425e-04, forward_time=0.203, loss_att=47.443, acc=0.959, loss=47.443, backward_time=0.298, grad_norm=113.521, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.375e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 03:49:06,457 (trainer:732) INFO: 31epoch:train:28785-30583batch: iter_time=2.393e-04, forward_time=0.202, loss_att=47.805, acc=0.958, loss=47.805, backward_time=0.297, grad_norm=102.381, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.370e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 04:09:10,336 (trainer:732) INFO: 31epoch:train:30584-32382batch: iter_time=2.385e-04, forward_time=0.202, loss_att=47.188, acc=0.959, loss=47.188, backward_time=0.297, grad_norm=104.426, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=5.366e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 04:29:14,257 (trainer:732) INFO: 31epoch:train:32383-34181batch: iter_time=2.400e-04, forward_time=0.202, loss_att=47.355, acc=0.959, loss=47.355, backward_time=0.297, grad_norm=101.556, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.362e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 04:49:20,917 (trainer:732) INFO: 31epoch:train:34182-35980batch: iter_time=2.446e-04, forward_time=0.202, loss_att=47.809, acc=0.959, loss=47.809, backward_time=0.298, grad_norm=104.543, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.357e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 04:57:48,348 (trainer:338) INFO: 31epoch results: [train] iter_time=2.766e-04, forward_time=0.202, loss_att=47.274, acc=0.959, loss=47.274, backward_time=0.297, grad_norm=103.360, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.399e-04, train_time=2.709, time=6 hours, 46 minutes and 25.22 seconds, total_count=1115876, gpu_max_cached_mem_GB=30.176, [valid] loss_att=37.896, acc=0.966, cer=0.042, wer=0.131, loss=37.896, time=4 minutes and 40.52 seconds, total_count=372, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 29.08 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 04:57:52,069 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 04:57:52,099 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/21epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 04:57:52,100 (trainer:272) INFO: 32/60epoch started. Estimated time to finish: 1 week, 1 day and 10 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 05:22:20,599 (trainer:732) INFO: 32epoch:train:1-1799batch: iter_time=0.001, forward_time=0.203, loss_att=47.077, acc=0.959, loss=47.077, backward_time=0.298, grad_norm=101.390, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.353e-04, train_time=3.266 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 05:42:25,854 (trainer:732) INFO: 32epoch:train:1800-3598batch: iter_time=2.553e-04, forward_time=0.202, loss_att=46.832, acc=0.959, loss=46.832, backward_time=0.298, grad_norm=107.569, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.349e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 06:02:27,549 (trainer:732) INFO: 32epoch:train:3599-5397batch: iter_time=2.540e-04, forward_time=0.202, loss_att=45.823, acc=0.960, loss=45.823, backward_time=0.296, grad_norm=99.521, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.344e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 06:22:32,778 (trainer:732) INFO: 32epoch:train:5398-7196batch: iter_time=2.515e-04, forward_time=0.202, loss_att=46.427, acc=0.960, loss=46.427, backward_time=0.297, grad_norm=102.391, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.340e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 06:42:39,779 (trainer:732) INFO: 32epoch:train:7197-8995batch: iter_time=2.527e-04, forward_time=0.203, loss_att=47.220, acc=0.959, loss=47.220, backward_time=0.298, grad_norm=105.091, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.336e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 07:02:42,466 (trainer:732) INFO: 32epoch:train:8996-10794batch: iter_time=2.523e-04, forward_time=0.202, loss_att=46.373, acc=0.959, loss=46.373, backward_time=0.297, grad_norm=99.961, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.332e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 07:22:44,676 (trainer:732) INFO: 32epoch:train:10795-12593batch: iter_time=2.552e-04, forward_time=0.202, loss_att=46.546, acc=0.959, loss=46.546, backward_time=0.297, grad_norm=102.252, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.327e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 07:42:49,171 (trainer:732) INFO: 32epoch:train:12594-14392batch: iter_time=2.517e-04, forward_time=0.202, loss_att=47.627, acc=0.959, loss=47.627, backward_time=0.297, grad_norm=105.281, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.323e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 08:02:55,533 (trainer:732) INFO: 32epoch:train:14393-16191batch: iter_time=2.470e-04, forward_time=0.203, loss_att=47.865, acc=0.959, loss=47.865, backward_time=0.298, grad_norm=99.876, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.319e-04, train_time=2.682 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 08:23:00,745 (trainer:732) INFO: 32epoch:train:16192-17990batch: iter_time=2.444e-04, forward_time=0.202, loss_att=46.652, acc=0.959, loss=46.652, backward_time=0.297, grad_norm=104.004, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.315e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 08:43:03,468 (trainer:732) INFO: 32epoch:train:17991-19789batch: iter_time=2.492e-04, forward_time=0.202, loss_att=46.516, acc=0.959, loss=46.516, backward_time=0.297, grad_norm=99.718, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.310e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 09:03:08,757 (trainer:732) INFO: 32epoch:train:19790-21588batch: iter_time=2.452e-04, forward_time=0.202, loss_att=46.964, acc=0.959, loss=46.964, backward_time=0.298, grad_norm=101.339, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.306e-04, train_time=2.679 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 09:23:10,147 (trainer:732) INFO: 32epoch:train:21589-23387batch: iter_time=2.476e-04, forward_time=0.202, loss_att=47.371, acc=0.959, loss=47.371, backward_time=0.297, grad_norm=104.437, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.302e-04, train_time=2.671 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 09:43:15,758 (trainer:732) INFO: 32epoch:train:23388-25186batch: iter_time=2.451e-04, forward_time=0.202, loss_att=46.831, acc=0.959, loss=46.831, backward_time=0.298, grad_norm=103.473, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.298e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 10:03:23,352 (trainer:732) INFO: 32epoch:train:25187-26985batch: iter_time=2.410e-04, forward_time=0.203, loss_att=46.761, acc=0.960, loss=46.761, backward_time=0.298, grad_norm=108.045, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.294e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 10:23:26,899 (trainer:732) INFO: 32epoch:train:26986-28784batch: iter_time=2.468e-04, forward_time=0.202, loss_att=47.323, acc=0.959, loss=47.323, backward_time=0.297, grad_norm=104.945, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.289e-04, train_time=2.676 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 157) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 157) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 10:43:34,410 (trainer:732) INFO: 32epoch:train:28785-30583batch: iter_time=2.480e-04, forward_time=0.203, loss_att=47.874, acc=0.959, loss=47.874, backward_time=0.298, grad_norm=103.110, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.285e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 11:03:39,793 (trainer:732) INFO: 32epoch:train:30584-32382batch: iter_time=2.486e-04, forward_time=0.202, loss_att=46.825, acc=0.959, loss=46.825, backward_time=0.298, grad_norm=99.666, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.281e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 11:23:45,451 (trainer:732) INFO: 32epoch:train:32383-34181batch: iter_time=2.456e-04, forward_time=0.202, loss_att=46.534, acc=0.959, loss=46.534, backward_time=0.297, grad_norm=150.377, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.277e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 11:43:51,497 (trainer:732) INFO: 32epoch:train:34182-35980batch: iter_time=2.529e-04, forward_time=0.203, loss_att=46.489, acc=0.960, loss=46.489, backward_time=0.298, grad_norm=102.203, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.273e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 11:51:54,464 (trainer:338) INFO: 32epoch results: [train] iter_time=2.886e-04, forward_time=0.202, loss_att=46.895, acc=0.959, loss=46.895, backward_time=0.297, grad_norm=105.225, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.313e-04, train_time=2.708, time=6 hours, 46 minutes and 16.81 seconds, total_count=1151872, gpu_max_cached_mem_GB=30.176, [valid] loss_att=37.624, acc=0.967, cer=0.042, wer=0.130, loss=37.624, time=4 minutes and 19.89 seconds, total_count=384, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 25.67 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 11:51:58,220 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 11:51:58,230 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/23epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 11:51:58,230 (trainer:272) INFO: 33/60epoch started. Estimated time to finish: 1 week, 1 day and 3 hours +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 12:16:23,234 (trainer:732) INFO: 33epoch:train:1-1799batch: iter_time=8.903e-04, forward_time=0.203, loss_att=45.748, acc=0.960, loss=45.748, backward_time=0.298, grad_norm=104.952, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.269e-04, train_time=3.258 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 12:36:29,672 (trainer:732) INFO: 33epoch:train:1800-3598batch: iter_time=2.615e-04, forward_time=0.203, loss_att=46.380, acc=0.959, loss=46.380, backward_time=0.298, grad_norm=98.072, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=5.265e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 12:56:37,935 (trainer:732) INFO: 33epoch:train:3599-5397batch: iter_time=2.717e-04, forward_time=0.203, loss_att=46.994, acc=0.960, loss=46.994, backward_time=0.298, grad_norm=101.108, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=5.260e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 13:16:48,209 (trainer:732) INFO: 33epoch:train:5398-7196batch: iter_time=2.728e-04, forward_time=0.203, loss_att=46.641, acc=0.960, loss=46.641, backward_time=0.298, grad_norm=105.636, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=5.256e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 13:36:55,459 (trainer:732) INFO: 33epoch:train:7197-8995batch: iter_time=2.570e-04, forward_time=0.203, loss_att=46.687, acc=0.960, loss=46.687, backward_time=0.298, grad_norm=108.212, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.252e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 13:56:57,842 (trainer:732) INFO: 33epoch:train:8996-10794batch: iter_time=2.346e-04, forward_time=0.202, loss_att=46.375, acc=0.959, loss=46.375, backward_time=0.297, grad_norm=100.182, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.248e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 14:17:01,486 (trainer:732) INFO: 33epoch:train:10795-12593batch: iter_time=2.362e-04, forward_time=0.202, loss_att=45.518, acc=0.960, loss=45.518, backward_time=0.297, grad_norm=100.247, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.244e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 14:37:04,735 (trainer:732) INFO: 33epoch:train:12594-14392batch: iter_time=2.359e-04, forward_time=0.202, loss_att=46.204, acc=0.960, loss=46.204, backward_time=0.297, grad_norm=102.136, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.240e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 14:57:07,753 (trainer:732) INFO: 33epoch:train:14393-16191batch: iter_time=2.375e-04, forward_time=0.202, loss_att=46.753, acc=0.959, loss=46.753, backward_time=0.297, grad_norm=102.316, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.236e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 15:17:09,326 (trainer:732) INFO: 33epoch:train:16192-17990batch: iter_time=2.360e-04, forward_time=0.202, loss_att=46.455, acc=0.959, loss=46.455, backward_time=0.297, grad_norm=108.136, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.232e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 15:37:14,833 (trainer:732) INFO: 33epoch:train:17991-19789batch: iter_time=2.314e-04, forward_time=0.202, loss_att=46.900, acc=0.959, loss=46.900, backward_time=0.298, grad_norm=101.038, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.228e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 15:57:25,065 (trainer:732) INFO: 33epoch:train:19790-21588batch: iter_time=2.362e-04, forward_time=0.203, loss_att=47.148, acc=0.960, loss=47.148, backward_time=0.299, grad_norm=103.547, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.224e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 16:17:31,215 (trainer:732) INFO: 33epoch:train:21589-23387batch: iter_time=2.405e-04, forward_time=0.203, loss_att=46.994, acc=0.959, loss=46.994, backward_time=0.298, grad_norm=103.735, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.220e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 16:37:40,718 (trainer:732) INFO: 33epoch:train:23388-25186batch: iter_time=2.424e-04, forward_time=0.203, loss_att=46.580, acc=0.960, loss=46.580, backward_time=0.299, grad_norm=100.481, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.216e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 16:57:45,322 (trainer:732) INFO: 33epoch:train:25187-26985batch: iter_time=2.422e-04, forward_time=0.202, loss_att=47.243, acc=0.959, loss=47.243, backward_time=0.298, grad_norm=104.214, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.212e-04, train_time=2.678 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 157) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 157) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 17:17:51,696 (trainer:732) INFO: 33epoch:train:26986-28784batch: iter_time=2.376e-04, forward_time=0.202, loss_att=47.052, acc=0.960, loss=47.052, backward_time=0.298, grad_norm=101.963, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.208e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 17:37:54,799 (trainer:732) INFO: 33epoch:train:28785-30583batch: iter_time=2.427e-04, forward_time=0.202, loss_att=46.267, acc=0.959, loss=46.267, backward_time=0.297, grad_norm=107.103, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.204e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 17:57:56,734 (trainer:732) INFO: 33epoch:train:30584-32382batch: iter_time=2.432e-04, forward_time=0.202, loss_att=45.383, acc=0.960, loss=45.383, backward_time=0.297, grad_norm=101.070, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.200e-04, train_time=2.672 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 18:18:02,190 (trainer:732) INFO: 33epoch:train:32383-34181batch: iter_time=2.441e-04, forward_time=0.203, loss_att=46.438, acc=0.960, loss=46.438, backward_time=0.298, grad_norm=110.436, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.196e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 18:38:04,146 (trainer:732) INFO: 33epoch:train:34182-35980batch: iter_time=2.413e-04, forward_time=0.202, loss_att=46.760, acc=0.959, loss=46.760, backward_time=0.297, grad_norm=103.612, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.192e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 18:46:14,046 (trainer:338) INFO: 33epoch results: [train] iter_time=2.767e-04, forward_time=0.202, loss_att=46.522, acc=0.960, loss=46.522, backward_time=0.298, grad_norm=103.412, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.230e-04, train_time=2.709, time=6 hours, 46 minutes and 26.49 seconds, total_count=1187868, gpu_max_cached_mem_GB=30.176, [valid] loss_att=37.921, acc=0.966, cer=0.040, wer=0.128, loss=37.921, time=4 minutes and 18.01 seconds, total_count=396, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 31.32 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 18:46:17,778 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 18:46:17,788 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/24epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 18:46:17,789 (trainer:272) INFO: 34/60epoch started. Estimated time to finish: 1 week, 20 hours and 11 minutes + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 19:10:50,158 (trainer:732) INFO: 34epoch:train:1-1799batch: iter_time=0.001, forward_time=0.202, loss_att=45.893, acc=0.960, loss=45.893, backward_time=0.297, grad_norm=97.035, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.188e-04, train_time=3.275 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 19:30:50,642 (trainer:732) INFO: 34epoch:train:1800-3598batch: iter_time=2.378e-04, forward_time=0.202, loss_att=45.257, acc=0.960, loss=45.257, backward_time=0.296, grad_norm=103.431, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.184e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 19:50:54,327 (trainer:732) INFO: 34epoch:train:3599-5397batch: iter_time=2.321e-04, forward_time=0.202, loss_att=45.691, acc=0.960, loss=45.691, backward_time=0.297, grad_norm=101.830, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.180e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 20:10:58,956 (trainer:732) INFO: 34epoch:train:5398-7196batch: iter_time=2.316e-04, forward_time=0.202, loss_att=46.060, acc=0.960, loss=46.060, backward_time=0.297, grad_norm=103.380, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.177e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 20:31:02,519 (trainer:732) INFO: 34epoch:train:7197-8995batch: iter_time=2.345e-04, forward_time=0.202, loss_att=46.447, acc=0.960, loss=46.447, backward_time=0.297, grad_norm=105.428, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.173e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 20:51:09,315 (trainer:732) INFO: 34epoch:train:8996-10794batch: iter_time=2.445e-04, forward_time=0.203, loss_att=45.838, acc=0.960, loss=45.838, backward_time=0.298, grad_norm=102.043, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.169e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 21:11:15,969 (trainer:732) INFO: 34epoch:train:10795-12593batch: iter_time=2.344e-04, forward_time=0.203, loss_att=46.101, acc=0.960, loss=46.101, backward_time=0.298, grad_norm=96.326, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.165e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 21:31:20,937 (trainer:732) INFO: 34epoch:train:12594-14392batch: iter_time=2.349e-04, forward_time=0.202, loss_att=46.445, acc=0.960, loss=46.445, backward_time=0.297, grad_norm=106.567, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.161e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 21:51:25,710 (trainer:732) INFO: 34epoch:train:14393-16191batch: iter_time=2.311e-04, forward_time=0.202, loss_att=45.618, acc=0.960, loss=45.618, backward_time=0.297, grad_norm=97.023, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.157e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 22:11:31,873 (trainer:732) INFO: 34epoch:train:16192-17990batch: iter_time=2.338e-04, forward_time=0.203, loss_att=46.212, acc=0.960, loss=46.212, backward_time=0.298, grad_norm=107.672, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.153e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 22:31:35,304 (trainer:732) INFO: 34epoch:train:17991-19789batch: iter_time=2.381e-04, forward_time=0.202, loss_att=46.841, acc=0.959, loss=46.841, backward_time=0.297, grad_norm=106.806, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.149e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 22:51:44,506 (trainer:732) INFO: 34epoch:train:19790-21588batch: iter_time=2.371e-04, forward_time=0.203, loss_att=46.604, acc=0.960, loss=46.604, backward_time=0.299, grad_norm=100.440, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.146e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 23:11:50,860 (trainer:732) INFO: 34epoch:train:21589-23387batch: iter_time=2.375e-04, forward_time=0.202, loss_att=45.811, acc=0.960, loss=45.811, backward_time=0.298, grad_norm=102.727, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.142e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 23:31:58,593 (trainer:732) INFO: 34epoch:train:23388-25186batch: iter_time=2.390e-04, forward_time=0.203, loss_att=46.396, acc=0.960, loss=46.396, backward_time=0.298, grad_norm=101.936, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.138e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-12 23:52:04,152 (trainer:732) INFO: 34epoch:train:25187-26985batch: iter_time=2.391e-04, forward_time=0.202, loss_att=46.559, acc=0.960, loss=46.559, backward_time=0.298, grad_norm=105.432, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.134e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 00:12:11,696 (trainer:732) INFO: 34epoch:train:26986-28784batch: iter_time=2.400e-04, forward_time=0.203, loss_att=45.912, acc=0.960, loss=45.912, backward_time=0.298, grad_norm=103.038, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.130e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 00:32:18,518 (trainer:732) INFO: 34epoch:train:28785-30583batch: iter_time=2.390e-04, forward_time=0.203, loss_att=46.509, acc=0.960, loss=46.509, backward_time=0.298, grad_norm=100.151, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.127e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 00:52:24,033 (trainer:732) INFO: 34epoch:train:30584-32382batch: iter_time=2.354e-04, forward_time=0.202, loss_att=46.437, acc=0.960, loss=46.437, backward_time=0.297, grad_norm=96.479, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.123e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 01:12:28,240 (trainer:732) INFO: 34epoch:train:32383-34181batch: iter_time=2.344e-04, forward_time=0.202, loss_att=46.099, acc=0.960, loss=46.099, backward_time=0.297, grad_norm=113.841, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.119e-04, train_time=2.677 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 01:32:34,597 (trainer:732) INFO: 34epoch:train:34182-35980batch: iter_time=2.323e-04, forward_time=0.203, loss_att=46.701, acc=0.960, loss=46.701, backward_time=0.298, grad_norm=104.140, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.115e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 01:40:59,521 (trainer:338) INFO: 34epoch results: [train] iter_time=2.859e-04, forward_time=0.202, loss_att=46.173, acc=0.960, loss=46.173, backward_time=0.298, grad_norm=102.792, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.152e-04, train_time=2.710, time=6 hours, 46 minutes and 35.23 seconds, total_count=1223864, gpu_max_cached_mem_GB=30.176, [valid] loss_att=38.486, acc=0.966, cer=0.041, wer=0.129, loss=38.486, time=4 minutes and 35.07 seconds, total_count=408, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 31.42 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 01:41:03,349 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 01:41:03,361 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/22epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 01:41:03,362 (trainer:272) INFO: 35/60epoch started. Estimated time to finish: 1 week, 13 hours and 11 minutes + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 157) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 157) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 02:05:37,562 (trainer:732) INFO: 35epoch:train:1-1799batch: iter_time=8.551e-04, forward_time=0.203, loss_att=46.130, acc=0.960, loss=46.130, backward_time=0.298, grad_norm=106.070, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=5.111e-04, train_time=3.278 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 02:25:51,488 (trainer:732) INFO: 35epoch:train:1800-3598batch: iter_time=2.712e-04, forward_time=0.204, loss_att=45.005, acc=0.961, loss=45.005, backward_time=0.298, grad_norm=105.574, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=5.108e-04, train_time=2.699 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 02:45:57,843 (trainer:732) INFO: 35epoch:train:3599-5397batch: iter_time=2.438e-04, forward_time=0.203, loss_att=45.590, acc=0.961, loss=45.590, backward_time=0.298, grad_norm=108.896, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.104e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 03:06:09,547 (trainer:732) INFO: 35epoch:train:5398-7196batch: iter_time=2.541e-04, forward_time=0.203, loss_att=45.787, acc=0.961, loss=45.787, backward_time=0.299, grad_norm=110.099, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=5.100e-04, train_time=2.694 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 03:26:18,491 (trainer:732) INFO: 35epoch:train:7197-8995batch: iter_time=2.412e-04, forward_time=0.203, loss_att=46.150, acc=0.960, loss=46.150, backward_time=0.298, grad_norm=104.457, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.097e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 03:46:22,654 (trainer:732) INFO: 35epoch:train:8996-10794batch: iter_time=2.362e-04, forward_time=0.202, loss_att=46.339, acc=0.960, loss=46.339, backward_time=0.297, grad_norm=104.408, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.093e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 04:06:26,693 (trainer:732) INFO: 35epoch:train:10795-12593batch: iter_time=2.336e-04, forward_time=0.202, loss_att=46.367, acc=0.960, loss=46.367, backward_time=0.297, grad_norm=103.207, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.089e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 04:26:32,329 (trainer:732) INFO: 35epoch:train:12594-14392batch: iter_time=2.433e-04, forward_time=0.203, loss_att=46.515, acc=0.960, loss=46.515, backward_time=0.297, grad_norm=113.011, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.085e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 04:46:36,435 (trainer:732) INFO: 35epoch:train:14393-16191batch: iter_time=2.348e-04, forward_time=0.202, loss_att=45.733, acc=0.960, loss=45.733, backward_time=0.297, grad_norm=107.189, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.082e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 05:06:39,940 (trainer:732) INFO: 35epoch:train:16192-17990batch: iter_time=2.398e-04, forward_time=0.202, loss_att=46.329, acc=0.960, loss=46.329, backward_time=0.297, grad_norm=104.948, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.078e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 05:26:45,518 (trainer:732) INFO: 35epoch:train:17991-19789batch: iter_time=2.358e-04, forward_time=0.202, loss_att=45.510, acc=0.960, loss=45.510, backward_time=0.297, grad_norm=105.443, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.074e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 05:46:50,003 (trainer:732) INFO: 35epoch:train:19790-21588batch: iter_time=2.363e-04, forward_time=0.202, loss_att=45.935, acc=0.960, loss=45.935, backward_time=0.297, grad_norm=107.403, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.071e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 06:06:53,919 (trainer:732) INFO: 35epoch:train:21589-23387batch: iter_time=2.381e-04, forward_time=0.202, loss_att=45.913, acc=0.960, loss=45.913, backward_time=0.297, grad_norm=101.250, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.067e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 06:26:52,990 (trainer:732) INFO: 35epoch:train:23388-25186batch: iter_time=2.343e-04, forward_time=0.202, loss_att=45.788, acc=0.960, loss=45.788, backward_time=0.296, grad_norm=105.321, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.063e-04, train_time=2.666 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 06:47:00,195 (trainer:732) INFO: 35epoch:train:25187-26985batch: iter_time=2.316e-04, forward_time=0.203, loss_att=45.232, acc=0.961, loss=45.232, backward_time=0.298, grad_norm=100.568, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.060e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 07:07:04,138 (trainer:732) INFO: 35epoch:train:26986-28784batch: iter_time=2.332e-04, forward_time=0.202, loss_att=45.451, acc=0.960, loss=45.451, backward_time=0.297, grad_norm=102.240, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.056e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 07:27:08,875 (trainer:732) INFO: 35epoch:train:28785-30583batch: iter_time=2.363e-04, forward_time=0.202, loss_att=46.029, acc=0.960, loss=46.029, backward_time=0.298, grad_norm=100.717, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.052e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 07:47:15,579 (trainer:732) INFO: 35epoch:train:30584-32382batch: iter_time=2.329e-04, forward_time=0.203, loss_att=45.721, acc=0.960, loss=45.721, backward_time=0.298, grad_norm=96.416, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.049e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 08:07:18,775 (trainer:732) INFO: 35epoch:train:32383-34181batch: iter_time=2.347e-04, forward_time=0.202, loss_att=46.297, acc=0.960, loss=46.297, backward_time=0.297, grad_norm=104.608, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.045e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 08:27:25,058 (trainer:732) INFO: 35epoch:train:34182-35980batch: iter_time=2.321e-04, forward_time=0.202, loss_att=45.837, acc=0.960, loss=45.837, backward_time=0.298, grad_norm=100.684, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.042e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 08:35:37,090 (trainer:338) INFO: 35epoch results: [train] iter_time=2.699e-04, forward_time=0.202, loss_att=45.881, acc=0.960, loss=45.881, backward_time=0.298, grad_norm=104.612, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.076e-04, train_time=2.710, time=6 hours, 46 minutes and 39.31 seconds, total_count=1259860, gpu_max_cached_mem_GB=30.176, [valid] loss_att=38.266, acc=0.966, cer=0.041, wer=0.128, loss=38.266, time=4 minutes and 29.17 seconds, total_count=420, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 25.25 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 08:35:41,017 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 08:35:41,028 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/25epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 08:35:41,029 (trainer:272) INFO: 36/60epoch started. Estimated time to finish: 1 week, 6 hours and 10 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 09:00:06,865 (trainer:732) INFO: 36epoch:train:1-1799batch: iter_time=8.908e-04, forward_time=0.203, loss_att=45.093, acc=0.961, loss=45.093, backward_time=0.298, grad_norm=107.034, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.038e-04, train_time=3.260 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 09:20:15,220 (trainer:732) INFO: 36epoch:train:1800-3598batch: iter_time=2.331e-04, forward_time=0.203, loss_att=45.225, acc=0.961, loss=45.225, backward_time=0.298, grad_norm=106.855, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.034e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 09:40:20,637 (trainer:732) INFO: 36epoch:train:3599-5397batch: iter_time=2.316e-04, forward_time=0.202, loss_att=45.671, acc=0.960, loss=45.671, backward_time=0.298, grad_norm=99.884, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.031e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 10:00:21,629 (trainer:732) INFO: 36epoch:train:5398-7196batch: iter_time=2.290e-04, forward_time=0.202, loss_att=45.239, acc=0.960, loss=45.239, backward_time=0.297, grad_norm=101.667, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.027e-04, train_time=2.670 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 10:20:25,621 (trainer:732) INFO: 36epoch:train:7197-8995batch: iter_time=2.304e-04, forward_time=0.202, loss_att=45.567, acc=0.960, loss=45.567, backward_time=0.297, grad_norm=102.261, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.024e-04, train_time=2.677 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 157) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 157) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 10:40:27,088 (trainer:732) INFO: 36epoch:train:8996-10794batch: iter_time=2.265e-04, forward_time=0.202, loss_att=44.701, acc=0.961, loss=44.701, backward_time=0.297, grad_norm=107.742, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.020e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 11:00:32,943 (trainer:732) INFO: 36epoch:train:10795-12593batch: iter_time=2.327e-04, forward_time=0.203, loss_att=45.185, acc=0.960, loss=45.185, backward_time=0.298, grad_norm=105.046, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.017e-04, train_time=2.680 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 11:20:39,195 (trainer:732) INFO: 36epoch:train:12594-14392batch: iter_time=2.328e-04, forward_time=0.203, loss_att=46.078, acc=0.960, loss=46.078, backward_time=0.298, grad_norm=103.209, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.013e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 11:40:41,584 (trainer:732) INFO: 36epoch:train:14393-16191batch: iter_time=2.311e-04, forward_time=0.202, loss_att=45.620, acc=0.960, loss=45.620, backward_time=0.297, grad_norm=106.375, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.009e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 12:00:47,014 (trainer:732) INFO: 36epoch:train:16192-17990batch: iter_time=2.259e-04, forward_time=0.202, loss_att=45.445, acc=0.960, loss=45.445, backward_time=0.298, grad_norm=104.297, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=5.006e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 12:20:53,621 (trainer:732) INFO: 36epoch:train:17991-19789batch: iter_time=2.263e-04, forward_time=0.202, loss_att=46.326, acc=0.960, loss=46.326, backward_time=0.298, grad_norm=110.636, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=5.002e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 12:41:00,469 (trainer:732) INFO: 36epoch:train:19790-21588batch: iter_time=2.277e-04, forward_time=0.202, loss_att=45.341, acc=0.961, loss=45.341, backward_time=0.298, grad_norm=107.835, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.999e-04, train_time=2.683 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 13:01:04,572 (trainer:732) INFO: 36epoch:train:21589-23387batch: iter_time=2.294e-04, forward_time=0.202, loss_att=45.813, acc=0.960, loss=45.813, backward_time=0.298, grad_norm=103.899, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.995e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 13:21:09,054 (trainer:732) INFO: 36epoch:train:23388-25186batch: iter_time=2.287e-04, forward_time=0.202, loss_att=45.171, acc=0.961, loss=45.171, backward_time=0.297, grad_norm=104.193, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.992e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 13:41:19,447 (trainer:732) INFO: 36epoch:train:25187-26985batch: iter_time=2.254e-04, forward_time=0.203, loss_att=46.072, acc=0.961, loss=46.072, backward_time=0.299, grad_norm=105.605, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.988e-04, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 14:01:23,762 (trainer:732) INFO: 36epoch:train:26986-28784batch: iter_time=2.299e-04, forward_time=0.202, loss_att=45.450, acc=0.960, loss=45.450, backward_time=0.297, grad_norm=103.352, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.985e-04, train_time=2.677 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 157) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 157) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 14:21:28,293 (trainer:732) INFO: 36epoch:train:28785-30583batch: iter_time=2.248e-04, forward_time=0.202, loss_att=46.290, acc=0.960, loss=46.290, backward_time=0.298, grad_norm=105.945, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.981e-04, train_time=2.678 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<27971> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 157) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 14:41:32,683 (trainer:732) INFO: 36epoch:train:30584-32382batch: iter_time=2.313e-04, forward_time=0.202, loss_att=45.934, acc=0.960, loss=45.934, backward_time=0.297, grad_norm=104.447, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.978e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 15:01:37,141 (trainer:732) INFO: 36epoch:train:32383-34181batch: iter_time=2.285e-04, forward_time=0.202, loss_att=45.336, acc=0.960, loss=45.336, backward_time=0.297, grad_norm=101.453, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.975e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 15:21:43,280 (trainer:732) INFO: 36epoch:train:34182-35980batch: iter_time=2.276e-04, forward_time=0.203, loss_att=45.227, acc=0.960, loss=45.227, backward_time=0.298, grad_norm=100.192, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.971e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 15:29:47,440 (trainer:338) INFO: 36epoch results: [train] iter_time=2.621e-04, forward_time=0.202, loss_att=45.538, acc=0.960, loss=45.538, backward_time=0.298, grad_norm=104.595, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=5.004e-04, train_time=2.708, time=6 hours, 46 minutes and 20.01 seconds, total_count=1295856, gpu_max_cached_mem_GB=30.176, [valid] loss_att=37.577, acc=0.966, cer=0.040, wer=0.128, loss=37.577, time=4 minutes and 20.31 seconds, total_count=432, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 26.1 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 15:29:51,104 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 15:29:51,134 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/28epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 15:29:51,135 (trainer:272) INFO: 37/60epoch started. Estimated time to finish: 6 days, 23 hours and 9 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 15:54:24,807 (trainer:732) INFO: 37epoch:train:1-1799batch: iter_time=7.232e-04, forward_time=0.203, loss_att=44.190, acc=0.961, loss=44.190, backward_time=0.298, grad_norm=111.144, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.968e-04, train_time=3.278 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<25594> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 16:14:28,191 (trainer:732) INFO: 37epoch:train:1800-3598batch: iter_time=2.460e-04, forward_time=0.202, loss_att=44.971, acc=0.961, loss=44.971, backward_time=0.297, grad_norm=101.982, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.964e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 16:34:35,319 (trainer:732) INFO: 37epoch:train:3599-5397batch: iter_time=2.431e-04, forward_time=0.203, loss_att=44.742, acc=0.961, loss=44.742, backward_time=0.298, grad_norm=108.836, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.961e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 16:54:38,548 (trainer:732) INFO: 37epoch:train:5398-7196batch: iter_time=2.431e-04, forward_time=0.202, loss_att=44.762, acc=0.961, loss=44.762, backward_time=0.297, grad_norm=111.904, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.957e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 17:14:44,960 (trainer:732) INFO: 37epoch:train:7197-8995batch: iter_time=2.404e-04, forward_time=0.203, loss_att=44.958, acc=0.961, loss=44.958, backward_time=0.298, grad_norm=103.408, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.954e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 17:34:48,903 (trainer:732) INFO: 37epoch:train:8996-10794batch: iter_time=2.378e-04, forward_time=0.202, loss_att=45.469, acc=0.961, loss=45.469, backward_time=0.297, grad_norm=106.774, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.950e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 17:54:54,812 (trainer:732) INFO: 37epoch:train:10795-12593batch: iter_time=2.386e-04, forward_time=0.202, loss_att=45.302, acc=0.961, loss=45.302, backward_time=0.297, grad_norm=107.505, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.947e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 18:14:58,424 (trainer:732) INFO: 37epoch:train:12594-14392batch: iter_time=2.427e-04, forward_time=0.202, loss_att=44.611, acc=0.961, loss=44.611, backward_time=0.297, grad_norm=104.064, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.944e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 18:35:03,897 (trainer:732) INFO: 37epoch:train:14393-16191batch: iter_time=2.422e-04, forward_time=0.203, loss_att=45.315, acc=0.961, loss=45.315, backward_time=0.298, grad_norm=104.151, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.940e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 18:55:09,382 (trainer:732) INFO: 37epoch:train:16192-17990batch: iter_time=2.406e-04, forward_time=0.203, loss_att=45.199, acc=0.961, loss=45.199, backward_time=0.298, grad_norm=102.132, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.937e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 19:15:14,552 (trainer:732) INFO: 37epoch:train:17991-19789batch: iter_time=2.441e-04, forward_time=0.202, loss_att=45.335, acc=0.961, loss=45.335, backward_time=0.297, grad_norm=105.477, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.933e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 19:35:22,039 (trainer:732) INFO: 37epoch:train:19790-21588batch: iter_time=2.421e-04, forward_time=0.203, loss_att=46.156, acc=0.960, loss=46.156, backward_time=0.298, grad_norm=101.172, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.930e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 19:55:24,499 (trainer:732) INFO: 37epoch:train:21589-23387batch: iter_time=2.429e-04, forward_time=0.202, loss_att=45.729, acc=0.960, loss=45.729, backward_time=0.297, grad_norm=105.211, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.927e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 20:15:33,357 (trainer:732) INFO: 37epoch:train:23388-25186batch: iter_time=2.438e-04, forward_time=0.203, loss_att=45.360, acc=0.961, loss=45.360, backward_time=0.298, grad_norm=101.525, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.923e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 20:35:39,354 (trainer:732) INFO: 37epoch:train:25187-26985batch: iter_time=2.452e-04, forward_time=0.203, loss_att=45.594, acc=0.961, loss=45.594, backward_time=0.298, grad_norm=103.231, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.920e-04, train_time=2.681 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 158) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 158) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 20:55:49,156 (trainer:732) INFO: 37epoch:train:26986-28784batch: iter_time=2.417e-04, forward_time=0.203, loss_att=45.427, acc=0.961, loss=45.427, backward_time=0.299, grad_norm=100.982, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.917e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 21:15:52,052 (trainer:732) INFO: 37epoch:train:28785-30583batch: iter_time=2.397e-04, forward_time=0.202, loss_att=45.659, acc=0.960, loss=45.659, backward_time=0.297, grad_norm=109.589, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.913e-04, train_time=2.675 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 21:35:51,974 (trainer:732) INFO: 37epoch:train:30584-32382batch: iter_time=2.370e-04, forward_time=0.202, loss_att=45.038, acc=0.960, loss=45.038, backward_time=0.296, grad_norm=106.611, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.910e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 21:55:56,070 (trainer:732) INFO: 37epoch:train:32383-34181batch: iter_time=2.389e-04, forward_time=0.202, loss_att=45.624, acc=0.960, loss=45.624, backward_time=0.297, grad_norm=103.999, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.907e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 22:16:02,306 (trainer:732) INFO: 37epoch:train:34182-35980batch: iter_time=2.424e-04, forward_time=0.203, loss_att=45.459, acc=0.961, loss=45.459, backward_time=0.298, grad_norm=104.039, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.903e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 22:24:27,373 (trainer:338) INFO: 37epoch results: [train] iter_time=2.657e-04, forward_time=0.202, loss_att=45.245, acc=0.961, loss=45.245, backward_time=0.298, grad_norm=105.201, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.935e-04, train_time=2.709, time=6 hours, 46 minutes and 28.92 seconds, total_count=1331852, gpu_max_cached_mem_GB=30.176, [valid] loss_att=36.901, acc=0.967, cer=0.042, wer=0.130, loss=36.901, time=4 minutes and 35.98 seconds, total_count=444, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 31.34 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 22:24:31,038 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 22:24:31,051 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/26epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 22:24:31,051 (trainer:272) INFO: 38/60epoch started. Estimated time to finish: 6 days, 16 hours and 9 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 22:48:59,877 (trainer:732) INFO: 38epoch:train:1-1799batch: iter_time=9.479e-04, forward_time=0.202, loss_att=44.401, acc=0.961, loss=44.401, backward_time=0.297, grad_norm=103.723, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.900e-04, train_time=3.267 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 23:09:11,765 (trainer:732) INFO: 38epoch:train:1800-3598batch: iter_time=2.645e-04, forward_time=0.203, loss_att=44.712, acc=0.961, loss=44.712, backward_time=0.299, grad_norm=114.151, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.897e-04, train_time=2.694 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 23:29:16,548 (trainer:732) INFO: 38epoch:train:3599-5397batch: iter_time=2.649e-04, forward_time=0.202, loss_att=45.076, acc=0.961, loss=45.076, backward_time=0.297, grad_norm=104.312, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.893e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-13 23:49:20,940 (trainer:732) INFO: 38epoch:train:5398-7196batch: iter_time=2.636e-04, forward_time=0.203, loss_att=45.160, acc=0.961, loss=45.160, backward_time=0.297, grad_norm=108.382, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=4.890e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 00:09:25,757 (trainer:732) INFO: 38epoch:train:7197-8995batch: iter_time=2.561e-04, forward_time=0.202, loss_att=44.380, acc=0.961, loss=44.380, backward_time=0.297, grad_norm=102.215, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.887e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 00:29:30,504 (trainer:732) INFO: 38epoch:train:8996-10794batch: iter_time=2.603e-04, forward_time=0.202, loss_att=44.774, acc=0.961, loss=44.774, backward_time=0.297, grad_norm=104.513, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=4.884e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 00:49:36,223 (trainer:732) INFO: 38epoch:train:10795-12593batch: iter_time=2.649e-04, forward_time=0.202, loss_att=45.082, acc=0.961, loss=45.082, backward_time=0.297, grad_norm=104.228, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.880e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 01:09:43,032 (trainer:732) INFO: 38epoch:train:12594-14392batch: iter_time=2.646e-04, forward_time=0.203, loss_att=45.549, acc=0.961, loss=45.549, backward_time=0.298, grad_norm=104.794, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.877e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 01:29:51,459 (trainer:732) INFO: 38epoch:train:14393-16191batch: iter_time=2.572e-04, forward_time=0.203, loss_att=44.614, acc=0.961, loss=44.614, backward_time=0.298, grad_norm=108.834, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.874e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 01:49:55,168 (trainer:732) INFO: 38epoch:train:16192-17990batch: iter_time=2.635e-04, forward_time=0.202, loss_att=45.211, acc=0.961, loss=45.211, backward_time=0.297, grad_norm=110.234, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.871e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 02:09:57,148 (trainer:732) INFO: 38epoch:train:17991-19789batch: iter_time=2.605e-04, forward_time=0.202, loss_att=44.408, acc=0.961, loss=44.408, backward_time=0.296, grad_norm=110.680, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.867e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 02:30:01,812 (trainer:732) INFO: 38epoch:train:19790-21588batch: iter_time=2.606e-04, forward_time=0.202, loss_att=44.674, acc=0.961, loss=44.674, backward_time=0.297, grad_norm=107.611, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.864e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 02:50:06,105 (trainer:732) INFO: 38epoch:train:21589-23387batch: iter_time=2.626e-04, forward_time=0.202, loss_att=44.906, acc=0.961, loss=44.906, backward_time=0.297, grad_norm=110.948, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.861e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 03:10:09,729 (trainer:732) INFO: 38epoch:train:23388-25186batch: iter_time=2.631e-04, forward_time=0.202, loss_att=45.711, acc=0.960, loss=45.711, backward_time=0.297, grad_norm=108.829, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.858e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 03:30:15,799 (trainer:732) INFO: 38epoch:train:25187-26985batch: iter_time=2.596e-04, forward_time=0.203, loss_att=44.957, acc=0.961, loss=44.957, backward_time=0.298, grad_norm=101.024, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.854e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 03:50:19,069 (trainer:732) INFO: 38epoch:train:26986-28784batch: iter_time=2.615e-04, forward_time=0.202, loss_att=44.802, acc=0.961, loss=44.802, backward_time=0.297, grad_norm=100.945, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.851e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 04:10:23,870 (trainer:732) INFO: 38epoch:train:28785-30583batch: iter_time=2.594e-04, forward_time=0.202, loss_att=45.054, acc=0.961, loss=45.054, backward_time=0.298, grad_norm=109.472, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.848e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 04:30:29,569 (trainer:732) INFO: 38epoch:train:30584-32382batch: iter_time=2.562e-04, forward_time=0.202, loss_att=44.685, acc=0.961, loss=44.685, backward_time=0.298, grad_norm=102.476, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.845e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 04:50:37,751 (trainer:732) INFO: 38epoch:train:32383-34181batch: iter_time=2.540e-04, forward_time=0.203, loss_att=45.653, acc=0.961, loss=45.653, backward_time=0.298, grad_norm=113.995, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.842e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 05:10:46,042 (trainer:732) INFO: 38epoch:train:34182-35980batch: iter_time=2.547e-04, forward_time=0.203, loss_att=45.343, acc=0.961, loss=45.343, backward_time=0.298, grad_norm=110.402, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.838e-04, train_time=2.686 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 141) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 05:19:15,044 (trainer:338) INFO: 38epoch results: [train] iter_time=2.949e-04, forward_time=0.202, loss_att=44.956, acc=0.961, loss=44.956, backward_time=0.297, grad_norm=107.089, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.869e-04, train_time=2.709, time=6 hours, 46 minutes and 33.13 seconds, total_count=1367848, gpu_max_cached_mem_GB=30.176, [valid] loss_att=37.676, acc=0.967, cer=0.039, wer=0.125, loss=37.676, time=4 minutes and 36.79 seconds, total_count=456, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 34.07 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 05:19:19,186 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 05:19:19,218 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/27epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 05:19:19,219 (trainer:272) INFO: 39/60epoch started. Estimated time to finish: 6 days, 9 hours and 10 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 05:43:55,078 (trainer:732) INFO: 39epoch:train:1-1799batch: iter_time=0.001, forward_time=0.203, loss_att=43.672, acc=0.962, loss=43.672, backward_time=0.298, grad_norm=111.102, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.835e-04, train_time=3.282 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 06:03:59,431 (trainer:732) INFO: 39epoch:train:1800-3598batch: iter_time=2.572e-04, forward_time=0.202, loss_att=44.205, acc=0.961, loss=44.205, backward_time=0.297, grad_norm=103.266, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.832e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 06:24:02,216 (trainer:732) INFO: 39epoch:train:3599-5397batch: iter_time=2.508e-04, forward_time=0.202, loss_att=44.116, acc=0.961, loss=44.116, backward_time=0.297, grad_norm=100.679, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.829e-04, train_time=2.674 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 06:44:04,773 (trainer:732) INFO: 39epoch:train:5398-7196batch: iter_time=2.486e-04, forward_time=0.203, loss_att=44.640, acc=0.961, loss=44.640, backward_time=0.297, grad_norm=103.325, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.826e-04, train_time=2.673 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 07:04:07,343 (trainer:732) INFO: 39epoch:train:7197-8995batch: iter_time=2.483e-04, forward_time=0.202, loss_att=44.902, acc=0.961, loss=44.902, backward_time=0.297, grad_norm=101.833, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.823e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 07:24:15,873 (trainer:732) INFO: 39epoch:train:8996-10794batch: iter_time=2.475e-04, forward_time=0.203, loss_att=44.895, acc=0.961, loss=44.895, backward_time=0.298, grad_norm=96.912, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.819e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 07:44:21,006 (trainer:732) INFO: 39epoch:train:10795-12593batch: iter_time=2.474e-04, forward_time=0.203, loss_att=45.165, acc=0.961, loss=45.165, backward_time=0.298, grad_norm=106.259, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.816e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 08:04:26,118 (trainer:732) INFO: 39epoch:train:12594-14392batch: iter_time=2.462e-04, forward_time=0.202, loss_att=44.430, acc=0.961, loss=44.430, backward_time=0.297, grad_norm=103.573, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.813e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 08:24:33,006 (trainer:732) INFO: 39epoch:train:14393-16191batch: iter_time=2.449e-04, forward_time=0.203, loss_att=44.933, acc=0.961, loss=44.933, backward_time=0.298, grad_norm=107.273, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.810e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 08:44:38,419 (trainer:732) INFO: 39epoch:train:16192-17990batch: iter_time=2.481e-04, forward_time=0.203, loss_att=44.570, acc=0.961, loss=44.570, backward_time=0.298, grad_norm=99.238, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.807e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 09:04:46,939 (trainer:732) INFO: 39epoch:train:17991-19789batch: iter_time=2.477e-04, forward_time=0.203, loss_att=45.250, acc=0.961, loss=45.250, backward_time=0.298, grad_norm=104.116, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.804e-04, train_time=2.687 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<64391> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 158) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<62402> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 158) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 158) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 158) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<28795> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 158) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 158) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 158) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 158) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 158) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<54754> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<40377> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<40361> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<62053> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<16016> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 10, fd 160) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 11, fd 162) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 11, fd 162) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 11, fd 162) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 11, fd 162) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 11, fd 162) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<54823> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<22261> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<53563> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<55091> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<22523> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<53831> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<29779> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<49778> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<60457> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<23514> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<63862> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<61775> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<64597> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<16771> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<16793> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<64576> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<47243> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<47429> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<44899> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<42188> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<42252> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<60354> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<26345> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.220<40410> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 148) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 148) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 148) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 148) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 09:24:49,544 (trainer:732) INFO: 39epoch:train:19790-21588batch: iter_time=2.469e-04, forward_time=0.202, loss_att=44.666, acc=0.961, loss=44.666, backward_time=0.297, grad_norm=101.645, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.801e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 09:44:51,966 (trainer:732) INFO: 39epoch:train:21589-23387batch: iter_time=2.440e-04, forward_time=0.202, loss_att=44.596, acc=0.961, loss=44.596, backward_time=0.297, grad_norm=102.254, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.797e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 10:04:59,223 (trainer:732) INFO: 39epoch:train:23388-25186batch: iter_time=2.463e-04, forward_time=0.203, loss_att=44.053, acc=0.962, loss=44.053, backward_time=0.298, grad_norm=105.228, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.794e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 10:25:02,415 (trainer:732) INFO: 39epoch:train:25187-26985batch: iter_time=2.431e-04, forward_time=0.202, loss_att=44.635, acc=0.961, loss=44.635, backward_time=0.297, grad_norm=103.501, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=4.791e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 10:45:12,656 (trainer:732) INFO: 39epoch:train:26986-28784batch: iter_time=2.423e-04, forward_time=0.203, loss_att=45.865, acc=0.961, loss=45.865, backward_time=0.299, grad_norm=106.788, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.788e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 11:05:16,186 (trainer:732) INFO: 39epoch:train:28785-30583batch: iter_time=2.459e-04, forward_time=0.202, loss_att=44.703, acc=0.961, loss=44.703, backward_time=0.297, grad_norm=102.980, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.785e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 11:25:20,099 (trainer:732) INFO: 39epoch:train:30584-32382batch: iter_time=2.464e-04, forward_time=0.202, loss_att=44.419, acc=0.961, loss=44.419, backward_time=0.297, grad_norm=109.117, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.782e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 11:45:28,970 (trainer:732) INFO: 39epoch:train:32383-34181batch: iter_time=2.409e-04, forward_time=0.203, loss_att=45.123, acc=0.961, loss=45.123, backward_time=0.299, grad_norm=105.176, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.779e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 12:05:32,520 (trainer:732) INFO: 39epoch:train:34182-35980batch: iter_time=2.434e-04, forward_time=0.202, loss_att=44.626, acc=0.961, loss=44.626, backward_time=0.297, grad_norm=104.545, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.776e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 12:13:39,133 (trainer:338) INFO: 39epoch results: [train] iter_time=2.990e-04, forward_time=0.203, loss_att=44.669, acc=0.961, loss=44.669, backward_time=0.298, grad_norm=103.929, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.805e-04, train_time=2.709, time=6 hours, 46 minutes and 30.47 seconds, total_count=1403844, gpu_max_cached_mem_GB=30.176, [valid] loss_att=36.114, acc=0.968, cer=0.040, wer=0.126, loss=36.114, time=4 minutes and 17.64 seconds, total_count=468, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 31.8 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 12:13:42,966 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 12:13:42,978 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/29epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 12:13:42,978 (trainer:272) INFO: 40/60epoch started. Estimated time to finish: 6 days, 2 hours and 10 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 12:38:16,518 (trainer:732) INFO: 40epoch:train:1-1799batch: iter_time=0.001, forward_time=0.203, loss_att=43.985, acc=0.962, loss=43.985, backward_time=0.298, grad_norm=106.035, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.773e-04, train_time=3.277 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 12:58:23,662 (trainer:732) INFO: 40epoch:train:1800-3598batch: iter_time=2.428e-04, forward_time=0.203, loss_att=44.723, acc=0.961, loss=44.723, backward_time=0.298, grad_norm=106.917, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.770e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 13:18:29,730 (trainer:732) INFO: 40epoch:train:3599-5397batch: iter_time=2.397e-04, forward_time=0.203, loss_att=44.164, acc=0.962, loss=44.164, backward_time=0.298, grad_norm=106.674, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.767e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 13:38:34,908 (trainer:732) INFO: 40epoch:train:5398-7196batch: iter_time=2.384e-04, forward_time=0.202, loss_att=43.482, acc=0.962, loss=43.482, backward_time=0.297, grad_norm=107.089, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.764e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 13:58:38,008 (trainer:732) INFO: 40epoch:train:7197-8995batch: iter_time=2.385e-04, forward_time=0.202, loss_att=44.638, acc=0.961, loss=44.638, backward_time=0.297, grad_norm=108.012, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.761e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 14:18:43,035 (trainer:732) INFO: 40epoch:train:8996-10794batch: iter_time=2.349e-04, forward_time=0.202, loss_att=44.186, acc=0.961, loss=44.186, backward_time=0.297, grad_norm=102.539, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.758e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 14:38:47,531 (trainer:732) INFO: 40epoch:train:10795-12593batch: iter_time=2.388e-04, forward_time=0.202, loss_att=44.009, acc=0.962, loss=44.009, backward_time=0.297, grad_norm=104.831, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.755e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 14:58:50,096 (trainer:732) INFO: 40epoch:train:12594-14392batch: iter_time=2.352e-04, forward_time=0.202, loss_att=44.329, acc=0.961, loss=44.329, backward_time=0.297, grad_norm=103.200, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.752e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 15:18:53,175 (trainer:732) INFO: 40epoch:train:14393-16191batch: iter_time=2.367e-04, forward_time=0.202, loss_att=43.965, acc=0.961, loss=43.965, backward_time=0.297, grad_norm=101.727, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.749e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 15:39:01,206 (trainer:732) INFO: 40epoch:train:16192-17990batch: iter_time=2.371e-04, forward_time=0.203, loss_att=44.240, acc=0.962, loss=44.240, backward_time=0.298, grad_norm=95.925, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.746e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 15:59:07,062 (trainer:732) INFO: 40epoch:train:17991-19789batch: iter_time=2.387e-04, forward_time=0.202, loss_att=44.986, acc=0.961, loss=44.986, backward_time=0.298, grad_norm=113.055, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.743e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 16:19:13,452 (trainer:732) INFO: 40epoch:train:19790-21588batch: iter_time=2.358e-04, forward_time=0.203, loss_att=44.055, acc=0.962, loss=44.055, backward_time=0.298, grad_norm=105.442, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.740e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 16:39:15,162 (trainer:732) INFO: 40epoch:train:21589-23387batch: iter_time=2.367e-04, forward_time=0.202, loss_att=44.616, acc=0.961, loss=44.616, backward_time=0.297, grad_norm=103.716, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.737e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 16:59:24,100 (trainer:732) INFO: 40epoch:train:23388-25186batch: iter_time=2.372e-04, forward_time=0.203, loss_att=45.325, acc=0.961, loss=45.325, backward_time=0.299, grad_norm=108.897, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.734e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 17:19:28,554 (trainer:732) INFO: 40epoch:train:25187-26985batch: iter_time=2.441e-04, forward_time=0.202, loss_att=44.246, acc=0.961, loss=44.246, backward_time=0.297, grad_norm=108.623, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.731e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 17:39:32,598 (trainer:732) INFO: 40epoch:train:26986-28784batch: iter_time=2.348e-04, forward_time=0.202, loss_att=44.496, acc=0.961, loss=44.496, backward_time=0.297, grad_norm=104.736, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.728e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 17:59:37,172 (trainer:732) INFO: 40epoch:train:28785-30583batch: iter_time=2.381e-04, forward_time=0.203, loss_att=44.361, acc=0.962, loss=44.361, backward_time=0.298, grad_norm=105.917, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.725e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 18:19:41,760 (trainer:732) INFO: 40epoch:train:30584-32382batch: iter_time=2.345e-04, forward_time=0.202, loss_att=44.782, acc=0.961, loss=44.782, backward_time=0.298, grad_norm=106.161, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.722e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 18:39:51,005 (trainer:732) INFO: 40epoch:train:32383-34181batch: iter_time=2.365e-04, forward_time=0.203, loss_att=44.489, acc=0.962, loss=44.489, backward_time=0.299, grad_norm=103.004, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=4.719e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 18:59:53,047 (trainer:732) INFO: 40epoch:train:34182-35980batch: iter_time=2.349e-04, forward_time=0.202, loss_att=44.884, acc=0.961, loss=44.884, backward_time=0.296, grad_norm=106.536, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.716e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 19:08:00,552 (trainer:338) INFO: 40epoch results: [train] iter_time=2.760e-04, forward_time=0.202, loss_att=44.398, acc=0.961, loss=44.398, backward_time=0.298, grad_norm=105.474, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.744e-04, train_time=2.709, time=6 hours, 46 minutes and 27.58 seconds, total_count=1439840, gpu_max_cached_mem_GB=30.176, [valid] loss_att=36.820, acc=0.967, cer=0.039, wer=0.124, loss=36.820, time=4 minutes and 20.92 seconds, total_count=480, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 29.07 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 19:08:04,376 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 19:08:04,387 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/31epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 19:08:04,388 (trainer:272) INFO: 41/60epoch started. Estimated time to finish: 5 days, 19 hours and 11 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 19:32:33,784 (trainer:732) INFO: 41epoch:train:1-1799batch: iter_time=6.988e-04, forward_time=0.203, loss_att=44.198, acc=0.962, loss=44.198, backward_time=0.298, grad_norm=99.928, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.713e-04, train_time=3.268 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 19:52:39,226 (trainer:732) INFO: 41epoch:train:1800-3598batch: iter_time=2.623e-04, forward_time=0.203, loss_att=43.998, acc=0.961, loss=43.998, backward_time=0.298, grad_norm=107.334, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.710e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 20:12:44,560 (trainer:732) INFO: 41epoch:train:3599-5397batch: iter_time=2.599e-04, forward_time=0.202, loss_att=43.760, acc=0.962, loss=43.760, backward_time=0.297, grad_norm=105.539, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.707e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 20:32:50,352 (trainer:732) INFO: 41epoch:train:5398-7196batch: iter_time=2.649e-04, forward_time=0.203, loss_att=43.752, acc=0.962, loss=43.752, backward_time=0.298, grad_norm=116.539, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.704e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 20:52:53,744 (trainer:732) INFO: 41epoch:train:7197-8995batch: iter_time=2.595e-04, forward_time=0.202, loss_att=43.606, acc=0.962, loss=43.606, backward_time=0.297, grad_norm=107.462, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.701e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 21:12:59,239 (trainer:732) INFO: 41epoch:train:8996-10794batch: iter_time=2.563e-04, forward_time=0.202, loss_att=44.394, acc=0.961, loss=44.394, backward_time=0.297, grad_norm=110.538, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.698e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 21:33:08,364 (trainer:732) INFO: 41epoch:train:10795-12593batch: iter_time=2.524e-04, forward_time=0.203, loss_att=44.385, acc=0.962, loss=44.385, backward_time=0.299, grad_norm=112.111, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.695e-04, train_time=2.688 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 21:53:14,313 (trainer:732) INFO: 41epoch:train:12594-14392batch: iter_time=2.583e-04, forward_time=0.203, loss_att=44.024, acc=0.962, loss=44.024, backward_time=0.298, grad_norm=106.792, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.692e-04, train_time=2.681 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 11, fd 161) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 11, fd 161) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 22:13:20,098 (trainer:732) INFO: 41epoch:train:14393-16191batch: iter_time=2.602e-04, forward_time=0.203, loss_att=43.924, acc=0.962, loss=43.924, backward_time=0.298, grad_norm=99.838, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.689e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 22:33:27,944 (trainer:732) INFO: 41epoch:train:16192-17990batch: iter_time=2.659e-04, forward_time=0.203, loss_att=44.222, acc=0.962, loss=44.222, backward_time=0.298, grad_norm=111.737, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.687e-04, train_time=2.685 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 22:53:32,416 (trainer:732) INFO: 41epoch:train:17991-19789batch: iter_time=2.647e-04, forward_time=0.202, loss_att=43.846, acc=0.962, loss=43.846, backward_time=0.297, grad_norm=105.861, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.684e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 23:13:37,146 (trainer:732) INFO: 41epoch:train:19790-21588batch: iter_time=2.635e-04, forward_time=0.202, loss_att=44.507, acc=0.961, loss=44.507, backward_time=0.297, grad_norm=105.170, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.681e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 23:33:38,713 (trainer:732) INFO: 41epoch:train:21589-23387batch: iter_time=2.654e-04, forward_time=0.202, loss_att=43.860, acc=0.961, loss=43.860, backward_time=0.296, grad_norm=107.080, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.678e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-14 23:53:43,478 (trainer:732) INFO: 41epoch:train:23388-25186batch: iter_time=2.626e-04, forward_time=0.202, loss_att=44.467, acc=0.961, loss=44.467, backward_time=0.297, grad_norm=105.777, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.675e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 00:13:49,197 (trainer:732) INFO: 41epoch:train:25187-26985batch: iter_time=2.625e-04, forward_time=0.202, loss_att=44.173, acc=0.962, loss=44.173, backward_time=0.297, grad_norm=107.477, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=4.672e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 00:33:55,989 (trainer:732) INFO: 41epoch:train:26986-28784batch: iter_time=2.692e-04, forward_time=0.203, loss_att=44.805, acc=0.961, loss=44.805, backward_time=0.298, grad_norm=103.569, clip=100.000, loss_scale=1.000, optim_step_time=0.058, optim0_lr0=4.669e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 00:54:00,643 (trainer:732) INFO: 41epoch:train:28785-30583batch: iter_time=2.652e-04, forward_time=0.202, loss_att=44.366, acc=0.962, loss=44.366, backward_time=0.297, grad_norm=109.843, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.666e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 01:14:05,995 (trainer:732) INFO: 41epoch:train:30584-32382batch: iter_time=2.698e-04, forward_time=0.202, loss_att=44.589, acc=0.961, loss=44.589, backward_time=0.297, grad_norm=109.189, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.664e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 01:34:11,532 (trainer:732) INFO: 41epoch:train:32383-34181batch: iter_time=2.623e-04, forward_time=0.202, loss_att=44.072, acc=0.962, loss=44.072, backward_time=0.298, grad_norm=103.660, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.661e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 01:54:17,362 (trainer:732) INFO: 41epoch:train:34182-35980batch: iter_time=2.667e-04, forward_time=0.202, loss_att=44.293, acc=0.961, loss=44.293, backward_time=0.297, grad_norm=104.855, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.658e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 02:02:51,931 (trainer:338) INFO: 41epoch results: [train] iter_time=2.845e-04, forward_time=0.202, loss_att=44.159, acc=0.962, loss=44.159, backward_time=0.298, grad_norm=107.034, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.685e-04, train_time=2.709, time=6 hours, 46 minutes and 31.11 seconds, total_count=1475836, gpu_max_cached_mem_GB=30.176, [valid] loss_att=37.156, acc=0.967, cer=0.039, wer=0.124, loss=37.156, time=4 minutes and 41.77 seconds, total_count=492, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 34.66 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 02:02:56,241 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 02:02:56,296 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/34epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 02:02:56,296 (trainer:272) INFO: 42/60epoch started. Estimated time to finish: 5 days, 12 hours and 12 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 02:27:22,916 (trainer:732) INFO: 42epoch:train:1-1799batch: iter_time=0.001, forward_time=0.202, loss_att=43.907, acc=0.962, loss=43.907, backward_time=0.298, grad_norm=110.937, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.655e-04, train_time=3.262 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 02:47:27,011 (trainer:732) INFO: 42epoch:train:1800-3598batch: iter_time=2.480e-04, forward_time=0.202, loss_att=44.161, acc=0.961, loss=44.161, backward_time=0.297, grad_norm=103.838, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.652e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 03:07:32,528 (trainer:732) INFO: 42epoch:train:3599-5397batch: iter_time=2.475e-04, forward_time=0.202, loss_att=43.565, acc=0.962, loss=43.565, backward_time=0.297, grad_norm=105.685, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.649e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 03:27:41,833 (trainer:732) INFO: 42epoch:train:5398-7196batch: iter_time=2.475e-04, forward_time=0.203, loss_att=44.484, acc=0.962, loss=44.484, backward_time=0.299, grad_norm=115.436, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.647e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 03:47:43,538 (trainer:732) INFO: 42epoch:train:7197-8995batch: iter_time=2.479e-04, forward_time=0.202, loss_att=43.247, acc=0.962, loss=43.247, backward_time=0.296, grad_norm=104.681, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.644e-04, train_time=2.672 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 04:07:48,674 (trainer:732) INFO: 42epoch:train:8996-10794batch: iter_time=2.406e-04, forward_time=0.202, loss_att=43.625, acc=0.962, loss=43.625, backward_time=0.298, grad_norm=107.492, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.641e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 04:27:53,904 (trainer:732) INFO: 42epoch:train:10795-12593batch: iter_time=2.459e-04, forward_time=0.202, loss_att=43.725, acc=0.962, loss=43.725, backward_time=0.297, grad_norm=103.529, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.638e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 04:48:00,601 (trainer:732) INFO: 42epoch:train:12594-14392batch: iter_time=2.445e-04, forward_time=0.203, loss_att=43.817, acc=0.962, loss=43.817, backward_time=0.298, grad_norm=112.606, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.635e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 05:08:02,810 (trainer:732) INFO: 42epoch:train:14393-16191batch: iter_time=2.386e-04, forward_time=0.202, loss_att=43.649, acc=0.962, loss=43.649, backward_time=0.296, grad_norm=105.374, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.633e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 05:28:08,819 (trainer:732) INFO: 42epoch:train:16192-17990batch: iter_time=2.439e-04, forward_time=0.202, loss_att=44.336, acc=0.962, loss=44.336, backward_time=0.297, grad_norm=102.661, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.630e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 05:48:12,769 (trainer:732) INFO: 42epoch:train:17991-19789batch: iter_time=2.419e-04, forward_time=0.202, loss_att=43.785, acc=0.962, loss=43.785, backward_time=0.297, grad_norm=103.478, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.627e-04, train_time=2.677 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 11, fd 161) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 11, fd 161) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 06:08:18,663 (trainer:732) INFO: 42epoch:train:19790-21588batch: iter_time=2.407e-04, forward_time=0.203, loss_att=44.570, acc=0.961, loss=44.570, backward_time=0.298, grad_norm=103.998, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.624e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 06:28:21,338 (trainer:732) INFO: 42epoch:train:21589-23387batch: iter_time=2.352e-04, forward_time=0.202, loss_att=43.478, acc=0.962, loss=43.478, backward_time=0.297, grad_norm=102.499, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.621e-04, train_time=2.674 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 06:48:27,103 (trainer:732) INFO: 42epoch:train:23388-25186batch: iter_time=2.423e-04, forward_time=0.203, loss_att=43.646, acc=0.962, loss=43.646, backward_time=0.298, grad_norm=106.363, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.619e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 07:08:32,063 (trainer:732) INFO: 42epoch:train:25187-26985batch: iter_time=2.410e-04, forward_time=0.202, loss_att=43.755, acc=0.962, loss=43.755, backward_time=0.297, grad_norm=106.273, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.616e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 07:28:39,959 (trainer:732) INFO: 42epoch:train:26986-28784batch: iter_time=2.399e-04, forward_time=0.203, loss_att=44.460, acc=0.962, loss=44.460, backward_time=0.299, grad_norm=106.564, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.613e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 07:48:48,685 (trainer:732) INFO: 42epoch:train:28785-30583batch: iter_time=2.402e-04, forward_time=0.203, loss_att=43.955, acc=0.962, loss=43.955, backward_time=0.299, grad_norm=107.134, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.610e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 08:08:53,634 (trainer:732) INFO: 42epoch:train:30584-32382batch: iter_time=2.423e-04, forward_time=0.202, loss_att=43.976, acc=0.962, loss=43.976, backward_time=0.297, grad_norm=102.254, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.608e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 08:28:56,739 (trainer:732) INFO: 42epoch:train:32383-34181batch: iter_time=2.379e-04, forward_time=0.202, loss_att=44.053, acc=0.962, loss=44.053, backward_time=0.297, grad_norm=108.736, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.605e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 08:49:02,840 (trainer:732) INFO: 42epoch:train:34182-35980batch: iter_time=2.377e-04, forward_time=0.203, loss_att=44.462, acc=0.962, loss=44.462, backward_time=0.298, grad_norm=102.825, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=4.602e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 08:57:17,103 (trainer:338) INFO: 42epoch results: [train] iter_time=2.840e-04, forward_time=0.202, loss_att=43.932, acc=0.962, loss=43.932, backward_time=0.297, grad_norm=106.113, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.628e-04, train_time=2.709, time=6 hours, 46 minutes and 24.47 seconds, total_count=1511832, gpu_max_cached_mem_GB=30.176, [valid] loss_att=36.811, acc=0.968, cer=0.039, wer=0.124, loss=36.811, time=4 minutes and 26.64 seconds, total_count=504, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 29.7 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 08:57:20,691 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 08:57:20,703 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/35epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 08:57:20,703 (trainer:272) INFO: 43/60epoch started. Estimated time to finish: 5 days, 5 hours and 13 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 09:21:41,000 (trainer:732) INFO: 43epoch:train:1-1799batch: iter_time=6.457e-04, forward_time=0.203, loss_att=42.915, acc=0.963, loss=42.915, backward_time=0.298, grad_norm=104.594, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.599e-04, train_time=3.248 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 09:41:49,511 (trainer:732) INFO: 43epoch:train:1800-3598batch: iter_time=2.530e-04, forward_time=0.203, loss_att=43.681, acc=0.962, loss=43.681, backward_time=0.298, grad_norm=112.320, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.597e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 10:01:56,063 (trainer:732) INFO: 43epoch:train:3599-5397batch: iter_time=2.520e-04, forward_time=0.202, loss_att=43.570, acc=0.962, loss=43.570, backward_time=0.298, grad_norm=106.199, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.594e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 10:21:57,241 (trainer:732) INFO: 43epoch:train:5398-7196batch: iter_time=2.507e-04, forward_time=0.201, loss_att=43.609, acc=0.962, loss=43.609, backward_time=0.296, grad_norm=102.559, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.591e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 10:42:01,983 (trainer:732) INFO: 43epoch:train:7197-8995batch: iter_time=2.529e-04, forward_time=0.203, loss_att=42.722, acc=0.963, loss=42.722, backward_time=0.298, grad_norm=111.937, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.588e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 11:02:07,037 (trainer:732) INFO: 43epoch:train:8996-10794batch: iter_time=2.510e-04, forward_time=0.203, loss_att=43.807, acc=0.962, loss=43.807, backward_time=0.298, grad_norm=108.617, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=4.586e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 11:22:14,420 (trainer:732) INFO: 43epoch:train:10795-12593batch: iter_time=2.444e-04, forward_time=0.203, loss_att=44.076, acc=0.962, loss=44.076, backward_time=0.298, grad_norm=107.573, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.583e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 11:42:19,146 (trainer:732) INFO: 43epoch:train:12594-14392batch: iter_time=2.514e-04, forward_time=0.202, loss_att=43.527, acc=0.962, loss=43.527, backward_time=0.298, grad_norm=104.374, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.580e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 12:02:20,167 (trainer:732) INFO: 43epoch:train:14393-16191batch: iter_time=2.493e-04, forward_time=0.202, loss_att=42.971, acc=0.962, loss=42.971, backward_time=0.297, grad_norm=103.996, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.578e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 12:22:25,031 (trainer:732) INFO: 43epoch:train:16192-17990batch: iter_time=2.531e-04, forward_time=0.202, loss_att=43.576, acc=0.962, loss=43.576, backward_time=0.297, grad_norm=108.447, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.575e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 12:42:29,142 (trainer:732) INFO: 43epoch:train:17991-19789batch: iter_time=2.466e-04, forward_time=0.202, loss_att=43.796, acc=0.962, loss=43.796, backward_time=0.297, grad_norm=113.017, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.572e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 13:02:38,897 (trainer:732) INFO: 43epoch:train:19790-21588batch: iter_time=2.476e-04, forward_time=0.203, loss_att=44.357, acc=0.962, loss=44.357, backward_time=0.299, grad_norm=108.013, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.570e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 13:22:43,576 (trainer:732) INFO: 43epoch:train:21589-23387batch: iter_time=2.524e-04, forward_time=0.202, loss_att=43.755, acc=0.962, loss=43.755, backward_time=0.297, grad_norm=103.205, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.567e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 13:42:49,225 (trainer:732) INFO: 43epoch:train:23388-25186batch: iter_time=2.519e-04, forward_time=0.203, loss_att=44.120, acc=0.962, loss=44.120, backward_time=0.298, grad_norm=110.731, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.564e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 14:02:53,550 (trainer:732) INFO: 43epoch:train:25187-26985batch: iter_time=2.457e-04, forward_time=0.202, loss_att=44.170, acc=0.962, loss=44.170, backward_time=0.297, grad_norm=109.226, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.562e-04, train_time=2.678 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 12, fd 162) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 12, fd 162) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 14:22:58,268 (trainer:732) INFO: 43epoch:train:26986-28784batch: iter_time=2.535e-04, forward_time=0.203, loss_att=43.703, acc=0.962, loss=43.703, backward_time=0.298, grad_norm=107.978, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.559e-04, train_time=2.678 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 11, fd 161) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 11, fd 161) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 14:42:58,983 (trainer:732) INFO: 43epoch:train:28785-30583batch: iter_time=2.509e-04, forward_time=0.202, loss_att=43.810, acc=0.961, loss=43.810, backward_time=0.297, grad_norm=109.655, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.556e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 15:03:04,813 (trainer:732) INFO: 43epoch:train:30584-32382batch: iter_time=2.520e-04, forward_time=0.202, loss_att=43.858, acc=0.962, loss=43.858, backward_time=0.297, grad_norm=102.066, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.554e-04, train_time=2.681 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.205<55402> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 12, fd 162) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 15:23:10,753 (trainer:732) INFO: 43epoch:train:32383-34181batch: iter_time=2.514e-04, forward_time=0.202, loss_att=43.892, acc=0.962, loss=43.892, backward_time=0.298, grad_norm=104.430, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.551e-04, train_time=2.681 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 147) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 15:43:19,565 (trainer:732) INFO: 43epoch:train:34182-35980batch: iter_time=2.488e-04, forward_time=0.203, loss_att=43.729, acc=0.962, loss=43.729, backward_time=0.299, grad_norm=103.396, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.548e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 15:51:42,499 (trainer:338) INFO: 43epoch results: [train] iter_time=2.702e-04, forward_time=0.202, loss_att=43.682, acc=0.962, loss=43.682, backward_time=0.298, grad_norm=107.111, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.574e-04, train_time=2.708, time=6 hours, 46 minutes and 16.4 seconds, total_count=1547828, gpu_max_cached_mem_GB=30.176, [valid] loss_att=36.589, acc=0.967, cer=0.039, wer=0.126, loss=36.589, time=4 minutes and 30.25 seconds, total_count=516, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 35.15 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 15:51:46,325 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 15:51:46,358 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/30epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 15:51:46,358 (trainer:272) INFO: 44/60epoch started. Estimated time to finish: 4 days, 22 hours and 15 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 16:16:19,941 (trainer:732) INFO: 44epoch:train:1-1799batch: iter_time=7.514e-04, forward_time=0.203, loss_att=43.692, acc=0.962, loss=43.692, backward_time=0.297, grad_norm=104.993, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.546e-04, train_time=3.277 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 16:36:27,377 (trainer:732) INFO: 44epoch:train:1800-3598batch: iter_time=2.558e-04, forward_time=0.203, loss_att=42.907, acc=0.963, loss=42.907, backward_time=0.298, grad_norm=108.511, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.543e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 16:56:34,923 (trainer:732) INFO: 44epoch:train:3599-5397batch: iter_time=2.599e-04, forward_time=0.203, loss_att=43.346, acc=0.962, loss=43.346, backward_time=0.298, grad_norm=107.835, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.540e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 17:16:38,927 (trainer:732) INFO: 44epoch:train:5398-7196batch: iter_time=2.541e-04, forward_time=0.202, loss_att=42.863, acc=0.962, loss=42.863, backward_time=0.297, grad_norm=103.408, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.538e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 17:36:39,866 (trainer:732) INFO: 44epoch:train:7197-8995batch: iter_time=2.522e-04, forward_time=0.202, loss_att=43.058, acc=0.962, loss=43.058, backward_time=0.296, grad_norm=102.642, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.535e-04, train_time=2.670 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 17:56:47,657 (trainer:732) INFO: 44epoch:train:8996-10794batch: iter_time=2.533e-04, forward_time=0.203, loss_att=43.509, acc=0.962, loss=43.509, backward_time=0.298, grad_norm=110.626, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.532e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 18:16:51,524 (trainer:732) INFO: 44epoch:train:10795-12593batch: iter_time=2.535e-04, forward_time=0.202, loss_att=42.736, acc=0.963, loss=42.736, backward_time=0.297, grad_norm=110.194, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.530e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 18:36:56,219 (trainer:732) INFO: 44epoch:train:12594-14392batch: iter_time=2.536e-04, forward_time=0.203, loss_att=43.444, acc=0.962, loss=43.444, backward_time=0.297, grad_norm=109.557, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.527e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 18:57:03,578 (trainer:732) INFO: 44epoch:train:14393-16191batch: iter_time=2.491e-04, forward_time=0.203, loss_att=44.150, acc=0.962, loss=44.150, backward_time=0.298, grad_norm=106.073, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.525e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 19:17:08,233 (trainer:732) INFO: 44epoch:train:16192-17990batch: iter_time=2.505e-04, forward_time=0.202, loss_att=43.274, acc=0.962, loss=43.274, backward_time=0.297, grad_norm=106.011, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.522e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 19:37:15,510 (trainer:732) INFO: 44epoch:train:17991-19789batch: iter_time=2.513e-04, forward_time=0.203, loss_att=43.698, acc=0.962, loss=43.698, backward_time=0.298, grad_norm=106.208, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.519e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 19:57:22,416 (trainer:732) INFO: 44epoch:train:19790-21588batch: iter_time=2.478e-04, forward_time=0.203, loss_att=43.312, acc=0.962, loss=43.312, backward_time=0.298, grad_norm=107.467, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.517e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 20:17:29,500 (trainer:732) INFO: 44epoch:train:21589-23387batch: iter_time=2.475e-04, forward_time=0.203, loss_att=43.958, acc=0.962, loss=43.958, backward_time=0.298, grad_norm=106.389, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.514e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 20:37:31,774 (trainer:732) INFO: 44epoch:train:23388-25186batch: iter_time=2.470e-04, forward_time=0.202, loss_att=43.349, acc=0.962, loss=43.349, backward_time=0.297, grad_norm=102.626, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.512e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 20:57:36,991 (trainer:732) INFO: 44epoch:train:25187-26985batch: iter_time=2.456e-04, forward_time=0.202, loss_att=43.603, acc=0.962, loss=43.603, backward_time=0.297, grad_norm=104.774, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.509e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 21:17:37,792 (trainer:732) INFO: 44epoch:train:26986-28784batch: iter_time=2.490e-04, forward_time=0.202, loss_att=43.005, acc=0.962, loss=43.005, backward_time=0.296, grad_norm=106.917, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.506e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 21:37:43,826 (trainer:732) INFO: 44epoch:train:28785-30583batch: iter_time=2.462e-04, forward_time=0.203, loss_att=44.249, acc=0.962, loss=44.249, backward_time=0.298, grad_norm=114.019, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.504e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 21:57:48,101 (trainer:732) INFO: 44epoch:train:30584-32382batch: iter_time=2.476e-04, forward_time=0.202, loss_att=43.018, acc=0.962, loss=43.018, backward_time=0.297, grad_norm=109.897, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.501e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 22:17:54,019 (trainer:732) INFO: 44epoch:train:32383-34181batch: iter_time=2.503e-04, forward_time=0.203, loss_att=43.704, acc=0.962, loss=43.704, backward_time=0.298, grad_norm=111.297, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.499e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 22:38:02,497 (trainer:732) INFO: 44epoch:train:34182-35980batch: iter_time=2.468e-04, forward_time=0.203, loss_att=43.922, acc=0.962, loss=43.922, backward_time=0.298, grad_norm=117.643, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.496e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 22:46:08,358 (trainer:338) INFO: 44epoch results: [train] iter_time=2.756e-04, forward_time=0.203, loss_att=43.436, acc=0.962, loss=43.436, backward_time=0.298, grad_norm=107.849, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.521e-04, train_time=2.710, time=6 hours, 46 minutes and 33.61 seconds, total_count=1583824, gpu_max_cached_mem_GB=30.176, [valid] loss_att=37.906, acc=0.967, cer=0.038, wer=0.122, loss=37.906, time=4 minutes and 22.68 seconds, total_count=528, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 25.71 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 22:46:11,977 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 22:46:12,005 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/36epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 22:46:12,006 (trainer:272) INFO: 45/60epoch started. Estimated time to finish: 4 days, 15 hours and 16 minutes + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 12, fd 162) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 12, fd 162) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 23:10:29,387 (trainer:732) INFO: 45epoch:train:1-1799batch: iter_time=0.001, forward_time=0.202, loss_att=42.924, acc=0.963, loss=42.924, backward_time=0.297, grad_norm=107.348, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.494e-04, train_time=3.241 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 147) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 23:30:32,027 (trainer:732) INFO: 45epoch:train:1800-3598batch: iter_time=2.405e-04, forward_time=0.202, loss_att=43.254, acc=0.962, loss=43.254, backward_time=0.297, grad_norm=106.701, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.491e-04, train_time=2.674 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-15 23:50:38,109 (trainer:732) INFO: 45epoch:train:3599-5397batch: iter_time=2.407e-04, forward_time=0.203, loss_att=43.037, acc=0.963, loss=43.037, backward_time=0.298, grad_norm=111.937, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.489e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 00:10:46,209 (trainer:732) INFO: 45epoch:train:5398-7196batch: iter_time=2.415e-04, forward_time=0.203, loss_att=43.506, acc=0.963, loss=43.506, backward_time=0.298, grad_norm=101.737, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.486e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 00:30:52,048 (trainer:732) INFO: 45epoch:train:7197-8995batch: iter_time=2.368e-04, forward_time=0.203, loss_att=42.883, acc=0.963, loss=42.883, backward_time=0.298, grad_norm=112.267, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.483e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 00:50:58,264 (trainer:732) INFO: 45epoch:train:8996-10794batch: iter_time=2.387e-04, forward_time=0.203, loss_att=43.493, acc=0.962, loss=43.493, backward_time=0.298, grad_norm=104.182, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.481e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 01:11:04,264 (trainer:732) INFO: 45epoch:train:10795-12593batch: iter_time=2.342e-04, forward_time=0.203, loss_att=42.916, acc=0.963, loss=42.916, backward_time=0.298, grad_norm=107.651, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.478e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 01:31:09,369 (trainer:732) INFO: 45epoch:train:12594-14392batch: iter_time=2.367e-04, forward_time=0.202, loss_att=43.301, acc=0.962, loss=43.301, backward_time=0.297, grad_norm=105.339, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.476e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 01:51:11,839 (trainer:732) INFO: 45epoch:train:14393-16191batch: iter_time=2.362e-04, forward_time=0.202, loss_att=42.729, acc=0.963, loss=42.729, backward_time=0.297, grad_norm=101.616, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.473e-04, train_time=2.674 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 02:11:15,903 (trainer:732) INFO: 45epoch:train:16192-17990batch: iter_time=2.349e-04, forward_time=0.202, loss_att=42.567, acc=0.963, loss=42.567, backward_time=0.297, grad_norm=107.856, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.471e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 02:31:21,394 (trainer:732) INFO: 45epoch:train:17991-19789batch: iter_time=2.380e-04, forward_time=0.203, loss_att=43.569, acc=0.962, loss=43.569, backward_time=0.298, grad_norm=104.816, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.468e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 02:51:29,131 (trainer:732) INFO: 45epoch:train:19790-21588batch: iter_time=2.353e-04, forward_time=0.203, loss_att=42.861, acc=0.963, loss=42.861, backward_time=0.298, grad_norm=101.806, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.466e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 03:11:36,279 (trainer:732) INFO: 45epoch:train:21589-23387batch: iter_time=2.370e-04, forward_time=0.203, loss_att=42.991, acc=0.963, loss=42.991, backward_time=0.298, grad_norm=104.712, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.463e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 03:31:41,667 (trainer:732) INFO: 45epoch:train:23388-25186batch: iter_time=2.384e-04, forward_time=0.202, loss_att=43.465, acc=0.962, loss=43.465, backward_time=0.298, grad_norm=109.951, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.461e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 03:51:49,103 (trainer:732) INFO: 45epoch:train:25187-26985batch: iter_time=2.356e-04, forward_time=0.203, loss_att=43.468, acc=0.962, loss=43.468, backward_time=0.298, grad_norm=105.335, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.458e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 04:11:56,568 (trainer:732) INFO: 45epoch:train:26986-28784batch: iter_time=2.411e-04, forward_time=0.203, loss_att=43.590, acc=0.962, loss=43.590, backward_time=0.298, grad_norm=107.851, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.456e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 04:32:02,689 (trainer:732) INFO: 45epoch:train:28785-30583batch: iter_time=2.359e-04, forward_time=0.203, loss_att=43.239, acc=0.962, loss=43.239, backward_time=0.298, grad_norm=105.621, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.453e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 04:52:06,809 (trainer:732) INFO: 45epoch:train:30584-32382batch: iter_time=2.332e-04, forward_time=0.202, loss_att=43.959, acc=0.962, loss=43.959, backward_time=0.297, grad_norm=105.854, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.451e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 05:12:10,009 (trainer:732) INFO: 45epoch:train:32383-34181batch: iter_time=2.328e-04, forward_time=0.202, loss_att=42.993, acc=0.962, loss=42.993, backward_time=0.297, grad_norm=105.604, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.448e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 05:32:10,345 (trainer:732) INFO: 45epoch:train:34182-35980batch: iter_time=2.341e-04, forward_time=0.201, loss_att=43.531, acc=0.962, loss=43.531, backward_time=0.296, grad_norm=109.091, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.446e-04, train_time=2.668 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 05:40:17,914 (trainer:338) INFO: 45epoch results: [train] iter_time=2.854e-04, forward_time=0.202, loss_att=43.213, acc=0.962, loss=43.213, backward_time=0.298, grad_norm=106.364, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.470e-04, train_time=2.708, time=6 hours, 46 minutes and 15.93 seconds, total_count=1619820, gpu_max_cached_mem_GB=30.176, [valid] loss_att=36.160, acc=0.968, cer=0.039, wer=0.122, loss=36.160, time=4 minutes and 20.79 seconds, total_count=540, gpu_max_cached_mem_GB=30.176, [att_plot] time=3 minutes and 29.19 seconds, total_count=0, gpu_max_cached_mem_GB=30.176 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 05:40:21,686 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 05:40:21,715 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/33epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 05:40:21,715 (trainer:272) INFO: 46/60epoch started. Estimated time to finish: 4 days, 8 hours and 18 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 06:04:48,190 (trainer:732) INFO: 46epoch:train:1-1799batch: iter_time=9.831e-04, forward_time=0.202, loss_att=42.867, acc=0.963, loss=42.867, backward_time=0.297, grad_norm=109.645, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.443e-04, train_time=3.262 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 06:24:52,762 (trainer:732) INFO: 46epoch:train:1800-3598batch: iter_time=2.376e-04, forward_time=0.202, loss_att=42.515, acc=0.963, loss=42.515, backward_time=0.297, grad_norm=102.889, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.441e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 06:45:01,396 (trainer:732) INFO: 46epoch:train:3599-5397batch: iter_time=2.378e-04, forward_time=0.203, loss_att=42.907, acc=0.963, loss=42.907, backward_time=0.299, grad_norm=107.387, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.439e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 07:05:07,481 (trainer:732) INFO: 46epoch:train:5398-7196batch: iter_time=2.305e-04, forward_time=0.202, loss_att=42.694, acc=0.963, loss=42.694, backward_time=0.297, grad_norm=109.138, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.436e-04, train_time=2.681 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813082:1813868 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 146) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 07:25:14,716 (trainer:732) INFO: 46epoch:train:7197-8995batch: iter_time=2.329e-04, forward_time=0.203, loss_att=43.301, acc=0.963, loss=43.301, backward_time=0.298, grad_norm=106.367, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.434e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 07:45:19,911 (trainer:732) INFO: 46epoch:train:8996-10794batch: iter_time=2.336e-04, forward_time=0.202, loss_att=43.022, acc=0.962, loss=43.022, backward_time=0.297, grad_norm=107.259, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.431e-04, train_time=2.679 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813081:1813867 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 9, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813083:1813870 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 08:05:26,827 (trainer:732) INFO: 46epoch:train:10795-12593batch: iter_time=2.303e-04, forward_time=0.203, loss_att=42.925, acc=0.963, loss=42.925, backward_time=0.298, grad_norm=107.699, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.429e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 08:25:29,673 (trainer:732) INFO: 46epoch:train:12594-14392batch: iter_time=2.338e-04, forward_time=0.202, loss_att=42.815, acc=0.963, loss=42.815, backward_time=0.297, grad_norm=105.171, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.426e-04, train_time=2.674 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 12, fd 162) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:1813079:1813869 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 12, fd 162) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 08:45:36,735 (trainer:732) INFO: 46epoch:train:14393-16191batch: iter_time=2.288e-04, forward_time=0.203, loss_att=43.009, acc=0.963, loss=43.009, backward_time=0.298, grad_norm=106.552, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.424e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 09:05:37,964 (trainer:732) INFO: 46epoch:train:16192-17990batch: iter_time=2.334e-04, forward_time=0.202, loss_att=42.769, acc=0.962, loss=42.769, backward_time=0.296, grad_norm=104.659, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.421e-04, train_time=2.671 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 09:25:45,070 (trainer:732) INFO: 46epoch:train:17991-19789batch: iter_time=2.311e-04, forward_time=0.203, loss_att=43.057, acc=0.963, loss=43.057, backward_time=0.298, grad_norm=111.809, clip=100.000, loss_scale=1.000, optim_step_time=0.055, optim0_lr0=4.419e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 09:45:50,987 (trainer:732) INFO: 46epoch:train:19790-21588batch: iter_time=2.372e-04, forward_time=0.202, loss_att=42.880, acc=0.963, loss=42.880, backward_time=0.298, grad_norm=108.066, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.417e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 10:05:56,528 (trainer:732) INFO: 46epoch:train:21589-23387batch: iter_time=2.339e-04, forward_time=0.202, loss_att=42.771, acc=0.963, loss=42.771, backward_time=0.298, grad_norm=102.509, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.414e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 10:26:04,756 (trainer:732) INFO: 46epoch:train:23388-25186batch: iter_time=2.357e-04, forward_time=0.203, loss_att=44.113, acc=0.962, loss=44.113, backward_time=0.299, grad_norm=110.229, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.412e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 10:46:05,342 (trainer:732) INFO: 46epoch:train:25187-26985batch: iter_time=2.364e-04, forward_time=0.202, loss_att=41.980, acc=0.963, loss=41.980, backward_time=0.296, grad_norm=114.412, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.409e-04, train_time=2.669 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 11:06:09,640 (trainer:732) INFO: 46epoch:train:26986-28784batch: iter_time=2.374e-04, forward_time=0.202, loss_att=43.110, acc=0.962, loss=43.110, backward_time=0.297, grad_norm=115.451, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.407e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 11:26:15,553 (trainer:732) INFO: 46epoch:train:28785-30583batch: iter_time=2.354e-04, forward_time=0.202, loss_att=43.057, acc=0.963, loss=43.057, backward_time=0.298, grad_norm=110.854, clip=100.000, loss_scale=1.000, optim_step_time=0.057, optim0_lr0=4.405e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 11:46:14,762 (trainer:732) INFO: 46epoch:train:30584-32382batch: iter_time=2.333e-04, forward_time=0.202, loss_att=43.342, acc=0.962, loss=43.342, backward_time=0.296, grad_norm=121.803, clip=100.000, loss_scale=1.000, optim_step_time=0.056, optim0_lr0=4.402e-04, train_time=2.666 +Exception ignored from cffi callback .vio_tell at 0x7ff4526ab310>: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 197, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 23, in +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/soundfile.py", line 1264, in vio_tell + @_ffi.callback("sf_vio_tell") +KeyboardInterrupt: + main() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 19, in main + ASRTask.main(cmd=cmd) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1132, in main + while not ProcessContext(processes, error_queues).join(): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 109, in join + ready = multiprocessing.connection.wait( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/connection.py", line 931, in wait + ready = selector.select(timeout) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/selectors.py", line 416, in select + fd_event_list = self._selector.poll(timeout) +KeyboardInterrupt +Process SpawnProcess-1: +Process SpawnProcess-3: +Traceback (most recent call last): +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 632, in train_one_epoch + loss.backward() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/_tensor.py", line 396, in backward + torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 632, in train_one_epoch + loss.backward() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward + Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/_tensor.py", line 396, in backward + torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs) +KeyboardInterrupt + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward + Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +KeyboardInterrupt +Process SpawnProcess-2: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 632, in train_one_epoch + loss.backward() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/_tensor.py", line 396, in backward + torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward + Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +KeyboardInterrupt +Process SpawnProcess-4: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 632, in train_one_epoch + loss.backward() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/_tensor.py", line 396, in backward + torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward + Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +KeyboardInterrupt diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/train.log b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/train.log new file mode 100644 index 0000000000000000000000000000000000000000..8320da929bf9b8d5b6078ed59f6e7e2771b18985 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/train.log @@ -0,0 +1,4076 @@ +# python3 -m espnet2.bin.asr_train --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_2spk_kaldi_fmt/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new --config conf/tuning/train_sot_asr_conformer_large.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/train_large_kaldi_fmt/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/train_large_kaldi_fmt/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/dev_2spk_kaldi_fmt/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 4 --multiprocessing_distributed True +# Started at Tue Jan 16 12:45:30 CST 2024 +# +/star-home/jinzengrui/lib/miniconda3/envs/espnet/bin/python3 /star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_2spk_kaldi_fmt/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new --config conf/tuning/train_sot_asr_conformer_large.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/train_large_kaldi_fmt/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/train_large_kaldi_fmt/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/dev_2spk_kaldi_fmt/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 4 --multiprocessing_distributed True +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:46:42,134 (distributed_c10d:228) INFO: Added key: store_based_barrier_key:1 to store for rank: 0 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:46:42,135 (distributed_c10d:262) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes. +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:46:42,160 (asr:490) INFO: Vocabulary size: 32 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:46:46,375 (abs_task:1229) INFO: pytorch.version=1.12.1+cu116, cuda.available=True, cudnn.version=8302, cudnn.benchmark=False, cudnn.deterministic=True +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:46:46,384 (abs_task:1230) INFO: Model structure: +ESPnetASRModel( + (frontend): DefaultFrontend( + (stft): Stft(n_fft=512, win_length=512, hop_length=128, center=True, normalized=False, onesided=True) + (frontend): Frontend() + (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False) + ) + (normalize): GlobalMVN(stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz, norm_means=True, norm_vars=True) + (encoder): ConformerEncoder( + (embed): Conv2dSubsampling( + (conv): Sequential( + (0): Conv2d(1, 256, kernel_size=(3, 3), stride=(2, 2)) + (1): ReLU() + (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2)) + (3): ReLU() + ) + (out): Sequential( + (0): Linear(in_features=4864, out_features=256, bias=True) + (1): RelPositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (6): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (7): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (8): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (9): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (10): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (11): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (decoder): TransformerDecoder( + (embed): Sequential( + (0): Embedding(32, 256) + (1): PositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (output_layer): Linear(in_features=256, out_features=32, bias=True) + (decoders): MultiSequential( + (0): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (criterion_att): LabelSmoothingLoss( + (criterion): KLDivLoss() + ) +) + +Model summary: + Class Name: ESPnetASRModel + Total Number of model parameters: 43.00 M + Number of trainable parameters: 43.00 M (100.0%) + Size: 172.01 MB + Type: torch.float32 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:46:46,384 (abs_task:1233) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + eps: 1e-08 + foreach: None + initial_lr: 0.002 + lr: 1e-07 + maximize: False + weight_decay: 1e-06 +) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:46:46,384 (abs_task:1234) INFO: Scheduler: WarmupLR(warmup_steps=20000) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:46:46,386 (abs_task:1243) INFO: Saving the configuration in exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/config.yaml +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:46:46,400 (abs_task:1304) INFO: Loading pretrained params from /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:46:53,651 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:47:16,737 (abs_task:1614) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/train_large_kaldi_fmt/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/train_large_kaldi_fmt/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:47:16,737 (abs_task:1615) INFO: [train] Batch sampler: NumElementsBatchSampler(N-batch=35996, batch_bins=16000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:47:16,744 (abs_task:1616) INFO: [train] mini-batch sizes summary: N-batch=35996, mean=53.8, min=15, max=258 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:47:17,039 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:47:17,051 (abs_task:1614) INFO: [valid] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/dev_2spk_kaldi_fmt/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/dev_2spk_kaldi_fmt/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:47:17,051 (abs_task:1615) INFO: [valid] Batch sampler: NumElementsBatchSampler(N-batch=36, batch_bins=16000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:47:17,051 (abs_task:1616) INFO: [valid] mini-batch sizes summary: N-batch=36, mean=44.6, min=8, max=83 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:47:17,058 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:47:17,080 (abs_task:1614) INFO: [plot_att] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/dev_2spk_kaldi_fmt/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/dev_2spk_kaldi_fmt/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:47:17,080 (abs_task:1615) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=1606, batch_size=1, key_file=exp/asr_stats_raw_en_char/valid/speech_shape, +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:47:17,081 (abs_task:1616) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:47:19,819 (trainer:159) INFO: The training was resumed using exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/checkpoint.pth +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3520216 [0] NCCL INFO Bootstrap : Using eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3520216 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3520216 [0] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3520216 [0] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3520216 [0] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3520216 [0] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3520216 [0] NCCL INFO NET/IB : No device found. +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3520216 [0] NCCL INFO NET/Socket : Using [0]eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3520216 [0] NCCL INFO Using network Socket +NCCL version 2.10.3+cuda11.6 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3520218 [2] NCCL INFO Bootstrap : Using eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3520217 [1] NCCL INFO Bootstrap : Using eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3520219 [3] NCCL INFO Bootstrap : Using eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3520218 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3520219 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3520217 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3520218 [2] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3520218 [2] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3520218 [2] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3520218 [2] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3520218 [2] NCCL INFO NET/IB : No device found. +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3520218 [2] NCCL INFO NET/Socket : Using [0]eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3520218 [2] NCCL INFO Using network Socket + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3520219 [3] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3520219 [3] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3520219 [3] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3520219 [3] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3520219 [3] NCCL INFO NET/IB : No device found. +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3520219 [3] NCCL INFO NET/Socket : Using [0]eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3520219 [3] NCCL INFO Using network Socket + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3520217 [1] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3520217 [1] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3520217 [1] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3520217 [1] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3520217 [1] NCCL INFO NET/IB : No device found. +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3520217 [1] NCCL INFO NET/Socket : Using [0]eth0:10.177.6.147<0> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3520217 [1] NCCL INFO Using network Socket +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521143 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521141 [0] NCCL INFO Channel 00/02 : 0 1 2 3 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521144 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521145 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521143 [2] NCCL INFO Setting affinity for GPU 6 to ff,ffc0000f,fffc0000 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521141 [0] NCCL INFO Channel 01/02 : 0 1 2 3 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521144 [3] NCCL INFO Setting affinity for GPU 7 to ff,ffc0000f,fffc0000 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521145 [1] NCCL INFO Setting affinity for GPU 5 to ff,ffc0000f,fffc0000 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521141 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521141 [0] NCCL INFO Setting affinity for GPU 4 to ff,ffc0000f,fffc0000 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521145 [1] NCCL INFO Channel 00 : 1[b2000] -> 2[b4000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521144 [3] NCCL INFO Channel 00 : 3[b5000] -> 0[b1000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521141 [0] NCCL INFO Channel 00 : 0[b1000] -> 1[b2000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521145 [1] NCCL INFO Channel 01 : 1[b2000] -> 2[b4000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521144 [3] NCCL INFO Channel 01 : 3[b5000] -> 0[b1000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521143 [2] NCCL INFO Channel 00 : 2[b4000] -> 3[b5000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521141 [0] NCCL INFO Channel 01 : 0[b1000] -> 1[b2000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521143 [2] NCCL INFO Channel 01 : 2[b4000] -> 3[b5000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521141 [0] NCCL INFO Connected all rings +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521145 [1] NCCL INFO Connected all rings +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521144 [3] NCCL INFO Connected all rings +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521144 [3] NCCL INFO Channel 00 : 3[b5000] -> 2[b4000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521144 [3] NCCL INFO Channel 01 : 3[b5000] -> 2[b4000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521143 [2] NCCL INFO Connected all rings +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521145 [1] NCCL INFO Channel 00 : 1[b2000] -> 0[b1000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521145 [1] NCCL INFO Channel 01 : 1[b2000] -> 0[b1000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521141 [0] NCCL INFO Connected all trees +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521141 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521141 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521143 [2] NCCL INFO Channel 00 : 2[b4000] -> 1[b2000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521143 [2] NCCL INFO Channel 01 : 2[b4000] -> 1[b2000] via P2P/IPC +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521144 [3] NCCL INFO Connected all trees +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521144 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521144 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521145 [1] NCCL INFO Connected all trees +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521145 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521145 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521143 [2] NCCL INFO Connected all trees +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521143 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521143 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521144 [3] NCCL INFO comm 0x7fec10002f70 rank 3 nranks 4 cudaDev 3 busId b5000 - Init COMPLETE +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521141 [0] NCCL INFO comm 0x7fa4fc002f70 rank 0 nranks 4 cudaDev 0 busId b1000 - Init COMPLETE +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521145 [1] NCCL INFO comm 0x7f5320002f70 rank 1 nranks 4 cudaDev 1 busId b2000 - Init COMPLETE +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521143 [2] NCCL INFO comm 0x7f0e94002f70 rank 2 nranks 4 cudaDev 2 busId b4000 - Init COMPLETE +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3520216 [0] NCCL INFO Launch mode Parallel +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:47:24,270 (trainer:284) INFO: 46/60epoch started +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 12:52:30,714 (distributed:995) INFO: Reducer buckets have been rebuilt in this iteration. +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 13:12:34,592 (trainer:732) INFO: 46epoch:train:1-1799batch: iter_time=8.268e-04, forward_time=0.206, loss_att=42.857, acc=0.963, loss=42.857, backward_time=0.298, grad_norm=109.508, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.443e-04, train_time=3.359 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 13:32:40,518 (trainer:732) INFO: 46epoch:train:1800-3598batch: iter_time=2.151e-04, forward_time=0.202, loss_att=42.475, acc=0.963, loss=42.475, backward_time=0.298, grad_norm=109.243, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.441e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 13:52:50,629 (trainer:732) INFO: 46epoch:train:3599-5397batch: iter_time=2.131e-04, forward_time=0.202, loss_att=42.830, acc=0.963, loss=42.830, backward_time=0.299, grad_norm=109.327, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.439e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 14:12:58,990 (trainer:732) INFO: 46epoch:train:5398-7196batch: iter_time=2.212e-04, forward_time=0.202, loss_att=42.646, acc=0.963, loss=42.646, backward_time=0.298, grad_norm=102.856, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.436e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 14:33:07,553 (trainer:732) INFO: 46epoch:train:7197-8995batch: iter_time=2.192e-04, forward_time=0.202, loss_att=43.346, acc=0.963, loss=43.346, backward_time=0.299, grad_norm=104.257, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.434e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 14:53:13,861 (trainer:732) INFO: 46epoch:train:8996-10794batch: iter_time=2.172e-04, forward_time=0.202, loss_att=43.046, acc=0.962, loss=43.046, backward_time=0.298, grad_norm=104.621, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.431e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 15:13:23,097 (trainer:732) INFO: 46epoch:train:10795-12593batch: iter_time=2.206e-04, forward_time=0.202, loss_att=42.905, acc=0.963, loss=42.905, backward_time=0.299, grad_norm=108.596, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.429e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 15:33:28,093 (trainer:732) INFO: 46epoch:train:12594-14392batch: iter_time=2.203e-04, forward_time=0.201, loss_att=42.721, acc=0.963, loss=42.721, backward_time=0.298, grad_norm=106.450, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.426e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 15:53:37,364 (trainer:732) INFO: 46epoch:train:14393-16191batch: iter_time=2.159e-04, forward_time=0.202, loss_att=42.981, acc=0.963, loss=42.981, backward_time=0.299, grad_norm=108.447, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.424e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 16:13:41,738 (trainer:732) INFO: 46epoch:train:16192-17990batch: iter_time=2.168e-04, forward_time=0.201, loss_att=42.778, acc=0.962, loss=42.778, backward_time=0.297, grad_norm=103.760, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.421e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 16:33:50,024 (trainer:732) INFO: 46epoch:train:17991-19789batch: iter_time=2.188e-04, forward_time=0.202, loss_att=43.048, acc=0.963, loss=43.048, backward_time=0.299, grad_norm=107.620, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.419e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 16:53:59,185 (trainer:732) INFO: 46epoch:train:19790-21588batch: iter_time=2.172e-04, forward_time=0.202, loss_att=42.878, acc=0.963, loss=42.878, backward_time=0.299, grad_norm=107.261, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.417e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 17:14:06,228 (trainer:732) INFO: 46epoch:train:21589-23387batch: iter_time=2.190e-04, forward_time=0.202, loss_att=42.756, acc=0.963, loss=42.756, backward_time=0.298, grad_norm=106.711, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.414e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 17:34:17,824 (trainer:732) INFO: 46epoch:train:23388-25186batch: iter_time=2.172e-04, forward_time=0.202, loss_att=44.107, acc=0.962, loss=44.107, backward_time=0.299, grad_norm=108.433, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.412e-04, train_time=2.693 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 17:54:20,243 (trainer:732) INFO: 46epoch:train:25187-26985batch: iter_time=2.145e-04, forward_time=0.201, loss_att=41.925, acc=0.963, loss=41.925, backward_time=0.297, grad_norm=107.417, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.409e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 18:14:27,118 (trainer:732) INFO: 46epoch:train:26986-28784batch: iter_time=2.157e-04, forward_time=0.202, loss_att=42.998, acc=0.962, loss=42.998, backward_time=0.298, grad_norm=116.591, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.407e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 18:34:34,417 (trainer:732) INFO: 46epoch:train:28785-30583batch: iter_time=2.142e-04, forward_time=0.202, loss_att=42.917, acc=0.963, loss=42.917, backward_time=0.298, grad_norm=108.445, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.405e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 18:54:36,577 (trainer:732) INFO: 46epoch:train:30584-32382batch: iter_time=2.155e-04, forward_time=0.201, loss_att=43.317, acc=0.962, loss=43.317, backward_time=0.297, grad_norm=112.440, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.402e-04, train_time=2.673 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 19:14:47,789 (trainer:732) INFO: 46epoch:train:32383-34181batch: iter_time=2.156e-04, forward_time=0.203, loss_att=44.076, acc=0.962, loss=44.076, backward_time=0.300, grad_norm=111.531, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.400e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 19:34:54,510 (trainer:732) INFO: 46epoch:train:34182-35980batch: iter_time=2.159e-04, forward_time=0.202, loss_att=42.924, acc=0.962, loss=42.924, backward_time=0.298, grad_norm=111.828, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.397e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 19:43:32,558 (trainer:338) INFO: 46epoch results: [train] iter_time=2.474e-04, forward_time=0.202, loss_att=42.978, acc=0.963, loss=42.978, backward_time=0.298, grad_norm=108.324, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.420e-04, train_time=2.718, time=6 hours, 47 minutes and 47.5 seconds, total_count=1655816, gpu_max_cached_mem_GB=30.396, [valid] loss_att=24.209, acc=0.982, cer=0.023, wer=0.077, loss=24.209, time=4 minutes and 29.7 seconds, total_count=576, gpu_max_cached_mem_GB=30.396, [att_plot] time=3 minutes and 51.04 seconds, total_count=0, gpu_max_cached_mem_GB=30.396 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 19:43:36,420 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 19:43:36,434 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/32epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 19:43:36,435 (trainer:272) INFO: 47/60epoch started. Estimated time to finish: 4 days, 1 hour and 6 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 20:08:02,007 (trainer:732) INFO: 47epoch:train:1-1799batch: iter_time=9.808e-04, forward_time=0.203, loss_att=42.160, acc=0.963, loss=42.160, backward_time=0.298, grad_norm=113.369, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.395e-04, train_time=3.259 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 20:28:06,908 (trainer:732) INFO: 47epoch:train:1800-3598batch: iter_time=2.236e-04, forward_time=0.202, loss_att=43.072, acc=0.962, loss=43.072, backward_time=0.296, grad_norm=119.705, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.393e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 20:48:17,413 (trainer:732) INFO: 47epoch:train:3599-5397batch: iter_time=2.189e-04, forward_time=0.202, loss_att=42.418, acc=0.963, loss=42.418, backward_time=0.298, grad_norm=107.733, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.390e-04, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 21:08:25,417 (trainer:732) INFO: 47epoch:train:5398-7196batch: iter_time=2.178e-04, forward_time=0.202, loss_att=42.853, acc=0.963, loss=42.853, backward_time=0.298, grad_norm=104.649, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.388e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 21:28:34,492 (trainer:732) INFO: 47epoch:train:7197-8995batch: iter_time=2.198e-04, forward_time=0.203, loss_att=42.740, acc=0.963, loss=42.740, backward_time=0.298, grad_norm=99.644, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.385e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 21:48:43,423 (trainer:732) INFO: 47epoch:train:8996-10794batch: iter_time=2.123e-04, forward_time=0.202, loss_att=42.573, acc=0.963, loss=42.573, backward_time=0.298, grad_norm=108.993, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.383e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 22:08:48,794 (trainer:732) INFO: 47epoch:train:10795-12593batch: iter_time=2.188e-04, forward_time=0.202, loss_att=42.471, acc=0.963, loss=42.471, backward_time=0.297, grad_norm=104.840, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.381e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 22:28:55,041 (trainer:732) INFO: 47epoch:train:12594-14392batch: iter_time=2.125e-04, forward_time=0.202, loss_att=42.715, acc=0.963, loss=42.715, backward_time=0.297, grad_norm=108.538, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.378e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 22:49:03,070 (trainer:732) INFO: 47epoch:train:14393-16191batch: iter_time=2.139e-04, forward_time=0.202, loss_att=42.264, acc=0.963, loss=42.264, backward_time=0.297, grad_norm=112.707, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.376e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 23:09:12,221 (trainer:732) INFO: 47epoch:train:16192-17990batch: iter_time=2.149e-04, forward_time=0.202, loss_att=43.047, acc=0.963, loss=43.047, backward_time=0.298, grad_norm=110.715, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.374e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 23:29:24,096 (trainer:732) INFO: 47epoch:train:17991-19789batch: iter_time=2.132e-04, forward_time=0.203, loss_att=42.683, acc=0.963, loss=42.683, backward_time=0.298, grad_norm=107.963, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.371e-04, train_time=2.694 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-16 23:49:32,222 (trainer:732) INFO: 47epoch:train:19790-21588batch: iter_time=2.121e-04, forward_time=0.202, loss_att=42.666, acc=0.963, loss=42.666, backward_time=0.298, grad_norm=105.228, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.369e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 00:09:42,101 (trainer:732) INFO: 47epoch:train:21589-23387batch: iter_time=2.146e-04, forward_time=0.202, loss_att=42.733, acc=0.963, loss=42.733, backward_time=0.298, grad_norm=112.989, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.367e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 00:29:50,383 (trainer:732) INFO: 47epoch:train:23388-25186batch: iter_time=2.159e-04, forward_time=0.202, loss_att=42.755, acc=0.963, loss=42.755, backward_time=0.298, grad_norm=107.130, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.364e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 00:50:02,620 (trainer:732) INFO: 47epoch:train:25187-26985batch: iter_time=2.174e-04, forward_time=0.203, loss_att=43.605, acc=0.963, loss=43.605, backward_time=0.299, grad_norm=102.031, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.362e-04, train_time=2.695 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 01:10:12,486 (trainer:732) INFO: 47epoch:train:26986-28784batch: iter_time=2.186e-04, forward_time=0.203, loss_att=43.167, acc=0.962, loss=43.167, backward_time=0.298, grad_norm=107.720, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.360e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 01:30:20,500 (trainer:732) INFO: 47epoch:train:28785-30583batch: iter_time=2.202e-04, forward_time=0.202, loss_att=43.250, acc=0.962, loss=43.250, backward_time=0.297, grad_norm=105.501, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.357e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 01:50:31,437 (trainer:732) INFO: 47epoch:train:30584-32382batch: iter_time=2.132e-04, forward_time=0.203, loss_att=42.974, acc=0.962, loss=42.974, backward_time=0.299, grad_norm=108.892, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.355e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 02:10:37,170 (trainer:732) INFO: 47epoch:train:32383-34181batch: iter_time=2.141e-04, forward_time=0.202, loss_att=43.189, acc=0.962, loss=43.189, backward_time=0.296, grad_norm=103.778, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.353e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 02:30:43,756 (trainer:732) INFO: 47epoch:train:34182-35980batch: iter_time=2.149e-04, forward_time=0.202, loss_att=42.937, acc=0.963, loss=42.937, backward_time=0.297, grad_norm=110.473, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.350e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 02:39:06,408 (trainer:338) INFO: 47epoch results: [train] iter_time=2.543e-04, forward_time=0.202, loss_att=42.811, acc=0.963, loss=42.811, backward_time=0.298, grad_norm=108.141, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.372e-04, train_time=2.715, time=6 hours, 47 minutes and 24.68 seconds, total_count=1691812, gpu_max_cached_mem_GB=30.396, [valid] loss_att=22.855, acc=0.983, cer=0.022, wer=0.075, loss=22.855, time=4 minutes and 28.89 seconds, total_count=612, gpu_max_cached_mem_GB=30.396, [att_plot] time=3 minutes and 36.4 seconds, total_count=0, gpu_max_cached_mem_GB=30.396 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 02:39:10,228 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 02:39:10,257 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/38epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 02:39:10,257 (trainer:272) INFO: 48/60epoch started. Estimated time to finish: 3 days, 18 hours and 6 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 03:03:37,696 (trainer:732) INFO: 48epoch:train:1-1799batch: iter_time=8.828e-04, forward_time=0.203, loss_att=42.532, acc=0.963, loss=42.532, backward_time=0.298, grad_norm=107.317, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.348e-04, train_time=3.263 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 03:23:48,340 (trainer:732) INFO: 48epoch:train:1800-3598batch: iter_time=2.194e-04, forward_time=0.203, loss_att=42.434, acc=0.963, loss=42.434, backward_time=0.298, grad_norm=113.459, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.346e-04, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 03:43:58,325 (trainer:732) INFO: 48epoch:train:3599-5397batch: iter_time=2.153e-04, forward_time=0.202, loss_att=42.394, acc=0.963, loss=42.394, backward_time=0.298, grad_norm=109.088, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.343e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 04:04:06,869 (trainer:732) INFO: 48epoch:train:5398-7196batch: iter_time=2.224e-04, forward_time=0.202, loss_att=42.342, acc=0.963, loss=42.342, backward_time=0.298, grad_norm=106.488, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.341e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 04:24:16,975 (trainer:732) INFO: 48epoch:train:7197-8995batch: iter_time=2.213e-04, forward_time=0.203, loss_att=42.817, acc=0.963, loss=42.817, backward_time=0.298, grad_norm=107.413, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.339e-04, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 04:44:27,015 (trainer:732) INFO: 48epoch:train:8996-10794batch: iter_time=2.238e-04, forward_time=0.203, loss_att=42.402, acc=0.963, loss=42.402, backward_time=0.298, grad_norm=107.266, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.336e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 05:04:34,154 (trainer:732) INFO: 48epoch:train:10795-12593batch: iter_time=2.275e-04, forward_time=0.202, loss_att=42.408, acc=0.963, loss=42.408, backward_time=0.297, grad_norm=105.456, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.334e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 05:24:41,277 (trainer:732) INFO: 48epoch:train:12594-14392batch: iter_time=2.253e-04, forward_time=0.202, loss_att=41.814, acc=0.963, loss=41.814, backward_time=0.297, grad_norm=105.316, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.332e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 05:44:47,737 (trainer:732) INFO: 48epoch:train:14393-16191batch: iter_time=2.199e-04, forward_time=0.202, loss_att=42.770, acc=0.963, loss=42.770, backward_time=0.297, grad_norm=106.881, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.330e-04, train_time=2.683 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<17396> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<17608> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<23704> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<41154> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<41458> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<41464> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<58446> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<33226> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<53887> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<54103> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<62552> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<50299> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<55195> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<55186> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<63497> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<16617> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<49779> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<49879> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<52419> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<57995> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<25119> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<21679> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<25105> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<41703> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<42045> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<21947> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<57002> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<23890> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<62729> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<62721> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<28393> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.152<28751> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 06:04:54,739 (trainer:732) INFO: 48epoch:train:16192-17990batch: iter_time=2.243e-04, forward_time=0.202, loss_att=42.452, acc=0.963, loss=42.452, backward_time=0.297, grad_norm=104.783, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.327e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 06:25:05,702 (trainer:732) INFO: 48epoch:train:17991-19789batch: iter_time=2.197e-04, forward_time=0.203, loss_att=42.257, acc=0.963, loss=42.257, backward_time=0.298, grad_norm=106.082, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.325e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 06:45:15,938 (trainer:732) INFO: 48epoch:train:19790-21588batch: iter_time=2.218e-04, forward_time=0.202, loss_att=42.311, acc=0.963, loss=42.311, backward_time=0.298, grad_norm=111.764, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.323e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 07:05:23,864 (trainer:732) INFO: 48epoch:train:21589-23387batch: iter_time=2.190e-04, forward_time=0.203, loss_att=43.166, acc=0.962, loss=43.166, backward_time=0.298, grad_norm=107.519, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.320e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 07:25:32,257 (trainer:732) INFO: 48epoch:train:23388-25186batch: iter_time=2.161e-04, forward_time=0.202, loss_att=42.747, acc=0.963, loss=42.747, backward_time=0.298, grad_norm=113.260, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.318e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 07:45:42,156 (trainer:732) INFO: 48epoch:train:25187-26985batch: iter_time=2.210e-04, forward_time=0.202, loss_att=43.616, acc=0.962, loss=43.616, backward_time=0.298, grad_norm=104.485, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.316e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 08:05:50,018 (trainer:732) INFO: 48epoch:train:26986-28784batch: iter_time=2.220e-04, forward_time=0.202, loss_att=42.745, acc=0.963, loss=42.745, backward_time=0.297, grad_norm=107.489, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.314e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 08:25:57,005 (trainer:732) INFO: 48epoch:train:28785-30583batch: iter_time=2.232e-04, forward_time=0.202, loss_att=42.617, acc=0.963, loss=42.617, backward_time=0.297, grad_norm=109.196, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.311e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 08:46:05,023 (trainer:732) INFO: 48epoch:train:30584-32382batch: iter_time=2.203e-04, forward_time=0.202, loss_att=42.315, acc=0.963, loss=42.315, backward_time=0.297, grad_norm=103.203, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.309e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 09:06:14,182 (trainer:732) INFO: 48epoch:train:32383-34181batch: iter_time=2.186e-04, forward_time=0.203, loss_att=43.191, acc=0.962, loss=43.191, backward_time=0.298, grad_norm=107.414, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.307e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 09:26:21,500 (trainer:732) INFO: 48epoch:train:34182-35980batch: iter_time=2.145e-04, forward_time=0.202, loss_att=42.642, acc=0.963, loss=42.642, backward_time=0.297, grad_norm=111.315, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.305e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 09:34:47,045 (trainer:338) INFO: 48epoch results: [train] iter_time=2.539e-04, forward_time=0.202, loss_att=42.599, acc=0.963, loss=42.599, backward_time=0.298, grad_norm=107.760, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.326e-04, train_time=2.716, time=6 hours, 47 minutes and 28.78 seconds, total_count=1727808, gpu_max_cached_mem_GB=30.396, [valid] loss_att=23.313, acc=0.983, cer=0.023, wer=0.075, loss=23.313, time=4 minutes and 27.59 seconds, total_count=648, gpu_max_cached_mem_GB=30.396, [att_plot] time=3 minutes and 40.41 seconds, total_count=0, gpu_max_cached_mem_GB=30.396 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 09:34:50,957 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 09:34:51,010 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/44epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 09:34:51,010 (trainer:272) INFO: 49/60epoch started. Estimated time to finish: 3 days, 11 hours and 9 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 09:59:16,837 (trainer:732) INFO: 49epoch:train:1-1799batch: iter_time=6.799e-04, forward_time=0.202, loss_att=41.639, acc=0.964, loss=41.639, backward_time=0.298, grad_norm=108.168, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.302e-04, train_time=3.260 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 10:19:25,871 (trainer:732) INFO: 49epoch:train:1800-3598batch: iter_time=2.276e-04, forward_time=0.203, loss_att=42.744, acc=0.963, loss=42.744, backward_time=0.298, grad_norm=110.725, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.300e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 10:39:35,773 (trainer:732) INFO: 49epoch:train:3599-5397batch: iter_time=2.272e-04, forward_time=0.203, loss_att=42.350, acc=0.963, loss=42.350, backward_time=0.298, grad_norm=109.748, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.298e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 10:59:43,467 (trainer:732) INFO: 49epoch:train:5398-7196batch: iter_time=2.282e-04, forward_time=0.202, loss_att=42.030, acc=0.963, loss=42.030, backward_time=0.297, grad_norm=104.331, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.296e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 11:19:49,997 (trainer:732) INFO: 49epoch:train:7197-8995batch: iter_time=2.276e-04, forward_time=0.202, loss_att=41.850, acc=0.963, loss=41.850, backward_time=0.297, grad_norm=106.259, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.294e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 11:39:59,441 (trainer:732) INFO: 49epoch:train:8996-10794batch: iter_time=2.243e-04, forward_time=0.202, loss_att=42.256, acc=0.963, loss=42.256, backward_time=0.298, grad_norm=110.706, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.291e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 12:00:09,384 (trainer:732) INFO: 49epoch:train:10795-12593batch: iter_time=2.279e-04, forward_time=0.203, loss_att=43.194, acc=0.963, loss=43.194, backward_time=0.298, grad_norm=115.269, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.289e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 12:20:13,149 (trainer:732) INFO: 49epoch:train:12594-14392batch: iter_time=2.243e-04, forward_time=0.202, loss_att=41.829, acc=0.963, loss=41.829, backward_time=0.296, grad_norm=105.280, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.287e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 12:40:25,264 (trainer:732) INFO: 49epoch:train:14393-16191batch: iter_time=2.275e-04, forward_time=0.204, loss_att=42.632, acc=0.963, loss=42.632, backward_time=0.299, grad_norm=114.682, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.285e-04, train_time=2.695 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 13:00:35,851 (trainer:732) INFO: 49epoch:train:16192-17990batch: iter_time=2.273e-04, forward_time=0.203, loss_att=42.341, acc=0.963, loss=42.341, backward_time=0.298, grad_norm=109.049, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.282e-04, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 13:20:46,132 (trainer:732) INFO: 49epoch:train:17991-19789batch: iter_time=2.244e-04, forward_time=0.203, loss_att=42.805, acc=0.963, loss=42.805, backward_time=0.298, grad_norm=113.802, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.280e-04, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 13:40:54,168 (trainer:732) INFO: 49epoch:train:19790-21588batch: iter_time=2.204e-04, forward_time=0.202, loss_att=42.823, acc=0.963, loss=42.823, backward_time=0.297, grad_norm=113.468, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.278e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 14:01:03,079 (trainer:732) INFO: 49epoch:train:21589-23387batch: iter_time=2.245e-04, forward_time=0.203, loss_att=42.605, acc=0.963, loss=42.605, backward_time=0.298, grad_norm=126.120, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.276e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 14:21:10,157 (trainer:732) INFO: 49epoch:train:23388-25186batch: iter_time=2.241e-04, forward_time=0.202, loss_att=42.542, acc=0.963, loss=42.542, backward_time=0.297, grad_norm=108.516, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.274e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 14:41:21,772 (trainer:732) INFO: 49epoch:train:25187-26985batch: iter_time=2.260e-04, forward_time=0.202, loss_att=42.765, acc=0.963, loss=42.765, backward_time=0.298, grad_norm=101.953, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.271e-04, train_time=2.694 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 15:01:27,573 (trainer:732) INFO: 49epoch:train:26986-28784batch: iter_time=2.256e-04, forward_time=0.202, loss_att=41.709, acc=0.963, loss=41.709, backward_time=0.297, grad_norm=101.446, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.269e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 15:21:34,016 (trainer:732) INFO: 49epoch:train:28785-30583batch: iter_time=2.281e-04, forward_time=0.202, loss_att=42.583, acc=0.963, loss=42.583, backward_time=0.297, grad_norm=109.023, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.267e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 15:41:42,049 (trainer:732) INFO: 49epoch:train:30584-32382batch: iter_time=2.238e-04, forward_time=0.203, loss_att=42.128, acc=0.963, loss=42.128, backward_time=0.298, grad_norm=109.385, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.265e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 16:01:47,727 (trainer:732) INFO: 49epoch:train:32383-34181batch: iter_time=2.220e-04, forward_time=0.202, loss_att=42.380, acc=0.963, loss=42.380, backward_time=0.297, grad_norm=107.738, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.263e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 16:21:57,367 (trainer:732) INFO: 49epoch:train:34182-35980batch: iter_time=2.221e-04, forward_time=0.202, loss_att=42.566, acc=0.963, loss=42.566, backward_time=0.298, grad_norm=117.852, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.261e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 16:30:19,531 (trainer:338) INFO: 49epoch results: [train] iter_time=2.481e-04, forward_time=0.202, loss_att=42.389, acc=0.963, loss=42.389, backward_time=0.298, grad_norm=110.164, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.281e-04, train_time=2.715, time=6 hours, 47 minutes and 23.82 seconds, total_count=1763804, gpu_max_cached_mem_GB=30.396, [valid] loss_att=23.863, acc=0.982, cer=0.024, wer=0.077, loss=23.863, time=4 minutes and 26.45 seconds, total_count=684, gpu_max_cached_mem_GB=30.396, [att_plot] time=3 minutes and 38.25 seconds, total_count=0, gpu_max_cached_mem_GB=30.396 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 16:30:23,485 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 16:30:23,497 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/37epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 16:30:23,498 (trainer:272) INFO: 50/60epoch started. Estimated time to finish: 3 days, 4 hours and 13 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 16:54:47,662 (trainer:732) INFO: 50epoch:train:1-1799batch: iter_time=8.955e-04, forward_time=0.203, loss_att=42.016, acc=0.964, loss=42.016, backward_time=0.298, grad_norm=109.895, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=4.258e-04, train_time=3.256 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 17:14:55,500 (trainer:732) INFO: 50epoch:train:1800-3598batch: iter_time=2.425e-04, forward_time=0.202, loss_att=41.827, acc=0.963, loss=41.827, backward_time=0.297, grad_norm=104.076, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.256e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 17:35:07,568 (trainer:732) INFO: 50epoch:train:3599-5397batch: iter_time=2.430e-04, forward_time=0.203, loss_att=42.133, acc=0.963, loss=42.133, backward_time=0.299, grad_norm=105.683, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.254e-04, train_time=2.695 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 17:55:14,911 (trainer:732) INFO: 50epoch:train:5398-7196batch: iter_time=2.433e-04, forward_time=0.202, loss_att=42.235, acc=0.963, loss=42.235, backward_time=0.297, grad_norm=111.807, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.252e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 18:15:17,842 (trainer:732) INFO: 50epoch:train:7197-8995batch: iter_time=2.431e-04, forward_time=0.202, loss_att=41.847, acc=0.963, loss=41.847, backward_time=0.296, grad_norm=116.385, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.250e-04, train_time=2.675 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 18:35:22,008 (trainer:732) INFO: 50epoch:train:8996-10794batch: iter_time=2.407e-04, forward_time=0.202, loss_att=42.415, acc=0.963, loss=42.415, backward_time=0.296, grad_norm=108.690, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.248e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 18:55:32,808 (trainer:732) INFO: 50epoch:train:10795-12593batch: iter_time=2.414e-04, forward_time=0.203, loss_att=41.817, acc=0.964, loss=41.817, backward_time=0.298, grad_norm=110.842, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.245e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 19:15:41,214 (trainer:732) INFO: 50epoch:train:12594-14392batch: iter_time=2.428e-04, forward_time=0.203, loss_att=42.538, acc=0.963, loss=42.538, backward_time=0.298, grad_norm=110.846, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.243e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 19:35:49,694 (trainer:732) INFO: 50epoch:train:14393-16191batch: iter_time=2.389e-04, forward_time=0.203, loss_att=42.138, acc=0.963, loss=42.138, backward_time=0.298, grad_norm=103.115, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.241e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 19:56:01,862 (trainer:732) INFO: 50epoch:train:16192-17990batch: iter_time=2.384e-04, forward_time=0.203, loss_att=42.291, acc=0.964, loss=42.291, backward_time=0.298, grad_norm=103.995, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.239e-04, train_time=2.695 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 20:16:10,964 (trainer:732) INFO: 50epoch:train:17991-19789batch: iter_time=2.378e-04, forward_time=0.203, loss_att=41.924, acc=0.963, loss=41.924, backward_time=0.298, grad_norm=104.157, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.237e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 20:36:18,165 (trainer:732) INFO: 50epoch:train:19790-21588batch: iter_time=2.369e-04, forward_time=0.202, loss_att=41.923, acc=0.963, loss=41.923, backward_time=0.297, grad_norm=110.966, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.235e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 20:56:25,941 (trainer:732) INFO: 50epoch:train:21589-23387batch: iter_time=2.384e-04, forward_time=0.202, loss_att=42.499, acc=0.963, loss=42.499, backward_time=0.298, grad_norm=115.416, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.233e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 21:16:33,700 (trainer:732) INFO: 50epoch:train:23388-25186batch: iter_time=2.440e-04, forward_time=0.202, loss_att=42.398, acc=0.963, loss=42.398, backward_time=0.297, grad_norm=108.271, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.230e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 21:36:41,610 (trainer:732) INFO: 50epoch:train:25187-26985batch: iter_time=2.422e-04, forward_time=0.202, loss_att=42.658, acc=0.963, loss=42.658, backward_time=0.297, grad_norm=114.726, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.228e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 21:56:51,230 (trainer:732) INFO: 50epoch:train:26986-28784batch: iter_time=2.394e-04, forward_time=0.202, loss_att=42.795, acc=0.963, loss=42.795, backward_time=0.298, grad_norm=111.305, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.226e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 22:17:01,452 (trainer:732) INFO: 50epoch:train:28785-30583batch: iter_time=2.445e-04, forward_time=0.203, loss_att=42.035, acc=0.964, loss=42.035, backward_time=0.298, grad_norm=108.069, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.224e-04, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 22:37:10,130 (trainer:732) INFO: 50epoch:train:30584-32382batch: iter_time=2.404e-04, forward_time=0.202, loss_att=41.940, acc=0.963, loss=41.940, backward_time=0.298, grad_norm=107.843, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.222e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 22:57:17,439 (trainer:732) INFO: 50epoch:train:32383-34181batch: iter_time=2.390e-04, forward_time=0.202, loss_att=42.286, acc=0.963, loss=42.286, backward_time=0.297, grad_norm=110.756, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.220e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 23:17:28,626 (trainer:732) INFO: 50epoch:train:34182-35980batch: iter_time=2.427e-04, forward_time=0.203, loss_att=42.751, acc=0.963, loss=42.751, backward_time=0.298, grad_norm=108.936, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.218e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 23:25:53,171 (trainer:338) INFO: 50epoch results: [train] iter_time=2.737e-04, forward_time=0.202, loss_att=42.223, acc=0.963, loss=42.223, backward_time=0.298, grad_norm=109.307, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.238e-04, train_time=2.715, time=6 hours, 47 minutes and 22.17 seconds, total_count=1799800, gpu_max_cached_mem_GB=30.396, [valid] loss_att=22.820, acc=0.983, cer=0.022, wer=0.074, loss=22.820, time=4 minutes and 30.22 seconds, total_count=720, gpu_max_cached_mem_GB=30.396, [att_plot] time=3 minutes and 37.29 seconds, total_count=0, gpu_max_cached_mem_GB=30.396 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 23:25:57,001 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 23:25:57,072 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/41epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 23:25:57,072 (trainer:272) INFO: 51/60epoch started. Estimated time to finish: 2 days, 21 hours and 17 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-17 23:50:25,111 (trainer:732) INFO: 51epoch:train:1-1799batch: iter_time=7.920e-04, forward_time=0.202, loss_att=41.224, acc=0.964, loss=41.224, backward_time=0.297, grad_norm=112.348, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.216e-04, train_time=3.265 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 00:10:34,626 (trainer:732) INFO: 51epoch:train:1800-3598batch: iter_time=2.249e-04, forward_time=0.203, loss_att=42.119, acc=0.964, loss=42.119, backward_time=0.298, grad_norm=108.948, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.213e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 00:30:48,305 (trainer:732) INFO: 51epoch:train:3599-5397batch: iter_time=2.244e-04, forward_time=0.203, loss_att=42.069, acc=0.964, loss=42.069, backward_time=0.299, grad_norm=111.053, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.211e-04, train_time=2.698 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 00:50:59,806 (trainer:732) INFO: 51epoch:train:5398-7196batch: iter_time=2.281e-04, forward_time=0.203, loss_att=41.999, acc=0.964, loss=41.999, backward_time=0.299, grad_norm=110.860, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.209e-04, train_time=2.693 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 01:11:11,663 (trainer:732) INFO: 51epoch:train:7197-8995batch: iter_time=2.244e-04, forward_time=0.203, loss_att=41.571, acc=0.964, loss=41.571, backward_time=0.298, grad_norm=109.318, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.207e-04, train_time=2.694 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 01:31:18,671 (trainer:732) INFO: 51epoch:train:8996-10794batch: iter_time=2.228e-04, forward_time=0.202, loss_att=42.133, acc=0.963, loss=42.133, backward_time=0.297, grad_norm=111.107, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.205e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 01:51:23,974 (trainer:732) INFO: 51epoch:train:10795-12593batch: iter_time=2.259e-04, forward_time=0.202, loss_att=41.651, acc=0.963, loss=41.651, backward_time=0.297, grad_norm=106.823, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.203e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 02:11:28,365 (trainer:732) INFO: 51epoch:train:12594-14392batch: iter_time=2.221e-04, forward_time=0.202, loss_att=42.064, acc=0.963, loss=42.064, backward_time=0.297, grad_norm=105.743, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.201e-04, train_time=2.677 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 02:31:33,224 (trainer:732) INFO: 51epoch:train:14393-16191batch: iter_time=2.195e-04, forward_time=0.202, loss_att=42.200, acc=0.963, loss=42.200, backward_time=0.297, grad_norm=111.019, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.199e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 02:51:43,696 (trainer:732) INFO: 51epoch:train:16192-17990batch: iter_time=2.232e-04, forward_time=0.203, loss_att=41.705, acc=0.964, loss=41.705, backward_time=0.298, grad_norm=108.211, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.197e-04, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 03:11:53,005 (trainer:732) INFO: 51epoch:train:17991-19789batch: iter_time=2.245e-04, forward_time=0.202, loss_att=42.334, acc=0.963, loss=42.334, backward_time=0.298, grad_norm=113.086, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.195e-04, train_time=2.688 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 03:31:59,830 (trainer:732) INFO: 51epoch:train:19790-21588batch: iter_time=2.207e-04, forward_time=0.202, loss_att=41.854, acc=0.963, loss=41.854, backward_time=0.297, grad_norm=107.833, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.193e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 03:52:05,918 (trainer:732) INFO: 51epoch:train:21589-23387batch: iter_time=2.221e-04, forward_time=0.202, loss_att=41.824, acc=0.963, loss=41.824, backward_time=0.297, grad_norm=115.460, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.191e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 04:12:12,625 (trainer:732) INFO: 51epoch:train:23388-25186batch: iter_time=2.195e-04, forward_time=0.202, loss_att=42.611, acc=0.963, loss=42.611, backward_time=0.297, grad_norm=103.699, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.188e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 04:32:21,603 (trainer:732) INFO: 51epoch:train:25187-26985batch: iter_time=2.239e-04, forward_time=0.202, loss_att=42.598, acc=0.963, loss=42.598, backward_time=0.298, grad_norm=111.195, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.186e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 04:52:32,392 (trainer:732) INFO: 51epoch:train:26986-28784batch: iter_time=2.201e-04, forward_time=0.203, loss_att=42.198, acc=0.963, loss=42.198, backward_time=0.298, grad_norm=111.420, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.184e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 05:12:41,775 (trainer:732) INFO: 51epoch:train:28785-30583batch: iter_time=2.216e-04, forward_time=0.202, loss_att=42.570, acc=0.963, loss=42.570, backward_time=0.298, grad_norm=113.139, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.182e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 05:32:51,444 (trainer:732) INFO: 51epoch:train:30584-32382batch: iter_time=2.227e-04, forward_time=0.202, loss_att=42.348, acc=0.963, loss=42.348, backward_time=0.298, grad_norm=106.606, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.180e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 05:52:59,699 (trainer:732) INFO: 51epoch:train:32383-34181batch: iter_time=2.224e-04, forward_time=0.202, loss_att=42.005, acc=0.963, loss=42.005, backward_time=0.297, grad_norm=110.500, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.178e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 06:13:08,446 (trainer:732) INFO: 51epoch:train:34182-35980batch: iter_time=2.232e-04, forward_time=0.202, loss_att=41.581, acc=0.964, loss=41.581, backward_time=0.298, grad_norm=117.577, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.176e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 06:21:33,987 (trainer:338) INFO: 51epoch results: [train] iter_time=2.514e-04, forward_time=0.202, loss_att=42.032, acc=0.963, loss=42.032, backward_time=0.298, grad_norm=110.290, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.196e-04, train_time=2.716, time=6 hours, 47 minutes and 29 seconds, total_count=1835796, gpu_max_cached_mem_GB=30.396, [valid] loss_att=23.502, acc=0.983, cer=0.022, wer=0.075, loss=23.502, time=4 minutes and 28.49 seconds, total_count=756, gpu_max_cached_mem_GB=30.396, [att_plot] time=3 minutes and 39.43 seconds, total_count=0, gpu_max_cached_mem_GB=30.396 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 06:21:37,822 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 06:21:37,834 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/40epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 06:21:37,834 (trainer:272) INFO: 52/60epoch started. Estimated time to finish: 2 days, 14 hours and 21 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 06:46:01,299 (trainer:732) INFO: 52epoch:train:1-1799batch: iter_time=0.001, forward_time=0.202, loss_att=41.521, acc=0.964, loss=41.521, backward_time=0.297, grad_norm=115.842, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.174e-04, train_time=3.255 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 07:06:06,719 (trainer:732) INFO: 52epoch:train:1800-3598batch: iter_time=2.097e-04, forward_time=0.202, loss_att=41.621, acc=0.963, loss=41.621, backward_time=0.297, grad_norm=130.780, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.172e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 07:26:15,488 (trainer:732) INFO: 52epoch:train:3599-5397batch: iter_time=2.093e-04, forward_time=0.202, loss_att=41.628, acc=0.964, loss=41.628, backward_time=0.297, grad_norm=108.531, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.170e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 07:46:26,061 (trainer:732) INFO: 52epoch:train:5398-7196batch: iter_time=2.117e-04, forward_time=0.203, loss_att=41.860, acc=0.964, loss=41.860, backward_time=0.298, grad_norm=106.547, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.168e-04, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 08:06:33,749 (trainer:732) INFO: 52epoch:train:7197-8995batch: iter_time=2.120e-04, forward_time=0.202, loss_att=41.006, acc=0.964, loss=41.006, backward_time=0.297, grad_norm=111.104, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.166e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 08:26:42,394 (trainer:732) INFO: 52epoch:train:8996-10794batch: iter_time=2.117e-04, forward_time=0.202, loss_att=41.722, acc=0.964, loss=41.722, backward_time=0.297, grad_norm=109.501, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.164e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 08:46:49,322 (trainer:732) INFO: 52epoch:train:10795-12593batch: iter_time=2.257e-04, forward_time=0.202, loss_att=41.598, acc=0.964, loss=41.598, backward_time=0.297, grad_norm=116.093, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.162e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 09:06:57,084 (trainer:732) INFO: 52epoch:train:12594-14392batch: iter_time=2.128e-04, forward_time=0.202, loss_att=42.059, acc=0.963, loss=42.059, backward_time=0.297, grad_norm=122.746, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.160e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 09:27:03,789 (trainer:732) INFO: 52epoch:train:14393-16191batch: iter_time=2.123e-04, forward_time=0.202, loss_att=41.878, acc=0.963, loss=41.878, backward_time=0.297, grad_norm=109.245, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.158e-04, train_time=2.683 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 09:47:09,060 (trainer:732) INFO: 52epoch:train:16192-17990batch: iter_time=2.096e-04, forward_time=0.202, loss_att=42.215, acc=0.963, loss=42.215, backward_time=0.297, grad_norm=110.844, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.156e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 10:07:17,668 (trainer:732) INFO: 52epoch:train:17991-19789batch: iter_time=2.113e-04, forward_time=0.202, loss_att=41.904, acc=0.964, loss=41.904, backward_time=0.298, grad_norm=115.027, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.154e-04, train_time=2.687 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 10:27:25,965 (trainer:732) INFO: 52epoch:train:19790-21588batch: iter_time=2.157e-04, forward_time=0.202, loss_att=41.911, acc=0.963, loss=41.911, backward_time=0.297, grad_norm=109.462, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.152e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 10:47:33,968 (trainer:732) INFO: 52epoch:train:21589-23387batch: iter_time=2.094e-04, forward_time=0.202, loss_att=41.637, acc=0.963, loss=41.637, backward_time=0.297, grad_norm=104.473, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.150e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 11:07:43,872 (trainer:732) INFO: 52epoch:train:23388-25186batch: iter_time=2.100e-04, forward_time=0.203, loss_att=41.976, acc=0.964, loss=41.976, backward_time=0.298, grad_norm=109.161, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.148e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 11:27:54,739 (trainer:732) INFO: 52epoch:train:25187-26985batch: iter_time=2.082e-04, forward_time=0.203, loss_att=41.930, acc=0.964, loss=41.930, backward_time=0.298, grad_norm=111.633, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.146e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 11:48:06,516 (trainer:732) INFO: 52epoch:train:26986-28784batch: iter_time=2.120e-04, forward_time=0.203, loss_att=42.532, acc=0.963, loss=42.532, backward_time=0.298, grad_norm=106.303, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.144e-04, train_time=2.694 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 12:08:18,534 (trainer:732) INFO: 52epoch:train:28785-30583batch: iter_time=2.139e-04, forward_time=0.203, loss_att=41.798, acc=0.964, loss=41.798, backward_time=0.299, grad_norm=105.717, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.142e-04, train_time=2.695 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 12:28:27,728 (trainer:732) INFO: 52epoch:train:30584-32382batch: iter_time=2.100e-04, forward_time=0.202, loss_att=42.015, acc=0.964, loss=42.015, backward_time=0.298, grad_norm=106.226, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.140e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 12:48:37,540 (trainer:732) INFO: 52epoch:train:32383-34181batch: iter_time=2.106e-04, forward_time=0.202, loss_att=42.069, acc=0.964, loss=42.069, backward_time=0.298, grad_norm=108.800, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.138e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 13:08:45,176 (trainer:732) INFO: 52epoch:train:34182-35980batch: iter_time=2.073e-04, forward_time=0.202, loss_att=41.983, acc=0.963, loss=41.983, backward_time=0.297, grad_norm=112.650, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.136e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 13:17:11,192 (trainer:338) INFO: 52epoch results: [train] iter_time=2.545e-04, forward_time=0.202, loss_att=41.843, acc=0.964, loss=41.843, backward_time=0.298, grad_norm=111.529, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.155e-04, train_time=2.715, time=6 hours, 47 minutes and 25.24 seconds, total_count=1871792, gpu_max_cached_mem_GB=30.396, [valid] loss_att=22.718, acc=0.983, cer=0.022, wer=0.075, loss=22.718, time=4 minutes and 28.84 seconds, total_count=792, gpu_max_cached_mem_GB=30.396, [att_plot] time=3 minutes and 39.28 seconds, total_count=0, gpu_max_cached_mem_GB=30.396 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 13:17:15,255 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 13:17:15,301 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/43epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 13:17:15,301 (trainer:272) INFO: 53/60epoch started. Estimated time to finish: 2 days, 7 hours and 25 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 13:41:42,448 (trainer:732) INFO: 53epoch:train:1-1799batch: iter_time=9.462e-04, forward_time=0.203, loss_att=40.961, acc=0.965, loss=40.961, backward_time=0.298, grad_norm=113.592, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.134e-04, train_time=3.263 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 14:01:52,032 (trainer:732) INFO: 53epoch:train:1800-3598batch: iter_time=2.208e-04, forward_time=0.203, loss_att=41.590, acc=0.964, loss=41.590, backward_time=0.298, grad_norm=113.648, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.132e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 14:22:03,664 (trainer:732) INFO: 53epoch:train:3599-5397batch: iter_time=2.161e-04, forward_time=0.203, loss_att=41.408, acc=0.964, loss=41.408, backward_time=0.299, grad_norm=112.816, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.130e-04, train_time=2.694 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 14:42:11,293 (trainer:732) INFO: 53epoch:train:5398-7196batch: iter_time=2.248e-04, forward_time=0.202, loss_att=41.585, acc=0.964, loss=41.585, backward_time=0.297, grad_norm=110.218, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.128e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 15:02:16,822 (trainer:732) INFO: 53epoch:train:7197-8995batch: iter_time=2.182e-04, forward_time=0.202, loss_att=41.622, acc=0.964, loss=41.622, backward_time=0.297, grad_norm=116.958, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.126e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 15:22:25,258 (trainer:732) INFO: 53epoch:train:8996-10794batch: iter_time=2.196e-04, forward_time=0.202, loss_att=40.968, acc=0.964, loss=40.968, backward_time=0.298, grad_norm=112.243, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.124e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 15:42:36,559 (trainer:732) INFO: 53epoch:train:10795-12593batch: iter_time=2.251e-04, forward_time=0.203, loss_att=42.085, acc=0.964, loss=42.085, backward_time=0.298, grad_norm=118.432, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.122e-04, train_time=2.693 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 16:02:47,408 (trainer:732) INFO: 53epoch:train:12594-14392batch: iter_time=2.200e-04, forward_time=0.202, loss_att=41.782, acc=0.964, loss=41.782, backward_time=0.298, grad_norm=115.880, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.120e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 16:22:56,609 (trainer:732) INFO: 53epoch:train:14393-16191batch: iter_time=2.190e-04, forward_time=0.202, loss_att=41.964, acc=0.964, loss=41.964, backward_time=0.298, grad_norm=116.601, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=4.118e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 16:43:01,889 (trainer:732) INFO: 53epoch:train:16192-17990batch: iter_time=2.200e-04, forward_time=0.202, loss_att=41.228, acc=0.964, loss=41.228, backward_time=0.297, grad_norm=104.225, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.116e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 17:03:12,707 (trainer:732) INFO: 53epoch:train:17991-19789batch: iter_time=2.218e-04, forward_time=0.203, loss_att=42.059, acc=0.964, loss=42.059, backward_time=0.298, grad_norm=110.154, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.114e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 17:23:21,222 (trainer:732) INFO: 53epoch:train:19790-21588batch: iter_time=2.183e-04, forward_time=0.202, loss_att=42.153, acc=0.964, loss=42.153, backward_time=0.297, grad_norm=106.487, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.112e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 17:43:28,704 (trainer:732) INFO: 53epoch:train:21589-23387batch: iter_time=2.124e-04, forward_time=0.202, loss_att=41.457, acc=0.964, loss=41.457, backward_time=0.297, grad_norm=114.068, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.110e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 18:03:36,641 (trainer:732) INFO: 53epoch:train:23388-25186batch: iter_time=2.185e-04, forward_time=0.202, loss_att=42.202, acc=0.963, loss=42.202, backward_time=0.298, grad_norm=109.167, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.108e-04, train_time=2.686 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<64885> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<15160> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<64005> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<35750> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<31826> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<31852> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<46000> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<64917> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<59663> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<60599> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<35101> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<18932> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<19018> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<59423> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<46865> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<29141> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<38572> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<29138> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<40369> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 144) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<60809> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<51000> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<38708> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<44675> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<21506> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<44667> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<52763> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<53251> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<37156> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<60541> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<60549> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 151) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<37110> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<39514> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 154) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 18:23:43,606 (trainer:732) INFO: 53epoch:train:25187-26985batch: iter_time=2.186e-04, forward_time=0.202, loss_att=41.305, acc=0.964, loss=41.305, backward_time=0.297, grad_norm=109.170, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.106e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 18:43:50,965 (trainer:732) INFO: 53epoch:train:26986-28784batch: iter_time=2.201e-04, forward_time=0.202, loss_att=41.457, acc=0.964, loss=41.457, backward_time=0.298, grad_norm=105.461, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.104e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 19:04:00,589 (trainer:732) INFO: 53epoch:train:28785-30583batch: iter_time=2.174e-04, forward_time=0.203, loss_att=41.516, acc=0.964, loss=41.516, backward_time=0.298, grad_norm=112.422, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.102e-04, train_time=2.690 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 143) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 19:24:06,549 (trainer:732) INFO: 53epoch:train:30584-32382batch: iter_time=2.184e-04, forward_time=0.202, loss_att=42.005, acc=0.963, loss=42.005, backward_time=0.297, grad_norm=109.258, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.100e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 19:44:15,976 (trainer:732) INFO: 53epoch:train:32383-34181batch: iter_time=2.173e-04, forward_time=0.203, loss_att=42.173, acc=0.964, loss=42.173, backward_time=0.298, grad_norm=112.274, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.098e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 20:04:24,659 (trainer:732) INFO: 53epoch:train:34182-35980batch: iter_time=2.130e-04, forward_time=0.202, loss_att=42.240, acc=0.963, loss=42.240, backward_time=0.297, grad_norm=113.889, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.097e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 20:12:48,012 (trainer:338) INFO: 53epoch results: [train] iter_time=2.552e-04, forward_time=0.202, loss_att=41.687, acc=0.964, loss=41.687, backward_time=0.298, grad_norm=111.839, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.115e-04, train_time=2.716, time=6 hours, 47 minutes and 27.1 seconds, total_count=1907788, gpu_max_cached_mem_GB=30.396, [valid] loss_att=23.509, acc=0.982, cer=0.022, wer=0.075, loss=23.509, time=4 minutes and 28.67 seconds, total_count=828, gpu_max_cached_mem_GB=30.396, [att_plot] time=3 minutes and 36.94 seconds, total_count=0, gpu_max_cached_mem_GB=30.396 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 20:12:51,872 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 20:12:51,885 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/42epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 20:12:51,886 (trainer:272) INFO: 54/60epoch started. Estimated time to finish: 2 days, 29 minutes and 46.66 seconds + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 20:37:21,448 (trainer:732) INFO: 54epoch:train:1-1799batch: iter_time=8.324e-04, forward_time=0.203, loss_att=41.178, acc=0.964, loss=41.178, backward_time=0.298, grad_norm=120.743, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.095e-04, train_time=3.268 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 20:57:31,489 (trainer:732) INFO: 54epoch:train:1800-3598batch: iter_time=2.211e-04, forward_time=0.203, loss_att=41.375, acc=0.964, loss=41.375, backward_time=0.298, grad_norm=115.930, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.093e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 21:17:40,162 (trainer:732) INFO: 54epoch:train:3599-5397batch: iter_time=2.168e-04, forward_time=0.202, loss_att=40.681, acc=0.964, loss=40.681, backward_time=0.297, grad_norm=110.653, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.091e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 21:37:49,195 (trainer:732) INFO: 54epoch:train:5398-7196batch: iter_time=2.218e-04, forward_time=0.203, loss_att=41.346, acc=0.964, loss=41.346, backward_time=0.298, grad_norm=113.351, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.089e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 21:57:56,083 (trainer:732) INFO: 54epoch:train:7197-8995batch: iter_time=2.131e-04, forward_time=0.202, loss_att=41.551, acc=0.964, loss=41.551, backward_time=0.297, grad_norm=113.356, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.087e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 22:18:04,663 (trainer:732) INFO: 54epoch:train:8996-10794batch: iter_time=2.170e-04, forward_time=0.202, loss_att=42.161, acc=0.964, loss=42.161, backward_time=0.297, grad_norm=113.435, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.085e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 22:38:12,791 (trainer:732) INFO: 54epoch:train:10795-12593batch: iter_time=2.129e-04, forward_time=0.202, loss_att=41.671, acc=0.964, loss=41.671, backward_time=0.297, grad_norm=119.991, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.083e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 22:58:22,084 (trainer:732) INFO: 54epoch:train:12594-14392batch: iter_time=2.150e-04, forward_time=0.203, loss_att=42.236, acc=0.964, loss=42.236, backward_time=0.298, grad_norm=109.483, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.081e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 23:18:30,203 (trainer:732) INFO: 54epoch:train:14393-16191batch: iter_time=2.127e-04, forward_time=0.202, loss_att=41.698, acc=0.964, loss=41.698, backward_time=0.297, grad_norm=117.887, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.079e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 23:38:38,155 (trainer:732) INFO: 54epoch:train:16192-17990batch: iter_time=2.165e-04, forward_time=0.202, loss_att=41.687, acc=0.963, loss=41.687, backward_time=0.298, grad_norm=116.856, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.077e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-18 23:58:47,000 (trainer:732) INFO: 54epoch:train:17991-19789batch: iter_time=2.184e-04, forward_time=0.202, loss_att=41.862, acc=0.963, loss=41.862, backward_time=0.298, grad_norm=112.626, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.075e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 00:18:51,891 (trainer:732) INFO: 54epoch:train:19790-21588batch: iter_time=2.195e-04, forward_time=0.202, loss_att=41.412, acc=0.964, loss=41.412, backward_time=0.297, grad_norm=110.738, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.073e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 00:38:58,606 (trainer:732) INFO: 54epoch:train:21589-23387batch: iter_time=2.173e-04, forward_time=0.202, loss_att=42.008, acc=0.963, loss=42.008, backward_time=0.297, grad_norm=114.791, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.072e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 00:59:10,188 (trainer:732) INFO: 54epoch:train:23388-25186batch: iter_time=2.152e-04, forward_time=0.203, loss_att=41.043, acc=0.964, loss=41.043, backward_time=0.299, grad_norm=111.295, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.070e-04, train_time=2.694 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 01:19:15,915 (trainer:732) INFO: 54epoch:train:25187-26985batch: iter_time=2.144e-04, forward_time=0.202, loss_att=41.041, acc=0.964, loss=41.041, backward_time=0.297, grad_norm=104.834, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.068e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 01:39:25,957 (trainer:732) INFO: 54epoch:train:26986-28784batch: iter_time=2.158e-04, forward_time=0.202, loss_att=41.835, acc=0.964, loss=41.835, backward_time=0.298, grad_norm=114.041, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.066e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 01:59:35,764 (trainer:732) INFO: 54epoch:train:28785-30583batch: iter_time=2.179e-04, forward_time=0.203, loss_att=41.683, acc=0.964, loss=41.683, backward_time=0.298, grad_norm=113.883, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.064e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 02:19:43,976 (trainer:732) INFO: 54epoch:train:30584-32382batch: iter_time=2.140e-04, forward_time=0.203, loss_att=41.859, acc=0.964, loss=41.859, backward_time=0.298, grad_norm=119.037, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.062e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 02:39:51,871 (trainer:732) INFO: 54epoch:train:32383-34181batch: iter_time=2.125e-04, forward_time=0.202, loss_att=41.831, acc=0.964, loss=41.831, backward_time=0.298, grad_norm=108.685, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.060e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 02:59:59,764 (trainer:732) INFO: 54epoch:train:34182-35980batch: iter_time=2.195e-04, forward_time=0.202, loss_att=41.393, acc=0.964, loss=41.393, backward_time=0.297, grad_norm=109.347, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.058e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 03:08:26,246 (trainer:338) INFO: 54epoch results: [train] iter_time=2.471e-04, forward_time=0.202, loss_att=41.575, acc=0.964, loss=41.575, backward_time=0.298, grad_norm=113.531, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.076e-04, train_time=2.715, time=6 hours, 47 minutes and 25.55 seconds, total_count=1943784, gpu_max_cached_mem_GB=30.396, [valid] loss_att=23.182, acc=0.983, cer=0.023, wer=0.075, loss=23.182, time=4 minutes and 28.61 seconds, total_count=864, gpu_max_cached_mem_GB=30.396, [att_plot] time=3 minutes and 40.19 seconds, total_count=0, gpu_max_cached_mem_GB=30.396 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 03:08:30,241 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 03:08:30,274 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/45epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 03:08:30,274 (trainer:272) INFO: 55/60epoch started. Estimated time to finish: 1 day, 17 hours and 34 minutes + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 03:32:49,776 (trainer:732) INFO: 55epoch:train:1-1799batch: iter_time=8.801e-04, forward_time=0.202, loss_att=40.801, acc=0.964, loss=40.801, backward_time=0.297, grad_norm=113.794, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.056e-04, train_time=3.246 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 03:53:01,181 (trainer:732) INFO: 55epoch:train:1800-3598batch: iter_time=2.246e-04, forward_time=0.203, loss_att=40.243, acc=0.965, loss=40.243, backward_time=0.298, grad_norm=111.734, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.055e-04, train_time=2.693 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 04:13:11,654 (trainer:732) INFO: 55epoch:train:3599-5397batch: iter_time=2.250e-04, forward_time=0.203, loss_att=41.631, acc=0.964, loss=41.631, backward_time=0.298, grad_norm=111.032, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.053e-04, train_time=2.691 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 04:33:17,575 (trainer:732) INFO: 55epoch:train:5398-7196batch: iter_time=2.206e-04, forward_time=0.202, loss_att=40.801, acc=0.964, loss=40.801, backward_time=0.297, grad_norm=110.730, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.051e-04, train_time=2.681 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 04:53:25,599 (trainer:732) INFO: 55epoch:train:7197-8995batch: iter_time=2.248e-04, forward_time=0.202, loss_att=41.313, acc=0.964, loss=41.313, backward_time=0.297, grad_norm=107.649, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.049e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 05:13:38,029 (trainer:732) INFO: 55epoch:train:8996-10794batch: iter_time=2.220e-04, forward_time=0.203, loss_att=41.692, acc=0.964, loss=41.692, backward_time=0.299, grad_norm=109.817, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.047e-04, train_time=2.695 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 05:33:42,395 (trainer:732) INFO: 55epoch:train:10795-12593batch: iter_time=2.220e-04, forward_time=0.202, loss_att=40.778, acc=0.964, loss=40.778, backward_time=0.297, grad_norm=110.989, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.045e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 05:53:50,776 (trainer:732) INFO: 55epoch:train:12594-14392batch: iter_time=2.204e-04, forward_time=0.202, loss_att=40.733, acc=0.965, loss=40.733, backward_time=0.297, grad_norm=112.283, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.043e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 06:13:56,741 (trainer:732) INFO: 55epoch:train:14393-16191batch: iter_time=2.198e-04, forward_time=0.202, loss_att=41.555, acc=0.964, loss=41.555, backward_time=0.297, grad_norm=113.962, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.042e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 06:34:06,497 (trainer:732) INFO: 55epoch:train:16192-17990batch: iter_time=2.232e-04, forward_time=0.203, loss_att=41.741, acc=0.964, loss=41.741, backward_time=0.298, grad_norm=112.908, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.040e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 06:54:13,449 (trainer:732) INFO: 55epoch:train:17991-19789batch: iter_time=2.234e-04, forward_time=0.202, loss_att=41.604, acc=0.964, loss=41.604, backward_time=0.297, grad_norm=107.688, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.038e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 07:14:23,304 (trainer:732) INFO: 55epoch:train:19790-21588batch: iter_time=2.219e-04, forward_time=0.202, loss_att=42.299, acc=0.964, loss=42.299, backward_time=0.298, grad_norm=109.354, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.036e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 07:34:31,464 (trainer:732) INFO: 55epoch:train:21589-23387batch: iter_time=2.170e-04, forward_time=0.202, loss_att=41.438, acc=0.964, loss=41.438, backward_time=0.298, grad_norm=112.342, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.034e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 07:54:40,974 (trainer:732) INFO: 55epoch:train:23388-25186batch: iter_time=2.224e-04, forward_time=0.203, loss_att=41.729, acc=0.964, loss=41.729, backward_time=0.298, grad_norm=112.233, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.032e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 08:14:48,804 (trainer:732) INFO: 55epoch:train:25187-26985batch: iter_time=2.216e-04, forward_time=0.203, loss_att=41.066, acc=0.964, loss=41.066, backward_time=0.298, grad_norm=115.589, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.030e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 08:34:57,767 (trainer:732) INFO: 55epoch:train:26986-28784batch: iter_time=2.230e-04, forward_time=0.202, loss_att=41.681, acc=0.964, loss=41.681, backward_time=0.298, grad_norm=120.271, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.029e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 08:55:04,458 (trainer:732) INFO: 55epoch:train:28785-30583batch: iter_time=2.230e-04, forward_time=0.202, loss_att=41.786, acc=0.964, loss=41.786, backward_time=0.297, grad_norm=114.738, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.027e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 09:15:15,729 (trainer:732) INFO: 55epoch:train:30584-32382batch: iter_time=2.192e-04, forward_time=0.203, loss_att=41.195, acc=0.964, loss=41.195, backward_time=0.298, grad_norm=114.563, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.025e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 09:35:24,560 (trainer:732) INFO: 55epoch:train:32383-34181batch: iter_time=2.215e-04, forward_time=0.203, loss_att=41.565, acc=0.964, loss=41.565, backward_time=0.298, grad_norm=112.366, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.023e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 09:55:32,724 (trainer:732) INFO: 55epoch:train:34182-35980batch: iter_time=2.193e-04, forward_time=0.202, loss_att=41.966, acc=0.963, loss=41.966, backward_time=0.297, grad_norm=121.406, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.021e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 10:03:56,433 (trainer:338) INFO: 55epoch results: [train] iter_time=2.547e-04, forward_time=0.202, loss_att=41.381, acc=0.964, loss=41.381, backward_time=0.298, grad_norm=112.778, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.039e-04, train_time=2.715, time=6 hours, 47 minutes and 19.83 seconds, total_count=1979780, gpu_max_cached_mem_GB=30.396, [valid] loss_att=22.328, acc=0.983, cer=0.022, wer=0.074, loss=22.328, time=4 minutes and 31.25 seconds, total_count=900, gpu_max_cached_mem_GB=30.396, [att_plot] time=3 minutes and 35.08 seconds, total_count=0, gpu_max_cached_mem_GB=30.396 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 10:04:00,483 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 10:04:00,515 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/39epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 10:04:00,516 (trainer:272) INFO: 56/60epoch started. Estimated time to finish: 1 day, 10 hours and 38 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 10:28:29,122 (trainer:732) INFO: 56epoch:train:1-1799batch: iter_time=7.232e-04, forward_time=0.203, loss_att=40.635, acc=0.965, loss=40.635, backward_time=0.298, grad_norm=118.401, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.019e-04, train_time=3.266 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 10:48:35,991 (trainer:732) INFO: 56epoch:train:1800-3598batch: iter_time=2.339e-04, forward_time=0.202, loss_att=40.712, acc=0.964, loss=40.712, backward_time=0.297, grad_norm=116.907, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.018e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 11:08:43,759 (trainer:732) INFO: 56epoch:train:3599-5397batch: iter_time=2.329e-04, forward_time=0.202, loss_att=40.590, acc=0.965, loss=40.590, backward_time=0.297, grad_norm=115.368, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.016e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 11:28:49,526 (trainer:732) INFO: 56epoch:train:5398-7196batch: iter_time=2.303e-04, forward_time=0.202, loss_att=40.805, acc=0.964, loss=40.805, backward_time=0.297, grad_norm=113.360, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.014e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 11:48:58,386 (trainer:732) INFO: 56epoch:train:7197-8995batch: iter_time=2.249e-04, forward_time=0.203, loss_att=41.308, acc=0.964, loss=41.308, backward_time=0.298, grad_norm=115.754, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.012e-04, train_time=2.688 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 12:09:06,811 (trainer:732) INFO: 56epoch:train:8996-10794batch: iter_time=2.253e-04, forward_time=0.202, loss_att=41.338, acc=0.964, loss=41.338, backward_time=0.298, grad_norm=116.820, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.010e-04, train_time=2.687 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 12:29:15,691 (trainer:732) INFO: 56epoch:train:10795-12593batch: iter_time=2.273e-04, forward_time=0.202, loss_att=41.114, acc=0.964, loss=41.114, backward_time=0.298, grad_norm=123.563, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.009e-04, train_time=2.687 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 12:49:24,011 (trainer:732) INFO: 56epoch:train:12594-14392batch: iter_time=2.270e-04, forward_time=0.202, loss_att=40.801, acc=0.964, loss=40.801, backward_time=0.297, grad_norm=115.774, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.007e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 13:09:33,205 (trainer:732) INFO: 56epoch:train:14393-16191batch: iter_time=2.283e-04, forward_time=0.203, loss_att=41.382, acc=0.964, loss=41.382, backward_time=0.298, grad_norm=117.644, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.005e-04, train_time=2.689 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 13:29:41,461 (trainer:732) INFO: 56epoch:train:16192-17990batch: iter_time=2.289e-04, forward_time=0.202, loss_att=40.858, acc=0.964, loss=40.858, backward_time=0.297, grad_norm=109.281, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.003e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 13:49:49,314 (trainer:732) INFO: 56epoch:train:17991-19789batch: iter_time=2.299e-04, forward_time=0.202, loss_att=41.385, acc=0.964, loss=41.385, backward_time=0.298, grad_norm=121.218, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.001e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 14:10:00,472 (trainer:732) INFO: 56epoch:train:19790-21588batch: iter_time=2.267e-04, forward_time=0.203, loss_att=42.116, acc=0.964, loss=42.116, backward_time=0.298, grad_norm=122.381, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.000e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 14:30:10,402 (trainer:732) INFO: 56epoch:train:21589-23387batch: iter_time=2.302e-04, forward_time=0.202, loss_att=41.459, acc=0.964, loss=41.459, backward_time=0.298, grad_norm=116.432, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.998e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 14:50:17,644 (trainer:732) INFO: 56epoch:train:23388-25186batch: iter_time=2.276e-04, forward_time=0.202, loss_att=41.981, acc=0.964, loss=41.981, backward_time=0.297, grad_norm=114.424, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.996e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 15:10:27,289 (trainer:732) INFO: 56epoch:train:25187-26985batch: iter_time=2.284e-04, forward_time=0.203, loss_att=41.812, acc=0.964, loss=41.812, backward_time=0.298, grad_norm=112.539, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.994e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 15:30:37,431 (trainer:732) INFO: 56epoch:train:26986-28784batch: iter_time=2.267e-04, forward_time=0.202, loss_att=41.627, acc=0.964, loss=41.627, backward_time=0.298, grad_norm=117.202, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.992e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 15:50:44,296 (trainer:732) INFO: 56epoch:train:28785-30583batch: iter_time=2.262e-04, forward_time=0.202, loss_att=41.914, acc=0.964, loss=41.914, backward_time=0.297, grad_norm=118.111, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.991e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 16:10:53,927 (trainer:732) INFO: 56epoch:train:30584-32382batch: iter_time=2.272e-04, forward_time=0.203, loss_att=41.414, acc=0.964, loss=41.414, backward_time=0.298, grad_norm=119.047, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.989e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 16:31:02,817 (trainer:732) INFO: 56epoch:train:32383-34181batch: iter_time=2.278e-04, forward_time=0.202, loss_att=41.167, acc=0.964, loss=41.167, backward_time=0.298, grad_norm=116.605, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.987e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 16:51:10,691 (trainer:732) INFO: 56epoch:train:34182-35980batch: iter_time=2.231e-04, forward_time=0.202, loss_att=41.634, acc=0.963, loss=41.634, backward_time=0.297, grad_norm=112.759, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.985e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 16:59:37,839 (trainer:338) INFO: 56epoch results: [train] iter_time=2.527e-04, forward_time=0.202, loss_att=41.303, acc=0.964, loss=41.303, backward_time=0.298, grad_norm=116.678, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.002e-04, train_time=2.716, time=6 hours, 47 minutes and 28.04 seconds, total_count=2015776, gpu_max_cached_mem_GB=30.396, [valid] loss_att=24.052, acc=0.982, cer=0.023, wer=0.076, loss=24.052, time=4 minutes and 31.39 seconds, total_count=936, gpu_max_cached_mem_GB=30.396, [att_plot] time=3 minutes and 37.89 seconds, total_count=0, gpu_max_cached_mem_GB=30.396 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 16:59:41,414 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 16:59:41,442 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/49epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 16:59:41,442 (trainer:272) INFO: 57/60epoch started. Estimated time to finish: 1 day, 3 hours and 42 minutes +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 17:24:09,520 (trainer:732) INFO: 57epoch:train:1-1799batch: iter_time=0.001, forward_time=0.202, loss_att=40.800, acc=0.964, loss=40.800, backward_time=0.297, grad_norm=110.105, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.983e-04, train_time=3.265 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 17:44:19,595 (trainer:732) INFO: 57epoch:train:1800-3598batch: iter_time=2.157e-04, forward_time=0.203, loss_att=40.487, acc=0.965, loss=40.487, backward_time=0.298, grad_norm=108.231, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.982e-04, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 18:04:26,694 (trainer:732) INFO: 57epoch:train:3599-5397batch: iter_time=2.162e-04, forward_time=0.202, loss_att=40.972, acc=0.964, loss=40.972, backward_time=0.297, grad_norm=110.739, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.980e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 18:24:37,588 (trainer:732) INFO: 57epoch:train:5398-7196batch: iter_time=2.196e-04, forward_time=0.203, loss_att=40.988, acc=0.965, loss=40.988, backward_time=0.298, grad_norm=117.450, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.978e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 18:44:45,426 (trainer:732) INFO: 57epoch:train:7197-8995batch: iter_time=2.154e-04, forward_time=0.202, loss_att=40.447, acc=0.964, loss=40.447, backward_time=0.298, grad_norm=116.715, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.976e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 19:04:54,404 (trainer:732) INFO: 57epoch:train:8996-10794batch: iter_time=2.172e-04, forward_time=0.202, loss_att=39.918, acc=0.965, loss=39.918, backward_time=0.297, grad_norm=107.914, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.975e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 19:25:00,946 (trainer:732) INFO: 57epoch:train:10795-12593batch: iter_time=2.158e-04, forward_time=0.202, loss_att=41.155, acc=0.964, loss=41.155, backward_time=0.297, grad_norm=110.248, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.973e-04, train_time=2.682 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 19:45:09,511 (trainer:732) INFO: 57epoch:train:12594-14392batch: iter_time=2.132e-04, forward_time=0.202, loss_att=41.146, acc=0.964, loss=41.146, backward_time=0.297, grad_norm=106.206, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.971e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 20:05:20,152 (trainer:732) INFO: 57epoch:train:14393-16191batch: iter_time=2.143e-04, forward_time=0.203, loss_att=41.016, acc=0.965, loss=41.016, backward_time=0.298, grad_norm=114.005, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.969e-04, train_time=2.692 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 20:25:26,713 (trainer:732) INFO: 57epoch:train:16192-17990batch: iter_time=2.146e-04, forward_time=0.202, loss_att=41.030, acc=0.964, loss=41.030, backward_time=0.297, grad_norm=111.755, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.968e-04, train_time=2.682 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 20:45:36,043 (trainer:732) INFO: 57epoch:train:17991-19789batch: iter_time=2.137e-04, forward_time=0.203, loss_att=41.603, acc=0.964, loss=41.603, backward_time=0.298, grad_norm=111.808, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.966e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 21:05:43,612 (trainer:732) INFO: 57epoch:train:19790-21588batch: iter_time=2.175e-04, forward_time=0.202, loss_att=41.018, acc=0.964, loss=41.018, backward_time=0.297, grad_norm=108.987, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.964e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 21:25:50,869 (trainer:732) INFO: 57epoch:train:21589-23387batch: iter_time=2.173e-04, forward_time=0.202, loss_att=41.195, acc=0.964, loss=41.195, backward_time=0.297, grad_norm=109.199, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.962e-04, train_time=2.684 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 21:46:03,348 (trainer:732) INFO: 57epoch:train:23388-25186batch: iter_time=2.162e-04, forward_time=0.203, loss_att=41.811, acc=0.964, loss=41.811, backward_time=0.299, grad_norm=118.762, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.961e-04, train_time=2.696 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 22:06:13,624 (trainer:732) INFO: 57epoch:train:25187-26985batch: iter_time=2.140e-04, forward_time=0.203, loss_att=41.472, acc=0.964, loss=41.472, backward_time=0.298, grad_norm=114.463, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.959e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 22:26:21,396 (trainer:732) INFO: 57epoch:train:26986-28784batch: iter_time=2.186e-04, forward_time=0.202, loss_att=41.856, acc=0.964, loss=41.856, backward_time=0.297, grad_norm=110.995, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.957e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 22:46:31,748 (trainer:732) INFO: 57epoch:train:28785-30583batch: iter_time=2.169e-04, forward_time=0.203, loss_att=41.197, acc=0.964, loss=41.197, backward_time=0.298, grad_norm=110.612, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.955e-04, train_time=2.691 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 23:06:38,309 (trainer:732) INFO: 57epoch:train:30584-32382batch: iter_time=2.125e-04, forward_time=0.202, loss_att=40.741, acc=0.964, loss=40.741, backward_time=0.297, grad_norm=109.398, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.954e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 23:26:45,796 (trainer:732) INFO: 57epoch:train:32383-34181batch: iter_time=2.121e-04, forward_time=0.202, loss_att=41.723, acc=0.964, loss=41.723, backward_time=0.297, grad_norm=109.644, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.952e-04, train_time=2.684 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 23:46:53,411 (trainer:732) INFO: 57epoch:train:34182-35980batch: iter_time=2.133e-04, forward_time=0.202, loss_att=41.091, acc=0.964, loss=41.091, backward_time=0.297, grad_norm=120.095, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.950e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 23:55:25,402 (trainer:338) INFO: 57epoch results: [train] iter_time=2.557e-04, forward_time=0.202, loss_att=41.079, acc=0.964, loss=41.079, backward_time=0.298, grad_norm=111.861, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.967e-04, train_time=2.716, time=6 hours, 47 minutes and 29.9 seconds, total_count=2051772, gpu_max_cached_mem_GB=30.396, [valid] loss_att=23.230, acc=0.983, cer=0.022, wer=0.074, loss=23.230, time=4 minutes and 31.99 seconds, total_count=972, gpu_max_cached_mem_GB=30.396, [att_plot] time=3 minutes and 42.07 seconds, total_count=0, gpu_max_cached_mem_GB=30.396 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 23:55:29,043 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 23:55:29,072 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/56epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-19 23:55:29,072 (trainer:272) INFO: 58/60epoch started. Estimated time to finish: 20 hours, 47 minutes and 1.2 seconds +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 00:19:51,085 (trainer:732) INFO: 58epoch:train:1-1799batch: iter_time=0.001, forward_time=0.202, loss_att=40.090, acc=0.964, loss=40.090, backward_time=0.296, grad_norm=116.488, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.948e-04, train_time=3.252 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 00:40:02,008 (trainer:732) INFO: 58epoch:train:1800-3598batch: iter_time=2.166e-04, forward_time=0.203, loss_att=40.568, acc=0.965, loss=40.568, backward_time=0.299, grad_norm=128.112, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.947e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 01:00:07,849 (trainer:732) INFO: 58epoch:train:3599-5397batch: iter_time=2.140e-04, forward_time=0.202, loss_att=40.420, acc=0.964, loss=40.420, backward_time=0.297, grad_norm=109.662, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.945e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 01:20:16,730 (trainer:732) INFO: 58epoch:train:5398-7196batch: iter_time=2.145e-04, forward_time=0.203, loss_att=40.803, acc=0.964, loss=40.803, backward_time=0.298, grad_norm=112.904, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.943e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 01:40:29,917 (trainer:732) INFO: 58epoch:train:7197-8995batch: iter_time=2.150e-04, forward_time=0.203, loss_att=41.325, acc=0.965, loss=41.325, backward_time=0.299, grad_norm=115.445, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.941e-04, train_time=2.698 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 02:00:34,824 (trainer:732) INFO: 58epoch:train:8996-10794batch: iter_time=2.092e-04, forward_time=0.202, loss_att=40.916, acc=0.964, loss=40.916, backward_time=0.297, grad_norm=114.654, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.940e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 02:20:46,140 (trainer:732) INFO: 58epoch:train:10795-12593batch: iter_time=2.144e-04, forward_time=0.203, loss_att=41.232, acc=0.964, loss=41.232, backward_time=0.298, grad_norm=110.051, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.938e-04, train_time=2.693 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 02:40:57,002 (trainer:732) INFO: 58epoch:train:12594-14392batch: iter_time=2.178e-04, forward_time=0.203, loss_att=41.852, acc=0.964, loss=41.852, backward_time=0.298, grad_norm=120.342, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.936e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 03:01:07,014 (trainer:732) INFO: 58epoch:train:14393-16191batch: iter_time=2.142e-04, forward_time=0.203, loss_att=41.214, acc=0.964, loss=41.214, backward_time=0.298, grad_norm=114.627, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.935e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 03:21:15,301 (trainer:732) INFO: 58epoch:train:16192-17990batch: iter_time=2.115e-04, forward_time=0.202, loss_att=40.022, acc=0.965, loss=40.022, backward_time=0.297, grad_norm=110.056, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.933e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 03:41:19,798 (trainer:732) INFO: 58epoch:train:17991-19789batch: iter_time=2.147e-04, forward_time=0.202, loss_att=40.195, acc=0.965, loss=40.195, backward_time=0.297, grad_norm=113.653, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.931e-04, train_time=2.678 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 04:01:27,548 (trainer:732) INFO: 58epoch:train:19790-21588batch: iter_time=2.161e-04, forward_time=0.202, loss_att=41.131, acc=0.964, loss=41.131, backward_time=0.297, grad_norm=108.677, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.929e-04, train_time=2.685 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 04:21:36,851 (trainer:732) INFO: 58epoch:train:21589-23387batch: iter_time=2.165e-04, forward_time=0.202, loss_att=40.754, acc=0.964, loss=40.754, backward_time=0.298, grad_norm=113.977, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.928e-04, train_time=2.689 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 04:41:44,801 (trainer:732) INFO: 58epoch:train:23388-25186batch: iter_time=2.128e-04, forward_time=0.202, loss_att=40.961, acc=0.964, loss=40.961, backward_time=0.297, grad_norm=112.455, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.926e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 05:01:55,617 (trainer:732) INFO: 58epoch:train:25187-26985batch: iter_time=2.129e-04, forward_time=0.203, loss_att=40.963, acc=0.965, loss=40.963, backward_time=0.298, grad_norm=109.741, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.924e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 05:22:04,600 (trainer:732) INFO: 58epoch:train:26986-28784batch: iter_time=2.122e-04, forward_time=0.202, loss_att=41.647, acc=0.964, loss=41.647, backward_time=0.298, grad_norm=116.069, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.923e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 05:42:12,696 (trainer:732) INFO: 58epoch:train:28785-30583batch: iter_time=2.116e-04, forward_time=0.202, loss_att=41.488, acc=0.964, loss=41.488, backward_time=0.297, grad_norm=122.885, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.921e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 06:02:22,219 (trainer:732) INFO: 58epoch:train:30584-32382batch: iter_time=2.129e-04, forward_time=0.203, loss_att=41.107, acc=0.965, loss=41.107, backward_time=0.298, grad_norm=112.636, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.919e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 06:22:31,884 (trainer:732) INFO: 58epoch:train:32383-34181batch: iter_time=2.152e-04, forward_time=0.202, loss_att=41.252, acc=0.964, loss=41.252, backward_time=0.298, grad_norm=112.817, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.918e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 06:42:39,887 (trainer:732) INFO: 58epoch:train:34182-35980batch: iter_time=2.144e-04, forward_time=0.202, loss_att=41.164, acc=0.964, loss=41.164, backward_time=0.298, grad_norm=111.339, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.916e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 06:51:07,607 (trainer:338) INFO: 58epoch results: [train] iter_time=2.554e-04, forward_time=0.202, loss_att=40.955, acc=0.964, loss=40.955, backward_time=0.298, grad_norm=114.338, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.932e-04, train_time=2.716, time=6 hours, 47 minutes and 28.28 seconds, total_count=2087768, gpu_max_cached_mem_GB=30.396, [valid] loss_att=23.227, acc=0.983, cer=0.022, wer=0.075, loss=23.227, time=4 minutes and 31.97 seconds, total_count=1008, gpu_max_cached_mem_GB=30.396, [att_plot] time=3 minutes and 38.28 seconds, total_count=0, gpu_max_cached_mem_GB=30.396 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 06:51:11,348 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 06:51:11,381 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/46epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 06:51:11,381 (trainer:272) INFO: 59/60epoch started. Estimated time to finish: 13 hours, 51 minutes and 21.09 seconds +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 07:15:36,958 (trainer:732) INFO: 59epoch:train:1-1799batch: iter_time=8.549e-04, forward_time=0.203, loss_att=40.702, acc=0.965, loss=40.702, backward_time=0.298, grad_norm=111.533, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.914e-04, train_time=3.259 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 07:35:45,562 (trainer:732) INFO: 59epoch:train:1800-3598batch: iter_time=2.293e-04, forward_time=0.203, loss_att=40.343, acc=0.965, loss=40.343, backward_time=0.298, grad_norm=107.811, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.912e-04, train_time=2.687 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 07:55:50,698 (trainer:732) INFO: 59epoch:train:3599-5397batch: iter_time=2.337e-04, forward_time=0.202, loss_att=40.375, acc=0.964, loss=40.375, backward_time=0.297, grad_norm=113.486, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.911e-04, train_time=2.679 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 08:16:01,857 (trainer:732) INFO: 59epoch:train:5398-7196batch: iter_time=2.264e-04, forward_time=0.203, loss_att=40.633, acc=0.965, loss=40.633, backward_time=0.298, grad_norm=116.218, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.909e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 08:36:10,957 (trainer:732) INFO: 59epoch:train:7197-8995batch: iter_time=2.227e-04, forward_time=0.203, loss_att=40.752, acc=0.965, loss=40.752, backward_time=0.298, grad_norm=112.134, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.907e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 08:56:16,785 (trainer:732) INFO: 59epoch:train:8996-10794batch: iter_time=2.253e-04, forward_time=0.202, loss_att=40.432, acc=0.965, loss=40.432, backward_time=0.297, grad_norm=113.487, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.906e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 09:16:26,337 (trainer:732) INFO: 59epoch:train:10795-12593batch: iter_time=2.257e-04, forward_time=0.202, loss_att=40.443, acc=0.965, loss=40.443, backward_time=0.298, grad_norm=112.547, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.904e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 09:36:29,935 (trainer:732) INFO: 59epoch:train:12594-14392batch: iter_time=2.249e-04, forward_time=0.201, loss_att=40.958, acc=0.964, loss=40.958, backward_time=0.296, grad_norm=120.232, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.902e-04, train_time=2.676 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 09:56:39,109 (trainer:732) INFO: 59epoch:train:14393-16191batch: iter_time=2.205e-04, forward_time=0.202, loss_att=39.895, acc=0.965, loss=39.895, backward_time=0.297, grad_norm=114.280, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.901e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 10:16:44,441 (trainer:732) INFO: 59epoch:train:16192-17990batch: iter_time=2.187e-04, forward_time=0.202, loss_att=40.410, acc=0.964, loss=40.410, backward_time=0.297, grad_norm=106.211, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.899e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 10:36:54,074 (trainer:732) INFO: 59epoch:train:17991-19789batch: iter_time=2.221e-04, forward_time=0.203, loss_att=41.262, acc=0.964, loss=41.262, backward_time=0.298, grad_norm=114.816, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.897e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 10:57:05,052 (trainer:732) INFO: 59epoch:train:19790-21588batch: iter_time=2.258e-04, forward_time=0.203, loss_att=41.011, acc=0.964, loss=41.011, backward_time=0.299, grad_norm=109.789, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.896e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 11:17:13,055 (trainer:732) INFO: 59epoch:train:21589-23387batch: iter_time=2.274e-04, forward_time=0.202, loss_att=41.138, acc=0.964, loss=41.138, backward_time=0.297, grad_norm=106.161, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.894e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 11:37:24,837 (trainer:732) INFO: 59epoch:train:23388-25186batch: iter_time=2.261e-04, forward_time=0.203, loss_att=41.143, acc=0.964, loss=41.143, backward_time=0.299, grad_norm=110.813, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.892e-04, train_time=2.694 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 11:57:31,462 (trainer:732) INFO: 59epoch:train:25187-26985batch: iter_time=2.259e-04, forward_time=0.202, loss_att=40.587, acc=0.965, loss=40.587, backward_time=0.297, grad_norm=112.311, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.891e-04, train_time=2.683 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<29823> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<30075> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<17188> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<53282> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<25527> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<25545> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<16514> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<32679> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<49295> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<49483> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<19734> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<39833> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<29991> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<30057> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<15365> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<16607> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 140) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<51538> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<53254> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<33473> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<49326> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<59417> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<59465> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<29616> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<31302> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 7, fd 145) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 8, fd 147) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<29915> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<30310> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<62820> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<23607> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<64141> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<64101> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<44840> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.151<45358> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 155) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 143) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 143) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 12:17:42,336 (trainer:732) INFO: 59epoch:train:26986-28784batch: iter_time=2.207e-04, forward_time=0.203, loss_att=41.129, acc=0.965, loss=41.129, backward_time=0.298, grad_norm=114.532, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.889e-04, train_time=2.692 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 12:37:51,040 (trainer:732) INFO: 59epoch:train:28785-30583batch: iter_time=2.288e-04, forward_time=0.202, loss_att=40.706, acc=0.964, loss=40.706, backward_time=0.298, grad_norm=115.192, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.887e-04, train_time=2.687 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 141) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 12:58:01,746 (trainer:732) INFO: 59epoch:train:30584-32382batch: iter_time=2.289e-04, forward_time=0.203, loss_att=41.787, acc=0.964, loss=41.787, backward_time=0.298, grad_norm=117.739, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.886e-04, train_time=2.692 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 13:18:08,066 (trainer:732) INFO: 59epoch:train:32383-34181batch: iter_time=2.271e-04, forward_time=0.202, loss_att=40.750, acc=0.964, loss=40.750, backward_time=0.297, grad_norm=113.521, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.884e-04, train_time=2.682 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 2, fd 152) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 13:38:15,157 (trainer:732) INFO: 59epoch:train:34182-35980batch: iter_time=2.268e-04, forward_time=0.202, loss_att=41.505, acc=0.964, loss=41.505, backward_time=0.297, grad_norm=112.369, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.883e-04, train_time=2.683 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 89) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520217:3521148 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 5, fd 89) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 101) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520218:3521147 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 6, fd 101) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 13:46:42,471 (trainer:338) INFO: 59epoch results: [train] iter_time=2.570e-04, forward_time=0.202, loss_att=40.795, acc=0.964, loss=40.795, backward_time=0.298, grad_norm=112.741, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.898e-04, train_time=2.715, time=6 hours, 47 minutes and 21.55 seconds, total_count=2123764, gpu_max_cached_mem_GB=30.396, [valid] loss_att=22.727, acc=0.983, cer=0.022, wer=0.074, loss=22.727, time=4 minutes and 32.53 seconds, total_count=1044, gpu_max_cached_mem_GB=30.396, [att_plot] time=3 minutes and 37 seconds, total_count=0, gpu_max_cached_mem_GB=30.396 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 13:46:46,364 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 13:46:46,395 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/53epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 13:46:46,395 (trainer:272) INFO: 60/60epoch started. Estimated time to finish: 6 hours, 55 minutes and 40.15 seconds +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 14:11:14,130 (trainer:732) INFO: 60epoch:train:1-1799batch: iter_time=9.255e-04, forward_time=0.203, loss_att=39.879, acc=0.965, loss=39.879, backward_time=0.298, grad_norm=118.800, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.881e-04, train_time=3.264 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 14:31:19,581 (trainer:732) INFO: 60epoch:train:1800-3598batch: iter_time=2.327e-04, forward_time=0.202, loss_att=40.359, acc=0.965, loss=40.359, backward_time=0.297, grad_norm=114.451, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.879e-04, train_time=2.680 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 14:51:29,660 (trainer:732) INFO: 60epoch:train:3599-5397batch: iter_time=2.266e-04, forward_time=0.203, loss_att=40.470, acc=0.965, loss=40.470, backward_time=0.298, grad_norm=114.886, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.878e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 15:11:39,922 (trainer:732) INFO: 60epoch:train:5398-7196batch: iter_time=2.295e-04, forward_time=0.203, loss_att=40.673, acc=0.965, loss=40.673, backward_time=0.298, grad_norm=115.877, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.876e-04, train_time=2.690 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 15:31:48,698 (trainer:732) INFO: 60epoch:train:7197-8995batch: iter_time=2.277e-04, forward_time=0.203, loss_att=41.006, acc=0.964, loss=41.006, backward_time=0.298, grad_norm=117.108, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.874e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 15:51:54,510 (trainer:732) INFO: 60epoch:train:8996-10794batch: iter_time=2.238e-04, forward_time=0.202, loss_att=40.177, acc=0.965, loss=40.177, backward_time=0.297, grad_norm=113.267, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.873e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 16:12:00,205 (trainer:732) INFO: 60epoch:train:10795-12593batch: iter_time=2.235e-04, forward_time=0.202, loss_att=40.479, acc=0.964, loss=40.479, backward_time=0.297, grad_norm=105.547, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.871e-04, train_time=2.681 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 16:32:11,536 (trainer:732) INFO: 60epoch:train:12594-14392batch: iter_time=2.256e-04, forward_time=0.203, loss_att=40.693, acc=0.965, loss=40.693, backward_time=0.298, grad_norm=112.192, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.869e-04, train_time=2.693 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 16:52:20,791 (trainer:732) INFO: 60epoch:train:14393-16191batch: iter_time=2.227e-04, forward_time=0.202, loss_att=40.558, acc=0.965, loss=40.558, backward_time=0.298, grad_norm=122.459, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.868e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 17:12:28,972 (trainer:732) INFO: 60epoch:train:16192-17990batch: iter_time=2.225e-04, forward_time=0.202, loss_att=40.416, acc=0.965, loss=40.416, backward_time=0.298, grad_norm=113.276, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.866e-04, train_time=2.686 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.240<33652> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520219:3521146 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 4, fd 142) + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<53606> +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:3520216:3521149 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 3, fd 153) +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 17:32:37,395 (trainer:732) INFO: 60epoch:train:17991-19789batch: iter_time=2.293e-04, forward_time=0.203, loss_att=40.639, acc=0.965, loss=40.639, backward_time=0.298, grad_norm=127.794, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.865e-04, train_time=2.686 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 17:52:46,458 (trainer:732) INFO: 60epoch:train:19790-21588batch: iter_time=2.221e-04, forward_time=0.202, loss_att=41.072, acc=0.965, loss=41.072, backward_time=0.297, grad_norm=124.253, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.863e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 18:12:53,811 (trainer:732) INFO: 60epoch:train:21589-23387batch: iter_time=2.256e-04, forward_time=0.202, loss_att=40.931, acc=0.964, loss=40.931, backward_time=0.297, grad_norm=109.147, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.861e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 18:33:05,185 (trainer:732) INFO: 60epoch:train:23388-25186batch: iter_time=2.281e-04, forward_time=0.203, loss_att=40.926, acc=0.965, loss=40.926, backward_time=0.298, grad_norm=118.993, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.860e-04, train_time=2.693 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 18:53:12,823 (trainer:732) INFO: 60epoch:train:25187-26985batch: iter_time=2.219e-04, forward_time=0.202, loss_att=40.909, acc=0.964, loss=40.909, backward_time=0.297, grad_norm=121.053, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.858e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 19:13:22,329 (trainer:732) INFO: 60epoch:train:26986-28784batch: iter_time=2.246e-04, forward_time=0.202, loss_att=40.831, acc=0.965, loss=40.831, backward_time=0.298, grad_norm=115.497, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.856e-04, train_time=2.689 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 19:33:31,309 (trainer:732) INFO: 60epoch:train:28785-30583batch: iter_time=2.300e-04, forward_time=0.203, loss_att=40.479, acc=0.965, loss=40.479, backward_time=0.298, grad_norm=113.993, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.855e-04, train_time=2.688 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 19:53:38,000 (trainer:732) INFO: 60epoch:train:30584-32382batch: iter_time=2.232e-04, forward_time=0.202, loss_att=41.009, acc=0.964, loss=41.009, backward_time=0.297, grad_norm=112.543, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.853e-04, train_time=2.683 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 20:13:45,676 (trainer:732) INFO: 60epoch:train:32383-34181batch: iter_time=2.262e-04, forward_time=0.202, loss_att=40.991, acc=0.964, loss=40.991, backward_time=0.297, grad_norm=118.517, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.852e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 20:33:53,613 (trainer:732) INFO: 60epoch:train:34182-35980batch: iter_time=2.265e-04, forward_time=0.202, loss_att=41.007, acc=0.964, loss=41.007, backward_time=0.298, grad_norm=120.177, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.850e-04, train_time=2.685 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 20:41:49,055 (trainer:338) INFO: 60epoch results: [train] iter_time=2.608e-04, forward_time=0.202, loss_att=40.672, acc=0.965, loss=40.672, backward_time=0.298, grad_norm=116.503, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=3.865e-04, train_time=2.715, time=6 hours, 47 minutes and 25.2 seconds, total_count=2159760, gpu_max_cached_mem_GB=30.396, [valid] loss_att=22.778, acc=0.983, cer=0.022, wer=0.073, loss=22.778, time=4 minutes and 33.89 seconds, total_count=1080, gpu_max_cached_mem_GB=30.396, [att_plot] time=3 minutes and 3.57 seconds, total_count=0, gpu_max_cached_mem_GB=30.396 +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 20:41:52,754 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 20:41:52,786 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/51epoch.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 20:41:52,786 (trainer:458) INFO: The training was finished at 60 epochs +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 20:41:52,820 (average_nbest_models:69) INFO: Averaging 10best models: criterion="valid.acc": exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/valid.acc.ave_10best.pth +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 20:42:01,306 (average_nbest_models:96) INFO: Accumulating encoder.encoders.0.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 20:42:01,308 (average_nbest_models:96) INFO: Accumulating encoder.encoders.1.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 20:42:01,310 (average_nbest_models:96) INFO: Accumulating encoder.encoders.2.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 20:42:01,312 (average_nbest_models:96) INFO: Accumulating encoder.encoders.3.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 20:42:01,314 (average_nbest_models:96) INFO: Accumulating encoder.encoders.4.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 20:42:01,316 (average_nbest_models:96) INFO: Accumulating encoder.encoders.5.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 20:42:01,318 (average_nbest_models:96) INFO: Accumulating encoder.encoders.6.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 20:42:01,320 (average_nbest_models:96) INFO: Accumulating encoder.encoders.7.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 20:42:01,323 (average_nbest_models:96) INFO: Accumulating encoder.encoders.8.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 20:42:01,325 (average_nbest_models:96) INFO: Accumulating encoder.encoders.9.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 20:42:01,327 (average_nbest_models:96) INFO: Accumulating encoder.encoders.10.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-7-1218101249-5bcbfb5567-jsftr:0/4] 2024-01-20 20:42:01,329 (average_nbest_models:96) INFO: Accumulating encoder.encoders.11.conv_module.norm.num_batches_tracked instead of averaging +# Accounting: time=374195 threads=1 +# Ended (code 0) at Sat Jan 20 20:42:05 CST 2024, elapsed time 374195 seconds diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/valid.acc.ave.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/valid.acc.ave.pth new file mode 100644 index 0000000000000000000000000000000000000000..8874a916f4eb662907b9f9ce2b8f451565aee7f8 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/valid.acc.ave.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75c15394b6137b25ea2b7609fd48046efa36924ddd91279111fd96b3b98162c7 +size 172358249 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/valid.acc.ave_10best.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/valid.acc.ave_10best.pth new file mode 100644 index 0000000000000000000000000000000000000000..8874a916f4eb662907b9f9ce2b8f451565aee7f8 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/valid.acc.ave_10best.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75c15394b6137b25ea2b7609fd48046efa36924ddd91279111fd96b3b98162c7 +size 172358249 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/valid.acc.best.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/valid.acc.best.pth new file mode 100644 index 0000000000000000000000000000000000000000..fae1c555b52217609dd7c5439c74858c3cdd133c --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new/valid.acc.best.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b7e15c311a51848a22efa5626215db5b3c95a4f0e02ef25d0975eaa715f7d06 +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/43epoch.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/43epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..b949b15244b09f784cd3d3f8ca06c48fdbb94276 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/43epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85e6aa2f4634193567c8875dffed462acb4ddee25a64e6d2cc6784bb731b5118 +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/44epoch.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/44epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..5afefcb32652ca7a71d53ce68c4d07a0b70a092e --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/44epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8eab89d55429339128f95488d43c1130146d94388fe0f035b18944d0486db9d +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/48epoch.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/48epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..b79dd8ecf512cc251deaa915e95c576b8cb19a98 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/48epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5beda5f12025256fd094495ca64605c78a61acaaeb8fea4f3afacedecf54ff51 +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/49epoch.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/49epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..9a1a5e9076d6e7e6c733d744a5568871bd8345dd --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/49epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:770bb8e30bb8128f2f12b52495b484509fad7b46921e2882a89a58694f1a1313 +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/51epoch.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/51epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..e8fc247f1ba4f93003d3ec970fd5c636b9236290 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/51epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c414cae0c18b1d880232d1c56249ed3ad00d3001ea345e7205a6e0d0d76a192f +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/52epoch.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/52epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..c2c8032bfd0f1d9339392f7a3ee1a85ceb809466 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/52epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:669d29c06a8814a43482c1c8483b0a02c3ce58e966971a81fe7e686ce5c9240f +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/54epoch.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/54epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..27a65f71effbd1f999d94355d7e43e5a6fb26b14 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/54epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a38c3b7350afab15a254eced20881144b8c58356a06b08ac7b8dca587363465 +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/56epoch.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/56epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81f5090d1008668acb5489bd2727e4006de44b6 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/56epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aa51356d8e20d16ce16cf03478783f3c94872757f700b7e6f6709b6d828ec8b +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/57epoch.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/57epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..6871b2a93b9a67b6820de0347c1229daeb0c432d --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/57epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1389731de517bd849a0f81bcde4f8fab312ff3019b3dd9c9ddd479d5346481b3 +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/60epoch.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/60epoch.pth new file mode 100644 index 0000000000000000000000000000000000000000..24f1a93caffc485fb3638444878ccdcf3807bc66 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/60epoch.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf36c914d7d71528a338cd3a769884e121a1ee7eb57709cf3fd9f79a73f3dd04 +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/RESULTS.md b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/RESULTS.md new file mode 100644 index 0000000000000000000000000000000000000000..2c4027422830df1740bb26c0e24dd1319d835978 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/RESULTS.md @@ -0,0 +1,63 @@ + +# RESULTS +## Environments +- date: `Tue Mar 12 04:49:59 CST 2024` +- python version: `3.9.18 (main, Sep 11 2023, 13:41:44) [GCC 11.2.0]` +- espnet version: `espnet 202308` +- pytorch version: `pytorch 1.12.1+cu116` +- Git hash: `884659f9ee95374811015381c976fa3b4f6e01db` + - Commit date: `Thu Nov 23 00:23:29 2023 +0800` + +## exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr +### WER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_sot_asr_model_valid.acc.best/dev_2spk|3315|226216|80.6|7.3|12.1|10.5|29.9|97.3| +|decode_sot_asr_model_valid.acc.best/dev_2spk_kaldi_fmt|1606|135101|83.1|5.7|11.2|6.6|23.5|94.5| +|decode_sot_asr_model_valid.acc.best/dev_2spk_spk1_whamr|3315|226216|39.3|15.3|45.5|2.6|63.4|100.0| +|decode_sot_asr_model_valid.acc.best/dev_2spk_spk2_whamr|3315|226216|40.2|13.6|46.2|2.8|62.6|100.0| +|decode_sot_asr_model_valid.acc.best/dev_3spk|2059|209679|68.5|14.2|17.2|10.0|41.4|99.9| +|decode_sot_asr_model_valid.acc.best/dev_4spk|1467|200029|57.4|19.8|22.7|8.8|51.3|100.0| +|decode_sot_asr_model_valid.acc.best/test-clean_2spk|4570|301042|81.4|6.6|12.0|9.9|28.5|97.6| +|decode_sot_asr_model_valid.acc.best/test-clean_2spk_spk1_whamr|4570|301042|38.2|14.3|47.5|2.1|64.0|100.0| +|decode_sot_asr_model_valid.acc.best/test-clean_2spk_spk2_whamr|4570|301042|38.7|13.7|47.7|2.5|63.8|100.0| +|decode_sot_asr_model_valid.acc.best/test-clean_3spk|2072|212871|70.0|13.0|17.0|10.7|40.7|99.9| +|decode_sot_asr_model_valid.acc.best/test-clean_4spk|1326|185394|59.1|18.8|22.1|8.9|49.8|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_2spk|4663|336490|80.4|8.4|11.1|10.4|30.0|99.0| +|decode_sot_asr_model_valid.acc.best/test-other_2spk_spk1_whamr|4663|336490|34.6|17.9|47.5|3.0|68.4|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_2spk_spk2_whamr|4663|336490|40.9|14.3|44.8|2.7|61.8|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_3spk|2453|266074|67.2|17.0|15.8|10.5|43.4|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_4spk|1795|259138|54.9|23.2|22.0|8.4|53.5|100.0| +|decode_sot_asr_model_valid.acc.best/test_clean_2spk_kaldi_fmt|2180|178761|84.4|4.9|10.7|5.9|21.5|95.0| +|decode_sot_asr_model_valid.acc.best/test_other_2spk_kaldi_fmt|2363|205496|82.8|7.2|10.0|6.7|23.9|98.1| +|decode_sot_asr_model_valid.acc.best/tt_mix_clean_reverb_max_16k|3000|3000|0.0|100.0|0.0|3118.7|3218.7|100.0| + +### CER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| +|decode_sot_asr_model_valid.acc.best/dev_2spk|3315|1230801|85.0|4.6|10.5|9.1|24.1|97.3| +|decode_sot_asr_model_valid.acc.best/dev_2spk_kaldi_fmt|1606|735694|86.4|3.3|10.3|5.6|19.2|94.5| +|decode_sot_asr_model_valid.acc.best/dev_2spk_spk1_whamr|3315|1230801|47.6|6.5|45.9|2.9|55.3|100.0| +|decode_sot_asr_model_valid.acc.best/dev_2spk_spk2_whamr|3315|1230801|47.8|5.8|46.4|2.9|55.1|100.0| +|decode_sot_asr_model_valid.acc.best/dev_3spk|2059|1140428|76.1|7.9|16.0|8.5|32.4|99.9| +|decode_sot_asr_model_valid.acc.best/dev_4spk|1467|1087409|67.5|10.0|22.5|7.6|40.1|100.0| +|decode_sot_asr_model_valid.acc.best/test-clean_2spk|4570|1550429|85.3|4.3|10.4|8.5|23.1|97.6| +|decode_sot_asr_model_valid.acc.best/test-clean_2spk_spk1_whamr|4570|1550429|46.6|5.8|47.6|2.6|56.1|100.0| +|decode_sot_asr_model_valid.acc.best/test-clean_2spk_spk2_whamr|4570|1550429|46.2|5.8|48.0|2.7|56.5|100.0| +|decode_sot_asr_model_valid.acc.best/test-clean_3spk|2072|1084475|76.8|7.8|15.4|9.0|32.1|99.9| +|decode_sot_asr_model_valid.acc.best/test-clean_4spk|1326|938467|68.6|9.8|21.6|8.0|39.4|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_2spk|4663|1742136|85.4|4.8|9.8|9.2|23.8|99.0| +|decode_sot_asr_model_valid.acc.best/test-other_2spk_spk1_whamr|4663|1742136|44.2|7.7|48.1|3.2|59.0|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_2spk_spk2_whamr|4663|1742136|48.9|5.9|45.1|3.0|54.1|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_3spk|2453|1381987|76.0|8.9|15.1|9.2|33.2|100.0| +|decode_sot_asr_model_valid.acc.best/test-other_4spk|1795|1346646|66.2|11.3|22.4|7.7|41.5|100.0| +|decode_sot_asr_model_valid.acc.best/test_clean_2spk_kaldi_fmt|2180|921344|87.3|2.8|9.9|5.1|17.7|95.0| +|decode_sot_asr_model_valid.acc.best/test_other_2spk_kaldi_fmt|2363|1064868|86.9|3.8|9.3|5.8|18.9|98.1| +|decode_sot_asr_model_valid.acc.best/tt_mix_clean_reverb_max_16k|3000|143026|16.3|83.6|0.2|299.2|382.9|100.0| + +### TER + +|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err| +|---|---|---|---|---|---|---|---|---| diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/checkpoint.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/checkpoint.pth new file mode 100644 index 0000000000000000000000000000000000000000..a12f617262f6b34ef9913249f5310789763b18e4 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/checkpoint.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c5aeab579176bc7dd30b6bf64cc1cde7008d1d3d5a4b78251efc694f94e6170 +size 516979550 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/config.yaml b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3830464bbe2f52f6fa57774e2dfec7bcd3e61357 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/config.yaml @@ -0,0 +1,226 @@ +config: conf/tuning/train_sot_asr_conformer_large.yaml +print_config: false +log_level: INFO +drop_last_iter: false +dry_run: false +iterator_type: sequence +valid_iterator_type: null +output_dir: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr +ngpu: 1 +seed: 0 +num_workers: 8 +num_att_plot: 3 +dist_backend: nccl +dist_init_method: env:// +dist_world_size: 8 +dist_rank: 0 +local_rank: 0 +dist_master_addr: localhost +dist_master_port: 51345 +dist_launcher: null +multiprocessing_distributed: true +unused_parameters: false +sharded_ddp: false +cudnn_enabled: true +cudnn_benchmark: false +cudnn_deterministic: true +collect_stats: false +write_collected_feats: false +max_epoch: 60 +patience: null +val_scheduler_criterion: +- valid +- loss +early_stopping_criterion: +- valid +- loss +- min +best_model_criterion: +- - valid + - acc + - max +keep_nbest_models: 10 +nbest_averaging_interval: 0 +grad_clip: 5.0 +grad_clip_type: 2.0 +grad_noise: false +accum_grad: 4 +no_forward_run: false +resume: true +train_dtype: float32 +use_amp: false +log_interval: null +use_matplotlib: true +use_tensorboard: true +create_graph_in_tensorboard: false +use_wandb: false +wandb_project: null +wandb_id: null +wandb_entity: null +wandb_name: null +wandb_model_log_interval: -1 +detect_anomaly: false +pretrain_path: null +init_param: [] +ignore_init_mismatch: false +freeze_param: [] +num_iters_per_epoch: null +batch_size: 20 +valid_batch_size: null +batch_bins: 32000000 +valid_batch_bins: null +train_shape_file: +- exp/asr_stats_raw_en_char/train/speech_shape +- exp/asr_stats_raw_en_char/train/text_shape.char +valid_shape_file: +- exp/asr_stats_raw_en_char/valid/speech_shape +- exp/asr_stats_raw_en_char/valid/text_shape.char +batch_type: numel +valid_batch_type: null +fold_length: +- 80000 +- 150 +sort_in_batch: descending +shuffle_within_batch: false +sort_batch: descending +multiple_iterator: false +chunk_length: 500 +chunk_shift_ratio: 0.5 +num_cache_chunks: 1024 +chunk_excluded_key_prefixes: [] +train_data_path_and_name_and_type: +- - dump/raw/large_w_whamr/wav.scp + - speech + - kaldi_ark +- - dump/raw/large_w_whamr/text + - text + - text +valid_data_path_and_name_and_type: +- - dump/raw/cv_mix_clean_reverb_max_16k/wav.scp + - speech + - kaldi_ark +- - dump/raw/cv_mix_clean_reverb_max_16k/text + - text + - text +allow_variable_data_keys: false +max_cache_size: 0.0 +max_cache_fd: 32 +valid_max_cache_size: null +exclude_weight_decay: false +exclude_weight_decay_conf: {} +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-06 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 20000 +token_list: +- +- +- +- +- E +- T +- A +- O +- N +- I +- H +- S +- R +- D +- L +- U +- M +- C +- W +- F +- G +- Y +- P +- B +- V +- K +- '''' +- X +- J +- Q +- Z +- +init: null +input_size: null +ctc_conf: + dropout_rate: 0.0 + ctc_type: builtin + reduce: true + ignore_nan_grad: null + zero_infinity: true +joint_net_conf: null +use_preprocessor: true +token_type: char +bpemodel: null +non_linguistic_symbols: null +cleaner: null +g2p: null +speech_volume_normalize: null +rir_scp: null +rir_apply_prob: 1.0 +noise_scp: null +noise_apply_prob: 1.0 +noise_db_range: '13_15' +short_noise_thres: 0.5 +aux_ctc_tasks: [] +frontend: default +frontend_conf: + fs: 16k +specaug: null +specaug_conf: {} +normalize: global_mvn +normalize_conf: + stats_file: exp/asr_stats_raw_en_char/train/feats_stats.npz +model: espnet +model_conf: + ctc_weight: 0.0 + lsm_weight: 0.1 + length_normalized_loss: false +preencoder: null +preencoder_conf: {} +encoder: conformer +encoder_conf: + output_size: 256 + attention_heads: 4 + linear_units: 2048 + num_blocks: 12 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + input_layer: conv2d + normalize_before: true + macaron_style: true + rel_pos_type: latest + pos_enc_layer_type: rel_pos + selfattention_layer_type: rel_selfattn + activation_type: swish + use_cnn_module: true + cnn_module_kernel: 31 +postencoder: null +postencoder_conf: {} +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.1 + src_attention_dropout_rate: 0.1 +preprocessor: multi +preprocessor_conf: + speaker_change_symbol: + - +required: +- output_dir +- token_list +version: '202308' +distributed: true diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/acc.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/acc.png new file mode 100644 index 0000000000000000000000000000000000000000..febbce86866cecfa6383922e2a38b5eebdca23ab Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/acc.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/backward_time.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/backward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..03700722a79478dfdafbd6afaac33133870db34d Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/backward_time.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/cer.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/cer.png new file mode 100644 index 0000000000000000000000000000000000000000..e0a35911f46645bed70efa3785f2f784a45b2908 Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/cer.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/clip.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/clip.png new file mode 100644 index 0000000000000000000000000000000000000000..8dcd44eeb8e61cefd50db5514c608663bad42973 Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/clip.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/forward_time.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/forward_time.png new file mode 100644 index 0000000000000000000000000000000000000000..3552048ae17313487d6f2f901f5f88f3993d1ea9 Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/forward_time.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/gpu_max_cached_mem_GB.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/gpu_max_cached_mem_GB.png new file mode 100644 index 0000000000000000000000000000000000000000..aa890ff88958ff723326492e8759b48535e0236d Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/gpu_max_cached_mem_GB.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/grad_norm.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/grad_norm.png new file mode 100644 index 0000000000000000000000000000000000000000..ea38fd03d2ae4eb5967b0a35c004be043d08b250 Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/grad_norm.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/iter_time.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/iter_time.png new file mode 100644 index 0000000000000000000000000000000000000000..a08a2331cae9b2c4adedfa69b35575c6af39956f Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/iter_time.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/loss.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/loss.png new file mode 100644 index 0000000000000000000000000000000000000000..0c748dca47ed2fb74efc0d6db1044f6075d82776 Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/loss.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/loss_att.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/loss_att.png new file mode 100644 index 0000000000000000000000000000000000000000..9f8ec9a9a832bbe13280b9ffd8b67fa68710bab9 Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/loss_att.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/loss_scale.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/loss_scale.png new file mode 100644 index 0000000000000000000000000000000000000000..460bfdb13d9e33389b68b962751b15221d3e531f Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/loss_scale.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/optim0_lr0.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/optim0_lr0.png new file mode 100644 index 0000000000000000000000000000000000000000..65bed5c33510389953e80e7158516e7c50af3bc0 Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/optim0_lr0.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/optim_step_time.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/optim_step_time.png new file mode 100644 index 0000000000000000000000000000000000000000..48203a8dbebc6db24adc7288646f4f387fe80297 Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/optim_step_time.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/train_time.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/train_time.png new file mode 100644 index 0000000000000000000000000000000000000000..11187084c3c68403c5ab30fda54e3856a1a45686 Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/train_time.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/wer.png b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/wer.png new file mode 100644 index 0000000000000000000000000000000000000000..7eba6a43a05b8992bc91814de16dd0ba665e02e6 Binary files /dev/null and b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/images/wer.png differ diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/latest.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/latest.pth new file mode 100644 index 0000000000000000000000000000000000000000..24f1a93caffc485fb3638444878ccdcf3807bc66 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/latest.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf36c914d7d71528a338cd3a769884e121a1ee7eb57709cf3fd9f79a73f3dd04 +size 172367337 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/run.sh b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..3210e6a53e6b207cc8f7b26f7e5ca95ab631b0cd --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/run.sh @@ -0,0 +1 @@ +./asr.sh --lang en --audio_format flac.ark --stage 11 --stop_stage 15 --feats_type raw --token_type char --sot_asr true --max_wav_duration 50 --speed_perturb_factors '' --feats_normalize global_mvn --use_lm false --asr_config conf/tuning/train_sot_asr_conformer_large.yaml --lm_config conf/tuning/train_lm_transformer.yaml --inference_config conf/tuning/decode_sot.yaml --train_set large_w_whamr --valid_set cv_mix_clean_reverb_max_16k --test_sets tt_mix_clean_reverb_max_16k --ngpu 8 --asr_tag train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr --lm_train_text data/local/other_text/text --bpe_train_text data/large_w_whamr/text --stage 11 "$@"; exit $? diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/train/events.out.tfevents.1708427294.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.2041320.0 b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/train/events.out.tfevents.1708427294.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.2041320.0 new file mode 100644 index 0000000000000000000000000000000000000000..f5f71fadec645c9a9d57c6776431f06e7ce70809 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/train/events.out.tfevents.1708427294.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.2041320.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cebee9daac4275970a1046e75cdad061bdf22b454860c3f1fd6f9b25356b63c +size 88 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/train/events.out.tfevents.1708428578.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.2051274.0 b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/train/events.out.tfevents.1708428578.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.2051274.0 new file mode 100644 index 0000000000000000000000000000000000000000..e65828d3071cf5abcd1671d7bbfcef03a62efcf4 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/train/events.out.tfevents.1708428578.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.2051274.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f6336cfe7d301908bb6fb8fdd2a20728fb14692e72699aaae8b03040dec4c16 +size 325300119 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/train/events.out.tfevents.1709018581.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.229008.0 b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/train/events.out.tfevents.1709018581.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.229008.0 new file mode 100644 index 0000000000000000000000000000000000000000..387780fca326cace3da870f05bf70ec05bdf5cd9 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/train/events.out.tfevents.1709018581.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.229008.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5699ae0263953a80374ef191727cee05343beae1372395c29dfa3b3c1ddb600 +size 88 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/train/events.out.tfevents.1709019246.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.240425.0 b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/train/events.out.tfevents.1709019246.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.240425.0 new file mode 100644 index 0000000000000000000000000000000000000000..e11f2eb62c88244c2e5b58c37c0a29c8c8a131c0 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/train/events.out.tfevents.1709019246.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.240425.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5de7b8f5a3d2ea0d77ce7641a1d22dff9ec5c67421bb9efb5c929956b54e2ea0 +size 88 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/train/events.out.tfevents.1709019347.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.242511.0 b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/train/events.out.tfevents.1709019347.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.242511.0 new file mode 100644 index 0000000000000000000000000000000000000000..776801a261782e6db6fa84fbe76115d74ea44054 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/train/events.out.tfevents.1709019347.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.242511.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cec3f82e9d4c5d28ff385f697970fcaa30b618c3fc622b66f2bd899bed5777c6 +size 5958 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/train/events.out.tfevents.1709033444.de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn.2640.0 b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/train/events.out.tfevents.1709033444.de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn.2640.0 new file mode 100644 index 0000000000000000000000000000000000000000..0ba3020e628b5fae8d174b08ac4e6c6e6388647d --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/train/events.out.tfevents.1709033444.de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn.2640.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bd01ed0cf188e6b43f471ec64088fb78a58a18d9b741680462581f208b8b183 +size 546885368 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/valid/events.out.tfevents.1708427294.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.2041320.1 b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/valid/events.out.tfevents.1708427294.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.2041320.1 new file mode 100644 index 0000000000000000000000000000000000000000..0add0bcad31dfa485e223fc515e2bf9d61818431 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/valid/events.out.tfevents.1708427294.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.2041320.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aecbb30957359531d27a6401f18554f88a17c877f05754d53c5202ce86d1f517 +size 88 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/valid/events.out.tfevents.1708428578.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.2051274.1 b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/valid/events.out.tfevents.1708428578.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.2051274.1 new file mode 100644 index 0000000000000000000000000000000000000000..10ba431e744b7357f37fcb29f8ffc1c7a164ab4c --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/valid/events.out.tfevents.1708428578.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.2051274.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76de77a319838e15bf908398cb7cfb238d919c69a78deb04c1e8959d23540186 +size 6574 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/valid/events.out.tfevents.1709018581.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.229008.1 b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/valid/events.out.tfevents.1709018581.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.229008.1 new file mode 100644 index 0000000000000000000000000000000000000000..b2e745f21b0f9aabc5de91a215089c86fe2f3b03 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/valid/events.out.tfevents.1709018581.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.229008.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd8f69e4709081687b1fcb1adcad91cb83c349e6cfa07660d9c74c5e16d1de77 +size 88 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/valid/events.out.tfevents.1709019246.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.240425.1 b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/valid/events.out.tfevents.1709019246.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.240425.1 new file mode 100644 index 0000000000000000000000000000000000000000..a90b136d96e7b2020b8b379de3449ad10c24d08b --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/valid/events.out.tfevents.1709019246.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.240425.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5b02af437af4e0edaa32836dbd2be70be924b1628ca470a0c46b666249e7c39 +size 88 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/valid/events.out.tfevents.1709019347.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.242511.1 b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/valid/events.out.tfevents.1709019347.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.242511.1 new file mode 100644 index 0000000000000000000000000000000000000000..103de4e6b5c06f3cf65f317f0a712c631b0a6a01 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/valid/events.out.tfevents.1709019347.de-74279-k2-train-1-1207150822-75498b8c5f-55j4z.242511.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1288a99ede576a0f2abe8e8ca8887a29522c6dd9ffceabd45bdfd0093f41eba3 +size 88 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/valid/events.out.tfevents.1709033444.de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn.2640.1 b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/valid/events.out.tfevents.1709033444.de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn.2640.1 new file mode 100644 index 0000000000000000000000000000000000000000..fe04f3dd9f0181661cc6e9e0aa8d39e5f88e5e90 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/tensorboard/valid/events.out.tfevents.1709033444.de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn.2640.1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5751dd94b18393d622d84ff41001083d111ad00374a88bcb2a4ec78dc393882d +size 10522 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.1.log b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.1.log new file mode 100644 index 0000000000000000000000000000000000000000..ca69edbfba97c0a6d464230be6880e6acfd2144b --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.1.log @@ -0,0 +1,1043 @@ +# python3 -m espnet2.bin.asr_train --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr --config conf/tuning/train_sot_asr_conformer_large.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/large_w_whamr/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/large_w_whamr/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 8 --multiprocessing_distributed True +# Started at Tue Feb 27 15:34:53 CST 2024 +# +/star-home/jinzengrui/lib/miniconda3/envs/espnet/bin/python3 /star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr --config conf/tuning/train_sot_asr_conformer_large.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/large_w_whamr/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/large_w_whamr/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 8 --multiprocessing_distributed True +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:11,516 (distributed_c10d:228) INFO: Added key: store_based_barrier_key:1 to store for rank: 0 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:11,517 (distributed_c10d:262) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 8 nodes. +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:11,542 (asr:490) INFO: Vocabulary size: 32 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:15,128 (abs_task:1229) INFO: pytorch.version=1.12.1+cu116, cuda.available=True, cudnn.version=8302, cudnn.benchmark=False, cudnn.deterministic=True +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:15,139 (abs_task:1230) INFO: Model structure: +ESPnetASRModel( + (frontend): DefaultFrontend( + (stft): Stft(n_fft=512, win_length=512, hop_length=128, center=True, normalized=False, onesided=True) + (frontend): Frontend() + (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False) + ) + (normalize): GlobalMVN(stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz, norm_means=True, norm_vars=True) + (encoder): ConformerEncoder( + (embed): Conv2dSubsampling( + (conv): Sequential( + (0): Conv2d(1, 256, kernel_size=(3, 3), stride=(2, 2)) + (1): ReLU() + (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2)) + (3): ReLU() + ) + (out): Sequential( + (0): Linear(in_features=4864, out_features=256, bias=True) + (1): RelPositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (6): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (7): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (8): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (9): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (10): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (11): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (decoder): TransformerDecoder( + (embed): Sequential( + (0): Embedding(32, 256) + (1): PositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (output_layer): Linear(in_features=256, out_features=32, bias=True) + (decoders): MultiSequential( + (0): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (criterion_att): LabelSmoothingLoss( + (criterion): KLDivLoss() + ) +) + +Model summary: + Class Name: ESPnetASRModel + Total Number of model parameters: 43.00 M + Number of trainable parameters: 43.00 M (100.0%) + Size: 172.01 MB + Type: torch.float32 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:15,139 (abs_task:1233) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + eps: 1e-08 + foreach: None + initial_lr: 0.002 + lr: 1e-07 + maximize: False + weight_decay: 1e-06 +) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:15,140 (abs_task:1234) INFO: Scheduler: WarmupLR(warmup_steps=20000) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:15,141 (abs_task:1243) INFO: Saving the configuration in exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/config.yaml +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:20,900 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:44,827 (abs_task:1614) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/large_w_whamr/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/large_w_whamr/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:44,828 (abs_task:1615) INFO: [train] Batch sampler: NumElementsBatchSampler(N-batch=18232, batch_bins=32000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:44,832 (abs_task:1616) INFO: [train] mini-batch sizes summary: N-batch=18232, mean=107.4, min=25, max=446 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:45,114 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:45,154 (abs_task:1614) INFO: [valid] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:45,154 (abs_task:1615) INFO: [valid] Batch sampler: NumElementsBatchSampler(N-batch=28, batch_bins=32000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:45,154 (abs_task:1616) INFO: [valid] mini-batch sizes summary: N-batch=28, mean=178.6, min=76, max=290 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:45,166 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:45,205 (abs_task:1614) INFO: [plot_att] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:45,205 (abs_task:1615) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=5000, batch_size=1, key_file=exp/asr_stats_raw_en_char/valid/speech_shape, +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:45,206 (abs_task:1616) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:46,346 (trainer:159) INFO: The training was resumed using exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/checkpoint.pth +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242511:242511 [0] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242511:242511 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242511:242511 [0] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242511:242511 [0] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242511:242511 [0] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242511:242511 [0] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242511:242511 [0] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242511:242511 [0] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242511:242511 [0] NCCL INFO Using network Socket +NCCL version 2.10.3+cuda11.6 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242517:242517 [6] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242518:242518 [7] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242512:242512 [1] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242517:242517 [6] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242518:242518 [7] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242514:242514 [3] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242515:242515 [4] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242512:242512 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242515:242515 [4] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242516:242516 [5] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242513:242513 [2] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242514:242514 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242516:242516 [5] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242513:242513 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242517:242517 [6] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242517:242517 [6] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242517:242517 [6] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242517:242517 [6] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242517:242517 [6] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242517:242517 [6] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242517:242517 [6] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242512:242512 [1] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242512:242512 [1] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242512:242512 [1] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242512:242512 [1] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242512:242512 [1] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242512:242512 [1] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242512:242512 [1] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242515:242515 [4] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242515:242515 [4] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242515:242515 [4] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242515:242515 [4] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242515:242515 [4] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242515:242515 [4] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242515:242515 [4] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242514:242514 [3] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242514:242514 [3] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242514:242514 [3] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242514:242514 [3] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242514:242514 [3] NCCL INFO NET/IB : No device found. + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242518:242518 [7] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242514:242514 [3] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242514:242514 [3] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242518:242518 [7] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242518:242518 [7] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242518:242518 [7] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242518:242518 [7] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242518:242518 [7] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242518:242518 [7] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242513:242513 [2] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242513:242513 [2] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242513:242513 [2] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242513:242513 [2] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242513:242513 [2] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242513:242513 [2] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242513:242513 [2] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242516:242516 [5] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242516:242516 [5] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242516:242516 [5] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242516:242516 [5] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242516:242516 [5] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242516:242516 [5] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242516:242516 [5] NCCL INFO Using network Socket +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242514:242777 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242515:242776 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242517:242774 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242518:242778 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242516:242780 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242512:242775 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242513:242779 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242514:242777 [3] NCCL INFO Setting affinity for GPU 3 to 07ffe0,0000fffe +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242511:242773 [0] NCCL INFO Channel 00/02 : 0 1 2 3 4 5 6 7 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242515:242776 [4] NCCL INFO Setting affinity for GPU 4 to ff,ffc0000f,fffc0000 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242511:242773 [0] NCCL INFO Channel 01/02 : 0 1 2 3 4 5 6 7 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242517:242774 [6] NCCL INFO Setting affinity for GPU 6 to ff,ffc0000f,fffc0000 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242518:242778 [7] NCCL INFO Setting affinity for GPU 7 to ff,ffc0000f,fffc0000 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242516:242780 [5] NCCL INFO Setting affinity for GPU 5 to ff,ffc0000f,fffc0000 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242512:242775 [1] NCCL INFO Setting affinity for GPU 1 to 07ffe0,0000fffe +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242513:242779 [2] NCCL INFO Setting affinity for GPU 2 to 07ffe0,0000fffe +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242511:242773 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242511:242773 [0] NCCL INFO Setting affinity for GPU 0 to 07ffe0,0000fffe +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242517:242774 [6] NCCL INFO Channel 00 : 6[b4000] -> 7[b5000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242513:242779 [2] NCCL INFO Channel 00 : 2[40000] -> 3[41000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242512:242775 [1] NCCL INFO Channel 00 : 1[3e000] -> 2[40000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242514:242777 [3] NCCL INFO Channel 00 : 3[41000] -> 4[b1000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242517:242774 [6] NCCL INFO Channel 01 : 6[b4000] -> 7[b5000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242513:242779 [2] NCCL INFO Channel 01 : 2[40000] -> 3[41000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242512:242775 [1] NCCL INFO Channel 01 : 1[3e000] -> 2[40000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242514:242777 [3] NCCL INFO Channel 01 : 3[41000] -> 4[b1000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242516:242780 [5] NCCL INFO Channel 00 : 5[b2000] -> 6[b4000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242516:242780 [5] NCCL INFO Channel 01 : 5[b2000] -> 6[b4000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242513:242779 [2] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242513:242779 [2] NCCL INFO Channel 00 : 2[40000] -> 1[3e000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242513:242779 [2] NCCL INFO Channel 01 : 2[40000] -> 1[3e000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242515:242776 [4] NCCL INFO Channel 00 : 4[b1000] -> 5[b2000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242515:242776 [4] NCCL INFO Channel 01 : 4[b1000] -> 5[b2000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242516:242780 [5] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242514:242777 [3] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242511:242773 [0] NCCL INFO Channel 00 : 0[3d000] -> 1[3e000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242515:242776 [4] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242511:242773 [0] NCCL INFO Channel 01 : 0[3d000] -> 1[3e000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242516:242780 [5] NCCL INFO Channel 00 : 5[b2000] -> 4[b1000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242518:242778 [7] NCCL INFO Channel 00 : 7[b5000] -> 0[3d000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242516:242780 [5] NCCL INFO Channel 01 : 5[b2000] -> 4[b1000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242518:242778 [7] NCCL INFO Channel 01 : 7[b5000] -> 0[3d000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242515:242776 [4] NCCL INFO Channel 00 : 4[b1000] -> 3[41000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242515:242776 [4] NCCL INFO Channel 01 : 4[b1000] -> 3[41000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242512:242775 [1] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242512:242775 [1] NCCL INFO Channel 00 : 1[3e000] -> 0[3d000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242512:242775 [1] NCCL INFO Channel 01 : 1[3e000] -> 0[3d000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242518:242778 [7] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242518:242778 [7] NCCL INFO Channel 00 : 7[b5000] -> 6[b4000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242517:242774 [6] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242518:242778 [7] NCCL INFO Channel 01 : 7[b5000] -> 6[b4000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242514:242777 [3] NCCL INFO Channel 00 : 3[41000] -> 2[40000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242511:242773 [0] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242514:242777 [3] NCCL INFO Channel 01 : 3[41000] -> 2[40000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242517:242774 [6] NCCL INFO Channel 00 : 6[b4000] -> 5[b2000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242517:242774 [6] NCCL INFO Channel 01 : 6[b4000] -> 5[b2000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242513:242779 [2] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242513:242779 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242513:242779 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242511:242773 [0] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242511:242773 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242511:242773 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242518:242778 [7] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242518:242778 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242518:242778 [7] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242514:242777 [3] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242514:242777 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242514:242777 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242515:242776 [4] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242515:242776 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242515:242776 [4] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242512:242775 [1] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242512:242775 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242512:242775 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242517:242774 [6] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242517:242774 [6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242517:242774 [6] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242516:242780 [5] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242516:242780 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242516:242780 [5] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242517:242774 [6] NCCL INFO comm 0x7faab4002f70 rank 6 nranks 8 cudaDev 6 busId b4000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242515:242776 [4] NCCL INFO comm 0x7f5cb4002f70 rank 4 nranks 8 cudaDev 4 busId b1000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242513:242779 [2] NCCL INFO comm 0x7f3140002f70 rank 2 nranks 8 cudaDev 2 busId 40000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242516:242780 [5] NCCL INFO comm 0x7f1a00002f70 rank 5 nranks 8 cudaDev 5 busId b2000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242514:242777 [3] NCCL INFO comm 0x7fc3ec002f70 rank 3 nranks 8 cudaDev 3 busId 41000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242511:242773 [0] NCCL INFO comm 0x7f118c002f70 rank 0 nranks 8 cudaDev 0 busId 3d000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242518:242778 [7] NCCL INFO comm 0x7f2770002f70 rank 7 nranks 8 cudaDev 7 busId b5000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242512:242775 [1] NCCL INFO comm 0x7f0d00002f70 rank 1 nranks 8 cudaDev 1 busId 3e000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:242511:242511 [0] NCCL INFO Launch mode Parallel +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:35:47,671 (trainer:284) INFO: 24/60epoch started +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:38:38,172 (distributed:995) INFO: Reducer buckets have been rebuilt in this iteration. +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:49:03,245 (trainer:732) INFO: 24epoch:train:1-911batch: iter_time=0.001, forward_time=0.209, loss_att=49.241, acc=0.957, loss=49.241, backward_time=0.302, grad_norm=75.615, clip=100.000, loss_scale=1.000, optim_step_time=0.071, optim0_lr0=6.197e-04, train_time=3.496 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:59:26,219 (trainer:732) INFO: 24epoch:train:912-1822batch: iter_time=2.323e-04, forward_time=0.202, loss_att=47.615, acc=0.958, loss=47.615, backward_time=0.300, grad_norm=76.453, clip=100.000, loss_scale=1.000, optim_step_time=0.072, optim0_lr0=6.194e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 16:09:48,780 (trainer:732) INFO: 24epoch:train:1823-2733batch: iter_time=2.292e-04, forward_time=0.202, loss_att=47.913, acc=0.958, loss=47.913, backward_time=0.300, grad_norm=74.016, clip=100.000, loss_scale=1.000, optim_step_time=0.070, optim0_lr0=6.190e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 16:20:10,792 (trainer:732) INFO: 24epoch:train:2734-3644batch: iter_time=2.284e-04, forward_time=0.201, loss_att=47.458, acc=0.958, loss=47.458, backward_time=0.300, grad_norm=75.771, clip=100.000, loss_scale=1.000, optim_step_time=0.069, optim0_lr0=6.187e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 16:30:33,922 (trainer:732) INFO: 24epoch:train:3645-4555batch: iter_time=2.282e-04, forward_time=0.201, loss_att=47.699, acc=0.958, loss=47.699, backward_time=0.300, grad_norm=74.378, clip=100.000, loss_scale=1.000, optim_step_time=0.070, optim0_lr0=6.184e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 16:40:56,152 (trainer:732) INFO: 24epoch:train:4556-5466batch: iter_time=2.257e-04, forward_time=0.202, loss_att=47.928, acc=0.958, loss=47.928, backward_time=0.300, grad_norm=74.393, clip=100.000, loss_scale=1.000, optim_step_time=0.069, optim0_lr0=6.180e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 16:51:20,166 (trainer:732) INFO: 24epoch:train:5467-6377batch: iter_time=2.312e-04, forward_time=0.202, loss_att=48.230, acc=0.958, loss=48.230, backward_time=0.301, grad_norm=72.966, clip=100.000, loss_scale=1.000, optim_step_time=0.069, optim0_lr0=6.177e-04, train_time=2.740 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 17:01:41,808 (trainer:732) INFO: 24epoch:train:6378-7288batch: iter_time=2.291e-04, forward_time=0.201, loss_att=47.796, acc=0.958, loss=47.796, backward_time=0.300, grad_norm=72.242, clip=100.000, loss_scale=1.000, optim_step_time=0.069, optim0_lr0=6.174e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 17:12:05,021 (trainer:732) INFO: 24epoch:train:7289-8199batch: iter_time=2.311e-04, forward_time=0.202, loss_att=47.769, acc=0.959, loss=47.769, backward_time=0.301, grad_norm=73.783, clip=100.000, loss_scale=1.000, optim_step_time=0.070, optim0_lr0=6.170e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 17:22:26,338 (trainer:732) INFO: 24epoch:train:8200-9110batch: iter_time=2.317e-04, forward_time=0.202, loss_att=47.233, acc=0.959, loss=47.233, backward_time=0.301, grad_norm=74.974, clip=100.000, loss_scale=1.000, optim_step_time=0.070, optim0_lr0=6.167e-04, train_time=2.727 +Exception ignored from cffi callback .vio_tell at 0x7f546c3ec670>: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/soundfile.py", line 1264, in vio_tell +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 197, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 87, in _run_code + @_ffi.callback("sf_vio_tell") +KeyboardInterrupt: + exec(code, run_globals) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 23, in + main() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 19, in main + ASRTask.main(cmd=cmd) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1132, in main + while not ProcessContext(processes, error_queues).join(): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 109, in join + ready = multiprocessing.connection.wait( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/connection.py", line 931, in wait + ready = selector.select(timeout) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/selectors.py", line 416, in select + fd_event_list = self._selector.poll(timeout) +KeyboardInterrupt +Process SpawnProcess-8: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 610, in train_one_epoch + stats, weight = recursive_average(stats, weight, distributed) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 41, in recursive_average + obj = recursive_sum(obj, weight, distributed) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 13, in recursive_sum + return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 13, in + return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 18, in recursive_sum + torch.distributed.all_reduce(obj, op=ReduceOp.SUM) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 1327, in all_reduce + work.wait() +KeyboardInterrupt +Process SpawnProcess-6: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 610, in train_one_epoch + stats, weight = recursive_average(stats, weight, distributed) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 41, in recursive_average + obj = recursive_sum(obj, weight, distributed) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 13, in recursive_sum + return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 13, in + return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 18, in recursive_sum + torch.distributed.all_reduce(obj, op=ReduceOp.SUM) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 1327, in all_reduce + work.wait() +KeyboardInterrupt +Process SpawnProcess-5: +Process SpawnProcess-4: +Process SpawnProcess-1: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 610, in train_one_epoch + stats, weight = recursive_average(stats, weight, distributed) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 41, in recursive_average + obj = recursive_sum(obj, weight, distributed) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 13, in recursive_sum + return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 13, in + return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 18, in recursive_sum + torch.distributed.all_reduce(obj, op=ReduceOp.SUM) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 1327, in all_reduce + work.wait() +KeyboardInterrupt +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 610, in train_one_epoch + stats, weight = recursive_average(stats, weight, distributed) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 41, in recursive_average + obj = recursive_sum(obj, weight, distributed) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 13, in recursive_sum + return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 13, in + return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 18, in recursive_sum + torch.distributed.all_reduce(obj, op=ReduceOp.SUM) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 1327, in all_reduce + work.wait() +KeyboardInterrupt +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 610, in train_one_epoch + stats, weight = recursive_average(stats, weight, distributed) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 41, in recursive_average + obj = recursive_sum(obj, weight, distributed) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 13, in recursive_sum + return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 13, in + return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 18, in recursive_sum + torch.distributed.all_reduce(obj, op=ReduceOp.SUM) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 1327, in all_reduce + work.wait() +KeyboardInterrupt +Process SpawnProcess-3: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 610, in train_one_epoch + stats, weight = recursive_average(stats, weight, distributed) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 41, in recursive_average + obj = recursive_sum(obj, weight, distributed) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 13, in recursive_sum + return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 13, in + return {k: recursive_sum(v, weight, distributed) for k, v in obj.items()} + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/torch_utils/recursive_op.py", line 18, in recursive_sum + torch.distributed.all_reduce(obj, op=ReduceOp.SUM) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 1327, in all_reduce + work.wait() +KeyboardInterrupt +/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 64 leaked semaphore objects to clean up at shutdown + warnings.warn('resource_tracker: There appear to be %d ' diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.2.log b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.2.log new file mode 100644 index 0000000000000000000000000000000000000000..557db5850267daf9d7b2267ab970c74be687a09b --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.2.log @@ -0,0 +1,1231 @@ +# python3 -m espnet2.bin.asr_train --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr --config conf/tuning/train_sot_asr_conformer_large.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/large_w_whamr/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/large_w_whamr/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 8 --multiprocessing_distributed True +# Started at Tue Feb 27 15:33:09 CST 2024 +# +/star-home/jinzengrui/lib/miniconda3/envs/espnet/bin/python3 /star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr --config conf/tuning/train_sot_asr_conformer_large.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/large_w_whamr/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/large_w_whamr/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 8 --multiprocessing_distributed True +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:33:30,009 (distributed_c10d:228) INFO: Added key: store_based_barrier_key:1 to store for rank: 0 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:33:30,009 (distributed_c10d:262) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 8 nodes. +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:33:30,040 (asr:490) INFO: Vocabulary size: 32 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:33:33,704 (abs_task:1229) INFO: pytorch.version=1.12.1+cu116, cuda.available=True, cudnn.version=8302, cudnn.benchmark=False, cudnn.deterministic=True +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:33:33,714 (abs_task:1230) INFO: Model structure: +ESPnetASRModel( + (frontend): DefaultFrontend( + (stft): Stft(n_fft=512, win_length=512, hop_length=128, center=True, normalized=False, onesided=True) + (frontend): Frontend() + (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False) + ) + (normalize): GlobalMVN(stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz, norm_means=True, norm_vars=True) + (encoder): ConformerEncoder( + (embed): Conv2dSubsampling( + (conv): Sequential( + (0): Conv2d(1, 256, kernel_size=(3, 3), stride=(2, 2)) + (1): ReLU() + (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2)) + (3): ReLU() + ) + (out): Sequential( + (0): Linear(in_features=4864, out_features=256, bias=True) + (1): RelPositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (6): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (7): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (8): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (9): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (10): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (11): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (decoder): TransformerDecoder( + (embed): Sequential( + (0): Embedding(32, 256) + (1): PositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (output_layer): Linear(in_features=256, out_features=32, bias=True) + (decoders): MultiSequential( + (0): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (criterion_att): LabelSmoothingLoss( + (criterion): KLDivLoss() + ) +) + +Model summary: + Class Name: ESPnetASRModel + Total Number of model parameters: 43.00 M + Number of trainable parameters: 43.00 M (100.0%) + Size: 172.01 MB + Type: torch.float32 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:33:33,715 (abs_task:1233) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + eps: 1e-08 + foreach: None + initial_lr: 0.002 + lr: 1e-07 + maximize: False + weight_decay: 1e-06 +) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:33:33,715 (abs_task:1234) INFO: Scheduler: WarmupLR(warmup_steps=20000) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:33:33,717 (abs_task:1243) INFO: Saving the configuration in exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/config.yaml +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:33:39,722 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:34:03,733 (abs_task:1614) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/large_w_whamr/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/large_w_whamr/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:34:03,733 (abs_task:1615) INFO: [train] Batch sampler: NumElementsBatchSampler(N-batch=18232, batch_bins=32000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:34:03,738 (abs_task:1616) INFO: [train] mini-batch sizes summary: N-batch=18232, mean=107.4, min=25, max=446 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:34:04,010 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:34:04,046 (abs_task:1614) INFO: [valid] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:34:04,047 (abs_task:1615) INFO: [valid] Batch sampler: NumElementsBatchSampler(N-batch=28, batch_bins=32000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:34:04,047 (abs_task:1616) INFO: [valid] mini-batch sizes summary: N-batch=28, mean=178.6, min=76, max=290 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:34:04,059 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:34:04,087 (abs_task:1614) INFO: [plot_att] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:34:04,087 (abs_task:1615) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=5000, batch_size=1, key_file=exp/asr_stats_raw_en_char/valid/speech_shape, +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:34:04,087 (abs_task:1616) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:34:05,258 (trainer:159) INFO: The training was resumed using exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/checkpoint.pth +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240425:240425 [0] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240425:240425 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240425:240425 [0] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240425:240425 [0] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240425:240425 [0] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240425:240425 [0] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240425:240425 [0] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240425:240425 [0] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240425:240425 [0] NCCL INFO Using network Socket +NCCL version 2.10.3+cuda11.6 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240426:240426 [1] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240428:240428 [2] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240426:240426 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240431:240431 [5] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240428:240428 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240429:240429 [3] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240430:240430 [4] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240433:240433 [7] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240429:240429 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240431:240431 [5] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240432:240432 [6] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240433:240433 [7] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240430:240430 [4] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240432:240432 [6] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240428:240428 [2] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240428:240428 [2] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240428:240428 [2] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240428:240428 [2] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240428:240428 [2] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240428:240428 [2] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240428:240428 [2] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240426:240426 [1] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240426:240426 [1] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240426:240426 [1] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240426:240426 [1] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240426:240426 [1] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240426:240426 [1] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240426:240426 [1] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240431:240431 [5] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240431:240431 [5] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240431:240431 [5] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240431:240431 [5] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240431:240431 [5] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240431:240431 [5] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240431:240431 [5] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240429:240429 [3] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240429:240429 [3] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240429:240429 [3] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240429:240429 [3] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240429:240429 [3] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240429:240429 [3] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240429:240429 [3] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240433:240433 [7] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240433:240433 [7] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240433:240433 [7] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240433:240433 [7] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240433:240433 [7] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240433:240433 [7] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240433:240433 [7] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240432:240432 [6] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240432:240432 [6] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240432:240432 [6] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240432:240432 [6] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240432:240432 [6] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240432:240432 [6] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240432:240432 [6] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240430:240430 [4] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240430:240430 [4] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240430:240430 [4] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240430:240430 [4] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240430:240430 [4] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240430:240430 [4] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240430:240430 [4] NCCL INFO Using network Socket +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240428:240738 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240429:240741 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240430:240744 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240433:240742 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240432:240743 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240425:240734 [0] NCCL INFO Channel 00/02 : 0 1 2 3 4 5 6 7 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240428:240738 [2] NCCL INFO Setting affinity for GPU 2 to 07ffe0,0000fffe +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240431:240740 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240429:240741 [3] NCCL INFO Setting affinity for GPU 3 to 07ffe0,0000fffe +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240426:240739 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240430:240744 [4] NCCL INFO Setting affinity for GPU 4 to ff,ffc0000f,fffc0000 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240433:240742 [7] NCCL INFO Setting affinity for GPU 7 to ff,ffc0000f,fffc0000 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240432:240743 [6] NCCL INFO Setting affinity for GPU 6 to ff,ffc0000f,fffc0000 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240425:240734 [0] NCCL INFO Channel 01/02 : 0 1 2 3 4 5 6 7 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240431:240740 [5] NCCL INFO Setting affinity for GPU 5 to ff,ffc0000f,fffc0000 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240425:240734 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240425:240734 [0] NCCL INFO Setting affinity for GPU 0 to 07ffe0,0000fffe +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240426:240739 [1] NCCL INFO Setting affinity for GPU 1 to 07ffe0,0000fffe +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240429:240741 [3] NCCL INFO Channel 00 : 3[41000] -> 4[b1000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240428:240738 [2] NCCL INFO Channel 00 : 2[40000] -> 3[41000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240432:240743 [6] NCCL INFO Channel 00 : 6[b4000] -> 7[b5000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240429:240741 [3] NCCL INFO Channel 01 : 3[41000] -> 4[b1000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240433:240742 [7] NCCL INFO Channel 00 : 7[b5000] -> 0[3d000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240428:240738 [2] NCCL INFO Channel 01 : 2[40000] -> 3[41000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240432:240743 [6] NCCL INFO Channel 01 : 6[b4000] -> 7[b5000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240433:240742 [7] NCCL INFO Channel 01 : 7[b5000] -> 0[3d000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240426:240739 [1] NCCL INFO Channel 00 : 1[3e000] -> 2[40000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240426:240739 [1] NCCL INFO Channel 01 : 1[3e000] -> 2[40000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240431:240740 [5] NCCL INFO Channel 00 : 5[b2000] -> 6[b4000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240431:240740 [5] NCCL INFO Channel 01 : 5[b2000] -> 6[b4000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240430:240744 [4] NCCL INFO Channel 00 : 4[b1000] -> 5[b2000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240425:240734 [0] NCCL INFO Channel 00 : 0[3d000] -> 1[3e000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240430:240744 [4] NCCL INFO Channel 01 : 4[b1000] -> 5[b2000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240428:240738 [2] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240425:240734 [0] NCCL INFO Channel 01 : 0[3d000] -> 1[3e000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240432:240743 [6] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240428:240738 [2] NCCL INFO Channel 00 : 2[40000] -> 1[3e000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240428:240738 [2] NCCL INFO Channel 01 : 2[40000] -> 1[3e000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240432:240743 [6] NCCL INFO Channel 00 : 6[b4000] -> 5[b2000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240432:240743 [6] NCCL INFO Channel 01 : 6[b4000] -> 5[b2000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240425:240734 [0] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240431:240740 [5] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240429:240741 [3] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240433:240742 [7] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240433:240742 [7] NCCL INFO Channel 00 : 7[b5000] -> 6[b4000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240426:240739 [1] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240430:240744 [4] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240433:240742 [7] NCCL INFO Channel 01 : 7[b5000] -> 6[b4000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240431:240740 [5] NCCL INFO Channel 00 : 5[b2000] -> 4[b1000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240431:240740 [5] NCCL INFO Channel 01 : 5[b2000] -> 4[b1000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240433:240742 [7] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240433:240742 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240433:240742 [7] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240426:240739 [1] NCCL INFO Channel 00 : 1[3e000] -> 0[3d000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240430:240744 [4] NCCL INFO Channel 00 : 4[b1000] -> 3[41000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240426:240739 [1] NCCL INFO Channel 01 : 1[3e000] -> 0[3d000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240432:240743 [6] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240432:240743 [6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240432:240743 [6] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240430:240744 [4] NCCL INFO Channel 01 : 4[b1000] -> 3[41000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240425:240734 [0] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240425:240734 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240425:240734 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240426:240739 [1] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240426:240739 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240426:240739 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240431:240740 [5] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240431:240740 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240431:240740 [5] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240429:240741 [3] NCCL INFO Channel 00 : 3[41000] -> 2[40000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240429:240741 [3] NCCL INFO Channel 01 : 3[41000] -> 2[40000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240429:240741 [3] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240429:240741 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240429:240741 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240430:240744 [4] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240430:240744 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240430:240744 [4] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240428:240738 [2] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240428:240738 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240428:240738 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240430:240744 [4] NCCL INFO comm 0x7f0770002f70 rank 4 nranks 8 cudaDev 4 busId b1000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240431:240740 [5] NCCL INFO comm 0x7f0d44002f70 rank 5 nranks 8 cudaDev 5 busId b2000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240428:240738 [2] NCCL INFO comm 0x7ff684002f70 rank 2 nranks 8 cudaDev 2 busId 40000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240429:240741 [3] NCCL INFO comm 0x7fc470002f70 rank 3 nranks 8 cudaDev 3 busId 41000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240425:240734 [0] NCCL INFO comm 0x7fbc70002f70 rank 0 nranks 8 cudaDev 0 busId 3d000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240432:240743 [6] NCCL INFO comm 0x7f2f6c002f70 rank 6 nranks 8 cudaDev 6 busId b4000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240426:240739 [1] NCCL INFO comm 0x7efeb0002f70 rank 1 nranks 8 cudaDev 1 busId 3e000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240433:240742 [7] NCCL INFO comm 0x7f379c002f70 rank 7 nranks 8 cudaDev 7 busId b5000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:240425:240425 [0] NCCL INFO Launch mode Parallel +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:34:06,537 (trainer:284) INFO: 24/60epoch started +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 197, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 23, in + main() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 19, in main + ASRTask.main(cmd=cmd) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1132, in main + while not ProcessContext(processes, error_queues).join(): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 109, in join + ready = multiprocessing.connection.wait( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/connection.py", line 931, in wait + ready = selector.select(timeout) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/selectors.py", line 416, in select + fd_event_list = self._selector.poll(timeout) +KeyboardInterrupt +Process SpawnProcess-2: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 510, in train_one_epoch + for iiter, (utt_id, batch) in enumerate( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/reporter.py", line 263, in measure_iter_time + iterator = iter(iterable) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 444, in __iter__ + return self._get_iterator() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 390, in _get_iterator + return _MultiProcessingDataLoaderIter(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1077, in __init__ + w.start() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 121, in start + self._popen = self._Popen(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/context.py", line 224, in _Popen + return _default_context.get_context().Process._Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/context.py", line 284, in _Popen + return Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 32, in __init__ + super().__init__(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_fork.py", line 19, in __init__ + self._launch(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 47, in _launch + reduction.dump(process_obj, fp) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/reduction.py", line 60, in dump + ForkingPickler(file, protocol).dump(obj) +KeyboardInterrupt +Exception ignored in: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__ + self._shutdown_workers() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1468, in _shutdown_workers + if self._persistent_workers or self._workers_status[worker_id]: +AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status' +Process SpawnProcess-6: +Process SpawnProcess-7: +Traceback (most recent call last): +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 510, in train_one_epoch + for iiter, (utt_id, batch) in enumerate( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/reporter.py", line 263, in measure_iter_time + iterator = iter(iterable) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 510, in train_one_epoch + for iiter, (utt_id, batch) in enumerate( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 444, in __iter__ + return self._get_iterator() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/reporter.py", line 263, in measure_iter_time + iterator = iter(iterable) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 390, in _get_iterator + return _MultiProcessingDataLoaderIter(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 444, in __iter__ + return self._get_iterator() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1077, in __init__ + w.start() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 390, in _get_iterator + return _MultiProcessingDataLoaderIter(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 121, in start + self._popen = self._Popen(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1077, in __init__ + w.start() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/context.py", line 224, in _Popen + return _default_context.get_context().Process._Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 121, in start + self._popen = self._Popen(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/context.py", line 284, in _Popen + return Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/context.py", line 224, in _Popen + return _default_context.get_context().Process._Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 32, in __init__ + super().__init__(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/context.py", line 284, in _Popen + return Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_fork.py", line 19, in __init__ + self._launch(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 32, in __init__ + super().__init__(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 47, in _launch + reduction.dump(process_obj, fp) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_fork.py", line 19, in __init__ + self._launch(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/reduction.py", line 60, in dump + ForkingPickler(file, protocol).dump(obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 47, in _launch + reduction.dump(process_obj, fp) +KeyboardInterrupt + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/reduction.py", line 60, in dump + ForkingPickler(file, protocol).dump(obj) +KeyboardInterrupt +Process SpawnProcess-8: +Process SpawnProcess-1: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 510, in train_one_epoch + for iiter, (utt_id, batch) in enumerate( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/reporter.py", line 263, in measure_iter_time + iterator = iter(iterable) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 444, in __iter__ + return self._get_iterator() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 390, in _get_iterator + return _MultiProcessingDataLoaderIter(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1077, in __init__ + w.start() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 121, in start + self._popen = self._Popen(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/context.py", line 224, in _Popen + return _default_context.get_context().Process._Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/context.py", line 284, in _Popen + return Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 32, in __init__ + super().__init__(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_fork.py", line 19, in __init__ + self._launch(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 47, in _launch + reduction.dump(process_obj, fp) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/reduction.py", line 60, in dump + ForkingPickler(file, protocol).dump(obj) +KeyboardInterrupt +Process SpawnProcess-3: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 510, in train_one_epoch + for iiter, (utt_id, batch) in enumerate( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/reporter.py", line 263, in measure_iter_time + iterator = iter(iterable) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 444, in __iter__ + return self._get_iterator() +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 390, in _get_iterator + return _MultiProcessingDataLoaderIter(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1077, in __init__ + w.start() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 121, in start + self._popen = self._Popen(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/context.py", line 224, in _Popen + return _default_context.get_context().Process._Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/context.py", line 284, in _Popen + return Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 32, in __init__ + super().__init__(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_fork.py", line 19, in __init__ + self._launch(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 47, in _launch + reduction.dump(process_obj, fp) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 510, in train_one_epoch + for iiter, (utt_id, batch) in enumerate( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/reduction.py", line 60, in dump + ForkingPickler(file, protocol).dump(obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/reporter.py", line 263, in measure_iter_time + iterator = iter(iterable) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/queues.py", line 57, in __getstate__ + def __getstate__(self): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 444, in __iter__ + return self._get_iterator() +KeyboardInterrupt + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 390, in _get_iterator + return _MultiProcessingDataLoaderIter(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1077, in __init__ + w.start() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 121, in start + self._popen = self._Popen(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/context.py", line 224, in _Popen + return _default_context.get_context().Process._Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/context.py", line 284, in _Popen + return Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 32, in __init__ + super().__init__(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_fork.py", line 19, in __init__ + self._launch(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 47, in _launch + reduction.dump(process_obj, fp) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/reduction.py", line 60, in dump + ForkingPickler(file, protocol).dump(obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/queues.py", line 57, in __getstate__ + def __getstate__(self): +KeyboardInterrupt +Process SpawnProcess-4: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 510, in train_one_epoch + for iiter, (utt_id, batch) in enumerate( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/reporter.py", line 263, in measure_iter_time + iterator = iter(iterable) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 444, in __iter__ + return self._get_iterator() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 390, in _get_iterator + return _MultiProcessingDataLoaderIter(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1077, in __init__ + w.start() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 121, in start + self._popen = self._Popen(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/context.py", line 224, in _Popen + return _default_context.get_context().Process._Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/context.py", line 284, in _Popen + return Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 32, in __init__ + super().__init__(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_fork.py", line 19, in __init__ + self._launch(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 47, in _launch + reduction.dump(process_obj, fp) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/reduction.py", line 60, in dump + ForkingPickler(file, protocol).dump(obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/queues.py", line 57, in __getstate__ + def __getstate__(self): +KeyboardInterrupt +Exception ignored in: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__ +Exception ignored in: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__ + self._shutdown_workers() + self._shutdown_workers() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1468, in _shutdown_workers + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1468, in _shutdown_workers + if self._persistent_workers or self._workers_status[worker_id]: + if self._persistent_workers or self._workers_status[worker_id]: +AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status' +AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status' +Process SpawnProcess-5: +Exception ignored in: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__ + self._shutdown_workers() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1468, in _shutdown_workers + if self._persistent_workers or self._workers_status[worker_id]: +AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status' +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 510, in train_one_epoch + for iiter, (utt_id, batch) in enumerate( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/reporter.py", line 263, in measure_iter_time + iterator = iter(iterable) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 444, in __iter__ + return self._get_iterator() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 390, in _get_iterator + return _MultiProcessingDataLoaderIter(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1077, in __init__ + w.start() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 121, in start + self._popen = self._Popen(self) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/context.py", line 224, in _Popen + return _default_context.get_context().Process._Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/context.py", line 284, in _Popen + return Popen(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 32, in __init__ + super().__init__(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_fork.py", line 19, in __init__ + self._launch(process_obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/popen_spawn_posix.py", line 47, in _launch + reduction.dump(process_obj, fp) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/reduction.py", line 60, in dump + ForkingPickler(file, protocol).dump(obj) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/queues.py", line 57, in __getstate__ + def __getstate__(self): +KeyboardInterrupt +Exception ignored in: +Exception ignored in: +Traceback (most recent call last): +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__ + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__ + self._shutdown_workers() + self._shutdown_workers() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1468, in _shutdown_workers + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1468, in _shutdown_workers + if self._persistent_workers or self._workers_status[worker_id]: + if self._persistent_workers or self._workers_status[worker_id]: +AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status' +AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status' +Exception ignored in: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__ + self._shutdown_workers() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1468, in _shutdown_workers + if self._persistent_workers or self._workers_status[worker_id]: +AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status' +Exception ignored in: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1510, in __del__ + self._shutdown_workers() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1468, in _shutdown_workers + if self._persistent_workers or self._workers_status[worker_id]: +AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status' diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.3.log b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.3.log new file mode 100644 index 0000000000000000000000000000000000000000..fd276b71e1189c956e2257d7ee3994e10d21a674 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.3.log @@ -0,0 +1,909 @@ +# python3 -m espnet2.bin.asr_train --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr --config conf/tuning/train_sot_asr_conformer_large.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/large_w_whamr/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/large_w_whamr/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 8 --multiprocessing_distributed True +# Started at Tue Feb 27 15:22:00 CST 2024 +# +/star-home/jinzengrui/lib/miniconda3/envs/espnet/bin/python3 /star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr --config conf/tuning/train_sot_asr_conformer_large.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/large_w_whamr/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/large_w_whamr/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 8 --multiprocessing_distributed True +[W socket.cpp:558] [c10d] The client socket has failed to connect to [localhost]:60093 (errno: 99 - Cannot assign requested address). +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:22:19,640 (distributed_c10d:228) INFO: Added key: store_based_barrier_key:1 to store for rank: 0 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:22:19,640 (distributed_c10d:262) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 8 nodes. +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:22:19,663 (asr:490) INFO: Vocabulary size: 32 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:22:23,224 (abs_task:1229) INFO: pytorch.version=1.12.1+cu116, cuda.available=True, cudnn.version=8302, cudnn.benchmark=False, cudnn.deterministic=True +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:22:23,234 (abs_task:1230) INFO: Model structure: +ESPnetASRModel( + (frontend): DefaultFrontend( + (stft): Stft(n_fft=512, win_length=512, hop_length=128, center=True, normalized=False, onesided=True) + (frontend): Frontend() + (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False) + ) + (normalize): GlobalMVN(stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz, norm_means=True, norm_vars=True) + (encoder): ConformerEncoder( + (embed): Conv2dSubsampling( + (conv): Sequential( + (0): Conv2d(1, 256, kernel_size=(3, 3), stride=(2, 2)) + (1): ReLU() + (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2)) + (3): ReLU() + ) + (out): Sequential( + (0): Linear(in_features=4864, out_features=256, bias=True) + (1): RelPositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (6): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (7): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (8): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (9): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (10): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (11): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (decoder): TransformerDecoder( + (embed): Sequential( + (0): Embedding(32, 256) + (1): PositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (output_layer): Linear(in_features=256, out_features=32, bias=True) + (decoders): MultiSequential( + (0): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (criterion_att): LabelSmoothingLoss( + (criterion): KLDivLoss() + ) +) + +Model summary: + Class Name: ESPnetASRModel + Total Number of model parameters: 43.00 M + Number of trainable parameters: 43.00 M (100.0%) + Size: 172.01 MB + Type: torch.float32 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:22:23,234 (abs_task:1233) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + eps: 1e-08 + foreach: None + initial_lr: 0.002 + lr: 1e-07 + maximize: False + weight_decay: 1e-06 +) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:22:23,234 (abs_task:1234) INFO: Scheduler: WarmupLR(warmup_steps=20000) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:22:23,235 (abs_task:1243) INFO: Saving the configuration in exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/config.yaml +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:22:28,908 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:22:52,736 (abs_task:1614) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/large_w_whamr/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/large_w_whamr/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:22:52,736 (abs_task:1615) INFO: [train] Batch sampler: NumElementsBatchSampler(N-batch=36211, batch_bins=16000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:22:52,744 (abs_task:1616) INFO: [train] mini-batch sizes summary: N-batch=36211, mean=54.1, min=12, max=260 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:22:53,069 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:22:53,104 (abs_task:1614) INFO: [valid] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:22:53,105 (abs_task:1615) INFO: [valid] Batch sampler: NumElementsBatchSampler(N-batch=55, batch_bins=16000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:22:53,105 (abs_task:1616) INFO: [valid] mini-batch sizes summary: N-batch=55, mean=90.9, min=37, max=153 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:22:53,116 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:22:53,143 (abs_task:1614) INFO: [plot_att] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:22:53,144 (abs_task:1615) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=5000, batch_size=1, key_file=exp/asr_stats_raw_en_char/valid/speech_shape, +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:22:53,144 (abs_task:1616) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:22:55,610 (trainer:159) INFO: The training was resumed using exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/checkpoint.pth +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229008:229008 [0] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229008:229008 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229008:229008 [0] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229008:229008 [0] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229008:229008 [0] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229008:229008 [0] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229008:229008 [0] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229008:229008 [0] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229008:229008 [0] NCCL INFO Using network Socket +NCCL version 2.10.3+cuda11.6 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229014:229014 [6] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229009:229009 [1] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229013:229013 [5] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229012:229012 [4] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229011:229011 [3] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229014:229014 [6] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229009:229009 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229013:229013 [5] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229010:229010 [2] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229012:229012 [4] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229011:229011 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229010:229010 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229014:229014 [6] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229014:229014 [6] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229014:229014 [6] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229014:229014 [6] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229014:229014 [6] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229014:229014 [6] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229014:229014 [6] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229013:229013 [5] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229009:229009 [1] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229013:229013 [5] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229009:229009 [1] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229013:229013 [5] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229009:229009 [1] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229013:229013 [5] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229009:229009 [1] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229013:229013 [5] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229009:229009 [1] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229009:229009 [1] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229013:229013 [5] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229009:229009 [1] NCCL INFO Using network Socket +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229013:229013 [5] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229012:229012 [4] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229012:229012 [4] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229012:229012 [4] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229012:229012 [4] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229012:229012 [4] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229012:229012 [4] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229012:229012 [4] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229011:229011 [3] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229011:229011 [3] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229011:229011 [3] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229011:229011 [3] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229011:229011 [3] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229011:229011 [3] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229011:229011 [3] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229010:229010 [2] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229010:229010 [2] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229010:229010 [2] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229010:229010 [2] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229010:229010 [2] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229010:229010 [2] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229010:229010 [2] NCCL INFO Using network Socket +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229015:229015 [7] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229015:229015 [7] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229015:229015 [7] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229015:229015 [7] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229015:229015 [7] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229015:229015 [7] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229015:229015 [7] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229015:229015 [7] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229015:229015 [7] NCCL INFO Using network Socket +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229011:229616 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229012:229615 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229013:229613 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229015:229621 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229014:229612 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229011:229616 [3] NCCL INFO Setting affinity for GPU 3 to 07ffe0,0000fffe +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229008:229608 [0] NCCL INFO Channel 00/02 : 0 1 2 3 4 5 6 7 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229009:229614 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229012:229615 [4] NCCL INFO Setting affinity for GPU 4 to ff,ffc0000f,fffc0000 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229013:229613 [5] NCCL INFO Setting affinity for GPU 5 to ff,ffc0000f,fffc0000 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229010:229617 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229015:229621 [7] NCCL INFO Setting affinity for GPU 7 to ff,ffc0000f,fffc0000 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229014:229612 [6] NCCL INFO Setting affinity for GPU 6 to ff,ffc0000f,fffc0000 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229008:229608 [0] NCCL INFO Channel 01/02 : 0 1 2 3 4 5 6 7 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229009:229614 [1] NCCL INFO Setting affinity for GPU 1 to 07ffe0,0000fffe +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229010:229617 [2] NCCL INFO Setting affinity for GPU 2 to 07ffe0,0000fffe +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229008:229608 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229008:229608 [0] NCCL INFO Setting affinity for GPU 0 to 07ffe0,0000fffe +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229013:229613 [5] NCCL INFO Channel 00 : 5[b2000] -> 6[b4000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229010:229617 [2] NCCL INFO Channel 00 : 2[40000] -> 3[41000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229011:229616 [3] NCCL INFO Channel 00 : 3[41000] -> 4[b1000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229014:229612 [6] NCCL INFO Channel 00 : 6[b4000] -> 7[b5000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229013:229613 [5] NCCL INFO Channel 01 : 5[b2000] -> 6[b4000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229009:229614 [1] NCCL INFO Channel 00 : 1[3e000] -> 2[40000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229010:229617 [2] NCCL INFO Channel 01 : 2[40000] -> 3[41000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229015:229621 [7] NCCL INFO Channel 00 : 7[b5000] -> 0[3d000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229014:229612 [6] NCCL INFO Channel 01 : 6[b4000] -> 7[b5000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229011:229616 [3] NCCL INFO Channel 01 : 3[41000] -> 4[b1000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229009:229614 [1] NCCL INFO Channel 01 : 1[3e000] -> 2[40000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229015:229621 [7] NCCL INFO Channel 01 : 7[b5000] -> 0[3d000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229012:229615 [4] NCCL INFO Channel 00 : 4[b1000] -> 5[b2000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229012:229615 [4] NCCL INFO Channel 01 : 4[b1000] -> 5[b2000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229008:229608 [0] NCCL INFO Channel 00 : 0[3d000] -> 1[3e000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229010:229617 [2] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229008:229608 [0] NCCL INFO Channel 01 : 0[3d000] -> 1[3e000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229014:229612 [6] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229010:229617 [2] NCCL INFO Channel 00 : 2[40000] -> 1[3e000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229010:229617 [2] NCCL INFO Channel 01 : 2[40000] -> 1[3e000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229014:229612 [6] NCCL INFO Channel 00 : 6[b4000] -> 5[b2000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229014:229612 [6] NCCL INFO Channel 01 : 6[b4000] -> 5[b2000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229008:229608 [0] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229015:229621 [7] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229011:229616 [3] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229015:229621 [7] NCCL INFO Channel 00 : 7[b5000] -> 6[b4000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229013:229613 [5] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229009:229614 [1] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229015:229621 [7] NCCL INFO Channel 01 : 7[b5000] -> 6[b4000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229012:229615 [4] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229015:229621 [7] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229015:229621 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229015:229621 [7] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229013:229613 [5] NCCL INFO Channel 00 : 5[b2000] -> 4[b1000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229009:229614 [1] NCCL INFO Channel 00 : 1[3e000] -> 0[3d000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229013:229613 [5] NCCL INFO Channel 01 : 5[b2000] -> 4[b1000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229012:229615 [4] NCCL INFO Channel 00 : 4[b1000] -> 3[41000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229009:229614 [1] NCCL INFO Channel 01 : 1[3e000] -> 0[3d000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229012:229615 [4] NCCL INFO Channel 01 : 4[b1000] -> 3[41000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229008:229608 [0] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229008:229608 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229008:229608 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229014:229612 [6] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229014:229612 [6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229014:229612 [6] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229009:229614 [1] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229009:229614 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229009:229614 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229013:229613 [5] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229013:229613 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229013:229613 [5] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229011:229616 [3] NCCL INFO Channel 00 : 3[41000] -> 2[40000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229011:229616 [3] NCCL INFO Channel 01 : 3[41000] -> 2[40000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229010:229617 [2] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229010:229617 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229010:229617 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229011:229616 [3] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229011:229616 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229011:229616 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229012:229615 [4] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229012:229615 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229012:229615 [4] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229008:229608 [0] NCCL INFO comm 0x7f4d08002f70 rank 0 nranks 8 cudaDev 0 busId 3d000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229012:229615 [4] NCCL INFO comm 0x7f6d1c002f70 rank 4 nranks 8 cudaDev 4 busId b1000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229011:229616 [3] NCCL INFO comm 0x7fcf24002f70 rank 3 nranks 8 cudaDev 3 busId 41000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229013:229613 [5] NCCL INFO comm 0x7fad2c002f70 rank 5 nranks 8 cudaDev 5 busId b2000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229015:229621 [7] NCCL INFO comm 0x7f34c0002f70 rank 7 nranks 8 cudaDev 7 busId b5000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229010:229617 [2] NCCL INFO comm 0x7efdf8002f70 rank 2 nranks 8 cudaDev 2 busId 40000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229009:229614 [1] NCCL INFO comm 0x7f1b80002f70 rank 1 nranks 8 cudaDev 1 busId 3e000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229014:229612 [6] NCCL INFO comm 0x7f0710002f70 rank 6 nranks 8 cudaDev 6 busId b4000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:229008:229008 [0] NCCL INFO Launch mode Parallel +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:23:01,565 (trainer:284) INFO: 24/60epoch started +Traceback (most recent call last): + File "", line 1, in + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main + exitcode = _main(fd, parent_sentinel) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 126, in _main + self = reduction.pickle.load(from_parent) +_pickle.UnpicklingError: pickle data was truncated +Traceback (most recent call last): + File "", line 1, in + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main + exitcode = _main(fd, parent_sentinel) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 126, in _main + self = reduction.pickle.load(from_parent) +_pickle.UnpicklingError: pickle data was truncated +Traceback (most recent call last): + File "", line 1, in + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main + exitcode = _main(fd, parent_sentinel) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 126, in _main + self = reduction.pickle.load(from_parent) +_pickle.UnpicklingError: pickle data was truncated +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 197, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 23, in + main() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 19, in main + ASRTask.main(cmd=cmd) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1132, in main + while not ProcessContext(processes, error_queues).join(): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 140, in join + raise ProcessExitedException( +torch.multiprocessing.spawn.ProcessExitedException: process 3 terminated with signal SIGKILL +# Accounting: time=308 threads=1 +# Ended (code 1) at Tue Feb 27 15:27:08 CST 2024, elapsed time 308 seconds +/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 319 leaked semaphore objects to clean up at shutdown + warnings.warn('resource_tracker: There appear to be %d ' diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.4.log b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.4.log new file mode 100644 index 0000000000000000000000000000000000000000..b914c3c1799a80e59a1b00e393d930267989957d --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.4.log @@ -0,0 +1,845 @@ +# python3 -m espnet2.bin.asr_train --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr --config conf/tuning/train_sot_asr_conformer_large.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/large_w_whamr/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/large_w_whamr/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 8 --multiprocessing_distributed True +# Started at Tue Feb 27 15:21:02 CST 2024 +# +/star-home/jinzengrui/lib/miniconda3/envs/espnet/bin/python3 /star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr --config conf/tuning/train_sot_asr_conformer_large.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/large_w_whamr/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/large_w_whamr/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 8 --multiprocessing_distributed True +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:21:22,325 (distributed_c10d:228) INFO: Added key: store_based_barrier_key:1 to store for rank: 0 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:21:22,326 (distributed_c10d:262) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 8 nodes. +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:21:22,360 (asr:490) INFO: Vocabulary size: 32 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:21:27,579 (abs_task:1229) INFO: pytorch.version=1.12.1+cu116, cuda.available=True, cudnn.version=8302, cudnn.benchmark=False, cudnn.deterministic=True +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:21:27,589 (abs_task:1230) INFO: Model structure: +ESPnetASRModel( + (frontend): DefaultFrontend( + (stft): Stft(n_fft=512, win_length=512, hop_length=128, center=True, normalized=False, onesided=True) + (frontend): Frontend() + (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False) + ) + (normalize): GlobalMVN(stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz, norm_means=True, norm_vars=True) + (encoder): ConformerEncoder( + (embed): Conv2dSubsampling( + (conv): Sequential( + (0): Conv2d(1, 256, kernel_size=(3, 3), stride=(2, 2)) + (1): ReLU() + (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2)) + (3): ReLU() + ) + (out): Sequential( + (0): Linear(in_features=4864, out_features=256, bias=True) + (1): RelPositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (6): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (7): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (8): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (9): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (10): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (11): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (decoder): TransformerDecoder( + (embed): Sequential( + (0): Embedding(32, 256) + (1): PositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (output_layer): Linear(in_features=256, out_features=32, bias=True) + (decoders): MultiSequential( + (0): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (criterion_att): LabelSmoothingLoss( + (criterion): KLDivLoss() + ) +) + +Model summary: + Class Name: ESPnetASRModel + Total Number of model parameters: 43.00 M + Number of trainable parameters: 43.00 M (100.0%) + Size: 172.01 MB + Type: torch.float32 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:21:27,589 (abs_task:1233) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + eps: 1e-08 + foreach: None + initial_lr: 0.002 + lr: 1e-07 + maximize: False + weight_decay: 1e-06 +) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:21:27,589 (abs_task:1234) INFO: Scheduler: WarmupLR(warmup_steps=20000) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:21:27,592 (abs_task:1243) INFO: Saving the configuration in exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/config.yaml +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:21:27,607 (abs_task:1304) INFO: Loading pretrained params from /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/8] 2024-02-27 15:21:35,008 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +Process SpawnProcess-5: +Process SpawnProcess-8: +Process SpawnProcess-3: +Process SpawnProcess-1: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 197, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 23, in + main() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 19, in main + ASRTask.main(cmd=cmd) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1132, in main + while not ProcessContext(processes, error_queues).join(): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 109, in join +Process SpawnProcess-6: + ready = multiprocessing.connection.wait( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/connection.py", line 931, in wait + ready = selector.select(timeout) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/selectors.py", line 416, in select + fd_event_list = self._selector.poll(timeout) +KeyboardInterrupt +Process SpawnProcess-7: +Traceback (most recent call last): +Traceback (most recent call last): +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1324, in main_worker + train_iter_factory = cls.build_iter_factory( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1324, in main_worker + train_iter_factory = cls.build_iter_factory( +Traceback (most recent call last): +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1324, in main_worker + train_iter_factory = cls.build_iter_factory( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1537, in build_iter_factory + return cls.build_sequence_iter_factory( +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1537, in build_iter_factory + return cls.build_sequence_iter_factory( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1537, in build_iter_factory + return cls.build_sequence_iter_factory( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1593, in build_sequence_iter_factory + batch_sampler = build_batch_sampler( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1593, in build_sequence_iter_factory + batch_sampler = build_batch_sampler( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/build_batch_sampler.py", line 140, in build_batch_sampler + retval = NumElementsBatchSampler( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1593, in build_sequence_iter_factory + batch_sampler = build_batch_sampler( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1324, in main_worker + train_iter_factory = cls.build_iter_factory( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/build_batch_sampler.py", line 140, in build_batch_sampler + retval = NumElementsBatchSampler( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/num_elements_batch_sampler.py", line 41, in __init__ + utt2shapes = [ + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/num_elements_batch_sampler.py", line 42, in + load_num_sequence_text(s, loader_type="csv_int") for s in shape_files + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1324, in main_worker + train_iter_factory = cls.build_iter_factory( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1537, in build_iter_factory + return cls.build_sequence_iter_factory( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/build_batch_sampler.py", line 140, in build_batch_sampler + retval = NumElementsBatchSampler( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1324, in main_worker + train_iter_factory = cls.build_iter_factory( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1537, in build_iter_factory + return cls.build_sequence_iter_factory( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/num_elements_batch_sampler.py", line 41, in __init__ + utt2shapes = [ + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/fileio/read_text.py", line 124, in load_num_sequence_text + retval[k] = [dtype(i) for i in v.split(delimiter)] + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1593, in build_sequence_iter_factory + batch_sampler = build_batch_sampler( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/build_batch_sampler.py", line 140, in build_batch_sampler + retval = NumElementsBatchSampler( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/num_elements_batch_sampler.py", line 41, in __init__ + utt2shapes = [ + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1537, in build_iter_factory + return cls.build_sequence_iter_factory( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1593, in build_sequence_iter_factory + batch_sampler = build_batch_sampler( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/num_elements_batch_sampler.py", line 42, in + load_num_sequence_text(s, loader_type="csv_int") for s in shape_files +KeyboardInterrupt + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/num_elements_batch_sampler.py", line 41, in __init__ + utt2shapes = [ + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/num_elements_batch_sampler.py", line 42, in + load_num_sequence_text(s, loader_type="csv_int") for s in shape_files + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/num_elements_batch_sampler.py", line 42, in + load_num_sequence_text(s, loader_type="csv_int") for s in shape_files + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1593, in build_sequence_iter_factory + batch_sampler = build_batch_sampler( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/build_batch_sampler.py", line 140, in build_batch_sampler + retval = NumElementsBatchSampler( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/fileio/read_text.py", line 124, in load_num_sequence_text + retval[k] = [dtype(i) for i in v.split(delimiter)] + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/fileio/read_text.py", line 124, in load_num_sequence_text + retval[k] = [dtype(i) for i in v.split(delimiter)] + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/fileio/read_text.py", line 124, in + retval[k] = [dtype(i) for i in v.split(delimiter)] + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/fileio/read_text.py", line 124, in load_num_sequence_text + retval[k] = [dtype(i) for i in v.split(delimiter)] + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/build_batch_sampler.py", line 140, in build_batch_sampler + retval = NumElementsBatchSampler( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/num_elements_batch_sampler.py", line 41, in __init__ + utt2shapes = [ +KeyboardInterrupt +KeyboardInterrupt +KeyboardInterrupt + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/num_elements_batch_sampler.py", line 41, in __init__ + utt2shapes = [ + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/num_elements_batch_sampler.py", line 42, in + load_num_sequence_text(s, loader_type="csv_int") for s in shape_files + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/num_elements_batch_sampler.py", line 42, in + load_num_sequence_text(s, loader_type="csv_int") for s in shape_files + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/fileio/read_text.py", line 124, in load_num_sequence_text + retval[k] = [dtype(i) for i in v.split(delimiter)] + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/fileio/read_text.py", line 124, in load_num_sequence_text + retval[k] = [dtype(i) for i in v.split(delimiter)] + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/fileio/read_text.py", line 124, in + retval[k] = [dtype(i) for i in v.split(delimiter)] +KeyboardInterrupt +KeyboardInterrupt +Process SpawnProcess-4: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1324, in main_worker + train_iter_factory = cls.build_iter_factory( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1537, in build_iter_factory + return cls.build_sequence_iter_factory( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1593, in build_sequence_iter_factory + batch_sampler = build_batch_sampler( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/build_batch_sampler.py", line 140, in build_batch_sampler + retval = NumElementsBatchSampler( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/num_elements_batch_sampler.py", line 41, in __init__ + utt2shapes = [ + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/num_elements_batch_sampler.py", line 42, in + load_num_sequence_text(s, loader_type="csv_int") for s in shape_files + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/fileio/read_text.py", line 124, in load_num_sequence_text + retval[k] = [dtype(i) for i in v.split(delimiter)] + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/fileio/read_text.py", line 124, in + retval[k] = [dtype(i) for i in v.split(delimiter)] +KeyboardInterrupt +Process SpawnProcess-2: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1324, in main_worker + train_iter_factory = cls.build_iter_factory( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1537, in build_iter_factory + return cls.build_sequence_iter_factory( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1593, in build_sequence_iter_factory + batch_sampler = build_batch_sampler( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/build_batch_sampler.py", line 140, in build_batch_sampler + retval = NumElementsBatchSampler( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/num_elements_batch_sampler.py", line 41, in __init__ + utt2shapes = [ + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/samplers/num_elements_batch_sampler.py", line 42, in + load_num_sequence_text(s, loader_type="csv_int") for s in shape_files + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/fileio/read_text.py", line 124, in load_num_sequence_text + retval[k] = [dtype(i) for i in v.split(delimiter)] + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/fileio/read_text.py", line 124, in + retval[k] = [dtype(i) for i in v.split(delimiter)] +KeyboardInterrupt diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.5.log b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.5.log new file mode 100644 index 0000000000000000000000000000000000000000..988e48eee3c947dac666d5740c004b084afe3fd0 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.5.log @@ -0,0 +1,180 @@ +# python3 -m espnet2.bin.asr_train --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr --config conf/tuning/train_sot_asr_conformer_large.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/large_w_whamr/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/large_w_whamr/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 4 --multiprocessing_distributed True +# Started at Tue Feb 27 15:19:11 CST 2024 +# +/star-home/jinzengrui/lib/miniconda3/envs/espnet/bin/python3 /star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr --config conf/tuning/train_sot_asr_conformer_large.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/large_w_whamr/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/large_w_whamr/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 4 --multiprocessing_distributed True +Traceback (most recent call last): +Traceback (most recent call last): +Traceback (most recent call last): +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 197, in _run_module_as_main + File "", line 1, in + File "", line 1, in + File "", line 1, in +Traceback (most recent call last): + File "", line 1, in + return _run_code(code, main_globals, None, + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 23, in + main() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 19, in main + ASRTask.main(cmd=cmd) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1132, in main + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main + while not ProcessContext(processes, error_queues).join(): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 109, in join + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main + exitcode = _main(fd, parent_sentinel) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 125, in _main + exitcode = _main(fd, parent_sentinel) + exitcode = _main(fd, parent_sentinel) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 125, in _main + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 125, in _main + exitcode = _main(fd, parent_sentinel) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 125, in _main + prepare(preparation_data) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 234, in prepare + prepare(preparation_data) + prepare(preparation_data) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 234, in prepare + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 234, in prepare + prepare(preparation_data) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 234, in prepare + ready = multiprocessing.connection.wait( + _fixup_main_from_name(data['init_main_from_name']) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/connection.py", line 931, in wait + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 258, in _fixup_main_from_name + _fixup_main_from_name(data['init_main_from_name']) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 258, in _fixup_main_from_name + _fixup_main_from_name(data['init_main_from_name']) + _fixup_main_from_name(data['init_main_from_name']) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 258, in _fixup_main_from_name + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/spawn.py", line 258, in _fixup_main_from_name + main_content = runpy.run_module(mod_name, + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 225, in run_module + main_content = runpy.run_module(mod_name, + main_content = runpy.run_module(mod_name, + main_content = runpy.run_module(mod_name, + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 225, in run_module + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 225, in run_module + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 225, in run_module + return _run_module_code(code, init_globals, run_name, mod_spec) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 97, in _run_module_code + ready = selector.select(timeout) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/selectors.py", line 416, in select + _run_code(code, mod_globals, init_globals, + return _run_module_code(code, init_globals, run_name, mod_spec) + return _run_module_code(code, init_globals, run_name, mod_spec) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 87, in _run_code + return _run_module_code(code, init_globals, run_name, mod_spec) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 97, in _run_module_code + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 97, in _run_module_code + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 97, in _run_module_code + exec(code, run_globals) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 2, in + _run_code(code, mod_globals, init_globals, + _run_code(code, mod_globals, init_globals, + _run_code(code, mod_globals, init_globals, + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 87, in _run_code + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 87, in _run_code + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 87, in _run_code + from espnet2.tasks.asr import ASRTask + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/asr.py", line 74, in + fd_event_list = self._selector.poll(timeout) + exec(code, run_globals) + exec(code, run_globals) + exec(code, run_globals) +KeyboardInterrupt + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 2, in + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 2, in + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 2, in + from espnet2.tasks.asr import ASRTask + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/asr.py", line 74, in + from espnet2.tasks.asr import ASRTask + from espnet2.tasks.asr import ASRTask + from espnet2.text.phoneme_tokenizer import g2p_choices + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/asr.py", line 74, in + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/asr.py", line 74, in + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/text/phoneme_tokenizer.py", line 7, in + from espnet2.text.phoneme_tokenizer import g2p_choices + from espnet2.text.phoneme_tokenizer import g2p_choices + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/text/phoneme_tokenizer.py", line 7, in + from espnet2.text.phoneme_tokenizer import g2p_choices + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/text/phoneme_tokenizer.py", line 7, in + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/text/phoneme_tokenizer.py", line 7, in + import g2p_en + import g2p_en + import g2p_en + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/g2p_en/__init__.py", line 1, in + import g2p_en + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/g2p_en/__init__.py", line 1, in + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/g2p_en/__init__.py", line 1, in + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/g2p_en/__init__.py", line 1, in + from .g2p import G2p + from .g2p import G2p + from .g2p import G2p + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/g2p_en/g2p.py", line 7, in + from .g2p import G2p + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/g2p_en/g2p.py", line 7, in + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/g2p_en/g2p.py", line 7, in + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/g2p_en/g2p.py", line 7, in + from nltk import pos_tag + from nltk import pos_tag + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/nltk/__init__.py", line 145, in + from nltk import pos_tag + from nltk import pos_tag + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/nltk/__init__.py", line 143, in + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/nltk/__init__.py", line 145, in + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/nltk/__init__.py", line 145, in + from nltk.inference import * + from nltk.inference import * + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/nltk/inference/__init__.py", line 14, in + from nltk.inference import * + from nltk.chunk import * + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/nltk/inference/__init__.py", line 14, in + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/nltk/inference/__init__.py", line 14, in + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/nltk/chunk/__init__.py", line 157, in + from nltk.chunk.api import ChunkParserI + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/nltk/chunk/api.py", line 13, in + from nltk.parse import ParserI + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/nltk/parse/__init__.py", line 102, in + from nltk.inference.api import ParallelProverBuilder, ParallelProverBuilderCommand + File "", line 1007, in _find_and_load + from nltk.inference.api import ParallelProverBuilder, ParallelProverBuilderCommand + from nltk.inference.api import ParallelProverBuilder, ParallelProverBuilderCommand + File "", line 1007, in _find_and_load + File "", line 1007, in _find_and_load + from nltk.parse.corenlp import CoreNLPParser, CoreNLPDependencyParser + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/nltk/parse/corenlp.py", line 19, in + File "", line 986, in _find_and_load_unlocked + from nltk.tag.api import TaggerI + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/nltk/tag/__init__.py", line 83, in + File "", line 986, in _find_and_load_unlocked + File "", line 986, in _find_and_load_unlocked + File "", line 680, in _load_unlocked + File "", line 680, in _load_unlocked + from nltk.tag.brill import BrillTagger + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/nltk/tag/brill.py", line 16, in + File "", line 680, in _load_unlocked + File "", line 846, in exec_module + File "", line 846, in exec_module + File "", line 846, in exec_module + File "", line 978, in get_code + File "", line 941, in get_code + File "", line 978, in get_code + from nltk.tbl import Feature, Template + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/nltk/tbl/__init__.py", line 18, in + File "", line 647, in _compile_bytecode + File "", line 1040, in get_data + File "", line 647, in _compile_bytecode +KeyboardInterrupt + from nltk.tbl.template import Template + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/nltk/tbl/template.py", line 16, in +KeyboardInterrupt +KeyboardInterrupt + from nltk.tbl.rule import Rule + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/nltk/tbl/rule.py", line 23, in + class TagRule(object): +KeyboardInterrupt diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.6.log b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.6.log new file mode 100644 index 0000000000000000000000000000000000000000..9cee4adc504f62026409b6afe3e29a7607172eb6 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.6.log @@ -0,0 +1,1862 @@ +# python3 -m espnet2.bin.asr_train --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr --config conf/tuning/train_sot_asr_conformer_large.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/large_w_whamr/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/large_w_whamr/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 4 --multiprocessing_distributed True +# Started at Tue Feb 20 19:27:36 CST 2024 +# +/star-home/jinzengrui/lib/miniconda3/envs/espnet/bin/python3 /star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr --config conf/tuning/train_sot_asr_conformer_large.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/large_w_whamr/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/large_w_whamr/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 4 --multiprocessing_distributed True +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:28:55,617 (distributed_c10d:228) INFO: Added key: store_based_barrier_key:1 to store for rank: 0 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:28:55,617 (distributed_c10d:262) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes. +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:28:55,650 (asr:490) INFO: Vocabulary size: 32 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:29:00,970 (abs_task:1229) INFO: pytorch.version=1.12.1+cu116, cuda.available=True, cudnn.version=8302, cudnn.benchmark=False, cudnn.deterministic=True +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:29:00,982 (abs_task:1230) INFO: Model structure: +ESPnetASRModel( + (frontend): DefaultFrontend( + (stft): Stft(n_fft=512, win_length=512, hop_length=128, center=True, normalized=False, onesided=True) + (frontend): Frontend() + (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False) + ) + (normalize): GlobalMVN(stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz, norm_means=True, norm_vars=True) + (encoder): ConformerEncoder( + (embed): Conv2dSubsampling( + (conv): Sequential( + (0): Conv2d(1, 256, kernel_size=(3, 3), stride=(2, 2)) + (1): ReLU() + (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2)) + (3): ReLU() + ) + (out): Sequential( + (0): Linear(in_features=4864, out_features=256, bias=True) + (1): RelPositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (6): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (7): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (8): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (9): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (10): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (11): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (decoder): TransformerDecoder( + (embed): Sequential( + (0): Embedding(32, 256) + (1): PositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (output_layer): Linear(in_features=256, out_features=32, bias=True) + (decoders): MultiSequential( + (0): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (criterion_att): LabelSmoothingLoss( + (criterion): KLDivLoss() + ) +) + +Model summary: + Class Name: ESPnetASRModel + Total Number of model parameters: 43.00 M + Number of trainable parameters: 43.00 M (100.0%) + Size: 172.01 MB + Type: torch.float32 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:29:00,993 (abs_task:1233) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + eps: 1e-08 + foreach: None + initial_lr: 0.002 + lr: 1e-07 + maximize: False + weight_decay: 1e-06 +) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:29:00,994 (abs_task:1234) INFO: Scheduler: WarmupLR(warmup_steps=20000) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:29:01,007 (abs_task:1243) INFO: Saving the configuration in exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/config.yaml +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:29:01,028 (abs_task:1304) INFO: Loading pretrained params from /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:29:08,224 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:29:33,726 (abs_task:1614) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/large_w_whamr/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/large_w_whamr/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:29:33,726 (abs_task:1615) INFO: [train] Batch sampler: NumElementsBatchSampler(N-batch=36211, batch_bins=16000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:29:33,734 (abs_task:1616) INFO: [train] mini-batch sizes summary: N-batch=36211, mean=54.1, min=12, max=260 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:29:34,059 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:29:34,098 (abs_task:1614) INFO: [valid] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:29:34,098 (abs_task:1615) INFO: [valid] Batch sampler: NumElementsBatchSampler(N-batch=56, batch_bins=16000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:29:34,098 (abs_task:1616) INFO: [valid] mini-batch sizes summary: N-batch=56, mean=89.3, min=4, max=153 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:29:34,110 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:29:34,140 (abs_task:1614) INFO: [plot_att] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:29:34,140 (abs_task:1615) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=5000, batch_size=1, key_file=exp/asr_stats_raw_en_char/valid/speech_shape, +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:29:34,141 (abs_task:1616) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051274 [0] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051274 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051274 [0] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051274 [0] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051274 [0] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051274 [0] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051274 [0] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051274 [0] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051274 [0] NCCL INFO Using network Socket +NCCL version 2.10.3+cuda11.6 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051276 [1] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051278 [3] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051276 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051278 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051276 [1] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051276 [1] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051276 [1] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051276 [1] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051276 [1] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051276 [1] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051276 [1] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051278 [3] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051278 [3] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051278 [3] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051278 [3] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051278 [3] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051278 [3] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051278 [3] NCCL INFO Using network Socket +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051277 [2] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051277 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051277 [2] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051277 [2] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051277 [2] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051277 [2] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051277 [2] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051277 [2] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051277 [2] NCCL INFO Using network Socket +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051826 [0] NCCL INFO Channel 00/02 : 0 1 2 3 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051827 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051838 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051826 [0] NCCL INFO Channel 01/02 : 0 1 2 3 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051828 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051827 [1] NCCL INFO Setting affinity for GPU 3 to 07ffe0,0000fffe +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051838 [2] NCCL INFO Setting affinity for GPU 4 to ff,ffc0000f,fffc0000 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051826 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051828 [3] NCCL INFO Setting affinity for GPU 5 to ff,ffc0000f,fffc0000 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051826 [0] NCCL INFO Setting affinity for GPU 2 to 07ffe0,0000fffe +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051827 [1] NCCL INFO Channel 00 : 1[41000] -> 2[b1000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051828 [3] NCCL INFO Channel 00 : 3[b2000] -> 0[40000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051827 [1] NCCL INFO Channel 01 : 1[41000] -> 2[b1000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051828 [3] NCCL INFO Channel 01 : 3[b2000] -> 0[40000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051838 [2] NCCL INFO Channel 00 : 2[b1000] -> 3[b2000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051838 [2] NCCL INFO Channel 01 : 2[b1000] -> 3[b2000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051826 [0] NCCL INFO Channel 00 : 0[40000] -> 1[41000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051826 [0] NCCL INFO Channel 01 : 0[40000] -> 1[41000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051838 [2] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051838 [2] NCCL INFO Channel 00 : 2[b1000] -> 1[41000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051838 [2] NCCL INFO Channel 01 : 2[b1000] -> 1[41000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051826 [0] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051828 [3] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051827 [1] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051828 [3] NCCL INFO Channel 00 : 3[b2000] -> 2[b1000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051828 [3] NCCL INFO Channel 01 : 3[b2000] -> 2[b1000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051828 [3] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051828 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051828 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051827 [1] NCCL INFO Channel 00 : 1[41000] -> 0[40000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051827 [1] NCCL INFO Channel 01 : 1[41000] -> 0[40000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051826 [0] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051826 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051826 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051827 [1] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051827 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051827 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051838 [2] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051838 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051838 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051827 [1] NCCL INFO comm 0x7f87b0002f70 rank 1 nranks 4 cudaDev 1 busId 41000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051826 [0] NCCL INFO comm 0x7f5884002f70 rank 0 nranks 4 cudaDev 0 busId 40000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051838 [2] NCCL INFO comm 0x7f6e1c002f70 rank 2 nranks 4 cudaDev 2 busId b1000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051828 [3] NCCL INFO comm 0x7febe0002f70 rank 3 nranks 4 cudaDev 3 busId b2000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051274 [0] NCCL INFO Launch mode Parallel +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:29:38,975 (trainer:284) INFO: 1/60epoch started +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:35:36,025 (distributed:995) INFO: Reducer buckets have been rebuilt in this iteration. +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:56:03,080 (trainer:732) INFO: 1epoch:train:1-1810batch: iter_time=0.001, forward_time=0.209, loss_att=626.526, acc=0.519, loss=626.526, backward_time=0.304, grad_norm=259.374, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=2.275e-05, train_time=3.501 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 20:16:29,839 (trainer:732) INFO: 1epoch:train:1811-3620batch: iter_time=3.100e-04, forward_time=0.205, loss_att=466.643, acc=0.612, loss=466.643, backward_time=0.304, grad_norm=78.427, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=6.800e-05, train_time=2.710 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 20:36:56,310 (trainer:732) INFO: 1epoch:train:3621-5430batch: iter_time=2.892e-04, forward_time=0.205, loss_att=425.413, acc=0.643, loss=425.413, backward_time=0.304, grad_norm=75.402, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=1.133e-04, train_time=2.710 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 20:57:21,367 (trainer:732) INFO: 1epoch:train:5431-7240batch: iter_time=2.833e-04, forward_time=0.204, loss_att=394.673, acc=0.666, loss=394.673, backward_time=0.303, grad_norm=78.046, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=1.585e-04, train_time=2.707 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 21:17:48,409 (trainer:732) INFO: 1epoch:train:7241-9050batch: iter_time=2.811e-04, forward_time=0.205, loss_att=375.211, acc=0.683, loss=375.211, backward_time=0.304, grad_norm=84.215, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=2.037e-04, train_time=2.712 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 21:38:12,842 (trainer:732) INFO: 1epoch:train:9051-10860batch: iter_time=2.834e-04, forward_time=0.204, loss_att=353.627, acc=0.698, loss=353.627, backward_time=0.303, grad_norm=88.866, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=2.490e-04, train_time=2.705 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 21:58:41,274 (trainer:732) INFO: 1epoch:train:10861-12670batch: iter_time=2.886e-04, forward_time=0.205, loss_att=340.151, acc=0.712, loss=340.151, backward_time=0.304, grad_norm=94.618, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=2.942e-04, train_time=2.714 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 22:19:09,448 (trainer:732) INFO: 1epoch:train:12671-14480batch: iter_time=2.859e-04, forward_time=0.205, loss_att=324.417, acc=0.723, loss=324.417, backward_time=0.304, grad_norm=96.256, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=3.395e-04, train_time=2.714 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 22:39:34,639 (trainer:732) INFO: 1epoch:train:14481-16290batch: iter_time=2.921e-04, forward_time=0.204, loss_att=311.369, acc=0.733, loss=311.369, backward_time=0.303, grad_norm=96.548, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=3.847e-04, train_time=2.707 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 23:00:06,514 (trainer:732) INFO: 1epoch:train:16291-18100batch: iter_time=2.863e-04, forward_time=0.206, loss_att=303.407, acc=0.744, loss=303.407, backward_time=0.305, grad_norm=102.496, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=4.300e-04, train_time=2.722 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 23:20:32,855 (trainer:732) INFO: 1epoch:train:18101-19910batch: iter_time=2.932e-04, forward_time=0.204, loss_att=289.104, acc=0.751, loss=289.104, backward_time=0.304, grad_norm=94.283, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=4.752e-04, train_time=2.710 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 23:40:59,092 (trainer:732) INFO: 1epoch:train:19911-21720batch: iter_time=2.818e-04, forward_time=0.205, loss_att=279.763, acc=0.759, loss=279.763, backward_time=0.303, grad_norm=98.740, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=5.205e-04, train_time=2.709 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 00:01:25,286 (trainer:732) INFO: 1epoch:train:21721-23530batch: iter_time=2.860e-04, forward_time=0.205, loss_att=274.390, acc=0.765, loss=274.390, backward_time=0.304, grad_norm=99.107, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.658e-04, train_time=2.710 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 00:21:50,776 (trainer:732) INFO: 1epoch:train:23531-25340batch: iter_time=2.861e-04, forward_time=0.205, loss_att=267.582, acc=0.771, loss=267.582, backward_time=0.303, grad_norm=96.640, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=6.110e-04, train_time=2.707 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 00:42:17,188 (trainer:732) INFO: 1epoch:train:25341-27150batch: iter_time=3.255e-04, forward_time=0.205, loss_att=256.688, acc=0.779, loss=256.688, backward_time=0.304, grad_norm=95.954, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=6.562e-04, train_time=2.710 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 01:02:45,105 (trainer:732) INFO: 1epoch:train:27151-28960batch: iter_time=2.815e-04, forward_time=0.205, loss_att=244.742, acc=0.791, loss=244.742, backward_time=0.304, grad_norm=99.990, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=7.015e-04, train_time=2.713 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 01:22:56,662 (trainer:732) INFO: 1epoch:train:28961-30770batch: iter_time=2.702e-04, forward_time=0.204, loss_att=232.904, acc=0.800, loss=232.904, backward_time=0.298, grad_norm=102.536, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=7.467e-04, train_time=2.678 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 01:43:06,659 (trainer:732) INFO: 1epoch:train:30771-32580batch: iter_time=2.654e-04, forward_time=0.204, loss_att=220.259, acc=0.813, loss=220.259, backward_time=0.296, grad_norm=104.136, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.920e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 02:03:13,152 (trainer:732) INFO: 1epoch:train:32581-34390batch: iter_time=2.613e-04, forward_time=0.203, loss_att=209.897, acc=0.819, loss=209.897, backward_time=0.295, grad_norm=104.781, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=8.372e-04, train_time=2.666 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 02:23:19,826 (trainer:732) INFO: 1epoch:train:34391-36200batch: iter_time=2.887e-04, forward_time=0.203, loss_att=201.209, acc=0.826, loss=201.209, backward_time=0.295, grad_norm=108.293, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=8.825e-04, train_time=2.666 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 02:32:51,701 (trainer:338) INFO: 1epoch results: [train] iter_time=3.427e-04, forward_time=0.205, loss_att=319.677, acc=0.731, loss=319.677, backward_time=0.302, grad_norm=102.924, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=4.528e-04, train_time=2.742, time=6 hours, 54 minutes and 0.64 seconds, total_count=36211, gpu_max_cached_mem_GB=29.945, [valid] loss_att=65.834, acc=0.874, cer=0.150, wer=0.451, loss=65.834, time=5 minutes and 24.92 seconds, total_count=56, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 47.14 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 02:32:56,506 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 02:32:56,507 (trainer:272) INFO: 2/60epoch started. Estimated time to finish: 2 weeks, 3 days and 8 hours +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 02:57:38,451 (trainer:732) INFO: 2epoch:train:1-1810batch: iter_time=9.254e-04, forward_time=0.205, loss_att=193.993, acc=0.833, loss=193.993, backward_time=0.295, grad_norm=107.256, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=9.279e-04, train_time=3.275 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 03:17:49,257 (trainer:732) INFO: 2epoch:train:1811-3620batch: iter_time=3.012e-04, forward_time=0.205, loss_att=188.863, acc=0.839, loss=188.863, backward_time=0.295, grad_norm=108.870, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=9.732e-04, train_time=2.676 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 03:38:00,532 (trainer:732) INFO: 2epoch:train:3621-5430batch: iter_time=2.982e-04, forward_time=0.205, loss_att=183.251, acc=0.843, loss=183.251, backward_time=0.295, grad_norm=106.828, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=0.001, train_time=2.676 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 03:58:07,357 (trainer:732) INFO: 2epoch:train:5431-7240batch: iter_time=3.024e-04, forward_time=0.204, loss_att=176.054, acc=0.848, loss=176.054, backward_time=0.294, grad_norm=107.071, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=0.001, train_time=2.667 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 04:18:20,905 (trainer:732) INFO: 2epoch:train:7241-9050batch: iter_time=3.152e-04, forward_time=0.205, loss_att=176.121, acc=0.851, loss=176.121, backward_time=0.296, grad_norm=109.917, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=0.001, train_time=2.682 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 04:38:29,692 (trainer:732) INFO: 2epoch:train:9051-10860batch: iter_time=3.016e-04, forward_time=0.204, loss_att=171.024, acc=0.854, loss=171.024, backward_time=0.295, grad_norm=107.478, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=0.001, train_time=2.671 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 04:58:40,270 (trainer:732) INFO: 2epoch:train:10861-12670batch: iter_time=3.101e-04, forward_time=0.205, loss_att=165.128, acc=0.858, loss=165.128, backward_time=0.295, grad_norm=109.967, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=0.001, train_time=2.675 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 05:18:50,328 (trainer:732) INFO: 2epoch:train:12671-14480batch: iter_time=3.002e-04, forward_time=0.204, loss_att=161.921, acc=0.861, loss=161.921, backward_time=0.295, grad_norm=110.059, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=0.001, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 05:39:00,315 (trainer:732) INFO: 2epoch:train:14481-16290batch: iter_time=2.995e-04, forward_time=0.205, loss_att=159.036, acc=0.864, loss=159.036, backward_time=0.295, grad_norm=110.312, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=0.001, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 05:59:10,050 (trainer:732) INFO: 2epoch:train:16291-18100batch: iter_time=3.204e-04, forward_time=0.204, loss_att=155.653, acc=0.867, loss=155.653, backward_time=0.295, grad_norm=111.906, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=0.001, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 06:19:15,579 (trainer:732) INFO: 2epoch:train:18101-19910batch: iter_time=2.946e-04, forward_time=0.204, loss_att=153.619, acc=0.868, loss=153.619, backward_time=0.294, grad_norm=113.414, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=0.001, train_time=2.664 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 06:39:22,570 (trainer:732) INFO: 2epoch:train:19911-21720batch: iter_time=2.965e-04, forward_time=0.204, loss_att=150.031, acc=0.871, loss=150.031, backward_time=0.294, grad_norm=110.352, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=0.001, train_time=2.667 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 06:59:26,410 (trainer:732) INFO: 2epoch:train:21721-23530batch: iter_time=2.940e-04, forward_time=0.204, loss_att=145.407, acc=0.873, loss=145.407, backward_time=0.293, grad_norm=109.257, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=0.001, train_time=2.660 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 07:19:36,643 (trainer:732) INFO: 2epoch:train:23531-25340batch: iter_time=2.909e-04, forward_time=0.205, loss_att=146.998, acc=0.875, loss=146.998, backward_time=0.295, grad_norm=111.017, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=0.002, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 07:39:45,278 (trainer:732) INFO: 2epoch:train:25341-27150batch: iter_time=2.877e-04, forward_time=0.204, loss_att=142.729, acc=0.877, loss=142.729, backward_time=0.295, grad_norm=110.774, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=0.002, train_time=2.671 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 07:59:56,508 (trainer:732) INFO: 2epoch:train:27151-28960batch: iter_time=3.030e-04, forward_time=0.205, loss_att=142.776, acc=0.878, loss=142.776, backward_time=0.295, grad_norm=111.577, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=0.002, train_time=2.676 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 08:20:06,312 (trainer:732) INFO: 2epoch:train:28961-30770batch: iter_time=2.999e-04, forward_time=0.205, loss_att=139.652, acc=0.879, loss=139.652, backward_time=0.295, grad_norm=110.656, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=0.002, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 08:40:19,319 (trainer:732) INFO: 2epoch:train:30771-32580batch: iter_time=3.084e-04, forward_time=0.205, loss_att=140.005, acc=0.880, loss=140.005, backward_time=0.296, grad_norm=115.576, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=0.002, train_time=2.680 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 09:00:28,914 (trainer:732) INFO: 2epoch:train:32581-34390batch: iter_time=3.015e-04, forward_time=0.205, loss_att=137.969, acc=0.882, loss=137.969, backward_time=0.295, grad_norm=109.211, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=0.002, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 09:20:36,456 (trainer:732) INFO: 2epoch:train:34391-36200batch: iter_time=2.958e-04, forward_time=0.204, loss_att=135.773, acc=0.884, loss=135.773, backward_time=0.295, grad_norm=118.059, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=0.002, train_time=2.668 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 09:29:30,914 (trainer:338) INFO: 2epoch results: [train] iter_time=3.323e-04, forward_time=0.205, loss_att=158.297, acc=0.864, loss=158.297, backward_time=0.295, grad_norm=110.476, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=0.001, train_time=2.702, time=6 hours, 48 minutes and 1.98 seconds, total_count=72422, gpu_max_cached_mem_GB=29.945, [valid] loss_att=37.803, acc=0.928, cer=0.086, wer=0.297, loss=37.803, time=4 minutes and 59.83 seconds, total_count=112, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 32.59 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 09:29:34,373 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 09:29:34,375 (trainer:272) INFO: 3/60epoch started. Estimated time to finish: 2 weeks, 2 days and 21 hours +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 09:54:41,628 (trainer:732) INFO: 3epoch:train:1-1810batch: iter_time=9.635e-04, forward_time=0.206, loss_att=132.901, acc=0.885, loss=132.901, backward_time=0.296, grad_norm=111.268, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.002, train_time=3.331 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 10:14:57,921 (trainer:732) INFO: 3epoch:train:1811-3620batch: iter_time=3.183e-04, forward_time=0.205, loss_att=131.775, acc=0.887, loss=131.775, backward_time=0.297, grad_norm=113.417, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=0.002, train_time=2.687 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 10:35:16,176 (trainer:732) INFO: 3epoch:train:3621-5430batch: iter_time=3.182e-04, forward_time=0.206, loss_att=130.777, acc=0.887, loss=130.777, backward_time=0.298, grad_norm=110.178, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.002, train_time=2.692 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 10:55:32,707 (trainer:732) INFO: 3epoch:train:5431-7240batch: iter_time=3.032e-04, forward_time=0.205, loss_att=130.359, acc=0.888, loss=130.359, backward_time=0.298, grad_norm=111.849, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=0.002, train_time=2.688 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 11:15:53,197 (trainer:732) INFO: 3epoch:train:7241-9050batch: iter_time=3.094e-04, forward_time=0.205, loss_att=129.038, acc=0.890, loss=129.038, backward_time=0.298, grad_norm=108.059, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.002, train_time=2.697 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 11:36:09,939 (trainer:732) INFO: 3epoch:train:9051-10860batch: iter_time=3.199e-04, forward_time=0.205, loss_att=127.542, acc=0.890, loss=127.542, backward_time=0.297, grad_norm=111.140, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.002, train_time=2.688 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 11:56:31,096 (trainer:732) INFO: 3epoch:train:10861-12670batch: iter_time=3.317e-04, forward_time=0.206, loss_att=125.704, acc=0.892, loss=125.704, backward_time=0.298, grad_norm=111.492, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.002, train_time=2.698 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 12:16:49,502 (trainer:732) INFO: 3epoch:train:12671-14480batch: iter_time=3.257e-04, forward_time=0.205, loss_att=123.366, acc=0.893, loss=123.366, backward_time=0.298, grad_norm=111.449, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.002, train_time=2.692 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 12:37:12,748 (trainer:732) INFO: 3epoch:train:14481-16290batch: iter_time=3.244e-04, forward_time=0.206, loss_att=121.676, acc=0.896, loss=121.676, backward_time=0.299, grad_norm=108.686, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.002, train_time=2.703 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 12:57:28,886 (trainer:732) INFO: 3epoch:train:16291-18100batch: iter_time=3.100e-04, forward_time=0.205, loss_att=119.511, acc=0.897, loss=119.511, backward_time=0.298, grad_norm=108.157, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.002, train_time=2.687 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 13:17:47,386 (trainer:732) INFO: 3epoch:train:18101-19910batch: iter_time=3.176e-04, forward_time=0.206, loss_att=118.473, acc=0.898, loss=118.473, backward_time=0.298, grad_norm=107.585, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.002, train_time=2.693 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 13:38:05,883 (trainer:732) INFO: 3epoch:train:19911-21720batch: iter_time=3.160e-04, forward_time=0.205, loss_att=117.127, acc=0.899, loss=117.127, backward_time=0.298, grad_norm=108.730, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.002, train_time=2.692 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 13:58:25,280 (trainer:732) INFO: 3epoch:train:21721-23530batch: iter_time=3.294e-04, forward_time=0.206, loss_att=115.549, acc=0.900, loss=115.549, backward_time=0.299, grad_norm=109.470, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.002, train_time=2.695 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 14:18:48,148 (trainer:732) INFO: 3epoch:train:23531-25340batch: iter_time=3.272e-04, forward_time=0.206, loss_att=115.380, acc=0.901, loss=115.380, backward_time=0.299, grad_norm=106.637, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.002, train_time=2.702 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 14:39:06,387 (trainer:732) INFO: 3epoch:train:25341-27150batch: iter_time=3.290e-04, forward_time=0.206, loss_att=111.924, acc=0.904, loss=111.924, backward_time=0.298, grad_norm=105.901, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.002, train_time=2.692 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 14:59:27,752 (trainer:732) INFO: 3epoch:train:27151-28960batch: iter_time=3.139e-04, forward_time=0.206, loss_att=112.866, acc=0.903, loss=112.866, backward_time=0.298, grad_norm=110.720, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.002, train_time=2.698 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 15:19:42,647 (trainer:732) INFO: 3epoch:train:28961-30770batch: iter_time=3.147e-04, forward_time=0.206, loss_att=112.084, acc=0.904, loss=112.084, backward_time=0.297, grad_norm=103.388, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.002, train_time=2.685 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 15:39:48,135 (trainer:732) INFO: 3epoch:train:30771-32580batch: iter_time=3.276e-04, forward_time=0.204, loss_att=108.751, acc=0.905, loss=108.751, backward_time=0.294, grad_norm=103.074, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=0.002, train_time=2.664 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 15:59:56,095 (trainer:732) INFO: 3epoch:train:32581-34390batch: iter_time=3.144e-04, forward_time=0.205, loss_att=106.435, acc=0.907, loss=106.435, backward_time=0.294, grad_norm=104.045, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=0.002, train_time=2.670 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 16:20:06,459 (trainer:732) INFO: 3epoch:train:34391-36200batch: iter_time=3.102e-04, forward_time=0.205, loss_att=105.730, acc=0.908, loss=105.730, backward_time=0.295, grad_norm=102.950, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=0.002, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 16:29:27,300 (trainer:338) INFO: 3epoch results: [train] iter_time=3.512e-04, forward_time=0.205, loss_att=119.828, acc=0.897, loss=119.828, backward_time=0.297, grad_norm=108.409, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.002, train_time=2.721, time=6 hours, 50 minutes and 54.45 seconds, total_count=108633, gpu_max_cached_mem_GB=29.945, [valid] loss_att=29.420, acc=0.945, cer=0.061, wer=0.226, loss=29.420, time=5 minutes and 20.32 seconds, total_count=168, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 38.15 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 16:29:30,925 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 16:29:30,927 (trainer:272) INFO: 4/60epoch started. Estimated time to finish: 2 weeks, 2 days and 14 hours +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 16:54:25,253 (trainer:732) INFO: 4epoch:train:1-1810batch: iter_time=0.001, forward_time=0.203, loss_att=104.381, acc=0.910, loss=104.381, backward_time=0.295, grad_norm=105.517, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.002, train_time=3.303 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 17:14:33,203 (trainer:732) INFO: 4epoch:train:1811-3620batch: iter_time=2.980e-04, forward_time=0.202, loss_att=104.771, acc=0.910, loss=104.771, backward_time=0.295, grad_norm=102.778, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.002, train_time=2.669 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 17:34:43,991 (trainer:732) INFO: 4epoch:train:3621-5430batch: iter_time=3.039e-04, forward_time=0.203, loss_att=103.675, acc=0.911, loss=103.675, backward_time=0.296, grad_norm=104.925, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.002, train_time=2.676 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 17:54:49,763 (trainer:732) INFO: 4epoch:train:5431-7240batch: iter_time=3.252e-04, forward_time=0.202, loss_att=100.252, acc=0.912, loss=100.252, backward_time=0.294, grad_norm=104.300, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.002, train_time=2.664 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 18:15:06,487 (trainer:732) INFO: 4epoch:train:7241-9050batch: iter_time=3.182e-04, forward_time=0.204, loss_att=102.139, acc=0.911, loss=102.139, backward_time=0.297, grad_norm=104.175, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.002, train_time=2.689 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 18:35:24,738 (trainer:732) INFO: 4epoch:train:9051-10860batch: iter_time=3.098e-04, forward_time=0.204, loss_att=100.315, acc=0.913, loss=100.315, backward_time=0.298, grad_norm=103.154, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.002, train_time=2.692 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 18:55:49,273 (trainer:732) INFO: 4epoch:train:10861-12670batch: iter_time=3.094e-04, forward_time=0.205, loss_att=100.498, acc=0.914, loss=100.498, backward_time=0.299, grad_norm=111.222, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=0.002, train_time=2.706 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 19:16:11,906 (trainer:732) INFO: 4epoch:train:12671-14480batch: iter_time=3.036e-04, forward_time=0.205, loss_att=98.297, acc=0.915, loss=98.297, backward_time=0.299, grad_norm=102.175, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=0.002, train_time=2.702 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 19:36:32,783 (trainer:732) INFO: 4epoch:train:14481-16290batch: iter_time=3.093e-04, forward_time=0.204, loss_att=99.571, acc=0.914, loss=99.571, backward_time=0.298, grad_norm=99.526, clip=100.000, loss_scale=1.000, optim_step_time=0.068, optim0_lr0=0.002, train_time=2.698 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 19:56:57,146 (trainer:732) INFO: 4epoch:train:16291-18100batch: iter_time=2.978e-04, forward_time=0.205, loss_att=100.173, acc=0.915, loss=100.173, backward_time=0.299, grad_norm=104.662, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=0.002, train_time=2.705 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 20:17:14,395 (trainer:732) INFO: 4epoch:train:18101-19910batch: iter_time=2.936e-04, forward_time=0.204, loss_att=97.640, acc=0.915, loss=97.640, backward_time=0.298, grad_norm=102.933, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=0.002, train_time=2.690 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 20:37:34,771 (trainer:732) INFO: 4epoch:train:19911-21720batch: iter_time=2.972e-04, forward_time=0.204, loss_att=94.770, acc=0.917, loss=94.770, backward_time=0.298, grad_norm=102.803, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=0.002, train_time=2.696 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 20:57:54,555 (trainer:732) INFO: 4epoch:train:21721-23530batch: iter_time=2.984e-04, forward_time=0.204, loss_att=96.280, acc=0.917, loss=96.280, backward_time=0.298, grad_norm=100.127, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=0.002, train_time=2.695 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 21:18:18,283 (trainer:732) INFO: 4epoch:train:23531-25340batch: iter_time=2.923e-04, forward_time=0.205, loss_att=95.257, acc=0.918, loss=95.257, backward_time=0.299, grad_norm=105.307, clip=100.000, loss_scale=1.000, optim_step_time=0.068, optim0_lr0=0.002, train_time=2.704 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 21:38:35,523 (trainer:732) INFO: 4epoch:train:25341-27150batch: iter_time=2.940e-04, forward_time=0.204, loss_att=93.481, acc=0.919, loss=93.481, backward_time=0.298, grad_norm=107.076, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.002, train_time=2.690 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 21:58:51,495 (trainer:732) INFO: 4epoch:train:27151-28960batch: iter_time=2.800e-04, forward_time=0.203, loss_att=93.222, acc=0.919, loss=93.222, backward_time=0.298, grad_norm=101.462, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.002, train_time=2.686 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 22:19:10,936 (trainer:732) INFO: 4epoch:train:28961-30770batch: iter_time=2.798e-04, forward_time=0.204, loss_att=93.862, acc=0.919, loss=93.862, backward_time=0.298, grad_norm=99.070, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.002, train_time=2.695 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 22:39:27,234 (trainer:732) INFO: 4epoch:train:30771-32580batch: iter_time=2.845e-04, forward_time=0.203, loss_att=92.682, acc=0.920, loss=92.682, backward_time=0.297, grad_norm=105.346, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=0.002, train_time=2.687 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 22:59:42,392 (trainer:732) INFO: 4epoch:train:32581-34390batch: iter_time=2.887e-04, forward_time=0.203, loss_att=91.723, acc=0.920, loss=91.723, backward_time=0.297, grad_norm=101.615, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.002, train_time=2.686 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 23:19:59,922 (trainer:732) INFO: 4epoch:train:34391-36200batch: iter_time=2.861e-04, forward_time=0.204, loss_att=93.025, acc=0.920, loss=93.025, backward_time=0.298, grad_norm=101.233, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.001, train_time=2.690 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 23:30:10,423 (trainer:338) INFO: 4epoch results: [train] iter_time=3.342e-04, forward_time=0.204, loss_att=97.795, acc=0.915, loss=97.795, backward_time=0.297, grad_norm=103.472, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.002, train_time=2.721, time=6 hours, 50 minutes and 52.06 seconds, total_count=144844, gpu_max_cached_mem_GB=29.945, [valid] loss_att=23.522, acc=0.956, cer=0.053, wer=0.196, loss=23.522, time=5 minutes and 54.9 seconds, total_count=224, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 52.53 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 23:30:14,825 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 23:30:14,830 (trainer:272) INFO: 5/60epoch started. Estimated time to finish: 2 weeks, 2 days and 8 hours +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-21 23:55:18,615 (trainer:732) INFO: 5epoch:train:1-1810batch: iter_time=0.001, forward_time=0.203, loss_att=90.568, acc=0.921, loss=90.568, backward_time=0.297, grad_norm=103.605, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.001, train_time=3.323 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 00:15:38,992 (trainer:732) INFO: 5epoch:train:1811-3620batch: iter_time=2.773e-04, forward_time=0.204, loss_att=90.916, acc=0.922, loss=90.916, backward_time=0.299, grad_norm=103.785, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.001, train_time=2.697 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 00:35:51,151 (trainer:732) INFO: 5epoch:train:3621-5430batch: iter_time=2.800e-04, forward_time=0.203, loss_att=88.306, acc=0.922, loss=88.306, backward_time=0.297, grad_norm=101.356, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.001, train_time=2.679 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 00:56:10,208 (trainer:732) INFO: 5epoch:train:5431-7240batch: iter_time=2.737e-04, forward_time=0.204, loss_att=89.347, acc=0.923, loss=89.347, backward_time=0.298, grad_norm=99.638, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.001, train_time=2.693 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 01:16:26,958 (trainer:732) INFO: 5epoch:train:7241-9050batch: iter_time=2.825e-04, forward_time=0.204, loss_att=88.720, acc=0.923, loss=88.720, backward_time=0.298, grad_norm=99.826, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.001, train_time=2.689 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 01:36:47,360 (trainer:732) INFO: 5epoch:train:9051-10860batch: iter_time=2.699e-04, forward_time=0.204, loss_att=87.976, acc=0.924, loss=87.976, backward_time=0.299, grad_norm=99.054, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.001, train_time=2.696 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 01:57:00,494 (trainer:732) INFO: 5epoch:train:10861-12670batch: iter_time=2.719e-04, forward_time=0.203, loss_att=87.753, acc=0.924, loss=87.753, backward_time=0.297, grad_norm=101.543, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.001, train_time=2.681 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 02:17:16,002 (trainer:732) INFO: 5epoch:train:12671-14480batch: iter_time=2.746e-04, forward_time=0.203, loss_att=87.016, acc=0.924, loss=87.016, backward_time=0.297, grad_norm=100.968, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.001, train_time=2.686 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 02:37:32,156 (trainer:732) INFO: 5epoch:train:14481-16290batch: iter_time=2.840e-04, forward_time=0.203, loss_att=87.075, acc=0.924, loss=87.075, backward_time=0.298, grad_norm=106.236, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.001, train_time=2.688 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 02:57:48,888 (trainer:732) INFO: 5epoch:train:16291-18100batch: iter_time=2.831e-04, forward_time=0.203, loss_att=87.186, acc=0.925, loss=87.186, backward_time=0.298, grad_norm=101.354, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.001, train_time=2.688 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 03:18:06,551 (trainer:732) INFO: 5epoch:train:18101-19910batch: iter_time=2.779e-04, forward_time=0.203, loss_att=86.205, acc=0.926, loss=86.205, backward_time=0.298, grad_norm=105.459, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.001, train_time=2.691 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 03:38:20,859 (trainer:732) INFO: 5epoch:train:19911-21720batch: iter_time=2.758e-04, forward_time=0.203, loss_att=85.858, acc=0.925, loss=85.858, backward_time=0.297, grad_norm=105.995, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.683 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 03:58:37,657 (trainer:732) INFO: 5epoch:train:21721-23530batch: iter_time=2.767e-04, forward_time=0.203, loss_att=85.870, acc=0.926, loss=85.870, backward_time=0.298, grad_norm=103.929, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.001, train_time=2.689 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 04:18:52,431 (trainer:732) INFO: 5epoch:train:23531-25340batch: iter_time=2.794e-04, forward_time=0.203, loss_att=84.744, acc=0.926, loss=84.744, backward_time=0.297, grad_norm=103.669, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.001, train_time=2.684 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 04:39:09,803 (trainer:732) INFO: 5epoch:train:25341-27150batch: iter_time=2.743e-04, forward_time=0.204, loss_att=85.075, acc=0.927, loss=85.075, backward_time=0.298, grad_norm=104.348, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.001, train_time=2.690 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 04:59:21,227 (trainer:732) INFO: 5epoch:train:27151-28960batch: iter_time=2.805e-04, forward_time=0.202, loss_att=83.986, acc=0.926, loss=83.986, backward_time=0.296, grad_norm=96.875, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.001, train_time=2.677 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 05:19:35,943 (trainer:732) INFO: 5epoch:train:28961-30770batch: iter_time=2.759e-04, forward_time=0.203, loss_att=83.866, acc=0.927, loss=83.866, backward_time=0.297, grad_norm=99.673, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=0.001, train_time=2.684 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 05:39:53,383 (trainer:732) INFO: 5epoch:train:30771-32580batch: iter_time=2.769e-04, forward_time=0.203, loss_att=84.211, acc=0.927, loss=84.211, backward_time=0.298, grad_norm=98.897, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.001, train_time=2.690 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 06:00:10,651 (trainer:732) INFO: 5epoch:train:32581-34390batch: iter_time=2.799e-04, forward_time=0.203, loss_att=83.666, acc=0.928, loss=83.666, backward_time=0.298, grad_norm=100.659, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.001, train_time=2.690 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 06:20:28,750 (trainer:732) INFO: 5epoch:train:34391-36200batch: iter_time=2.710e-04, forward_time=0.203, loss_att=83.110, acc=0.928, loss=83.110, backward_time=0.298, grad_norm=96.318, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.001, train_time=2.691 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 06:29:59,151 (trainer:338) INFO: 5epoch results: [train] iter_time=3.338e-04, forward_time=0.203, loss_att=86.567, acc=0.925, loss=86.567, backward_time=0.298, grad_norm=101.657, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.001, train_time=2.719, time=6 hours, 50 minutes and 35.19 seconds, total_count=181055, gpu_max_cached_mem_GB=29.945, [valid] loss_att=20.604, acc=0.961, cer=0.046, wer=0.176, loss=20.604, time=5 minutes and 23.46 seconds, total_count=280, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 45.67 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 06:30:03,037 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 06:30:03,040 (trainer:272) INFO: 6/60epoch started. Estimated time to finish: 2 weeks, 2 days and 1 hour +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 06:55:16,369 (trainer:732) INFO: 6epoch:train:1-1810batch: iter_time=0.001, forward_time=0.204, loss_att=81.451, acc=0.930, loss=81.451, backward_time=0.298, grad_norm=99.439, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.001, train_time=3.345 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 07:15:34,659 (trainer:732) INFO: 6epoch:train:1811-3620batch: iter_time=2.850e-04, forward_time=0.204, loss_att=83.176, acc=0.929, loss=83.176, backward_time=0.299, grad_norm=101.714, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.001, train_time=2.692 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 07:35:50,857 (trainer:732) INFO: 6epoch:train:3621-5430batch: iter_time=2.869e-04, forward_time=0.203, loss_att=81.531, acc=0.929, loss=81.531, backward_time=0.298, grad_norm=101.914, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.001, train_time=2.688 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 07:56:04,505 (trainer:732) INFO: 6epoch:train:5431-7240batch: iter_time=2.854e-04, forward_time=0.203, loss_att=80.437, acc=0.930, loss=80.437, backward_time=0.297, grad_norm=102.097, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.001, train_time=2.681 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 08:16:27,813 (trainer:732) INFO: 6epoch:train:7241-9050batch: iter_time=2.868e-04, forward_time=0.204, loss_att=81.187, acc=0.930, loss=81.187, backward_time=0.299, grad_norm=101.507, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.001, train_time=2.703 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 08:36:42,581 (trainer:732) INFO: 6epoch:train:9051-10860batch: iter_time=2.839e-04, forward_time=0.203, loss_att=80.195, acc=0.929, loss=80.195, backward_time=0.297, grad_norm=99.322, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.001, train_time=2.684 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 08:56:56,490 (trainer:732) INFO: 6epoch:train:10861-12670batch: iter_time=2.896e-04, forward_time=0.203, loss_att=79.219, acc=0.930, loss=79.219, backward_time=0.297, grad_norm=96.671, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.001, train_time=2.683 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 09:17:12,111 (trainer:732) INFO: 6epoch:train:12671-14480batch: iter_time=2.909e-04, forward_time=0.203, loss_att=80.711, acc=0.930, loss=80.711, backward_time=0.298, grad_norm=102.602, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.001, train_time=2.686 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 09:37:29,041 (trainer:732) INFO: 6epoch:train:14481-16290batch: iter_time=2.841e-04, forward_time=0.203, loss_att=80.247, acc=0.931, loss=80.247, backward_time=0.298, grad_norm=98.854, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.001, train_time=2.689 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 09:57:47,642 (trainer:732) INFO: 6epoch:train:16291-18100batch: iter_time=2.787e-04, forward_time=0.204, loss_att=80.535, acc=0.931, loss=80.535, backward_time=0.298, grad_norm=103.203, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.001, train_time=2.693 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 10:18:01,735 (trainer:732) INFO: 6epoch:train:18101-19910batch: iter_time=2.846e-04, forward_time=0.203, loss_att=79.556, acc=0.931, loss=79.556, backward_time=0.298, grad_norm=103.658, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.683 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 10:38:18,736 (trainer:732) INFO: 6epoch:train:19911-21720batch: iter_time=2.818e-04, forward_time=0.203, loss_att=79.315, acc=0.931, loss=79.315, backward_time=0.298, grad_norm=101.395, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.001, train_time=2.689 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 10:58:35,165 (trainer:732) INFO: 6epoch:train:21721-23530batch: iter_time=2.828e-04, forward_time=0.203, loss_att=78.338, acc=0.932, loss=78.338, backward_time=0.298, grad_norm=98.455, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.001, train_time=2.688 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 11:18:52,498 (trainer:732) INFO: 6epoch:train:23531-25340batch: iter_time=2.849e-04, forward_time=0.204, loss_att=78.238, acc=0.932, loss=78.238, backward_time=0.297, grad_norm=100.559, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.001, train_time=2.690 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 11:39:08,969 (trainer:732) INFO: 6epoch:train:25341-27150batch: iter_time=2.869e-04, forward_time=0.204, loss_att=78.897, acc=0.932, loss=78.897, backward_time=0.298, grad_norm=101.709, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=0.001, train_time=2.688 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 11:59:19,630 (trainer:732) INFO: 6epoch:train:27151-28960batch: iter_time=2.754e-04, forward_time=0.202, loss_att=77.714, acc=0.933, loss=77.714, backward_time=0.297, grad_norm=99.027, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.675 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 12:19:28,798 (trainer:732) INFO: 6epoch:train:28961-30770batch: iter_time=2.761e-04, forward_time=0.202, loss_att=78.181, acc=0.932, loss=78.181, backward_time=0.297, grad_norm=96.316, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 12:39:42,620 (trainer:732) INFO: 6epoch:train:30771-32580batch: iter_time=2.869e-04, forward_time=0.203, loss_att=77.462, acc=0.933, loss=77.462, backward_time=0.298, grad_norm=98.728, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.682 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 12:59:52,467 (trainer:732) INFO: 6epoch:train:32581-34390batch: iter_time=2.887e-04, forward_time=0.203, loss_att=77.186, acc=0.933, loss=77.186, backward_time=0.296, grad_norm=96.088, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.001, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 13:19:58,545 (trainer:732) INFO: 6epoch:train:34391-36200batch: iter_time=2.734e-04, forward_time=0.202, loss_att=76.570, acc=0.933, loss=76.570, backward_time=0.295, grad_norm=96.860, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.665 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 13:28:18,754 (trainer:338) INFO: 6epoch results: [train] iter_time=3.233e-04, forward_time=0.203, loss_att=79.501, acc=0.931, loss=79.501, backward_time=0.297, grad_norm=100.011, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.001, train_time=2.717, time=6 hours, 50 minutes and 11.62 seconds, total_count=217266, gpu_max_cached_mem_GB=29.945, [valid] loss_att=18.731, acc=0.965, cer=0.042, wer=0.161, loss=18.731, time=4 minutes and 31.31 seconds, total_count=336, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 32.78 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 13:28:22,850 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 13:28:22,853 (trainer:272) INFO: 7/60epoch started. Estimated time to finish: 2 weeks, 1 day and 17 hours +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 13:53:29,740 (trainer:732) INFO: 7epoch:train:1-1810batch: iter_time=0.001, forward_time=0.207, loss_att=74.849, acc=0.935, loss=74.849, backward_time=0.296, grad_norm=97.710, clip=100.000, loss_scale=1.000, optim_step_time=0.077, optim0_lr0=0.001, train_time=3.330 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 14:13:53,226 (trainer:732) INFO: 7epoch:train:1811-3620batch: iter_time=3.687e-04, forward_time=0.206, loss_att=75.285, acc=0.934, loss=75.285, backward_time=0.295, grad_norm=102.403, clip=100.000, loss_scale=1.000, optim_step_time=0.075, optim0_lr0=0.001, train_time=2.703 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 14:34:02,861 (trainer:732) INFO: 7epoch:train:3621-5430batch: iter_time=2.894e-04, forward_time=0.203, loss_att=75.654, acc=0.934, loss=75.654, backward_time=0.295, grad_norm=98.545, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 14:54:06,286 (trainer:732) INFO: 7epoch:train:5431-7240batch: iter_time=2.599e-04, forward_time=0.202, loss_att=75.314, acc=0.934, loss=75.314, backward_time=0.294, grad_norm=97.152, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.659 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 15:14:13,469 (trainer:732) INFO: 7epoch:train:7241-9050batch: iter_time=2.683e-04, forward_time=0.202, loss_att=75.199, acc=0.935, loss=75.199, backward_time=0.295, grad_norm=98.848, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.668 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 15:34:18,823 (trainer:732) INFO: 7epoch:train:9051-10860batch: iter_time=2.578e-04, forward_time=0.202, loss_att=75.396, acc=0.935, loss=75.396, backward_time=0.295, grad_norm=97.298, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=0.001, train_time=2.663 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 15:54:23,767 (trainer:732) INFO: 7epoch:train:10861-12670batch: iter_time=2.614e-04, forward_time=0.202, loss_att=76.217, acc=0.934, loss=76.217, backward_time=0.294, grad_norm=103.638, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.663 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 16:14:29,986 (trainer:732) INFO: 7epoch:train:12671-14480batch: iter_time=2.662e-04, forward_time=0.202, loss_att=74.922, acc=0.935, loss=74.922, backward_time=0.294, grad_norm=97.174, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.665 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 16:35:00,175 (trainer:732) INFO: 7epoch:train:14481-16290batch: iter_time=3.626e-04, forward_time=0.207, loss_att=75.483, acc=0.935, loss=75.483, backward_time=0.297, grad_norm=99.775, clip=100.000, loss_scale=1.000, optim_step_time=0.076, optim0_lr0=0.001, train_time=2.718 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 16:55:23,812 (trainer:732) INFO: 7epoch:train:16291-18100batch: iter_time=3.779e-04, forward_time=0.206, loss_att=74.432, acc=0.935, loss=74.432, backward_time=0.296, grad_norm=100.774, clip=100.000, loss_scale=1.000, optim_step_time=0.073, optim0_lr0=0.001, train_time=2.703 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 17:15:38,158 (trainer:732) INFO: 7epoch:train:18101-19910batch: iter_time=2.846e-04, forward_time=0.203, loss_att=74.863, acc=0.936, loss=74.863, backward_time=0.296, grad_norm=98.899, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.684 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 17:35:43,131 (trainer:732) INFO: 7epoch:train:19911-21720batch: iter_time=2.819e-04, forward_time=0.202, loss_att=74.155, acc=0.935, loss=74.155, backward_time=0.294, grad_norm=99.976, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.662 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 17:55:45,600 (trainer:732) INFO: 7epoch:train:21721-23530batch: iter_time=2.854e-04, forward_time=0.202, loss_att=74.140, acc=0.935, loss=74.140, backward_time=0.294, grad_norm=97.234, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.657 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 18:15:54,928 (trainer:732) INFO: 7epoch:train:23531-25340batch: iter_time=2.844e-04, forward_time=0.202, loss_att=74.206, acc=0.936, loss=74.206, backward_time=0.295, grad_norm=102.138, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 18:36:06,410 (trainer:732) INFO: 7epoch:train:25341-27150batch: iter_time=2.776e-04, forward_time=0.203, loss_att=74.075, acc=0.936, loss=74.075, backward_time=0.296, grad_norm=99.331, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.677 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 18:56:11,259 (trainer:732) INFO: 7epoch:train:27151-28960batch: iter_time=2.733e-04, forward_time=0.202, loss_att=73.092, acc=0.936, loss=73.092, backward_time=0.294, grad_norm=101.147, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.662 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 19:16:18,784 (trainer:732) INFO: 7epoch:train:28961-30770batch: iter_time=2.720e-04, forward_time=0.202, loss_att=72.774, acc=0.937, loss=72.774, backward_time=0.295, grad_norm=99.928, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.669 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 19:36:29,173 (trainer:732) INFO: 7epoch:train:30771-32580batch: iter_time=2.709e-04, forward_time=0.203, loss_att=72.933, acc=0.937, loss=72.933, backward_time=0.296, grad_norm=97.479, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 19:56:37,177 (trainer:732) INFO: 7epoch:train:32581-34390batch: iter_time=2.797e-04, forward_time=0.202, loss_att=74.721, acc=0.936, loss=74.721, backward_time=0.295, grad_norm=99.840, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.670 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 20:16:48,025 (trainer:732) INFO: 7epoch:train:34391-36200batch: iter_time=2.894e-04, forward_time=0.203, loss_att=73.868, acc=0.936, loss=73.868, backward_time=0.295, grad_norm=98.936, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.001, train_time=2.675 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 20:25:55,919 (trainer:338) INFO: 7epoch results: [train] iter_time=3.450e-04, forward_time=0.203, loss_att=74.577, acc=0.935, loss=74.577, backward_time=0.295, grad_norm=99.407, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.001, train_time=2.707, time=6 hours, 48 minutes and 47.95 seconds, total_count=253477, gpu_max_cached_mem_GB=29.945, [valid] loss_att=18.372, acc=0.966, cer=0.039, wer=0.153, loss=18.372, time=5 minutes and 3.55 seconds, total_count=392, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 41.57 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 20:26:00,672 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 20:26:00,676 (trainer:272) INFO: 8/60epoch started. Estimated time to finish: 2 weeks, 1 day and 10 hours +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 20:51:15,938 (trainer:732) INFO: 8epoch:train:1-1810batch: iter_time=0.001, forward_time=0.205, loss_att=72.315, acc=0.938, loss=72.315, backward_time=0.296, grad_norm=100.866, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=0.001, train_time=3.349 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 21:11:29,992 (trainer:732) INFO: 8epoch:train:1811-3620batch: iter_time=3.271e-04, forward_time=0.204, loss_att=71.542, acc=0.937, loss=71.542, backward_time=0.295, grad_norm=98.715, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=0.001, train_time=2.682 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 21:31:38,055 (trainer:732) INFO: 8epoch:train:3621-5430batch: iter_time=3.063e-04, forward_time=0.203, loss_att=71.128, acc=0.937, loss=71.128, backward_time=0.294, grad_norm=96.502, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=0.001, train_time=2.670 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 21:51:52,041 (trainer:732) INFO: 8epoch:train:5431-7240batch: iter_time=2.930e-04, forward_time=0.203, loss_att=71.847, acc=0.938, loss=71.847, backward_time=0.296, grad_norm=99.659, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.001, train_time=2.682 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 22:12:02,075 (trainer:732) INFO: 8epoch:train:7241-9050batch: iter_time=2.925e-04, forward_time=0.203, loss_att=70.841, acc=0.938, loss=70.841, backward_time=0.295, grad_norm=95.286, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 22:32:11,273 (trainer:732) INFO: 8epoch:train:9051-10860batch: iter_time=2.884e-04, forward_time=0.203, loss_att=70.527, acc=0.938, loss=70.527, backward_time=0.295, grad_norm=98.656, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.001, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 22:52:19,736 (trainer:732) INFO: 8epoch:train:10861-12670batch: iter_time=2.839e-04, forward_time=0.202, loss_att=71.494, acc=0.938, loss=71.494, backward_time=0.295, grad_norm=100.543, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=0.001, train_time=2.671 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 23:12:32,031 (trainer:732) INFO: 8epoch:train:12671-14480batch: iter_time=2.813e-04, forward_time=0.203, loss_att=71.196, acc=0.939, loss=71.196, backward_time=0.296, grad_norm=96.901, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.678 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 23:32:38,942 (trainer:732) INFO: 8epoch:train:14481-16290batch: iter_time=2.825e-04, forward_time=0.202, loss_att=71.038, acc=0.938, loss=71.038, backward_time=0.294, grad_norm=98.147, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.667 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-22 23:55:33,747 (trainer:732) INFO: 8epoch:train:16291-18100batch: iter_time=5.129e-04, forward_time=0.213, loss_att=71.991, acc=0.938, loss=71.991, backward_time=0.301, grad_norm=98.971, clip=100.000, loss_scale=1.000, optim_step_time=0.090, optim0_lr0=0.001, train_time=3.037 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 00:21:16,006 (trainer:732) INFO: 8epoch:train:18101-19910batch: iter_time=5.942e-04, forward_time=0.216, loss_att=71.218, acc=0.939, loss=71.218, backward_time=0.303, grad_norm=97.581, clip=100.000, loss_scale=1.000, optim_step_time=0.095, optim0_lr0=0.001, train_time=3.408 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 00:50:00,863 (trainer:732) INFO: 8epoch:train:19911-21720batch: iter_time=5.398e-04, forward_time=0.216, loss_att=70.751, acc=0.938, loss=70.751, backward_time=0.304, grad_norm=99.191, clip=100.000, loss_scale=1.000, optim_step_time=0.098, optim0_lr0=0.001, train_time=3.810 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 01:14:17,949 (trainer:732) INFO: 8epoch:train:21721-23530batch: iter_time=4.712e-04, forward_time=0.213, loss_att=70.255, acc=0.939, loss=70.255, backward_time=0.303, grad_norm=99.867, clip=100.000, loss_scale=1.000, optim_step_time=0.087, optim0_lr0=0.001, train_time=3.219 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 01:42:06,455 (trainer:732) INFO: 8epoch:train:23531-25340batch: iter_time=5.615e-04, forward_time=0.220, loss_att=70.729, acc=0.939, loss=70.729, backward_time=0.305, grad_norm=99.794, clip=100.000, loss_scale=1.000, optim_step_time=0.104, optim0_lr0=0.001, train_time=3.687 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 02:06:43,579 (trainer:732) INFO: 8epoch:train:25341-27150batch: iter_time=4.853e-04, forward_time=0.214, loss_att=70.784, acc=0.938, loss=70.784, backward_time=0.302, grad_norm=94.370, clip=100.000, loss_scale=1.000, optim_step_time=0.094, optim0_lr0=0.001, train_time=3.265 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 02:27:11,485 (trainer:732) INFO: 8epoch:train:27151-28960batch: iter_time=3.660e-04, forward_time=0.205, loss_att=70.573, acc=0.939, loss=70.573, backward_time=0.297, grad_norm=99.985, clip=100.000, loss_scale=1.000, optim_step_time=0.071, optim0_lr0=0.001, train_time=2.713 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 02:47:33,983 (trainer:732) INFO: 8epoch:train:28961-30770batch: iter_time=3.222e-04, forward_time=0.205, loss_att=69.424, acc=0.939, loss=69.424, backward_time=0.297, grad_norm=100.019, clip=100.000, loss_scale=1.000, optim_step_time=0.070, optim0_lr0=0.001, train_time=2.701 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 03:07:51,096 (trainer:732) INFO: 8epoch:train:30771-32580batch: iter_time=2.964e-04, forward_time=0.204, loss_att=70.567, acc=0.939, loss=70.567, backward_time=0.296, grad_norm=96.832, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=0.001, train_time=2.689 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 03:28:00,441 (trainer:732) INFO: 8epoch:train:32581-34390batch: iter_time=2.742e-04, forward_time=0.203, loss_att=69.315, acc=0.940, loss=69.315, backward_time=0.295, grad_norm=101.492, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 03:48:12,319 (trainer:732) INFO: 8epoch:train:34391-36200batch: iter_time=2.711e-04, forward_time=0.203, loss_att=69.633, acc=0.940, loss=69.633, backward_time=0.296, grad_norm=98.687, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.678 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 03:57:18,999 (trainer:338) INFO: 8epoch results: [train] iter_time=4.027e-04, forward_time=0.207, loss_att=70.862, acc=0.938, loss=70.862, backward_time=0.298, grad_norm=98.603, clip=100.000, loss_scale=1.000, optim_step_time=0.075, optim0_lr0=0.001, train_time=2.931, time=7 hours, 22 minutes and 28.28 seconds, total_count=289688, gpu_max_cached_mem_GB=29.945, [valid] loss_att=17.125, acc=0.968, cer=0.037, wer=0.144, loss=17.125, time=5 minutes and 15.8 seconds, total_count=448, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 34.24 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 03:57:22,877 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 03:57:22,881 (trainer:272) INFO: 9/60epoch started. Estimated time to finish: 2 weeks, 1 day and 7 hours +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 04:22:27,000 (trainer:732) INFO: 9epoch:train:1-1810batch: iter_time=0.001, forward_time=0.203, loss_att=68.271, acc=0.941, loss=68.271, backward_time=0.295, grad_norm=99.837, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=3.324 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 04:42:33,477 (trainer:732) INFO: 9epoch:train:1811-3620batch: iter_time=2.798e-04, forward_time=0.202, loss_att=68.472, acc=0.941, loss=68.472, backward_time=0.294, grad_norm=98.543, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.666 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 05:02:42,731 (trainer:732) INFO: 9epoch:train:3621-5430batch: iter_time=2.731e-04, forward_time=0.202, loss_att=68.296, acc=0.941, loss=68.296, backward_time=0.295, grad_norm=102.736, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 05:22:52,799 (trainer:732) INFO: 9epoch:train:5431-7240batch: iter_time=2.719e-04, forward_time=0.203, loss_att=68.924, acc=0.940, loss=68.924, backward_time=0.295, grad_norm=102.042, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 05:43:00,600 (trainer:732) INFO: 9epoch:train:7241-9050batch: iter_time=2.778e-04, forward_time=0.202, loss_att=67.616, acc=0.941, loss=67.616, backward_time=0.294, grad_norm=95.859, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.669 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 06:03:10,203 (trainer:732) INFO: 9epoch:train:9051-10860batch: iter_time=2.751e-04, forward_time=0.202, loss_att=67.999, acc=0.941, loss=67.999, backward_time=0.295, grad_norm=105.029, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 06:23:14,098 (trainer:732) INFO: 9epoch:train:10861-12670batch: iter_time=2.753e-04, forward_time=0.202, loss_att=67.171, acc=0.941, loss=67.171, backward_time=0.294, grad_norm=94.234, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.660 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 06:43:22,358 (trainer:732) INFO: 9epoch:train:12671-14480batch: iter_time=2.797e-04, forward_time=0.202, loss_att=67.886, acc=0.941, loss=67.886, backward_time=0.295, grad_norm=99.531, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.670 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 07:03:32,939 (trainer:732) INFO: 9epoch:train:14481-16290batch: iter_time=2.754e-04, forward_time=0.203, loss_att=67.865, acc=0.941, loss=67.865, backward_time=0.296, grad_norm=106.174, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.675 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 07:23:38,181 (trainer:732) INFO: 9epoch:train:16291-18100batch: iter_time=2.660e-04, forward_time=0.202, loss_att=68.483, acc=0.940, loss=68.483, backward_time=0.294, grad_norm=96.616, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.663 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 07:43:47,739 (trainer:732) INFO: 9epoch:train:18101-19910batch: iter_time=2.693e-04, forward_time=0.203, loss_att=68.292, acc=0.941, loss=68.292, backward_time=0.295, grad_norm=97.793, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 08:03:54,148 (trainer:732) INFO: 9epoch:train:19911-21720batch: iter_time=2.693e-04, forward_time=0.202, loss_att=67.876, acc=0.941, loss=67.876, backward_time=0.294, grad_norm=98.373, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.666 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 08:24:02,613 (trainer:732) INFO: 9epoch:train:21721-23530batch: iter_time=2.603e-04, forward_time=0.203, loss_att=68.136, acc=0.941, loss=68.136, backward_time=0.295, grad_norm=97.999, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.671 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 08:44:12,140 (trainer:732) INFO: 9epoch:train:23531-25340batch: iter_time=2.689e-04, forward_time=0.202, loss_att=67.778, acc=0.941, loss=67.778, backward_time=0.295, grad_norm=95.510, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 09:04:18,823 (trainer:732) INFO: 9epoch:train:25341-27150batch: iter_time=2.690e-04, forward_time=0.202, loss_att=66.678, acc=0.941, loss=66.678, backward_time=0.295, grad_norm=96.923, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=0.001, train_time=2.667 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 09:24:26,527 (trainer:732) INFO: 9epoch:train:27151-28960batch: iter_time=2.670e-04, forward_time=0.202, loss_att=67.071, acc=0.942, loss=67.071, backward_time=0.295, grad_norm=97.540, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.668 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 09:44:34,346 (trainer:732) INFO: 9epoch:train:28961-30770batch: iter_time=2.647e-04, forward_time=0.202, loss_att=68.262, acc=0.941, loss=68.262, backward_time=0.295, grad_norm=98.108, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.669 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 10:04:45,185 (trainer:732) INFO: 9epoch:train:30771-32580batch: iter_time=2.640e-04, forward_time=0.203, loss_att=67.656, acc=0.942, loss=67.656, backward_time=0.296, grad_norm=94.403, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=9.979e-04, train_time=2.675 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 10:24:53,387 (trainer:732) INFO: 9epoch:train:32581-34390batch: iter_time=2.796e-04, forward_time=0.202, loss_att=67.300, acc=0.942, loss=67.300, backward_time=0.295, grad_norm=99.010, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=9.951e-04, train_time=2.670 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 10:45:05,263 (trainer:732) INFO: 9epoch:train:34391-36200batch: iter_time=2.763e-04, forward_time=0.203, loss_att=67.668, acc=0.942, loss=67.668, backward_time=0.296, grad_norm=99.437, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=9.923e-04, train_time=2.677 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 10:54:27,609 (trainer:338) INFO: 9epoch results: [train] iter_time=3.158e-04, forward_time=0.202, loss_att=67.885, acc=0.941, loss=67.885, backward_time=0.295, grad_norm=98.781, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=0.001, train_time=2.703, time=6 hours, 48 minutes and 4.87 seconds, total_count=325899, gpu_max_cached_mem_GB=29.945, [valid] loss_att=15.542, acc=0.971, cer=0.035, wer=0.135, loss=15.542, time=5 minutes and 21.96 seconds, total_count=504, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 37.9 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 10:54:32,738 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 10:54:32,745 (trainer:272) INFO: 10/60epoch started. Estimated time to finish: 2 weeks, 23 hours and 21 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 11:19:43,711 (trainer:732) INFO: 10epoch:train:1-1810batch: iter_time=9.584e-04, forward_time=0.204, loss_att=65.152, acc=0.943, loss=65.152, backward_time=0.295, grad_norm=99.477, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=9.896e-04, train_time=3.339 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 11:40:00,998 (trainer:732) INFO: 10epoch:train:1811-3620batch: iter_time=3.294e-04, forward_time=0.204, loss_att=65.523, acc=0.943, loss=65.523, backward_time=0.296, grad_norm=96.438, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=9.868e-04, train_time=2.689 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 12:00:12,676 (trainer:732) INFO: 10epoch:train:3621-5430batch: iter_time=3.014e-04, forward_time=0.203, loss_att=65.657, acc=0.942, loss=65.657, backward_time=0.295, grad_norm=96.890, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=9.841e-04, train_time=2.678 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 12:20:23,289 (trainer:732) INFO: 10epoch:train:5431-7240batch: iter_time=2.853e-04, forward_time=0.203, loss_att=65.623, acc=0.943, loss=65.623, backward_time=0.295, grad_norm=99.990, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=9.814e-04, train_time=2.675 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 12:40:34,187 (trainer:732) INFO: 10epoch:train:7241-9050batch: iter_time=2.852e-04, forward_time=0.203, loss_att=65.537, acc=0.943, loss=65.537, backward_time=0.295, grad_norm=97.055, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=9.788e-04, train_time=2.676 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 13:00:47,355 (trainer:732) INFO: 10epoch:train:9051-10860batch: iter_time=2.821e-04, forward_time=0.203, loss_att=65.933, acc=0.943, loss=65.933, backward_time=0.295, grad_norm=96.739, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=9.761e-04, train_time=2.681 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 13:21:00,311 (trainer:732) INFO: 10epoch:train:10861-12670batch: iter_time=3.001e-04, forward_time=0.203, loss_att=64.998, acc=0.944, loss=64.998, backward_time=0.296, grad_norm=97.317, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=9.735e-04, train_time=2.681 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 13:41:15,269 (trainer:732) INFO: 10epoch:train:12671-14480batch: iter_time=3.101e-04, forward_time=0.204, loss_att=66.219, acc=0.943, loss=66.219, backward_time=0.296, grad_norm=98.284, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=9.709e-04, train_time=2.684 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 14:01:30,378 (trainer:732) INFO: 10epoch:train:14481-16290batch: iter_time=3.204e-04, forward_time=0.204, loss_att=65.585, acc=0.943, loss=65.585, backward_time=0.295, grad_norm=97.220, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=9.683e-04, train_time=2.685 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 14:21:42,150 (trainer:732) INFO: 10epoch:train:16291-18100batch: iter_time=3.244e-04, forward_time=0.206, loss_att=66.197, acc=0.943, loss=66.197, backward_time=0.295, grad_norm=99.359, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=9.658e-04, train_time=2.677 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 14:41:57,794 (trainer:732) INFO: 10epoch:train:18101-19910batch: iter_time=3.486e-04, forward_time=0.206, loss_att=65.678, acc=0.943, loss=65.678, backward_time=0.296, grad_norm=100.502, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=9.633e-04, train_time=2.686 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 15:02:13,560 (trainer:732) INFO: 10epoch:train:19911-21720batch: iter_time=3.461e-04, forward_time=0.206, loss_att=65.473, acc=0.943, loss=65.473, backward_time=0.296, grad_norm=94.675, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=9.607e-04, train_time=2.686 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 15:22:29,655 (trainer:732) INFO: 10epoch:train:21721-23530batch: iter_time=3.195e-04, forward_time=0.205, loss_att=65.034, acc=0.943, loss=65.034, backward_time=0.295, grad_norm=96.575, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=9.582e-04, train_time=2.687 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 15:42:39,378 (trainer:732) INFO: 10epoch:train:23531-25340batch: iter_time=3.327e-04, forward_time=0.205, loss_att=64.645, acc=0.943, loss=64.645, backward_time=0.294, grad_norm=96.084, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=9.558e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 16:02:55,989 (trainer:732) INFO: 10epoch:train:25341-27150batch: iter_time=3.306e-04, forward_time=0.206, loss_att=66.151, acc=0.943, loss=66.151, backward_time=0.296, grad_norm=94.548, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=9.533e-04, train_time=2.689 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 16:23:09,953 (trainer:732) INFO: 10epoch:train:27151-28960batch: iter_time=3.227e-04, forward_time=0.205, loss_att=64.911, acc=0.944, loss=64.911, backward_time=0.296, grad_norm=101.235, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=9.509e-04, train_time=2.682 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 16:43:21,596 (trainer:732) INFO: 10epoch:train:28961-30770batch: iter_time=3.297e-04, forward_time=0.205, loss_att=64.690, acc=0.943, loss=64.690, backward_time=0.295, grad_norm=101.595, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=9.484e-04, train_time=2.678 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 17:03:35,491 (trainer:732) INFO: 10epoch:train:30771-32580batch: iter_time=3.309e-04, forward_time=0.206, loss_att=65.793, acc=0.943, loss=65.793, backward_time=0.296, grad_norm=99.751, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=9.460e-04, train_time=2.682 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 17:23:43,393 (trainer:732) INFO: 10epoch:train:32581-34390batch: iter_time=3.111e-04, forward_time=0.205, loss_att=64.413, acc=0.943, loss=64.413, backward_time=0.294, grad_norm=100.124, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=9.436e-04, train_time=2.669 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 17:43:54,308 (trainer:732) INFO: 10epoch:train:34391-36200batch: iter_time=3.319e-04, forward_time=0.205, loss_att=65.683, acc=0.943, loss=65.683, backward_time=0.295, grad_norm=100.240, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=9.413e-04, train_time=2.675 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 17:53:20,005 (trainer:338) INFO: 10epoch results: [train] iter_time=3.500e-04, forward_time=0.205, loss_att=65.440, acc=0.943, loss=65.440, backward_time=0.295, grad_norm=98.207, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=9.648e-04, train_time=2.714, time=6 hours, 49 minutes and 42.45 seconds, total_count=362110, gpu_max_cached_mem_GB=29.945, [valid] loss_att=15.606, acc=0.971, cer=0.033, wer=0.129, loss=15.606, time=5 minutes and 9.95 seconds, total_count=560, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 54.86 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 17:53:25,516 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 17:53:25,522 (trainer:272) INFO: 11/60epoch started. Estimated time to finish: 2 weeks, 15 hours and 58 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 18:18:34,013 (trainer:732) INFO: 11epoch:train:1-1810batch: iter_time=0.001, forward_time=0.206, loss_att=64.088, acc=0.944, loss=64.088, backward_time=0.296, grad_norm=100.521, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=9.389e-04, train_time=3.334 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 18:38:54,486 (trainer:732) INFO: 11epoch:train:1811-3620batch: iter_time=3.469e-04, forward_time=0.207, loss_att=63.243, acc=0.945, loss=63.243, backward_time=0.296, grad_norm=99.823, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=9.366e-04, train_time=2.696 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 18:59:14,305 (trainer:732) INFO: 11epoch:train:3621-5430batch: iter_time=3.170e-04, forward_time=0.206, loss_att=63.337, acc=0.945, loss=63.337, backward_time=0.296, grad_norm=99.871, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=9.343e-04, train_time=2.696 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 19:19:28,085 (trainer:732) INFO: 11epoch:train:5431-7240batch: iter_time=3.287e-04, forward_time=0.206, loss_att=63.987, acc=0.945, loss=63.987, backward_time=0.295, grad_norm=99.143, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=9.320e-04, train_time=2.681 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 19:39:50,895 (trainer:732) INFO: 11epoch:train:7241-9050batch: iter_time=3.476e-04, forward_time=0.207, loss_att=64.385, acc=0.944, loss=64.385, backward_time=0.296, grad_norm=95.444, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=9.297e-04, train_time=2.703 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 20:00:10,326 (trainer:732) INFO: 11epoch:train:9051-10860batch: iter_time=3.518e-04, forward_time=0.207, loss_att=63.459, acc=0.945, loss=63.459, backward_time=0.296, grad_norm=96.679, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=9.274e-04, train_time=2.694 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 20:20:21,509 (trainer:732) INFO: 11epoch:train:10861-12670batch: iter_time=3.315e-04, forward_time=0.205, loss_att=63.796, acc=0.944, loss=63.796, backward_time=0.294, grad_norm=96.217, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=9.252e-04, train_time=2.677 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 20:40:39,161 (trainer:732) INFO: 11epoch:train:12671-14480batch: iter_time=3.605e-04, forward_time=0.206, loss_att=62.965, acc=0.945, loss=62.965, backward_time=0.295, grad_norm=94.953, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=9.229e-04, train_time=2.690 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 21:00:59,512 (trainer:732) INFO: 11epoch:train:14481-16290batch: iter_time=3.390e-04, forward_time=0.207, loss_att=63.723, acc=0.945, loss=63.723, backward_time=0.296, grad_norm=105.498, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=9.207e-04, train_time=2.697 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 21:21:11,029 (trainer:732) INFO: 11epoch:train:16291-18100batch: iter_time=3.248e-04, forward_time=0.205, loss_att=63.496, acc=0.944, loss=63.496, backward_time=0.295, grad_norm=96.719, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=9.185e-04, train_time=2.677 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 21:41:28,875 (trainer:732) INFO: 11epoch:train:18101-19910batch: iter_time=3.499e-04, forward_time=0.207, loss_att=63.315, acc=0.945, loss=63.315, backward_time=0.296, grad_norm=98.138, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=9.164e-04, train_time=2.691 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 22:01:43,709 (trainer:732) INFO: 11epoch:train:19911-21720batch: iter_time=3.447e-04, forward_time=0.206, loss_att=63.447, acc=0.945, loss=63.447, backward_time=0.295, grad_norm=100.878, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=9.142e-04, train_time=2.684 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 22:22:11,896 (trainer:732) INFO: 11epoch:train:21721-23530batch: iter_time=3.904e-04, forward_time=0.210, loss_att=64.940, acc=0.944, loss=64.940, backward_time=0.296, grad_norm=100.917, clip=100.000, loss_scale=1.000, optim_step_time=0.072, optim0_lr0=9.120e-04, train_time=2.713 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 22:42:41,628 (trainer:732) INFO: 11epoch:train:23531-25340batch: iter_time=3.943e-04, forward_time=0.209, loss_att=62.006, acc=0.945, loss=62.006, backward_time=0.295, grad_norm=97.448, clip=100.000, loss_scale=1.000, optim_step_time=0.074, optim0_lr0=9.099e-04, train_time=2.717 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 23:03:09,083 (trainer:732) INFO: 11epoch:train:25341-27150batch: iter_time=4.029e-04, forward_time=0.209, loss_att=62.321, acc=0.945, loss=62.321, backward_time=0.295, grad_norm=101.995, clip=100.000, loss_scale=1.000, optim_step_time=0.074, optim0_lr0=9.078e-04, train_time=2.712 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 23:23:38,009 (trainer:732) INFO: 11epoch:train:27151-28960batch: iter_time=3.975e-04, forward_time=0.209, loss_att=63.389, acc=0.945, loss=63.389, backward_time=0.296, grad_norm=100.772, clip=100.000, loss_scale=1.000, optim_step_time=0.072, optim0_lr0=9.057e-04, train_time=2.715 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-23 23:44:11,522 (trainer:732) INFO: 11epoch:train:28961-30770batch: iter_time=3.963e-04, forward_time=0.210, loss_att=63.463, acc=0.945, loss=63.463, backward_time=0.297, grad_norm=99.883, clip=100.000, loss_scale=1.000, optim_step_time=0.074, optim0_lr0=9.036e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 00:04:44,008 (trainer:732) INFO: 11epoch:train:30771-32580batch: iter_time=4.258e-04, forward_time=0.210, loss_att=62.971, acc=0.945, loss=62.971, backward_time=0.297, grad_norm=100.713, clip=100.000, loss_scale=1.000, optim_step_time=0.073, optim0_lr0=9.015e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 00:25:06,892 (trainer:732) INFO: 11epoch:train:32581-34390batch: iter_time=3.777e-04, forward_time=0.208, loss_att=63.024, acc=0.945, loss=63.024, backward_time=0.295, grad_norm=102.175, clip=100.000, loss_scale=1.000, optim_step_time=0.071, optim0_lr0=8.994e-04, train_time=2.702 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 00:45:37,145 (trainer:732) INFO: 11epoch:train:34391-36200batch: iter_time=3.842e-04, forward_time=0.209, loss_att=63.291, acc=0.946, loss=63.291, backward_time=0.297, grad_norm=100.825, clip=100.000, loss_scale=1.000, optim_step_time=0.073, optim0_lr0=8.974e-04, train_time=2.718 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 00:54:53,307 (trainer:338) INFO: 11epoch results: [train] iter_time=3.963e-04, forward_time=0.208, loss_att=63.433, acc=0.945, loss=63.433, backward_time=0.296, grad_norm=99.436, clip=100.000, loss_scale=1.000, optim_step_time=0.068, optim0_lr0=9.177e-04, train_time=2.732, time=6 hours, 52 minutes and 38.1 seconds, total_count=398321, gpu_max_cached_mem_GB=29.945, [valid] loss_att=14.570, acc=0.973, cer=0.033, wer=0.128, loss=14.570, time=5 minutes and 18.05 seconds, total_count=616, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 31.63 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 00:54:57,413 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 00:54:57,419 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/1epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 00:54:57,420 (trainer:272) INFO: 12/60epoch started. Estimated time to finish: 2 weeks, 8 hours and 52 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 01:20:08,608 (trainer:732) INFO: 12epoch:train:1-1810batch: iter_time=0.002, forward_time=0.206, loss_att=62.758, acc=0.946, loss=62.758, backward_time=0.296, grad_norm=101.979, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=8.953e-04, train_time=3.340 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 01:40:22,345 (trainer:732) INFO: 12epoch:train:1811-3620batch: iter_time=3.392e-04, forward_time=0.205, loss_att=61.927, acc=0.946, loss=61.927, backward_time=0.295, grad_norm=97.016, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=8.933e-04, train_time=2.682 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 02:00:35,963 (trainer:732) INFO: 12epoch:train:3621-5430batch: iter_time=3.202e-04, forward_time=0.205, loss_att=62.145, acc=0.946, loss=62.145, backward_time=0.295, grad_norm=95.262, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=8.913e-04, train_time=2.682 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 02:20:50,409 (trainer:732) INFO: 12epoch:train:5431-7240batch: iter_time=3.096e-04, forward_time=0.205, loss_att=61.624, acc=0.946, loss=61.624, backward_time=0.295, grad_norm=98.879, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=8.893e-04, train_time=2.683 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 02:41:08,088 (trainer:732) INFO: 12epoch:train:7241-9050batch: iter_time=3.206e-04, forward_time=0.206, loss_att=62.234, acc=0.946, loss=62.234, backward_time=0.296, grad_norm=91.276, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=8.873e-04, train_time=2.691 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 03:01:27,331 (trainer:732) INFO: 12epoch:train:9051-10860batch: iter_time=3.242e-04, forward_time=0.206, loss_att=62.067, acc=0.946, loss=62.067, backward_time=0.296, grad_norm=95.552, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=8.853e-04, train_time=2.693 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 03:21:38,991 (trainer:732) INFO: 12epoch:train:10861-12670batch: iter_time=3.077e-04, forward_time=0.205, loss_att=62.472, acc=0.946, loss=62.472, backward_time=0.295, grad_norm=98.578, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=8.834e-04, train_time=2.678 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 03:41:56,827 (trainer:732) INFO: 12epoch:train:12671-14480batch: iter_time=3.275e-04, forward_time=0.205, loss_att=61.493, acc=0.946, loss=61.493, backward_time=0.295, grad_norm=96.373, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=8.814e-04, train_time=2.691 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 04:02:09,433 (trainer:732) INFO: 12epoch:train:14481-16290batch: iter_time=3.256e-04, forward_time=0.205, loss_att=60.734, acc=0.946, loss=60.734, backward_time=0.294, grad_norm=102.027, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=8.795e-04, train_time=2.680 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 04:22:22,201 (trainer:732) INFO: 12epoch:train:16291-18100batch: iter_time=3.200e-04, forward_time=0.205, loss_att=61.795, acc=0.946, loss=61.795, backward_time=0.295, grad_norm=97.470, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=8.776e-04, train_time=2.680 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 04:42:34,801 (trainer:732) INFO: 12epoch:train:18101-19910batch: iter_time=3.320e-04, forward_time=0.205, loss_att=61.386, acc=0.946, loss=61.386, backward_time=0.295, grad_norm=97.624, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=8.757e-04, train_time=2.680 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 05:02:51,653 (trainer:732) INFO: 12epoch:train:19911-21720batch: iter_time=3.231e-04, forward_time=0.206, loss_att=61.523, acc=0.946, loss=61.523, backward_time=0.296, grad_norm=96.055, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=8.738e-04, train_time=2.688 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 05:23:05,656 (trainer:732) INFO: 12epoch:train:21721-23530batch: iter_time=3.165e-04, forward_time=0.205, loss_att=61.537, acc=0.947, loss=61.537, backward_time=0.296, grad_norm=101.767, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=8.719e-04, train_time=2.683 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 05:43:27,919 (trainer:732) INFO: 12epoch:train:23531-25340batch: iter_time=3.327e-04, forward_time=0.206, loss_att=62.324, acc=0.947, loss=62.324, backward_time=0.297, grad_norm=98.372, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=8.701e-04, train_time=2.700 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 06:03:47,190 (trainer:732) INFO: 12epoch:train:25341-27150batch: iter_time=3.309e-04, forward_time=0.205, loss_att=61.046, acc=0.947, loss=61.046, backward_time=0.296, grad_norm=94.497, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=8.682e-04, train_time=2.695 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 06:24:00,300 (trainer:732) INFO: 12epoch:train:27151-28960batch: iter_time=3.134e-04, forward_time=0.205, loss_att=61.184, acc=0.947, loss=61.184, backward_time=0.295, grad_norm=100.838, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=8.663e-04, train_time=2.680 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 06:44:12,579 (trainer:732) INFO: 12epoch:train:28961-30770batch: iter_time=3.302e-04, forward_time=0.205, loss_att=61.303, acc=0.946, loss=61.303, backward_time=0.295, grad_norm=94.737, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=8.645e-04, train_time=2.679 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 07:04:25,932 (trainer:732) INFO: 12epoch:train:30771-32580batch: iter_time=3.274e-04, forward_time=0.206, loss_att=61.389, acc=0.947, loss=61.389, backward_time=0.296, grad_norm=94.758, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=8.627e-04, train_time=2.681 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 07:24:39,034 (trainer:732) INFO: 12epoch:train:32581-34390batch: iter_time=3.174e-04, forward_time=0.205, loss_att=61.788, acc=0.946, loss=61.788, backward_time=0.296, grad_norm=96.751, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=8.609e-04, train_time=2.681 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 07:44:55,040 (trainer:732) INFO: 12epoch:train:34391-36200batch: iter_time=3.174e-04, forward_time=0.206, loss_att=61.671, acc=0.946, loss=61.671, backward_time=0.295, grad_norm=96.222, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=8.591e-04, train_time=2.687 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 07:53:47,599 (trainer:338) INFO: 12epoch results: [train] iter_time=3.832e-04, forward_time=0.205, loss_att=61.719, acc=0.946, loss=61.719, backward_time=0.295, grad_norm=97.304, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=8.768e-04, train_time=2.718, time=6 hours, 50 minutes and 20.75 seconds, total_count=434532, gpu_max_cached_mem_GB=29.945, [valid] loss_att=14.339, acc=0.974, cer=0.032, wer=0.123, loss=14.339, time=4 minutes and 49.74 seconds, total_count=672, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 39.69 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 07:53:51,868 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 07:53:51,875 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/2epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 07:53:51,875 (trainer:272) INFO: 13/60epoch started. Estimated time to finish: 2 weeks, 1 hour and 36 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 08:19:15,002 (trainer:732) INFO: 13epoch:train:1-1810batch: iter_time=0.001, forward_time=0.206, loss_att=61.141, acc=0.947, loss=61.141, backward_time=0.295, grad_norm=98.213, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=8.573e-04, train_time=3.366 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 08:39:29,979 (trainer:732) INFO: 13epoch:train:1811-3620batch: iter_time=3.126e-04, forward_time=0.205, loss_att=60.025, acc=0.947, loss=60.025, backward_time=0.295, grad_norm=98.779, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=8.555e-04, train_time=2.684 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 08:59:45,129 (trainer:732) INFO: 13epoch:train:3621-5430batch: iter_time=3.167e-04, forward_time=0.205, loss_att=60.473, acc=0.947, loss=60.473, backward_time=0.296, grad_norm=97.165, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=8.537e-04, train_time=2.685 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 09:20:02,020 (trainer:732) INFO: 13epoch:train:5431-7240batch: iter_time=3.116e-04, forward_time=0.205, loss_att=60.170, acc=0.948, loss=60.170, backward_time=0.296, grad_norm=96.664, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=8.520e-04, train_time=2.689 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 09:40:18,910 (trainer:732) INFO: 13epoch:train:7241-9050batch: iter_time=3.249e-04, forward_time=0.206, loss_att=60.240, acc=0.948, loss=60.240, backward_time=0.295, grad_norm=100.577, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=8.503e-04, train_time=2.689 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 10:00:34,226 (trainer:732) INFO: 13epoch:train:9051-10860batch: iter_time=2.992e-04, forward_time=0.205, loss_att=59.796, acc=0.948, loss=59.796, backward_time=0.295, grad_norm=98.233, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=8.485e-04, train_time=2.685 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 10:20:48,916 (trainer:732) INFO: 13epoch:train:10861-12670batch: iter_time=2.997e-04, forward_time=0.205, loss_att=59.818, acc=0.948, loss=59.818, backward_time=0.295, grad_norm=96.944, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=8.468e-04, train_time=2.684 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 10:41:09,191 (trainer:732) INFO: 13epoch:train:12671-14480batch: iter_time=3.116e-04, forward_time=0.206, loss_att=60.194, acc=0.948, loss=60.194, backward_time=0.296, grad_norm=98.238, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=8.451e-04, train_time=2.696 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 11:01:23,306 (trainer:732) INFO: 13epoch:train:14481-16290batch: iter_time=3.037e-04, forward_time=0.205, loss_att=60.391, acc=0.947, loss=60.391, backward_time=0.295, grad_norm=96.950, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=8.434e-04, train_time=2.683 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 11:21:35,067 (trainer:732) INFO: 13epoch:train:16291-18100batch: iter_time=3.008e-04, forward_time=0.204, loss_att=60.420, acc=0.947, loss=60.420, backward_time=0.294, grad_norm=97.869, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=8.417e-04, train_time=2.677 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 11:41:54,954 (trainer:732) INFO: 13epoch:train:18101-19910batch: iter_time=3.027e-04, forward_time=0.206, loss_att=60.648, acc=0.947, loss=60.648, backward_time=0.296, grad_norm=99.989, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=8.400e-04, train_time=2.696 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 12:02:10,863 (trainer:732) INFO: 13epoch:train:19911-21720batch: iter_time=2.948e-04, forward_time=0.205, loss_att=60.508, acc=0.948, loss=60.508, backward_time=0.296, grad_norm=97.794, clip=100.000, loss_scale=1.000, optim_step_time=0.062, optim0_lr0=8.383e-04, train_time=2.686 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 12:22:26,295 (trainer:732) INFO: 13epoch:train:21721-23530batch: iter_time=3.025e-04, forward_time=0.205, loss_att=60.149, acc=0.947, loss=60.149, backward_time=0.295, grad_norm=99.280, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=8.367e-04, train_time=2.686 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 12:42:44,630 (trainer:732) INFO: 13epoch:train:23531-25340batch: iter_time=3.268e-04, forward_time=0.207, loss_att=60.542, acc=0.947, loss=60.542, backward_time=0.296, grad_norm=95.590, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=8.350e-04, train_time=2.692 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 13:02:59,378 (trainer:732) INFO: 13epoch:train:25341-27150batch: iter_time=2.996e-04, forward_time=0.205, loss_att=60.371, acc=0.948, loss=60.371, backward_time=0.296, grad_norm=98.803, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=8.334e-04, train_time=2.685 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 13:23:14,708 (trainer:732) INFO: 13epoch:train:27151-28960batch: iter_time=3.134e-04, forward_time=0.205, loss_att=59.450, acc=0.948, loss=59.450, backward_time=0.295, grad_norm=95.688, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=8.318e-04, train_time=2.685 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 13:43:28,900 (trainer:732) INFO: 13epoch:train:28961-30770batch: iter_time=3.017e-04, forward_time=0.205, loss_att=59.367, acc=0.948, loss=59.367, backward_time=0.295, grad_norm=93.885, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=8.301e-04, train_time=2.684 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 14:03:43,517 (trainer:732) INFO: 13epoch:train:30771-32580batch: iter_time=3.118e-04, forward_time=0.205, loss_att=59.499, acc=0.948, loss=59.499, backward_time=0.295, grad_norm=94.697, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=8.285e-04, train_time=2.683 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 14:24:00,087 (trainer:732) INFO: 13epoch:train:32581-34390batch: iter_time=3.051e-04, forward_time=0.206, loss_att=59.460, acc=0.948, loss=59.460, backward_time=0.296, grad_norm=96.967, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=8.269e-04, train_time=2.688 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.246<34947> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.246<36601> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 14:44:11,664 (trainer:732) INFO: 13epoch:train:34391-36200batch: iter_time=2.971e-04, forward_time=0.205, loss_att=59.669, acc=0.948, loss=59.669, backward_time=0.294, grad_norm=99.221, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=8.253e-04, train_time=2.677 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 14:53:11,247 (trainer:338) INFO: 13epoch results: [train] iter_time=3.483e-04, forward_time=0.205, loss_att=60.117, acc=0.948, loss=60.117, backward_time=0.295, grad_norm=97.598, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=8.410e-04, train_time=2.720, time=6 hours, 50 minutes and 43.23 seconds, total_count=470743, gpu_max_cached_mem_GB=29.945, [valid] loss_att=14.009, acc=0.975, cer=0.031, wer=0.120, loss=14.009, time=4 minutes and 52.83 seconds, total_count=728, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 43.31 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 14:53:15,534 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 14:53:15,543 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/3epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 14:53:15,543 (trainer:272) INFO: 14/60epoch started. Estimated time to finish: 1 week, 6 days and 18 hours + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.246<46752> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.246<49524> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 140) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.246<56203> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.246<56361> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.246<52289> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.246<52527> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 15:18:30,283 (trainer:732) INFO: 14epoch:train:1-1810batch: iter_time=9.901e-04, forward_time=0.205, loss_att=58.499, acc=0.949, loss=58.499, backward_time=0.295, grad_norm=95.049, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.237e-04, train_time=3.348 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 15:38:38,182 (trainer:732) INFO: 14epoch:train:1811-3620batch: iter_time=3.185e-04, forward_time=0.204, loss_att=58.167, acc=0.949, loss=58.167, backward_time=0.294, grad_norm=95.674, clip=100.000, loss_scale=1.000, optim_step_time=0.059, optim0_lr0=8.221e-04, train_time=2.669 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 15:58:51,759 (trainer:732) INFO: 14epoch:train:3621-5430batch: iter_time=3.174e-04, forward_time=0.205, loss_att=59.175, acc=0.949, loss=59.175, backward_time=0.296, grad_norm=98.189, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.206e-04, train_time=2.682 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 16:19:01,560 (trainer:732) INFO: 14epoch:train:5431-7240batch: iter_time=3.422e-04, forward_time=0.205, loss_att=58.958, acc=0.949, loss=58.958, backward_time=0.295, grad_norm=98.063, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.190e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 16:39:10,171 (trainer:732) INFO: 14epoch:train:7241-9050batch: iter_time=3.238e-04, forward_time=0.204, loss_att=58.431, acc=0.949, loss=58.431, backward_time=0.294, grad_norm=99.321, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.175e-04, train_time=2.671 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 16:59:20,132 (trainer:732) INFO: 14epoch:train:9051-10860batch: iter_time=3.192e-04, forward_time=0.205, loss_att=59.241, acc=0.949, loss=59.241, backward_time=0.295, grad_norm=93.544, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=8.159e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 17:19:33,532 (trainer:732) INFO: 14epoch:train:10861-12670batch: iter_time=3.156e-04, forward_time=0.206, loss_att=59.485, acc=0.949, loss=59.485, backward_time=0.296, grad_norm=99.757, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.144e-04, train_time=2.681 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 17:39:41,179 (trainer:732) INFO: 14epoch:train:12671-14480batch: iter_time=3.180e-04, forward_time=0.204, loss_att=58.785, acc=0.948, loss=58.785, backward_time=0.294, grad_norm=95.413, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.129e-04, train_time=2.668 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 17:59:51,091 (trainer:732) INFO: 14epoch:train:14481-16290batch: iter_time=3.208e-04, forward_time=0.205, loss_att=59.214, acc=0.948, loss=59.214, backward_time=0.295, grad_norm=97.099, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=8.114e-04, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 18:20:01,716 (trainer:732) INFO: 14epoch:train:16291-18100batch: iter_time=3.111e-04, forward_time=0.205, loss_att=58.880, acc=0.949, loss=58.880, backward_time=0.295, grad_norm=101.050, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.099e-04, train_time=2.675 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 18:40:13,674 (trainer:732) INFO: 14epoch:train:18101-19910batch: iter_time=3.189e-04, forward_time=0.205, loss_att=58.780, acc=0.949, loss=58.780, backward_time=0.296, grad_norm=97.880, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=8.084e-04, train_time=2.678 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 19:00:25,208 (trainer:732) INFO: 14epoch:train:19911-21720batch: iter_time=3.295e-04, forward_time=0.205, loss_att=59.069, acc=0.948, loss=59.069, backward_time=0.295, grad_norm=100.827, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=8.069e-04, train_time=2.677 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 19:20:33,103 (trainer:732) INFO: 14epoch:train:21721-23530batch: iter_time=3.219e-04, forward_time=0.205, loss_att=58.214, acc=0.949, loss=58.214, backward_time=0.295, grad_norm=96.118, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.054e-04, train_time=2.669 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 19:40:40,398 (trainer:732) INFO: 14epoch:train:23531-25340batch: iter_time=3.295e-04, forward_time=0.204, loss_att=58.923, acc=0.949, loss=58.923, backward_time=0.294, grad_norm=96.393, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=8.039e-04, train_time=2.667 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 20:00:49,582 (trainer:732) INFO: 14epoch:train:25341-27150batch: iter_time=3.202e-04, forward_time=0.205, loss_att=59.537, acc=0.949, loss=59.537, backward_time=0.295, grad_norm=98.812, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.024e-04, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 20:21:00,822 (trainer:732) INFO: 14epoch:train:27151-28960batch: iter_time=3.274e-04, forward_time=0.205, loss_att=58.863, acc=0.949, loss=58.863, backward_time=0.295, grad_norm=97.046, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.010e-04, train_time=2.676 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 20:41:11,291 (trainer:732) INFO: 14epoch:train:28961-30770batch: iter_time=3.227e-04, forward_time=0.205, loss_att=58.316, acc=0.949, loss=58.316, backward_time=0.295, grad_norm=98.690, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.995e-04, train_time=2.675 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 21:01:23,688 (trainer:732) INFO: 14epoch:train:30771-32580batch: iter_time=3.174e-04, forward_time=0.205, loss_att=58.361, acc=0.949, loss=58.361, backward_time=0.296, grad_norm=96.423, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.981e-04, train_time=2.678 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 21:21:32,449 (trainer:732) INFO: 14epoch:train:32581-34390batch: iter_time=3.290e-04, forward_time=0.205, loss_att=58.305, acc=0.949, loss=58.305, backward_time=0.295, grad_norm=99.425, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.967e-04, train_time=2.671 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 21:41:41,398 (trainer:732) INFO: 14epoch:train:34391-36200batch: iter_time=3.144e-04, forward_time=0.205, loss_att=58.527, acc=0.949, loss=58.527, backward_time=0.295, grad_norm=98.204, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.952e-04, train_time=2.671 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 21:50:39,354 (trainer:338) INFO: 14epoch results: [train] iter_time=3.553e-04, forward_time=0.205, loss_att=58.784, acc=0.949, loss=58.784, backward_time=0.295, grad_norm=97.646, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=8.092e-04, train_time=2.707, time=6 hours, 48 minutes and 48.3 seconds, total_count=506954, gpu_max_cached_mem_GB=29.945, [valid] loss_att=14.041, acc=0.974, cer=0.030, wer=0.118, loss=14.041, time=5 minutes and 2.09 seconds, total_count=784, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 33.42 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 21:50:43,655 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 21:50:43,664 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/4epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 21:50:43,664 (trainer:272) INFO: 15/60epoch started. Estimated time to finish: 1 week, 6 days and 11 hours +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 22:15:49,873 (trainer:732) INFO: 15epoch:train:1-1810batch: iter_time=0.001, forward_time=0.205, loss_att=57.699, acc=0.950, loss=57.699, backward_time=0.295, grad_norm=94.091, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.938e-04, train_time=3.329 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 22:36:02,969 (trainer:732) INFO: 15epoch:train:1811-3620batch: iter_time=3.237e-04, forward_time=0.205, loss_att=58.087, acc=0.950, loss=58.087, backward_time=0.296, grad_norm=98.651, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.924e-04, train_time=2.680 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 22:56:12,693 (trainer:732) INFO: 15epoch:train:3621-5430batch: iter_time=3.301e-04, forward_time=0.205, loss_att=56.812, acc=0.950, loss=56.812, backward_time=0.295, grad_norm=99.376, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.910e-04, train_time=2.673 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 23:16:24,574 (trainer:732) INFO: 15epoch:train:5431-7240batch: iter_time=3.203e-04, forward_time=0.205, loss_att=57.965, acc=0.950, loss=57.965, backward_time=0.295, grad_norm=94.975, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.896e-04, train_time=2.678 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 23:36:33,964 (trainer:732) INFO: 15epoch:train:7241-9050batch: iter_time=3.219e-04, forward_time=0.205, loss_att=57.484, acc=0.950, loss=57.484, backward_time=0.295, grad_norm=96.284, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.882e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-24 23:56:42,680 (trainer:732) INFO: 15epoch:train:9051-10860batch: iter_time=3.306e-04, forward_time=0.205, loss_att=58.688, acc=0.949, loss=58.688, backward_time=0.295, grad_norm=94.821, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.868e-04, train_time=2.670 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 00:16:52,523 (trainer:732) INFO: 15epoch:train:10861-12670batch: iter_time=3.101e-04, forward_time=0.205, loss_att=57.940, acc=0.950, loss=57.940, backward_time=0.295, grad_norm=96.201, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.855e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 00:37:02,013 (trainer:732) INFO: 15epoch:train:12671-14480batch: iter_time=3.126e-04, forward_time=0.204, loss_att=57.684, acc=0.950, loss=57.684, backward_time=0.295, grad_norm=100.393, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.841e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 00:57:13,625 (trainer:732) INFO: 15epoch:train:14481-16290batch: iter_time=3.240e-04, forward_time=0.205, loss_att=57.173, acc=0.950, loss=57.173, backward_time=0.295, grad_norm=101.888, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.827e-04, train_time=2.677 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 01:17:27,623 (trainer:732) INFO: 15epoch:train:16291-18100batch: iter_time=3.150e-04, forward_time=0.205, loss_att=57.853, acc=0.950, loss=57.853, backward_time=0.296, grad_norm=97.742, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.814e-04, train_time=2.682 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 01:37:37,216 (trainer:732) INFO: 15epoch:train:18101-19910batch: iter_time=3.129e-04, forward_time=0.204, loss_att=56.954, acc=0.950, loss=56.954, backward_time=0.295, grad_norm=102.270, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.800e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 01:57:48,130 (trainer:732) INFO: 15epoch:train:19911-21720batch: iter_time=3.088e-04, forward_time=0.205, loss_att=57.396, acc=0.950, loss=57.396, backward_time=0.295, grad_norm=103.030, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.787e-04, train_time=2.675 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 02:17:58,898 (trainer:732) INFO: 15epoch:train:21721-23530batch: iter_time=3.101e-04, forward_time=0.205, loss_att=57.133, acc=0.950, loss=57.133, backward_time=0.295, grad_norm=100.725, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.774e-04, train_time=2.676 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 02:38:06,567 (trainer:732) INFO: 15epoch:train:23531-25340batch: iter_time=3.112e-04, forward_time=0.204, loss_att=57.191, acc=0.950, loss=57.191, backward_time=0.294, grad_norm=97.353, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.760e-04, train_time=2.668 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 02:58:19,521 (trainer:732) INFO: 15epoch:train:25341-27150batch: iter_time=3.120e-04, forward_time=0.205, loss_att=58.164, acc=0.950, loss=58.164, backward_time=0.296, grad_norm=95.474, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.747e-04, train_time=2.681 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 03:18:30,421 (trainer:732) INFO: 15epoch:train:27151-28960batch: iter_time=3.182e-04, forward_time=0.205, loss_att=57.026, acc=0.950, loss=57.026, backward_time=0.295, grad_norm=99.744, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.734e-04, train_time=2.675 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 03:38:41,334 (trainer:732) INFO: 15epoch:train:28961-30770batch: iter_time=3.105e-04, forward_time=0.205, loss_att=57.917, acc=0.950, loss=57.917, backward_time=0.296, grad_norm=97.735, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.721e-04, train_time=2.676 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 03:58:50,768 (trainer:732) INFO: 15epoch:train:30771-32580batch: iter_time=3.072e-04, forward_time=0.205, loss_att=57.218, acc=0.950, loss=57.218, backward_time=0.295, grad_norm=100.544, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.708e-04, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 04:19:02,011 (trainer:732) INFO: 15epoch:train:32581-34390batch: iter_time=3.130e-04, forward_time=0.205, loss_att=57.900, acc=0.950, loss=57.900, backward_time=0.295, grad_norm=96.911, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.695e-04, train_time=2.676 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 04:39:07,938 (trainer:732) INFO: 15epoch:train:34391-36200batch: iter_time=3.179e-04, forward_time=0.204, loss_att=56.937, acc=0.950, loss=56.937, backward_time=0.294, grad_norm=96.772, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.682e-04, train_time=2.665 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 04:48:27,132 (trainer:338) INFO: 15epoch results: [train] iter_time=3.693e-04, forward_time=0.205, loss_att=57.559, acc=0.950, loss=57.559, backward_time=0.295, grad_norm=98.248, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.808e-04, train_time=2.707, time=6 hours, 48 minutes and 49.03 seconds, total_count=543165, gpu_max_cached_mem_GB=29.945, [valid] loss_att=13.392, acc=0.976, cer=0.030, wer=0.115, loss=13.392, time=5 minutes and 24 seconds, total_count=840, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 30.43 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 04:48:31,365 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 04:48:31,373 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/5epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 04:48:31,373 (trainer:272) INFO: 16/60epoch started. Estimated time to finish: 1 week, 6 days and 3 hours +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 05:13:43,313 (trainer:732) INFO: 16epoch:train:1-1810batch: iter_time=7.955e-04, forward_time=0.205, loss_att=56.549, acc=0.951, loss=56.549, backward_time=0.296, grad_norm=100.094, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.669e-04, train_time=3.341 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 05:33:52,450 (trainer:732) INFO: 16epoch:train:1811-3620batch: iter_time=3.094e-04, forward_time=0.205, loss_att=56.172, acc=0.951, loss=56.172, backward_time=0.295, grad_norm=96.509, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.657e-04, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 05:54:03,758 (trainer:732) INFO: 16epoch:train:3621-5430batch: iter_time=3.191e-04, forward_time=0.205, loss_att=56.672, acc=0.951, loss=56.672, backward_time=0.295, grad_norm=96.142, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.644e-04, train_time=2.676 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 06:14:14,479 (trainer:732) INFO: 16epoch:train:5431-7240batch: iter_time=3.194e-04, forward_time=0.205, loss_att=56.600, acc=0.951, loss=56.600, backward_time=0.295, grad_norm=97.736, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.631e-04, train_time=2.675 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 06:34:23,374 (trainer:732) INFO: 16epoch:train:7241-9050batch: iter_time=3.107e-04, forward_time=0.205, loss_att=56.570, acc=0.951, loss=56.570, backward_time=0.295, grad_norm=99.723, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.619e-04, train_time=2.671 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 06:54:35,304 (trainer:732) INFO: 16epoch:train:9051-10860batch: iter_time=3.098e-04, forward_time=0.205, loss_att=56.667, acc=0.951, loss=56.667, backward_time=0.296, grad_norm=96.918, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.606e-04, train_time=2.678 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 07:14:41,559 (trainer:732) INFO: 16epoch:train:10861-12670batch: iter_time=3.034e-04, forward_time=0.204, loss_att=56.013, acc=0.951, loss=56.013, backward_time=0.294, grad_norm=97.962, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.594e-04, train_time=2.666 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 07:34:50,510 (trainer:732) INFO: 16epoch:train:12671-14480batch: iter_time=3.115e-04, forward_time=0.205, loss_att=56.275, acc=0.951, loss=56.275, backward_time=0.295, grad_norm=99.677, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.582e-04, train_time=2.671 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 07:54:55,547 (trainer:732) INFO: 16epoch:train:14481-16290batch: iter_time=3.220e-04, forward_time=0.204, loss_att=56.059, acc=0.951, loss=56.059, backward_time=0.294, grad_norm=93.989, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.569e-04, train_time=2.663 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 08:15:06,769 (trainer:732) INFO: 16epoch:train:16291-18100batch: iter_time=3.198e-04, forward_time=0.205, loss_att=56.362, acc=0.951, loss=56.362, backward_time=0.296, grad_norm=100.265, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.557e-04, train_time=2.676 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 08:35:18,066 (trainer:732) INFO: 16epoch:train:18101-19910batch: iter_time=3.111e-04, forward_time=0.205, loss_att=56.931, acc=0.951, loss=56.931, backward_time=0.296, grad_norm=98.885, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.545e-04, train_time=2.677 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 08:55:26,416 (trainer:732) INFO: 16epoch:train:19911-21720batch: iter_time=3.114e-04, forward_time=0.205, loss_att=57.125, acc=0.950, loss=57.125, backward_time=0.295, grad_norm=99.870, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.533e-04, train_time=2.670 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 09:15:36,578 (trainer:732) INFO: 16epoch:train:21721-23530batch: iter_time=3.159e-04, forward_time=0.205, loss_att=57.072, acc=0.950, loss=57.072, backward_time=0.296, grad_norm=98.973, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.521e-04, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 09:35:46,607 (trainer:732) INFO: 16epoch:train:23531-25340batch: iter_time=3.137e-04, forward_time=0.205, loss_att=56.347, acc=0.951, loss=56.347, backward_time=0.295, grad_norm=96.763, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.509e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 09:55:52,535 (trainer:732) INFO: 16epoch:train:25341-27150batch: iter_time=3.171e-04, forward_time=0.204, loss_att=56.637, acc=0.951, loss=56.637, backward_time=0.294, grad_norm=100.249, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.497e-04, train_time=2.665 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 10:16:02,344 (trainer:732) INFO: 16epoch:train:27151-28960batch: iter_time=3.156e-04, forward_time=0.205, loss_att=56.833, acc=0.951, loss=56.833, backward_time=0.295, grad_norm=95.836, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.485e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 10:36:14,909 (trainer:732) INFO: 16epoch:train:28961-30770batch: iter_time=3.173e-04, forward_time=0.205, loss_att=56.344, acc=0.951, loss=56.344, backward_time=0.296, grad_norm=99.845, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.473e-04, train_time=2.680 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 10:56:19,577 (trainer:732) INFO: 16epoch:train:30771-32580batch: iter_time=3.306e-04, forward_time=0.204, loss_att=55.078, acc=0.951, loss=55.078, backward_time=0.294, grad_norm=103.484, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.461e-04, train_time=2.662 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 11:16:28,367 (trainer:732) INFO: 16epoch:train:32581-34390batch: iter_time=3.108e-04, forward_time=0.204, loss_att=56.405, acc=0.951, loss=56.405, backward_time=0.295, grad_norm=101.978, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.450e-04, train_time=2.671 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 11:36:38,951 (trainer:732) INFO: 16epoch:train:34391-36200batch: iter_time=3.147e-04, forward_time=0.205, loss_att=56.339, acc=0.951, loss=56.339, backward_time=0.295, grad_norm=99.611, clip=100.000, loss_scale=1.000, optim_step_time=0.061, optim0_lr0=7.438e-04, train_time=2.675 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 11:45:28,926 (trainer:338) INFO: 16epoch results: [train] iter_time=3.389e-04, forward_time=0.205, loss_att=56.452, acc=0.951, loss=56.452, backward_time=0.295, grad_norm=98.720, clip=100.000, loss_scale=1.000, optim_step_time=0.060, optim0_lr0=7.552e-04, train_time=2.705, time=6 hours, 48 minutes and 26.83 seconds, total_count=579376, gpu_max_cached_mem_GB=29.945, [valid] loss_att=12.928, acc=0.977, cer=0.029, wer=0.111, loss=12.928, time=4 minutes and 53.27 seconds, total_count=896, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 37.45 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 11:45:32,753 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 11:45:32,760 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/6epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 11:45:32,760 (trainer:272) INFO: 17/60epoch started. Estimated time to finish: 1 week, 5 days and 20 hours +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 12:10:52,272 (trainer:732) INFO: 17epoch:train:1-1810batch: iter_time=9.152e-04, forward_time=0.204, loss_att=55.391, acc=0.951, loss=55.391, backward_time=0.295, grad_norm=99.157, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.426e-04, train_time=3.359 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 12:31:02,067 (trainer:732) INFO: 17epoch:train:1811-3620batch: iter_time=3.007e-04, forward_time=0.203, loss_att=54.836, acc=0.952, loss=54.836, backward_time=0.295, grad_norm=96.822, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.415e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 12:51:12,256 (trainer:732) INFO: 17epoch:train:3621-5430batch: iter_time=3.025e-04, forward_time=0.203, loss_att=55.420, acc=0.952, loss=55.420, backward_time=0.295, grad_norm=97.046, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.403e-04, train_time=2.674 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 13:11:22,480 (trainer:732) INFO: 17epoch:train:5431-7240batch: iter_time=3.103e-04, forward_time=0.203, loss_att=54.675, acc=0.952, loss=54.675, backward_time=0.295, grad_norm=98.602, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.392e-04, train_time=2.674 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 13:31:30,884 (trainer:732) INFO: 17epoch:train:7241-9050batch: iter_time=3.006e-04, forward_time=0.203, loss_att=56.145, acc=0.951, loss=56.145, backward_time=0.295, grad_norm=97.858, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=7.380e-04, train_time=2.670 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 13:51:44,578 (trainer:732) INFO: 17epoch:train:9051-10860batch: iter_time=2.979e-04, forward_time=0.204, loss_att=55.687, acc=0.952, loss=55.687, backward_time=0.296, grad_norm=102.222, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=7.369e-04, train_time=2.682 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 14:11:53,740 (trainer:732) INFO: 17epoch:train:10861-12670batch: iter_time=3.067e-04, forward_time=0.203, loss_att=55.205, acc=0.952, loss=55.205, backward_time=0.295, grad_norm=103.027, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.358e-04, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 14:32:06,194 (trainer:732) INFO: 17epoch:train:12671-14480batch: iter_time=3.013e-04, forward_time=0.204, loss_att=55.955, acc=0.952, loss=55.955, backward_time=0.296, grad_norm=101.168, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.347e-04, train_time=2.679 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 14:52:12,872 (trainer:732) INFO: 17epoch:train:14481-16290batch: iter_time=3.034e-04, forward_time=0.203, loss_att=55.655, acc=0.951, loss=55.655, backward_time=0.295, grad_norm=93.974, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=7.335e-04, train_time=2.667 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 15:12:23,942 (trainer:732) INFO: 17epoch:train:16291-18100batch: iter_time=3.045e-04, forward_time=0.203, loss_att=55.590, acc=0.952, loss=55.590, backward_time=0.296, grad_norm=103.803, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=7.324e-04, train_time=2.676 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 15:32:32,708 (trainer:732) INFO: 17epoch:train:18101-19910batch: iter_time=2.993e-04, forward_time=0.203, loss_att=55.982, acc=0.951, loss=55.982, backward_time=0.295, grad_norm=99.289, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.313e-04, train_time=2.671 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 15:52:38,740 (trainer:732) INFO: 17epoch:train:19911-21720batch: iter_time=3.038e-04, forward_time=0.202, loss_att=54.846, acc=0.952, loss=54.846, backward_time=0.294, grad_norm=103.204, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=7.302e-04, train_time=2.665 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 16:12:44,741 (trainer:732) INFO: 17epoch:train:21721-23530batch: iter_time=3.197e-04, forward_time=0.203, loss_att=55.683, acc=0.951, loss=55.683, backward_time=0.294, grad_norm=97.168, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=7.291e-04, train_time=2.665 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 16:32:54,327 (trainer:732) INFO: 17epoch:train:23531-25340batch: iter_time=3.082e-04, forward_time=0.203, loss_att=55.997, acc=0.951, loss=55.997, backward_time=0.295, grad_norm=98.446, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=7.280e-04, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 16:53:04,066 (trainer:732) INFO: 17epoch:train:25341-27150batch: iter_time=3.010e-04, forward_time=0.203, loss_att=55.249, acc=0.952, loss=55.249, backward_time=0.295, grad_norm=100.092, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=7.269e-04, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 17:13:12,801 (trainer:732) INFO: 17epoch:train:27151-28960batch: iter_time=2.943e-04, forward_time=0.203, loss_att=54.831, acc=0.952, loss=54.831, backward_time=0.295, grad_norm=97.808, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=7.258e-04, train_time=2.671 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 17:33:21,480 (trainer:732) INFO: 17epoch:train:28961-30770batch: iter_time=3.041e-04, forward_time=0.203, loss_att=55.272, acc=0.951, loss=55.272, backward_time=0.295, grad_norm=99.826, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=7.248e-04, train_time=2.671 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 17:53:33,420 (trainer:732) INFO: 17epoch:train:30771-32580batch: iter_time=3.031e-04, forward_time=0.204, loss_att=55.111, acc=0.952, loss=55.111, backward_time=0.296, grad_norm=97.618, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.237e-04, train_time=2.678 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 18:13:43,497 (trainer:732) INFO: 17epoch:train:32581-34390batch: iter_time=3.003e-04, forward_time=0.203, loss_att=56.252, acc=0.951, loss=56.252, backward_time=0.296, grad_norm=102.155, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=7.226e-04, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 18:33:52,164 (trainer:732) INFO: 17epoch:train:34391-36200batch: iter_time=2.986e-04, forward_time=0.203, loss_att=56.198, acc=0.951, loss=56.198, backward_time=0.295, grad_norm=104.130, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=7.216e-04, train_time=2.670 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 18:42:47,788 (trainer:338) INFO: 17epoch results: [train] iter_time=3.337e-04, forward_time=0.203, loss_att=55.496, acc=0.952, loss=55.496, backward_time=0.295, grad_norm=99.710, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=7.319e-04, train_time=2.707, time=6 hours, 48 minutes and 41.41 seconds, total_count=615587, gpu_max_cached_mem_GB=29.945, [valid] loss_att=12.840, acc=0.977, cer=0.029, wer=0.111, loss=12.840, time=5 minutes and 2.27 seconds, total_count=952, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 31.34 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 18:42:51,568 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 18:42:51,577 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/7epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 18:42:51,577 (trainer:272) INFO: 18/60epoch started. Estimated time to finish: 1 week, 5 days and 13 hours +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 19:07:54,513 (trainer:732) INFO: 18epoch:train:1-1810batch: iter_time=0.001, forward_time=0.203, loss_att=53.891, acc=0.953, loss=53.891, backward_time=0.295, grad_norm=93.440, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.205e-04, train_time=3.322 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 19:28:04,016 (trainer:732) INFO: 18epoch:train:1811-3620batch: iter_time=2.983e-04, forward_time=0.203, loss_att=54.303, acc=0.953, loss=54.303, backward_time=0.295, grad_norm=96.121, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=7.194e-04, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 19:48:11,418 (trainer:732) INFO: 18epoch:train:3621-5430batch: iter_time=2.958e-04, forward_time=0.203, loss_att=54.207, acc=0.952, loss=54.207, backward_time=0.295, grad_norm=99.602, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=7.184e-04, train_time=2.668 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 20:08:21,734 (trainer:732) INFO: 18epoch:train:5431-7240batch: iter_time=2.864e-04, forward_time=0.203, loss_att=55.083, acc=0.952, loss=55.083, backward_time=0.295, grad_norm=96.154, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.173e-04, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 20:28:30,992 (trainer:732) INFO: 18epoch:train:7241-9050batch: iter_time=2.883e-04, forward_time=0.203, loss_att=54.936, acc=0.952, loss=54.936, backward_time=0.295, grad_norm=103.275, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=7.163e-04, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 20:48:41,787 (trainer:732) INFO: 18epoch:train:9051-10860batch: iter_time=2.844e-04, forward_time=0.203, loss_att=55.273, acc=0.952, loss=55.273, backward_time=0.295, grad_norm=94.898, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.153e-04, train_time=2.675 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 21:08:52,205 (trainer:732) INFO: 18epoch:train:10861-12670batch: iter_time=2.828e-04, forward_time=0.203, loss_att=55.246, acc=0.952, loss=55.246, backward_time=0.296, grad_norm=97.570, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.142e-04, train_time=2.675 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 21:29:00,647 (trainer:732) INFO: 18epoch:train:12671-14480batch: iter_time=2.768e-04, forward_time=0.203, loss_att=54.554, acc=0.952, loss=54.554, backward_time=0.295, grad_norm=100.642, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.132e-04, train_time=2.670 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 21:49:11,627 (trainer:732) INFO: 18epoch:train:14481-16290batch: iter_time=2.865e-04, forward_time=0.203, loss_att=55.070, acc=0.952, loss=55.070, backward_time=0.296, grad_norm=97.371, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.122e-04, train_time=2.676 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 22:09:21,802 (trainer:732) INFO: 18epoch:train:16291-18100batch: iter_time=2.796e-04, forward_time=0.203, loss_att=54.651, acc=0.952, loss=54.651, backward_time=0.295, grad_norm=97.543, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.112e-04, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 22:29:31,143 (trainer:732) INFO: 18epoch:train:18101-19910batch: iter_time=2.802e-04, forward_time=0.203, loss_att=54.313, acc=0.952, loss=54.313, backward_time=0.295, grad_norm=96.051, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.101e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 22:49:41,870 (trainer:732) INFO: 18epoch:train:19911-21720batch: iter_time=2.803e-04, forward_time=0.203, loss_att=54.593, acc=0.953, loss=54.593, backward_time=0.295, grad_norm=102.285, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.091e-04, train_time=2.675 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 23:09:52,032 (trainer:732) INFO: 18epoch:train:21721-23530batch: iter_time=2.814e-04, forward_time=0.203, loss_att=54.054, acc=0.952, loss=54.054, backward_time=0.295, grad_norm=98.322, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.081e-04, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 23:29:58,614 (trainer:732) INFO: 18epoch:train:23531-25340batch: iter_time=2.841e-04, forward_time=0.202, loss_att=53.903, acc=0.952, loss=53.903, backward_time=0.294, grad_norm=100.060, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.071e-04, train_time=2.666 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-25 23:50:07,707 (trainer:732) INFO: 18epoch:train:25341-27150batch: iter_time=2.904e-04, forward_time=0.203, loss_att=54.233, acc=0.953, loss=54.233, backward_time=0.294, grad_norm=98.332, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=7.061e-04, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 00:10:19,407 (trainer:732) INFO: 18epoch:train:27151-28960batch: iter_time=2.912e-04, forward_time=0.203, loss_att=54.531, acc=0.953, loss=54.531, backward_time=0.296, grad_norm=96.641, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.051e-04, train_time=2.677 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 00:30:28,256 (trainer:732) INFO: 18epoch:train:28961-30770batch: iter_time=2.793e-04, forward_time=0.203, loss_att=54.654, acc=0.952, loss=54.654, backward_time=0.295, grad_norm=96.935, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.041e-04, train_time=2.671 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 00:50:37,489 (trainer:732) INFO: 18epoch:train:30771-32580batch: iter_time=2.872e-04, forward_time=0.203, loss_att=54.792, acc=0.952, loss=54.792, backward_time=0.295, grad_norm=97.709, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=7.032e-04, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 01:10:46,482 (trainer:732) INFO: 18epoch:train:32581-34390batch: iter_time=2.816e-04, forward_time=0.203, loss_att=54.443, acc=0.952, loss=54.443, backward_time=0.295, grad_norm=98.366, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.022e-04, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 01:30:59,662 (trainer:732) INFO: 18epoch:train:34391-36200batch: iter_time=2.868e-04, forward_time=0.203, loss_att=55.005, acc=0.952, loss=55.005, backward_time=0.296, grad_norm=100.241, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.012e-04, train_time=2.680 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 01:40:13,802 (trainer:338) INFO: 18epoch results: [train] iter_time=3.429e-04, forward_time=0.203, loss_att=54.584, acc=0.952, loss=54.584, backward_time=0.295, grad_norm=98.079, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=7.107e-04, train_time=2.706, time=6 hours, 48 minutes and 27.18 seconds, total_count=651798, gpu_max_cached_mem_GB=29.945, [valid] loss_att=12.514, acc=0.978, cer=0.028, wer=0.108, loss=12.514, time=5 minutes and 17.1 seconds, total_count=1008, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 37.94 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 01:40:18,038 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 01:40:18,047 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/8epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 01:40:18,047 (trainer:272) INFO: 19/60epoch started. Estimated time to finish: 1 week, 5 days and 6 hours +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 02:05:38,856 (trainer:732) INFO: 19epoch:train:1-1810batch: iter_time=0.001, forward_time=0.203, loss_att=53.418, acc=0.953, loss=53.418, backward_time=0.295, grad_norm=95.239, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=7.002e-04, train_time=3.361 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 02:25:51,190 (trainer:732) INFO: 19epoch:train:1811-3620batch: iter_time=2.825e-04, forward_time=0.203, loss_att=53.841, acc=0.953, loss=53.841, backward_time=0.295, grad_norm=98.493, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.992e-04, train_time=2.679 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 02:46:07,247 (trainer:732) INFO: 19epoch:train:3621-5430batch: iter_time=2.849e-04, forward_time=0.204, loss_att=53.795, acc=0.954, loss=53.795, backward_time=0.296, grad_norm=99.695, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.983e-04, train_time=2.687 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 03:06:18,122 (trainer:732) INFO: 19epoch:train:5431-7240batch: iter_time=2.894e-04, forward_time=0.203, loss_att=53.844, acc=0.953, loss=53.844, backward_time=0.295, grad_norm=101.638, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.973e-04, train_time=2.675 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 03:26:27,695 (trainer:732) INFO: 19epoch:train:7241-9050batch: iter_time=2.907e-04, forward_time=0.203, loss_att=53.193, acc=0.953, loss=53.193, backward_time=0.295, grad_norm=98.995, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.964e-04, train_time=2.673 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 03:46:35,923 (trainer:732) INFO: 19epoch:train:9051-10860batch: iter_time=2.906e-04, forward_time=0.202, loss_att=54.114, acc=0.953, loss=54.114, backward_time=0.294, grad_norm=101.690, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.954e-04, train_time=2.670 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 04:06:45,507 (trainer:732) INFO: 19epoch:train:10861-12670batch: iter_time=2.814e-04, forward_time=0.203, loss_att=53.647, acc=0.953, loss=53.647, backward_time=0.295, grad_norm=102.188, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.945e-04, train_time=2.673 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 04:26:54,736 (trainer:732) INFO: 19epoch:train:12671-14480batch: iter_time=2.828e-04, forward_time=0.203, loss_att=53.975, acc=0.953, loss=53.975, backward_time=0.295, grad_norm=96.593, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.935e-04, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 04:47:03,635 (trainer:732) INFO: 19epoch:train:14481-16290batch: iter_time=2.923e-04, forward_time=0.203, loss_att=53.605, acc=0.953, loss=53.605, backward_time=0.295, grad_norm=97.253, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.926e-04, train_time=2.672 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 05:07:12,117 (trainer:732) INFO: 19epoch:train:16291-18100batch: iter_time=2.867e-04, forward_time=0.203, loss_att=53.899, acc=0.953, loss=53.899, backward_time=0.294, grad_norm=95.850, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.916e-04, train_time=2.670 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 05:27:25,627 (trainer:732) INFO: 19epoch:train:18101-19910batch: iter_time=2.850e-04, forward_time=0.203, loss_att=54.076, acc=0.953, loss=54.076, backward_time=0.296, grad_norm=101.580, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.907e-04, train_time=2.682 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 05:47:36,358 (trainer:732) INFO: 19epoch:train:19911-21720batch: iter_time=2.846e-04, forward_time=0.203, loss_att=53.818, acc=0.953, loss=53.818, backward_time=0.295, grad_norm=94.292, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.898e-04, train_time=2.675 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 06:07:43,950 (trainer:732) INFO: 19epoch:train:21721-23530batch: iter_time=2.952e-04, forward_time=0.203, loss_att=53.204, acc=0.953, loss=53.204, backward_time=0.294, grad_norm=104.214, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.889e-04, train_time=2.669 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 06:27:49,897 (trainer:732) INFO: 19epoch:train:23531-25340batch: iter_time=2.856e-04, forward_time=0.202, loss_att=53.459, acc=0.953, loss=53.459, backward_time=0.294, grad_norm=96.524, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.879e-04, train_time=2.664 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 06:47:56,618 (trainer:732) INFO: 19epoch:train:25341-27150batch: iter_time=2.783e-04, forward_time=0.203, loss_att=53.155, acc=0.953, loss=53.155, backward_time=0.295, grad_norm=98.367, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=6.870e-04, train_time=2.667 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 07:08:06,399 (trainer:732) INFO: 19epoch:train:27151-28960batch: iter_time=2.841e-04, forward_time=0.203, loss_att=53.998, acc=0.953, loss=53.998, backward_time=0.295, grad_norm=97.124, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.861e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 07:28:15,077 (trainer:732) INFO: 19epoch:train:28961-30770batch: iter_time=2.851e-04, forward_time=0.203, loss_att=54.708, acc=0.953, loss=54.708, backward_time=0.295, grad_norm=101.183, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.852e-04, train_time=2.671 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 07:48:23,632 (trainer:732) INFO: 19epoch:train:30771-32580batch: iter_time=2.890e-04, forward_time=0.203, loss_att=54.030, acc=0.953, loss=54.030, backward_time=0.295, grad_norm=96.240, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.843e-04, train_time=2.670 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 08:08:34,983 (trainer:732) INFO: 19epoch:train:32581-34390batch: iter_time=2.784e-04, forward_time=0.203, loss_att=53.813, acc=0.954, loss=53.813, backward_time=0.296, grad_norm=97.791, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.834e-04, train_time=2.677 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 08:28:39,292 (trainer:732) INFO: 19epoch:train:34391-36200batch: iter_time=2.778e-04, forward_time=0.202, loss_att=53.487, acc=0.953, loss=53.487, backward_time=0.294, grad_norm=99.858, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.825e-04, train_time=2.661 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 08:37:33,662 (trainer:338) INFO: 19epoch results: [train] iter_time=3.354e-04, forward_time=0.203, loss_att=53.752, acc=0.953, loss=53.752, backward_time=0.295, grad_norm=98.737, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.912e-04, train_time=2.707, time=6 hours, 48 minutes and 41.51 seconds, total_count=688009, gpu_max_cached_mem_GB=29.945, [valid] loss_att=12.585, acc=0.977, cer=0.028, wer=0.110, loss=12.585, time=5 minutes and 5.65 seconds, total_count=1064, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 28.45 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 08:37:37,652 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 08:37:37,661 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/10epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 08:37:37,661 (trainer:272) INFO: 20/60epoch started. Estimated time to finish: 1 week, 4 days and 23 hours +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 09:02:59,472 (trainer:732) INFO: 20epoch:train:1-1810batch: iter_time=9.739e-04, forward_time=0.204, loss_att=52.970, acc=0.954, loss=52.970, backward_time=0.296, grad_norm=103.096, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.816e-04, train_time=3.363 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 09:23:11,521 (trainer:732) INFO: 20epoch:train:1811-3620batch: iter_time=2.999e-04, forward_time=0.203, loss_att=53.332, acc=0.954, loss=53.332, backward_time=0.295, grad_norm=95.377, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.807e-04, train_time=2.678 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 09:43:20,771 (trainer:732) INFO: 20epoch:train:3621-5430batch: iter_time=2.905e-04, forward_time=0.203, loss_att=52.927, acc=0.953, loss=52.927, backward_time=0.295, grad_norm=99.535, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.798e-04, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 10:03:32,384 (trainer:732) INFO: 20epoch:train:5431-7240batch: iter_time=2.785e-04, forward_time=0.203, loss_att=52.860, acc=0.954, loss=52.860, backward_time=0.295, grad_norm=99.177, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.789e-04, train_time=2.677 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 10:23:42,641 (trainer:732) INFO: 20epoch:train:7241-9050batch: iter_time=2.864e-04, forward_time=0.203, loss_att=52.826, acc=0.954, loss=52.826, backward_time=0.295, grad_norm=103.754, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.780e-04, train_time=2.675 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 10:43:51,119 (trainer:732) INFO: 20epoch:train:9051-10860batch: iter_time=2.767e-04, forward_time=0.203, loss_att=53.160, acc=0.954, loss=53.160, backward_time=0.295, grad_norm=98.072, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.771e-04, train_time=2.670 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 11:04:00,433 (trainer:732) INFO: 20epoch:train:10861-12670batch: iter_time=2.787e-04, forward_time=0.203, loss_att=52.526, acc=0.954, loss=52.526, backward_time=0.295, grad_norm=102.727, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.763e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 11:24:09,103 (trainer:732) INFO: 20epoch:train:12671-14480batch: iter_time=2.957e-04, forward_time=0.203, loss_att=52.859, acc=0.954, loss=52.859, backward_time=0.294, grad_norm=100.753, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.754e-04, train_time=2.670 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 11:44:23,260 (trainer:732) INFO: 20epoch:train:14481-16290batch: iter_time=2.874e-04, forward_time=0.205, loss_att=53.806, acc=0.953, loss=53.806, backward_time=0.295, grad_norm=97.670, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.745e-04, train_time=2.683 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 12:04:30,729 (trainer:732) INFO: 20epoch:train:16291-18100batch: iter_time=2.804e-04, forward_time=0.202, loss_att=52.760, acc=0.954, loss=52.760, backward_time=0.294, grad_norm=99.444, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.736e-04, train_time=2.668 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 12:24:42,575 (trainer:732) INFO: 20epoch:train:18101-19910batch: iter_time=2.856e-04, forward_time=0.203, loss_att=53.589, acc=0.954, loss=53.589, backward_time=0.296, grad_norm=100.907, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.728e-04, train_time=2.678 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 12:44:49,469 (trainer:732) INFO: 20epoch:train:19911-21720batch: iter_time=2.889e-04, forward_time=0.202, loss_att=53.091, acc=0.953, loss=53.091, backward_time=0.294, grad_norm=101.903, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.719e-04, train_time=2.666 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 13:04:56,111 (trainer:732) INFO: 20epoch:train:21721-23530batch: iter_time=2.804e-04, forward_time=0.203, loss_att=52.430, acc=0.954, loss=52.430, backward_time=0.295, grad_norm=104.509, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.711e-04, train_time=2.667 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 13:25:06,669 (trainer:732) INFO: 20epoch:train:23531-25340batch: iter_time=2.866e-04, forward_time=0.203, loss_att=53.476, acc=0.954, loss=53.476, backward_time=0.295, grad_norm=102.445, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.702e-04, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 13:45:15,887 (trainer:732) INFO: 20epoch:train:25341-27150batch: iter_time=2.804e-04, forward_time=0.203, loss_att=52.915, acc=0.954, loss=52.915, backward_time=0.295, grad_norm=96.376, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.694e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 14:05:26,949 (trainer:732) INFO: 20epoch:train:27151-28960batch: iter_time=2.893e-04, forward_time=0.203, loss_att=53.359, acc=0.954, loss=53.359, backward_time=0.295, grad_norm=101.481, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.685e-04, train_time=2.676 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 14:25:38,847 (trainer:732) INFO: 20epoch:train:28961-30770batch: iter_time=2.936e-04, forward_time=0.203, loss_att=53.733, acc=0.954, loss=53.733, backward_time=0.296, grad_norm=105.149, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.677e-04, train_time=2.678 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 14:45:45,508 (trainer:732) INFO: 20epoch:train:30771-32580batch: iter_time=2.890e-04, forward_time=0.203, loss_att=52.330, acc=0.954, loss=52.330, backward_time=0.294, grad_norm=98.974, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.668e-04, train_time=2.666 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 15:05:53,130 (trainer:732) INFO: 20epoch:train:32581-34390batch: iter_time=2.870e-04, forward_time=0.203, loss_att=52.446, acc=0.954, loss=52.446, backward_time=0.295, grad_norm=100.280, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.660e-04, train_time=2.669 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.235<48289> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 15:26:02,168 (trainer:732) INFO: 20epoch:train:34391-36200batch: iter_time=2.856e-04, forward_time=0.203, loss_att=52.877, acc=0.954, loss=52.877, backward_time=0.295, grad_norm=101.243, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.652e-04, train_time=2.671 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 15:34:56,626 (trainer:338) INFO: 20epoch results: [train] iter_time=3.207e-04, forward_time=0.203, loss_att=53.010, acc=0.954, loss=53.010, backward_time=0.295, grad_norm=100.642, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.733e-04, train_time=2.707, time=6 hours, 48 minutes and 47.2 seconds, total_count=724220, gpu_max_cached_mem_GB=29.945, [valid] loss_att=12.200, acc=0.978, cer=0.027, wer=0.106, loss=12.200, time=4 minutes and 52.4 seconds, total_count=1120, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 39.35 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 15:35:00,457 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 15:35:00,465 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/9epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 15:35:00,466 (trainer:272) INFO: 21/60epoch started. Estimated time to finish: 1 week, 4 days and 16 hours + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<51105> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 16:00:23,041 (trainer:732) INFO: 21epoch:train:1-1810batch: iter_time=0.001, forward_time=0.203, loss_att=51.300, acc=0.955, loss=51.300, backward_time=0.295, grad_norm=94.484, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.643e-04, train_time=3.365 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 16:20:35,720 (trainer:732) INFO: 21epoch:train:1811-3620batch: iter_time=2.913e-04, forward_time=0.203, loss_att=51.980, acc=0.954, loss=51.980, backward_time=0.295, grad_norm=94.831, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.635e-04, train_time=2.680 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 16:40:45,646 (trainer:732) INFO: 21epoch:train:3621-5430batch: iter_time=2.967e-04, forward_time=0.203, loss_att=52.054, acc=0.954, loss=52.054, backward_time=0.295, grad_norm=96.370, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.627e-04, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 17:00:56,991 (trainer:732) INFO: 21epoch:train:5431-7240batch: iter_time=2.863e-04, forward_time=0.203, loss_att=52.693, acc=0.954, loss=52.693, backward_time=0.295, grad_norm=101.037, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.619e-04, train_time=2.676 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 17:21:10,807 (trainer:732) INFO: 21epoch:train:7241-9050batch: iter_time=2.909e-04, forward_time=0.204, loss_att=52.298, acc=0.955, loss=52.298, backward_time=0.296, grad_norm=99.299, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.610e-04, train_time=2.682 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 17:41:19,156 (trainer:732) INFO: 21epoch:train:9051-10860batch: iter_time=2.778e-04, forward_time=0.203, loss_att=52.250, acc=0.954, loss=52.250, backward_time=0.295, grad_norm=97.530, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.602e-04, train_time=2.670 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 18:01:28,439 (trainer:732) INFO: 21epoch:train:10861-12670batch: iter_time=2.865e-04, forward_time=0.203, loss_att=52.161, acc=0.954, loss=52.161, backward_time=0.295, grad_norm=104.708, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.594e-04, train_time=2.673 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 18:21:36,758 (trainer:732) INFO: 21epoch:train:12671-14480batch: iter_time=2.778e-04, forward_time=0.203, loss_att=52.803, acc=0.954, loss=52.803, backward_time=0.295, grad_norm=97.374, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.586e-04, train_time=2.669 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 18:41:46,955 (trainer:732) INFO: 21epoch:train:14481-16290batch: iter_time=2.891e-04, forward_time=0.203, loss_att=51.925, acc=0.955, loss=51.925, backward_time=0.295, grad_norm=95.658, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.578e-04, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 19:01:56,865 (trainer:732) INFO: 21epoch:train:16291-18100batch: iter_time=2.815e-04, forward_time=0.203, loss_att=52.452, acc=0.955, loss=52.452, backward_time=0.296, grad_norm=100.214, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.570e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 19:22:04,481 (trainer:732) INFO: 21epoch:train:18101-19910batch: iter_time=2.817e-04, forward_time=0.203, loss_att=52.175, acc=0.954, loss=52.175, backward_time=0.294, grad_norm=102.800, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.562e-04, train_time=2.669 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 19:42:14,435 (trainer:732) INFO: 21epoch:train:19911-21720batch: iter_time=2.847e-04, forward_time=0.203, loss_att=53.036, acc=0.954, loss=53.036, backward_time=0.296, grad_norm=99.059, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.554e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 20:02:28,277 (trainer:732) INFO: 21epoch:train:21721-23530batch: iter_time=2.809e-04, forward_time=0.204, loss_att=53.022, acc=0.954, loss=53.022, backward_time=0.296, grad_norm=98.145, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.546e-04, train_time=2.683 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 20:22:38,202 (trainer:732) INFO: 21epoch:train:23531-25340batch: iter_time=2.863e-04, forward_time=0.203, loss_att=52.294, acc=0.955, loss=52.294, backward_time=0.295, grad_norm=111.722, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.538e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 20:42:46,237 (trainer:732) INFO: 21epoch:train:25341-27150batch: iter_time=2.836e-04, forward_time=0.203, loss_att=52.280, acc=0.954, loss=52.280, backward_time=0.295, grad_norm=98.271, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.530e-04, train_time=2.669 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 21:02:56,460 (trainer:732) INFO: 21epoch:train:27151-28960batch: iter_time=2.837e-04, forward_time=0.203, loss_att=52.399, acc=0.954, loss=52.399, backward_time=0.295, grad_norm=101.458, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.522e-04, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 21:23:06,374 (trainer:732) INFO: 21epoch:train:28961-30770batch: iter_time=2.885e-04, forward_time=0.203, loss_att=52.135, acc=0.955, loss=52.135, backward_time=0.295, grad_norm=93.474, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.514e-04, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 21:43:14,137 (trainer:732) INFO: 21epoch:train:30771-32580batch: iter_time=2.838e-04, forward_time=0.203, loss_att=51.613, acc=0.955, loss=51.613, backward_time=0.295, grad_norm=102.748, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.507e-04, train_time=2.669 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 22:03:18,309 (trainer:732) INFO: 21epoch:train:32581-34390batch: iter_time=2.877e-04, forward_time=0.202, loss_att=51.971, acc=0.954, loss=51.971, backward_time=0.294, grad_norm=99.513, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.499e-04, train_time=2.661 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 22:23:30,669 (trainer:732) INFO: 21epoch:train:34391-36200batch: iter_time=2.948e-04, forward_time=0.203, loss_att=52.567, acc=0.954, loss=52.567, backward_time=0.295, grad_norm=98.949, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.491e-04, train_time=2.679 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 22:32:37,378 (trainer:338) INFO: 21epoch results: [train] iter_time=3.314e-04, forward_time=0.203, loss_att=52.271, acc=0.954, loss=52.271, backward_time=0.295, grad_norm=99.384, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.566e-04, train_time=2.708, time=6 hours, 48 minutes and 54.16 seconds, total_count=760431, gpu_max_cached_mem_GB=29.945, [valid] loss_att=14.392, acc=0.975, cer=0.026, wer=0.104, loss=14.392, time=5 minutes and 4.26 seconds, total_count=1176, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 38.49 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 22:32:41,240 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 22:32:41,248 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/11epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 22:32:41,248 (trainer:272) INFO: 22/60epoch started. Estimated time to finish: 1 week, 4 days and 9 hours +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 22:57:55,093 (trainer:732) INFO: 22epoch:train:1-1810batch: iter_time=0.001, forward_time=0.203, loss_att=51.061, acc=0.955, loss=51.061, backward_time=0.294, grad_norm=96.851, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.483e-04, train_time=3.346 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 23:18:06,496 (trainer:732) INFO: 22epoch:train:1811-3620batch: iter_time=2.941e-04, forward_time=0.203, loss_att=51.988, acc=0.955, loss=51.988, backward_time=0.295, grad_norm=99.162, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.476e-04, train_time=2.677 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 23:38:18,455 (trainer:732) INFO: 22epoch:train:3621-5430batch: iter_time=2.937e-04, forward_time=0.204, loss_att=51.055, acc=0.955, loss=51.055, backward_time=0.295, grad_norm=99.888, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.468e-04, train_time=2.678 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-26 23:58:28,337 (trainer:732) INFO: 22epoch:train:5431-7240batch: iter_time=2.841e-04, forward_time=0.203, loss_att=51.597, acc=0.955, loss=51.597, backward_time=0.295, grad_norm=98.024, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.460e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 00:18:39,802 (trainer:732) INFO: 22epoch:train:7241-9050batch: iter_time=2.912e-04, forward_time=0.203, loss_att=51.593, acc=0.955, loss=51.593, backward_time=0.295, grad_norm=98.930, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.453e-04, train_time=2.677 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 00:38:49,526 (trainer:732) INFO: 22epoch:train:9051-10860batch: iter_time=2.745e-04, forward_time=0.203, loss_att=51.276, acc=0.955, loss=51.276, backward_time=0.295, grad_norm=96.126, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.445e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 00:58:59,162 (trainer:732) INFO: 22epoch:train:10861-12670batch: iter_time=2.837e-04, forward_time=0.203, loss_att=51.469, acc=0.955, loss=51.469, backward_time=0.295, grad_norm=101.909, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.438e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 01:19:07,744 (trainer:732) INFO: 22epoch:train:12671-14480batch: iter_time=2.906e-04, forward_time=0.203, loss_att=51.628, acc=0.955, loss=51.628, backward_time=0.294, grad_norm=99.909, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.430e-04, train_time=2.670 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 01:39:17,122 (trainer:732) INFO: 22epoch:train:14481-16290batch: iter_time=2.845e-04, forward_time=0.203, loss_att=51.264, acc=0.955, loss=51.264, backward_time=0.295, grad_norm=101.020, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.423e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 01:59:27,002 (trainer:732) INFO: 22epoch:train:16291-18100batch: iter_time=2.845e-04, forward_time=0.203, loss_att=51.766, acc=0.955, loss=51.766, backward_time=0.295, grad_norm=97.371, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.415e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 02:19:41,674 (trainer:732) INFO: 22epoch:train:18101-19910batch: iter_time=2.864e-04, forward_time=0.203, loss_att=53.261, acc=0.955, loss=53.261, backward_time=0.296, grad_norm=100.060, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.408e-04, train_time=2.684 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 02:39:53,128 (trainer:732) INFO: 22epoch:train:19911-21720batch: iter_time=2.761e-04, forward_time=0.204, loss_att=51.481, acc=0.955, loss=51.481, backward_time=0.296, grad_norm=101.254, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.400e-04, train_time=2.677 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 03:00:03,924 (trainer:732) INFO: 22epoch:train:21721-23530batch: iter_time=2.824e-04, forward_time=0.203, loss_att=51.905, acc=0.955, loss=51.905, backward_time=0.295, grad_norm=98.041, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.393e-04, train_time=2.676 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 03:20:12,960 (trainer:732) INFO: 22epoch:train:23531-25340batch: iter_time=2.835e-04, forward_time=0.203, loss_att=50.992, acc=0.955, loss=50.992, backward_time=0.295, grad_norm=101.498, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.385e-04, train_time=2.671 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 03:40:25,390 (trainer:732) INFO: 22epoch:train:25341-27150batch: iter_time=2.874e-04, forward_time=0.203, loss_att=52.186, acc=0.955, loss=52.186, backward_time=0.296, grad_norm=96.835, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.378e-04, train_time=2.679 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 04:00:36,673 (trainer:732) INFO: 22epoch:train:27151-28960batch: iter_time=2.931e-04, forward_time=0.203, loss_att=51.718, acc=0.955, loss=51.718, backward_time=0.295, grad_norm=99.621, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.371e-04, train_time=2.676 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 04:20:44,901 (trainer:732) INFO: 22epoch:train:28961-30770batch: iter_time=2.810e-04, forward_time=0.203, loss_att=51.758, acc=0.955, loss=51.758, backward_time=0.295, grad_norm=99.000, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.364e-04, train_time=2.670 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 04:40:54,227 (trainer:732) INFO: 22epoch:train:30771-32580batch: iter_time=2.887e-04, forward_time=0.203, loss_att=51.739, acc=0.955, loss=51.739, backward_time=0.295, grad_norm=95.883, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.356e-04, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 05:01:03,915 (trainer:732) INFO: 22epoch:train:32581-34390batch: iter_time=2.942e-04, forward_time=0.203, loss_att=50.973, acc=0.956, loss=50.973, backward_time=0.295, grad_norm=99.480, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.349e-04, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 05:21:13,008 (trainer:732) INFO: 22epoch:train:34391-36200batch: iter_time=2.783e-04, forward_time=0.203, loss_att=51.722, acc=0.955, loss=51.722, backward_time=0.295, grad_norm=98.145, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.342e-04, train_time=2.671 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 05:30:16,270 (trainer:338) INFO: 22epoch results: [train] iter_time=3.323e-04, forward_time=0.203, loss_att=51.618, acc=0.955, loss=51.618, backward_time=0.295, grad_norm=98.944, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.412e-04, train_time=2.708, time=6 hours, 48 minutes and 51.65 seconds, total_count=796642, gpu_max_cached_mem_GB=29.945, [valid] loss_att=12.171, acc=0.978, cer=0.027, wer=0.103, loss=12.171, time=5 minutes and 5.94 seconds, total_count=1232, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 37.43 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 05:30:21,684 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 05:30:21,698 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/12epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 05:30:21,698 (trainer:272) INFO: 23/60epoch started. Estimated time to finish: 1 week, 4 days and 2 hours +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 05:55:31,258 (trainer:732) INFO: 23epoch:train:1-1810batch: iter_time=0.001, forward_time=0.204, loss_att=50.126, acc=0.956, loss=50.126, backward_time=0.295, grad_norm=98.501, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.335e-04, train_time=3.336 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 06:15:44,341 (trainer:732) INFO: 23epoch:train:1811-3620batch: iter_time=2.955e-04, forward_time=0.204, loss_att=50.817, acc=0.956, loss=50.817, backward_time=0.296, grad_norm=97.406, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.327e-04, train_time=2.680 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 06:35:53,037 (trainer:732) INFO: 23epoch:train:3621-5430batch: iter_time=2.869e-04, forward_time=0.203, loss_att=50.566, acc=0.955, loss=50.566, backward_time=0.295, grad_norm=97.038, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.320e-04, train_time=2.671 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 06:56:01,971 (trainer:732) INFO: 23epoch:train:5431-7240batch: iter_time=2.872e-04, forward_time=0.202, loss_att=50.841, acc=0.956, loss=50.841, backward_time=0.295, grad_norm=101.220, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.313e-04, train_time=2.671 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 07:16:13,718 (trainer:732) INFO: 23epoch:train:7241-9050batch: iter_time=2.824e-04, forward_time=0.203, loss_att=51.301, acc=0.956, loss=51.301, backward_time=0.296, grad_norm=103.374, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.306e-04, train_time=2.678 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 07:36:23,361 (trainer:732) INFO: 23epoch:train:9051-10860batch: iter_time=2.737e-04, forward_time=0.203, loss_att=50.840, acc=0.955, loss=50.840, backward_time=0.295, grad_norm=100.977, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.299e-04, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 07:56:32,206 (trainer:732) INFO: 23epoch:train:10861-12670batch: iter_time=2.732e-04, forward_time=0.202, loss_att=51.115, acc=0.956, loss=51.115, backward_time=0.295, grad_norm=101.212, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.292e-04, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 08:16:38,902 (trainer:732) INFO: 23epoch:train:12671-14480batch: iter_time=2.912e-04, forward_time=0.203, loss_att=51.052, acc=0.955, loss=51.052, backward_time=0.295, grad_norm=99.468, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.285e-04, train_time=2.666 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051276:2051841 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 155) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051274:2051840 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 150) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 08:36:48,198 (trainer:732) INFO: 23epoch:train:14481-16290batch: iter_time=2.799e-04, forward_time=0.203, loss_att=50.921, acc=0.956, loss=50.921, backward_time=0.295, grad_norm=95.688, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.278e-04, train_time=2.673 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051278:2051842 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 138) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 08:56:53,510 (trainer:732) INFO: 23epoch:train:16291-18100batch: iter_time=2.746e-04, forward_time=0.202, loss_att=51.036, acc=0.955, loss=51.036, backward_time=0.294, grad_norm=98.441, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.271e-04, train_time=2.663 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2051277:2051839 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 139) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 09:17:00,861 (trainer:732) INFO: 23epoch:train:18101-19910batch: iter_time=2.788e-04, forward_time=0.203, loss_att=51.098, acc=0.955, loss=51.098, backward_time=0.295, grad_norm=96.934, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.264e-04, train_time=2.668 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 09:37:05,663 (trainer:732) INFO: 23epoch:train:19911-21720batch: iter_time=2.744e-04, forward_time=0.202, loss_att=50.971, acc=0.955, loss=50.971, backward_time=0.294, grad_norm=106.452, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.257e-04, train_time=2.662 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 09:57:13,589 (trainer:732) INFO: 23epoch:train:21721-23530batch: iter_time=2.759e-04, forward_time=0.203, loss_att=51.106, acc=0.955, loss=51.106, backward_time=0.295, grad_norm=102.549, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.250e-04, train_time=2.669 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 10:17:21,104 (trainer:732) INFO: 23epoch:train:23531-25340batch: iter_time=2.830e-04, forward_time=0.202, loss_att=51.299, acc=0.955, loss=51.299, backward_time=0.295, grad_norm=98.590, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.243e-04, train_time=2.668 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 10:37:30,248 (trainer:732) INFO: 23epoch:train:25341-27150batch: iter_time=2.859e-04, forward_time=0.203, loss_att=51.551, acc=0.955, loss=51.551, backward_time=0.295, grad_norm=101.242, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.236e-04, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 10:57:41,991 (trainer:732) INFO: 23epoch:train:27151-28960batch: iter_time=2.780e-04, forward_time=0.203, loss_att=51.595, acc=0.955, loss=51.595, backward_time=0.296, grad_norm=99.374, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.229e-04, train_time=2.677 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 11:17:49,477 (trainer:732) INFO: 23epoch:train:28961-30770batch: iter_time=2.971e-04, forward_time=0.203, loss_att=50.441, acc=0.956, loss=50.441, backward_time=0.294, grad_norm=96.343, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.223e-04, train_time=2.668 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 11:38:04,508 (trainer:732) INFO: 23epoch:train:30771-32580batch: iter_time=2.902e-04, forward_time=0.204, loss_att=51.181, acc=0.956, loss=51.181, backward_time=0.296, grad_norm=102.602, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.216e-04, train_time=2.684 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 11:58:12,581 (trainer:732) INFO: 23epoch:train:32581-34390batch: iter_time=2.862e-04, forward_time=0.203, loss_att=50.835, acc=0.956, loss=50.835, backward_time=0.295, grad_norm=95.505, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.209e-04, train_time=2.670 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 12:18:21,774 (trainer:732) INFO: 23epoch:train:34391-36200batch: iter_time=2.893e-04, forward_time=0.203, loss_att=51.442, acc=0.955, loss=51.442, backward_time=0.295, grad_norm=100.479, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.202e-04, train_time=2.672 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 12:27:45,420 (trainer:338) INFO: 23epoch results: [train] iter_time=3.435e-04, forward_time=0.203, loss_att=51.005, acc=0.955, loss=51.005, backward_time=0.295, grad_norm=99.727, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.268e-04, train_time=2.705, time=6 hours, 48 minutes and 20.16 seconds, total_count=832853, gpu_max_cached_mem_GB=29.945, [valid] loss_att=12.493, acc=0.977, cer=0.026, wer=0.103, loss=12.493, time=5 minutes and 20.99 seconds, total_count=1288, gpu_max_cached_mem_GB=29.945, [att_plot] time=3 minutes and 42.58 seconds, total_count=0, gpu_max_cached_mem_GB=29.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 12:27:49,234 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 12:27:49,242 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/14epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 12:27:49,242 (trainer:272) INFO: 24/60epoch started. Estimated time to finish: 1 week, 3 days and 18 hours +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 12:53:53,433 (trainer:732) INFO: 24epoch:train:1-1810batch: iter_time=0.001, forward_time=0.205, loss_att=50.224, acc=0.956, loss=50.224, backward_time=0.296, grad_norm=99.864, clip=100.000, loss_scale=1.000, optim_step_time=0.068, optim0_lr0=6.195e-04, train_time=3.457 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 13:14:06,476 (trainer:732) INFO: 24epoch:train:1811-3620batch: iter_time=2.928e-04, forward_time=0.203, loss_att=49.827, acc=0.957, loss=49.827, backward_time=0.296, grad_norm=101.002, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.189e-04, train_time=2.680 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 13:34:12,137 (trainer:732) INFO: 24epoch:train:3621-5430batch: iter_time=2.823e-04, forward_time=0.202, loss_att=50.395, acc=0.956, loss=50.395, backward_time=0.294, grad_norm=99.730, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.182e-04, train_time=2.664 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 13:54:21,715 (trainer:732) INFO: 24epoch:train:5431-7240batch: iter_time=2.989e-04, forward_time=0.203, loss_att=49.885, acc=0.956, loss=49.885, backward_time=0.295, grad_norm=96.310, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.175e-04, train_time=2.673 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 14:14:31,917 (trainer:732) INFO: 24epoch:train:7241-9050batch: iter_time=3.022e-04, forward_time=0.203, loss_att=50.051, acc=0.956, loss=50.051, backward_time=0.295, grad_norm=94.823, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.169e-04, train_time=2.674 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 14:34:40,637 (trainer:732) INFO: 24epoch:train:9051-10860batch: iter_time=2.700e-04, forward_time=0.203, loss_att=50.592, acc=0.956, loss=50.592, backward_time=0.295, grad_norm=96.017, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.162e-04, train_time=2.671 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 14:54:43,319 (trainer:732) INFO: 24epoch:train:10861-12670batch: iter_time=2.566e-04, forward_time=0.202, loss_att=50.209, acc=0.956, loss=50.209, backward_time=0.293, grad_norm=102.837, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=6.155e-04, train_time=2.658 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-27 15:14:56,216 (trainer:732) INFO: 24epoch:train:12671-14480batch: iter_time=2.586e-04, forward_time=0.203, loss_att=50.526, acc=0.956, loss=50.526, backward_time=0.296, grad_norm=107.845, clip=100.000, loss_scale=1.000, optim_step_time=0.063, optim0_lr0=6.149e-04, train_time=2.680 +Exception ignored from cffi callback .vio_tell at 0x7f84993308b0>: +Exception ignored from cffi callback .vio_tell at 0x7f31faf6c820>: +Traceback (most recent call last): +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/soundfile.py", line 1264, in vio_tell + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/soundfile.py", line 1264, in vio_tell +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 197, in _run_module_as_main + @_ffi.callback("sf_vio_tell") +KeyboardInterrupt: + @_ffi.callback("sf_vio_tell") +KeyboardInterrupt: + return _run_code(code, main_globals, None, + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 23, in + main() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 19, in main + ASRTask.main(cmd=cmd) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1132, in main + while not ProcessContext(processes, error_queues).join(): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 109, in join + ready = multiprocessing.connection.wait( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/connection.py", line 931, in wait + ready = selector.select(timeout) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/selectors.py", line 416, in select + fd_event_list = self._selector.poll(timeout) +KeyboardInterrupt +Process SpawnProcess-1: +Process SpawnProcess-3: +Process SpawnProcess-2: +Process SpawnProcess-4: +Traceback (most recent call last): +Traceback (most recent call last): +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 567, in train_one_epoch + retval = model(**batch) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl + return forward_call(*input, **kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1008, in forward + output = self._run_ddp_forward(*inputs, **kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 969, in _run_ddp_forward + return module_to_run(*inputs[0], **kwargs[0]) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl + return forward_call(*input, **kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/asr/espnet_model.py", line 237, in forward + encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/asr/espnet_model.py", line 420, in encode + assert encoder_out.size(-2) <= encoder_out_lens.max(), ( +KeyboardInterrupt + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 567, in train_one_epoch + retval = model(**batch) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl + return forward_call(*input, **kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 567, in train_one_epoch + retval = model(**batch) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl + return forward_call(*input, **kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1008, in forward + output = self._run_ddp_forward(*inputs, **kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1008, in forward + output = self._run_ddp_forward(*inputs, **kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 969, in _run_ddp_forward + return module_to_run(*inputs[0], **kwargs[0]) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 969, in _run_ddp_forward + return module_to_run(*inputs[0], **kwargs[0]) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 567, in train_one_epoch + retval = model(**batch) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl + return forward_call(*input, **kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl + return forward_call(*input, **kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl + return forward_call(*input, **kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/asr/espnet_model.py", line 237, in forward + encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/asr/espnet_model.py", line 420, in encode + assert encoder_out.size(-2) <= encoder_out_lens.max(), ( +KeyboardInterrupt + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1008, in forward + output = self._run_ddp_forward(*inputs, **kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 969, in _run_ddp_forward + return module_to_run(*inputs[0], **kwargs[0]) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/asr/espnet_model.py", line 237, in forward + encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/asr/espnet_model.py", line 420, in encode + assert encoder_out.size(-2) <= encoder_out_lens.max(), ( +KeyboardInterrupt + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl + return forward_call(*input, **kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/asr/espnet_model.py", line 237, in forward + encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/asr/espnet_model.py", line 420, in encode + assert encoder_out.size(-2) <= encoder_out_lens.max(), ( +KeyboardInterrupt diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.7.log b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.7.log new file mode 100644 index 0000000000000000000000000000000000000000..9d0ed68e59486de557094e54a7a9bc0e30c5ea13 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.7.log @@ -0,0 +1,860 @@ +# python3 -m espnet2.bin.asr_train --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr --config conf/tuning/train_sot_asr_conformer.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/large_w_whamr/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/large_w_whamr/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 4 --multiprocessing_distributed True +# Started at Tue Feb 20 19:06:09 CST 2024 +# +/star-home/jinzengrui/lib/miniconda3/envs/espnet/bin/python3 /star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --init_param /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr --config conf/tuning/train_sot_asr_conformer.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/large_w_whamr/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/large_w_whamr/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 4 --multiprocessing_distributed True +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:07:28,887 (distributed_c10d:228) INFO: Added key: store_based_barrier_key:1 to store for rank: 0 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:07:28,887 (distributed_c10d:262) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 4 nodes. +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:07:28,909 (asr:490) INFO: Vocabulary size: 32 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:07:33,178 (abs_task:1229) INFO: pytorch.version=1.12.1+cu116, cuda.available=True, cudnn.version=8302, cudnn.benchmark=False, cudnn.deterministic=True +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:07:33,202 (abs_task:1230) INFO: Model structure: +ESPnetASRModel( + (frontend): DefaultFrontend( + (stft): Stft(n_fft=512, win_length=512, hop_length=128, center=True, normalized=False, onesided=True) + (frontend): Frontend() + (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False) + ) + (normalize): GlobalMVN(stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz, norm_means=True, norm_vars=True) + (encoder): ConformerEncoder( + (embed): Conv2dSubsampling( + (conv): Sequential( + (0): Conv2d(1, 256, kernel_size=(3, 3), stride=(2, 2)) + (1): ReLU() + (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2)) + (3): ReLU() + ) + (out): Sequential( + (0): Linear(in_features=4864, out_features=256, bias=True) + (1): RelPositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (6): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (7): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (8): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (9): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (10): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (11): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (decoder): TransformerDecoder( + (embed): Sequential( + (0): Embedding(32, 256) + (1): PositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (output_layer): Linear(in_features=256, out_features=32, bias=True) + (decoders): MultiSequential( + (0): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (criterion_att): LabelSmoothingLoss( + (criterion): KLDivLoss() + ) +) + +Model summary: + Class Name: ESPnetASRModel + Total Number of model parameters: 43.00 M + Number of trainable parameters: 43.00 M (100.0%) + Size: 172.01 MB + Type: torch.float32 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:07:33,202 (abs_task:1233) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + eps: 1e-08 + foreach: None + initial_lr: 0.0005 + lr: 2.5e-08 + maximize: False + weight_decay: 1e-06 +) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:07:33,202 (abs_task:1234) INFO: Scheduler: WarmupLR(warmup_steps=20000) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:07:33,215 (abs_task:1243) INFO: Saving the configuration in exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/config.yaml +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:07:33,251 (abs_task:1304) INFO: Loading pretrained params from /star-home/jinzengrui/dev/espnet/egs2/librimix/sot_asr1_pretrain/exp/asr_train_sot_asr_conformer_raw_en_char_sp/45epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:07:40,952 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:08:10,057 (abs_task:1614) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/large_w_whamr/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/large_w_whamr/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:08:10,057 (abs_task:1615) INFO: [train] Batch sampler: NumElementsBatchSampler(N-batch=71645, batch_bins=8000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:08:10,086 (abs_task:1616) INFO: [train] mini-batch sizes summary: N-batch=71645, mean=27.3, min=12, max=142 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:08:10,744 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:08:10,805 (abs_task:1614) INFO: [valid] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:08:10,805 (abs_task:1615) INFO: [valid] Batch sampler: NumElementsBatchSampler(N-batch=109, batch_bins=8000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:08:10,805 (abs_task:1616) INFO: [valid] mini-batch sizes summary: N-batch=109, mean=45.9, min=13, max=83 +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:08:10,822 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:08:10,892 (abs_task:1614) INFO: [plot_att] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:08:10,893 (abs_task:1615) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=5000, batch_size=1, key_file=exp/asr_stats_raw_en_char/valid/speech_shape, +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:08:10,893 (abs_task:1616) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041320:2041320 [0] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041320:2041320 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041320:2041320 [0] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041320:2041320 [0] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041320:2041320 [0] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041320:2041320 [0] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041320:2041320 [0] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041320:2041320 [0] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041320:2041320 [0] NCCL INFO Using network Socket +NCCL version 2.10.3+cuda11.6 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041322:2041322 [2] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041321:2041321 [1] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041323:2041323 [3] NCCL INFO Bootstrap : Using eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041322:2041322 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041321:2041321 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041323:2041323 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041322:2041322 [2] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041322:2041322 [2] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041322:2041322 [2] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041322:2041322 [2] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041322:2041322 [2] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041322:2041322 [2] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041322:2041322 [2] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041321:2041321 [1] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041321:2041321 [1] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041321:2041321 [1] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041321:2041321 [1] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041321:2041321 [1] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041321:2041321 [1] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041321:2041321 [1] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041323:2041323 [3] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041323:2041323 [3] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041323:2041323 [3] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041323:2041323 [3] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041323:2041323 [3] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041323:2041323 [3] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.211<0> +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041323:2041323 [3] NCCL INFO Using network Socket +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041322:2041569 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041320:2041568 [0] NCCL INFO Channel 00/02 : 0 1 2 3 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041323:2041571 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041321:2041570 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041322:2041569 [2] NCCL INFO Setting affinity for GPU 4 to ff,ffc0000f,fffc0000 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041320:2041568 [0] NCCL INFO Channel 01/02 : 0 1 2 3 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041323:2041571 [3] NCCL INFO Setting affinity for GPU 5 to ff,ffc0000f,fffc0000 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041321:2041570 [1] NCCL INFO Setting affinity for GPU 3 to 07ffe0,0000fffe +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041320:2041568 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041320:2041568 [0] NCCL INFO Setting affinity for GPU 2 to 07ffe0,0000fffe +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041323:2041571 [3] NCCL INFO Channel 00 : 3[b2000] -> 0[40000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041323:2041571 [3] NCCL INFO Channel 01 : 3[b2000] -> 0[40000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041321:2041570 [1] NCCL INFO Channel 00 : 1[41000] -> 2[b1000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041321:2041570 [1] NCCL INFO Channel 01 : 1[41000] -> 2[b1000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041320:2041568 [0] NCCL INFO Channel 00 : 0[40000] -> 1[41000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041320:2041568 [0] NCCL INFO Channel 01 : 0[40000] -> 1[41000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041322:2041569 [2] NCCL INFO Channel 00 : 2[b1000] -> 3[b2000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041322:2041569 [2] NCCL INFO Channel 01 : 2[b1000] -> 3[b2000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041320:2041568 [0] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041323:2041571 [3] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041321:2041570 [1] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041323:2041571 [3] NCCL INFO Channel 00 : 3[b2000] -> 2[b1000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041322:2041569 [2] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041323:2041571 [3] NCCL INFO Channel 01 : 3[b2000] -> 2[b1000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041322:2041569 [2] NCCL INFO Channel 00 : 2[b1000] -> 1[41000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041322:2041569 [2] NCCL INFO Channel 01 : 2[b1000] -> 1[41000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041323:2041571 [3] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041323:2041571 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041323:2041571 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041321:2041570 [1] NCCL INFO Channel 00 : 1[41000] -> 0[40000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041321:2041570 [1] NCCL INFO Channel 01 : 1[41000] -> 0[40000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041320:2041568 [0] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041320:2041568 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041320:2041568 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041321:2041570 [1] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041321:2041570 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041321:2041570 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041322:2041569 [2] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041322:2041569 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041322:2041569 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041320:2041568 [0] NCCL INFO comm 0x7f48d8002f70 rank 0 nranks 4 cudaDev 0 busId 40000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041321:2041570 [1] NCCL INFO comm 0x7fdf38002f70 rank 1 nranks 4 cudaDev 1 busId 41000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041323:2041571 [3] NCCL INFO comm 0x7f5160002f70 rank 3 nranks 4 cudaDev 3 busId b2000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041322:2041569 [2] NCCL INFO comm 0x7fbfbc002f70 rank 2 nranks 4 cudaDev 2 busId b1000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:2041320:2041320 [0] NCCL INFO Launch mode Parallel +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:08:14,435 (trainer:284) INFO: 1/60epoch started +[de-74279-k2-train-1-1207150822-75498b8c5f-55j4z:0/4] 2024-02-20 19:11:07,654 (distributed:995) INFO: Reducer buckets have been rebuilt in this iteration. +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 197, in _run_module_as_main + return _run_code(code, main_globals, None, + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/runpy.py", line 87, in _run_code + exec(code, run_globals) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 23, in + main() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py", line 19, in main + ASRTask.main(cmd=cmd) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1132, in main + while not ProcessContext(processes, error_queues).join(): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/multiprocessing/spawn.py", line 109, in join + ready = multiprocessing.connection.wait( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/connection.py", line 931, in wait + ready = selector.select(timeout) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/selectors.py", line 416, in select + fd_event_list = self._selector.poll(timeout) +KeyboardInterrupt +Process SpawnProcess-2: +Process SpawnProcess-4: +Process SpawnProcess-1: +Process SpawnProcess-3: +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() +Traceback (most recent call last): +Traceback (most recent call last): + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 315, in _bootstrap + self.run() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 632, in train_one_epoch + loss.backward() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 632, in train_one_epoch + loss.backward() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/multiprocessing/process.py", line 108, in run + self._target(*self._args, **self._kwargs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/_tensor.py", line 396, in backward + torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward + Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/_tensor.py", line 396, in backward + torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 632, in train_one_epoch + loss.backward() + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/tasks/abs_task.py", line 1391, in main_worker + cls.trainer.run( + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 290, in run + all_steps_are_invalid = cls.train_one_epoch( +KeyboardInterrupt + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward + Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/_tensor.py", line 396, in backward + torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward + Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/train/trainer.py", line 632, in train_one_epoch + loss.backward() +KeyboardInterrupt +KeyboardInterrupt + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/_tensor.py", line 396, in backward + torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs) + File "/star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward + Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass +KeyboardInterrupt diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.log b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.log new file mode 100644 index 0000000000000000000000000000000000000000..3d3221660724ee6ac400a445c25f2f3d703f87bb --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/train.log @@ -0,0 +1,2606 @@ +# python3 -m espnet2.bin.asr_train --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr --config conf/tuning/train_sot_asr_conformer_large.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/large_w_whamr/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/large_w_whamr/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 8 --multiprocessing_distributed True +# Started at Tue Feb 27 19:28:49 CST 2024 +# +/star-home/jinzengrui/lib/miniconda3/envs/espnet/bin/python3 /star-home/jinzengrui/lib/miniconda3/envs/espnet/lib/python3.9/site-packages/espnet-202308-py3.9.egg/espnet2/bin/asr_train.py --use_preprocessor true --bpemodel none --token_type char --token_list data/en_token_list/char/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/wav.scp,speech,kaldi_ark --valid_shape_file exp/asr_stats_raw_en_char/valid/speech_shape --resume true --ignore_init_mismatch false --fold_length 80000 --output_dir exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr --config conf/tuning/train_sot_asr_conformer_large.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz --train_data_path_and_name_and_type dump/raw/large_w_whamr/wav.scp,speech,kaldi_ark --train_shape_file exp/asr_stats_raw_en_char/train/speech_shape --fold_length 150 --train_data_path_and_name_and_type dump/raw/large_w_whamr/text,text,text --train_shape_file exp/asr_stats_raw_en_char/train/text_shape.char --valid_data_path_and_name_and_type dump/raw/cv_mix_clean_reverb_max_16k/text,text,text --valid_shape_file exp/asr_stats_raw_en_char/valid/text_shape.char --ngpu 8 --multiprocessing_distributed True +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:04,634 (distributed_c10d:228) INFO: Added key: store_based_barrier_key:1 to store for rank: 0 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:04,634 (distributed_c10d:262) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 8 nodes. +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:04,667 (asr:490) INFO: Vocabulary size: 32 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:08,243 (abs_task:1229) INFO: pytorch.version=1.12.1+cu116, cuda.available=True, cudnn.version=8302, cudnn.benchmark=False, cudnn.deterministic=True +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:08,254 (abs_task:1230) INFO: Model structure: +ESPnetASRModel( + (frontend): DefaultFrontend( + (stft): Stft(n_fft=512, win_length=512, hop_length=128, center=True, normalized=False, onesided=True) + (frontend): Frontend() + (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False) + ) + (normalize): GlobalMVN(stats_file=exp/asr_stats_raw_en_char/train/feats_stats.npz, norm_means=True, norm_vars=True) + (encoder): ConformerEncoder( + (embed): Conv2dSubsampling( + (conv): Sequential( + (0): Conv2d(1, 256, kernel_size=(3, 3), stride=(2, 2)) + (1): ReLU() + (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2)) + (3): ReLU() + ) + (out): Sequential( + (0): Linear(in_features=4864, out_features=256, bias=True) + (1): RelPositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (6): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (7): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (8): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (9): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (10): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (11): EncoderLayer( + (self_attn): RelPositionMultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (linear_pos): Linear(in_features=256, out_features=256, bias=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (conv_module): ConvolutionModule( + (pointwise_conv1): Conv1d(256, 512, kernel_size=(1,), stride=(1,)) + (depthwise_conv): Conv1d(256, 256, kernel_size=(31,), stride=(1,), padding=(15,), groups=256) + (norm): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (pointwise_conv2): Conv1d(256, 256, kernel_size=(1,), stride=(1,)) + (activation): Swish() + ) + (norm_ff): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_conv): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + ) + (decoder): TransformerDecoder( + (embed): Sequential( + (0): Embedding(32, 256) + (1): PositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (output_layer): Linear(in_features=256, out_features=32, bias=True) + (decoders): MultiSequential( + (0): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (4): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (5): DecoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (src_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=256, out_features=256, bias=True) + (linear_k): Linear(in_features=256, out_features=256, bias=True) + (linear_v): Linear(in_features=256, out_features=256, bias=True) + (linear_out): Linear(in_features=256, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=256, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=256, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (norm3): LayerNorm((256,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + ) + (criterion_att): LabelSmoothingLoss( + (criterion): KLDivLoss() + ) +) + +Model summary: + Class Name: ESPnetASRModel + Total Number of model parameters: 43.00 M + Number of trainable parameters: 43.00 M (100.0%) + Size: 172.01 MB + Type: torch.float32 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:08,254 (abs_task:1233) INFO: Optimizer: +Adam ( +Parameter Group 0 + amsgrad: False + betas: (0.9, 0.999) + capturable: False + eps: 1e-08 + foreach: None + initial_lr: 0.002 + lr: 1e-07 + maximize: False + weight_decay: 1e-06 +) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:08,254 (abs_task:1234) INFO: Scheduler: WarmupLR(warmup_steps=20000) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:08,256 (abs_task:1243) INFO: Saving the configuration in exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/config.yaml +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:13,983 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:36,540 (abs_task:1614) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/large_w_whamr/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/large_w_whamr/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:36,540 (abs_task:1615) INFO: [train] Batch sampler: NumElementsBatchSampler(N-batch=18232, batch_bins=32000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:36,544 (abs_task:1616) INFO: [train] mini-batch sizes summary: N-batch=18232, mean=107.4, min=25, max=446 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:36,804 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:36,836 (abs_task:1614) INFO: [valid] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:36,837 (abs_task:1615) INFO: [valid] Batch sampler: NumElementsBatchSampler(N-batch=28, batch_bins=32000000, sort_in_batch=descending, sort_batch=descending) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:36,837 (abs_task:1616) INFO: [valid] mini-batch sizes summary: N-batch=28, mean=178.6, min=76, max=290 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:36,848 (asr:461) INFO: Optional Data Names: ('text_spk2', 'text_spk3', 'text_spk4') +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:36,875 (abs_task:1614) INFO: [plot_att] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/wav.scp", "type": "kaldi_ark"} + text: {"path": "dump/raw/cv_mix_clean_reverb_max_16k/text", "type": "text"} + preprocess: ) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:36,875 (abs_task:1615) INFO: [plot_att] Batch sampler: UnsortedBatchSampler(N-batch=5000, batch_size=1, key_file=exp/asr_stats_raw_en_char/valid/speech_shape, +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:36,875 (abs_task:1616) INFO: [plot_att] mini-batch sizes summary: N-batch=3, mean=1.0, min=1, max=1 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:40,296 (trainer:159) INFO: The training was resumed using exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/checkpoint.pth +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2640 [0] NCCL INFO Bootstrap : Using eth0:10.177.74.212<0> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2640 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2640 [0] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2640 [0] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2640 [0] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2640 [0] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2640 [0] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2640 [0] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.212<0> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2640 [0] NCCL INFO Using network Socket +NCCL version 2.10.3+cuda11.6 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2641 [1] NCCL INFO Bootstrap : Using eth0:10.177.74.212<0> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2641 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2645 [5] NCCL INFO Bootstrap : Using eth0:10.177.74.212<0> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2647 [7] NCCL INFO Bootstrap : Using eth0:10.177.74.212<0> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2643 [3] NCCL INFO Bootstrap : Using eth0:10.177.74.212<0> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2646 [6] NCCL INFO Bootstrap : Using eth0:10.177.74.212<0> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2644 [4] NCCL INFO Bootstrap : Using eth0:10.177.74.212<0> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2645 [5] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2642 [2] NCCL INFO Bootstrap : Using eth0:10.177.74.212<0> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2647 [7] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2643 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2644 [4] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2646 [6] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2642 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2641 [1] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2641 [1] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2641 [1] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2641 [1] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2641 [1] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2641 [1] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.212<0> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2641 [1] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2643 [3] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2643 [3] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2643 [3] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2643 [3] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2643 [3] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2643 [3] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.212<0> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2643 [3] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2644 [4] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2644 [4] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2644 [4] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2644 [4] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2644 [4] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2644 [4] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.212<0> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2644 [4] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2645 [5] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2645 [5] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2645 [5] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2645 [5] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2645 [5] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2645 [5] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.212<0> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2645 [5] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2642 [2] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2642 [2] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2642 [2] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2642 [2] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2642 [2] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2642 [2] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.212<0> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2642 [2] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2646 [6] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2646 [6] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2646 [6] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2646 [6] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2646 [6] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2646 [6] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.212<0> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2646 [6] NCCL INFO Using network Socket + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2647 [7] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2647 [7] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_0 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2647 [7] misc/ibvwrap.cc:212 NCCL WARN Call to ibv_open_device failed + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2647 [7] transport/net_ib.cc:149 NCCL WARN NET/IB : Unable to open device mlx5_1 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2647 [7] NCCL INFO NET/IB : No device found. +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2647 [7] NCCL INFO NET/Socket : Using [0]eth0:10.177.74.212<0> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2647 [7] NCCL INFO Using network Socket +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2712 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2706 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2710 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2708 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2711 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2709 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2705 [0] NCCL INFO Channel 00/02 : 0 1 2 3 4 5 6 7 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2707 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2712 [7] NCCL INFO Setting affinity for GPU 7 to ff,ffc0000f,fffc0000 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2706 [1] NCCL INFO Setting affinity for GPU 1 to 07ffe0,0000fffe +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2710 [2] NCCL INFO Setting affinity for GPU 2 to 07ffe0,0000fffe +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2708 [4] NCCL INFO Setting affinity for GPU 4 to ff,ffc0000f,fffc0000 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2711 [6] NCCL INFO Setting affinity for GPU 6 to ff,ffc0000f,fffc0000 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2709 [5] NCCL INFO Setting affinity for GPU 5 to ff,ffc0000f,fffc0000 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2705 [0] NCCL INFO Channel 01/02 : 0 1 2 3 4 5 6 7 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2707 [3] NCCL INFO Setting affinity for GPU 3 to 07ffe0,0000fffe +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2705 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2705 [0] NCCL INFO Setting affinity for GPU 0 to 07ffe0,0000fffe +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2712 [7] NCCL INFO Channel 00 : 7[b5000] -> 0[3d000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2711 [6] NCCL INFO Channel 00 : 6[b4000] -> 7[b5000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2706 [1] NCCL INFO Channel 00 : 1[3e000] -> 2[40000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2709 [5] NCCL INFO Channel 00 : 5[b2000] -> 6[b4000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2712 [7] NCCL INFO Channel 01 : 7[b5000] -> 0[3d000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2710 [2] NCCL INFO Channel 00 : 2[40000] -> 3[41000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2711 [6] NCCL INFO Channel 01 : 6[b4000] -> 7[b5000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2707 [3] NCCL INFO Channel 00 : 3[41000] -> 4[b1000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2706 [1] NCCL INFO Channel 01 : 1[3e000] -> 2[40000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2709 [5] NCCL INFO Channel 01 : 5[b2000] -> 6[b4000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2710 [2] NCCL INFO Channel 01 : 2[40000] -> 3[41000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2707 [3] NCCL INFO Channel 01 : 3[41000] -> 4[b1000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2708 [4] NCCL INFO Channel 00 : 4[b1000] -> 5[b2000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2708 [4] NCCL INFO Channel 01 : 4[b1000] -> 5[b2000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2711 [6] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2711 [6] NCCL INFO Channel 00 : 6[b4000] -> 5[b2000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2711 [6] NCCL INFO Channel 01 : 6[b4000] -> 5[b2000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2710 [2] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2705 [0] NCCL INFO Channel 00 : 0[3d000] -> 1[3e000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2705 [0] NCCL INFO Channel 01 : 0[3d000] -> 1[3e000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2710 [2] NCCL INFO Channel 00 : 2[40000] -> 1[3e000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2710 [2] NCCL INFO Channel 01 : 2[40000] -> 1[3e000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2709 [5] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2707 [3] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2708 [4] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2709 [5] NCCL INFO Channel 00 : 5[b2000] -> 4[b1000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2709 [5] NCCL INFO Channel 01 : 5[b2000] -> 4[b1000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2708 [4] NCCL INFO Channel 00 : 4[b1000] -> 3[41000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2706 [1] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2712 [7] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2708 [4] NCCL INFO Channel 01 : 4[b1000] -> 3[41000] via direct shared memory +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2712 [7] NCCL INFO Channel 00 : 7[b5000] -> 6[b4000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2712 [7] NCCL INFO Channel 01 : 7[b5000] -> 6[b4000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2705 [0] NCCL INFO Connected all rings +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2706 [1] NCCL INFO Channel 00 : 1[3e000] -> 0[3d000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2712 [7] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2712 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2712 [7] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2709 [5] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2709 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2709 [5] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2706 [1] NCCL INFO Channel 01 : 1[3e000] -> 0[3d000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2711 [6] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2711 [6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2711 [6] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2705 [0] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2705 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2705 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2706 [1] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2706 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2706 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2707 [3] NCCL INFO Channel 00 : 3[41000] -> 2[40000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2707 [3] NCCL INFO Channel 01 : 3[41000] -> 2[40000] via P2P/IPC +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2710 [2] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2710 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2710 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2707 [3] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2707 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2707 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2708 [4] NCCL INFO Connected all trees +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2708 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 8/8/512 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2708 [4] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2708 [4] NCCL INFO comm 0x7f35c8003090 rank 4 nranks 8 cudaDev 4 busId b1000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2705 [0] NCCL INFO comm 0x7f6b04003090 rank 0 nranks 8 cudaDev 0 busId 3d000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2706 [1] NCCL INFO comm 0x7feb3c003090 rank 1 nranks 8 cudaDev 1 busId 3e000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2711 [6] NCCL INFO comm 0x7fc718003090 rank 6 nranks 8 cudaDev 6 busId b4000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2707 [3] NCCL INFO comm 0x7ff9b4003090 rank 3 nranks 8 cudaDev 3 busId 41000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2710 [2] NCCL INFO comm 0x7fc998003090 rank 2 nranks 8 cudaDev 2 busId 40000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2709 [5] NCCL INFO comm 0x7f2538003090 rank 5 nranks 8 cudaDev 5 busId b2000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2712 [7] NCCL INFO comm 0x7ff904003090 rank 7 nranks 8 cudaDev 7 busId b5000 - Init COMPLETE +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2640 [0] NCCL INFO Launch mode Parallel +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:30:44,374 (trainer:284) INFO: 24/60epoch started +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:33:14,424 (distributed:995) INFO: Reducer buckets have been rebuilt in this iteration. +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:43:35,882 (trainer:732) INFO: 24epoch:train:1-911batch: iter_time=0.001, forward_time=0.208, loss_att=49.241, acc=0.957, loss=49.241, backward_time=0.302, grad_norm=75.615, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.197e-04, train_time=3.390 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 19:53:55,244 (trainer:732) INFO: 24epoch:train:912-1822batch: iter_time=2.162e-04, forward_time=0.201, loss_att=47.615, acc=0.958, loss=47.615, backward_time=0.299, grad_norm=76.453, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.194e-04, train_time=2.719 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 20:04:16,226 (trainer:732) INFO: 24epoch:train:1823-2733batch: iter_time=2.210e-04, forward_time=0.201, loss_att=47.913, acc=0.958, loss=47.913, backward_time=0.300, grad_norm=74.016, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.190e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 20:14:36,865 (trainer:732) INFO: 24epoch:train:2734-3644batch: iter_time=2.233e-04, forward_time=0.201, loss_att=47.458, acc=0.958, loss=47.458, backward_time=0.300, grad_norm=75.771, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.187e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 20:24:57,875 (trainer:732) INFO: 24epoch:train:3645-4555batch: iter_time=2.248e-04, forward_time=0.201, loss_att=47.699, acc=0.958, loss=47.699, backward_time=0.299, grad_norm=74.378, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.184e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 20:35:18,726 (trainer:732) INFO: 24epoch:train:4556-5466batch: iter_time=2.282e-04, forward_time=0.201, loss_att=47.928, acc=0.958, loss=47.928, backward_time=0.300, grad_norm=74.393, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.180e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 20:45:41,346 (trainer:732) INFO: 24epoch:train:5467-6377batch: iter_time=2.235e-04, forward_time=0.201, loss_att=48.230, acc=0.958, loss=48.230, backward_time=0.301, grad_norm=72.966, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.177e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 20:56:01,686 (trainer:732) INFO: 24epoch:train:6378-7288batch: iter_time=2.339e-04, forward_time=0.201, loss_att=47.796, acc=0.958, loss=47.796, backward_time=0.299, grad_norm=72.242, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.174e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 21:06:23,397 (trainer:732) INFO: 24epoch:train:7289-8199batch: iter_time=2.190e-04, forward_time=0.202, loss_att=47.769, acc=0.959, loss=47.769, backward_time=0.301, grad_norm=73.783, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.170e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 21:16:43,740 (trainer:732) INFO: 24epoch:train:8200-9110batch: iter_time=2.209e-04, forward_time=0.201, loss_att=47.233, acc=0.959, loss=47.233, backward_time=0.300, grad_norm=74.974, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.167e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 21:27:06,531 (trainer:732) INFO: 24epoch:train:9111-10021batch: iter_time=2.265e-04, forward_time=0.201, loss_att=47.204, acc=0.959, loss=47.204, backward_time=0.301, grad_norm=76.912, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.164e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 21:37:26,457 (trainer:732) INFO: 24epoch:train:10022-10932batch: iter_time=2.198e-04, forward_time=0.200, loss_att=47.606, acc=0.958, loss=47.606, backward_time=0.299, grad_norm=78.802, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.160e-04, train_time=2.721 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 21:47:47,050 (trainer:732) INFO: 24epoch:train:10933-11843batch: iter_time=2.206e-04, forward_time=0.201, loss_att=47.995, acc=0.958, loss=47.995, backward_time=0.300, grad_norm=79.759, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.157e-04, train_time=2.725 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 21:58:08,083 (trainer:732) INFO: 24epoch:train:11844-12754batch: iter_time=2.239e-04, forward_time=0.201, loss_att=47.870, acc=0.958, loss=47.870, backward_time=0.300, grad_norm=72.980, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.154e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 22:08:29,041 (trainer:732) INFO: 24epoch:train:12755-13665batch: iter_time=2.246e-04, forward_time=0.201, loss_att=48.351, acc=0.958, loss=48.351, backward_time=0.300, grad_norm=77.274, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.150e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 22:18:50,113 (trainer:732) INFO: 24epoch:train:13666-14576batch: iter_time=2.274e-04, forward_time=0.201, loss_att=47.281, acc=0.959, loss=47.281, backward_time=0.300, grad_norm=79.033, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.147e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 22:29:13,213 (trainer:732) INFO: 24epoch:train:14577-15487batch: iter_time=2.331e-04, forward_time=0.201, loss_att=48.732, acc=0.958, loss=48.732, backward_time=0.301, grad_norm=73.815, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.144e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 22:39:37,225 (trainer:732) INFO: 24epoch:train:15488-16398batch: iter_time=2.243e-04, forward_time=0.202, loss_att=48.417, acc=0.958, loss=48.417, backward_time=0.301, grad_norm=81.518, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.140e-04, train_time=2.739 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 22:49:57,772 (trainer:732) INFO: 24epoch:train:16399-17309batch: iter_time=2.332e-04, forward_time=0.201, loss_att=48.045, acc=0.958, loss=48.045, backward_time=0.300, grad_norm=77.312, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.137e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 23:00:17,116 (trainer:732) INFO: 24epoch:train:17310-18220batch: iter_time=2.186e-04, forward_time=0.200, loss_att=47.383, acc=0.958, loss=47.383, backward_time=0.299, grad_norm=81.079, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.134e-04, train_time=2.718 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 23:05:40,753 (trainer:338) INFO: 24epoch results: [train] iter_time=2.856e-04, forward_time=0.201, loss_att=47.891, acc=0.958, loss=47.891, backward_time=0.300, grad_norm=76.160, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.165e-04, train_time=2.760, time=3 hours, 29 minutes and 47.23 seconds, total_count=851085, gpu_max_cached_mem_GB=30.096, [valid] loss_att=11.606, acc=0.979, cer=0.024, wer=0.096, loss=11.606, time=2 minutes and 54.78 seconds, total_count=1316, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 14.33 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 23:05:44,506 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 23:05:44,514 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/13epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 23:05:44,514 (trainer:272) INFO: 25/60epoch started. Estimated time to finish: 5 days, 9 hours and 5.04 seconds +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 23:18:02,285 (trainer:732) INFO: 25epoch:train:1-911batch: iter_time=0.002, forward_time=0.202, loss_att=46.730, acc=0.959, loss=46.730, backward_time=0.300, grad_norm=76.776, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.130e-04, train_time=3.241 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 23:28:24,873 (trainer:732) INFO: 25epoch:train:912-1822batch: iter_time=2.605e-04, forward_time=0.202, loss_att=47.699, acc=0.959, loss=47.699, backward_time=0.300, grad_norm=76.572, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.127e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 23:38:46,264 (trainer:732) INFO: 25epoch:train:1823-2733batch: iter_time=2.617e-04, forward_time=0.201, loss_att=46.695, acc=0.959, loss=46.695, backward_time=0.298, grad_norm=81.263, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.124e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 23:49:09,224 (trainer:732) INFO: 25epoch:train:2734-3644batch: iter_time=2.660e-04, forward_time=0.202, loss_att=47.153, acc=0.959, loss=47.153, backward_time=0.299, grad_norm=81.206, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.121e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-27 23:59:32,686 (trainer:732) INFO: 25epoch:train:3645-4555batch: iter_time=2.608e-04, forward_time=0.202, loss_att=47.260, acc=0.959, loss=47.260, backward_time=0.300, grad_norm=78.729, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.117e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 00:09:57,775 (trainer:732) INFO: 25epoch:train:4556-5466batch: iter_time=2.578e-04, forward_time=0.203, loss_att=47.164, acc=0.959, loss=47.164, backward_time=0.301, grad_norm=81.627, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.114e-04, train_time=2.744 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 00:20:17,800 (trainer:732) INFO: 25epoch:train:5467-6377batch: iter_time=2.560e-04, forward_time=0.201, loss_att=46.420, acc=0.959, loss=46.420, backward_time=0.298, grad_norm=74.388, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.111e-04, train_time=2.722 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 00:30:39,163 (trainer:732) INFO: 25epoch:train:6378-7288batch: iter_time=2.646e-04, forward_time=0.202, loss_att=46.335, acc=0.959, loss=46.335, backward_time=0.299, grad_norm=79.985, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.108e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 00:40:59,567 (trainer:732) INFO: 25epoch:train:7289-8199batch: iter_time=2.616e-04, forward_time=0.201, loss_att=46.952, acc=0.959, loss=46.952, backward_time=0.299, grad_norm=79.867, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.104e-04, train_time=2.725 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 00:51:24,701 (trainer:732) INFO: 25epoch:train:8200-9110batch: iter_time=2.788e-04, forward_time=0.203, loss_att=47.865, acc=0.958, loss=47.865, backward_time=0.300, grad_norm=81.353, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=6.101e-04, train_time=2.744 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 01:01:48,637 (trainer:732) INFO: 25epoch:train:9111-10021batch: iter_time=2.842e-04, forward_time=0.203, loss_att=47.668, acc=0.959, loss=47.668, backward_time=0.300, grad_norm=77.648, clip=100.000, loss_scale=1.000, optim_step_time=0.068, optim0_lr0=6.098e-04, train_time=2.739 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 01:12:13,466 (trainer:732) INFO: 25epoch:train:10022-10932batch: iter_time=2.789e-04, forward_time=0.203, loss_att=47.384, acc=0.959, loss=47.384, backward_time=0.300, grad_norm=76.808, clip=100.000, loss_scale=1.000, optim_step_time=0.068, optim0_lr0=6.095e-04, train_time=2.742 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 01:22:35,744 (trainer:732) INFO: 25epoch:train:10933-11843batch: iter_time=2.804e-04, forward_time=0.203, loss_att=47.798, acc=0.958, loss=47.798, backward_time=0.300, grad_norm=76.910, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=6.091e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 01:32:57,595 (trainer:732) INFO: 25epoch:train:11844-12754batch: iter_time=2.846e-04, forward_time=0.202, loss_att=47.108, acc=0.958, loss=47.108, backward_time=0.298, grad_norm=77.108, clip=100.000, loss_scale=1.000, optim_step_time=0.068, optim0_lr0=6.088e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 01:43:17,530 (trainer:732) INFO: 25epoch:train:12755-13665batch: iter_time=2.804e-04, forward_time=0.202, loss_att=45.812, acc=0.959, loss=45.812, backward_time=0.298, grad_norm=78.201, clip=100.000, loss_scale=1.000, optim_step_time=0.068, optim0_lr0=6.085e-04, train_time=2.722 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 01:53:41,681 (trainer:732) INFO: 25epoch:train:13666-14576batch: iter_time=2.879e-04, forward_time=0.203, loss_att=46.904, acc=0.959, loss=46.904, backward_time=0.301, grad_norm=74.974, clip=100.000, loss_scale=1.000, optim_step_time=0.068, optim0_lr0=6.082e-04, train_time=2.740 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 02:04:04,041 (trainer:732) INFO: 25epoch:train:14577-15487batch: iter_time=2.807e-04, forward_time=0.202, loss_att=47.099, acc=0.959, loss=47.099, backward_time=0.299, grad_norm=75.103, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=6.079e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 02:14:28,281 (trainer:732) INFO: 25epoch:train:15488-16398batch: iter_time=2.661e-04, forward_time=0.203, loss_att=46.581, acc=0.959, loss=46.581, backward_time=0.300, grad_norm=78.358, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.075e-04, train_time=2.740 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 02:24:49,830 (trainer:732) INFO: 25epoch:train:16399-17309batch: iter_time=2.697e-04, forward_time=0.202, loss_att=47.290, acc=0.959, loss=47.290, backward_time=0.299, grad_norm=76.199, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.072e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 02:35:12,042 (trainer:732) INFO: 25epoch:train:17310-18220batch: iter_time=2.567e-04, forward_time=0.202, loss_att=47.870, acc=0.958, loss=47.870, backward_time=0.300, grad_norm=84.652, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.069e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 02:40:27,112 (trainer:338) INFO: 25epoch results: [train] iter_time=3.577e-04, forward_time=0.202, loss_att=47.090, acc=0.959, loss=47.090, backward_time=0.299, grad_norm=78.393, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=6.100e-04, train_time=2.759, time=3 hours, 29 minutes and 41.96 seconds, total_count=869317, gpu_max_cached_mem_GB=30.096, [valid] loss_att=12.169, acc=0.978, cer=0.024, wer=0.095, loss=12.169, time=2 minutes and 56.69 seconds, total_count=1344, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 3.94 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 02:40:30,842 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 02:40:30,850 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/21epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 02:40:30,850 (trainer:272) INFO: 26/60epoch started. Estimated time to finish: 5 days, 5 hours and 21 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 02:52:47,475 (trainer:732) INFO: 26epoch:train:1-911batch: iter_time=0.001, forward_time=0.202, loss_att=46.595, acc=0.959, loss=46.595, backward_time=0.299, grad_norm=72.992, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.066e-04, train_time=3.236 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 03:03:12,274 (trainer:732) INFO: 26epoch:train:912-1822batch: iter_time=2.621e-04, forward_time=0.203, loss_att=47.306, acc=0.959, loss=47.306, backward_time=0.301, grad_norm=79.475, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.063e-04, train_time=2.743 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 03:13:33,129 (trainer:732) INFO: 26epoch:train:1823-2733batch: iter_time=2.647e-04, forward_time=0.202, loss_att=46.017, acc=0.959, loss=46.017, backward_time=0.299, grad_norm=77.770, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.059e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 03:23:54,691 (trainer:732) INFO: 26epoch:train:2734-3644batch: iter_time=2.662e-04, forward_time=0.202, loss_att=46.497, acc=0.959, loss=46.497, backward_time=0.299, grad_norm=80.729, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.056e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 03:34:18,147 (trainer:732) INFO: 26epoch:train:3645-4555batch: iter_time=2.606e-04, forward_time=0.202, loss_att=47.350, acc=0.959, loss=47.350, backward_time=0.300, grad_norm=75.488, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.053e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 03:44:37,584 (trainer:732) INFO: 26epoch:train:4556-5466batch: iter_time=2.592e-04, forward_time=0.201, loss_att=46.433, acc=0.959, loss=46.433, backward_time=0.298, grad_norm=71.034, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.050e-04, train_time=2.720 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 03:55:01,072 (trainer:732) INFO: 26epoch:train:5467-6377batch: iter_time=2.520e-04, forward_time=0.202, loss_att=47.241, acc=0.959, loss=47.241, backward_time=0.300, grad_norm=75.332, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.047e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 04:05:21,678 (trainer:732) INFO: 26epoch:train:6378-7288batch: iter_time=2.604e-04, forward_time=0.201, loss_att=46.716, acc=0.959, loss=46.716, backward_time=0.299, grad_norm=75.639, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.044e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 04:15:40,812 (trainer:732) INFO: 26epoch:train:7289-8199batch: iter_time=2.496e-04, forward_time=0.201, loss_att=45.922, acc=0.959, loss=45.922, backward_time=0.298, grad_norm=77.094, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.041e-04, train_time=2.719 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 04:26:02,484 (trainer:732) INFO: 26epoch:train:8200-9110batch: iter_time=2.603e-04, forward_time=0.202, loss_att=45.856, acc=0.960, loss=45.856, backward_time=0.299, grad_norm=83.316, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.037e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 04:36:26,976 (trainer:732) INFO: 26epoch:train:9111-10021batch: iter_time=2.580e-04, forward_time=0.202, loss_att=47.162, acc=0.959, loss=47.162, backward_time=0.300, grad_norm=79.606, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.034e-04, train_time=2.742 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 04:46:49,686 (trainer:732) INFO: 26epoch:train:10022-10932batch: iter_time=2.562e-04, forward_time=0.202, loss_att=46.293, acc=0.959, loss=46.293, backward_time=0.299, grad_norm=74.538, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.031e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 04:57:11,047 (trainer:732) INFO: 26epoch:train:10933-11843batch: iter_time=2.549e-04, forward_time=0.202, loss_att=46.001, acc=0.959, loss=46.001, backward_time=0.299, grad_norm=79.212, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.028e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 05:07:35,583 (trainer:732) INFO: 26epoch:train:11844-12754batch: iter_time=2.623e-04, forward_time=0.202, loss_att=46.269, acc=0.960, loss=46.269, backward_time=0.300, grad_norm=79.875, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.025e-04, train_time=2.741 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 05:17:58,657 (trainer:732) INFO: 26epoch:train:12755-13665batch: iter_time=2.565e-04, forward_time=0.202, loss_att=46.690, acc=0.959, loss=46.690, backward_time=0.300, grad_norm=97.207, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.022e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 05:28:20,020 (trainer:732) INFO: 26epoch:train:13666-14576batch: iter_time=2.608e-04, forward_time=0.202, loss_att=47.038, acc=0.959, loss=47.038, backward_time=0.299, grad_norm=76.123, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.019e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 05:38:41,205 (trainer:732) INFO: 26epoch:train:14577-15487batch: iter_time=2.613e-04, forward_time=0.202, loss_att=46.019, acc=0.960, loss=46.019, backward_time=0.299, grad_norm=78.306, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.016e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 05:49:03,064 (trainer:732) INFO: 26epoch:train:15488-16398batch: iter_time=2.600e-04, forward_time=0.202, loss_att=47.223, acc=0.959, loss=47.223, backward_time=0.299, grad_norm=79.264, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.013e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 05:59:25,440 (trainer:732) INFO: 26epoch:train:16399-17309batch: iter_time=2.593e-04, forward_time=0.202, loss_att=47.287, acc=0.959, loss=47.287, backward_time=0.300, grad_norm=77.102, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.009e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 06:09:50,307 (trainer:732) INFO: 26epoch:train:17310-18220batch: iter_time=2.541e-04, forward_time=0.202, loss_att=45.810, acc=0.960, loss=45.810, backward_time=0.300, grad_norm=79.004, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.006e-04, train_time=2.743 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 06:14:55,545 (trainer:338) INFO: 26epoch results: [train] iter_time=3.147e-04, forward_time=0.202, loss_att=46.588, acc=0.959, loss=46.588, backward_time=0.299, grad_norm=78.474, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=6.036e-04, train_time=2.757, time=3 hours, 29 minutes and 33.88 seconds, total_count=887549, gpu_max_cached_mem_GB=30.096, [valid] loss_att=11.244, acc=0.980, cer=0.024, wer=0.094, loss=11.244, time=2 minutes and 46.61 seconds, total_count=1372, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 4.2 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 06:14:59,670 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 06:14:59,679 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/15epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 06:14:59,680 (trainer:272) INFO: 27/60epoch started. Estimated time to finish: 5 days, 1 hour and 41 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 06:27:16,856 (trainer:732) INFO: 27epoch:train:1-911batch: iter_time=0.001, forward_time=0.202, loss_att=45.770, acc=0.960, loss=45.770, backward_time=0.299, grad_norm=85.465, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.003e-04, train_time=3.239 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.98<41244> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.98<41374> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.98<43517> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.98<43661> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.98<40100> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 151) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.98<40200> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 151) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 151) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 151) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 151) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 151) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 151) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 151) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.98<45323> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.98<45445> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.98<43416> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.98<43454> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.98<18740> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.98<38388> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.98<18864> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.98<38512> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 06:37:37,642 (trainer:732) INFO: 27epoch:train:912-1822batch: iter_time=2.662e-04, forward_time=0.202, loss_att=45.913, acc=0.959, loss=45.913, backward_time=0.299, grad_norm=81.202, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=6.000e-04, train_time=2.725 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.98<32304> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.98<32396> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 06:47:56,936 (trainer:732) INFO: 27epoch:train:1823-2733batch: iter_time=2.568e-04, forward_time=0.201, loss_att=45.800, acc=0.960, loss=45.800, backward_time=0.298, grad_norm=81.108, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.997e-04, train_time=2.719 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 06:58:18,953 (trainer:732) INFO: 27epoch:train:2734-3644batch: iter_time=2.552e-04, forward_time=0.202, loss_att=46.490, acc=0.960, loss=46.490, backward_time=0.300, grad_norm=78.290, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.994e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 07:08:40,956 (trainer:732) INFO: 27epoch:train:3645-4555batch: iter_time=2.652e-04, forward_time=0.202, loss_att=46.555, acc=0.960, loss=46.555, backward_time=0.300, grad_norm=79.378, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.991e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 07:19:02,490 (trainer:732) INFO: 27epoch:train:4556-5466batch: iter_time=2.548e-04, forward_time=0.202, loss_att=45.454, acc=0.960, loss=45.454, backward_time=0.299, grad_norm=73.785, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.988e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 07:29:25,063 (trainer:732) INFO: 27epoch:train:5467-6377batch: iter_time=2.506e-04, forward_time=0.203, loss_att=46.270, acc=0.960, loss=46.270, backward_time=0.300, grad_norm=81.269, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.985e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 07:39:49,607 (trainer:732) INFO: 27epoch:train:6378-7288batch: iter_time=2.546e-04, forward_time=0.203, loss_att=46.282, acc=0.960, loss=46.282, backward_time=0.301, grad_norm=80.710, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.982e-04, train_time=2.741 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 07:50:12,956 (trainer:732) INFO: 27epoch:train:7289-8199batch: iter_time=2.543e-04, forward_time=0.202, loss_att=46.566, acc=0.960, loss=46.566, backward_time=0.300, grad_norm=75.984, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.979e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 08:00:32,072 (trainer:732) INFO: 27epoch:train:8200-9110batch: iter_time=2.539e-04, forward_time=0.201, loss_att=45.885, acc=0.959, loss=45.885, backward_time=0.298, grad_norm=76.474, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.976e-04, train_time=2.718 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 08:10:54,507 (trainer:732) INFO: 27epoch:train:9111-10021batch: iter_time=2.455e-04, forward_time=0.202, loss_att=46.525, acc=0.959, loss=46.525, backward_time=0.299, grad_norm=87.308, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.973e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 08:21:18,377 (trainer:732) INFO: 27epoch:train:10022-10932batch: iter_time=2.480e-04, forward_time=0.203, loss_att=46.746, acc=0.960, loss=46.746, backward_time=0.301, grad_norm=74.064, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.970e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 08:31:39,947 (trainer:732) INFO: 27epoch:train:10933-11843batch: iter_time=2.532e-04, forward_time=0.202, loss_att=45.837, acc=0.960, loss=45.837, backward_time=0.299, grad_norm=78.680, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.967e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 08:42:03,086 (trainer:732) INFO: 27epoch:train:11844-12754batch: iter_time=2.521e-04, forward_time=0.202, loss_att=46.305, acc=0.960, loss=46.305, backward_time=0.300, grad_norm=79.046, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.964e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 08:52:23,968 (trainer:732) INFO: 27epoch:train:12755-13665batch: iter_time=2.570e-04, forward_time=0.201, loss_att=46.099, acc=0.959, loss=46.099, backward_time=0.298, grad_norm=76.570, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.961e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 09:02:45,560 (trainer:732) INFO: 27epoch:train:13666-14576batch: iter_time=2.531e-04, forward_time=0.202, loss_att=46.552, acc=0.959, loss=46.552, backward_time=0.299, grad_norm=75.627, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.958e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 09:13:08,370 (trainer:732) INFO: 27epoch:train:14577-15487batch: iter_time=2.547e-04, forward_time=0.203, loss_att=46.544, acc=0.960, loss=46.544, backward_time=0.300, grad_norm=81.363, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.955e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 09:23:30,766 (trainer:732) INFO: 27epoch:train:15488-16398batch: iter_time=2.563e-04, forward_time=0.202, loss_att=45.301, acc=0.960, loss=45.301, backward_time=0.299, grad_norm=79.111, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.952e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 09:33:50,619 (trainer:732) INFO: 27epoch:train:16399-17309batch: iter_time=2.524e-04, forward_time=0.201, loss_att=46.623, acc=0.959, loss=46.623, backward_time=0.298, grad_norm=74.500, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.949e-04, train_time=2.721 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 09:44:12,803 (trainer:732) INFO: 27epoch:train:17310-18220batch: iter_time=2.578e-04, forward_time=0.202, loss_att=46.515, acc=0.959, loss=46.515, backward_time=0.299, grad_norm=80.647, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.946e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 09:51:39,093 (trainer:338) INFO: 27epoch results: [train] iter_time=3.095e-04, forward_time=0.202, loss_att=46.201, acc=0.960, loss=46.201, backward_time=0.299, grad_norm=79.035, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.974e-04, train_time=2.756, time=3 hours, 29 minutes and 26.8 seconds, total_count=905781, gpu_max_cached_mem_GB=30.096, [valid] loss_att=11.058, acc=0.980, cer=0.024, wer=0.094, loss=11.058, time=3 minutes and 4.75 seconds, total_count=1400, gpu_max_cached_mem_GB=30.096, [att_plot] time=4 minutes and 7.85 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 09:51:46,903 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 09:51:46,920 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/16epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 09:51:46,921 (trainer:272) INFO: 28/60epoch started. Estimated time to finish: 4 days, 22 hours and 23 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 10:06:44,734 (trainer:732) INFO: 28epoch:train:1-911batch: iter_time=0.002, forward_time=0.215, loss_att=44.720, acc=0.961, loss=44.720, backward_time=0.304, grad_norm=78.177, clip=100.000, loss_scale=1.000, optim_step_time=0.105, optim0_lr0=5.943e-04, train_time=3.945 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 10:17:55,165 (trainer:732) INFO: 28epoch:train:912-1822batch: iter_time=6.482e-04, forward_time=0.220, loss_att=45.239, acc=0.960, loss=45.239, backward_time=0.306, grad_norm=75.028, clip=100.000, loss_scale=1.000, optim_step_time=0.123, optim0_lr0=5.940e-04, train_time=2.942 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 10:29:01,456 (trainer:732) INFO: 28epoch:train:1823-2733batch: iter_time=6.253e-04, forward_time=0.219, loss_att=44.995, acc=0.960, loss=44.995, backward_time=0.305, grad_norm=76.783, clip=100.000, loss_scale=1.000, optim_step_time=0.119, optim0_lr0=5.937e-04, train_time=2.925 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 10:39:57,208 (trainer:732) INFO: 28epoch:train:2734-3644batch: iter_time=5.138e-04, forward_time=0.216, loss_att=45.151, acc=0.961, loss=45.151, backward_time=0.304, grad_norm=76.000, clip=100.000, loss_scale=1.000, optim_step_time=0.102, optim0_lr0=5.934e-04, train_time=2.878 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 10:51:07,034 (trainer:732) INFO: 28epoch:train:3645-4555batch: iter_time=6.647e-04, forward_time=0.221, loss_att=45.839, acc=0.960, loss=45.839, backward_time=0.306, grad_norm=75.850, clip=100.000, loss_scale=1.000, optim_step_time=0.126, optim0_lr0=5.931e-04, train_time=2.941 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 11:01:56,618 (trainer:732) INFO: 28epoch:train:4556-5466batch: iter_time=4.728e-04, forward_time=0.213, loss_att=46.377, acc=0.960, loss=46.377, backward_time=0.304, grad_norm=79.755, clip=100.000, loss_scale=1.000, optim_step_time=0.098, optim0_lr0=5.928e-04, train_time=2.852 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 11:12:22,237 (trainer:732) INFO: 28epoch:train:5467-6377batch: iter_time=2.623e-04, forward_time=0.203, loss_att=46.496, acc=0.960, loss=46.496, backward_time=0.301, grad_norm=84.650, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.925e-04, train_time=2.747 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 11:22:45,244 (trainer:732) INFO: 28epoch:train:6378-7288batch: iter_time=2.572e-04, forward_time=0.202, loss_att=45.594, acc=0.960, loss=45.594, backward_time=0.300, grad_norm=74.864, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.922e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 11:33:07,967 (trainer:732) INFO: 28epoch:train:7289-8199batch: iter_time=2.648e-04, forward_time=0.202, loss_att=45.736, acc=0.960, loss=45.736, backward_time=0.299, grad_norm=81.419, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.919e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 11:43:29,601 (trainer:732) INFO: 28epoch:train:8200-9110batch: iter_time=2.678e-04, forward_time=0.202, loss_att=46.493, acc=0.959, loss=46.493, backward_time=0.299, grad_norm=79.833, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.916e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 11:53:52,590 (trainer:732) INFO: 28epoch:train:9111-10021batch: iter_time=2.611e-04, forward_time=0.202, loss_att=45.737, acc=0.960, loss=45.737, backward_time=0.300, grad_norm=77.716, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.913e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 12:04:15,886 (trainer:732) INFO: 28epoch:train:10022-10932batch: iter_time=2.599e-04, forward_time=0.202, loss_att=45.571, acc=0.960, loss=45.571, backward_time=0.300, grad_norm=74.158, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.910e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 12:14:33,949 (trainer:732) INFO: 28epoch:train:10933-11843batch: iter_time=2.620e-04, forward_time=0.201, loss_att=45.658, acc=0.959, loss=45.658, backward_time=0.297, grad_norm=80.214, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.907e-04, train_time=2.714 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 12:24:56,231 (trainer:732) INFO: 28epoch:train:11844-12754batch: iter_time=2.586e-04, forward_time=0.202, loss_att=45.728, acc=0.960, loss=45.728, backward_time=0.300, grad_norm=83.645, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.904e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 12:35:18,496 (trainer:732) INFO: 28epoch:train:12755-13665batch: iter_time=2.538e-04, forward_time=0.202, loss_att=46.018, acc=0.959, loss=46.018, backward_time=0.299, grad_norm=78.476, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.901e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 12:45:39,568 (trainer:732) INFO: 28epoch:train:13666-14576batch: iter_time=2.642e-04, forward_time=0.202, loss_att=46.528, acc=0.959, loss=46.528, backward_time=0.299, grad_norm=78.113, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.898e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 12:56:01,244 (trainer:732) INFO: 28epoch:train:14577-15487batch: iter_time=2.666e-04, forward_time=0.202, loss_att=45.984, acc=0.960, loss=45.984, backward_time=0.299, grad_norm=77.144, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.895e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 13:06:21,311 (trainer:732) INFO: 28epoch:train:15488-16398batch: iter_time=2.613e-04, forward_time=0.201, loss_att=46.273, acc=0.960, loss=46.273, backward_time=0.298, grad_norm=80.623, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.892e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 13:16:43,595 (trainer:732) INFO: 28epoch:train:16399-17309batch: iter_time=2.626e-04, forward_time=0.202, loss_att=45.983, acc=0.960, loss=45.983, backward_time=0.300, grad_norm=88.783, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.889e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 13:27:04,205 (trainer:732) INFO: 28epoch:train:17310-18220batch: iter_time=2.641e-04, forward_time=0.202, loss_att=45.024, acc=0.960, loss=45.024, backward_time=0.299, grad_norm=80.160, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.887e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 13:32:02,996 (trainer:338) INFO: 28epoch results: [train] iter_time=4.209e-04, forward_time=0.206, loss_att=45.763, acc=0.960, loss=45.763, backward_time=0.301, grad_norm=79.097, clip=100.000, loss_scale=1.000, optim_step_time=0.079, optim0_lr0=5.914e-04, train_time=2.835, time=3 hours, 35 minutes and 29.21 seconds, total_count=924013, gpu_max_cached_mem_GB=30.096, [valid] loss_att=11.029, acc=0.980, cer=0.024, wer=0.093, loss=11.029, time=2 minutes and 41.93 seconds, total_count=1428, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 4.93 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 13:32:07,184 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 13:32:07,195 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/17epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 13:32:07,195 (trainer:272) INFO: 29/60epoch started. Estimated time to finish: 4 days, 19 hours and 20 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 13:44:26,886 (trainer:732) INFO: 29epoch:train:1-911batch: iter_time=0.002, forward_time=0.203, loss_att=45.414, acc=0.961, loss=45.414, backward_time=0.301, grad_norm=75.951, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.884e-04, train_time=3.249 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 13:54:47,328 (trainer:732) INFO: 29epoch:train:912-1822batch: iter_time=2.696e-04, forward_time=0.202, loss_att=44.722, acc=0.961, loss=44.722, backward_time=0.299, grad_norm=85.447, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.881e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 14:05:09,367 (trainer:732) INFO: 29epoch:train:1823-2733batch: iter_time=2.607e-04, forward_time=0.202, loss_att=44.991, acc=0.960, loss=44.991, backward_time=0.299, grad_norm=75.658, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.878e-04, train_time=2.731 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 14:15:31,995 (trainer:732) INFO: 29epoch:train:2734-3644batch: iter_time=2.604e-04, forward_time=0.202, loss_att=45.703, acc=0.960, loss=45.703, backward_time=0.300, grad_norm=75.230, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.875e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 14:25:53,577 (trainer:732) INFO: 29epoch:train:3645-4555batch: iter_time=2.639e-04, forward_time=0.202, loss_att=44.759, acc=0.960, loss=44.759, backward_time=0.299, grad_norm=82.497, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.872e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 14:36:14,040 (trainer:732) INFO: 29epoch:train:4556-5466batch: iter_time=2.633e-04, forward_time=0.202, loss_att=45.456, acc=0.960, loss=45.456, backward_time=0.299, grad_norm=83.511, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.869e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 14:46:37,432 (trainer:732) INFO: 29epoch:train:5467-6377batch: iter_time=2.706e-04, forward_time=0.202, loss_att=45.278, acc=0.961, loss=45.278, backward_time=0.300, grad_norm=81.000, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.866e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 14:57:00,544 (trainer:732) INFO: 29epoch:train:6378-7288batch: iter_time=2.575e-04, forward_time=0.202, loss_att=44.661, acc=0.961, loss=44.661, backward_time=0.300, grad_norm=77.294, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.863e-04, train_time=2.735 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 15:07:23,700 (trainer:732) INFO: 29epoch:train:7289-8199batch: iter_time=2.659e-04, forward_time=0.203, loss_att=46.103, acc=0.960, loss=46.103, backward_time=0.300, grad_norm=78.319, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.861e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 15:17:44,196 (trainer:732) INFO: 29epoch:train:8200-9110batch: iter_time=2.590e-04, forward_time=0.201, loss_att=45.626, acc=0.960, loss=45.626, backward_time=0.298, grad_norm=83.578, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.858e-04, train_time=2.724 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 15:28:04,800 (trainer:732) INFO: 29epoch:train:9111-10021batch: iter_time=2.623e-04, forward_time=0.202, loss_att=44.778, acc=0.960, loss=44.778, backward_time=0.299, grad_norm=79.472, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.855e-04, train_time=2.725 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 151) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 151) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 15:38:24,796 (trainer:732) INFO: 29epoch:train:10022-10932batch: iter_time=2.748e-04, forward_time=0.201, loss_att=45.512, acc=0.960, loss=45.512, backward_time=0.298, grad_norm=75.959, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.852e-04, train_time=2.721 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 15:48:48,632 (trainer:732) INFO: 29epoch:train:10933-11843batch: iter_time=2.658e-04, forward_time=0.202, loss_att=46.565, acc=0.960, loss=46.565, backward_time=0.300, grad_norm=77.387, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.849e-04, train_time=2.739 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 15:59:11,221 (trainer:732) INFO: 29epoch:train:11844-12754batch: iter_time=2.677e-04, forward_time=0.202, loss_att=45.457, acc=0.960, loss=45.457, backward_time=0.300, grad_norm=84.073, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.846e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 16:09:32,282 (trainer:732) INFO: 29epoch:train:12755-13665batch: iter_time=2.670e-04, forward_time=0.202, loss_att=45.219, acc=0.960, loss=45.219, backward_time=0.299, grad_norm=78.900, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.843e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 16:19:54,500 (trainer:732) INFO: 29epoch:train:13666-14576batch: iter_time=2.653e-04, forward_time=0.202, loss_att=45.217, acc=0.960, loss=45.217, backward_time=0.299, grad_norm=80.779, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.841e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 16:30:17,331 (trainer:732) INFO: 29epoch:train:14577-15487batch: iter_time=2.709e-04, forward_time=0.202, loss_att=44.722, acc=0.960, loss=44.722, backward_time=0.299, grad_norm=83.767, clip=100.000, loss_scale=1.000, optim_step_time=0.067, optim0_lr0=5.838e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 16:40:44,720 (trainer:732) INFO: 29epoch:train:15488-16398batch: iter_time=3.242e-04, forward_time=0.203, loss_att=46.387, acc=0.960, loss=46.387, backward_time=0.301, grad_norm=77.962, clip=100.000, loss_scale=1.000, optim_step_time=0.069, optim0_lr0=5.835e-04, train_time=2.754 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 16:51:54,008 (trainer:732) INFO: 29epoch:train:16399-17309batch: iter_time=6.294e-04, forward_time=0.220, loss_att=46.002, acc=0.960, loss=46.002, backward_time=0.306, grad_norm=87.453, clip=100.000, loss_scale=1.000, optim_step_time=0.123, optim0_lr0=5.832e-04, train_time=2.938 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 17:02:43,993 (trainer:732) INFO: 29epoch:train:17310-18220batch: iter_time=4.960e-04, forward_time=0.213, loss_att=45.845, acc=0.960, loss=45.845, backward_time=0.302, grad_norm=84.280, clip=100.000, loss_scale=1.000, optim_step_time=0.099, optim0_lr0=5.829e-04, train_time=2.853 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 17:09:36,965 (trainer:338) INFO: 29epoch results: [train] iter_time=3.792e-04, forward_time=0.203, loss_att=45.422, acc=0.960, loss=45.422, backward_time=0.300, grad_norm=80.430, clip=100.000, loss_scale=1.000, optim_step_time=0.069, optim0_lr0=5.856e-04, train_time=2.774, time=3 hours, 30 minutes and 49.62 seconds, total_count=942245, gpu_max_cached_mem_GB=30.096, [valid] loss_att=11.344, acc=0.980, cer=0.023, wer=0.092, loss=11.344, time=2 minutes and 31.3 seconds, total_count=1456, gpu_max_cached_mem_GB=30.096, [att_plot] time=4 minutes and 8.85 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 17:09:45,693 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 17:09:45,711 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/19epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 17:09:45,712 (trainer:272) INFO: 30/60epoch started. Estimated time to finish: 4 days, 15 hours and 51 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 17:25:02,419 (trainer:732) INFO: 30epoch:train:1-911batch: iter_time=0.002, forward_time=0.220, loss_att=44.594, acc=0.961, loss=44.594, backward_time=0.305, grad_norm=82.109, clip=100.000, loss_scale=1.000, optim_step_time=0.121, optim0_lr0=5.826e-04, train_time=4.029 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 17:35:48,391 (trainer:732) INFO: 30epoch:train:912-1822batch: iter_time=4.534e-04, forward_time=0.211, loss_att=44.685, acc=0.960, loss=44.685, backward_time=0.303, grad_norm=73.594, clip=100.000, loss_scale=1.000, optim_step_time=0.094, optim0_lr0=5.824e-04, train_time=2.835 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 17:46:58,268 (trainer:732) INFO: 30epoch:train:1823-2733batch: iter_time=6.182e-04, forward_time=0.219, loss_att=45.247, acc=0.960, loss=45.247, backward_time=0.306, grad_norm=80.700, clip=100.000, loss_scale=1.000, optim_step_time=0.119, optim0_lr0=5.821e-04, train_time=2.940 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 17:57:59,801 (trainer:732) INFO: 30epoch:train:2734-3644batch: iter_time=5.457e-04, forward_time=0.214, loss_att=44.361, acc=0.961, loss=44.361, backward_time=0.304, grad_norm=78.171, clip=100.000, loss_scale=1.000, optim_step_time=0.108, optim0_lr0=5.818e-04, train_time=2.904 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 18:08:20,845 (trainer:732) INFO: 30epoch:train:3645-4555batch: iter_time=2.699e-04, forward_time=0.202, loss_att=45.656, acc=0.960, loss=45.656, backward_time=0.299, grad_norm=75.545, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.815e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 18:18:45,895 (trainer:732) INFO: 30epoch:train:4556-5466batch: iter_time=2.810e-04, forward_time=0.204, loss_att=44.920, acc=0.961, loss=44.920, backward_time=0.300, grad_norm=82.797, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.812e-04, train_time=2.744 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 18:29:09,785 (trainer:732) INFO: 30epoch:train:5467-6377batch: iter_time=2.557e-04, forward_time=0.202, loss_att=45.238, acc=0.961, loss=45.238, backward_time=0.300, grad_norm=85.489, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.810e-04, train_time=2.739 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 18:39:32,960 (trainer:732) INFO: 30epoch:train:6378-7288batch: iter_time=2.539e-04, forward_time=0.202, loss_att=45.093, acc=0.961, loss=45.093, backward_time=0.300, grad_norm=78.080, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.807e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 18:49:54,642 (trainer:732) INFO: 30epoch:train:7289-8199batch: iter_time=2.603e-04, forward_time=0.202, loss_att=45.368, acc=0.960, loss=45.368, backward_time=0.299, grad_norm=74.532, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.804e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 19:00:18,739 (trainer:732) INFO: 30epoch:train:8200-9110batch: iter_time=2.616e-04, forward_time=0.203, loss_att=45.581, acc=0.961, loss=45.581, backward_time=0.301, grad_norm=75.012, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.801e-04, train_time=2.740 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 19:10:40,903 (trainer:732) INFO: 30epoch:train:9111-10021batch: iter_time=2.504e-04, forward_time=0.202, loss_att=45.628, acc=0.960, loss=45.628, backward_time=0.299, grad_norm=76.315, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.798e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 19:21:03,394 (trainer:732) INFO: 30epoch:train:10022-10932batch: iter_time=2.540e-04, forward_time=0.202, loss_att=45.460, acc=0.960, loss=45.460, backward_time=0.300, grad_norm=74.749, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.796e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 19:31:26,008 (trainer:732) INFO: 30epoch:train:10933-11843batch: iter_time=2.545e-04, forward_time=0.202, loss_att=44.870, acc=0.961, loss=44.870, backward_time=0.300, grad_norm=80.802, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.793e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 19:41:45,600 (trainer:732) INFO: 30epoch:train:11844-12754batch: iter_time=2.535e-04, forward_time=0.201, loss_att=45.084, acc=0.960, loss=45.084, backward_time=0.298, grad_norm=77.105, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.790e-04, train_time=2.720 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 19:52:08,237 (trainer:732) INFO: 30epoch:train:12755-13665batch: iter_time=2.529e-04, forward_time=0.203, loss_att=45.044, acc=0.961, loss=45.044, backward_time=0.300, grad_norm=78.164, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.787e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 20:02:31,196 (trainer:732) INFO: 30epoch:train:13666-14576batch: iter_time=2.538e-04, forward_time=0.202, loss_att=44.739, acc=0.961, loss=44.739, backward_time=0.300, grad_norm=81.478, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.785e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 20:12:54,054 (trainer:732) INFO: 30epoch:train:14577-15487batch: iter_time=2.539e-04, forward_time=0.202, loss_att=45.377, acc=0.960, loss=45.377, backward_time=0.300, grad_norm=76.762, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.782e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 20:23:13,953 (trainer:732) INFO: 30epoch:train:15488-16398batch: iter_time=2.521e-04, forward_time=0.201, loss_att=44.843, acc=0.960, loss=44.843, backward_time=0.298, grad_norm=75.628, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.779e-04, train_time=2.721 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 20:33:34,262 (trainer:732) INFO: 30epoch:train:16399-17309batch: iter_time=2.460e-04, forward_time=0.202, loss_att=45.656, acc=0.960, loss=45.656, backward_time=0.299, grad_norm=79.158, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.776e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 20:43:56,563 (trainer:732) INFO: 30epoch:train:17310-18220batch: iter_time=2.624e-04, forward_time=0.202, loss_att=44.650, acc=0.961, loss=44.650, backward_time=0.299, grad_norm=77.373, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.774e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 20:48:59,821 (trainer:338) INFO: 30epoch results: [train] iter_time=3.891e-04, forward_time=0.205, loss_att=45.107, acc=0.961, loss=45.107, backward_time=0.300, grad_norm=78.226, clip=100.000, loss_scale=1.000, optim_step_time=0.074, optim0_lr0=5.800e-04, train_time=2.821, time=3 hours, 34 minutes and 28.44 seconds, total_count=960477, gpu_max_cached_mem_GB=30.096, [valid] loss_att=11.311, acc=0.980, cer=0.024, wer=0.092, loss=11.311, time=2 minutes and 41.01 seconds, total_count=1484, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 4.66 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 20:49:03,464 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 20:49:03,476 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/23epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 20:49:03,476 (trainer:272) INFO: 31/60epoch started. Estimated time to finish: 4 days, 12 hours and 27 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 21:01:22,424 (trainer:732) INFO: 31epoch:train:1-911batch: iter_time=0.001, forward_time=0.202, loss_att=44.113, acc=0.961, loss=44.113, backward_time=0.299, grad_norm=80.522, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.771e-04, train_time=3.247 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 21:11:44,019 (trainer:732) INFO: 31epoch:train:912-1822batch: iter_time=2.709e-04, forward_time=0.202, loss_att=45.230, acc=0.961, loss=45.230, backward_time=0.299, grad_norm=78.652, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.768e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 21:22:07,581 (trainer:732) INFO: 31epoch:train:1823-2733batch: iter_time=2.610e-04, forward_time=0.203, loss_att=45.045, acc=0.961, loss=45.045, backward_time=0.301, grad_norm=79.221, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.765e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 21:32:28,303 (trainer:732) INFO: 31epoch:train:2734-3644batch: iter_time=2.677e-04, forward_time=0.201, loss_att=44.156, acc=0.961, loss=44.156, backward_time=0.298, grad_norm=82.592, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.763e-04, train_time=2.725 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 21:42:49,742 (trainer:732) INFO: 31epoch:train:3645-4555batch: iter_time=2.634e-04, forward_time=0.202, loss_att=45.050, acc=0.960, loss=45.050, backward_time=0.299, grad_norm=89.261, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.760e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 21:53:14,551 (trainer:732) INFO: 31epoch:train:4556-5466batch: iter_time=2.641e-04, forward_time=0.203, loss_att=44.658, acc=0.961, loss=44.658, backward_time=0.301, grad_norm=82.842, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.757e-04, train_time=2.744 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 22:03:36,722 (trainer:732) INFO: 31epoch:train:5467-6377batch: iter_time=2.640e-04, forward_time=0.202, loss_att=44.524, acc=0.961, loss=44.524, backward_time=0.299, grad_norm=90.758, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.755e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 22:13:58,829 (trainer:732) INFO: 31epoch:train:6378-7288batch: iter_time=2.572e-04, forward_time=0.202, loss_att=45.448, acc=0.961, loss=45.448, backward_time=0.299, grad_norm=79.933, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.752e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 22:24:20,289 (trainer:732) INFO: 31epoch:train:7289-8199batch: iter_time=2.634e-04, forward_time=0.202, loss_att=43.504, acc=0.961, loss=43.504, backward_time=0.299, grad_norm=78.664, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.749e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 22:34:42,113 (trainer:732) INFO: 31epoch:train:8200-9110batch: iter_time=2.708e-04, forward_time=0.202, loss_att=45.778, acc=0.960, loss=45.778, backward_time=0.299, grad_norm=81.793, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.746e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 22:45:05,103 (trainer:732) INFO: 31epoch:train:9111-10021batch: iter_time=2.672e-04, forward_time=0.202, loss_att=43.824, acc=0.961, loss=43.824, backward_time=0.300, grad_norm=80.229, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.744e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 22:55:28,969 (trainer:732) INFO: 31epoch:train:10022-10932batch: iter_time=2.637e-04, forward_time=0.202, loss_att=44.657, acc=0.961, loss=44.657, backward_time=0.300, grad_norm=79.450, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.741e-04, train_time=2.739 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 23:05:51,192 (trainer:732) INFO: 31epoch:train:10933-11843batch: iter_time=2.580e-04, forward_time=0.202, loss_att=44.808, acc=0.961, loss=44.808, backward_time=0.299, grad_norm=79.248, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.738e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 23:16:11,078 (trainer:732) INFO: 31epoch:train:11844-12754batch: iter_time=2.666e-04, forward_time=0.201, loss_att=45.432, acc=0.960, loss=45.432, backward_time=0.298, grad_norm=83.642, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.736e-04, train_time=2.721 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 23:26:33,385 (trainer:732) INFO: 31epoch:train:12755-13665batch: iter_time=2.613e-04, forward_time=0.202, loss_att=44.933, acc=0.961, loss=44.933, backward_time=0.299, grad_norm=79.027, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.733e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 23:36:56,489 (trainer:732) INFO: 31epoch:train:13666-14576batch: iter_time=2.661e-04, forward_time=0.202, loss_att=44.291, acc=0.961, loss=44.291, backward_time=0.299, grad_norm=77.256, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.730e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 23:47:18,201 (trainer:732) INFO: 31epoch:train:14577-15487batch: iter_time=2.609e-04, forward_time=0.202, loss_att=44.410, acc=0.961, loss=44.410, backward_time=0.299, grad_norm=77.673, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.728e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-28 23:57:41,316 (trainer:732) INFO: 31epoch:train:15488-16398batch: iter_time=2.676e-04, forward_time=0.202, loss_att=44.982, acc=0.961, loss=44.982, backward_time=0.300, grad_norm=79.614, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.725e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 00:08:03,259 (trainer:732) INFO: 31epoch:train:16399-17309batch: iter_time=2.677e-04, forward_time=0.202, loss_att=45.461, acc=0.960, loss=45.461, backward_time=0.299, grad_norm=84.525, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.722e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 00:18:26,184 (trainer:732) INFO: 31epoch:train:17310-18220batch: iter_time=2.737e-04, forward_time=0.202, loss_att=45.427, acc=0.960, loss=45.427, backward_time=0.300, grad_norm=82.629, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.720e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 00:23:45,397 (trainer:338) INFO: 31epoch results: [train] iter_time=3.223e-04, forward_time=0.202, loss_att=44.784, acc=0.961, loss=44.784, backward_time=0.299, grad_norm=81.378, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.745e-04, train_time=2.758, time=3 hours, 29 minutes and 37.26 seconds, total_count=978709, gpu_max_cached_mem_GB=30.096, [valid] loss_att=10.935, acc=0.981, cer=0.023, wer=0.092, loss=10.935, time=2 minutes and 57.39 seconds, total_count=1512, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 7.26 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 00:23:49,891 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 00:23:49,903 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/18epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 00:23:49,903 (trainer:272) INFO: 32/60epoch started. Estimated time to finish: 4 days, 8 hours and 42 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 00:36:09,569 (trainer:732) INFO: 32epoch:train:1-911batch: iter_time=0.002, forward_time=0.202, loss_att=43.629, acc=0.962, loss=43.629, backward_time=0.299, grad_norm=83.109, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.717e-04, train_time=3.250 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 00:46:30,792 (trainer:732) INFO: 32epoch:train:912-1822batch: iter_time=2.592e-04, forward_time=0.201, loss_att=43.990, acc=0.961, loss=43.990, backward_time=0.299, grad_norm=81.253, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.714e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 00:56:52,740 (trainer:732) INFO: 32epoch:train:1823-2733batch: iter_time=2.598e-04, forward_time=0.202, loss_att=43.678, acc=0.961, loss=43.678, backward_time=0.299, grad_norm=75.134, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.712e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 01:07:14,063 (trainer:732) INFO: 32epoch:train:2734-3644batch: iter_time=2.733e-04, forward_time=0.201, loss_att=44.099, acc=0.961, loss=44.099, backward_time=0.298, grad_norm=81.536, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.709e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 01:17:37,372 (trainer:732) INFO: 32epoch:train:3645-4555batch: iter_time=2.569e-04, forward_time=0.202, loss_att=44.107, acc=0.962, loss=44.107, backward_time=0.300, grad_norm=85.233, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.706e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 01:27:58,914 (trainer:732) INFO: 32epoch:train:4556-5466batch: iter_time=2.611e-04, forward_time=0.202, loss_att=44.494, acc=0.961, loss=44.494, backward_time=0.299, grad_norm=76.426, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.704e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 01:38:22,390 (trainer:732) INFO: 32epoch:train:5467-6377batch: iter_time=2.550e-04, forward_time=0.203, loss_att=45.130, acc=0.961, loss=45.130, backward_time=0.300, grad_norm=77.881, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.701e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 01:48:40,433 (trainer:732) INFO: 32epoch:train:6378-7288batch: iter_time=2.624e-04, forward_time=0.200, loss_att=44.583, acc=0.960, loss=44.583, backward_time=0.297, grad_norm=80.952, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.698e-04, train_time=2.713 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 01:59:02,970 (trainer:732) INFO: 32epoch:train:7289-8199batch: iter_time=2.610e-04, forward_time=0.202, loss_att=44.584, acc=0.961, loss=44.584, backward_time=0.299, grad_norm=91.329, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.696e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 02:09:23,869 (trainer:732) INFO: 32epoch:train:8200-9110batch: iter_time=2.628e-04, forward_time=0.202, loss_att=45.316, acc=0.960, loss=45.316, backward_time=0.299, grad_norm=87.890, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.693e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 02:19:47,096 (trainer:732) INFO: 32epoch:train:9111-10021batch: iter_time=2.613e-04, forward_time=0.202, loss_att=44.715, acc=0.961, loss=44.715, backward_time=0.299, grad_norm=81.711, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.690e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 02:30:12,402 (trainer:732) INFO: 32epoch:train:10022-10932batch: iter_time=2.619e-04, forward_time=0.203, loss_att=44.608, acc=0.961, loss=44.608, backward_time=0.301, grad_norm=74.624, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.688e-04, train_time=2.745 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 02:40:34,697 (trainer:732) INFO: 32epoch:train:10933-11843batch: iter_time=2.546e-04, forward_time=0.202, loss_att=45.350, acc=0.961, loss=45.350, backward_time=0.299, grad_norm=81.872, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.685e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 02:50:57,952 (trainer:732) INFO: 32epoch:train:11844-12754batch: iter_time=2.610e-04, forward_time=0.202, loss_att=44.673, acc=0.961, loss=44.673, backward_time=0.300, grad_norm=79.064, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.683e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 03:01:18,538 (trainer:732) INFO: 32epoch:train:12755-13665batch: iter_time=2.568e-04, forward_time=0.201, loss_att=44.275, acc=0.961, loss=44.275, backward_time=0.298, grad_norm=82.860, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.680e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 03:11:40,278 (trainer:732) INFO: 32epoch:train:13666-14576batch: iter_time=2.590e-04, forward_time=0.201, loss_att=43.774, acc=0.961, loss=43.774, backward_time=0.299, grad_norm=81.813, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.677e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 03:22:04,559 (trainer:732) INFO: 32epoch:train:14577-15487batch: iter_time=2.637e-04, forward_time=0.203, loss_att=44.614, acc=0.961, loss=44.614, backward_time=0.301, grad_norm=83.903, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.675e-04, train_time=2.742 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 03:32:26,721 (trainer:732) INFO: 32epoch:train:15488-16398batch: iter_time=2.608e-04, forward_time=0.202, loss_att=45.185, acc=0.961, loss=45.185, backward_time=0.299, grad_norm=74.740, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.672e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 03:42:53,740 (trainer:732) INFO: 32epoch:train:16399-17309batch: iter_time=2.566e-04, forward_time=0.203, loss_att=45.106, acc=0.961, loss=45.106, backward_time=0.302, grad_norm=83.552, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.670e-04, train_time=2.752 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 03:53:17,175 (trainer:732) INFO: 32epoch:train:17310-18220batch: iter_time=2.545e-04, forward_time=0.203, loss_att=45.130, acc=0.961, loss=45.130, backward_time=0.300, grad_norm=83.883, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.667e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 03:58:38,934 (trainer:338) INFO: 32epoch results: [train] iter_time=3.398e-04, forward_time=0.202, loss_att=44.552, acc=0.961, loss=44.552, backward_time=0.299, grad_norm=81.429, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.692e-04, train_time=2.759, time=3 hours, 29 minutes and 41.46 seconds, total_count=996941, gpu_max_cached_mem_GB=30.096, [valid] loss_att=10.508, acc=0.981, cer=0.023, wer=0.090, loss=10.508, time=3 minutes and 1.59 seconds, total_count=1540, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 5.98 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 03:58:43,046 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 03:58:43,058 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/22epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 03:58:43,059 (trainer:272) INFO: 33/60epoch started. Estimated time to finish: 4 days, 5 hours and 22.57 seconds + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 04:11:02,188 (trainer:732) INFO: 33epoch:train:1-911batch: iter_time=0.002, forward_time=0.202, loss_att=43.426, acc=0.961, loss=43.426, backward_time=0.299, grad_norm=81.085, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.664e-04, train_time=3.246 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 04:21:25,750 (trainer:732) INFO: 33epoch:train:912-1822batch: iter_time=2.751e-04, forward_time=0.203, loss_att=44.624, acc=0.961, loss=44.624, backward_time=0.301, grad_norm=78.942, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.662e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 04:31:48,528 (trainer:732) INFO: 33epoch:train:1823-2733batch: iter_time=2.686e-04, forward_time=0.202, loss_att=43.614, acc=0.961, loss=43.614, backward_time=0.299, grad_norm=79.657, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.659e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 04:42:10,862 (trainer:732) INFO: 33epoch:train:2734-3644batch: iter_time=2.746e-04, forward_time=0.202, loss_att=43.549, acc=0.962, loss=43.549, backward_time=0.299, grad_norm=77.392, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.657e-04, train_time=2.732 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 151) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 151) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 04:52:33,979 (trainer:732) INFO: 33epoch:train:3645-4555batch: iter_time=2.780e-04, forward_time=0.202, loss_att=44.944, acc=0.961, loss=44.944, backward_time=0.300, grad_norm=77.757, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.654e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 05:02:57,120 (trainer:732) INFO: 33epoch:train:4556-5466batch: iter_time=2.723e-04, forward_time=0.203, loss_att=44.328, acc=0.961, loss=44.328, backward_time=0.300, grad_norm=75.867, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.652e-04, train_time=2.735 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 05:13:18,862 (trainer:732) INFO: 33epoch:train:5467-6377batch: iter_time=2.668e-04, forward_time=0.202, loss_att=43.699, acc=0.961, loss=43.699, backward_time=0.298, grad_norm=79.228, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.649e-04, train_time=2.730 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 05:23:43,075 (trainer:732) INFO: 33epoch:train:6378-7288batch: iter_time=2.583e-04, forward_time=0.202, loss_att=44.462, acc=0.961, loss=44.462, backward_time=0.300, grad_norm=77.206, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.646e-04, train_time=2.740 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 05:34:06,063 (trainer:732) INFO: 33epoch:train:7289-8199batch: iter_time=2.718e-04, forward_time=0.202, loss_att=44.359, acc=0.962, loss=44.359, backward_time=0.300, grad_norm=80.186, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.644e-04, train_time=2.736 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 05:44:29,768 (trainer:732) INFO: 33epoch:train:8200-9110batch: iter_time=2.681e-04, forward_time=0.203, loss_att=44.353, acc=0.961, loss=44.353, backward_time=0.300, grad_norm=80.360, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.641e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 05:54:52,388 (trainer:732) INFO: 33epoch:train:9111-10021batch: iter_time=2.664e-04, forward_time=0.202, loss_att=43.719, acc=0.962, loss=43.719, backward_time=0.300, grad_norm=75.466, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.639e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 06:05:12,691 (trainer:732) INFO: 33epoch:train:10022-10932batch: iter_time=2.720e-04, forward_time=0.201, loss_att=43.934, acc=0.961, loss=43.934, backward_time=0.298, grad_norm=79.334, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.636e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 06:15:34,694 (trainer:732) INFO: 33epoch:train:10933-11843batch: iter_time=2.651e-04, forward_time=0.202, loss_att=44.565, acc=0.961, loss=44.565, backward_time=0.300, grad_norm=76.517, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.634e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 06:25:54,330 (trainer:732) INFO: 33epoch:train:11844-12754batch: iter_time=2.656e-04, forward_time=0.201, loss_att=44.475, acc=0.961, loss=44.475, backward_time=0.298, grad_norm=81.689, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.631e-04, train_time=2.720 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 06:36:17,963 (trainer:732) INFO: 33epoch:train:12755-13665batch: iter_time=2.682e-04, forward_time=0.203, loss_att=45.033, acc=0.961, loss=45.033, backward_time=0.300, grad_norm=81.899, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.629e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 06:46:39,711 (trainer:732) INFO: 33epoch:train:13666-14576batch: iter_time=2.621e-04, forward_time=0.202, loss_att=43.788, acc=0.961, loss=43.788, backward_time=0.299, grad_norm=81.260, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.626e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 06:56:58,377 (trainer:732) INFO: 33epoch:train:14577-15487batch: iter_time=2.678e-04, forward_time=0.201, loss_att=44.015, acc=0.961, loss=44.015, backward_time=0.297, grad_norm=78.287, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.623e-04, train_time=2.716 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 07:07:21,777 (trainer:732) INFO: 33epoch:train:15488-16398batch: iter_time=2.659e-04, forward_time=0.202, loss_att=44.414, acc=0.962, loss=44.414, backward_time=0.300, grad_norm=78.761, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.621e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 07:17:45,212 (trainer:732) INFO: 33epoch:train:16399-17309batch: iter_time=2.697e-04, forward_time=0.202, loss_att=43.944, acc=0.962, loss=43.944, backward_time=0.300, grad_norm=82.601, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.618e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 07:28:05,550 (trainer:732) INFO: 33epoch:train:17310-18220batch: iter_time=2.627e-04, forward_time=0.202, loss_att=44.870, acc=0.961, loss=44.870, backward_time=0.298, grad_norm=84.384, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.616e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 07:33:10,929 (trainer:338) INFO: 33epoch results: [train] iter_time=3.752e-04, forward_time=0.202, loss_att=44.201, acc=0.961, loss=44.201, backward_time=0.299, grad_norm=79.409, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.640e-04, train_time=2.758, time=3 hours, 29 minutes and 36.81 seconds, total_count=1015173, gpu_max_cached_mem_GB=30.096, [valid] loss_att=10.917, acc=0.980, cer=0.023, wer=0.091, loss=10.917, time=2 minutes and 44.26 seconds, total_count=1568, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 6.8 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 07:33:15,242 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 07:33:15,270 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/20epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 07:33:15,271 (trainer:272) INFO: 34/60epoch started. Estimated time to finish: 4 days, 1 hour and 18 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 07:45:33,457 (trainer:732) INFO: 34epoch:train:1-911batch: iter_time=0.002, forward_time=0.202, loss_att=44.070, acc=0.961, loss=44.070, backward_time=0.299, grad_norm=77.811, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.613e-04, train_time=3.243 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 07:55:55,741 (trainer:732) INFO: 34epoch:train:912-1822batch: iter_time=2.749e-04, forward_time=0.202, loss_att=43.879, acc=0.962, loss=43.879, backward_time=0.299, grad_norm=79.773, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.611e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 08:06:17,656 (trainer:732) INFO: 34epoch:train:1823-2733batch: iter_time=2.720e-04, forward_time=0.202, loss_att=43.824, acc=0.961, loss=43.824, backward_time=0.299, grad_norm=78.923, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.608e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 08:16:40,297 (trainer:732) INFO: 34epoch:train:2734-3644batch: iter_time=2.735e-04, forward_time=0.202, loss_att=44.007, acc=0.962, loss=44.007, backward_time=0.300, grad_norm=78.070, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.606e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 08:27:00,460 (trainer:732) INFO: 34epoch:train:3645-4555batch: iter_time=2.711e-04, forward_time=0.202, loss_att=44.284, acc=0.961, loss=44.284, backward_time=0.298, grad_norm=78.817, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.603e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 08:37:24,115 (trainer:732) INFO: 34epoch:train:4556-5466batch: iter_time=2.669e-04, forward_time=0.203, loss_att=44.052, acc=0.962, loss=44.052, backward_time=0.300, grad_norm=83.652, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.601e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 08:47:46,084 (trainer:732) INFO: 34epoch:train:5467-6377batch: iter_time=2.606e-04, forward_time=0.202, loss_att=43.779, acc=0.962, loss=43.779, backward_time=0.300, grad_norm=81.927, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.598e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 08:58:06,572 (trainer:732) INFO: 34epoch:train:6378-7288batch: iter_time=2.586e-04, forward_time=0.202, loss_att=43.502, acc=0.961, loss=43.502, backward_time=0.299, grad_norm=79.653, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.596e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 09:08:31,121 (trainer:732) INFO: 34epoch:train:7289-8199batch: iter_time=2.662e-04, forward_time=0.203, loss_att=43.690, acc=0.962, loss=43.690, backward_time=0.301, grad_norm=83.101, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.593e-04, train_time=2.743 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 09:18:53,700 (trainer:732) INFO: 34epoch:train:8200-9110batch: iter_time=2.583e-04, forward_time=0.202, loss_att=44.595, acc=0.961, loss=44.595, backward_time=0.300, grad_norm=81.626, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.591e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 09:29:16,915 (trainer:732) INFO: 34epoch:train:9111-10021batch: iter_time=2.576e-04, forward_time=0.202, loss_att=44.220, acc=0.962, loss=44.220, backward_time=0.300, grad_norm=85.545, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.588e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 09:39:38,613 (trainer:732) INFO: 34epoch:train:10022-10932batch: iter_time=2.585e-04, forward_time=0.202, loss_att=44.740, acc=0.961, loss=44.740, backward_time=0.299, grad_norm=78.882, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.586e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 09:50:01,377 (trainer:732) INFO: 34epoch:train:10933-11843batch: iter_time=2.555e-04, forward_time=0.202, loss_att=44.229, acc=0.961, loss=44.229, backward_time=0.299, grad_norm=77.009, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.583e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 10:00:24,140 (trainer:732) INFO: 34epoch:train:11844-12754batch: iter_time=2.667e-04, forward_time=0.202, loss_att=43.652, acc=0.961, loss=43.652, backward_time=0.300, grad_norm=72.470, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.581e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 10:10:45,090 (trainer:732) INFO: 34epoch:train:12755-13665batch: iter_time=2.703e-04, forward_time=0.202, loss_att=44.533, acc=0.961, loss=44.533, backward_time=0.299, grad_norm=78.567, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.578e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 10:21:09,334 (trainer:732) INFO: 34epoch:train:13666-14576batch: iter_time=2.798e-04, forward_time=0.203, loss_att=43.587, acc=0.962, loss=43.587, backward_time=0.301, grad_norm=86.865, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.576e-04, train_time=2.740 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 10:31:31,270 (trainer:732) INFO: 34epoch:train:14577-15487batch: iter_time=2.690e-04, forward_time=0.202, loss_att=43.740, acc=0.962, loss=43.740, backward_time=0.299, grad_norm=83.793, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.573e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 10:41:54,484 (trainer:732) INFO: 34epoch:train:15488-16398batch: iter_time=2.641e-04, forward_time=0.202, loss_att=43.535, acc=0.962, loss=43.535, backward_time=0.300, grad_norm=81.127, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.571e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 10:52:13,198 (trainer:732) INFO: 34epoch:train:16399-17309batch: iter_time=2.606e-04, forward_time=0.201, loss_att=43.434, acc=0.961, loss=43.434, backward_time=0.298, grad_norm=79.336, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.569e-04, train_time=2.716 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 11:02:34,721 (trainer:732) INFO: 34epoch:train:17310-18220batch: iter_time=2.589e-04, forward_time=0.202, loss_att=43.722, acc=0.962, loss=43.722, backward_time=0.299, grad_norm=78.672, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.566e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 11:07:52,177 (trainer:338) INFO: 34epoch results: [train] iter_time=3.384e-04, forward_time=0.202, loss_att=43.955, acc=0.961, loss=43.955, backward_time=0.299, grad_norm=80.283, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.590e-04, train_time=2.757, time=3 hours, 29 minutes and 35.06 seconds, total_count=1033405, gpu_max_cached_mem_GB=30.096, [valid] loss_att=10.636, acc=0.981, cer=0.023, wer=0.089, loss=10.636, time=2 minutes and 54.77 seconds, total_count=1596, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 7.07 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 11:07:56,025 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 11:07:56,037 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/25epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 11:07:56,037 (trainer:272) INFO: 35/60epoch started. Estimated time to finish: 3 days, 21 hours and 38 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 11:20:14,467 (trainer:732) INFO: 35epoch:train:1-911batch: iter_time=0.002, forward_time=0.202, loss_att=43.331, acc=0.962, loss=43.331, backward_time=0.299, grad_norm=79.468, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.564e-04, train_time=3.244 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 11:30:36,507 (trainer:732) INFO: 35epoch:train:912-1822batch: iter_time=2.672e-04, forward_time=0.202, loss_att=43.279, acc=0.962, loss=43.279, backward_time=0.299, grad_norm=86.698, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.561e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 11:40:56,603 (trainer:732) INFO: 35epoch:train:1823-2733batch: iter_time=2.717e-04, forward_time=0.202, loss_att=43.031, acc=0.962, loss=43.031, backward_time=0.299, grad_norm=77.602, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.559e-04, train_time=2.722 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 11:51:19,220 (trainer:732) INFO: 35epoch:train:2734-3644batch: iter_time=2.670e-04, forward_time=0.202, loss_att=43.202, acc=0.962, loss=43.202, backward_time=0.299, grad_norm=87.745, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.556e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 12:01:39,374 (trainer:732) INFO: 35epoch:train:3645-4555batch: iter_time=2.659e-04, forward_time=0.202, loss_att=43.522, acc=0.962, loss=43.522, backward_time=0.299, grad_norm=76.382, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.554e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 12:12:00,889 (trainer:732) INFO: 35epoch:train:4556-5466batch: iter_time=2.669e-04, forward_time=0.202, loss_att=44.645, acc=0.961, loss=44.645, backward_time=0.299, grad_norm=79.348, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.551e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 12:22:22,255 (trainer:732) INFO: 35epoch:train:5467-6377batch: iter_time=2.601e-04, forward_time=0.202, loss_att=43.493, acc=0.962, loss=43.493, backward_time=0.299, grad_norm=79.532, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.549e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 12:32:43,262 (trainer:732) INFO: 35epoch:train:6378-7288batch: iter_time=2.668e-04, forward_time=0.202, loss_att=44.425, acc=0.961, loss=44.425, backward_time=0.299, grad_norm=81.165, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.547e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 12:43:05,548 (trainer:732) INFO: 35epoch:train:7289-8199batch: iter_time=2.692e-04, forward_time=0.202, loss_att=43.163, acc=0.962, loss=43.163, backward_time=0.300, grad_norm=77.759, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.544e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 12:53:29,254 (trainer:732) INFO: 35epoch:train:8200-9110batch: iter_time=2.673e-04, forward_time=0.202, loss_att=43.702, acc=0.962, loss=43.702, backward_time=0.300, grad_norm=81.830, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.542e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 13:03:49,264 (trainer:732) INFO: 35epoch:train:9111-10021batch: iter_time=2.707e-04, forward_time=0.202, loss_att=43.547, acc=0.962, loss=43.547, backward_time=0.298, grad_norm=86.867, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.539e-04, train_time=2.722 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 13:14:12,655 (trainer:732) INFO: 35epoch:train:10022-10932batch: iter_time=2.701e-04, forward_time=0.202, loss_att=43.759, acc=0.962, loss=43.759, backward_time=0.300, grad_norm=89.661, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.537e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 13:24:35,237 (trainer:732) INFO: 35epoch:train:10933-11843batch: iter_time=2.635e-04, forward_time=0.202, loss_att=44.052, acc=0.962, loss=44.052, backward_time=0.300, grad_norm=85.855, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.534e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 13:34:57,390 (trainer:732) INFO: 35epoch:train:11844-12754batch: iter_time=2.706e-04, forward_time=0.202, loss_att=44.295, acc=0.961, loss=44.295, backward_time=0.299, grad_norm=86.717, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.532e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 13:45:20,819 (trainer:732) INFO: 35epoch:train:12755-13665batch: iter_time=2.685e-04, forward_time=0.202, loss_att=44.801, acc=0.961, loss=44.801, backward_time=0.300, grad_norm=82.108, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.530e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 13:55:43,180 (trainer:732) INFO: 35epoch:train:13666-14576batch: iter_time=2.641e-04, forward_time=0.202, loss_att=43.639, acc=0.962, loss=43.639, backward_time=0.300, grad_norm=77.938, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.527e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 14:06:03,284 (trainer:732) INFO: 35epoch:train:14577-15487batch: iter_time=2.607e-04, forward_time=0.201, loss_att=43.292, acc=0.961, loss=43.292, backward_time=0.298, grad_norm=82.718, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.525e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 14:16:24,912 (trainer:732) INFO: 35epoch:train:15488-16398batch: iter_time=2.681e-04, forward_time=0.202, loss_att=43.570, acc=0.962, loss=43.570, backward_time=0.299, grad_norm=81.993, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.522e-04, train_time=2.729 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 14:26:48,178 (trainer:732) INFO: 35epoch:train:16399-17309batch: iter_time=2.746e-04, forward_time=0.203, loss_att=44.034, acc=0.961, loss=44.034, backward_time=0.300, grad_norm=87.284, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.520e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 14:37:10,898 (trainer:732) INFO: 35epoch:train:17310-18220batch: iter_time=2.698e-04, forward_time=0.203, loss_att=44.273, acc=0.961, loss=44.273, backward_time=0.300, grad_norm=78.293, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.518e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 14:42:12,472 (trainer:338) INFO: 35epoch results: [train] iter_time=3.391e-04, forward_time=0.202, loss_att=43.751, acc=0.962, loss=43.751, backward_time=0.299, grad_norm=82.330, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.541e-04, train_time=2.756, time=3 hours, 29 minutes and 29.02 seconds, total_count=1051637, gpu_max_cached_mem_GB=30.096, [valid] loss_att=10.982, acc=0.980, cer=0.023, wer=0.091, loss=10.982, time=2 minutes and 41.28 seconds, total_count=1624, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 6.13 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 14:42:16,562 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 14:42:16,577 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/24epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 14:42:16,577 (trainer:272) INFO: 36/60epoch started. Estimated time to finish: 3 days, 17 hours and 59 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 14:54:32,447 (trainer:732) INFO: 36epoch:train:1-911batch: iter_time=0.001, forward_time=0.201, loss_att=43.064, acc=0.962, loss=43.064, backward_time=0.298, grad_norm=81.624, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.515e-04, train_time=3.233 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 15:04:56,154 (trainer:732) INFO: 36epoch:train:912-1822batch: iter_time=2.768e-04, forward_time=0.203, loss_att=43.293, acc=0.962, loss=43.293, backward_time=0.300, grad_norm=80.070, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.513e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 15:15:19,217 (trainer:732) INFO: 36epoch:train:1823-2733batch: iter_time=2.763e-04, forward_time=0.202, loss_att=43.951, acc=0.962, loss=43.951, backward_time=0.300, grad_norm=74.050, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.510e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 15:25:41,849 (trainer:732) INFO: 36epoch:train:2734-3644batch: iter_time=2.715e-04, forward_time=0.202, loss_att=43.846, acc=0.962, loss=43.846, backward_time=0.300, grad_norm=82.301, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.508e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 15:36:01,370 (trainer:732) INFO: 36epoch:train:3645-4555batch: iter_time=2.753e-04, forward_time=0.202, loss_att=43.697, acc=0.962, loss=43.697, backward_time=0.298, grad_norm=88.784, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.506e-04, train_time=2.721 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 15:46:23,828 (trainer:732) INFO: 36epoch:train:4556-5466batch: iter_time=2.657e-04, forward_time=0.202, loss_att=43.175, acc=0.962, loss=43.175, backward_time=0.300, grad_norm=92.418, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.503e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 15:56:46,125 (trainer:732) INFO: 36epoch:train:5467-6377batch: iter_time=2.664e-04, forward_time=0.202, loss_att=42.636, acc=0.962, loss=42.636, backward_time=0.299, grad_norm=83.175, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.501e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 16:07:06,805 (trainer:732) INFO: 36epoch:train:6378-7288batch: iter_time=2.624e-04, forward_time=0.201, loss_att=43.184, acc=0.962, loss=43.184, backward_time=0.298, grad_norm=82.122, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.499e-04, train_time=2.725 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 16:17:24,922 (trainer:732) INFO: 36epoch:train:7289-8199batch: iter_time=2.690e-04, forward_time=0.201, loss_att=42.950, acc=0.961, loss=42.950, backward_time=0.298, grad_norm=77.502, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.496e-04, train_time=2.714 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 16:27:46,346 (trainer:732) INFO: 36epoch:train:8200-9110batch: iter_time=2.639e-04, forward_time=0.202, loss_att=44.116, acc=0.962, loss=44.116, backward_time=0.299, grad_norm=80.687, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.494e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 16:38:09,374 (trainer:732) INFO: 36epoch:train:9111-10021batch: iter_time=2.638e-04, forward_time=0.202, loss_att=43.268, acc=0.962, loss=43.268, backward_time=0.300, grad_norm=79.376, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.491e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 16:48:31,239 (trainer:732) INFO: 36epoch:train:10022-10932batch: iter_time=2.638e-04, forward_time=0.202, loss_att=42.987, acc=0.962, loss=42.987, backward_time=0.300, grad_norm=82.038, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.489e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 16:58:51,424 (trainer:732) INFO: 36epoch:train:10933-11843batch: iter_time=2.660e-04, forward_time=0.202, loss_att=43.025, acc=0.962, loss=43.025, backward_time=0.299, grad_norm=79.946, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.487e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 17:09:14,454 (trainer:732) INFO: 36epoch:train:11844-12754batch: iter_time=2.637e-04, forward_time=0.202, loss_att=44.017, acc=0.962, loss=44.017, backward_time=0.300, grad_norm=82.781, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.484e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 17:19:33,453 (trainer:732) INFO: 36epoch:train:12755-13665batch: iter_time=2.610e-04, forward_time=0.201, loss_att=43.860, acc=0.961, loss=43.860, backward_time=0.298, grad_norm=85.137, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.482e-04, train_time=2.718 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 17:29:55,466 (trainer:732) INFO: 36epoch:train:13666-14576batch: iter_time=2.650e-04, forward_time=0.202, loss_att=44.153, acc=0.962, loss=44.153, backward_time=0.299, grad_norm=85.227, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.480e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 17:40:18,534 (trainer:732) INFO: 36epoch:train:14577-15487batch: iter_time=2.691e-04, forward_time=0.202, loss_att=43.955, acc=0.961, loss=43.955, backward_time=0.300, grad_norm=83.122, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.477e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 17:50:41,825 (trainer:732) INFO: 36epoch:train:15488-16398batch: iter_time=2.738e-04, forward_time=0.202, loss_att=44.131, acc=0.962, loss=44.131, backward_time=0.300, grad_norm=79.422, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.475e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 18:01:05,032 (trainer:732) INFO: 36epoch:train:16399-17309batch: iter_time=2.678e-04, forward_time=0.203, loss_att=43.628, acc=0.962, loss=43.628, backward_time=0.301, grad_norm=96.370, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.473e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 18:11:28,475 (trainer:732) INFO: 36epoch:train:17310-18220batch: iter_time=2.727e-04, forward_time=0.202, loss_att=43.330, acc=0.962, loss=43.330, backward_time=0.300, grad_norm=74.777, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.470e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 18:16:29,979 (trainer:338) INFO: 36epoch results: [train] iter_time=3.144e-04, forward_time=0.202, loss_att=43.518, acc=0.962, loss=43.518, backward_time=0.299, grad_norm=82.544, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.493e-04, train_time=2.755, time=3 hours, 29 minutes and 25.99 seconds, total_count=1069869, gpu_max_cached_mem_GB=30.096, [valid] loss_att=10.545, acc=0.981, cer=0.023, wer=0.089, loss=10.545, time=2 minutes and 40.18 seconds, total_count=1652, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 7.23 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 18:16:34,102 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 18:16:34,132 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/30epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 18:16:34,133 (trainer:272) INFO: 37/60epoch started. Estimated time to finish: 3 days, 14 hours and 19 minutes + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 18:28:55,049 (trainer:732) INFO: 37epoch:train:1-911batch: iter_time=0.001, forward_time=0.202, loss_att=42.965, acc=0.963, loss=42.965, backward_time=0.300, grad_norm=85.387, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.468e-04, train_time=3.255 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 18:39:17,212 (trainer:732) INFO: 37epoch:train:912-1822batch: iter_time=2.695e-04, forward_time=0.202, loss_att=41.699, acc=0.963, loss=41.699, backward_time=0.300, grad_norm=79.187, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.466e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 18:49:40,969 (trainer:732) INFO: 37epoch:train:1823-2733batch: iter_time=2.597e-04, forward_time=0.202, loss_att=43.139, acc=0.963, loss=43.139, backward_time=0.300, grad_norm=83.174, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.463e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 19:00:02,517 (trainer:732) INFO: 37epoch:train:2734-3644batch: iter_time=2.618e-04, forward_time=0.202, loss_att=43.533, acc=0.962, loss=43.533, backward_time=0.299, grad_norm=83.218, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.461e-04, train_time=2.728 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 19:10:23,334 (trainer:732) INFO: 37epoch:train:3645-4555batch: iter_time=2.674e-04, forward_time=0.202, loss_att=42.859, acc=0.962, loss=42.859, backward_time=0.299, grad_norm=93.794, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.459e-04, train_time=2.727 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 151) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 152) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 19:20:45,778 (trainer:732) INFO: 37epoch:train:4556-5466batch: iter_time=2.621e-04, forward_time=0.202, loss_att=43.251, acc=0.962, loss=43.251, backward_time=0.300, grad_norm=82.210, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.456e-04, train_time=2.732 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 19:31:08,934 (trainer:732) INFO: 37epoch:train:5467-6377batch: iter_time=2.641e-04, forward_time=0.202, loss_att=43.535, acc=0.962, loss=43.535, backward_time=0.300, grad_norm=84.785, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.454e-04, train_time=2.735 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 19:41:30,629 (trainer:732) INFO: 37epoch:train:6378-7288batch: iter_time=2.673e-04, forward_time=0.201, loss_att=43.306, acc=0.962, loss=43.306, backward_time=0.299, grad_norm=83.747, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.452e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 19:51:53,737 (trainer:732) INFO: 37epoch:train:7289-8199batch: iter_time=2.631e-04, forward_time=0.202, loss_att=43.207, acc=0.962, loss=43.207, backward_time=0.300, grad_norm=83.097, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.450e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 20:02:17,302 (trainer:732) INFO: 37epoch:train:8200-9110batch: iter_time=2.690e-04, forward_time=0.203, loss_att=43.044, acc=0.962, loss=43.044, backward_time=0.300, grad_norm=85.272, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.447e-04, train_time=2.737 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 151) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 151) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 20:12:39,963 (trainer:732) INFO: 37epoch:train:9111-10021batch: iter_time=2.732e-04, forward_time=0.202, loss_att=43.810, acc=0.962, loss=43.810, backward_time=0.299, grad_norm=77.844, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.445e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 20:23:04,373 (trainer:732) INFO: 37epoch:train:10022-10932batch: iter_time=2.666e-04, forward_time=0.203, loss_att=44.214, acc=0.962, loss=44.214, backward_time=0.301, grad_norm=87.824, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.443e-04, train_time=2.741 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 20:33:25,326 (trainer:732) INFO: 37epoch:train:10933-11843batch: iter_time=2.710e-04, forward_time=0.202, loss_att=43.536, acc=0.962, loss=43.536, backward_time=0.299, grad_norm=86.987, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.440e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 20:43:47,768 (trainer:732) INFO: 37epoch:train:11844-12754batch: iter_time=2.744e-04, forward_time=0.202, loss_att=44.314, acc=0.962, loss=44.314, backward_time=0.300, grad_norm=83.250, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.438e-04, train_time=2.733 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<15092> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<29171> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.17.248<60562> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 20:54:08,661 (trainer:732) INFO: 37epoch:train:12755-13665batch: iter_time=2.652e-04, forward_time=0.201, loss_att=43.322, acc=0.962, loss=43.322, backward_time=0.298, grad_norm=84.735, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.436e-04, train_time=2.725 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 21:04:30,538 (trainer:732) INFO: 37epoch:train:13666-14576batch: iter_time=2.802e-04, forward_time=0.202, loss_att=42.893, acc=0.962, loss=42.893, backward_time=0.300, grad_norm=84.553, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.433e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 21:14:51,697 (trainer:732) INFO: 37epoch:train:14577-15487batch: iter_time=2.670e-04, forward_time=0.202, loss_att=43.609, acc=0.962, loss=43.609, backward_time=0.299, grad_norm=84.763, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.431e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 21:25:12,849 (trainer:732) INFO: 37epoch:train:15488-16398batch: iter_time=2.669e-04, forward_time=0.202, loss_att=43.491, acc=0.962, loss=43.491, backward_time=0.299, grad_norm=86.330, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.429e-04, train_time=2.727 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.13.248<54177> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 21:35:33,448 (trainer:732) INFO: 37epoch:train:16399-17309batch: iter_time=2.765e-04, forward_time=0.202, loss_att=43.361, acc=0.962, loss=43.361, backward_time=0.299, grad_norm=79.663, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.427e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 21:45:54,207 (trainer:732) INFO: 37epoch:train:17310-18220batch: iter_time=2.621e-04, forward_time=0.201, loss_att=43.164, acc=0.962, loss=43.164, backward_time=0.299, grad_norm=80.917, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.424e-04, train_time=2.725 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 21:50:58,009 (trainer:338) INFO: 37epoch results: [train] iter_time=3.133e-04, forward_time=0.202, loss_att=43.310, acc=0.962, loss=43.310, backward_time=0.299, grad_norm=84.012, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.446e-04, train_time=2.757, time=3 hours, 29 minutes and 34.48 seconds, total_count=1088101, gpu_max_cached_mem_GB=30.096, [valid] loss_att=10.634, acc=0.981, cer=0.022, wer=0.087, loss=10.634, time=2 minutes and 42.17 seconds, total_count=1680, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 7.23 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 21:51:01,899 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 21:51:01,914 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/29epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 21:51:01,914 (trainer:272) INFO: 38/60epoch started. Estimated time to finish: 3 days, 10 hours and 41 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 22:03:20,489 (trainer:732) INFO: 38epoch:train:1-911batch: iter_time=0.001, forward_time=0.202, loss_att=41.915, acc=0.963, loss=41.915, backward_time=0.299, grad_norm=82.728, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.422e-04, train_time=3.245 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 22:13:43,830 (trainer:732) INFO: 38epoch:train:912-1822batch: iter_time=2.713e-04, forward_time=0.203, loss_att=42.467, acc=0.963, loss=42.467, backward_time=0.300, grad_norm=78.348, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.420e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 22:24:08,238 (trainer:732) INFO: 38epoch:train:1823-2733batch: iter_time=2.650e-04, forward_time=0.203, loss_att=42.693, acc=0.963, loss=42.693, backward_time=0.300, grad_norm=87.466, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.418e-04, train_time=2.741 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 22:34:30,439 (trainer:732) INFO: 38epoch:train:2734-3644batch: iter_time=2.723e-04, forward_time=0.202, loss_att=42.318, acc=0.963, loss=42.318, backward_time=0.299, grad_norm=84.329, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.415e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 22:44:53,805 (trainer:732) INFO: 38epoch:train:3645-4555batch: iter_time=2.679e-04, forward_time=0.202, loss_att=42.885, acc=0.962, loss=42.885, backward_time=0.300, grad_norm=80.868, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.413e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 22:55:16,468 (trainer:732) INFO: 38epoch:train:4556-5466batch: iter_time=2.654e-04, forward_time=0.202, loss_att=43.454, acc=0.962, loss=43.454, backward_time=0.300, grad_norm=86.355, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.411e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 23:05:37,648 (trainer:732) INFO: 38epoch:train:5467-6377batch: iter_time=2.639e-04, forward_time=0.202, loss_att=43.532, acc=0.962, loss=43.532, backward_time=0.299, grad_norm=81.331, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.408e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 23:15:58,234 (trainer:732) INFO: 38epoch:train:6378-7288batch: iter_time=2.690e-04, forward_time=0.201, loss_att=42.925, acc=0.962, loss=42.925, backward_time=0.298, grad_norm=81.796, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.406e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 23:26:21,122 (trainer:732) INFO: 38epoch:train:7289-8199batch: iter_time=2.676e-04, forward_time=0.202, loss_att=43.648, acc=0.962, loss=43.648, backward_time=0.300, grad_norm=83.550, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.404e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 23:36:43,090 (trainer:732) INFO: 38epoch:train:8200-9110batch: iter_time=2.631e-04, forward_time=0.202, loss_att=43.350, acc=0.962, loss=43.350, backward_time=0.299, grad_norm=74.630, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.402e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 23:47:06,662 (trainer:732) INFO: 38epoch:train:9111-10021batch: iter_time=2.617e-04, forward_time=0.202, loss_att=43.448, acc=0.963, loss=43.448, backward_time=0.300, grad_norm=87.421, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.399e-04, train_time=2.737 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] include/socket.h:423 NCCL WARN Net : Connection closed by remote peer 10.38.11.213<37332> +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO include/socket.h:445 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO include/socket.h:457 -> 2 +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO bootstrap.cc:229 -> 2 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 151) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-02-29 23:57:29,920 (trainer:732) INFO: 38epoch:train:10022-10932batch: iter_time=2.665e-04, forward_time=0.202, loss_att=43.297, acc=0.962, loss=43.297, backward_time=0.300, grad_norm=82.544, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.397e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 00:07:51,072 (trainer:732) INFO: 38epoch:train:10933-11843batch: iter_time=2.772e-04, forward_time=0.202, loss_att=43.615, acc=0.961, loss=43.615, backward_time=0.299, grad_norm=81.696, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.395e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 00:18:13,671 (trainer:732) INFO: 38epoch:train:11844-12754batch: iter_time=2.681e-04, forward_time=0.202, loss_att=42.707, acc=0.962, loss=42.707, backward_time=0.299, grad_norm=87.400, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.393e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 00:28:34,979 (trainer:732) INFO: 38epoch:train:12755-13665batch: iter_time=2.722e-04, forward_time=0.202, loss_att=43.269, acc=0.962, loss=43.269, backward_time=0.299, grad_norm=82.474, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.391e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 00:38:57,734 (trainer:732) INFO: 38epoch:train:13666-14576batch: iter_time=2.649e-04, forward_time=0.202, loss_att=42.058, acc=0.963, loss=42.058, backward_time=0.299, grad_norm=83.222, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.388e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 00:49:18,994 (trainer:732) INFO: 38epoch:train:14577-15487batch: iter_time=2.656e-04, forward_time=0.202, loss_att=43.342, acc=0.962, loss=43.342, backward_time=0.298, grad_norm=80.463, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.386e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 00:59:44,139 (trainer:732) INFO: 38epoch:train:15488-16398batch: iter_time=2.642e-04, forward_time=0.203, loss_att=44.585, acc=0.962, loss=44.585, backward_time=0.301, grad_norm=87.432, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.384e-04, train_time=2.745 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 01:10:05,934 (trainer:732) INFO: 38epoch:train:16399-17309batch: iter_time=2.668e-04, forward_time=0.202, loss_att=42.816, acc=0.962, loss=42.816, backward_time=0.299, grad_norm=82.492, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.382e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 01:20:29,525 (trainer:732) INFO: 38epoch:train:17310-18220batch: iter_time=2.685e-04, forward_time=0.203, loss_att=43.330, acc=0.963, loss=43.330, backward_time=0.300, grad_norm=77.871, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.379e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 01:25:52,750 (trainer:338) INFO: 38epoch results: [train] iter_time=3.195e-04, forward_time=0.202, loss_att=43.076, acc=0.962, loss=43.076, backward_time=0.299, grad_norm=82.713, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.401e-04, train_time=2.759, time=3 hours, 29 minutes and 42.51 seconds, total_count=1106333, gpu_max_cached_mem_GB=30.096, [valid] loss_att=10.216, acc=0.982, cer=0.022, wer=0.087, loss=10.216, time=3 minutes and 1.53 seconds, total_count=1708, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 6.79 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 01:25:57,313 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 01:25:57,327 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/26epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 01:25:57,328 (trainer:272) INFO: 39/60epoch started. Estimated time to finish: 3 days, 7 hours and 4 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 01:38:16,363 (trainer:732) INFO: 39epoch:train:1-911batch: iter_time=0.002, forward_time=0.201, loss_att=41.310, acc=0.963, loss=41.310, backward_time=0.298, grad_norm=85.759, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.377e-04, train_time=3.246 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 01:48:37,305 (trainer:732) INFO: 39epoch:train:912-1822batch: iter_time=2.738e-04, forward_time=0.201, loss_att=42.111, acc=0.963, loss=42.111, backward_time=0.298, grad_norm=84.154, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.375e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 01:59:00,534 (trainer:732) INFO: 39epoch:train:1823-2733batch: iter_time=2.729e-04, forward_time=0.202, loss_att=42.527, acc=0.963, loss=42.527, backward_time=0.300, grad_norm=85.159, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.373e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 02:09:22,663 (trainer:732) INFO: 39epoch:train:2734-3644batch: iter_time=2.687e-04, forward_time=0.202, loss_att=42.819, acc=0.962, loss=42.819, backward_time=0.299, grad_norm=82.559, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.371e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 02:19:44,339 (trainer:732) INFO: 39epoch:train:3645-4555batch: iter_time=2.668e-04, forward_time=0.202, loss_att=42.245, acc=0.963, loss=42.245, backward_time=0.299, grad_norm=78.874, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.368e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 02:30:09,057 (trainer:732) INFO: 39epoch:train:4556-5466batch: iter_time=2.734e-04, forward_time=0.203, loss_att=42.739, acc=0.963, loss=42.739, backward_time=0.300, grad_norm=80.746, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.366e-04, train_time=2.743 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 02:40:33,155 (trainer:732) INFO: 39epoch:train:5467-6377batch: iter_time=2.711e-04, forward_time=0.202, loss_att=43.626, acc=0.962, loss=43.626, backward_time=0.299, grad_norm=76.136, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.364e-04, train_time=2.740 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 02:50:57,748 (trainer:732) INFO: 39epoch:train:6378-7288batch: iter_time=2.629e-04, forward_time=0.202, loss_att=42.921, acc=0.963, loss=42.921, backward_time=0.300, grad_norm=84.103, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.362e-04, train_time=2.742 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 03:01:21,659 (trainer:732) INFO: 39epoch:train:7289-8199batch: iter_time=2.685e-04, forward_time=0.202, loss_att=43.008, acc=0.962, loss=43.008, backward_time=0.300, grad_norm=85.951, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.360e-04, train_time=2.740 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 03:11:42,115 (trainer:732) INFO: 39epoch:train:8200-9110batch: iter_time=2.770e-04, forward_time=0.201, loss_att=43.268, acc=0.962, loss=43.268, backward_time=0.298, grad_norm=84.386, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.357e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 03:22:05,419 (trainer:732) INFO: 39epoch:train:9111-10021batch: iter_time=2.736e-04, forward_time=0.203, loss_att=43.013, acc=0.962, loss=43.013, backward_time=0.300, grad_norm=74.082, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.355e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 03:32:27,470 (trainer:732) INFO: 39epoch:train:10022-10932batch: iter_time=2.683e-04, forward_time=0.202, loss_att=43.253, acc=0.962, loss=43.253, backward_time=0.299, grad_norm=80.263, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.353e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 03:42:48,775 (trainer:732) INFO: 39epoch:train:10933-11843batch: iter_time=2.669e-04, forward_time=0.202, loss_att=43.097, acc=0.962, loss=43.097, backward_time=0.299, grad_norm=82.594, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.351e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 03:53:14,026 (trainer:732) INFO: 39epoch:train:11844-12754batch: iter_time=2.637e-04, forward_time=0.203, loss_att=43.112, acc=0.963, loss=43.112, backward_time=0.301, grad_norm=82.986, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.349e-04, train_time=2.745 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 04:03:35,777 (trainer:732) INFO: 39epoch:train:12755-13665batch: iter_time=2.670e-04, forward_time=0.202, loss_att=42.818, acc=0.963, loss=42.818, backward_time=0.299, grad_norm=82.510, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.346e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 04:14:00,274 (trainer:732) INFO: 39epoch:train:13666-14576batch: iter_time=2.650e-04, forward_time=0.203, loss_att=43.307, acc=0.963, loss=43.307, backward_time=0.300, grad_norm=78.045, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.344e-04, train_time=2.741 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 04:24:22,399 (trainer:732) INFO: 39epoch:train:14577-15487batch: iter_time=2.689e-04, forward_time=0.202, loss_att=43.344, acc=0.962, loss=43.344, backward_time=0.299, grad_norm=81.291, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.342e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 04:34:45,378 (trainer:732) INFO: 39epoch:train:15488-16398batch: iter_time=2.761e-04, forward_time=0.203, loss_att=43.261, acc=0.962, loss=43.261, backward_time=0.300, grad_norm=84.608, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.340e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 04:45:05,683 (trainer:732) INFO: 39epoch:train:16399-17309batch: iter_time=2.677e-04, forward_time=0.201, loss_att=42.337, acc=0.963, loss=42.337, backward_time=0.298, grad_norm=82.591, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.338e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 04:55:26,701 (trainer:732) INFO: 39epoch:train:17310-18220batch: iter_time=2.688e-04, forward_time=0.202, loss_att=42.773, acc=0.962, loss=42.773, backward_time=0.298, grad_norm=78.226, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.336e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 05:00:52,501 (trainer:338) INFO: 39epoch results: [train] iter_time=3.442e-04, forward_time=0.202, loss_att=42.846, acc=0.962, loss=42.846, backward_time=0.299, grad_norm=81.749, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.356e-04, train_time=2.759, time=3 hours, 29 minutes and 44.06 seconds, total_count=1124565, gpu_max_cached_mem_GB=30.096, [valid] loss_att=10.903, acc=0.981, cer=0.022, wer=0.087, loss=10.903, time=3 minutes and 2.16 seconds, total_count=1736, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 8.94 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 05:00:56,765 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 05:00:56,780 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/27epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 05:00:56,781 (trainer:272) INFO: 40/60epoch started. Estimated time to finish: 3 days, 3 hours and 28 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 05:13:17,100 (trainer:732) INFO: 40epoch:train:1-911batch: iter_time=0.002, forward_time=0.202, loss_att=43.430, acc=0.962, loss=43.430, backward_time=0.299, grad_norm=88.641, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.333e-04, train_time=3.252 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 05:23:38,663 (trainer:732) INFO: 40epoch:train:912-1822batch: iter_time=2.807e-04, forward_time=0.202, loss_att=41.440, acc=0.964, loss=41.440, backward_time=0.299, grad_norm=86.923, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.331e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 05:34:03,181 (trainer:732) INFO: 40epoch:train:1823-2733batch: iter_time=2.804e-04, forward_time=0.203, loss_att=41.746, acc=0.964, loss=41.746, backward_time=0.300, grad_norm=78.117, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.329e-04, train_time=2.742 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 05:44:25,056 (trainer:732) INFO: 40epoch:train:2734-3644batch: iter_time=2.809e-04, forward_time=0.201, loss_att=42.326, acc=0.963, loss=42.326, backward_time=0.299, grad_norm=84.270, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.327e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 05:54:47,677 (trainer:732) INFO: 40epoch:train:3645-4555batch: iter_time=2.672e-04, forward_time=0.202, loss_att=42.928, acc=0.963, loss=42.928, backward_time=0.299, grad_norm=81.255, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.325e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 06:05:09,182 (trainer:732) INFO: 40epoch:train:4556-5466batch: iter_time=2.756e-04, forward_time=0.202, loss_att=42.310, acc=0.963, loss=42.310, backward_time=0.298, grad_norm=79.896, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.323e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 06:15:30,385 (trainer:732) INFO: 40epoch:train:5467-6377batch: iter_time=2.626e-04, forward_time=0.202, loss_att=43.053, acc=0.962, loss=43.053, backward_time=0.299, grad_norm=82.018, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.321e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 06:25:51,942 (trainer:732) INFO: 40epoch:train:6378-7288batch: iter_time=2.653e-04, forward_time=0.202, loss_att=42.199, acc=0.963, loss=42.199, backward_time=0.299, grad_norm=79.995, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.318e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 06:36:11,964 (trainer:732) INFO: 40epoch:train:7289-8199batch: iter_time=2.666e-04, forward_time=0.201, loss_att=43.171, acc=0.962, loss=43.171, backward_time=0.298, grad_norm=81.925, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.316e-04, train_time=2.722 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 06:46:35,416 (trainer:732) INFO: 40epoch:train:8200-9110batch: iter_time=2.703e-04, forward_time=0.203, loss_att=42.766, acc=0.963, loss=42.766, backward_time=0.300, grad_norm=79.650, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.314e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 06:56:56,900 (trainer:732) INFO: 40epoch:train:9111-10021batch: iter_time=2.700e-04, forward_time=0.202, loss_att=42.683, acc=0.962, loss=42.683, backward_time=0.299, grad_norm=92.634, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.312e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 07:07:17,074 (trainer:732) INFO: 40epoch:train:10022-10932batch: iter_time=2.731e-04, forward_time=0.201, loss_att=42.548, acc=0.962, loss=42.548, backward_time=0.298, grad_norm=84.937, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.310e-04, train_time=2.722 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 07:17:37,116 (trainer:732) INFO: 40epoch:train:10933-11843batch: iter_time=2.672e-04, forward_time=0.201, loss_att=42.687, acc=0.962, loss=42.687, backward_time=0.299, grad_norm=79.694, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.308e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 07:27:58,674 (trainer:732) INFO: 40epoch:train:11844-12754batch: iter_time=2.629e-04, forward_time=0.202, loss_att=42.187, acc=0.963, loss=42.187, backward_time=0.299, grad_norm=80.657, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.306e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 07:38:22,274 (trainer:732) INFO: 40epoch:train:12755-13665batch: iter_time=2.596e-04, forward_time=0.202, loss_att=42.958, acc=0.963, loss=42.958, backward_time=0.300, grad_norm=81.633, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.303e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 07:48:47,567 (trainer:732) INFO: 40epoch:train:13666-14576batch: iter_time=2.674e-04, forward_time=0.203, loss_att=42.998, acc=0.963, loss=42.998, backward_time=0.301, grad_norm=77.899, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.301e-04, train_time=2.745 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 07:59:10,244 (trainer:732) INFO: 40epoch:train:14577-15487batch: iter_time=2.727e-04, forward_time=0.203, loss_att=42.733, acc=0.963, loss=42.733, backward_time=0.300, grad_norm=83.671, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.299e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 08:09:35,080 (trainer:732) INFO: 40epoch:train:15488-16398batch: iter_time=2.685e-04, forward_time=0.203, loss_att=43.762, acc=0.962, loss=43.762, backward_time=0.301, grad_norm=79.817, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.297e-04, train_time=2.743 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 08:19:55,771 (trainer:732) INFO: 40epoch:train:16399-17309batch: iter_time=2.682e-04, forward_time=0.201, loss_att=41.995, acc=0.963, loss=41.995, backward_time=0.298, grad_norm=81.131, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.295e-04, train_time=2.725 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 08:30:18,145 (trainer:732) INFO: 40epoch:train:17310-18220batch: iter_time=2.688e-04, forward_time=0.202, loss_att=42.928, acc=0.963, loss=42.928, backward_time=0.300, grad_norm=82.891, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.293e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 08:35:19,619 (trainer:338) INFO: 40epoch results: [train] iter_time=3.498e-04, forward_time=0.202, loss_att=42.639, acc=0.963, loss=42.639, backward_time=0.299, grad_norm=82.372, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.313e-04, train_time=2.757, time=3 hours, 29 minutes and 35.01 seconds, total_count=1142797, gpu_max_cached_mem_GB=30.096, [valid] loss_att=10.216, acc=0.982, cer=0.022, wer=0.087, loss=10.216, time=2 minutes and 41.83 seconds, total_count=1764, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 6 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 08:35:23,328 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 08:35:23,360 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/33epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 08:35:23,361 (trainer:272) INFO: 41/60epoch started. Estimated time to finish: 2 days, 23 hours and 51 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 08:47:44,127 (trainer:732) INFO: 41epoch:train:1-911batch: iter_time=0.002, forward_time=0.202, loss_att=42.257, acc=0.963, loss=42.257, backward_time=0.299, grad_norm=87.792, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.291e-04, train_time=3.254 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 08:58:06,406 (trainer:732) INFO: 41epoch:train:912-1822batch: iter_time=2.609e-04, forward_time=0.202, loss_att=41.305, acc=0.964, loss=41.305, backward_time=0.300, grad_norm=78.875, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.289e-04, train_time=2.732 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 09:08:27,391 (trainer:732) INFO: 41epoch:train:1823-2733batch: iter_time=2.563e-04, forward_time=0.202, loss_att=41.856, acc=0.963, loss=41.856, backward_time=0.299, grad_norm=86.194, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.287e-04, train_time=2.727 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 152) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 152) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 09:18:50,809 (trainer:732) INFO: 41epoch:train:2734-3644batch: iter_time=2.660e-04, forward_time=0.202, loss_att=42.873, acc=0.963, loss=42.873, backward_time=0.300, grad_norm=90.126, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.284e-04, train_time=2.736 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 09:29:12,100 (trainer:732) INFO: 41epoch:train:3645-4555batch: iter_time=2.675e-04, forward_time=0.202, loss_att=43.281, acc=0.963, loss=43.281, backward_time=0.299, grad_norm=80.670, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.282e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 09:39:32,363 (trainer:732) INFO: 41epoch:train:4556-5466batch: iter_time=2.647e-04, forward_time=0.202, loss_att=41.515, acc=0.963, loss=41.515, backward_time=0.298, grad_norm=78.326, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.280e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 09:49:53,625 (trainer:732) INFO: 41epoch:train:5467-6377batch: iter_time=2.592e-04, forward_time=0.202, loss_att=41.454, acc=0.963, loss=41.454, backward_time=0.299, grad_norm=77.769, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.278e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 10:00:17,279 (trainer:732) INFO: 41epoch:train:6378-7288batch: iter_time=2.620e-04, forward_time=0.202, loss_att=42.825, acc=0.963, loss=42.825, backward_time=0.300, grad_norm=86.934, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.276e-04, train_time=2.738 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 10:10:40,881 (trainer:732) INFO: 41epoch:train:7289-8199batch: iter_time=2.586e-04, forward_time=0.203, loss_att=42.592, acc=0.963, loss=42.592, backward_time=0.301, grad_norm=84.855, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.274e-04, train_time=2.739 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 10:21:00,992 (trainer:732) INFO: 41epoch:train:8200-9110batch: iter_time=2.565e-04, forward_time=0.202, loss_att=42.771, acc=0.962, loss=42.771, backward_time=0.298, grad_norm=79.176, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.272e-04, train_time=2.722 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 10:31:23,821 (trainer:732) INFO: 41epoch:train:9111-10021batch: iter_time=2.581e-04, forward_time=0.202, loss_att=42.372, acc=0.963, loss=42.372, backward_time=0.299, grad_norm=85.321, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.270e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 10:41:46,515 (trainer:732) INFO: 41epoch:train:10022-10932batch: iter_time=2.725e-04, forward_time=0.203, loss_att=41.999, acc=0.963, loss=41.999, backward_time=0.300, grad_norm=83.710, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.268e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 10:52:06,868 (trainer:732) INFO: 41epoch:train:10933-11843batch: iter_time=2.630e-04, forward_time=0.201, loss_att=42.764, acc=0.962, loss=42.764, backward_time=0.299, grad_norm=79.972, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.266e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 11:02:33,212 (trainer:732) INFO: 41epoch:train:11844-12754batch: iter_time=2.642e-04, forward_time=0.203, loss_att=43.524, acc=0.963, loss=43.524, backward_time=0.302, grad_norm=87.715, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.264e-04, train_time=2.750 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 11:12:54,803 (trainer:732) INFO: 41epoch:train:12755-13665batch: iter_time=2.755e-04, forward_time=0.202, loss_att=42.866, acc=0.962, loss=42.866, backward_time=0.299, grad_norm=84.763, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.261e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 11:23:15,575 (trainer:732) INFO: 41epoch:train:13666-14576batch: iter_time=2.592e-04, forward_time=0.201, loss_att=42.021, acc=0.963, loss=42.021, backward_time=0.298, grad_norm=87.146, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.259e-04, train_time=2.725 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 11:33:36,380 (trainer:732) INFO: 41epoch:train:14577-15487batch: iter_time=2.633e-04, forward_time=0.202, loss_att=42.106, acc=0.963, loss=42.106, backward_time=0.299, grad_norm=78.919, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.257e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 11:43:58,578 (trainer:732) INFO: 41epoch:train:15488-16398batch: iter_time=2.603e-04, forward_time=0.202, loss_att=43.095, acc=0.962, loss=43.095, backward_time=0.300, grad_norm=81.316, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.255e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 11:54:19,196 (trainer:732) INFO: 41epoch:train:16399-17309batch: iter_time=2.585e-04, forward_time=0.201, loss_att=41.941, acc=0.963, loss=41.941, backward_time=0.299, grad_norm=85.319, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.253e-04, train_time=2.725 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 12:04:39,877 (trainer:732) INFO: 41epoch:train:17310-18220batch: iter_time=2.540e-04, forward_time=0.201, loss_att=42.991, acc=0.962, loss=42.991, backward_time=0.298, grad_norm=85.113, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.251e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 12:09:46,775 (trainer:338) INFO: 41epoch results: [train] iter_time=3.394e-04, forward_time=0.202, loss_att=42.420, acc=0.963, loss=42.420, backward_time=0.299, grad_norm=83.501, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.271e-04, train_time=2.756, time=3 hours, 29 minutes and 30.71 seconds, total_count=1161029, gpu_max_cached_mem_GB=30.096, [valid] loss_att=10.512, acc=0.982, cer=0.022, wer=0.086, loss=10.512, time=2 minutes and 41.53 seconds, total_count=1792, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 11.17 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 12:09:50,354 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 12:09:50,369 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/28epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 12:09:50,369 (trainer:272) INFO: 42/60epoch started. Estimated time to finish: 2 days, 20 hours and 14 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 12:22:12,152 (trainer:732) INFO: 42epoch:train:1-911batch: iter_time=0.002, forward_time=0.202, loss_att=41.364, acc=0.964, loss=41.364, backward_time=0.299, grad_norm=85.464, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.249e-04, train_time=3.259 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 12:32:32,686 (trainer:732) INFO: 42epoch:train:912-1822batch: iter_time=2.894e-04, forward_time=0.202, loss_att=41.999, acc=0.963, loss=41.999, backward_time=0.299, grad_norm=87.313, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.247e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 12:42:56,319 (trainer:732) INFO: 42epoch:train:1823-2733batch: iter_time=2.742e-04, forward_time=0.202, loss_att=42.369, acc=0.963, loss=42.369, backward_time=0.300, grad_norm=87.397, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.245e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 12:53:17,672 (trainer:732) INFO: 42epoch:train:2734-3644batch: iter_time=2.732e-04, forward_time=0.201, loss_att=42.145, acc=0.963, loss=42.145, backward_time=0.299, grad_norm=83.163, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.243e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 13:03:39,803 (trainer:732) INFO: 42epoch:train:3645-4555batch: iter_time=2.745e-04, forward_time=0.202, loss_att=42.642, acc=0.963, loss=42.642, backward_time=0.300, grad_norm=81.368, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.241e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 13:14:01,508 (trainer:732) INFO: 42epoch:train:4556-5466batch: iter_time=2.720e-04, forward_time=0.202, loss_att=42.184, acc=0.963, loss=42.184, backward_time=0.299, grad_norm=88.744, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.239e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 13:24:22,798 (trainer:732) INFO: 42epoch:train:5467-6377batch: iter_time=2.706e-04, forward_time=0.202, loss_att=42.447, acc=0.963, loss=42.447, backward_time=0.299, grad_norm=84.743, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.237e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 13:34:45,200 (trainer:732) INFO: 42epoch:train:6378-7288batch: iter_time=2.740e-04, forward_time=0.202, loss_att=42.567, acc=0.963, loss=42.567, backward_time=0.300, grad_norm=82.178, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.235e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 13:45:07,572 (trainer:732) INFO: 42epoch:train:7289-8199batch: iter_time=2.733e-04, forward_time=0.202, loss_att=42.118, acc=0.963, loss=42.118, backward_time=0.300, grad_norm=77.770, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.233e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 13:55:28,381 (trainer:732) INFO: 42epoch:train:8200-9110batch: iter_time=2.704e-04, forward_time=0.201, loss_att=42.288, acc=0.963, loss=42.288, backward_time=0.299, grad_norm=78.465, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.231e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 14:05:50,785 (trainer:732) INFO: 42epoch:train:9111-10021batch: iter_time=2.801e-04, forward_time=0.202, loss_att=42.250, acc=0.963, loss=42.250, backward_time=0.299, grad_norm=79.567, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.229e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 14:16:15,385 (trainer:732) INFO: 42epoch:train:10022-10932batch: iter_time=2.669e-04, forward_time=0.203, loss_att=42.061, acc=0.964, loss=42.061, backward_time=0.301, grad_norm=80.467, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.227e-04, train_time=2.742 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 14:26:36,544 (trainer:732) INFO: 42epoch:train:10933-11843batch: iter_time=2.643e-04, forward_time=0.202, loss_att=42.077, acc=0.963, loss=42.077, backward_time=0.299, grad_norm=84.520, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.225e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 14:37:00,029 (trainer:732) INFO: 42epoch:train:11844-12754batch: iter_time=2.691e-04, forward_time=0.203, loss_att=43.272, acc=0.963, loss=43.272, backward_time=0.300, grad_norm=81.101, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.222e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 14:47:21,680 (trainer:732) INFO: 42epoch:train:12755-13665batch: iter_time=2.704e-04, forward_time=0.202, loss_att=42.413, acc=0.963, loss=42.413, backward_time=0.299, grad_norm=87.949, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.220e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 14:57:42,674 (trainer:732) INFO: 42epoch:train:13666-14576batch: iter_time=2.752e-04, forward_time=0.202, loss_att=41.955, acc=0.963, loss=41.955, backward_time=0.299, grad_norm=81.623, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.218e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 15:08:05,384 (trainer:732) INFO: 42epoch:train:14577-15487batch: iter_time=2.765e-04, forward_time=0.202, loss_att=42.344, acc=0.963, loss=42.344, backward_time=0.300, grad_norm=77.655, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.216e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 15:18:25,269 (trainer:732) INFO: 42epoch:train:15488-16398batch: iter_time=2.799e-04, forward_time=0.202, loss_att=41.940, acc=0.963, loss=41.940, backward_time=0.298, grad_norm=82.946, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.214e-04, train_time=2.721 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 15:28:45,646 (trainer:732) INFO: 42epoch:train:16399-17309batch: iter_time=2.686e-04, forward_time=0.202, loss_att=41.285, acc=0.963, loss=41.285, backward_time=0.299, grad_norm=85.282, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.212e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 15:39:08,326 (trainer:732) INFO: 42epoch:train:17310-18220batch: iter_time=2.688e-04, forward_time=0.203, loss_att=43.136, acc=0.963, loss=43.136, backward_time=0.300, grad_norm=87.334, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.210e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 15:44:08,637 (trainer:338) INFO: 42epoch results: [train] iter_time=3.403e-04, forward_time=0.202, loss_att=42.242, acc=0.963, loss=42.242, backward_time=0.299, grad_norm=83.314, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.230e-04, train_time=2.757, time=3 hours, 29 minutes and 31.87 seconds, total_count=1179261, gpu_max_cached_mem_GB=30.096, [valid] loss_att=10.272, acc=0.982, cer=0.022, wer=0.085, loss=10.272, time=2 minutes and 39.18 seconds, total_count=1820, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 7.22 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 15:44:12,251 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 15:44:12,265 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/35epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 15:44:12,265 (trainer:272) INFO: 43/60epoch started. Estimated time to finish: 2 days, 16 hours and 38 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 15:56:34,598 (trainer:732) INFO: 43epoch:train:1-911batch: iter_time=0.002, forward_time=0.203, loss_att=41.893, acc=0.964, loss=41.893, backward_time=0.300, grad_norm=84.406, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.208e-04, train_time=3.261 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 16:06:53,725 (trainer:732) INFO: 43epoch:train:912-1822batch: iter_time=2.644e-04, forward_time=0.201, loss_att=41.598, acc=0.963, loss=41.598, backward_time=0.298, grad_norm=81.324, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.206e-04, train_time=2.718 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 16:17:15,540 (trainer:732) INFO: 43epoch:train:1823-2733batch: iter_time=2.657e-04, forward_time=0.202, loss_att=42.019, acc=0.963, loss=42.019, backward_time=0.299, grad_norm=79.102, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.204e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 16:27:37,126 (trainer:732) INFO: 43epoch:train:2734-3644batch: iter_time=2.636e-04, forward_time=0.202, loss_att=42.076, acc=0.963, loss=42.076, backward_time=0.299, grad_norm=81.174, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.202e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 16:37:59,519 (trainer:732) INFO: 43epoch:train:3645-4555batch: iter_time=2.700e-04, forward_time=0.202, loss_att=41.603, acc=0.964, loss=41.603, backward_time=0.300, grad_norm=82.410, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.200e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 16:48:20,766 (trainer:732) INFO: 43epoch:train:4556-5466batch: iter_time=2.647e-04, forward_time=0.202, loss_att=41.880, acc=0.963, loss=41.880, backward_time=0.299, grad_norm=87.376, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.198e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 16:58:43,407 (trainer:732) INFO: 43epoch:train:5467-6377batch: iter_time=2.626e-04, forward_time=0.202, loss_att=41.245, acc=0.964, loss=41.245, backward_time=0.299, grad_norm=76.745, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.196e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 17:09:05,026 (trainer:732) INFO: 43epoch:train:6378-7288batch: iter_time=2.536e-04, forward_time=0.202, loss_att=41.917, acc=0.963, loss=41.917, backward_time=0.299, grad_norm=82.830, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.194e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 17:19:26,959 (trainer:732) INFO: 43epoch:train:7289-8199batch: iter_time=2.648e-04, forward_time=0.202, loss_att=42.534, acc=0.963, loss=42.534, backward_time=0.299, grad_norm=91.351, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.192e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 17:29:47,278 (trainer:732) INFO: 43epoch:train:8200-9110batch: iter_time=2.597e-04, forward_time=0.201, loss_att=41.873, acc=0.963, loss=41.873, backward_time=0.298, grad_norm=82.288, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.190e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 17:40:09,272 (trainer:732) INFO: 43epoch:train:9111-10021batch: iter_time=2.638e-04, forward_time=0.202, loss_att=41.911, acc=0.963, loss=41.911, backward_time=0.299, grad_norm=88.456, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.188e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 17:50:30,572 (trainer:732) INFO: 43epoch:train:10022-10932batch: iter_time=2.713e-04, forward_time=0.202, loss_att=41.939, acc=0.963, loss=41.939, backward_time=0.299, grad_norm=82.352, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.186e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 18:00:50,483 (trainer:732) INFO: 43epoch:train:10933-11843batch: iter_time=2.613e-04, forward_time=0.201, loss_att=42.057, acc=0.963, loss=42.057, backward_time=0.298, grad_norm=74.833, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.184e-04, train_time=2.722 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 18:11:13,987 (trainer:732) INFO: 43epoch:train:11844-12754batch: iter_time=2.569e-04, forward_time=0.203, loss_att=42.051, acc=0.963, loss=42.051, backward_time=0.301, grad_norm=83.401, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.182e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 18:21:34,869 (trainer:732) INFO: 43epoch:train:12755-13665batch: iter_time=2.580e-04, forward_time=0.202, loss_att=41.475, acc=0.963, loss=41.475, backward_time=0.299, grad_norm=84.465, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.180e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 18:31:59,403 (trainer:732) INFO: 43epoch:train:13666-14576batch: iter_time=2.631e-04, forward_time=0.203, loss_att=42.581, acc=0.963, loss=42.581, backward_time=0.301, grad_norm=79.060, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.178e-04, train_time=2.741 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 18:42:23,156 (trainer:732) INFO: 43epoch:train:14577-15487batch: iter_time=2.708e-04, forward_time=0.203, loss_att=43.361, acc=0.963, loss=43.361, backward_time=0.301, grad_norm=83.524, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.176e-04, train_time=2.739 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 18:52:46,266 (trainer:732) INFO: 43epoch:train:15488-16398batch: iter_time=2.647e-04, forward_time=0.203, loss_att=42.191, acc=0.963, loss=42.191, backward_time=0.300, grad_norm=82.296, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.174e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 19:03:08,945 (trainer:732) INFO: 43epoch:train:16399-17309batch: iter_time=2.657e-04, forward_time=0.202, loss_att=42.698, acc=0.963, loss=42.698, backward_time=0.300, grad_norm=85.603, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.172e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 19:13:31,733 (trainer:732) INFO: 43epoch:train:17310-18220batch: iter_time=2.657e-04, forward_time=0.203, loss_att=41.949, acc=0.963, loss=41.949, backward_time=0.300, grad_norm=79.434, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.171e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 19:18:29,585 (trainer:338) INFO: 43epoch results: [train] iter_time=3.425e-04, forward_time=0.202, loss_att=42.042, acc=0.963, loss=42.042, backward_time=0.299, grad_norm=82.609, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.189e-04, train_time=2.757, time=3 hours, 29 minutes and 33.47 seconds, total_count=1197493, gpu_max_cached_mem_GB=30.096, [valid] loss_att=9.896, acc=0.983, cer=0.022, wer=0.084, loss=9.896, time=2 minutes and 39.37 seconds, total_count=1848, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 4.47 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 19:18:33,675 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 19:18:33,690 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/31epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 19:18:33,690 (trainer:272) INFO: 44/60epoch started. Estimated time to finish: 2 days, 13 hours and 1 minute +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 19:30:54,456 (trainer:732) INFO: 44epoch:train:1-911batch: iter_time=0.001, forward_time=0.202, loss_att=40.711, acc=0.964, loss=40.711, backward_time=0.299, grad_norm=80.471, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.169e-04, train_time=3.255 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 19:41:15,404 (trainer:732) INFO: 44epoch:train:912-1822batch: iter_time=2.705e-04, forward_time=0.202, loss_att=40.740, acc=0.964, loss=40.740, backward_time=0.299, grad_norm=85.960, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.167e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 19:51:37,138 (trainer:732) INFO: 44epoch:train:1823-2733batch: iter_time=2.703e-04, forward_time=0.202, loss_att=41.537, acc=0.963, loss=41.537, backward_time=0.299, grad_norm=81.144, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.165e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 20:01:57,120 (trainer:732) INFO: 44epoch:train:2734-3644batch: iter_time=2.663e-04, forward_time=0.201, loss_att=41.874, acc=0.963, loss=41.874, backward_time=0.298, grad_norm=82.128, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.163e-04, train_time=2.721 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 20:12:19,036 (trainer:732) INFO: 44epoch:train:3645-4555batch: iter_time=2.620e-04, forward_time=0.202, loss_att=42.224, acc=0.963, loss=42.224, backward_time=0.299, grad_norm=80.069, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.161e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 20:22:42,504 (trainer:732) INFO: 44epoch:train:4556-5466batch: iter_time=2.634e-04, forward_time=0.202, loss_att=42.256, acc=0.963, loss=42.256, backward_time=0.300, grad_norm=81.969, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.159e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 20:33:04,622 (trainer:732) INFO: 44epoch:train:5467-6377batch: iter_time=2.667e-04, forward_time=0.202, loss_att=42.299, acc=0.963, loss=42.299, backward_time=0.300, grad_norm=83.233, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.157e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 20:43:27,236 (trainer:732) INFO: 44epoch:train:6378-7288batch: iter_time=2.649e-04, forward_time=0.202, loss_att=41.940, acc=0.963, loss=41.940, backward_time=0.300, grad_norm=90.157, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.155e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 20:53:48,458 (trainer:732) INFO: 44epoch:train:7289-8199batch: iter_time=2.694e-04, forward_time=0.202, loss_att=42.241, acc=0.963, loss=42.241, backward_time=0.299, grad_norm=80.768, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.153e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 21:04:11,090 (trainer:732) INFO: 44epoch:train:8200-9110batch: iter_time=2.673e-04, forward_time=0.202, loss_att=42.370, acc=0.963, loss=42.370, backward_time=0.299, grad_norm=82.208, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.151e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 21:14:34,257 (trainer:732) INFO: 44epoch:train:9111-10021batch: iter_time=2.666e-04, forward_time=0.202, loss_att=41.897, acc=0.964, loss=41.897, backward_time=0.300, grad_norm=81.258, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.149e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 21:24:56,910 (trainer:732) INFO: 44epoch:train:10022-10932batch: iter_time=2.655e-04, forward_time=0.202, loss_att=42.567, acc=0.963, loss=42.567, backward_time=0.299, grad_norm=96.178, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.147e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 21:35:20,359 (trainer:732) INFO: 44epoch:train:10933-11843batch: iter_time=2.559e-04, forward_time=0.203, loss_att=42.111, acc=0.963, loss=42.111, backward_time=0.300, grad_norm=83.752, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.145e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 21:45:40,810 (trainer:732) INFO: 44epoch:train:11844-12754batch: iter_time=2.759e-04, forward_time=0.202, loss_att=41.547, acc=0.963, loss=41.547, backward_time=0.299, grad_norm=81.413, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.143e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 21:56:02,820 (trainer:732) INFO: 44epoch:train:12755-13665batch: iter_time=2.629e-04, forward_time=0.202, loss_att=42.330, acc=0.963, loss=42.330, backward_time=0.299, grad_norm=82.441, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.141e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 22:06:22,966 (trainer:732) INFO: 44epoch:train:13666-14576batch: iter_time=2.728e-04, forward_time=0.202, loss_att=41.789, acc=0.963, loss=41.789, backward_time=0.299, grad_norm=79.495, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.139e-04, train_time=2.722 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 22:16:44,441 (trainer:732) INFO: 44epoch:train:14577-15487batch: iter_time=2.680e-04, forward_time=0.202, loss_att=41.258, acc=0.963, loss=41.258, backward_time=0.299, grad_norm=78.617, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.137e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 22:27:08,218 (trainer:732) INFO: 44epoch:train:15488-16398batch: iter_time=2.633e-04, forward_time=0.203, loss_att=42.527, acc=0.963, loss=42.527, backward_time=0.300, grad_norm=83.781, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.135e-04, train_time=2.739 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 22:37:30,968 (trainer:732) INFO: 44epoch:train:16399-17309batch: iter_time=2.741e-04, forward_time=0.202, loss_att=41.240, acc=0.964, loss=41.240, backward_time=0.300, grad_norm=82.988, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.134e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 22:47:54,989 (trainer:732) INFO: 44epoch:train:17310-18220batch: iter_time=2.685e-04, forward_time=0.203, loss_att=41.117, acc=0.964, loss=41.117, backward_time=0.300, grad_norm=79.927, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.132e-04, train_time=2.739 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 22:53:08,107 (trainer:338) INFO: 44epoch results: [train] iter_time=3.241e-04, forward_time=0.202, loss_att=41.826, acc=0.963, loss=41.826, backward_time=0.299, grad_norm=82.896, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.150e-04, train_time=2.757, time=3 hours, 29 minutes and 35.9 seconds, total_count=1215725, gpu_max_cached_mem_GB=30.096, [valid] loss_att=9.924, acc=0.982, cer=0.022, wer=0.085, loss=9.924, time=2 minutes and 51.64 seconds, total_count=1876, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 6.87 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 22:53:12,313 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 22:53:12,328 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/39epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 22:53:12,329 (trainer:272) INFO: 45/60epoch started. Estimated time to finish: 2 days, 9 hours and 25 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 23:05:33,337 (trainer:732) INFO: 45epoch:train:1-911batch: iter_time=0.001, forward_time=0.202, loss_att=41.874, acc=0.963, loss=41.874, backward_time=0.299, grad_norm=79.978, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.130e-04, train_time=3.255 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 23:15:55,429 (trainer:732) INFO: 45epoch:train:912-1822batch: iter_time=2.743e-04, forward_time=0.202, loss_att=40.717, acc=0.964, loss=40.717, backward_time=0.300, grad_norm=82.516, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.128e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 23:26:17,612 (trainer:732) INFO: 45epoch:train:1823-2733batch: iter_time=2.688e-04, forward_time=0.202, loss_att=41.357, acc=0.964, loss=41.357, backward_time=0.300, grad_norm=84.971, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.126e-04, train_time=2.732 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2641:2718 [1] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2646:2719 [6] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 23:36:40,444 (trainer:732) INFO: 45epoch:train:2734-3644batch: iter_time=2.764e-04, forward_time=0.202, loss_att=41.067, acc=0.964, loss=41.067, backward_time=0.299, grad_norm=79.626, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.124e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 23:47:04,307 (trainer:732) INFO: 45epoch:train:3645-4555batch: iter_time=2.718e-04, forward_time=0.203, loss_att=41.735, acc=0.964, loss=41.735, backward_time=0.300, grad_norm=84.745, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.122e-04, train_time=2.740 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2644:2715 [4] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2643:2716 [3] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 0, fd 131) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-01 23:57:24,872 (trainer:732) INFO: 45epoch:train:4556-5466batch: iter_time=2.702e-04, forward_time=0.201, loss_att=41.529, acc=0.963, loss=41.529, backward_time=0.298, grad_norm=83.440, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.120e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 00:07:46,302 (trainer:732) INFO: 45epoch:train:5467-6377batch: iter_time=2.720e-04, forward_time=0.202, loss_att=41.301, acc=0.964, loss=41.301, backward_time=0.299, grad_norm=83.280, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.118e-04, train_time=2.728 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2645:2720 [5] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 00:18:07,728 (trainer:732) INFO: 45epoch:train:6378-7288batch: iter_time=2.637e-04, forward_time=0.202, loss_att=41.689, acc=0.963, loss=41.689, backward_time=0.300, grad_norm=80.550, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.116e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 00:28:30,502 (trainer:732) INFO: 45epoch:train:7289-8199batch: iter_time=2.833e-04, forward_time=0.202, loss_att=41.285, acc=0.964, loss=41.285, backward_time=0.300, grad_norm=85.257, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.114e-04, train_time=2.734 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2647:2717 [7] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2642:2713 [2] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 132) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 00:38:55,064 (trainer:732) INFO: 45epoch:train:8200-9110batch: iter_time=2.741e-04, forward_time=0.203, loss_att=42.075, acc=0.963, loss=42.075, backward_time=0.300, grad_norm=82.894, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.112e-04, train_time=2.743 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 00:49:18,140 (trainer:732) INFO: 45epoch:train:9111-10021batch: iter_time=2.705e-04, forward_time=0.202, loss_att=41.640, acc=0.964, loss=41.640, backward_time=0.300, grad_norm=87.181, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.111e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 00:59:40,679 (trainer:732) INFO: 45epoch:train:10022-10932batch: iter_time=2.818e-04, forward_time=0.202, loss_att=41.652, acc=0.963, loss=41.652, backward_time=0.299, grad_norm=83.381, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.109e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 01:10:02,880 (trainer:732) INFO: 45epoch:train:10933-11843batch: iter_time=2.797e-04, forward_time=0.202, loss_att=42.442, acc=0.963, loss=42.442, backward_time=0.299, grad_norm=84.278, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.107e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 01:20:26,711 (trainer:732) INFO: 45epoch:train:11844-12754batch: iter_time=2.820e-04, forward_time=0.203, loss_att=42.270, acc=0.963, loss=42.270, backward_time=0.300, grad_norm=84.830, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.105e-04, train_time=2.739 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 152) + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] include/alloc.h:48 NCCL WARN Cuda failure 'out of memory' +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] NCCL INFO bootstrap.cc:231 -> 1 + +de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:2640:2714 [0] bootstrap.cc:279 NCCL WARN [Rem Allocator] Allocation failed (segment 1, fd 152) +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 01:30:49,841 (trainer:732) INFO: 45epoch:train:12755-13665batch: iter_time=2.870e-04, forward_time=0.202, loss_att=41.925, acc=0.963, loss=41.925, backward_time=0.299, grad_norm=79.338, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.103e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 01:41:09,639 (trainer:732) INFO: 45epoch:train:13666-14576batch: iter_time=2.848e-04, forward_time=0.201, loss_att=41.342, acc=0.963, loss=41.342, backward_time=0.298, grad_norm=79.481, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.101e-04, train_time=2.720 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 01:51:31,736 (trainer:732) INFO: 45epoch:train:14577-15487batch: iter_time=2.744e-04, forward_time=0.202, loss_att=41.377, acc=0.964, loss=41.377, backward_time=0.299, grad_norm=81.501, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.099e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 02:01:55,760 (trainer:732) INFO: 45epoch:train:15488-16398batch: iter_time=2.774e-04, forward_time=0.203, loss_att=41.653, acc=0.964, loss=41.653, backward_time=0.300, grad_norm=86.599, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.097e-04, train_time=2.739 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 02:12:19,061 (trainer:732) INFO: 45epoch:train:16399-17309batch: iter_time=2.737e-04, forward_time=0.202, loss_att=41.858, acc=0.963, loss=41.858, backward_time=0.300, grad_norm=83.840, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.095e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 02:22:41,536 (trainer:732) INFO: 45epoch:train:17310-18220batch: iter_time=2.847e-04, forward_time=0.202, loss_att=42.356, acc=0.963, loss=42.356, backward_time=0.299, grad_norm=80.772, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.094e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 02:28:07,796 (trainer:338) INFO: 45epoch results: [train] iter_time=3.347e-04, forward_time=0.202, loss_att=41.653, acc=0.963, loss=41.653, backward_time=0.299, grad_norm=82.922, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.112e-04, train_time=2.759, time=3 hours, 29 minutes and 43.66 seconds, total_count=1233957, gpu_max_cached_mem_GB=30.096, [valid] loss_att=10.403, acc=0.982, cer=0.022, wer=0.086, loss=10.403, time=2 minutes and 59.79 seconds, total_count=1904, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 12.01 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 02:28:12,304 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 02:28:12,339 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/34epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 02:28:12,339 (trainer:272) INFO: 46/60epoch started. Estimated time to finish: 2 days, 5 hours and 50 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 02:40:37,317 (trainer:732) INFO: 46epoch:train:1-911batch: iter_time=0.002, forward_time=0.203, loss_att=41.948, acc=0.964, loss=41.948, backward_time=0.300, grad_norm=85.985, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.092e-04, train_time=3.273 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 02:51:00,642 (trainer:732) INFO: 46epoch:train:912-1822batch: iter_time=2.821e-04, forward_time=0.203, loss_att=41.862, acc=0.964, loss=41.862, backward_time=0.300, grad_norm=92.508, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.090e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 03:01:21,172 (trainer:732) INFO: 46epoch:train:1823-2733batch: iter_time=2.725e-04, forward_time=0.202, loss_att=40.697, acc=0.964, loss=40.697, backward_time=0.299, grad_norm=85.773, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.088e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 03:11:43,925 (trainer:732) INFO: 46epoch:train:2734-3644batch: iter_time=2.686e-04, forward_time=0.202, loss_att=41.368, acc=0.964, loss=41.368, backward_time=0.299, grad_norm=81.498, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=5.086e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 03:22:07,132 (trainer:732) INFO: 46epoch:train:3645-4555batch: iter_time=2.821e-04, forward_time=0.203, loss_att=40.928, acc=0.964, loss=40.928, backward_time=0.300, grad_norm=80.739, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.084e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 03:32:33,484 (trainer:732) INFO: 46epoch:train:4556-5466batch: iter_time=2.741e-04, forward_time=0.203, loss_att=41.640, acc=0.964, loss=41.640, backward_time=0.301, grad_norm=86.213, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.082e-04, train_time=2.750 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 03:42:59,112 (trainer:732) INFO: 46epoch:train:5467-6377batch: iter_time=2.779e-04, forward_time=0.203, loss_att=41.590, acc=0.964, loss=41.590, backward_time=0.301, grad_norm=85.067, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.080e-04, train_time=2.746 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 03:53:20,966 (trainer:732) INFO: 46epoch:train:6378-7288batch: iter_time=2.705e-04, forward_time=0.202, loss_att=41.588, acc=0.963, loss=41.588, backward_time=0.299, grad_norm=84.949, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.079e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 04:03:43,416 (trainer:732) INFO: 46epoch:train:7289-8199batch: iter_time=2.768e-04, forward_time=0.202, loss_att=42.041, acc=0.963, loss=42.041, backward_time=0.299, grad_norm=87.997, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.077e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 04:14:03,571 (trainer:732) INFO: 46epoch:train:8200-9110batch: iter_time=2.754e-04, forward_time=0.202, loss_att=41.359, acc=0.964, loss=41.359, backward_time=0.299, grad_norm=84.824, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.075e-04, train_time=2.722 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 04:24:25,972 (trainer:732) INFO: 46epoch:train:9111-10021batch: iter_time=2.727e-04, forward_time=0.202, loss_att=41.892, acc=0.963, loss=41.892, backward_time=0.299, grad_norm=87.583, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.073e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 04:34:46,508 (trainer:732) INFO: 46epoch:train:10022-10932batch: iter_time=2.734e-04, forward_time=0.202, loss_att=41.581, acc=0.963, loss=41.581, backward_time=0.298, grad_norm=84.349, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.071e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 04:45:08,014 (trainer:732) INFO: 46epoch:train:10933-11843batch: iter_time=2.687e-04, forward_time=0.202, loss_att=41.631, acc=0.963, loss=41.631, backward_time=0.299, grad_norm=86.063, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.069e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 04:55:30,223 (trainer:732) INFO: 46epoch:train:11844-12754batch: iter_time=2.715e-04, forward_time=0.202, loss_att=41.071, acc=0.964, loss=41.071, backward_time=0.299, grad_norm=85.385, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.067e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 05:05:52,794 (trainer:732) INFO: 46epoch:train:12755-13665batch: iter_time=2.692e-04, forward_time=0.202, loss_att=41.549, acc=0.964, loss=41.549, backward_time=0.299, grad_norm=82.769, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.066e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 05:16:16,192 (trainer:732) INFO: 46epoch:train:13666-14576batch: iter_time=2.705e-04, forward_time=0.202, loss_att=40.933, acc=0.964, loss=40.933, backward_time=0.300, grad_norm=86.155, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.064e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 05:26:39,489 (trainer:732) INFO: 46epoch:train:14577-15487batch: iter_time=2.661e-04, forward_time=0.202, loss_att=41.777, acc=0.963, loss=41.777, backward_time=0.300, grad_norm=88.174, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.062e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 05:37:00,904 (trainer:732) INFO: 46epoch:train:15488-16398batch: iter_time=2.665e-04, forward_time=0.202, loss_att=41.340, acc=0.964, loss=41.340, backward_time=0.299, grad_norm=90.326, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.060e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 05:47:23,125 (trainer:732) INFO: 46epoch:train:16399-17309batch: iter_time=2.708e-04, forward_time=0.202, loss_att=41.413, acc=0.963, loss=41.413, backward_time=0.299, grad_norm=89.570, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.058e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 05:57:44,253 (trainer:732) INFO: 46epoch:train:17310-18220batch: iter_time=2.696e-04, forward_time=0.201, loss_att=41.242, acc=0.963, loss=41.242, backward_time=0.298, grad_norm=86.046, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.056e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 06:03:07,680 (trainer:338) INFO: 46epoch results: [train] iter_time=3.441e-04, forward_time=0.202, loss_att=41.470, acc=0.964, loss=41.470, backward_time=0.299, grad_norm=86.100, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.074e-04, train_time=2.760, time=3 hours, 29 minutes and 46.67 seconds, total_count=1252189, gpu_max_cached_mem_GB=30.096, [valid] loss_att=10.296, acc=0.982, cer=0.022, wer=0.085, loss=10.296, time=2 minutes and 58.77 seconds, total_count=1932, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 9.9 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 06:03:11,822 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 06:03:11,837 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/37epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 06:03:11,837 (trainer:272) INFO: 47/60epoch started. Estimated time to finish: 2 days, 2 hours and 14 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 06:15:34,669 (trainer:732) INFO: 47epoch:train:1-911batch: iter_time=0.002, forward_time=0.203, loss_att=41.290, acc=0.964, loss=41.290, backward_time=0.300, grad_norm=76.681, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.054e-04, train_time=3.263 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 06:25:55,095 (trainer:732) INFO: 47epoch:train:912-1822batch: iter_time=2.703e-04, forward_time=0.202, loss_att=41.025, acc=0.964, loss=41.025, backward_time=0.298, grad_norm=84.130, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=5.053e-04, train_time=2.725 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 06:36:17,198 (trainer:732) INFO: 47epoch:train:1823-2733batch: iter_time=2.682e-04, forward_time=0.202, loss_att=41.282, acc=0.964, loss=41.282, backward_time=0.300, grad_norm=84.688, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.051e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 06:46:37,597 (trainer:732) INFO: 47epoch:train:2734-3644batch: iter_time=2.643e-04, forward_time=0.202, loss_att=40.923, acc=0.964, loss=40.923, backward_time=0.299, grad_norm=82.871, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.049e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 06:57:00,999 (trainer:732) INFO: 47epoch:train:3645-4555batch: iter_time=2.575e-04, forward_time=0.202, loss_att=41.489, acc=0.964, loss=41.489, backward_time=0.300, grad_norm=81.161, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.047e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 07:07:21,036 (trainer:732) INFO: 47epoch:train:4556-5466batch: iter_time=2.583e-04, forward_time=0.201, loss_att=40.715, acc=0.964, loss=40.715, backward_time=0.298, grad_norm=83.318, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.045e-04, train_time=2.722 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 07:17:44,469 (trainer:732) INFO: 47epoch:train:5467-6377batch: iter_time=2.658e-04, forward_time=0.203, loss_att=41.730, acc=0.964, loss=41.730, backward_time=0.300, grad_norm=89.103, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.043e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 07:28:07,745 (trainer:732) INFO: 47epoch:train:6378-7288batch: iter_time=2.655e-04, forward_time=0.202, loss_att=41.578, acc=0.964, loss=41.578, backward_time=0.300, grad_norm=89.939, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.042e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 07:38:30,708 (trainer:732) INFO: 47epoch:train:7289-8199batch: iter_time=2.576e-04, forward_time=0.202, loss_att=40.598, acc=0.964, loss=40.598, backward_time=0.299, grad_norm=86.139, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.040e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 07:48:51,787 (trainer:732) INFO: 47epoch:train:8200-9110batch: iter_time=2.598e-04, forward_time=0.202, loss_att=41.152, acc=0.963, loss=41.152, backward_time=0.299, grad_norm=86.553, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.038e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 07:59:13,605 (trainer:732) INFO: 47epoch:train:9111-10021batch: iter_time=2.622e-04, forward_time=0.202, loss_att=42.006, acc=0.963, loss=42.006, backward_time=0.299, grad_norm=79.214, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.036e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 08:09:37,511 (trainer:732) INFO: 47epoch:train:10022-10932batch: iter_time=2.567e-04, forward_time=0.202, loss_att=40.850, acc=0.964, loss=40.850, backward_time=0.301, grad_norm=82.630, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.034e-04, train_time=2.739 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 08:20:00,572 (trainer:732) INFO: 47epoch:train:10933-11843batch: iter_time=2.627e-04, forward_time=0.202, loss_att=41.729, acc=0.964, loss=41.729, backward_time=0.300, grad_norm=89.464, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.033e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 08:30:21,951 (trainer:732) INFO: 47epoch:train:11844-12754batch: iter_time=2.728e-04, forward_time=0.202, loss_att=41.761, acc=0.963, loss=41.761, backward_time=0.299, grad_norm=78.101, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.031e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 08:40:43,825 (trainer:732) INFO: 47epoch:train:12755-13665batch: iter_time=2.494e-04, forward_time=0.202, loss_att=41.487, acc=0.964, loss=41.487, backward_time=0.300, grad_norm=90.505, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.029e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 08:51:06,893 (trainer:732) INFO: 47epoch:train:13666-14576batch: iter_time=2.644e-04, forward_time=0.202, loss_att=41.080, acc=0.964, loss=41.080, backward_time=0.300, grad_norm=82.433, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.027e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 09:01:24,797 (trainer:732) INFO: 47epoch:train:14577-15487batch: iter_time=2.580e-04, forward_time=0.201, loss_att=41.370, acc=0.963, loss=41.370, backward_time=0.297, grad_norm=80.833, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.025e-04, train_time=2.713 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 09:11:47,309 (trainer:732) INFO: 47epoch:train:15488-16398batch: iter_time=2.534e-04, forward_time=0.202, loss_att=41.715, acc=0.964, loss=41.715, backward_time=0.300, grad_norm=84.247, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.023e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 09:22:08,291 (trainer:732) INFO: 47epoch:train:16399-17309batch: iter_time=2.534e-04, forward_time=0.202, loss_att=42.005, acc=0.963, loss=42.005, backward_time=0.299, grad_norm=83.352, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.022e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 09:32:30,068 (trainer:732) INFO: 47epoch:train:17310-18220batch: iter_time=2.519e-04, forward_time=0.202, loss_att=41.241, acc=0.964, loss=41.241, backward_time=0.299, grad_norm=82.463, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.020e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 09:37:34,161 (trainer:338) INFO: 47epoch results: [train] iter_time=3.295e-04, forward_time=0.202, loss_att=41.355, acc=0.964, loss=41.355, backward_time=0.299, grad_norm=83.889, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.037e-04, train_time=2.757, time=3 hours, 29 minutes and 32.53 seconds, total_count=1270421, gpu_max_cached_mem_GB=30.096, [valid] loss_att=10.480, acc=0.981, cer=0.021, wer=0.084, loss=10.480, time=2 minutes and 42.85 seconds, total_count=1960, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 6.94 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 09:37:38,406 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 09:37:38,421 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/36epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 09:37:38,422 (trainer:272) INFO: 48/60epoch started. Estimated time to finish: 1 day, 22 hours and 38 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 09:49:57,194 (trainer:732) INFO: 48epoch:train:1-911batch: iter_time=0.002, forward_time=0.202, loss_att=41.036, acc=0.964, loss=41.036, backward_time=0.298, grad_norm=81.479, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.018e-04, train_time=3.245 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 10:00:19,366 (trainer:732) INFO: 48epoch:train:912-1822batch: iter_time=2.659e-04, forward_time=0.202, loss_att=41.093, acc=0.964, loss=41.093, backward_time=0.300, grad_norm=85.636, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.016e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 10:10:44,372 (trainer:732) INFO: 48epoch:train:1823-2733batch: iter_time=2.621e-04, forward_time=0.203, loss_att=41.146, acc=0.965, loss=41.146, backward_time=0.301, grad_norm=84.424, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.014e-04, train_time=2.744 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 10:21:04,793 (trainer:732) INFO: 48epoch:train:2734-3644batch: iter_time=2.611e-04, forward_time=0.201, loss_att=41.263, acc=0.963, loss=41.263, backward_time=0.298, grad_norm=90.640, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.013e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 10:31:26,589 (trainer:732) INFO: 48epoch:train:3645-4555batch: iter_time=2.731e-04, forward_time=0.202, loss_att=41.692, acc=0.963, loss=41.692, backward_time=0.299, grad_norm=85.736, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.011e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 10:41:48,862 (trainer:732) INFO: 48epoch:train:4556-5466batch: iter_time=2.788e-04, forward_time=0.203, loss_att=40.682, acc=0.964, loss=40.682, backward_time=0.300, grad_norm=79.030, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.009e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 10:52:09,952 (trainer:732) INFO: 48epoch:train:5467-6377batch: iter_time=2.589e-04, forward_time=0.202, loss_att=41.215, acc=0.964, loss=41.215, backward_time=0.299, grad_norm=90.861, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.007e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 11:02:31,526 (trainer:732) INFO: 48epoch:train:6378-7288batch: iter_time=2.576e-04, forward_time=0.202, loss_att=41.408, acc=0.964, loss=41.408, backward_time=0.299, grad_norm=80.001, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.005e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 11:12:54,500 (trainer:732) INFO: 48epoch:train:7289-8199batch: iter_time=2.573e-04, forward_time=0.203, loss_att=41.021, acc=0.964, loss=41.021, backward_time=0.300, grad_norm=84.381, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.004e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 11:23:16,621 (trainer:732) INFO: 48epoch:train:8200-9110batch: iter_time=2.536e-04, forward_time=0.202, loss_att=41.427, acc=0.964, loss=41.427, backward_time=0.299, grad_norm=85.295, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.002e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 11:33:35,308 (trainer:732) INFO: 48epoch:train:9111-10021batch: iter_time=2.539e-04, forward_time=0.201, loss_att=40.686, acc=0.963, loss=40.686, backward_time=0.297, grad_norm=83.495, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.000e-04, train_time=2.716 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 11:43:59,205 (trainer:732) INFO: 48epoch:train:10022-10932batch: iter_time=2.554e-04, forward_time=0.203, loss_att=42.212, acc=0.963, loss=42.212, backward_time=0.301, grad_norm=95.995, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.998e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 11:54:21,242 (trainer:732) INFO: 48epoch:train:10933-11843batch: iter_time=2.575e-04, forward_time=0.202, loss_att=41.031, acc=0.964, loss=41.031, backward_time=0.299, grad_norm=81.519, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.997e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 12:04:40,370 (trainer:732) INFO: 48epoch:train:11844-12754batch: iter_time=2.617e-04, forward_time=0.201, loss_att=40.717, acc=0.964, loss=40.717, backward_time=0.298, grad_norm=84.936, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.995e-04, train_time=2.718 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 12:15:03,511 (trainer:732) INFO: 48epoch:train:12755-13665batch: iter_time=2.581e-04, forward_time=0.202, loss_att=40.871, acc=0.964, loss=40.871, backward_time=0.300, grad_norm=85.194, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.993e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 12:25:27,181 (trainer:732) INFO: 48epoch:train:13666-14576batch: iter_time=2.609e-04, forward_time=0.202, loss_att=41.531, acc=0.964, loss=41.531, backward_time=0.300, grad_norm=86.230, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.991e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 12:35:47,750 (trainer:732) INFO: 48epoch:train:14577-15487batch: iter_time=2.635e-04, forward_time=0.202, loss_att=40.991, acc=0.964, loss=40.991, backward_time=0.299, grad_norm=87.181, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.990e-04, train_time=2.725 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 12:46:11,621 (trainer:732) INFO: 48epoch:train:15488-16398batch: iter_time=2.567e-04, forward_time=0.203, loss_att=42.107, acc=0.963, loss=42.107, backward_time=0.300, grad_norm=84.775, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.988e-04, train_time=2.739 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 12:56:32,356 (trainer:732) INFO: 48epoch:train:16399-17309batch: iter_time=2.501e-04, forward_time=0.201, loss_att=40.658, acc=0.964, loss=40.658, backward_time=0.298, grad_norm=81.997, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.986e-04, train_time=2.725 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 13:06:54,304 (trainer:732) INFO: 48epoch:train:17310-18220batch: iter_time=2.550e-04, forward_time=0.202, loss_att=41.222, acc=0.964, loss=41.222, backward_time=0.300, grad_norm=78.431, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.984e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 13:11:58,707 (trainer:338) INFO: 48epoch results: [train] iter_time=3.284e-04, forward_time=0.202, loss_att=41.194, acc=0.964, loss=41.194, backward_time=0.299, grad_norm=84.851, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=5.001e-04, train_time=2.756, time=3 hours, 29 minutes and 29.92 seconds, total_count=1288653, gpu_max_cached_mem_GB=30.096, [valid] loss_att=9.784, acc=0.983, cer=0.021, wer=0.083, loss=9.784, time=2 minutes and 41.61 seconds, total_count=1988, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 8.75 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 13:12:02,620 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 13:12:02,637 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/32epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 13:12:02,637 (trainer:272) INFO: 49/60epoch started. Estimated time to finish: 1 day, 19 hours and 3 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 13:24:22,539 (trainer:732) INFO: 49epoch:train:1-911batch: iter_time=0.001, forward_time=0.202, loss_att=40.743, acc=0.964, loss=40.743, backward_time=0.299, grad_norm=84.090, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.982e-04, train_time=3.250 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 13:34:44,445 (trainer:732) INFO: 49epoch:train:912-1822batch: iter_time=2.567e-04, forward_time=0.202, loss_att=40.704, acc=0.964, loss=40.704, backward_time=0.299, grad_norm=93.354, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.981e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 13:45:06,303 (trainer:732) INFO: 49epoch:train:1823-2733batch: iter_time=2.595e-04, forward_time=0.202, loss_att=40.245, acc=0.964, loss=40.245, backward_time=0.299, grad_norm=79.702, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.979e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 13:55:29,747 (trainer:732) INFO: 49epoch:train:2734-3644batch: iter_time=2.572e-04, forward_time=0.202, loss_att=41.035, acc=0.965, loss=41.035, backward_time=0.300, grad_norm=85.054, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.977e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 14:05:52,748 (trainer:732) INFO: 49epoch:train:3645-4555batch: iter_time=2.592e-04, forward_time=0.202, loss_att=40.878, acc=0.964, loss=40.878, backward_time=0.300, grad_norm=83.346, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.975e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 14:16:14,914 (trainer:732) INFO: 49epoch:train:4556-5466batch: iter_time=2.506e-04, forward_time=0.202, loss_att=40.238, acc=0.964, loss=40.238, backward_time=0.299, grad_norm=86.824, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.974e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 14:26:36,374 (trainer:732) INFO: 49epoch:train:5467-6377batch: iter_time=2.507e-04, forward_time=0.201, loss_att=40.618, acc=0.964, loss=40.618, backward_time=0.298, grad_norm=80.899, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.972e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 14:36:56,674 (trainer:732) INFO: 49epoch:train:6378-7288batch: iter_time=2.535e-04, forward_time=0.201, loss_att=40.635, acc=0.964, loss=40.635, backward_time=0.298, grad_norm=82.489, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.970e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 14:47:17,821 (trainer:732) INFO: 49epoch:train:7289-8199batch: iter_time=2.555e-04, forward_time=0.202, loss_att=40.807, acc=0.964, loss=40.807, backward_time=0.299, grad_norm=81.897, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.968e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 14:57:40,489 (trainer:732) INFO: 49epoch:train:8200-9110batch: iter_time=2.556e-04, forward_time=0.203, loss_att=41.262, acc=0.964, loss=41.262, backward_time=0.300, grad_norm=85.403, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.967e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 15:08:05,303 (trainer:732) INFO: 49epoch:train:9111-10021batch: iter_time=2.599e-04, forward_time=0.203, loss_att=41.092, acc=0.964, loss=41.092, backward_time=0.301, grad_norm=89.113, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.965e-04, train_time=2.743 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 15:18:28,526 (trainer:732) INFO: 49epoch:train:10022-10932batch: iter_time=2.542e-04, forward_time=0.202, loss_att=41.561, acc=0.964, loss=41.561, backward_time=0.300, grad_norm=87.441, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.963e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 15:28:49,050 (trainer:732) INFO: 49epoch:train:10933-11843batch: iter_time=2.474e-04, forward_time=0.202, loss_att=40.867, acc=0.964, loss=40.867, backward_time=0.299, grad_norm=80.061, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.961e-04, train_time=2.725 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 15:39:10,347 (trainer:732) INFO: 49epoch:train:11844-12754batch: iter_time=2.562e-04, forward_time=0.202, loss_att=41.240, acc=0.964, loss=41.240, backward_time=0.299, grad_norm=84.723, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.960e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 15:49:32,938 (trainer:732) INFO: 49epoch:train:12755-13665batch: iter_time=2.531e-04, forward_time=0.202, loss_att=42.298, acc=0.963, loss=42.298, backward_time=0.300, grad_norm=82.960, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.958e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 15:59:53,196 (trainer:732) INFO: 49epoch:train:13666-14576batch: iter_time=2.612e-04, forward_time=0.201, loss_att=41.656, acc=0.964, loss=41.656, backward_time=0.298, grad_norm=85.876, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.956e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 16:11:16,452 (trainer:732) INFO: 49epoch:train:14577-15487batch: iter_time=2.546e-04, forward_time=0.202, loss_att=40.999, acc=0.964, loss=40.999, backward_time=0.305, grad_norm=81.113, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.954e-04, train_time=3.001 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 16:21:39,974 (trainer:732) INFO: 49epoch:train:15488-16398batch: iter_time=2.513e-04, forward_time=0.202, loss_att=41.020, acc=0.964, loss=41.020, backward_time=0.300, grad_norm=87.258, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.953e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 16:32:01,915 (trainer:732) INFO: 49epoch:train:16399-17309batch: iter_time=2.502e-04, forward_time=0.202, loss_att=41.856, acc=0.963, loss=41.856, backward_time=0.299, grad_norm=84.877, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.951e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 16:42:21,723 (trainer:732) INFO: 49epoch:train:17310-18220batch: iter_time=2.491e-04, forward_time=0.201, loss_att=40.574, acc=0.964, loss=40.574, backward_time=0.298, grad_norm=81.217, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.949e-04, train_time=2.721 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 16:47:26,224 (trainer:338) INFO: 49epoch results: [train] iter_time=3.131e-04, forward_time=0.202, loss_att=41.016, acc=0.964, loss=41.016, backward_time=0.300, grad_norm=84.379, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.966e-04, train_time=2.770, time=3 hours, 30 minutes and 33.06 seconds, total_count=1306885, gpu_max_cached_mem_GB=30.096, [valid] loss_att=9.663, acc=0.983, cer=0.022, wer=0.084, loss=9.663, time=2 minutes and 42.1 seconds, total_count=2016, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 8.42 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 16:47:30,496 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 16:47:30,530 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/47epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 16:47:30,530 (trainer:272) INFO: 50/60epoch started. Estimated time to finish: 1 day, 15 hours and 27 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 16:59:52,301 (trainer:732) INFO: 50epoch:train:1-911batch: iter_time=0.002, forward_time=0.203, loss_att=40.885, acc=0.965, loss=40.885, backward_time=0.300, grad_norm=80.069, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.948e-04, train_time=3.259 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 17:10:13,315 (trainer:732) INFO: 50epoch:train:912-1822batch: iter_time=2.630e-04, forward_time=0.202, loss_att=40.379, acc=0.964, loss=40.379, backward_time=0.299, grad_norm=81.533, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.946e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 17:20:34,738 (trainer:732) INFO: 50epoch:train:1823-2733batch: iter_time=2.562e-04, forward_time=0.202, loss_att=40.598, acc=0.964, loss=40.598, backward_time=0.299, grad_norm=79.101, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.944e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 17:30:55,698 (trainer:732) INFO: 50epoch:train:2734-3644batch: iter_time=2.655e-04, forward_time=0.202, loss_att=39.779, acc=0.964, loss=39.779, backward_time=0.299, grad_norm=79.333, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.942e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 17:41:57,193 (trainer:732) INFO: 50epoch:train:3645-4555batch: iter_time=2.588e-04, forward_time=0.202, loss_att=41.387, acc=0.964, loss=41.387, backward_time=0.303, grad_norm=85.512, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.941e-04, train_time=2.904 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 17:52:36,344 (trainer:732) INFO: 50epoch:train:4556-5466batch: iter_time=2.625e-04, forward_time=0.202, loss_att=40.762, acc=0.964, loss=40.762, backward_time=0.300, grad_norm=81.751, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.939e-04, train_time=2.807 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 18:03:00,941 (trainer:732) INFO: 50epoch:train:5467-6377batch: iter_time=2.645e-04, forward_time=0.203, loss_att=40.794, acc=0.965, loss=40.794, backward_time=0.301, grad_norm=86.212, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.937e-04, train_time=2.742 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 18:13:24,252 (trainer:732) INFO: 50epoch:train:6378-7288batch: iter_time=2.616e-04, forward_time=0.202, loss_att=40.685, acc=0.964, loss=40.685, backward_time=0.300, grad_norm=79.656, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.936e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 18:23:47,258 (trainer:732) INFO: 50epoch:train:7289-8199batch: iter_time=2.576e-04, forward_time=0.202, loss_att=40.459, acc=0.965, loss=40.459, backward_time=0.300, grad_norm=82.052, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.934e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 18:34:06,549 (trainer:732) INFO: 50epoch:train:8200-9110batch: iter_time=2.613e-04, forward_time=0.201, loss_att=40.694, acc=0.964, loss=40.694, backward_time=0.298, grad_norm=84.057, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.932e-04, train_time=2.718 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 18:44:29,830 (trainer:732) INFO: 50epoch:train:9111-10021batch: iter_time=2.541e-04, forward_time=0.202, loss_att=41.215, acc=0.964, loss=41.215, backward_time=0.300, grad_norm=82.244, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.930e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 18:54:51,928 (trainer:732) INFO: 50epoch:train:10022-10932batch: iter_time=2.547e-04, forward_time=0.202, loss_att=40.669, acc=0.964, loss=40.669, backward_time=0.299, grad_norm=84.069, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.929e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 19:05:14,499 (trainer:732) INFO: 50epoch:train:10933-11843batch: iter_time=2.564e-04, forward_time=0.202, loss_att=40.826, acc=0.964, loss=40.826, backward_time=0.300, grad_norm=77.393, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.927e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 19:15:37,324 (trainer:732) INFO: 50epoch:train:11844-12754batch: iter_time=2.591e-04, forward_time=0.202, loss_att=41.524, acc=0.964, loss=41.524, backward_time=0.299, grad_norm=81.594, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.925e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 19:25:58,347 (trainer:732) INFO: 50epoch:train:12755-13665batch: iter_time=2.603e-04, forward_time=0.201, loss_att=40.128, acc=0.964, loss=40.128, backward_time=0.299, grad_norm=83.759, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.924e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 19:36:21,686 (trainer:732) INFO: 50epoch:train:13666-14576batch: iter_time=2.547e-04, forward_time=0.202, loss_att=41.565, acc=0.964, loss=41.565, backward_time=0.300, grad_norm=87.719, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.922e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 19:46:42,354 (trainer:732) INFO: 50epoch:train:14577-15487batch: iter_time=2.543e-04, forward_time=0.201, loss_att=40.491, acc=0.964, loss=40.491, backward_time=0.298, grad_norm=86.037, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.920e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 19:57:05,653 (trainer:732) INFO: 50epoch:train:15488-16398batch: iter_time=2.593e-04, forward_time=0.202, loss_att=41.564, acc=0.964, loss=41.564, backward_time=0.300, grad_norm=79.357, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.919e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 20:07:27,036 (trainer:732) INFO: 50epoch:train:16399-17309batch: iter_time=2.511e-04, forward_time=0.202, loss_att=41.038, acc=0.964, loss=41.038, backward_time=0.299, grad_norm=80.245, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.917e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 20:17:50,908 (trainer:732) INFO: 50epoch:train:17310-18220batch: iter_time=2.495e-04, forward_time=0.203, loss_att=41.225, acc=0.964, loss=41.225, backward_time=0.300, grad_norm=85.152, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.915e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 20:22:55,221 (trainer:338) INFO: 50epoch results: [train] iter_time=3.232e-04, forward_time=0.202, loss_att=40.836, acc=0.964, loss=40.836, backward_time=0.300, grad_norm=82.352, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.931e-04, train_time=2.770, time=3 hours, 30 minutes and 34.62 seconds, total_count=1325117, gpu_max_cached_mem_GB=30.096, [valid] loss_att=10.729, acc=0.981, cer=0.021, wer=0.083, loss=10.729, time=2 minutes and 42.09 seconds, total_count=2044, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 7.97 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 20:22:59,152 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 20:22:59,184 (trainer:272) INFO: 51/60epoch started. Estimated time to finish: 1 day, 11 hours and 52 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 20:35:21,887 (trainer:732) INFO: 51epoch:train:1-911batch: iter_time=0.001, forward_time=0.202, loss_att=39.473, acc=0.965, loss=39.473, backward_time=0.300, grad_norm=80.600, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.913e-04, train_time=3.263 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 20:45:42,446 (trainer:732) INFO: 51epoch:train:912-1822batch: iter_time=2.822e-04, forward_time=0.202, loss_att=39.272, acc=0.965, loss=39.272, backward_time=0.299, grad_norm=85.455, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.912e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 20:56:06,145 (trainer:732) INFO: 51epoch:train:1823-2733batch: iter_time=2.772e-04, forward_time=0.202, loss_att=40.752, acc=0.965, loss=40.752, backward_time=0.300, grad_norm=83.260, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.910e-04, train_time=2.739 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 21:06:28,068 (trainer:732) INFO: 51epoch:train:2734-3644batch: iter_time=2.828e-04, forward_time=0.202, loss_att=39.794, acc=0.965, loss=39.794, backward_time=0.300, grad_norm=79.974, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.908e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 21:16:49,378 (trainer:732) INFO: 51epoch:train:3645-4555batch: iter_time=2.710e-04, forward_time=0.202, loss_att=40.653, acc=0.964, loss=40.653, backward_time=0.299, grad_norm=82.699, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.907e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 21:27:10,817 (trainer:732) INFO: 51epoch:train:4556-5466batch: iter_time=2.835e-04, forward_time=0.202, loss_att=40.747, acc=0.964, loss=40.747, backward_time=0.299, grad_norm=81.523, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.905e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 21:37:50,682 (trainer:732) INFO: 51epoch:train:5467-6377batch: iter_time=2.797e-04, forward_time=0.201, loss_att=40.363, acc=0.964, loss=40.363, backward_time=0.298, grad_norm=84.060, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.903e-04, train_time=2.809 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 21:48:12,477 (trainer:732) INFO: 51epoch:train:6378-7288batch: iter_time=2.788e-04, forward_time=0.202, loss_att=41.223, acc=0.964, loss=41.223, backward_time=0.299, grad_norm=82.732, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.902e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 21:58:33,739 (trainer:732) INFO: 51epoch:train:7289-8199batch: iter_time=2.776e-04, forward_time=0.202, loss_att=41.286, acc=0.964, loss=41.286, backward_time=0.299, grad_norm=81.562, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.900e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 22:08:56,680 (trainer:732) INFO: 51epoch:train:8200-9110batch: iter_time=2.774e-04, forward_time=0.203, loss_att=40.105, acc=0.965, loss=40.105, backward_time=0.300, grad_norm=87.863, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.898e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 22:19:19,741 (trainer:732) INFO: 51epoch:train:9111-10021batch: iter_time=2.705e-04, forward_time=0.202, loss_att=40.150, acc=0.965, loss=40.150, backward_time=0.299, grad_norm=76.160, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.897e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 22:29:42,433 (trainer:732) INFO: 51epoch:train:10022-10932batch: iter_time=2.759e-04, forward_time=0.202, loss_att=40.740, acc=0.964, loss=40.740, backward_time=0.300, grad_norm=83.796, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.895e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 22:40:05,295 (trainer:732) INFO: 51epoch:train:10933-11843batch: iter_time=2.747e-04, forward_time=0.202, loss_att=40.695, acc=0.964, loss=40.695, backward_time=0.299, grad_norm=83.766, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.893e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 22:50:29,695 (trainer:732) INFO: 51epoch:train:11844-12754batch: iter_time=2.728e-04, forward_time=0.203, loss_att=41.876, acc=0.964, loss=41.876, backward_time=0.300, grad_norm=86.455, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.892e-04, train_time=2.742 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 23:00:50,554 (trainer:732) INFO: 51epoch:train:12755-13665batch: iter_time=2.688e-04, forward_time=0.202, loss_att=40.640, acc=0.964, loss=40.640, backward_time=0.299, grad_norm=81.961, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.890e-04, train_time=2.725 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 23:11:10,666 (trainer:732) INFO: 51epoch:train:13666-14576batch: iter_time=2.766e-04, forward_time=0.201, loss_att=40.988, acc=0.964, loss=40.988, backward_time=0.298, grad_norm=83.466, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.888e-04, train_time=2.722 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 23:21:34,970 (trainer:732) INFO: 51epoch:train:14577-15487batch: iter_time=2.844e-04, forward_time=0.203, loss_att=41.038, acc=0.964, loss=41.038, backward_time=0.300, grad_norm=83.329, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.887e-04, train_time=2.741 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 23:31:57,908 (trainer:732) INFO: 51epoch:train:15488-16398batch: iter_time=2.794e-04, forward_time=0.202, loss_att=41.458, acc=0.964, loss=41.458, backward_time=0.299, grad_norm=86.018, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.885e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 23:42:25,413 (trainer:732) INFO: 51epoch:train:16399-17309batch: iter_time=2.768e-04, forward_time=0.203, loss_att=41.012, acc=0.965, loss=41.012, backward_time=0.302, grad_norm=84.894, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.883e-04, train_time=2.755 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 23:52:49,507 (trainer:732) INFO: 51epoch:train:17310-18220batch: iter_time=2.728e-04, forward_time=0.202, loss_att=41.529, acc=0.964, loss=41.529, backward_time=0.300, grad_norm=89.430, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.882e-04, train_time=2.739 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 23:58:17,290 (trainer:338) INFO: 51epoch results: [train] iter_time=3.213e-04, forward_time=0.202, loss_att=40.679, acc=0.964, loss=40.679, backward_time=0.299, grad_norm=83.440, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.897e-04, train_time=2.764, time=3 hours, 30 minutes and 4.98 seconds, total_count=1343349, gpu_max_cached_mem_GB=30.096, [valid] loss_att=9.380, acc=0.983, cer=0.021, wer=0.083, loss=9.380, time=3 minutes and 1.93 seconds, total_count=2072, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 11.19 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 23:58:21,650 (trainer:386) INFO: The best model has been updated: valid.acc +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 23:58:21,684 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/41epoch.pth, exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/50epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-02 23:58:21,684 (trainer:272) INFO: 52/60epoch started. Estimated time to finish: 1 day, 8 hours and 17 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 00:10:41,892 (trainer:732) INFO: 52epoch:train:1-911batch: iter_time=0.001, forward_time=0.201, loss_att=39.739, acc=0.964, loss=39.739, backward_time=0.297, grad_norm=81.626, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.880e-04, train_time=3.252 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 00:21:05,434 (trainer:732) INFO: 52epoch:train:912-1822batch: iter_time=2.812e-04, forward_time=0.202, loss_att=40.249, acc=0.965, loss=40.249, backward_time=0.300, grad_norm=85.128, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.878e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 00:31:26,362 (trainer:732) INFO: 52epoch:train:1823-2733batch: iter_time=2.836e-04, forward_time=0.202, loss_att=40.773, acc=0.964, loss=40.773, backward_time=0.298, grad_norm=80.047, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.877e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 00:41:50,589 (trainer:732) INFO: 52epoch:train:2734-3644batch: iter_time=2.816e-04, forward_time=0.203, loss_att=40.666, acc=0.965, loss=40.666, backward_time=0.300, grad_norm=78.309, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.875e-04, train_time=2.740 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 00:52:11,887 (trainer:732) INFO: 52epoch:train:3645-4555batch: iter_time=2.749e-04, forward_time=0.202, loss_att=40.334, acc=0.964, loss=40.334, backward_time=0.299, grad_norm=84.171, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.873e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 01:02:32,515 (trainer:732) INFO: 52epoch:train:4556-5466batch: iter_time=2.769e-04, forward_time=0.202, loss_att=40.240, acc=0.964, loss=40.240, backward_time=0.298, grad_norm=85.208, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.872e-04, train_time=2.725 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 01:12:54,753 (trainer:732) INFO: 52epoch:train:5467-6377batch: iter_time=2.773e-04, forward_time=0.202, loss_att=40.227, acc=0.964, loss=40.227, backward_time=0.299, grad_norm=81.120, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.870e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 01:23:19,045 (trainer:732) INFO: 52epoch:train:6378-7288batch: iter_time=2.729e-04, forward_time=0.202, loss_att=40.263, acc=0.965, loss=40.263, backward_time=0.299, grad_norm=93.702, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.868e-04, train_time=2.740 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 01:33:41,490 (trainer:732) INFO: 52epoch:train:7289-8199batch: iter_time=2.768e-04, forward_time=0.202, loss_att=39.952, acc=0.964, loss=39.952, backward_time=0.299, grad_norm=82.809, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.867e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 01:44:04,241 (trainer:732) INFO: 52epoch:train:8200-9110batch: iter_time=2.708e-04, forward_time=0.202, loss_att=40.807, acc=0.964, loss=40.807, backward_time=0.300, grad_norm=79.790, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.865e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 01:54:28,470 (trainer:732) INFO: 52epoch:train:9111-10021batch: iter_time=2.663e-04, forward_time=0.203, loss_att=41.304, acc=0.964, loss=41.304, backward_time=0.300, grad_norm=84.242, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.864e-04, train_time=2.740 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 02:04:53,021 (trainer:732) INFO: 52epoch:train:10022-10932batch: iter_time=2.813e-04, forward_time=0.203, loss_att=41.606, acc=0.964, loss=41.606, backward_time=0.300, grad_norm=85.497, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.862e-04, train_time=2.741 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 02:15:17,488 (trainer:732) INFO: 52epoch:train:10933-11843batch: iter_time=2.776e-04, forward_time=0.203, loss_att=40.193, acc=0.965, loss=40.193, backward_time=0.300, grad_norm=81.447, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.860e-04, train_time=2.742 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 02:25:42,628 (trainer:732) INFO: 52epoch:train:11844-12754batch: iter_time=2.767e-04, forward_time=0.203, loss_att=40.563, acc=0.964, loss=40.563, backward_time=0.300, grad_norm=90.810, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.859e-04, train_time=2.745 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 02:36:05,760 (trainer:732) INFO: 52epoch:train:12755-13665batch: iter_time=2.689e-04, forward_time=0.202, loss_att=40.544, acc=0.964, loss=40.544, backward_time=0.299, grad_norm=80.123, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.857e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 02:46:28,497 (trainer:732) INFO: 52epoch:train:13666-14576batch: iter_time=2.714e-04, forward_time=0.202, loss_att=39.775, acc=0.965, loss=39.775, backward_time=0.300, grad_norm=81.520, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.855e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 02:56:51,295 (trainer:732) INFO: 52epoch:train:14577-15487batch: iter_time=2.721e-04, forward_time=0.202, loss_att=41.061, acc=0.964, loss=41.061, backward_time=0.299, grad_norm=80.613, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.854e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 03:07:14,074 (trainer:732) INFO: 52epoch:train:15488-16398batch: iter_time=2.692e-04, forward_time=0.202, loss_att=41.281, acc=0.964, loss=41.281, backward_time=0.300, grad_norm=83.478, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.852e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 03:17:34,662 (trainer:732) INFO: 52epoch:train:16399-17309batch: iter_time=2.732e-04, forward_time=0.201, loss_att=40.643, acc=0.964, loss=40.643, backward_time=0.298, grad_norm=89.036, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.850e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 03:27:59,908 (trainer:732) INFO: 52epoch:train:17310-18220batch: iter_time=2.752e-04, forward_time=0.203, loss_att=40.501, acc=0.965, loss=40.501, backward_time=0.301, grad_norm=84.304, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.849e-04, train_time=2.744 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 03:33:24,260 (trainer:338) INFO: 52epoch results: [train] iter_time=3.279e-04, forward_time=0.202, loss_att=40.533, acc=0.964, loss=40.533, backward_time=0.299, grad_norm=83.661, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.864e-04, train_time=2.761, time=3 hours, 29 minutes and 52.82 seconds, total_count=1361581, gpu_max_cached_mem_GB=30.096, [valid] loss_att=9.803, acc=0.983, cer=0.022, wer=0.084, loss=9.803, time=2 minutes and 59.77 seconds, total_count=2100, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 9.99 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 03:33:28,812 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 03:33:28,847 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/45epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 03:33:28,847 (trainer:272) INFO: 53/60epoch started. Estimated time to finish: 1 day, 4 hours and 42 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 03:45:50,287 (trainer:732) INFO: 53epoch:train:1-911batch: iter_time=0.001, forward_time=0.202, loss_att=40.421, acc=0.965, loss=40.421, backward_time=0.299, grad_norm=92.313, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.847e-04, train_time=3.258 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 03:56:13,016 (trainer:732) INFO: 53epoch:train:912-1822batch: iter_time=2.861e-04, forward_time=0.203, loss_att=40.582, acc=0.964, loss=40.582, backward_time=0.300, grad_norm=88.629, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.846e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 04:06:35,193 (trainer:732) INFO: 53epoch:train:1823-2733batch: iter_time=2.690e-04, forward_time=0.202, loss_att=39.947, acc=0.965, loss=39.947, backward_time=0.299, grad_norm=86.171, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.844e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 04:16:56,161 (trainer:732) INFO: 53epoch:train:2734-3644batch: iter_time=2.660e-04, forward_time=0.201, loss_att=40.700, acc=0.964, loss=40.700, backward_time=0.298, grad_norm=79.476, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.842e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 04:27:20,287 (trainer:732) INFO: 53epoch:train:3645-4555batch: iter_time=2.816e-04, forward_time=0.203, loss_att=41.325, acc=0.964, loss=41.325, backward_time=0.300, grad_norm=99.119, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.841e-04, train_time=2.740 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 04:37:42,214 (trainer:732) INFO: 53epoch:train:4556-5466batch: iter_time=2.795e-04, forward_time=0.202, loss_att=39.628, acc=0.965, loss=39.628, backward_time=0.299, grad_norm=85.443, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.839e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 04:48:06,323 (trainer:732) INFO: 53epoch:train:5467-6377batch: iter_time=2.709e-04, forward_time=0.202, loss_att=40.220, acc=0.965, loss=40.220, backward_time=0.300, grad_norm=84.641, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.838e-04, train_time=2.740 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 04:58:26,405 (trainer:732) INFO: 53epoch:train:6378-7288batch: iter_time=2.744e-04, forward_time=0.201, loss_att=39.219, acc=0.965, loss=39.219, backward_time=0.298, grad_norm=86.062, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.836e-04, train_time=2.721 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 05:08:50,760 (trainer:732) INFO: 53epoch:train:7289-8199batch: iter_time=2.768e-04, forward_time=0.203, loss_att=40.373, acc=0.965, loss=40.373, backward_time=0.300, grad_norm=84.110, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.834e-04, train_time=2.741 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 05:19:12,590 (trainer:732) INFO: 53epoch:train:8200-9110batch: iter_time=2.709e-04, forward_time=0.202, loss_att=40.274, acc=0.965, loss=40.274, backward_time=0.299, grad_norm=88.311, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.833e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 05:29:35,140 (trainer:732) INFO: 53epoch:train:9111-10021batch: iter_time=2.711e-04, forward_time=0.202, loss_att=40.024, acc=0.965, loss=40.024, backward_time=0.299, grad_norm=84.983, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.831e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 05:39:58,598 (trainer:732) INFO: 53epoch:train:10022-10932batch: iter_time=2.691e-04, forward_time=0.202, loss_att=40.826, acc=0.964, loss=40.826, backward_time=0.300, grad_norm=83.335, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.829e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 05:50:20,416 (trainer:732) INFO: 53epoch:train:10933-11843batch: iter_time=2.715e-04, forward_time=0.202, loss_att=40.416, acc=0.965, loss=40.416, backward_time=0.299, grad_norm=83.491, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.828e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 06:00:43,242 (trainer:732) INFO: 53epoch:train:11844-12754batch: iter_time=2.768e-04, forward_time=0.202, loss_att=41.015, acc=0.964, loss=41.015, backward_time=0.299, grad_norm=90.680, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.826e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 06:11:06,123 (trainer:732) INFO: 53epoch:train:12755-13665batch: iter_time=2.607e-04, forward_time=0.202, loss_att=40.265, acc=0.965, loss=40.265, backward_time=0.300, grad_norm=82.913, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.825e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 06:21:26,468 (trainer:732) INFO: 53epoch:train:13666-14576batch: iter_time=2.669e-04, forward_time=0.201, loss_att=40.297, acc=0.964, loss=40.297, backward_time=0.299, grad_norm=80.197, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.823e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 06:31:48,090 (trainer:732) INFO: 53epoch:train:14577-15487batch: iter_time=2.724e-04, forward_time=0.202, loss_att=40.898, acc=0.964, loss=40.898, backward_time=0.299, grad_norm=85.192, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.821e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 06:42:10,510 (trainer:732) INFO: 53epoch:train:15488-16398batch: iter_time=2.714e-04, forward_time=0.202, loss_att=41.174, acc=0.964, loss=41.174, backward_time=0.300, grad_norm=84.495, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.820e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 06:52:34,656 (trainer:732) INFO: 53epoch:train:16399-17309batch: iter_time=2.649e-04, forward_time=0.202, loss_att=40.260, acc=0.965, loss=40.260, backward_time=0.300, grad_norm=79.372, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.818e-04, train_time=2.740 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 07:02:55,995 (trainer:732) INFO: 53epoch:train:17310-18220batch: iter_time=2.689e-04, forward_time=0.202, loss_att=40.654, acc=0.964, loss=40.654, backward_time=0.299, grad_norm=81.106, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.817e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 07:08:05,630 (trainer:338) INFO: 53epoch results: [train] iter_time=3.292e-04, forward_time=0.202, loss_att=40.431, acc=0.965, loss=40.431, backward_time=0.299, grad_norm=85.505, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.832e-04, train_time=2.759, time=3 hours, 29 minutes and 41.24 seconds, total_count=1379813, gpu_max_cached_mem_GB=30.096, [valid] loss_att=10.863, acc=0.981, cer=0.021, wer=0.084, loss=10.863, time=2 minutes and 45.25 seconds, total_count=2128, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 10.29 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 07:08:09,895 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 07:08:09,911 (trainer:272) INFO: 54/60epoch started. Estimated time to finish: 1 day, 1 hour and 6 minutes +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 07:20:26,698 (trainer:732) INFO: 54epoch:train:1-911batch: iter_time=0.001, forward_time=0.201, loss_att=39.117, acc=0.965, loss=39.117, backward_time=0.297, grad_norm=82.084, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.815e-04, train_time=3.237 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 07:30:51,063 (trainer:732) INFO: 54epoch:train:912-1822batch: iter_time=2.643e-04, forward_time=0.203, loss_att=40.796, acc=0.965, loss=40.796, backward_time=0.301, grad_norm=84.782, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.813e-04, train_time=2.740 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 07:41:12,577 (trainer:732) INFO: 54epoch:train:1823-2733batch: iter_time=2.597e-04, forward_time=0.202, loss_att=40.860, acc=0.964, loss=40.860, backward_time=0.300, grad_norm=83.653, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.812e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 07:51:33,622 (trainer:732) INFO: 54epoch:train:2734-3644batch: iter_time=2.648e-04, forward_time=0.202, loss_att=40.384, acc=0.965, loss=40.384, backward_time=0.299, grad_norm=91.216, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.810e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 08:01:56,034 (trainer:732) INFO: 54epoch:train:3645-4555batch: iter_time=2.625e-04, forward_time=0.202, loss_att=40.187, acc=0.965, loss=40.187, backward_time=0.300, grad_norm=82.322, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.809e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 08:12:17,855 (trainer:732) INFO: 54epoch:train:4556-5466batch: iter_time=2.567e-04, forward_time=0.202, loss_att=41.111, acc=0.964, loss=41.111, backward_time=0.300, grad_norm=82.993, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.807e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 08:22:40,982 (trainer:732) INFO: 54epoch:train:5467-6377batch: iter_time=2.582e-04, forward_time=0.202, loss_att=40.012, acc=0.965, loss=40.012, backward_time=0.299, grad_norm=83.758, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.806e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 08:33:05,469 (trainer:732) INFO: 54epoch:train:6378-7288batch: iter_time=2.583e-04, forward_time=0.203, loss_att=40.553, acc=0.965, loss=40.553, backward_time=0.301, grad_norm=87.361, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.804e-04, train_time=2.741 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 08:43:28,280 (trainer:732) INFO: 54epoch:train:7289-8199batch: iter_time=2.583e-04, forward_time=0.202, loss_att=40.469, acc=0.965, loss=40.469, backward_time=0.300, grad_norm=85.085, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.802e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 08:53:48,370 (trainer:732) INFO: 54epoch:train:8200-9110batch: iter_time=2.644e-04, forward_time=0.201, loss_att=39.890, acc=0.965, loss=39.890, backward_time=0.298, grad_norm=86.887, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.801e-04, train_time=2.722 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 09:04:09,932 (trainer:732) INFO: 54epoch:train:9111-10021batch: iter_time=2.544e-04, forward_time=0.202, loss_att=39.957, acc=0.965, loss=39.957, backward_time=0.299, grad_norm=85.856, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.799e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 09:14:31,433 (trainer:732) INFO: 54epoch:train:10022-10932batch: iter_time=2.546e-04, forward_time=0.202, loss_att=40.795, acc=0.964, loss=40.795, backward_time=0.299, grad_norm=86.083, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.798e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 09:24:55,864 (trainer:732) INFO: 54epoch:train:10933-11843batch: iter_time=2.558e-04, forward_time=0.203, loss_att=39.838, acc=0.966, loss=39.838, backward_time=0.301, grad_norm=90.162, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.796e-04, train_time=2.742 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 09:35:19,330 (trainer:732) INFO: 54epoch:train:11844-12754batch: iter_time=2.573e-04, forward_time=0.203, loss_att=41.108, acc=0.964, loss=41.108, backward_time=0.300, grad_norm=81.229, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.795e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 09:45:40,350 (trainer:732) INFO: 54epoch:train:12755-13665batch: iter_time=2.638e-04, forward_time=0.202, loss_att=40.286, acc=0.964, loss=40.286, backward_time=0.299, grad_norm=83.354, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.793e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 09:56:01,313 (trainer:732) INFO: 54epoch:train:13666-14576batch: iter_time=2.602e-04, forward_time=0.201, loss_att=40.185, acc=0.965, loss=40.185, backward_time=0.299, grad_norm=84.072, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.791e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 10:06:22,701 (trainer:732) INFO: 54epoch:train:14577-15487batch: iter_time=2.513e-04, forward_time=0.202, loss_att=41.009, acc=0.964, loss=41.009, backward_time=0.299, grad_norm=80.529, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.790e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 10:16:42,351 (trainer:732) INFO: 54epoch:train:15488-16398batch: iter_time=2.602e-04, forward_time=0.202, loss_att=39.728, acc=0.965, loss=39.728, backward_time=0.298, grad_norm=79.918, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.788e-04, train_time=2.721 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 10:27:03,037 (trainer:732) INFO: 54epoch:train:16399-17309batch: iter_time=2.549e-04, forward_time=0.202, loss_att=39.960, acc=0.965, loss=39.960, backward_time=0.299, grad_norm=91.909, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.787e-04, train_time=2.725 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 10:37:26,863 (trainer:732) INFO: 54epoch:train:17310-18220batch: iter_time=2.618e-04, forward_time=0.202, loss_att=39.668, acc=0.965, loss=39.668, backward_time=0.300, grad_norm=83.486, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.785e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 10:42:31,615 (trainer:338) INFO: 54epoch results: [train] iter_time=3.118e-04, forward_time=0.202, loss_att=40.292, acc=0.965, loss=40.292, backward_time=0.299, grad_norm=84.840, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.800e-04, train_time=2.756, time=3 hours, 29 minutes and 30.89 seconds, total_count=1398045, gpu_max_cached_mem_GB=30.096, [valid] loss_att=9.727, acc=0.983, cer=0.021, wer=0.081, loss=9.727, time=2 minutes and 42.45 seconds, total_count=2156, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 8.37 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 10:42:35,684 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 10:42:35,734 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/46epoch.pth, exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/53epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 10:42:35,735 (trainer:272) INFO: 55/60epoch started. Estimated time to finish: 21 hours, 31 minutes and 19.62 seconds +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 10:54:57,435 (trainer:732) INFO: 55epoch:train:1-911batch: iter_time=0.002, forward_time=0.202, loss_att=39.542, acc=0.965, loss=39.542, backward_time=0.299, grad_norm=87.666, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.784e-04, train_time=3.258 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 11:05:20,930 (trainer:732) INFO: 55epoch:train:912-1822batch: iter_time=2.669e-04, forward_time=0.202, loss_att=40.519, acc=0.965, loss=40.519, backward_time=0.300, grad_norm=88.805, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.782e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 11:15:42,452 (trainer:732) INFO: 55epoch:train:1823-2733batch: iter_time=2.690e-04, forward_time=0.202, loss_att=40.082, acc=0.965, loss=40.082, backward_time=0.298, grad_norm=84.740, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.780e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 11:26:04,200 (trainer:732) INFO: 55epoch:train:2734-3644batch: iter_time=2.623e-04, forward_time=0.202, loss_att=40.306, acc=0.965, loss=40.306, backward_time=0.299, grad_norm=86.015, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.779e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 11:36:27,488 (trainer:732) INFO: 55epoch:train:3645-4555batch: iter_time=2.673e-04, forward_time=0.203, loss_att=39.477, acc=0.965, loss=39.477, backward_time=0.300, grad_norm=91.648, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.777e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 11:46:50,316 (trainer:732) INFO: 55epoch:train:4556-5466batch: iter_time=2.601e-04, forward_time=0.202, loss_att=40.159, acc=0.965, loss=40.159, backward_time=0.300, grad_norm=82.410, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.776e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 11:57:15,048 (trainer:732) INFO: 55epoch:train:5467-6377batch: iter_time=2.640e-04, forward_time=0.203, loss_att=40.925, acc=0.965, loss=40.925, backward_time=0.301, grad_norm=83.172, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.774e-04, train_time=2.743 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 12:07:39,128 (trainer:732) INFO: 55epoch:train:6378-7288batch: iter_time=2.649e-04, forward_time=0.203, loss_att=39.841, acc=0.966, loss=39.841, backward_time=0.301, grad_norm=85.012, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.773e-04, train_time=2.739 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 12:17:59,480 (trainer:732) INFO: 55epoch:train:7289-8199batch: iter_time=2.736e-04, forward_time=0.201, loss_att=40.559, acc=0.964, loss=40.559, backward_time=0.298, grad_norm=85.676, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.771e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 12:28:19,317 (trainer:732) INFO: 55epoch:train:8200-9110batch: iter_time=2.674e-04, forward_time=0.201, loss_att=39.966, acc=0.965, loss=39.966, backward_time=0.298, grad_norm=94.573, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.770e-04, train_time=2.721 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 12:38:41,977 (trainer:732) INFO: 55epoch:train:9111-10021batch: iter_time=2.607e-04, forward_time=0.203, loss_att=40.852, acc=0.965, loss=40.852, backward_time=0.300, grad_norm=107.024, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.768e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 12:49:03,895 (trainer:732) INFO: 55epoch:train:10022-10932batch: iter_time=2.685e-04, forward_time=0.202, loss_att=40.101, acc=0.964, loss=40.101, backward_time=0.299, grad_norm=82.058, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.767e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 12:59:22,881 (trainer:732) INFO: 55epoch:train:10933-11843batch: iter_time=2.692e-04, forward_time=0.201, loss_att=40.093, acc=0.964, loss=40.093, backward_time=0.298, grad_norm=86.710, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.765e-04, train_time=2.718 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 13:09:47,507 (trainer:732) INFO: 55epoch:train:11844-12754batch: iter_time=2.654e-04, forward_time=0.203, loss_att=39.484, acc=0.965, loss=39.484, backward_time=0.301, grad_norm=83.071, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.763e-04, train_time=2.742 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 13:20:10,648 (trainer:732) INFO: 55epoch:train:12755-13665batch: iter_time=2.622e-04, forward_time=0.202, loss_att=39.671, acc=0.965, loss=39.671, backward_time=0.300, grad_norm=85.966, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.762e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 13:30:32,262 (trainer:732) INFO: 55epoch:train:13666-14576batch: iter_time=2.619e-04, forward_time=0.202, loss_att=40.770, acc=0.964, loss=40.770, backward_time=0.299, grad_norm=82.733, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.760e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 13:40:55,016 (trainer:732) INFO: 55epoch:train:14577-15487batch: iter_time=2.619e-04, forward_time=0.203, loss_att=39.996, acc=0.965, loss=39.996, backward_time=0.300, grad_norm=86.328, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.759e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 13:51:16,979 (trainer:732) INFO: 55epoch:train:15488-16398batch: iter_time=2.687e-04, forward_time=0.202, loss_att=40.057, acc=0.965, loss=40.057, backward_time=0.299, grad_norm=89.565, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.757e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 14:01:38,700 (trainer:732) INFO: 55epoch:train:16399-17309batch: iter_time=2.693e-04, forward_time=0.202, loss_att=40.235, acc=0.965, loss=40.235, backward_time=0.300, grad_norm=90.924, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.756e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 14:11:59,462 (trainer:732) INFO: 55epoch:train:17310-18220batch: iter_time=2.580e-04, forward_time=0.201, loss_att=40.000, acc=0.965, loss=40.000, backward_time=0.298, grad_norm=84.813, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.754e-04, train_time=2.725 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 14:17:03,973 (trainer:338) INFO: 55epoch results: [train] iter_time=3.343e-04, forward_time=0.202, loss_att=40.134, acc=0.965, loss=40.134, backward_time=0.299, grad_norm=87.450, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.769e-04, train_time=2.758, time=3 hours, 29 minutes and 37.77 seconds, total_count=1416277, gpu_max_cached_mem_GB=30.096, [valid] loss_att=10.648, acc=0.981, cer=0.021, wer=0.082, loss=10.648, time=2 minutes and 42.46 seconds, total_count=2184, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 8 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 14:17:08,206 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 14:17:08,224 (trainer:272) INFO: 56/60epoch started. Estimated time to finish: 17 hours, 55 minutes and 59.98 seconds +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 14:29:31,406 (trainer:732) INFO: 56epoch:train:1-911batch: iter_time=0.001, forward_time=0.203, loss_att=38.707, acc=0.966, loss=38.707, backward_time=0.300, grad_norm=86.840, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.753e-04, train_time=3.265 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 14:39:55,457 (trainer:732) INFO: 56epoch:train:912-1822batch: iter_time=2.889e-04, forward_time=0.203, loss_att=39.575, acc=0.965, loss=39.575, backward_time=0.300, grad_norm=84.139, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.751e-04, train_time=2.739 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 14:50:16,009 (trainer:732) INFO: 56epoch:train:1823-2733batch: iter_time=2.777e-04, forward_time=0.202, loss_att=40.060, acc=0.965, loss=40.060, backward_time=0.299, grad_norm=86.394, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.750e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 15:00:38,549 (trainer:732) INFO: 56epoch:train:2734-3644batch: iter_time=2.787e-04, forward_time=0.202, loss_att=39.380, acc=0.965, loss=39.380, backward_time=0.300, grad_norm=80.226, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.748e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 15:10:57,560 (trainer:732) INFO: 56epoch:train:3645-4555batch: iter_time=2.759e-04, forward_time=0.202, loss_att=40.143, acc=0.964, loss=40.143, backward_time=0.298, grad_norm=88.072, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.747e-04, train_time=2.718 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 15:21:21,182 (trainer:732) INFO: 56epoch:train:4556-5466batch: iter_time=2.787e-04, forward_time=0.203, loss_att=39.869, acc=0.965, loss=39.869, backward_time=0.300, grad_norm=82.313, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.745e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 15:31:43,072 (trainer:732) INFO: 56epoch:train:5467-6377batch: iter_time=2.744e-04, forward_time=0.202, loss_att=40.107, acc=0.965, loss=40.107, backward_time=0.299, grad_norm=87.376, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.744e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 15:42:02,710 (trainer:732) INFO: 56epoch:train:6378-7288batch: iter_time=2.774e-04, forward_time=0.201, loss_att=40.164, acc=0.964, loss=40.164, backward_time=0.298, grad_norm=84.599, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.742e-04, train_time=2.720 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 15:52:25,635 (trainer:732) INFO: 56epoch:train:7289-8199batch: iter_time=2.792e-04, forward_time=0.202, loss_att=40.057, acc=0.965, loss=40.057, backward_time=0.300, grad_norm=91.032, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.741e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 16:02:47,871 (trainer:732) INFO: 56epoch:train:8200-9110batch: iter_time=2.744e-04, forward_time=0.202, loss_att=40.097, acc=0.965, loss=40.097, backward_time=0.299, grad_norm=78.999, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.739e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 16:13:12,185 (trainer:732) INFO: 56epoch:train:9111-10021batch: iter_time=2.660e-04, forward_time=0.203, loss_att=40.818, acc=0.965, loss=40.818, backward_time=0.301, grad_norm=87.283, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.737e-04, train_time=2.741 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 16:23:35,498 (trainer:732) INFO: 56epoch:train:10022-10932batch: iter_time=2.746e-04, forward_time=0.203, loss_att=40.265, acc=0.965, loss=40.265, backward_time=0.300, grad_norm=83.762, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.736e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 16:33:55,236 (trainer:732) INFO: 56epoch:train:10933-11843batch: iter_time=2.713e-04, forward_time=0.201, loss_att=39.430, acc=0.965, loss=39.430, backward_time=0.298, grad_norm=83.750, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.734e-04, train_time=2.722 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 16:44:15,807 (trainer:732) INFO: 56epoch:train:11844-12754batch: iter_time=2.782e-04, forward_time=0.201, loss_att=39.898, acc=0.965, loss=39.898, backward_time=0.298, grad_norm=83.159, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.733e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 16:54:38,410 (trainer:732) INFO: 56epoch:train:12755-13665batch: iter_time=2.767e-04, forward_time=0.202, loss_att=40.314, acc=0.965, loss=40.314, backward_time=0.300, grad_norm=81.736, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.731e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 17:04:59,076 (trainer:732) INFO: 56epoch:train:13666-14576batch: iter_time=2.708e-04, forward_time=0.201, loss_att=39.863, acc=0.965, loss=39.863, backward_time=0.298, grad_norm=83.868, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.730e-04, train_time=2.724 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 17:15:23,246 (trainer:732) INFO: 56epoch:train:14577-15487batch: iter_time=2.684e-04, forward_time=0.203, loss_att=40.515, acc=0.965, loss=40.515, backward_time=0.301, grad_norm=89.029, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.728e-04, train_time=2.741 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 17:25:45,687 (trainer:732) INFO: 56epoch:train:15488-16398batch: iter_time=2.776e-04, forward_time=0.202, loss_att=40.347, acc=0.965, loss=40.347, backward_time=0.300, grad_norm=82.133, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.727e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 17:36:06,432 (trainer:732) INFO: 56epoch:train:16399-17309batch: iter_time=2.697e-04, forward_time=0.202, loss_att=40.088, acc=0.964, loss=40.088, backward_time=0.299, grad_norm=84.372, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.725e-04, train_time=2.725 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 17:46:31,863 (trainer:732) INFO: 56epoch:train:17310-18220batch: iter_time=2.730e-04, forward_time=0.203, loss_att=40.538, acc=0.965, loss=40.538, backward_time=0.301, grad_norm=88.873, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.724e-04, train_time=2.745 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 17:51:39,435 (trainer:338) INFO: 56epoch results: [train] iter_time=3.350e-04, forward_time=0.202, loss_att=40.009, acc=0.965, loss=40.009, backward_time=0.299, grad_norm=84.904, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.738e-04, train_time=2.758, time=3 hours, 29 minutes and 37.8 seconds, total_count=1434509, gpu_max_cached_mem_GB=30.096, [valid] loss_att=9.807, acc=0.983, cer=0.021, wer=0.081, loss=9.807, time=2 minutes and 42.88 seconds, total_count=2212, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 10.53 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 17:51:43,535 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 17:51:43,555 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/42epoch.pth, exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/55epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 17:51:43,555 (trainer:272) INFO: 57/60epoch started. Estimated time to finish: 14 hours, 20 minutes and 43.54 seconds +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 18:04:05,706 (trainer:732) INFO: 57epoch:train:1-911batch: iter_time=0.001, forward_time=0.202, loss_att=38.779, acc=0.966, loss=38.779, backward_time=0.300, grad_norm=87.268, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.722e-04, train_time=3.260 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 18:14:27,939 (trainer:732) INFO: 57epoch:train:912-1822batch: iter_time=2.878e-04, forward_time=0.202, loss_att=40.092, acc=0.965, loss=40.092, backward_time=0.300, grad_norm=90.834, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.721e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 18:24:50,999 (trainer:732) INFO: 57epoch:train:1823-2733batch: iter_time=2.689e-04, forward_time=0.202, loss_att=39.253, acc=0.965, loss=39.253, backward_time=0.300, grad_norm=82.949, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.719e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 18:35:14,230 (trainer:732) INFO: 57epoch:train:2734-3644batch: iter_time=2.773e-04, forward_time=0.203, loss_att=39.566, acc=0.965, loss=39.566, backward_time=0.300, grad_norm=89.943, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.718e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 18:45:33,122 (trainer:732) INFO: 57epoch:train:3645-4555batch: iter_time=2.741e-04, forward_time=0.201, loss_att=38.584, acc=0.965, loss=38.584, backward_time=0.297, grad_norm=81.633, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.716e-04, train_time=2.718 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 18:55:55,495 (trainer:732) INFO: 57epoch:train:4556-5466batch: iter_time=2.626e-04, forward_time=0.203, loss_att=40.359, acc=0.965, loss=40.359, backward_time=0.300, grad_norm=85.785, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.715e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 19:06:17,108 (trainer:732) INFO: 57epoch:train:5467-6377batch: iter_time=2.649e-04, forward_time=0.202, loss_att=39.260, acc=0.965, loss=39.260, backward_time=0.299, grad_norm=80.938, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.713e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 19:16:38,226 (trainer:732) INFO: 57epoch:train:6378-7288batch: iter_time=2.687e-04, forward_time=0.202, loss_att=39.941, acc=0.965, loss=39.941, backward_time=0.299, grad_norm=85.652, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.712e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 19:27:01,297 (trainer:732) INFO: 57epoch:train:7289-8199batch: iter_time=2.723e-04, forward_time=0.203, loss_att=39.961, acc=0.965, loss=39.961, backward_time=0.300, grad_norm=84.085, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.710e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 19:37:24,606 (trainer:732) INFO: 57epoch:train:8200-9110batch: iter_time=2.708e-04, forward_time=0.202, loss_att=40.708, acc=0.965, loss=40.708, backward_time=0.300, grad_norm=86.270, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.709e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 19:47:47,428 (trainer:732) INFO: 57epoch:train:9111-10021batch: iter_time=2.763e-04, forward_time=0.202, loss_att=39.890, acc=0.965, loss=39.890, backward_time=0.300, grad_norm=91.055, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.707e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 19:58:09,314 (trainer:732) INFO: 57epoch:train:10022-10932batch: iter_time=2.724e-04, forward_time=0.202, loss_att=39.790, acc=0.965, loss=39.790, backward_time=0.299, grad_norm=93.040, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.706e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 20:08:33,745 (trainer:732) INFO: 57epoch:train:10933-11843batch: iter_time=2.675e-04, forward_time=0.203, loss_att=40.164, acc=0.965, loss=40.164, backward_time=0.300, grad_norm=86.137, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.705e-04, train_time=2.742 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 20:18:55,697 (trainer:732) INFO: 57epoch:train:11844-12754batch: iter_time=2.725e-04, forward_time=0.202, loss_att=40.100, acc=0.965, loss=40.100, backward_time=0.299, grad_norm=91.894, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.703e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 20:29:17,680 (trainer:732) INFO: 57epoch:train:12755-13665batch: iter_time=2.736e-04, forward_time=0.202, loss_att=40.217, acc=0.965, loss=40.217, backward_time=0.300, grad_norm=82.713, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.702e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 20:39:37,945 (trainer:732) INFO: 57epoch:train:13666-14576batch: iter_time=2.717e-04, forward_time=0.202, loss_att=40.349, acc=0.964, loss=40.349, backward_time=0.299, grad_norm=87.787, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.700e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 20:50:00,706 (trainer:732) INFO: 57epoch:train:14577-15487batch: iter_time=2.665e-04, forward_time=0.202, loss_att=40.002, acc=0.965, loss=40.002, backward_time=0.300, grad_norm=77.046, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.699e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 21:00:22,138 (trainer:732) INFO: 57epoch:train:15488-16398batch: iter_time=2.737e-04, forward_time=0.202, loss_att=40.835, acc=0.964, loss=40.835, backward_time=0.299, grad_norm=83.021, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.697e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 21:10:46,761 (trainer:732) INFO: 57epoch:train:16399-17309batch: iter_time=2.599e-04, forward_time=0.203, loss_att=40.705, acc=0.965, loss=40.705, backward_time=0.301, grad_norm=88.639, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.696e-04, train_time=2.742 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 21:21:08,143 (trainer:732) INFO: 57epoch:train:17310-18220batch: iter_time=2.666e-04, forward_time=0.202, loss_att=39.587, acc=0.965, loss=39.587, backward_time=0.299, grad_norm=87.839, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.694e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 21:26:14,132 (trainer:338) INFO: 57epoch results: [train] iter_time=3.310e-04, forward_time=0.202, loss_att=39.903, acc=0.965, loss=39.903, backward_time=0.299, grad_norm=86.220, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.708e-04, train_time=2.758, time=3 hours, 29 minutes and 38.82 seconds, total_count=1452741, gpu_max_cached_mem_GB=30.096, [valid] loss_att=9.797, acc=0.983, cer=0.021, wer=0.081, loss=9.797, time=2 minutes and 42.42 seconds, total_count=2240, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 9.34 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 21:26:18,189 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 21:26:18,206 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/40epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 21:26:18,207 (trainer:272) INFO: 58/60epoch started. Estimated time to finish: 10 hours, 45 minutes and 29.46 seconds +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 21:38:39,476 (trainer:732) INFO: 58epoch:train:1-911batch: iter_time=0.001, forward_time=0.203, loss_att=40.037, acc=0.965, loss=40.037, backward_time=0.300, grad_norm=83.572, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.693e-04, train_time=3.257 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 21:49:02,454 (trainer:732) INFO: 58epoch:train:912-1822batch: iter_time=2.626e-04, forward_time=0.202, loss_att=40.145, acc=0.965, loss=40.145, backward_time=0.300, grad_norm=88.392, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.691e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 21:59:25,899 (trainer:732) INFO: 58epoch:train:1823-2733batch: iter_time=2.601e-04, forward_time=0.202, loss_att=39.936, acc=0.965, loss=39.936, backward_time=0.300, grad_norm=85.907, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.690e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 22:09:48,614 (trainer:732) INFO: 58epoch:train:2734-3644batch: iter_time=2.612e-04, forward_time=0.202, loss_att=40.693, acc=0.965, loss=40.693, backward_time=0.300, grad_norm=85.497, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.688e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 22:20:08,013 (trainer:732) INFO: 58epoch:train:3645-4555batch: iter_time=2.678e-04, forward_time=0.201, loss_att=39.877, acc=0.965, loss=39.877, backward_time=0.298, grad_norm=87.893, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.687e-04, train_time=2.720 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 22:30:33,147 (trainer:732) INFO: 58epoch:train:4556-5466batch: iter_time=2.623e-04, forward_time=0.203, loss_att=40.074, acc=0.965, loss=40.074, backward_time=0.301, grad_norm=85.118, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.685e-04, train_time=2.744 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 22:40:55,020 (trainer:732) INFO: 58epoch:train:5467-6377batch: iter_time=2.555e-04, forward_time=0.202, loss_att=39.467, acc=0.965, loss=39.467, backward_time=0.299, grad_norm=85.335, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.684e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 22:51:19,916 (trainer:732) INFO: 58epoch:train:6378-7288batch: iter_time=2.610e-04, forward_time=0.203, loss_att=40.178, acc=0.965, loss=40.178, backward_time=0.300, grad_norm=81.754, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.682e-04, train_time=2.743 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 23:01:40,872 (trainer:732) INFO: 58epoch:train:7289-8199batch: iter_time=2.663e-04, forward_time=0.202, loss_att=39.525, acc=0.965, loss=39.525, backward_time=0.299, grad_norm=79.188, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.681e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 23:12:03,232 (trainer:732) INFO: 58epoch:train:8200-9110batch: iter_time=2.620e-04, forward_time=0.202, loss_att=39.624, acc=0.965, loss=39.624, backward_time=0.299, grad_norm=90.778, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.680e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 23:22:27,946 (trainer:732) INFO: 58epoch:train:9111-10021batch: iter_time=2.600e-04, forward_time=0.202, loss_att=39.279, acc=0.965, loss=39.279, backward_time=0.300, grad_norm=88.464, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.678e-04, train_time=2.743 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 23:32:49,487 (trainer:732) INFO: 58epoch:train:10022-10932batch: iter_time=2.667e-04, forward_time=0.202, loss_att=39.526, acc=0.965, loss=39.526, backward_time=0.299, grad_norm=81.001, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.677e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 23:43:11,249 (trainer:732) INFO: 58epoch:train:10933-11843batch: iter_time=2.625e-04, forward_time=0.202, loss_att=39.642, acc=0.965, loss=39.642, backward_time=0.299, grad_norm=90.733, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.675e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-03 23:53:32,242 (trainer:732) INFO: 58epoch:train:11844-12754batch: iter_time=2.659e-04, forward_time=0.202, loss_att=39.142, acc=0.965, loss=39.142, backward_time=0.299, grad_norm=85.687, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.674e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 00:03:56,241 (trainer:732) INFO: 58epoch:train:12755-13665batch: iter_time=2.659e-04, forward_time=0.203, loss_att=39.073, acc=0.966, loss=39.073, backward_time=0.300, grad_norm=82.001, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.672e-04, train_time=2.739 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 00:14:17,937 (trainer:732) INFO: 58epoch:train:13666-14576batch: iter_time=2.661e-04, forward_time=0.202, loss_att=39.437, acc=0.965, loss=39.437, backward_time=0.299, grad_norm=87.071, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.671e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 00:24:40,863 (trainer:732) INFO: 58epoch:train:14577-15487batch: iter_time=2.580e-04, forward_time=0.202, loss_att=39.941, acc=0.965, loss=39.941, backward_time=0.299, grad_norm=92.521, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.669e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 00:35:01,184 (trainer:732) INFO: 58epoch:train:15488-16398batch: iter_time=2.578e-04, forward_time=0.202, loss_att=38.779, acc=0.965, loss=38.779, backward_time=0.298, grad_norm=82.731, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.668e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 00:45:23,649 (trainer:732) INFO: 58epoch:train:16399-17309batch: iter_time=2.618e-04, forward_time=0.202, loss_att=40.213, acc=0.965, loss=40.213, backward_time=0.299, grad_norm=89.806, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.666e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 00:55:45,854 (trainer:732) INFO: 58epoch:train:17310-18220batch: iter_time=2.602e-04, forward_time=0.202, loss_att=39.986, acc=0.965, loss=39.986, backward_time=0.299, grad_norm=83.150, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.665e-04, train_time=2.731 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 01:01:11,578 (trainer:338) INFO: 58epoch results: [train] iter_time=3.115e-04, forward_time=0.202, loss_att=39.725, acc=0.965, loss=39.725, backward_time=0.299, grad_norm=85.826, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.679e-04, train_time=2.759, time=3 hours, 29 minutes and 42.52 seconds, total_count=1470973, gpu_max_cached_mem_GB=30.096, [valid] loss_att=10.577, acc=0.981, cer=0.021, wer=0.081, loss=10.577, time=2 minutes and 59.92 seconds, total_count=2268, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 10.93 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 01:01:15,476 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 01:01:15,494 (trainer:272) INFO: 59/60epoch started. Estimated time to finish: 7 hours, 10 minutes and 18.92 seconds +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 01:13:38,155 (trainer:732) INFO: 59epoch:train:1-911batch: iter_time=0.002, forward_time=0.203, loss_att=39.604, acc=0.966, loss=39.604, backward_time=0.300, grad_norm=93.157, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.664e-04, train_time=3.262 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 01:24:02,068 (trainer:732) INFO: 59epoch:train:912-1822batch: iter_time=2.835e-04, forward_time=0.203, loss_att=39.935, acc=0.965, loss=39.935, backward_time=0.300, grad_norm=87.211, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.662e-04, train_time=2.740 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 01:34:24,851 (trainer:732) INFO: 59epoch:train:1823-2733batch: iter_time=2.784e-04, forward_time=0.202, loss_att=38.617, acc=0.966, loss=38.617, backward_time=0.299, grad_norm=82.966, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.661e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 01:44:44,612 (trainer:732) INFO: 59epoch:train:2734-3644batch: iter_time=2.730e-04, forward_time=0.201, loss_att=39.017, acc=0.965, loss=39.017, backward_time=0.298, grad_norm=83.536, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.659e-04, train_time=2.720 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 01:55:08,974 (trainer:732) INFO: 59epoch:train:3645-4555batch: iter_time=2.763e-04, forward_time=0.203, loss_att=40.642, acc=0.965, loss=40.642, backward_time=0.301, grad_norm=87.723, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.658e-04, train_time=2.742 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 02:05:30,619 (trainer:732) INFO: 59epoch:train:4556-5466batch: iter_time=2.720e-04, forward_time=0.202, loss_att=38.975, acc=0.965, loss=38.975, backward_time=0.298, grad_norm=84.514, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.656e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 02:15:53,617 (trainer:732) INFO: 59epoch:train:5467-6377batch: iter_time=2.726e-04, forward_time=0.202, loss_att=39.603, acc=0.966, loss=39.603, backward_time=0.299, grad_norm=89.539, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.655e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 02:26:17,085 (trainer:732) INFO: 59epoch:train:6378-7288batch: iter_time=2.686e-04, forward_time=0.203, loss_att=38.621, acc=0.966, loss=38.621, backward_time=0.300, grad_norm=83.288, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.653e-04, train_time=2.737 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 02:36:41,255 (trainer:732) INFO: 59epoch:train:7289-8199batch: iter_time=2.724e-04, forward_time=0.203, loss_att=39.930, acc=0.965, loss=39.930, backward_time=0.301, grad_norm=90.740, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.652e-04, train_time=2.741 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 02:47:03,388 (trainer:732) INFO: 59epoch:train:8200-9110batch: iter_time=2.678e-04, forward_time=0.203, loss_att=40.039, acc=0.965, loss=40.039, backward_time=0.300, grad_norm=86.073, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.651e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 02:57:26,974 (trainer:732) INFO: 59epoch:train:9111-10021batch: iter_time=2.686e-04, forward_time=0.203, loss_att=39.573, acc=0.965, loss=39.573, backward_time=0.300, grad_norm=83.811, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.649e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 03:07:46,794 (trainer:732) INFO: 59epoch:train:10022-10932batch: iter_time=2.715e-04, forward_time=0.201, loss_att=39.928, acc=0.965, loss=39.928, backward_time=0.298, grad_norm=83.697, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.648e-04, train_time=2.721 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 03:18:10,623 (trainer:732) INFO: 59epoch:train:10933-11843batch: iter_time=2.701e-04, forward_time=0.203, loss_att=38.907, acc=0.966, loss=38.907, backward_time=0.300, grad_norm=81.560, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.646e-04, train_time=2.740 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 03:28:31,953 (trainer:732) INFO: 59epoch:train:11844-12754batch: iter_time=2.863e-04, forward_time=0.201, loss_att=39.642, acc=0.965, loss=39.642, backward_time=0.298, grad_norm=85.189, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.645e-04, train_time=2.728 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 03:38:53,862 (trainer:732) INFO: 59epoch:train:12755-13665batch: iter_time=2.723e-04, forward_time=0.202, loss_att=39.368, acc=0.965, loss=39.368, backward_time=0.299, grad_norm=93.639, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.643e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 03:49:15,229 (trainer:732) INFO: 59epoch:train:13666-14576batch: iter_time=2.772e-04, forward_time=0.202, loss_att=39.060, acc=0.965, loss=39.060, backward_time=0.298, grad_norm=79.443, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.642e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 03:59:36,149 (trainer:732) INFO: 59epoch:train:14577-15487batch: iter_time=2.694e-04, forward_time=0.202, loss_att=39.655, acc=0.965, loss=39.655, backward_time=0.299, grad_norm=86.274, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.641e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 04:09:57,352 (trainer:732) INFO: 59epoch:train:15488-16398batch: iter_time=2.741e-04, forward_time=0.201, loss_att=39.591, acc=0.965, loss=39.591, backward_time=0.298, grad_norm=81.059, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.639e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 04:20:22,134 (trainer:732) INFO: 59epoch:train:16399-17309batch: iter_time=2.709e-04, forward_time=0.203, loss_att=40.350, acc=0.965, loss=40.350, backward_time=0.301, grad_norm=89.870, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.638e-04, train_time=2.743 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 04:30:44,946 (trainer:732) INFO: 59epoch:train:17310-18220batch: iter_time=2.743e-04, forward_time=0.202, loss_att=40.546, acc=0.965, loss=40.546, backward_time=0.300, grad_norm=91.939, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.636e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 04:36:09,411 (trainer:338) INFO: 59epoch results: [train] iter_time=3.440e-04, forward_time=0.202, loss_att=39.575, acc=0.965, loss=39.575, backward_time=0.299, grad_norm=86.251, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.650e-04, train_time=2.759, time=3 hours, 29 minutes and 44.66 seconds, total_count=1489205, gpu_max_cached_mem_GB=30.096, [valid] loss_att=10.319, acc=0.982, cer=0.020, wer=0.081, loss=10.319, time=2 minutes and 58.59 seconds, total_count=2296, gpu_max_cached_mem_GB=30.096, [att_plot] time=2 minutes and 10.66 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 04:36:14,069 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 04:36:14,105 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/58epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 04:36:14,105 (trainer:272) INFO: 60/60epoch started. Estimated time to finish: 3 hours, 35 minutes and 9.16 seconds +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 04:48:36,066 (trainer:732) INFO: 60epoch:train:1-911batch: iter_time=0.002, forward_time=0.202, loss_att=38.819, acc=0.966, loss=38.819, backward_time=0.299, grad_norm=88.103, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.635e-04, train_time=3.259 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 04:58:59,284 (trainer:732) INFO: 60epoch:train:912-1822batch: iter_time=2.798e-04, forward_time=0.203, loss_att=38.074, acc=0.967, loss=38.074, backward_time=0.300, grad_norm=91.414, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.633e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 05:09:21,953 (trainer:732) INFO: 60epoch:train:1823-2733batch: iter_time=2.771e-04, forward_time=0.202, loss_att=39.271, acc=0.966, loss=39.271, backward_time=0.300, grad_norm=87.413, clip=100.000, loss_scale=1.000, optim_step_time=0.066, optim0_lr0=4.632e-04, train_time=2.734 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 05:19:45,264 (trainer:732) INFO: 60epoch:train:2734-3644batch: iter_time=2.725e-04, forward_time=0.202, loss_att=39.186, acc=0.965, loss=39.186, backward_time=0.300, grad_norm=87.025, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.631e-04, train_time=2.736 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 05:30:05,348 (trainer:732) INFO: 60epoch:train:3645-4555batch: iter_time=2.690e-04, forward_time=0.202, loss_att=39.596, acc=0.965, loss=39.596, backward_time=0.298, grad_norm=85.050, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.629e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 05:40:27,542 (trainer:732) INFO: 60epoch:train:4556-5466batch: iter_time=2.771e-04, forward_time=0.202, loss_att=38.973, acc=0.966, loss=38.973, backward_time=0.300, grad_norm=87.073, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.628e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 05:50:51,226 (trainer:732) INFO: 60epoch:train:5467-6377batch: iter_time=2.733e-04, forward_time=0.203, loss_att=39.937, acc=0.965, loss=39.937, backward_time=0.300, grad_norm=87.147, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.626e-04, train_time=2.738 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 06:01:13,539 (trainer:732) INFO: 60epoch:train:6378-7288batch: iter_time=2.691e-04, forward_time=0.202, loss_att=39.845, acc=0.965, loss=39.845, backward_time=0.299, grad_norm=87.845, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.625e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 06:11:33,646 (trainer:732) INFO: 60epoch:train:7289-8199batch: iter_time=2.675e-04, forward_time=0.201, loss_att=39.429, acc=0.965, loss=39.429, backward_time=0.298, grad_norm=81.823, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.624e-04, train_time=2.723 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 06:21:54,653 (trainer:732) INFO: 60epoch:train:8200-9110batch: iter_time=2.627e-04, forward_time=0.202, loss_att=39.329, acc=0.965, loss=39.329, backward_time=0.299, grad_norm=97.018, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.622e-04, train_time=2.727 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 06:32:19,294 (trainer:732) INFO: 60epoch:train:9111-10021batch: iter_time=2.689e-04, forward_time=0.203, loss_att=40.226, acc=0.965, loss=40.226, backward_time=0.301, grad_norm=84.083, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.621e-04, train_time=2.742 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 06:42:42,309 (trainer:732) INFO: 60epoch:train:10022-10932batch: iter_time=2.815e-04, forward_time=0.202, loss_att=39.757, acc=0.966, loss=39.757, backward_time=0.300, grad_norm=86.024, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.619e-04, train_time=2.735 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 06:53:02,911 (trainer:732) INFO: 60epoch:train:10933-11843batch: iter_time=2.719e-04, forward_time=0.202, loss_att=39.195, acc=0.965, loss=39.195, backward_time=0.299, grad_norm=83.679, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.618e-04, train_time=2.725 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 07:03:23,836 (trainer:732) INFO: 60epoch:train:11844-12754batch: iter_time=2.692e-04, forward_time=0.201, loss_att=39.487, acc=0.965, loss=39.487, backward_time=0.299, grad_norm=81.244, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.617e-04, train_time=2.725 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 07:13:45,331 (trainer:732) INFO: 60epoch:train:12755-13665batch: iter_time=2.671e-04, forward_time=0.202, loss_att=39.857, acc=0.965, loss=39.857, backward_time=0.299, grad_norm=87.416, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.615e-04, train_time=2.729 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 07:24:07,921 (trainer:732) INFO: 60epoch:train:13666-14576batch: iter_time=2.730e-04, forward_time=0.203, loss_att=39.608, acc=0.965, loss=39.608, backward_time=0.300, grad_norm=82.685, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.614e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 07:34:30,234 (trainer:732) INFO: 60epoch:train:14577-15487batch: iter_time=2.749e-04, forward_time=0.202, loss_att=39.908, acc=0.965, loss=39.908, backward_time=0.300, grad_norm=87.503, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.612e-04, train_time=2.733 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 07:44:52,194 (trainer:732) INFO: 60epoch:train:15488-16398batch: iter_time=2.675e-04, forward_time=0.202, loss_att=39.409, acc=0.965, loss=39.409, backward_time=0.299, grad_norm=89.245, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.611e-04, train_time=2.730 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 07:55:14,423 (trainer:732) INFO: 60epoch:train:16399-17309batch: iter_time=2.687e-04, forward_time=0.202, loss_att=39.745, acc=0.965, loss=39.745, backward_time=0.300, grad_norm=91.095, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.610e-04, train_time=2.732 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 08:05:35,343 (trainer:732) INFO: 60epoch:train:17310-18220batch: iter_time=2.632e-04, forward_time=0.202, loss_att=39.677, acc=0.965, loss=39.677, backward_time=0.299, grad_norm=83.616, clip=100.000, loss_scale=1.000, optim_step_time=0.064, optim0_lr0=4.608e-04, train_time=2.726 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 08:10:28,414 (trainer:338) INFO: 60epoch results: [train] iter_time=3.393e-04, forward_time=0.202, loss_att=39.467, acc=0.965, loss=39.467, backward_time=0.299, grad_norm=86.815, clip=100.000, loss_scale=1.000, optim_step_time=0.065, optim0_lr0=4.622e-04, train_time=2.757, time=3 hours, 29 minutes and 35.65 seconds, total_count=1507437, gpu_max_cached_mem_GB=30.096, [valid] loss_att=9.661, acc=0.983, cer=0.020, wer=0.081, loss=9.661, time=2 minutes and 43.41 seconds, total_count=2324, gpu_max_cached_mem_GB=30.096, [att_plot] time=1 minute and 55.24 seconds, total_count=0, gpu_max_cached_mem_GB=30.096 +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 08:10:31,861 (trainer:384) INFO: There are no improvements in this epoch +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 08:10:31,899 (trainer:440) INFO: The model files were removed: exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/38epoch.pth, exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/59epoch.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 08:10:31,899 (trainer:458) INFO: The training was finished at 60 epochs +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 08:10:31,935 (average_nbest_models:69) INFO: Averaging 10best models: criterion="valid.acc": exp/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/valid.acc.ave_10best.pth +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 08:10:40,159 (average_nbest_models:96) INFO: Accumulating encoder.encoders.0.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 08:10:40,161 (average_nbest_models:96) INFO: Accumulating encoder.encoders.1.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 08:10:40,164 (average_nbest_models:96) INFO: Accumulating encoder.encoders.2.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 08:10:40,165 (average_nbest_models:96) INFO: Accumulating encoder.encoders.3.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 08:10:40,167 (average_nbest_models:96) INFO: Accumulating encoder.encoders.4.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 08:10:40,169 (average_nbest_models:96) INFO: Accumulating encoder.encoders.5.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 08:10:40,171 (average_nbest_models:96) INFO: Accumulating encoder.encoders.6.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 08:10:40,173 (average_nbest_models:96) INFO: Accumulating encoder.encoders.7.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 08:10:40,175 (average_nbest_models:96) INFO: Accumulating encoder.encoders.8.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 08:10:40,177 (average_nbest_models:96) INFO: Accumulating encoder.encoders.9.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 08:10:40,179 (average_nbest_models:96) INFO: Accumulating encoder.encoders.10.conv_module.norm.num_batches_tracked instead of averaging +[de-74279-k2-train-1-1207150822-75498b8c5f-sb8fn:0/8] 2024-03-04 08:10:40,181 (average_nbest_models:96) INFO: Accumulating encoder.encoders.11.conv_module.norm.num_batches_tracked instead of averaging +# Accounting: time=477715 threads=1 +# Ended (code 0) at Mon Mar 4 08:10:44 CST 2024, elapsed time 477715 seconds diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/valid.acc.ave.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/valid.acc.ave.pth new file mode 100644 index 0000000000000000000000000000000000000000..c51c6a9d715d47cd41ba2e1d15cef1835f417f21 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/valid.acc.ave.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:791486dcbc54be6007172015b8527ef6609130cfaf39bc9f0b27df9de35d4927 +size 172358249 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/valid.acc.ave_10best.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/valid.acc.ave_10best.pth new file mode 100644 index 0000000000000000000000000000000000000000..c51c6a9d715d47cd41ba2e1d15cef1835f417f21 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/valid.acc.ave_10best.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:791486dcbc54be6007172015b8527ef6609130cfaf39bc9f0b27df9de35d4927 +size 172358249 diff --git a/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/valid.acc.best.pth b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/valid.acc.best.pth new file mode 100644 index 0000000000000000000000000000000000000000..e8fc247f1ba4f93003d3ec970fd5c636b9236290 --- /dev/null +++ b/large/asr_train_sot_asr_conformer_raw_en_char_sp_finetune_ls100_45epoch_new_whamr/valid.acc.best.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c414cae0c18b1d880232d1c56249ed3ad00d3001ea345e7205a6e0d0d76a192f +size 172367337