first version

Browse files

Files changed (7) hide show

config.json +31 -0
generation_config.json +7 -0
pytorch_model.bin +3 -0
special_tokens_map.json +5 -0
spiece.model +3 -0
tokenizer_config.json +12 -0
train_log.txt +287 -0

config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "_name_or_path": "mini2",
+  "architectures": [
+    "MT5ForConditionalGeneration"
+  ],
+  "d_ff": 1536,
+  "d_kv": 64,
+  "d_model": 384,
+  "decoder_start_token_id": 0,
+  "dense_act_fn": "gelu_new",
+  "dropout_rate": 0.1,
+  "eos_token_id": 1,
+  "feed_forward_proj": "gated-gelu",
+  "initializer_factor": 1.0,
+  "is_encoder_decoder": true,
+  "is_gated_act": true,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "mt5",
+  "num_decoder_layers": 9,
+  "num_heads": 9,
+  "num_layers": 9,
+  "pad_token_id": 0,
+  "relative_attention_max_distance": 128,
+  "relative_attention_num_buckets": 32,
+  "tie_word_embeddings": false,
+  "tokenizer_class": "T5Tokenizer",
+  "torch_dtype": "float32",
+  "transformers_version": "4.26.1",
+  "use_cache": true,
+  "vocab_size": 32128
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "decoder_start_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 0,
+  "transformers_version": "4.26.1"
+}

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6a31e76db56aec04c81affe569cfb952c62ce5dea9f9c59c8593fdc08122d556
+size 321795553

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,5 @@

+{
+  "eos_token": "</s>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}

spiece.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:108ea5dbb232558d744aff5011d29b92a76751c210ad8560e6a65738c9630bdf
+size 775057

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "additional_special_tokens": [],
+  "eos_token": "</s>",
+  "extra_ids": 0,
+  "model_max_length": 1000000000000000019884624838656,
+  "name_or_path": "mini2",
+  "pad_token": "<pad>",
+  "sp_model_kwargs": {},
+  "special_tokens_map_file": "/home/acc12952oa/.cache/huggingface/hub/models--kkuramitsu--mt5np_mini12L/snapshots/e66bd8feec1522ea93ed176acb765f0c44f81526/special_tokens_map.json",
+  "tokenizer_class": "T5Tokenizer",
+  "unk_token": "<unk>"
+}

train_log.txt ADDED Viewed

	@@ -0,0 +1,287 @@

+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=76.57510 PPL=1803619835086933004964966285967360.00000
+val epoch=1 loss=3.55529 PPL=34.99814
+train epoch=1 loss=3.58229 PPL=35.95572
+[trained] 0.0[H] 41.41847747564316[M] 2485.109[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=8.62410 PPL=5564.13037
+val epoch=1 loss=3.48060 PPL=32.47906
+train epoch=1 loss=2.05416 PPL=7.80031
+[trained] 0.0[H] 45.51669268210729[M] 2731.002[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=5.33357 PPL=207.17598
+val epoch=1 loss=2.69441 PPL=14.79680
+train epoch=1 loss=1.59283 PPL=4.91763
+[trained] 0.0[H] 41.46436125040054[M] 2487.862[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=5.03823 PPL=154.19640
+val epoch=1 loss=3.20544 PPL=24.66638
+train epoch=1 loss=1.61361 PPL=5.02092
+[trained] 0.0[H] 45.251987334092455[M] 2715.119[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=4.14368 PPL=63.03437
+val epoch=1 loss=2.43705 PPL=11.43929
+train epoch=1 loss=1.37564 PPL=3.95763
+[trained] 0.0[H] 41.47204469839732[M] 2488.323[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=4.28832 PPL=72.84402
+val epoch=1 loss=3.02900 PPL=20.67647
+train epoch=1 loss=1.48900 PPL=4.43266
+[trained] 0.0[H] 45.57923027674357[M] 2734.754[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=3.70968 PPL=40.84082
+val epoch=1 loss=2.28623 PPL=9.83775
+train epoch=1 loss=1.27682 PPL=3.58522
+[trained] 0.0[H] 41.4678033153216[M] 2488.068[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=3.83498 PPL=46.29248
+val epoch=1 loss=2.79002 PPL=16.28134
+train epoch=1 loss=1.41784 PPL=4.12821
+[trained] 0.0[H] 45.09872035185496[M] 2705.923[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=3.38932 PPL=29.64582
+val epoch=1 loss=2.20471 PPL=9.06766
+train epoch=1 loss=1.22078 PPL=3.38983
+[trained] 0.0[H] 41.52079544067383[M] 2491.248[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=3.77504 PPL=43.59935
+val epoch=1 loss=2.75377 PPL=15.70175
+train epoch=1 loss=1.37220 PPL=3.94404
+[trained] 0.0[H] 45.1388335108757[M] 2708.330[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=3.05649 PPL=21.25283
+val epoch=1 loss=2.06552 PPL=7.88940
+train epoch=1 loss=1.18322 PPL=3.26485
+[trained] 0.0[H] 41.343922030925754[M] 2480.635[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=3.63945 PPL=38.07071
+val epoch=1 loss=2.74634 PPL=15.58548
+train epoch=1 loss=1.34129 PPL=3.82397
+[trained] 0.0[H] 44.50069724321365[M] 2670.042[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=2.99219 PPL=19.92924
+val epoch=1 loss=2.11169 PPL=8.26216
+train epoch=1 loss=1.15597 PPL=3.17710
+[trained] 0.0[H] 41.03153887987137[M] 2461.892[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=3.69122 PPL=40.09384
+val epoch=1 loss=2.79154 PPL=16.30605
+train epoch=1 loss=1.31323 PPL=3.71816
+[trained] 0.0[H] 45.27243907054265[M] 2716.346[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=2.89017 PPL=17.99635
+val epoch=1 loss=2.05285 PPL=7.79006
+train epoch=1 loss=1.13480 PPL=3.11056
+[trained] 0.0[H] 41.108288780848184[M] 2466.497[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=3.43673 PPL=31.08512
+val epoch=1 loss=2.64907 PPL=14.14095
+train epoch=1 loss=1.29298 PPL=3.64363
+[trained] 0.0[H] 44.97415177822113[M] 2698.449[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=2.77340 PPL=16.01299
+val epoch=1 loss=1.99160 PPL=7.32726
+train epoch=1 loss=1.11733 PPL=3.05667
+[trained] 0.0[H] 41.14810743729274[M] 2468.886[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=3.39647 PPL=29.85852
+val epoch=1 loss=2.36330 PPL=10.62593
+train epoch=1 loss=1.27496 PPL=3.57856
+[trained] 0.0[H] 44.73817230463028[M] 2684.290[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=2.74622 PPL=15.58363
+val epoch=1 loss=2.00091 PPL=7.39578
+train epoch=1 loss=1.10269 PPL=3.01226
+[trained] 0.0[H] 41.041836047172545[M] 2462.510[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=3.45477 PPL=31.65103
+val epoch=1 loss=2.73762 PPL=15.45019
+train epoch=1 loss=1.25830 PPL=3.51942
+[trained] 0.0[H] 45.509643785158794[M] 2730.579[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=2.71395 PPL=15.08881
+val epoch=1 loss=2.00103 PPL=7.39668
+train epoch=1 loss=1.09001 PPL=2.97429
+[trained] 0.0[H] 41.28162391185761[M] 2476.897[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=3.26547 PPL=26.19238
+val epoch=1 loss=2.69914 PPL=14.86692
+train epoch=1 loss=1.24174 PPL=3.46165
+[trained] 0.0[H] 45.42912646929423[M] 2725.748[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=2.68335 PPL=14.63405
+val epoch=1 loss=2.00004 PPL=7.38934
+train epoch=1 loss=1.07841 PPL=2.94001
+[trained] 0.0[H] 41.447514899571736[M] 2486.851[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=3.27115 PPL=26.34149
+val epoch=1 loss=2.72310 PPL=15.22747
+train epoch=1 loss=1.23098 PPL=3.42457
+[trained] 0.0[H] 45.18751840988795[M] 2711.251[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=2.57214 PPL=13.09387
+val epoch=1 loss=1.95365 PPL=7.05438
+train epoch=1 loss=1.06908 PPL=2.91269
+[trained] 0.0[H] 40.959261027971905[M] 2457.556[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=3.18254 PPL=24.10785
+val epoch=1 loss=2.68020 PPL=14.58803
+train epoch=1 loss=1.22046 PPL=3.38875
+[trained] 0.0[H] 45.264945685863495[M] 2715.897[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=2.57132 PPL=13.08305
+val epoch=1 loss=1.94033 PPL=6.96107
+train epoch=1 loss=1.06083 PPL=2.88875
+[trained] 0.0[H] 41.00604948997498[M] 2460.363[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=2.99903 PPL=20.06612
+val epoch=1 loss=2.42283 PPL=11.27773
+train epoch=1 loss=1.20782 PPL=3.34619
+[trained] 0.0[H] 45.244081223011015[M] 2714.645[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+val epoch=1 loss=2.55513 PPL=12.87301
+val epoch=1 loss=1.93933 PPL=6.95411
+train epoch=1 loss=1.05271 PPL=2.86539
+[trained] 0.0[H] 41.11795919736226[M] 2467.078[sec]
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'] [Errno 28] No space left on device
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+[failed] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'] [Errno 28] No space left on device
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'] [Errno 28] No space left on device
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+[failed] ['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'] [Errno 28] No space left on device
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'] [Errno 28] No space left on device
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+[failed] ['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'] [Errno 28] No space left on device
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'] [Errno 28] No space left on device
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+[failed] ['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'] [Errno 28] No space left on device
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'] [Errno 28] No space left on device
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+[failed] ['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'] [Errno 28] No space left on device
+[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
+[train] ['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl']
+[batch_size] 256
+[accumulate_grad_batches] 2
+[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'] [Errno 28] No space left on device