[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=76.57510 PPL=1803619835086933004964966285967360.00000 val epoch=1 loss=3.55529 PPL=34.99814 train epoch=1 loss=3.58229 PPL=35.95572 [trained] 0.0[H] 41.41847747564316[M] 2485.109[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=8.62410 PPL=5564.13037 val epoch=1 loss=3.48060 PPL=32.47906 train epoch=1 loss=2.05416 PPL=7.80031 [trained] 0.0[H] 45.51669268210729[M] 2731.002[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=5.33357 PPL=207.17598 val epoch=1 loss=2.69441 PPL=14.79680 train epoch=1 loss=1.59283 PPL=4.91763 [trained] 0.0[H] 41.46436125040054[M] 2487.862[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=5.03823 PPL=154.19640 val epoch=1 loss=3.20544 PPL=24.66638 train epoch=1 loss=1.61361 PPL=5.02092 [trained] 0.0[H] 45.251987334092455[M] 2715.119[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=4.14368 PPL=63.03437 val epoch=1 loss=2.43705 PPL=11.43929 train epoch=1 loss=1.37564 PPL=3.95763 [trained] 0.0[H] 41.47204469839732[M] 2488.323[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=4.28832 PPL=72.84402 val epoch=1 loss=3.02900 PPL=20.67647 train epoch=1 loss=1.48900 PPL=4.43266 [trained] 0.0[H] 45.57923027674357[M] 2734.754[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=3.70968 PPL=40.84082 val epoch=1 loss=2.28623 PPL=9.83775 train epoch=1 loss=1.27682 PPL=3.58522 [trained] 0.0[H] 41.4678033153216[M] 2488.068[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=3.83498 PPL=46.29248 val epoch=1 loss=2.79002 PPL=16.28134 train epoch=1 loss=1.41784 PPL=4.12821 [trained] 0.0[H] 45.09872035185496[M] 2705.923[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=3.38932 PPL=29.64582 val epoch=1 loss=2.20471 PPL=9.06766 train epoch=1 loss=1.22078 PPL=3.38983 [trained] 0.0[H] 41.52079544067383[M] 2491.248[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=3.77504 PPL=43.59935 val epoch=1 loss=2.75377 PPL=15.70175 train epoch=1 loss=1.37220 PPL=3.94404 [trained] 0.0[H] 45.1388335108757[M] 2708.330[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=3.05649 PPL=21.25283 val epoch=1 loss=2.06552 PPL=7.88940 train epoch=1 loss=1.18322 PPL=3.26485 [trained] 0.0[H] 41.343922030925754[M] 2480.635[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=3.63945 PPL=38.07071 val epoch=1 loss=2.74634 PPL=15.58548 train epoch=1 loss=1.34129 PPL=3.82397 [trained] 0.0[H] 44.50069724321365[M] 2670.042[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.99219 PPL=19.92924 val epoch=1 loss=2.11169 PPL=8.26216 train epoch=1 loss=1.15597 PPL=3.17710 [trained] 0.0[H] 41.03153887987137[M] 2461.892[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=3.69122 PPL=40.09384 val epoch=1 loss=2.79154 PPL=16.30605 train epoch=1 loss=1.31323 PPL=3.71816 [trained] 0.0[H] 45.27243907054265[M] 2716.346[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.89017 PPL=17.99635 val epoch=1 loss=2.05285 PPL=7.79006 train epoch=1 loss=1.13480 PPL=3.11056 [trained] 0.0[H] 41.108288780848184[M] 2466.497[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=3.43673 PPL=31.08512 val epoch=1 loss=2.64907 PPL=14.14095 train epoch=1 loss=1.29298 PPL=3.64363 [trained] 0.0[H] 44.97415177822113[M] 2698.449[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.77340 PPL=16.01299 val epoch=1 loss=1.99160 PPL=7.32726 train epoch=1 loss=1.11733 PPL=3.05667 [trained] 0.0[H] 41.14810743729274[M] 2468.886[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=3.39647 PPL=29.85852 val epoch=1 loss=2.36330 PPL=10.62593 train epoch=1 loss=1.27496 PPL=3.57856 [trained] 0.0[H] 44.73817230463028[M] 2684.290[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.74622 PPL=15.58363 val epoch=1 loss=2.00091 PPL=7.39578 train epoch=1 loss=1.10269 PPL=3.01226 [trained] 0.0[H] 41.041836047172545[M] 2462.510[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=3.45477 PPL=31.65103 val epoch=1 loss=2.73762 PPL=15.45019 train epoch=1 loss=1.25830 PPL=3.51942 [trained] 0.0[H] 45.509643785158794[M] 2730.579[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.71395 PPL=15.08881 val epoch=1 loss=2.00103 PPL=7.39668 train epoch=1 loss=1.09001 PPL=2.97429 [trained] 0.0[H] 41.28162391185761[M] 2476.897[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=3.26547 PPL=26.19238 val epoch=1 loss=2.69914 PPL=14.86692 train epoch=1 loss=1.24174 PPL=3.46165 [trained] 0.0[H] 45.42912646929423[M] 2725.748[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.68335 PPL=14.63405 val epoch=1 loss=2.00004 PPL=7.38934 train epoch=1 loss=1.07841 PPL=2.94001 [trained] 0.0[H] 41.447514899571736[M] 2486.851[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=3.27115 PPL=26.34149 val epoch=1 loss=2.72310 PPL=15.22747 train epoch=1 loss=1.23098 PPL=3.42457 [trained] 0.0[H] 45.18751840988795[M] 2711.251[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.57214 PPL=13.09387 val epoch=1 loss=1.95365 PPL=7.05438 train epoch=1 loss=1.06908 PPL=2.91269 [trained] 0.0[H] 40.959261027971905[M] 2457.556[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=3.18254 PPL=24.10785 val epoch=1 loss=2.68020 PPL=14.58803 train epoch=1 loss=1.22046 PPL=3.38875 [trained] 0.0[H] 45.264945685863495[M] 2715.897[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.57132 PPL=13.08305 val epoch=1 loss=1.94033 PPL=6.96107 train epoch=1 loss=1.06083 PPL=2.88875 [trained] 0.0[H] 41.00604948997498[M] 2460.363[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.99903 PPL=20.06612 val epoch=1 loss=2.42283 PPL=11.27773 train epoch=1 loss=1.20782 PPL=3.34619 [trained] 0.0[H] 45.244081223011015[M] 2714.645[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.55513 PPL=12.87301 val epoch=1 loss=1.93933 PPL=6.95411 train epoch=1 loss=1.05271 PPL=2.86539 [trained] 0.0[H] 41.11795919736226[M] 2467.078[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 [failed] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'] [Errno 28] No space left on device [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 [failed] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'] [Errno 28] No space left on device [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 [failed] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'] [Errno 28] No space left on device [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 [failed] ['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'] [Errno 28] No space left on device [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 [failed] ['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'] [Errno 28] No space left on device [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 [failed] ['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'] [Errno 28] No space left on device [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 [failed] ['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'] [Errno 28] No space left on device [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 [failed] ['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'] [Errno 28] No space left on device [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 [failed] ['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'] [Errno 28] No space left on device [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 [failed] ['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'] [Errno 28] No space left on device [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 [failed] ['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'] [Errno 28] No space left on device [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=1.95896 PPL=7.09197 val epoch=1 loss=1.90212 PPL=6.70007 train epoch=1 loss=1.02537 PPL=2.78812 [trained] 0.0[H] 41.157410267988844[M] 2469.445[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=3.27824 PPL=26.52905 val epoch=1 loss=2.65349 PPL=14.20354 train epoch=1 loss=1.20492 PPL=3.33650 [trained] 0.0[H] 45.46110556125641[M] 2727.666[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.57661 PPL=13.15245 val epoch=1 loss=1.92245 PPL=6.83766 train epoch=1 loss=1.02811 PPL=2.79576 [trained] 0.0[H] 41.425186324119565[M] 2485.511[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=3.20961 PPL=24.76951 val epoch=1 loss=2.68262 PPL=14.62342 train epoch=1 loss=1.19228 PPL=3.29459 [trained] 0.0[H] 45.62885602712631[M] 2737.731[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.46379 PPL=11.74931 val epoch=1 loss=1.88820 PPL=6.60748 train epoch=1 loss=1.02569 PPL=2.78903 [trained] 0.0[H] 41.40251029332479[M] 2484.151[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=3.24453 PPL=25.64962 val epoch=1 loss=2.63306 PPL=13.91623 train epoch=1 loss=1.18417 PPL=3.26797 [trained] 0.0[H] 45.55619955062866[M] 2733.372[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.47150 PPL=11.84021 val epoch=1 loss=1.87957 PPL=6.55069 train epoch=1 loss=1.02233 PPL=2.77967 [trained] 0.0[H] 41.72278196414312[M] 2503.367[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=3.07940 PPL=21.74541 val epoch=1 loss=2.38436 PPL=10.85216 train epoch=1 loss=1.17423 PPL=3.23566 [trained] 0.0[H] 45.18255339066187[M] 2710.953[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.48200 PPL=11.96519 val epoch=1 loss=1.88596 PPL=6.59266 train epoch=1 loss=1.01817 PPL=2.76812 [trained] 0.0[H] 41.02570736805598[M] 2461.542[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=3.28042 PPL=26.58687 val epoch=1 loss=2.53102 PPL=12.56630 train epoch=1 loss=1.17467 PPL=3.23707 [trained] 0.0[H] 45.04093019167582[M] 2702.456[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.34633 PPL=10.44713 val epoch=1 loss=1.78546 PPL=5.96234 train epoch=1 loss=1.02161 PPL=2.77766 [trained] 0.0[H] 40.93621168136597[M] 2456.173[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.90283 PPL=18.22571 val epoch=1 loss=2.43509 PPL=11.41683 train epoch=1 loss=1.16996 PPL=3.22188 [trained] 0.0[H] 45.014744373162586[M] 2700.885[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.38792 PPL=10.89079 val epoch=1 loss=1.87885 PPL=6.54596 train epoch=1 loss=1.01714 PPL=2.76527 [trained] 0.0[H] 40.9392077644666[M] 2456.352[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=3.02356 PPL=20.56429 val epoch=1 loss=2.60609 PPL=13.54600 train epoch=1 loss=1.16236 PPL=3.19748 [trained] 0.0[H] 44.92984497149785[M] 2695.791[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.38416 PPL=10.84993 val epoch=1 loss=1.85154 PPL=6.36963 train epoch=1 loss=1.01303 PPL=2.75394 [trained] 0.0[H] 40.939160716533664[M] 2456.350[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.99403 PPL=19.96596 val epoch=1 loss=2.40907 PPL=11.12363 train epoch=1 loss=1.15822 PPL=3.18425 [trained] 0.0[H] 45.009803128242496[M] 2700.588[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.31114 PPL=10.08587 val epoch=1 loss=1.80426 PPL=6.07548 train epoch=1 loss=1.00900 PPL=2.74285 [trained] 0.0[H] 40.89026815891266[M] 2453.416[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.62659 PPL=13.82649 val epoch=1 loss=2.22096 PPL=9.21613 train epoch=1 loss=1.15455 PPL=3.17260 [trained] 0.0[H] 45.08568317492803[M] 2705.141[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.33120 PPL=10.29027 val epoch=1 loss=1.82923 PPL=6.22906 train epoch=1 loss=1.00533 PPL=2.73282 [trained] 0.0[H] 41.071046415964766[M] 2464.263[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.96448 PPL=19.38460 val epoch=1 loss=2.56859 PPL=13.04741 train epoch=1 loss=1.14901 PPL=3.15508 [trained] 0.0[H] 45.149318718910216[M] 2708.959[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.31578 PPL=10.13282 val epoch=1 loss=1.84484 PPL=6.32710 train epoch=1 loss=0.99815 PPL=2.71326 [trained] 0.0[H] 41.47976658344269[M] 2488.786[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.79162 PPL=16.30747 val epoch=1 loss=2.55381 PPL=12.85597 train epoch=1 loss=1.13298 PPL=3.10490 [trained] 0.0[H] 44.85936383008957[M] 2691.562[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.19775 PPL=9.00472 val epoch=1 loss=1.86080 PPL=6.42891 train epoch=1 loss=0.99177 PPL=2.69602 [trained] 0.0[H] 41.247025799751285[M] 2474.822[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.88427 PPL=17.89051 val epoch=1 loss=2.61712 PPL=13.69621 train epoch=1 loss=1.13177 PPL=3.10115 [trained] 0.0[H] 44.970662931601204[M] 2698.240[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.13605 PPL=8.46593 val epoch=1 loss=1.82511 PPL=6.20346 train epoch=1 loss=0.98874 PPL=2.68785 [trained] 0.0[H] 41.13867333332698[M] 2468.320[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.80929 PPL=16.59817 val epoch=1 loss=2.57963 PPL=13.19225 train epoch=1 loss=1.12952 PPL=3.09418 [trained] 0.0[H] 44.85550689299901[M] 2691.330[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.10408 PPL=8.19958 val epoch=1 loss=1.82278 PPL=6.18904 train epoch=1 loss=0.98625 PPL=2.68116 [trained] 0.0[H] 41.238258417447405[M] 2474.296[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.54932 PPL=12.79840 val epoch=1 loss=2.30628 PPL=10.03706 train epoch=1 loss=1.12400 PPL=3.07713 [trained] 0.0[H] 45.42449986537297[M] 2725.470[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.11911 PPL=8.32372 val epoch=1 loss=1.83168 PPL=6.24436 train epoch=1 loss=0.98346 PPL=2.67370 [trained] 0.0[H] 41.16821654637655[M] 2470.093[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.69997 PPL=14.87932 val epoch=1 loss=2.48527 PPL=12.00436 train epoch=1 loss=1.12319 PPL=3.07466 [trained] 0.0[H] 44.44852333863576[M] 2666.911[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=1.98369 PPL=7.26949 val epoch=1 loss=1.72509 PPL=5.61302 train epoch=1 loss=0.98371 PPL=2.67435 [trained] 0.0[H] 41.144457550843555[M] 2468.667[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.53665 PPL=12.63722 val epoch=1 loss=2.40133 PPL=11.03790 train epoch=1 loss=1.12251 PPL=3.07255 [trained] 0.0[H] 44.89564416805903[M] 2693.739[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_13.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_13.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.04189 PPL=7.70517 val epoch=1 loss=1.78558 PPL=5.96303 train epoch=1 loss=0.98738 PPL=2.68419 [trained] 0.0[H] 41.18701983690262[M] 2471.221[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_13.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_13.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.55545 PPL=12.87711 val epoch=1 loss=2.23607 PPL=9.35652 train epoch=1 loss=1.12599 PPL=3.08326 [trained] 0.0[H] 44.80764791965485[M] 2688.459[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_12.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_12.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=1.90556 PPL=6.72317 val epoch=1 loss=1.68472 PPL=5.39096 train epoch=1 loss=0.98539 PPL=2.67885 [trained] 0.0[H] 41.29294394652049[M] 2477.577[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_12.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_12.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.60585 PPL=13.54276 val epoch=1 loss=2.41645 PPL=11.20597 train epoch=1 loss=1.12524 PPL=3.08097 [trained] 0.0[H] 47.45244402488073[M] 2847.147[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_11.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_11.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=1.99018 PPL=7.31686 val epoch=1 loss=1.75003 PPL=5.75478 train epoch=1 loss=0.98361 PPL=2.67409 [trained] 0.0[H] 41.130028867721556[M] 2467.802[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_11.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_11.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.69124 PPL=14.74992 val epoch=1 loss=2.52559 PPL=12.49825 train epoch=1 loss=1.12024 PPL=3.06559 [trained] 0.0[H] 44.83212472200394[M] 2689.927[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_10.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_en_msp_10.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.01462 PPL=7.49789 val epoch=1 loss=1.79355 PPL=6.01077 train epoch=1 loss=0.98182 PPL=2.66931 [trained] 0.0[H] 41.305239562193556[M] 2478.314[sec] [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_10.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0) [train] ['/groups/gcc50582/MSP/mc4_ja_msp_10.jsonl'] [batch_size] 256 [accumulate_grad_batches] 2 val epoch=1 loss=2.34977 PPL=10.48312 val epoch=1 loss=2.21273 PPL=9.14063 train epoch=1 loss=1.11990 PPL=3.06456 [trained] 0.0[H] 44.55307694673538[M] 2673.185[sec]