mt5-mini9L / train_log.txt

mini4 AdamW

01929dd about 1 year ago

No virus

72.4 kB

	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=76.57510 PPL=1803619835086933004964966285967360.00000
	val epoch=1 loss=3.55529 PPL=34.99814
	train epoch=1 loss=3.58229 PPL=35.95572
	[trained] 0.0[H] 41.41847747564316[M] 2485.109[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=8.62410 PPL=5564.13037
	val epoch=1 loss=3.48060 PPL=32.47906
	train epoch=1 loss=2.05416 PPL=7.80031
	[trained] 0.0[H] 45.51669268210729[M] 2731.002[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=5.33357 PPL=207.17598
	val epoch=1 loss=2.69441 PPL=14.79680
	train epoch=1 loss=1.59283 PPL=4.91763
	[trained] 0.0[H] 41.46436125040054[M] 2487.862[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=5.03823 PPL=154.19640
	val epoch=1 loss=3.20544 PPL=24.66638
	train epoch=1 loss=1.61361 PPL=5.02092
	[trained] 0.0[H] 45.251987334092455[M] 2715.119[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=4.14368 PPL=63.03437
	val epoch=1 loss=2.43705 PPL=11.43929
	train epoch=1 loss=1.37564 PPL=3.95763
	[trained] 0.0[H] 41.47204469839732[M] 2488.323[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=4.28832 PPL=72.84402
	val epoch=1 loss=3.02900 PPL=20.67647
	train epoch=1 loss=1.48900 PPL=4.43266
	[trained] 0.0[H] 45.57923027674357[M] 2734.754[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.70968 PPL=40.84082
	val epoch=1 loss=2.28623 PPL=9.83775
	train epoch=1 loss=1.27682 PPL=3.58522
	[trained] 0.0[H] 41.4678033153216[M] 2488.068[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.83498 PPL=46.29248
	val epoch=1 loss=2.79002 PPL=16.28134
	train epoch=1 loss=1.41784 PPL=4.12821
	[trained] 0.0[H] 45.09872035185496[M] 2705.923[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.38932 PPL=29.64582
	val epoch=1 loss=2.20471 PPL=9.06766
	train epoch=1 loss=1.22078 PPL=3.38983
	[trained] 0.0[H] 41.52079544067383[M] 2491.248[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.77504 PPL=43.59935
	val epoch=1 loss=2.75377 PPL=15.70175
	train epoch=1 loss=1.37220 PPL=3.94404
	[trained] 0.0[H] 45.1388335108757[M] 2708.330[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.05649 PPL=21.25283
	val epoch=1 loss=2.06552 PPL=7.88940
	train epoch=1 loss=1.18322 PPL=3.26485
	[trained] 0.0[H] 41.343922030925754[M] 2480.635[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.63945 PPL=38.07071
	val epoch=1 loss=2.74634 PPL=15.58548
	train epoch=1 loss=1.34129 PPL=3.82397
	[trained] 0.0[H] 44.50069724321365[M] 2670.042[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.99219 PPL=19.92924
	val epoch=1 loss=2.11169 PPL=8.26216
	train epoch=1 loss=1.15597 PPL=3.17710
	[trained] 0.0[H] 41.03153887987137[M] 2461.892[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.69122 PPL=40.09384
	val epoch=1 loss=2.79154 PPL=16.30605
	train epoch=1 loss=1.31323 PPL=3.71816
	[trained] 0.0[H] 45.27243907054265[M] 2716.346[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.89017 PPL=17.99635
	val epoch=1 loss=2.05285 PPL=7.79006
	train epoch=1 loss=1.13480 PPL=3.11056
	[trained] 0.0[H] 41.108288780848184[M] 2466.497[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.43673 PPL=31.08512
	val epoch=1 loss=2.64907 PPL=14.14095
	train epoch=1 loss=1.29298 PPL=3.64363
	[trained] 0.0[H] 44.97415177822113[M] 2698.449[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.77340 PPL=16.01299
	val epoch=1 loss=1.99160 PPL=7.32726
	train epoch=1 loss=1.11733 PPL=3.05667
	[trained] 0.0[H] 41.14810743729274[M] 2468.886[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.39647 PPL=29.85852
	val epoch=1 loss=2.36330 PPL=10.62593
	train epoch=1 loss=1.27496 PPL=3.57856
	[trained] 0.0[H] 44.73817230463028[M] 2684.290[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.74622 PPL=15.58363
	val epoch=1 loss=2.00091 PPL=7.39578
	train epoch=1 loss=1.10269 PPL=3.01226
	[trained] 0.0[H] 41.041836047172545[M] 2462.510[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.45477 PPL=31.65103
	val epoch=1 loss=2.73762 PPL=15.45019
	train epoch=1 loss=1.25830 PPL=3.51942
	[trained] 0.0[H] 45.509643785158794[M] 2730.579[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.71395 PPL=15.08881
	val epoch=1 loss=2.00103 PPL=7.39668
	train epoch=1 loss=1.09001 PPL=2.97429
	[trained] 0.0[H] 41.28162391185761[M] 2476.897[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.26547 PPL=26.19238
	val epoch=1 loss=2.69914 PPL=14.86692
	train epoch=1 loss=1.24174 PPL=3.46165
	[trained] 0.0[H] 45.42912646929423[M] 2725.748[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.68335 PPL=14.63405
	val epoch=1 loss=2.00004 PPL=7.38934
	train epoch=1 loss=1.07841 PPL=2.94001
	[trained] 0.0[H] 41.447514899571736[M] 2486.851[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.27115 PPL=26.34149
	val epoch=1 loss=2.72310 PPL=15.22747
	train epoch=1 loss=1.23098 PPL=3.42457
	[trained] 0.0[H] 45.18751840988795[M] 2711.251[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.57214 PPL=13.09387
	val epoch=1 loss=1.95365 PPL=7.05438
	train epoch=1 loss=1.06908 PPL=2.91269
	[trained] 0.0[H] 40.959261027971905[M] 2457.556[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.18254 PPL=24.10785
	val epoch=1 loss=2.68020 PPL=14.58803
	train epoch=1 loss=1.22046 PPL=3.38875
	[trained] 0.0[H] 45.264945685863495[M] 2715.897[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.57132 PPL=13.08305
	val epoch=1 loss=1.94033 PPL=6.96107
	train epoch=1 loss=1.06083 PPL=2.88875
	[trained] 0.0[H] 41.00604948997498[M] 2460.363[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.99903 PPL=20.06612
	val epoch=1 loss=2.42283 PPL=11.27773
	train epoch=1 loss=1.20782 PPL=3.34619
	[trained] 0.0[H] 45.244081223011015[M] 2714.645[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.55513 PPL=12.87301
	val epoch=1 loss=1.93933 PPL=6.95411
	train epoch=1 loss=1.05271 PPL=2.86539
	[trained] 0.0[H] 41.11795919736226[M] 2467.078[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'] [Errno 28] No space left on device
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'] [Errno 28] No space left on device
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'] [Errno 28] No space left on device
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'] [Errno 28] No space left on device
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'] [Errno 28] No space left on device
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'] [Errno 28] No space left on device
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'] [Errno 28] No space left on device
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'] [Errno 28] No space left on device
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'] [Errno 28] No space left on device
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'] [Errno 28] No space left on device
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	[failed] ['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'] [Errno 28] No space left on device
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=1.95896 PPL=7.09197
	val epoch=1 loss=1.90212 PPL=6.70007
	train epoch=1 loss=1.02537 PPL=2.78812
	[trained] 0.0[H] 41.157410267988844[M] 2469.445[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.27824 PPL=26.52905
	val epoch=1 loss=2.65349 PPL=14.20354
	train epoch=1 loss=1.20492 PPL=3.33650
	[trained] 0.0[H] 45.46110556125641[M] 2727.666[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.57661 PPL=13.15245
	val epoch=1 loss=1.92245 PPL=6.83766
	train epoch=1 loss=1.02811 PPL=2.79576
	[trained] 0.0[H] 41.425186324119565[M] 2485.511[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.20961 PPL=24.76951
	val epoch=1 loss=2.68262 PPL=14.62342
	train epoch=1 loss=1.19228 PPL=3.29459
	[trained] 0.0[H] 45.62885602712631[M] 2737.731[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.46379 PPL=11.74931
	val epoch=1 loss=1.88820 PPL=6.60748
	train epoch=1 loss=1.02569 PPL=2.78903
	[trained] 0.0[H] 41.40251029332479[M] 2484.151[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.24453 PPL=25.64962
	val epoch=1 loss=2.63306 PPL=13.91623
	train epoch=1 loss=1.18417 PPL=3.26797
	[trained] 0.0[H] 45.55619955062866[M] 2733.372[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.47150 PPL=11.84021
	val epoch=1 loss=1.87957 PPL=6.55069
	train epoch=1 loss=1.02233 PPL=2.77967
	[trained] 0.0[H] 41.72278196414312[M] 2503.367[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.07940 PPL=21.74541
	val epoch=1 loss=2.38436 PPL=10.85216
	train epoch=1 loss=1.17423 PPL=3.23566
	[trained] 0.0[H] 45.18255339066187[M] 2710.953[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.48200 PPL=11.96519
	val epoch=1 loss=1.88596 PPL=6.59266
	train epoch=1 loss=1.01817 PPL=2.76812
	[trained] 0.0[H] 41.02570736805598[M] 2461.542[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.28042 PPL=26.58687
	val epoch=1 loss=2.53102 PPL=12.56630
	train epoch=1 loss=1.17467 PPL=3.23707
	[trained] 0.0[H] 45.04093019167582[M] 2702.456[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.34633 PPL=10.44713
	val epoch=1 loss=1.78546 PPL=5.96234
	train epoch=1 loss=1.02161 PPL=2.77766
	[trained] 0.0[H] 40.93621168136597[M] 2456.173[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.90283 PPL=18.22571
	val epoch=1 loss=2.43509 PPL=11.41683
	train epoch=1 loss=1.16996 PPL=3.22188
	[trained] 0.0[H] 45.014744373162586[M] 2700.885[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.38792 PPL=10.89079
	val epoch=1 loss=1.87885 PPL=6.54596
	train epoch=1 loss=1.01714 PPL=2.76527
	[trained] 0.0[H] 40.9392077644666[M] 2456.352[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=3.02356 PPL=20.56429
	val epoch=1 loss=2.60609 PPL=13.54600
	train epoch=1 loss=1.16236 PPL=3.19748
	[trained] 0.0[H] 44.92984497149785[M] 2695.791[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.38416 PPL=10.84993
	val epoch=1 loss=1.85154 PPL=6.36963
	train epoch=1 loss=1.01303 PPL=2.75394
	[trained] 0.0[H] 40.939160716533664[M] 2456.350[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.99403 PPL=19.96596
	val epoch=1 loss=2.40907 PPL=11.12363
	train epoch=1 loss=1.15822 PPL=3.18425
	[trained] 0.0[H] 45.009803128242496[M] 2700.588[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.31114 PPL=10.08587
	val epoch=1 loss=1.80426 PPL=6.07548
	train epoch=1 loss=1.00900 PPL=2.74285
	[trained] 0.0[H] 40.89026815891266[M] 2453.416[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.62659 PPL=13.82649
	val epoch=1 loss=2.22096 PPL=9.21613
	train epoch=1 loss=1.15455 PPL=3.17260
	[trained] 0.0[H] 45.08568317492803[M] 2705.141[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.33120 PPL=10.29027
	val epoch=1 loss=1.82923 PPL=6.22906
	train epoch=1 loss=1.00533 PPL=2.73282
	[trained] 0.0[H] 41.071046415964766[M] 2464.263[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini3', num_workers=4, output_path='mini3', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini3', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.96448 PPL=19.38460
	val epoch=1 loss=2.56859 PPL=13.04741
	train epoch=1 loss=1.14901 PPL=3.15508
	[trained] 0.0[H] 45.149318718910216[M] 2708.959[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.31578 PPL=10.13282
	val epoch=1 loss=1.84484 PPL=6.32710
	train epoch=1 loss=0.99815 PPL=2.71326
	[trained] 0.0[H] 41.47976658344269[M] 2488.786[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.79162 PPL=16.30747
	val epoch=1 loss=2.55381 PPL=12.85597
	train epoch=1 loss=1.13298 PPL=3.10490
	[trained] 0.0[H] 44.85936383008957[M] 2691.562[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.19775 PPL=9.00472
	val epoch=1 loss=1.86080 PPL=6.42891
	train epoch=1 loss=0.99177 PPL=2.69602
	[trained] 0.0[H] 41.247025799751285[M] 2474.822[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.88427 PPL=17.89051
	val epoch=1 loss=2.61712 PPL=13.69621
	train epoch=1 loss=1.13177 PPL=3.10115
	[trained] 0.0[H] 44.970662931601204[M] 2698.240[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.13605 PPL=8.46593
	val epoch=1 loss=1.82511 PPL=6.20346
	train epoch=1 loss=0.98874 PPL=2.68785
	[trained] 0.0[H] 41.13867333332698[M] 2468.320[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.80929 PPL=16.59817
	val epoch=1 loss=2.57963 PPL=13.19225
	train epoch=1 loss=1.12952 PPL=3.09418
	[trained] 0.0[H] 44.85550689299901[M] 2691.330[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.10408 PPL=8.19958
	val epoch=1 loss=1.82278 PPL=6.18904
	train epoch=1 loss=0.98625 PPL=2.68116
	[trained] 0.0[H] 41.238258417447405[M] 2474.296[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.54932 PPL=12.79840
	val epoch=1 loss=2.30628 PPL=10.03706
	train epoch=1 loss=1.12400 PPL=3.07713
	[trained] 0.0[H] 45.42449986537297[M] 2725.470[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.11911 PPL=8.32372
	val epoch=1 loss=1.83168 PPL=6.24436
	train epoch=1 loss=0.98346 PPL=2.67370
	[trained] 0.0[H] 41.16821654637655[M] 2470.093[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.69997 PPL=14.87932
	val epoch=1 loss=2.48527 PPL=12.00436
	train epoch=1 loss=1.12319 PPL=3.07466
	[trained] 0.0[H] 44.44852333863576[M] 2666.911[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=1.98369 PPL=7.26949
	val epoch=1 loss=1.72509 PPL=5.61302
	train epoch=1 loss=0.98371 PPL=2.67435
	[trained] 0.0[H] 41.144457550843555[M] 2468.667[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.53665 PPL=12.63722
	val epoch=1 loss=2.40133 PPL=11.03790
	train epoch=1 loss=1.12251 PPL=3.07255
	[trained] 0.0[H] 44.89564416805903[M] 2693.739[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_13.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_13.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.04189 PPL=7.70517
	val epoch=1 loss=1.78558 PPL=5.96303
	train epoch=1 loss=0.98738 PPL=2.68419
	[trained] 0.0[H] 41.18701983690262[M] 2471.221[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_13.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_13.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.55545 PPL=12.87711
	val epoch=1 loss=2.23607 PPL=9.35652
	train epoch=1 loss=1.12599 PPL=3.08326
	[trained] 0.0[H] 44.80764791965485[M] 2688.459[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_12.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_12.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=1.90556 PPL=6.72317
	val epoch=1 loss=1.68472 PPL=5.39096
	train epoch=1 loss=0.98539 PPL=2.67885
	[trained] 0.0[H] 41.29294394652049[M] 2477.577[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_12.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_12.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.60585 PPL=13.54276
	val epoch=1 loss=2.41645 PPL=11.20597
	train epoch=1 loss=1.12524 PPL=3.08097
	[trained] 0.0[H] 47.45244402488073[M] 2847.147[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_11.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_11.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=1.99018 PPL=7.31686
	val epoch=1 loss=1.75003 PPL=5.75478
	train epoch=1 loss=0.98361 PPL=2.67409
	[trained] 0.0[H] 41.130028867721556[M] 2467.802[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_11.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_11.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.69124 PPL=14.74992
	val epoch=1 loss=2.52559 PPL=12.49825
	train epoch=1 loss=1.12024 PPL=3.06559
	[trained] 0.0[H] 44.83212472200394[M] 2689.927[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_10.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_en_msp_10.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.01462 PPL=7.49789
	val epoch=1 loss=1.79355 PPL=6.01077
	train epoch=1 loss=0.98182 PPL=2.66931
	[trained] 0.0[H] 41.305239562193556[M] 2478.314[sec]
	[hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_10.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0002, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini4', num_workers=4, output_path='mini4', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini4', top_k=0, warmup_steps=1, weight_decay=0.0)
	[train] ['/groups/gcc50582/MSP/mc4_ja_msp_10.jsonl']
	[batch_size] 256
	[accumulate_grad_batches] 2
	val epoch=1 loss=2.34977 PPL=10.48312
	val epoch=1 loss=2.21273 PPL=9.14063
	train epoch=1 loss=1.11990 PPL=3.06456
	[trained] 0.0[H] 44.55307694673538[M] 2673.185[sec]