kkuramitsu committed on
Commit
14f0f69
1 Parent(s): f576455

first version

Browse files
config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "mini2",
3
+ "architectures": [
4
+ "MT5ForConditionalGeneration"
5
+ ],
6
+ "d_ff": 1536,
7
+ "d_kv": 64,
8
+ "d_model": 384,
9
+ "decoder_start_token_id": 0,
10
+ "dense_act_fn": "gelu_new",
11
+ "dropout_rate": 0.1,
12
+ "eos_token_id": 1,
13
+ "feed_forward_proj": "gated-gelu",
14
+ "initializer_factor": 1.0,
15
+ "is_encoder_decoder": true,
16
+ "is_gated_act": true,
17
+ "layer_norm_epsilon": 1e-06,
18
+ "model_type": "mt5",
19
+ "num_decoder_layers": 9,
20
+ "num_heads": 9,
21
+ "num_layers": 9,
22
+ "pad_token_id": 0,
23
+ "relative_attention_max_distance": 128,
24
+ "relative_attention_num_buckets": 32,
25
+ "tie_word_embeddings": false,
26
+ "tokenizer_class": "T5Tokenizer",
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.26.1",
29
+ "use_cache": true,
30
+ "vocab_size": 32128
31
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "decoder_start_token_id": 0,
4
+ "eos_token_id": 1,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.26.1"
7
+ }
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a31e76db56aec04c81affe569cfb952c62ce5dea9f9c59c8593fdc08122d556
3
+ size 321795553
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "eos_token": "</s>",
3
+ "pad_token": "<pad>",
4
+ "unk_token": "<unk>"
5
+ }
spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:108ea5dbb232558d744aff5011d29b92a76751c210ad8560e6a65738c9630bdf
3
+ size 775057
tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [],
3
+ "eos_token": "</s>",
4
+ "extra_ids": 0,
5
+ "model_max_length": 1000000000000000019884624838656,
6
+ "name_or_path": "mini2",
7
+ "pad_token": "<pad>",
8
+ "sp_model_kwargs": {},
9
+ "special_tokens_map_file": "/home/acc12952oa/.cache/huggingface/hub/models--kkuramitsu--mt5np_mini12L/snapshots/e66bd8feec1522ea93ed176acb765f0c44f81526/special_tokens_map.json",
10
+ "tokenizer_class": "T5Tokenizer",
11
+ "unk_token": "<unk>"
12
+ }
train_log.txt ADDED
@@ -0,0 +1,287 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
2
+ [train] ['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl']
3
+ [batch_size] 256
4
+ [accumulate_grad_batches] 2
5
+ val epoch=1 loss=76.57510 PPL=1803619835086933004964966285967360.00000
6
+ val epoch=1 loss=3.55529 PPL=34.99814
7
+ train epoch=1 loss=3.58229 PPL=35.95572
8
+ [trained] 0.0[H] 41.41847747564316[M] 2485.109[sec]
9
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
10
+ [train] ['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl']
11
+ [batch_size] 256
12
+ [accumulate_grad_batches] 2
13
+ val epoch=1 loss=8.62410 PPL=5564.13037
14
+ val epoch=1 loss=3.48060 PPL=32.47906
15
+ train epoch=1 loss=2.05416 PPL=7.80031
16
+ [trained] 0.0[H] 45.51669268210729[M] 2731.002[sec]
17
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
18
+ [train] ['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl']
19
+ [batch_size] 256
20
+ [accumulate_grad_batches] 2
21
+ val epoch=1 loss=5.33357 PPL=207.17598
22
+ val epoch=1 loss=2.69441 PPL=14.79680
23
+ train epoch=1 loss=1.59283 PPL=4.91763
24
+ [trained] 0.0[H] 41.46436125040054[M] 2487.862[sec]
25
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
26
+ [train] ['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl']
27
+ [batch_size] 256
28
+ [accumulate_grad_batches] 2
29
+ val epoch=1 loss=5.03823 PPL=154.19640
30
+ val epoch=1 loss=3.20544 PPL=24.66638
31
+ train epoch=1 loss=1.61361 PPL=5.02092
32
+ [trained] 0.0[H] 45.251987334092455[M] 2715.119[sec]
33
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
34
+ [train] ['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl']
35
+ [batch_size] 256
36
+ [accumulate_grad_batches] 2
37
+ val epoch=1 loss=4.14368 PPL=63.03437
38
+ val epoch=1 loss=2.43705 PPL=11.43929
39
+ train epoch=1 loss=1.37564 PPL=3.95763
40
+ [trained] 0.0[H] 41.47204469839732[M] 2488.323[sec]
41
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
42
+ [train] ['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl']
43
+ [batch_size] 256
44
+ [accumulate_grad_batches] 2
45
+ val epoch=1 loss=4.28832 PPL=72.84402
46
+ val epoch=1 loss=3.02900 PPL=20.67647
47
+ train epoch=1 loss=1.48900 PPL=4.43266
48
+ [trained] 0.0[H] 45.57923027674357[M] 2734.754[sec]
49
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
50
+ [train] ['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl']
51
+ [batch_size] 256
52
+ [accumulate_grad_batches] 2
53
+ val epoch=1 loss=3.70968 PPL=40.84082
54
+ val epoch=1 loss=2.28623 PPL=9.83775
55
+ train epoch=1 loss=1.27682 PPL=3.58522
56
+ [trained] 0.0[H] 41.4678033153216[M] 2488.068[sec]
57
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
58
+ [train] ['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl']
59
+ [batch_size] 256
60
+ [accumulate_grad_batches] 2
61
+ val epoch=1 loss=3.83498 PPL=46.29248
62
+ val epoch=1 loss=2.79002 PPL=16.28134
63
+ train epoch=1 loss=1.41784 PPL=4.12821
64
+ [trained] 0.0[H] 45.09872035185496[M] 2705.923[sec]
65
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
66
+ [train] ['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl']
67
+ [batch_size] 256
68
+ [accumulate_grad_batches] 2
69
+ val epoch=1 loss=3.38932 PPL=29.64582
70
+ val epoch=1 loss=2.20471 PPL=9.06766
71
+ train epoch=1 loss=1.22078 PPL=3.38983
72
+ [trained] 0.0[H] 41.52079544067383[M] 2491.248[sec]
73
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
74
+ [train] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl']
75
+ [batch_size] 256
76
+ [accumulate_grad_batches] 2
77
+ val epoch=1 loss=3.77504 PPL=43.59935
78
+ val epoch=1 loss=2.75377 PPL=15.70175
79
+ train epoch=1 loss=1.37220 PPL=3.94404
80
+ [trained] 0.0[H] 45.1388335108757[M] 2708.330[sec]
81
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
82
+ [train] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl']
83
+ [batch_size] 256
84
+ [accumulate_grad_batches] 2
85
+ val epoch=1 loss=3.05649 PPL=21.25283
86
+ val epoch=1 loss=2.06552 PPL=7.88940
87
+ train epoch=1 loss=1.18322 PPL=3.26485
88
+ [trained] 0.0[H] 41.343922030925754[M] 2480.635[sec]
89
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
90
+ [train] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl']
91
+ [batch_size] 256
92
+ [accumulate_grad_batches] 2
93
+ val epoch=1 loss=3.63945 PPL=38.07071
94
+ val epoch=1 loss=2.74634 PPL=15.58548
95
+ train epoch=1 loss=1.34129 PPL=3.82397
96
+ [trained] 0.0[H] 44.50069724321365[M] 2670.042[sec]
97
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
98
+ [train] ['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl']
99
+ [batch_size] 256
100
+ [accumulate_grad_batches] 2
101
+ val epoch=1 loss=2.99219 PPL=19.92924
102
+ val epoch=1 loss=2.11169 PPL=8.26216
103
+ train epoch=1 loss=1.15597 PPL=3.17710
104
+ [trained] 0.0[H] 41.03153887987137[M] 2461.892[sec]
105
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
106
+ [train] ['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl']
107
+ [batch_size] 256
108
+ [accumulate_grad_batches] 2
109
+ val epoch=1 loss=3.69122 PPL=40.09384
110
+ val epoch=1 loss=2.79154 PPL=16.30605
111
+ train epoch=1 loss=1.31323 PPL=3.71816
112
+ [trained] 0.0[H] 45.27243907054265[M] 2716.346[sec]
113
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
114
+ [train] ['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl']
115
+ [batch_size] 256
116
+ [accumulate_grad_batches] 2
117
+ val epoch=1 loss=2.89017 PPL=17.99635
118
+ val epoch=1 loss=2.05285 PPL=7.79006
119
+ train epoch=1 loss=1.13480 PPL=3.11056
120
+ [trained] 0.0[H] 41.108288780848184[M] 2466.497[sec]
121
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
122
+ [train] ['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl']
123
+ [batch_size] 256
124
+ [accumulate_grad_batches] 2
125
+ val epoch=1 loss=3.43673 PPL=31.08512
126
+ val epoch=1 loss=2.64907 PPL=14.14095
127
+ train epoch=1 loss=1.29298 PPL=3.64363
128
+ [trained] 0.0[H] 44.97415177822113[M] 2698.449[sec]
129
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
130
+ [train] ['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl']
131
+ [batch_size] 256
132
+ [accumulate_grad_batches] 2
133
+ val epoch=1 loss=2.77340 PPL=16.01299
134
+ val epoch=1 loss=1.99160 PPL=7.32726
135
+ train epoch=1 loss=1.11733 PPL=3.05667
136
+ [trained] 0.0[H] 41.14810743729274[M] 2468.886[sec]
137
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
138
+ [train] ['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl']
139
+ [batch_size] 256
140
+ [accumulate_grad_batches] 2
141
+ val epoch=1 loss=3.39647 PPL=29.85852
142
+ val epoch=1 loss=2.36330 PPL=10.62593
143
+ train epoch=1 loss=1.27496 PPL=3.57856
144
+ [trained] 0.0[H] 44.73817230463028[M] 2684.290[sec]
145
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
146
+ [train] ['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl']
147
+ [batch_size] 256
148
+ [accumulate_grad_batches] 2
149
+ val epoch=1 loss=2.74622 PPL=15.58363
150
+ val epoch=1 loss=2.00091 PPL=7.39578
151
+ train epoch=1 loss=1.10269 PPL=3.01226
152
+ [trained] 0.0[H] 41.041836047172545[M] 2462.510[sec]
153
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini1', num_workers=4, output_path='mini1', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini1', top_k=0, warmup_steps=1, weight_decay=0.0)
154
+ [train] ['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl']
155
+ [batch_size] 256
156
+ [accumulate_grad_batches] 2
157
+ val epoch=1 loss=3.45477 PPL=31.65103
158
+ val epoch=1 loss=2.73762 PPL=15.45019
159
+ train epoch=1 loss=1.25830 PPL=3.51942
160
+ [trained] 0.0[H] 45.509643785158794[M] 2730.579[sec]
161
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
162
+ [train] ['/groups/gcc50582/MSP/mc4_en_msp_09.jsonl']
163
+ [batch_size] 256
164
+ [accumulate_grad_batches] 2
165
+ val epoch=1 loss=2.71395 PPL=15.08881
166
+ val epoch=1 loss=2.00103 PPL=7.39668
167
+ train epoch=1 loss=1.09001 PPL=2.97429
168
+ [trained] 0.0[H] 41.28162391185761[M] 2476.897[sec]
169
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
170
+ [train] ['/groups/gcc50582/MSP/mc4_ja_msp_09.jsonl']
171
+ [batch_size] 256
172
+ [accumulate_grad_batches] 2
173
+ val epoch=1 loss=3.26547 PPL=26.19238
174
+ val epoch=1 loss=2.69914 PPL=14.86692
175
+ train epoch=1 loss=1.24174 PPL=3.46165
176
+ [trained] 0.0[H] 45.42912646929423[M] 2725.748[sec]
177
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
178
+ [train] ['/groups/gcc50582/MSP/mc4_en_msp_08.jsonl']
179
+ [batch_size] 256
180
+ [accumulate_grad_batches] 2
181
+ val epoch=1 loss=2.68335 PPL=14.63405
182
+ val epoch=1 loss=2.00004 PPL=7.38934
183
+ train epoch=1 loss=1.07841 PPL=2.94001
184
+ [trained] 0.0[H] 41.447514899571736[M] 2486.851[sec]
185
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
186
+ [train] ['/groups/gcc50582/MSP/mc4_ja_msp_08.jsonl']
187
+ [batch_size] 256
188
+ [accumulate_grad_batches] 2
189
+ val epoch=1 loss=3.27115 PPL=26.34149
190
+ val epoch=1 loss=2.72310 PPL=15.22747
191
+ train epoch=1 loss=1.23098 PPL=3.42457
192
+ [trained] 0.0[H] 45.18751840988795[M] 2711.251[sec]
193
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
194
+ [train] ['/groups/gcc50582/MSP/mc4_en_msp_07.jsonl']
195
+ [batch_size] 256
196
+ [accumulate_grad_batches] 2
197
+ val epoch=1 loss=2.57214 PPL=13.09387
198
+ val epoch=1 loss=1.95365 PPL=7.05438
199
+ train epoch=1 loss=1.06908 PPL=2.91269
200
+ [trained] 0.0[H] 40.959261027971905[M] 2457.556[sec]
201
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
202
+ [train] ['/groups/gcc50582/MSP/mc4_ja_msp_07.jsonl']
203
+ [batch_size] 256
204
+ [accumulate_grad_batches] 2
205
+ val epoch=1 loss=3.18254 PPL=24.10785
206
+ val epoch=1 loss=2.68020 PPL=14.58803
207
+ train epoch=1 loss=1.22046 PPL=3.38875
208
+ [trained] 0.0[H] 45.264945685863495[M] 2715.897[sec]
209
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
210
+ [train] ['/groups/gcc50582/MSP/mc4_en_msp_06.jsonl']
211
+ [batch_size] 256
212
+ [accumulate_grad_batches] 2
213
+ val epoch=1 loss=2.57132 PPL=13.08305
214
+ val epoch=1 loss=1.94033 PPL=6.96107
215
+ train epoch=1 loss=1.06083 PPL=2.88875
216
+ [trained] 0.0[H] 41.00604948997498[M] 2460.363[sec]
217
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
218
+ [train] ['/groups/gcc50582/MSP/mc4_ja_msp_06.jsonl']
219
+ [batch_size] 256
220
+ [accumulate_grad_batches] 2
221
+ val epoch=1 loss=2.99903 PPL=20.06612
222
+ val epoch=1 loss=2.42283 PPL=11.27773
223
+ train epoch=1 loss=1.20782 PPL=3.34619
224
+ [trained] 0.0[H] 45.244081223011015[M] 2714.645[sec]
225
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
226
+ [train] ['/groups/gcc50582/MSP/mc4_en_msp_05.jsonl']
227
+ [batch_size] 256
228
+ [accumulate_grad_batches] 2
229
+ val epoch=1 loss=2.55513 PPL=12.87301
230
+ val epoch=1 loss=1.93933 PPL=6.95411
231
+ train epoch=1 loss=1.05271 PPL=2.86539
232
+ [trained] 0.0[H] 41.11795919736226[M] 2467.078[sec]
233
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
234
+ [train] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl']
235
+ [batch_size] 256
236
+ [accumulate_grad_batches] 2
237
+ [failed] ['/groups/gcc50582/MSP/mc4_ja_msp_05.jsonl'] [Errno 28] No space left on device
238
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
239
+ [train] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl']
240
+ [batch_size] 256
241
+ [accumulate_grad_batches] 2
242
+ [failed] ['/groups/gcc50582/MSP/mc4_en_msp_04.jsonl'] [Errno 28] No space left on device
243
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
244
+ [train] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl']
245
+ [batch_size] 256
246
+ [accumulate_grad_batches] 2
247
+ [failed] ['/groups/gcc50582/MSP/mc4_ja_msp_04.jsonl'] [Errno 28] No space left on device
248
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
249
+ [train] ['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl']
250
+ [batch_size] 256
251
+ [accumulate_grad_batches] 2
252
+ [failed] ['/groups/gcc50582/MSP/mc4_en_msp_03.jsonl'] [Errno 28] No space left on device
253
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
254
+ [train] ['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl']
255
+ [batch_size] 256
256
+ [accumulate_grad_batches] 2
257
+ [failed] ['/groups/gcc50582/MSP/mc4_ja_msp_03.jsonl'] [Errno 28] No space left on device
258
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
259
+ [train] ['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl']
260
+ [batch_size] 256
261
+ [accumulate_grad_batches] 2
262
+ [failed] ['/groups/gcc50582/MSP/mc4_en_msp_02.jsonl'] [Errno 28] No space left on device
263
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
264
+ [train] ['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl']
265
+ [batch_size] 256
266
+ [accumulate_grad_batches] 2
267
+ [failed] ['/groups/gcc50582/MSP/mc4_ja_msp_02.jsonl'] [Errno 28] No space left on device
268
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
269
+ [train] ['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl']
270
+ [batch_size] 256
271
+ [accumulate_grad_batches] 2
272
+ [failed] ['/groups/gcc50582/MSP/mc4_en_msp_01.jsonl'] [Errno 28] No space left on device
273
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
274
+ [train] ['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl']
275
+ [batch_size] 256
276
+ [accumulate_grad_batches] 2
277
+ [failed] ['/groups/gcc50582/MSP/mc4_ja_msp_01.jsonl'] [Errno 28] No space left on device
278
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
279
+ [train] ['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl']
280
+ [batch_size] 256
281
+ [accumulate_grad_batches] 2
282
+ [failed] ['/groups/gcc50582/MSP/mc4_en_msp_00.jsonl'] [Errno 28] No space left on device
283
+ [hparams] Namespace(accelerator=None, adam_epsilon=1e-08, batch_size=256, cache=False, checkpoint_path=None, desc='', devices=1, early_stopping=False, fast_dev_run=False, files=['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'], float32_matmul_precision=None, gradient_accumulation_steps=1, learning_rate=0.0003, max_epochs=1, max_grad_norm=1.0, max_hours=None, max_length=128, model_path='mini2', num_workers=4, output_path='mini2', precision='bf16', pretrain=False, score=None, score_file=None, seed=42, solver='adamw', source_max_length=128, step_batch_size=128, target_max_length=128, tokenizer_path='mini2', top_k=0, warmup_steps=1, weight_decay=0.0)
284
+ [train] ['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl']
285
+ [batch_size] 256
286
+ [accumulate_grad_batches] 2
287
+ [failed] ['/groups/gcc50582/MSP/mc4_ja_msp_00.jsonl'] [Errno 28] No space left on device