pszemraj committed on
Commit
f60ce31
1 Parent(s): b7c66d7

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - generated_from_trainer
4
+ metrics:
5
+ - accuracy
6
+ ---
7
+
8
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
9
+ should probably proofread and complete it, then remove this comment. -->
10
+
11
+ # griffin-1024-c3t-8layer-simple_wikipedia_LM-vN
12
+
13
+ This model is a fine-tuned version of the local checkpoint `./griffin-1024-c3t-8layer` on an unknown dataset.
14
+ It achieves the following results on the evaluation set:
15
+ - Loss: 4.1928
16
+ - Accuracy: 0.4084
17
+
18
+ ## Model description
19
+
20
+ More information needed
21
+
22
+ ## Intended uses & limitations
23
+
24
+ More information needed
25
+
26
+ ## Training and evaluation data
27
+
28
+ More information needed
29
+
30
+ ## Training procedure
31
+
32
+ ### Training hyperparameters
33
+
34
+ The following hyperparameters were used during training:
35
+ - learning_rate: 0.0003
36
+ - train_batch_size: 4
37
+ - eval_batch_size: 4
38
+ - seed: 80085
39
+ - gradient_accumulation_steps: 32
40
+ - total_train_batch_size: 128
41
+ - optimizer: Adam with betas=(0.9,0.99) and epsilon=1e-07
42
+ - lr_scheduler_type: constant_with_warmup
43
+ - lr_scheduler_warmup_ratio: 0.05
44
+ - num_epochs: 2.0
45
+
46
+ ### Training results
47
+
48
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
49
+ |:-------------:|:------:|:----:|:---------------:|:--------:|
50
+ | 13.2525 | 0.2548 | 100 | 11.9768 | 0.0131 |
51
+ | 8.8873 | 0.5095 | 200 | 8.0127 | 0.0357 |
52
+ | 7.2457 | 0.7643 | 300 | 6.4508 | 0.0512 |
53
+ | 6.3152 | 1.0190 | 400 | 5.6163 | 0.0460 |
54
+ | 5.5586 | 1.2738 | 500 | 4.7645 | 0.3650 |
55
+ | 5.2936 | 1.5285 | 600 | 4.3919 | 0.3934 |
56
+ | 4.8839 | 1.7833 | 700 | 4.1928 | 0.4084 |
57
+
58
+
59
+ ### Framework versions
60
+
61
+ - Transformers 4.40.1
62
+ - Pytorch 2.2.0+cu121
63
+ - Datasets 2.19.0
64
+ - Tokenizers 0.19.1
checkpoint-700/config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_block_types": [
3
+ "recurrent",
4
+ "recurrent",
5
+ "attention"
6
+ ],
7
+ "_name_or_path": "./griffin-1024-c3t-8layer",
8
+ "architectures": [
9
+ "RecurrentGemmaForCausalLM"
10
+ ],
11
+ "attention_bias": false,
12
+ "attention_dropout": 0.0,
13
+ "attention_window_size": 2048,
14
+ "block_types": [
15
+ "recurrent",
16
+ "recurrent",
17
+ "attention"
18
+ ],
19
+ "bos_token_id": 0,
20
+ "conv1d_width": 4,
21
+ "embeddings_scale_by_sqrt_dim": true,
22
+ "eos_token_id": 0,
23
+ "final_w_init_variance_scale": 0.25,
24
+ "head_dim": 128,
25
+ "hidden_activation": "gelu_pytorch_tanh",
26
+ "hidden_size": 1024,
27
+ "intermediate_size": 6144,
28
+ "logits_soft_cap": 30.0,
29
+ "lru_width": 1024,
30
+ "model_type": "recurrent_gemma",
31
+ "num_attention_heads": 8,
32
+ "num_hidden_layers": 8,
33
+ "num_key_value_heads": 2,
34
+ "pad_token_id": 0,
35
+ "partial_rotary_factor": 0.5,
36
+ "rms_norm_eps": 1e-06,
37
+ "rope_theta": 10000.0,
38
+ "torch_dtype": "float32",
39
+ "transformers_version": "4.40.1",
40
+ "use_cache": true,
41
+ "vocab_size": 65024,
42
+ "w_init_variance_scale": 0.01
43
+ }
checkpoint-700/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 0,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.40.1"
7
+ }
checkpoint-700/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-700/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:293c092a8f8799e6300d498d53a2ca8e779bea567aebb76936ccee19e8207577
3
+ size 671684224
checkpoint-700/special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<EOT>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<EOT>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<EOT>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
checkpoint-700/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-700/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "0": {
5
+ "content": "<EOT>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "1": {
13
+ "content": "<META>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "2": {
21
+ "content": "<META_START>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "3": {
29
+ "content": "<META_END>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "4": {
37
+ "content": "<SOS>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ }
44
+ },
45
+ "bos_token": "<EOT>",
46
+ "clean_up_tokenization_spaces": true,
47
+ "eos_token": "<EOT>",
48
+ "model_max_length": 200000,
49
+ "tokenizer_class": "GPT2Tokenizer",
50
+ "unk_token": "<EOT>"
51
+ }
checkpoint-700/trainer_state.json ADDED
@@ -0,0 +1,1064 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.7832975081601783,
5
+ "eval_steps": 100,
6
+ "global_step": 700,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.012737839344001274,
13
+ "grad_norm": 7.432073593139648,
14
+ "learning_rate": 3.75e-05,
15
+ "loss": 37.7163,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.02547567868800255,
20
+ "grad_norm": 3.020324945449829,
21
+ "learning_rate": 7.5e-05,
22
+ "loss": 33.9598,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.03821351803200382,
27
+ "grad_norm": 1.8713821172714233,
28
+ "learning_rate": 0.0001125,
29
+ "loss": 31.1454,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.0509513573760051,
34
+ "grad_norm": 1.4051786661148071,
35
+ "learning_rate": 0.00015,
36
+ "loss": 28.2185,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.06368919672000636,
41
+ "grad_norm": 1.191188931465149,
42
+ "learning_rate": 0.00018749999999999998,
43
+ "loss": 26.7731,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.07642703606400764,
48
+ "grad_norm": 1.0205295085906982,
49
+ "learning_rate": 0.000225,
50
+ "loss": 24.2433,
51
+ "step": 30
52
+ },
53
+ {
54
+ "epoch": 0.08916487540800892,
55
+ "grad_norm": 0.9422863125801086,
56
+ "learning_rate": 0.0002625,
57
+ "loss": 22.2937,
58
+ "step": 35
59
+ },
60
+ {
61
+ "epoch": 0.1019027147520102,
62
+ "grad_norm": 0.789577305316925,
63
+ "learning_rate": 0.0003,
64
+ "loss": 20.3004,
65
+ "step": 40
66
+ },
67
+ {
68
+ "epoch": 0.11464055409601147,
69
+ "grad_norm": 0.74051433801651,
70
+ "learning_rate": 0.0003,
71
+ "loss": 18.6344,
72
+ "step": 45
73
+ },
74
+ {
75
+ "epoch": 0.12737839344001273,
76
+ "grad_norm": 0.6285977959632874,
77
+ "learning_rate": 0.0003,
78
+ "loss": 17.597,
79
+ "step": 50
80
+ },
81
+ {
82
+ "epoch": 0.140116232784014,
83
+ "grad_norm": 0.5593317747116089,
84
+ "learning_rate": 0.0003,
85
+ "loss": 16.7403,
86
+ "step": 55
87
+ },
88
+ {
89
+ "epoch": 0.15285407212801527,
90
+ "grad_norm": 0.49293985962867737,
91
+ "learning_rate": 0.0003,
92
+ "loss": 16.0871,
93
+ "step": 60
94
+ },
95
+ {
96
+ "epoch": 0.16559191147201657,
97
+ "grad_norm": 0.4442497789859772,
98
+ "learning_rate": 0.0003,
99
+ "loss": 15.8307,
100
+ "step": 65
101
+ },
102
+ {
103
+ "epoch": 0.17832975081601785,
104
+ "grad_norm": 0.4205341637134552,
105
+ "learning_rate": 0.0003,
106
+ "loss": 15.15,
107
+ "step": 70
108
+ },
109
+ {
110
+ "epoch": 0.19106759016001912,
111
+ "grad_norm": 0.4151783883571625,
112
+ "learning_rate": 0.0003,
113
+ "loss": 15.0298,
114
+ "step": 75
115
+ },
116
+ {
117
+ "epoch": 0.2038054295040204,
118
+ "grad_norm": 0.407552033662796,
119
+ "learning_rate": 0.0003,
120
+ "loss": 14.694,
121
+ "step": 80
122
+ },
123
+ {
124
+ "epoch": 0.21654326884802166,
125
+ "grad_norm": 0.39507508277893066,
126
+ "learning_rate": 0.0003,
127
+ "loss": 14.1493,
128
+ "step": 85
129
+ },
130
+ {
131
+ "epoch": 0.22928110819202294,
132
+ "grad_norm": 0.4005606770515442,
133
+ "learning_rate": 0.0003,
134
+ "loss": 13.8652,
135
+ "step": 90
136
+ },
137
+ {
138
+ "epoch": 0.2420189475360242,
139
+ "grad_norm": 0.38723692297935486,
140
+ "learning_rate": 0.0003,
141
+ "loss": 13.4051,
142
+ "step": 95
143
+ },
144
+ {
145
+ "epoch": 0.25475678688002545,
146
+ "grad_norm": 0.36756670475006104,
147
+ "learning_rate": 0.0003,
148
+ "loss": 13.2525,
149
+ "step": 100
150
+ },
151
+ {
152
+ "epoch": 0.25475678688002545,
153
+ "eval_accuracy": 0.013114369501466275,
154
+ "eval_loss": 11.976838111877441,
155
+ "eval_runtime": 14.6659,
156
+ "eval_samples_per_second": 17.046,
157
+ "eval_steps_per_second": 4.296,
158
+ "step": 100
159
+ },
160
+ {
161
+ "epoch": 0.26749462622402675,
162
+ "grad_norm": 0.3957444727420807,
163
+ "learning_rate": 0.0003,
164
+ "loss": 13.0266,
165
+ "step": 105
166
+ },
167
+ {
168
+ "epoch": 0.280232465568028,
169
+ "grad_norm": 0.3462829887866974,
170
+ "learning_rate": 0.0003,
171
+ "loss": 12.5453,
172
+ "step": 110
173
+ },
174
+ {
175
+ "epoch": 0.2929703049120293,
176
+ "grad_norm": 0.33832067251205444,
177
+ "learning_rate": 0.0003,
178
+ "loss": 12.4192,
179
+ "step": 115
180
+ },
181
+ {
182
+ "epoch": 0.30570814425603055,
183
+ "grad_norm": 0.3296329975128174,
184
+ "learning_rate": 0.0003,
185
+ "loss": 12.16,
186
+ "step": 120
187
+ },
188
+ {
189
+ "epoch": 0.31844598360003185,
190
+ "grad_norm": 0.34072279930114746,
191
+ "learning_rate": 0.0003,
192
+ "loss": 11.9492,
193
+ "step": 125
194
+ },
195
+ {
196
+ "epoch": 0.33118382294403315,
197
+ "grad_norm": 0.317059725522995,
198
+ "learning_rate": 0.0003,
199
+ "loss": 11.7652,
200
+ "step": 130
201
+ },
202
+ {
203
+ "epoch": 0.3439216622880344,
204
+ "grad_norm": 0.32677391171455383,
205
+ "learning_rate": 0.0003,
206
+ "loss": 11.4164,
207
+ "step": 135
208
+ },
209
+ {
210
+ "epoch": 0.3566595016320357,
211
+ "grad_norm": 0.34426912665367126,
212
+ "learning_rate": 0.0003,
213
+ "loss": 11.2546,
214
+ "step": 140
215
+ },
216
+ {
217
+ "epoch": 0.36939734097603694,
218
+ "grad_norm": 0.3237800896167755,
219
+ "learning_rate": 0.0003,
220
+ "loss": 10.9715,
221
+ "step": 145
222
+ },
223
+ {
224
+ "epoch": 0.38213518032003824,
225
+ "grad_norm": 0.3214079439640045,
226
+ "learning_rate": 0.0003,
227
+ "loss": 10.612,
228
+ "step": 150
229
+ },
230
+ {
231
+ "epoch": 0.3948730196640395,
232
+ "grad_norm": 0.3305688500404358,
233
+ "learning_rate": 0.0003,
234
+ "loss": 10.4704,
235
+ "step": 155
236
+ },
237
+ {
238
+ "epoch": 0.4076108590080408,
239
+ "grad_norm": 0.3146178722381592,
240
+ "learning_rate": 0.0003,
241
+ "loss": 10.1443,
242
+ "step": 160
243
+ },
244
+ {
245
+ "epoch": 0.420348698352042,
246
+ "grad_norm": 0.3035760521888733,
247
+ "learning_rate": 0.0003,
248
+ "loss": 10.1132,
249
+ "step": 165
250
+ },
251
+ {
252
+ "epoch": 0.4330865376960433,
253
+ "grad_norm": 0.3146466910839081,
254
+ "learning_rate": 0.0003,
255
+ "loss": 9.7969,
256
+ "step": 170
257
+ },
258
+ {
259
+ "epoch": 0.4458243770400446,
260
+ "grad_norm": 0.3030209243297577,
261
+ "learning_rate": 0.0003,
262
+ "loss": 9.6162,
263
+ "step": 175
264
+ },
265
+ {
266
+ "epoch": 0.4585622163840459,
267
+ "grad_norm": 0.3140353560447693,
268
+ "learning_rate": 0.0003,
269
+ "loss": 9.4377,
270
+ "step": 180
271
+ },
272
+ {
273
+ "epoch": 0.4713000557280471,
274
+ "grad_norm": 0.2986455261707306,
275
+ "learning_rate": 0.0003,
276
+ "loss": 9.1966,
277
+ "step": 185
278
+ },
279
+ {
280
+ "epoch": 0.4840378950720484,
281
+ "grad_norm": 0.30883148312568665,
282
+ "learning_rate": 0.0003,
283
+ "loss": 9.1145,
284
+ "step": 190
285
+ },
286
+ {
287
+ "epoch": 0.49677573441604966,
288
+ "grad_norm": 0.29901790618896484,
289
+ "learning_rate": 0.0003,
290
+ "loss": 8.9888,
291
+ "step": 195
292
+ },
293
+ {
294
+ "epoch": 0.5095135737600509,
295
+ "grad_norm": 0.30676862597465515,
296
+ "learning_rate": 0.0003,
297
+ "loss": 8.8873,
298
+ "step": 200
299
+ },
300
+ {
301
+ "epoch": 0.5095135737600509,
302
+ "eval_accuracy": 0.0356871945259042,
303
+ "eval_loss": 8.012660026550293,
304
+ "eval_runtime": 14.5542,
305
+ "eval_samples_per_second": 17.177,
306
+ "eval_steps_per_second": 4.329,
307
+ "step": 200
308
+ },
309
+ {
310
+ "epoch": 0.5222514131040522,
311
+ "grad_norm": 0.3067306876182556,
312
+ "learning_rate": 0.0003,
313
+ "loss": 8.788,
314
+ "step": 205
315
+ },
316
+ {
317
+ "epoch": 0.5349892524480535,
318
+ "grad_norm": 0.30200693011283875,
319
+ "learning_rate": 0.0003,
320
+ "loss": 8.7241,
321
+ "step": 210
322
+ },
323
+ {
324
+ "epoch": 0.5477270917920548,
325
+ "grad_norm": 0.3111984431743622,
326
+ "learning_rate": 0.0003,
327
+ "loss": 8.4474,
328
+ "step": 215
329
+ },
330
+ {
331
+ "epoch": 0.560464931136056,
332
+ "grad_norm": 0.3096470534801483,
333
+ "learning_rate": 0.0003,
334
+ "loss": 8.4588,
335
+ "step": 220
336
+ },
337
+ {
338
+ "epoch": 0.5732027704800573,
339
+ "grad_norm": 0.28584450483322144,
340
+ "learning_rate": 0.0003,
341
+ "loss": 8.1215,
342
+ "step": 225
343
+ },
344
+ {
345
+ "epoch": 0.5859406098240586,
346
+ "grad_norm": 0.2976541817188263,
347
+ "learning_rate": 0.0003,
348
+ "loss": 8.2402,
349
+ "step": 230
350
+ },
351
+ {
352
+ "epoch": 0.5986784491680599,
353
+ "grad_norm": 0.28990888595581055,
354
+ "learning_rate": 0.0003,
355
+ "loss": 8.0817,
356
+ "step": 235
357
+ },
358
+ {
359
+ "epoch": 0.6114162885120611,
360
+ "grad_norm": 0.30657345056533813,
361
+ "learning_rate": 0.0003,
362
+ "loss": 8.1059,
363
+ "step": 240
364
+ },
365
+ {
366
+ "epoch": 0.6241541278560624,
367
+ "grad_norm": 0.2960628569126129,
368
+ "learning_rate": 0.0003,
369
+ "loss": 7.7854,
370
+ "step": 245
371
+ },
372
+ {
373
+ "epoch": 0.6368919672000637,
374
+ "grad_norm": 0.28521808981895447,
375
+ "learning_rate": 0.0003,
376
+ "loss": 7.9146,
377
+ "step": 250
378
+ },
379
+ {
380
+ "epoch": 0.649629806544065,
381
+ "grad_norm": 0.3004601001739502,
382
+ "learning_rate": 0.0003,
383
+ "loss": 7.7238,
384
+ "step": 255
385
+ },
386
+ {
387
+ "epoch": 0.6623676458880663,
388
+ "grad_norm": 0.2811897099018097,
389
+ "learning_rate": 0.0003,
390
+ "loss": 7.7869,
391
+ "step": 260
392
+ },
393
+ {
394
+ "epoch": 0.6751054852320675,
395
+ "grad_norm": 0.31247615814208984,
396
+ "learning_rate": 0.0003,
397
+ "loss": 7.6116,
398
+ "step": 265
399
+ },
400
+ {
401
+ "epoch": 0.6878433245760688,
402
+ "grad_norm": 0.28785058856010437,
403
+ "learning_rate": 0.0003,
404
+ "loss": 7.6421,
405
+ "step": 270
406
+ },
407
+ {
408
+ "epoch": 0.7005811639200701,
409
+ "grad_norm": 0.3141111731529236,
410
+ "learning_rate": 0.0003,
411
+ "loss": 7.452,
412
+ "step": 275
413
+ },
414
+ {
415
+ "epoch": 0.7133190032640714,
416
+ "grad_norm": 0.2942105233669281,
417
+ "learning_rate": 0.0003,
418
+ "loss": 7.4557,
419
+ "step": 280
420
+ },
421
+ {
422
+ "epoch": 0.7260568426080726,
423
+ "grad_norm": 0.2928450107574463,
424
+ "learning_rate": 0.0003,
425
+ "loss": 7.5162,
426
+ "step": 285
427
+ },
428
+ {
429
+ "epoch": 0.7387946819520739,
430
+ "grad_norm": 0.28676503896713257,
431
+ "learning_rate": 0.0003,
432
+ "loss": 7.3625,
433
+ "step": 290
434
+ },
435
+ {
436
+ "epoch": 0.7515325212960752,
437
+ "grad_norm": 0.32866013050079346,
438
+ "learning_rate": 0.0003,
439
+ "loss": 7.2396,
440
+ "step": 295
441
+ },
442
+ {
443
+ "epoch": 0.7642703606400765,
444
+ "grad_norm": 0.2969712018966675,
445
+ "learning_rate": 0.0003,
446
+ "loss": 7.2457,
447
+ "step": 300
448
+ },
449
+ {
450
+ "epoch": 0.7642703606400765,
451
+ "eval_accuracy": 0.051225806451612906,
452
+ "eval_loss": 6.45078706741333,
453
+ "eval_runtime": 14.5738,
454
+ "eval_samples_per_second": 17.154,
455
+ "eval_steps_per_second": 4.323,
456
+ "step": 300
457
+ },
458
+ {
459
+ "epoch": 0.7770081999840777,
460
+ "grad_norm": 0.3022700548171997,
461
+ "learning_rate": 0.0003,
462
+ "loss": 7.0441,
463
+ "step": 305
464
+ },
465
+ {
466
+ "epoch": 0.789746039328079,
467
+ "grad_norm": 0.29694968461990356,
468
+ "learning_rate": 0.0003,
469
+ "loss": 7.0164,
470
+ "step": 310
471
+ },
472
+ {
473
+ "epoch": 0.8024838786720803,
474
+ "grad_norm": 0.33195170760154724,
475
+ "learning_rate": 0.0003,
476
+ "loss": 7.0554,
477
+ "step": 315
478
+ },
479
+ {
480
+ "epoch": 0.8152217180160816,
481
+ "grad_norm": 0.3369743824005127,
482
+ "learning_rate": 0.0003,
483
+ "loss": 6.9671,
484
+ "step": 320
485
+ },
486
+ {
487
+ "epoch": 0.8279595573600828,
488
+ "grad_norm": 0.3308833837509155,
489
+ "learning_rate": 0.0003,
490
+ "loss": 6.9425,
491
+ "step": 325
492
+ },
493
+ {
494
+ "epoch": 0.840697396704084,
495
+ "grad_norm": 0.33486923575401306,
496
+ "learning_rate": 0.0003,
497
+ "loss": 6.7649,
498
+ "step": 330
499
+ },
500
+ {
501
+ "epoch": 0.8534352360480854,
502
+ "grad_norm": 0.33329740166664124,
503
+ "learning_rate": 0.0003,
504
+ "loss": 6.7606,
505
+ "step": 335
506
+ },
507
+ {
508
+ "epoch": 0.8661730753920867,
509
+ "grad_norm": 0.29600027203559875,
510
+ "learning_rate": 0.0003,
511
+ "loss": 6.6892,
512
+ "step": 340
513
+ },
514
+ {
515
+ "epoch": 0.8789109147360878,
516
+ "grad_norm": 0.31691107153892517,
517
+ "learning_rate": 0.0003,
518
+ "loss": 6.6104,
519
+ "step": 345
520
+ },
521
+ {
522
+ "epoch": 0.8916487540800891,
523
+ "grad_norm": 0.31995242834091187,
524
+ "learning_rate": 0.0003,
525
+ "loss": 6.6987,
526
+ "step": 350
527
+ },
528
+ {
529
+ "epoch": 0.9043865934240904,
530
+ "grad_norm": 0.3355189859867096,
531
+ "learning_rate": 0.0003,
532
+ "loss": 6.5496,
533
+ "step": 355
534
+ },
535
+ {
536
+ "epoch": 0.9171244327680917,
537
+ "grad_norm": 0.34733301401138306,
538
+ "learning_rate": 0.0003,
539
+ "loss": 6.6299,
540
+ "step": 360
541
+ },
542
+ {
543
+ "epoch": 0.9298622721120929,
544
+ "grad_norm": 0.3098255693912506,
545
+ "learning_rate": 0.0003,
546
+ "loss": 6.5739,
547
+ "step": 365
548
+ },
549
+ {
550
+ "epoch": 0.9426001114560942,
551
+ "grad_norm": 0.38446882367134094,
552
+ "learning_rate": 0.0003,
553
+ "loss": 6.5487,
554
+ "step": 370
555
+ },
556
+ {
557
+ "epoch": 0.9553379508000955,
558
+ "grad_norm": 0.33057910203933716,
559
+ "learning_rate": 0.0003,
560
+ "loss": 6.4514,
561
+ "step": 375
562
+ },
563
+ {
564
+ "epoch": 0.9680757901440968,
565
+ "grad_norm": 0.3184017539024353,
566
+ "learning_rate": 0.0003,
567
+ "loss": 6.4215,
568
+ "step": 380
569
+ },
570
+ {
571
+ "epoch": 0.980813629488098,
572
+ "grad_norm": 0.33589789271354675,
573
+ "learning_rate": 0.0003,
574
+ "loss": 6.5542,
575
+ "step": 385
576
+ },
577
+ {
578
+ "epoch": 0.9935514688320993,
579
+ "grad_norm": 0.2836326062679291,
580
+ "learning_rate": 0.0003,
581
+ "loss": 6.3827,
582
+ "step": 390
583
+ },
584
+ {
585
+ "epoch": 1.0062893081761006,
586
+ "grad_norm": 0.3056611120700836,
587
+ "learning_rate": 0.0003,
588
+ "loss": 6.387,
589
+ "step": 395
590
+ },
591
+ {
592
+ "epoch": 1.0190271475201018,
593
+ "grad_norm": 0.2845809757709503,
594
+ "learning_rate": 0.0003,
595
+ "loss": 6.3152,
596
+ "step": 400
597
+ },
598
+ {
599
+ "epoch": 1.0190271475201018,
600
+ "eval_accuracy": 0.04596676441837732,
601
+ "eval_loss": 5.616324424743652,
602
+ "eval_runtime": 14.8427,
603
+ "eval_samples_per_second": 16.843,
604
+ "eval_steps_per_second": 4.245,
605
+ "step": 400
606
+ },
607
+ {
608
+ "epoch": 1.0317649868641032,
609
+ "grad_norm": 0.2871081531047821,
610
+ "learning_rate": 0.0003,
611
+ "loss": 6.3239,
612
+ "step": 405
613
+ },
614
+ {
615
+ "epoch": 1.0445028262081044,
616
+ "grad_norm": 0.37001457810401917,
617
+ "learning_rate": 0.0003,
618
+ "loss": 6.3937,
619
+ "step": 410
620
+ },
621
+ {
622
+ "epoch": 1.0572406655521058,
623
+ "grad_norm": 0.32665711641311646,
624
+ "learning_rate": 0.0003,
625
+ "loss": 6.2438,
626
+ "step": 415
627
+ },
628
+ {
629
+ "epoch": 1.069978504896107,
630
+ "grad_norm": 0.37014567852020264,
631
+ "learning_rate": 0.0003,
632
+ "loss": 6.1073,
633
+ "step": 420
634
+ },
635
+ {
636
+ "epoch": 1.0827163442401082,
637
+ "grad_norm": 0.42654964327812195,
638
+ "learning_rate": 0.0003,
639
+ "loss": 6.2657,
640
+ "step": 425
641
+ },
642
+ {
643
+ "epoch": 1.0954541835841096,
644
+ "grad_norm": 0.43892917037010193,
645
+ "learning_rate": 0.0003,
646
+ "loss": 5.9774,
647
+ "step": 430
648
+ },
649
+ {
650
+ "epoch": 1.1081920229281108,
651
+ "grad_norm": 0.40710192918777466,
652
+ "learning_rate": 0.0003,
653
+ "loss": 6.0797,
654
+ "step": 435
655
+ },
656
+ {
657
+ "epoch": 1.120929862272112,
658
+ "grad_norm": 0.3674974739551544,
659
+ "learning_rate": 0.0003,
660
+ "loss": 5.9008,
661
+ "step": 440
662
+ },
663
+ {
664
+ "epoch": 1.1336677016161134,
665
+ "grad_norm": 0.41214117407798767,
666
+ "learning_rate": 0.0003,
667
+ "loss": 5.9841,
668
+ "step": 445
669
+ },
670
+ {
671
+ "epoch": 1.1464055409601146,
672
+ "grad_norm": 0.7298715114593506,
673
+ "learning_rate": 0.0003,
674
+ "loss": 5.9017,
675
+ "step": 450
676
+ },
677
+ {
678
+ "epoch": 1.159143380304116,
679
+ "grad_norm": 0.4723041355609894,
680
+ "learning_rate": 0.0003,
681
+ "loss": 5.8628,
682
+ "step": 455
683
+ },
684
+ {
685
+ "epoch": 1.1718812196481172,
686
+ "grad_norm": 0.758711576461792,
687
+ "learning_rate": 0.0003,
688
+ "loss": 5.7863,
689
+ "step": 460
690
+ },
691
+ {
692
+ "epoch": 1.1846190589921184,
693
+ "grad_norm": 0.4319106936454773,
694
+ "learning_rate": 0.0003,
695
+ "loss": 5.7691,
696
+ "step": 465
697
+ },
698
+ {
699
+ "epoch": 1.1973568983361198,
700
+ "grad_norm": 0.43299469351768494,
701
+ "learning_rate": 0.0003,
702
+ "loss": 5.667,
703
+ "step": 470
704
+ },
705
+ {
706
+ "epoch": 1.210094737680121,
707
+ "grad_norm": 0.48413950204849243,
708
+ "learning_rate": 0.0003,
709
+ "loss": 5.5798,
710
+ "step": 475
711
+ },
712
+ {
713
+ "epoch": 1.2228325770241222,
714
+ "grad_norm": 0.41688182950019836,
715
+ "learning_rate": 0.0003,
716
+ "loss": 5.6484,
717
+ "step": 480
718
+ },
719
+ {
720
+ "epoch": 1.2355704163681236,
721
+ "grad_norm": 0.9052969813346863,
722
+ "learning_rate": 0.0003,
723
+ "loss": 5.5806,
724
+ "step": 485
725
+ },
726
+ {
727
+ "epoch": 1.2483082557121248,
728
+ "grad_norm": 0.9680259227752686,
729
+ "learning_rate": 0.0003,
730
+ "loss": 5.6531,
731
+ "step": 490
732
+ },
733
+ {
734
+ "epoch": 1.261046095056126,
735
+ "grad_norm": 0.5839616656303406,
736
+ "learning_rate": 0.0003,
737
+ "loss": 5.5656,
738
+ "step": 495
739
+ },
740
+ {
741
+ "epoch": 1.2737839344001274,
742
+ "grad_norm": 0.48688173294067383,
743
+ "learning_rate": 0.0003,
744
+ "loss": 5.5586,
745
+ "step": 500
746
+ },
747
+ {
748
+ "epoch": 1.2737839344001274,
749
+ "eval_accuracy": 0.3649931573802542,
750
+ "eval_loss": 4.76446008682251,
751
+ "eval_runtime": 14.7002,
752
+ "eval_samples_per_second": 17.007,
753
+ "eval_steps_per_second": 4.286,
754
+ "step": 500
755
+ },
756
+ {
757
+ "epoch": 1.2865217737441286,
758
+ "grad_norm": 0.4973750412464142,
759
+ "learning_rate": 0.0003,
760
+ "loss": 5.5474,
761
+ "step": 505
762
+ },
763
+ {
764
+ "epoch": 1.29925961308813,
765
+ "grad_norm": 0.4334980845451355,
766
+ "learning_rate": 0.0003,
767
+ "loss": 5.4498,
768
+ "step": 510
769
+ },
770
+ {
771
+ "epoch": 1.3119974524321312,
772
+ "grad_norm": 0.4760842025279999,
773
+ "learning_rate": 0.0003,
774
+ "loss": 5.4224,
775
+ "step": 515
776
+ },
777
+ {
778
+ "epoch": 1.3247352917761326,
779
+ "grad_norm": 0.5825368762016296,
780
+ "learning_rate": 0.0003,
781
+ "loss": 5.5087,
782
+ "step": 520
783
+ },
784
+ {
785
+ "epoch": 1.3374731311201338,
786
+ "grad_norm": 0.651641309261322,
787
+ "learning_rate": 0.0003,
788
+ "loss": 5.3186,
789
+ "step": 525
790
+ },
791
+ {
792
+ "epoch": 1.350210970464135,
793
+ "grad_norm": 0.5380859375,
794
+ "learning_rate": 0.0003,
795
+ "loss": 5.3928,
796
+ "step": 530
797
+ },
798
+ {
799
+ "epoch": 1.3629488098081364,
800
+ "grad_norm": 0.5173642635345459,
801
+ "learning_rate": 0.0003,
802
+ "loss": 5.351,
803
+ "step": 535
804
+ },
805
+ {
806
+ "epoch": 1.3756866491521376,
807
+ "grad_norm": 0.4927425682544708,
808
+ "learning_rate": 0.0003,
809
+ "loss": 5.2646,
810
+ "step": 540
811
+ },
812
+ {
813
+ "epoch": 1.3884244884961388,
814
+ "grad_norm": 0.6876756548881531,
815
+ "learning_rate": 0.0003,
816
+ "loss": 5.4045,
817
+ "step": 545
818
+ },
819
+ {
820
+ "epoch": 1.4011623278401402,
821
+ "grad_norm": 0.7293450832366943,
822
+ "learning_rate": 0.0003,
823
+ "loss": 5.4399,
824
+ "step": 550
825
+ },
826
+ {
827
+ "epoch": 1.4139001671841414,
828
+ "grad_norm": 0.4836059808731079,
829
+ "learning_rate": 0.0003,
830
+ "loss": 5.3522,
831
+ "step": 555
832
+ },
833
+ {
834
+ "epoch": 1.4266380065281425,
835
+ "grad_norm": 0.5378084778785706,
836
+ "learning_rate": 0.0003,
837
+ "loss": 5.1109,
838
+ "step": 560
839
+ },
840
+ {
841
+ "epoch": 1.439375845872144,
842
+ "grad_norm": 0.5663474202156067,
843
+ "learning_rate": 0.0003,
844
+ "loss": 5.3989,
845
+ "step": 565
846
+ },
847
+ {
848
+ "epoch": 1.4521136852161451,
849
+ "grad_norm": 0.6027519702911377,
850
+ "learning_rate": 0.0003,
851
+ "loss": 5.3082,
852
+ "step": 570
853
+ },
854
+ {
855
+ "epoch": 1.4648515245601466,
856
+ "grad_norm": 0.5912690758705139,
857
+ "learning_rate": 0.0003,
858
+ "loss": 5.3432,
859
+ "step": 575
860
+ },
861
+ {
862
+ "epoch": 1.4775893639041477,
863
+ "grad_norm": 0.5942875742912292,
864
+ "learning_rate": 0.0003,
865
+ "loss": 5.2603,
866
+ "step": 580
867
+ },
868
+ {
869
+ "epoch": 1.4903272032481492,
870
+ "grad_norm": 0.45755377411842346,
871
+ "learning_rate": 0.0003,
872
+ "loss": 5.1047,
873
+ "step": 585
874
+ },
875
+ {
876
+ "epoch": 1.5030650425921503,
877
+ "grad_norm": 0.6130331754684448,
878
+ "learning_rate": 0.0003,
879
+ "loss": 5.1628,
880
+ "step": 590
881
+ },
882
+ {
883
+ "epoch": 1.5158028819361515,
884
+ "grad_norm": 0.6434487700462341,
885
+ "learning_rate": 0.0003,
886
+ "loss": 5.164,
887
+ "step": 595
888
+ },
889
+ {
890
+ "epoch": 1.528540721280153,
891
+ "grad_norm": 0.919582724571228,
892
+ "learning_rate": 0.0003,
893
+ "loss": 5.2936,
894
+ "step": 600
895
+ },
896
+ {
897
+ "epoch": 1.528540721280153,
898
+ "eval_accuracy": 0.39341935483870966,
899
+ "eval_loss": 4.391851425170898,
900
+ "eval_runtime": 14.8627,
901
+ "eval_samples_per_second": 16.821,
902
+ "eval_steps_per_second": 4.239,
903
+ "step": 600
904
+ },
905
+ {
906
+ "epoch": 1.5412785606241541,
907
+ "grad_norm": 0.6150545477867126,
908
+ "learning_rate": 0.0003,
909
+ "loss": 5.0455,
910
+ "step": 605
911
+ },
912
+ {
913
+ "epoch": 1.5540163999681553,
914
+ "grad_norm": 0.5225240588188171,
915
+ "learning_rate": 0.0003,
916
+ "loss": 5.1175,
917
+ "step": 610
918
+ },
919
+ {
920
+ "epoch": 1.5667542393121567,
921
+ "grad_norm": 0.8378353714942932,
922
+ "learning_rate": 0.0003,
923
+ "loss": 5.1146,
924
+ "step": 615
925
+ },
926
+ {
927
+ "epoch": 1.579492078656158,
928
+ "grad_norm": 0.5006564855575562,
929
+ "learning_rate": 0.0003,
930
+ "loss": 4.9924,
931
+ "step": 620
932
+ },
933
+ {
934
+ "epoch": 1.5922299180001591,
935
+ "grad_norm": 0.7312870621681213,
936
+ "learning_rate": 0.0003,
937
+ "loss": 5.0733,
938
+ "step": 625
939
+ },
940
+ {
941
+ "epoch": 1.6049677573441605,
942
+ "grad_norm": 0.6706296801567078,
943
+ "learning_rate": 0.0003,
944
+ "loss": 4.9791,
945
+ "step": 630
946
+ },
947
+ {
948
+ "epoch": 1.6177055966881617,
949
+ "grad_norm": 0.5874515175819397,
950
+ "learning_rate": 0.0003,
951
+ "loss": 5.0827,
952
+ "step": 635
953
+ },
954
+ {
955
+ "epoch": 1.630443436032163,
956
+ "grad_norm": 0.6047885417938232,
957
+ "learning_rate": 0.0003,
958
+ "loss": 5.1284,
959
+ "step": 640
960
+ },
961
+ {
962
+ "epoch": 1.6431812753761643,
963
+ "grad_norm": 0.8195576667785645,
964
+ "learning_rate": 0.0003,
965
+ "loss": 5.0817,
966
+ "step": 645
967
+ },
968
+ {
969
+ "epoch": 1.6559191147201657,
970
+ "grad_norm": 0.8390661478042603,
971
+ "learning_rate": 0.0003,
972
+ "loss": 4.9869,
973
+ "step": 650
974
+ },
975
+ {
976
+ "epoch": 1.668656954064167,
977
+ "grad_norm": 0.6308897733688354,
978
+ "learning_rate": 0.0003,
979
+ "loss": 5.0157,
980
+ "step": 655
981
+ },
982
+ {
983
+ "epoch": 1.681394793408168,
984
+ "grad_norm": 0.9929732084274292,
985
+ "learning_rate": 0.0003,
986
+ "loss": 5.0349,
987
+ "step": 660
988
+ },
989
+ {
990
+ "epoch": 1.6941326327521695,
991
+ "grad_norm": 0.660764753818512,
992
+ "learning_rate": 0.0003,
993
+ "loss": 5.1106,
994
+ "step": 665
995
+ },
996
+ {
997
+ "epoch": 1.7068704720961707,
998
+ "grad_norm": 0.7146616578102112,
999
+ "learning_rate": 0.0003,
1000
+ "loss": 5.0494,
1001
+ "step": 670
1002
+ },
1003
+ {
1004
+ "epoch": 1.719608311440172,
1005
+ "grad_norm": 0.8408402800559998,
1006
+ "learning_rate": 0.0003,
1007
+ "loss": 4.9912,
1008
+ "step": 675
1009
+ },
1010
+ {
1011
+ "epoch": 1.7323461507841733,
1012
+ "grad_norm": 0.7403599619865417,
1013
+ "learning_rate": 0.0003,
1014
+ "loss": 4.8969,
1015
+ "step": 680
1016
+ },
1017
+ {
1018
+ "epoch": 1.7450839901281745,
1019
+ "grad_norm": 0.9758443832397461,
1020
+ "learning_rate": 0.0003,
1021
+ "loss": 4.9444,
1022
+ "step": 685
1023
+ },
1024
+ {
1025
+ "epoch": 1.7578218294721757,
1026
+ "grad_norm": 0.551741898059845,
1027
+ "learning_rate": 0.0003,
1028
+ "loss": 4.9441,
1029
+ "step": 690
1030
+ },
1031
+ {
1032
+ "epoch": 1.770559668816177,
1033
+ "grad_norm": 0.6962785720825195,
1034
+ "learning_rate": 0.0003,
1035
+ "loss": 4.9756,
1036
+ "step": 695
1037
+ },
1038
+ {
1039
+ "epoch": 1.7832975081601783,
1040
+ "grad_norm": 0.5543167591094971,
1041
+ "learning_rate": 0.0003,
1042
+ "loss": 4.8839,
1043
+ "step": 700
1044
+ },
1045
+ {
1046
+ "epoch": 1.7832975081601783,
1047
+ "eval_accuracy": 0.40842619745845554,
1048
+ "eval_loss": 4.192822456359863,
1049
+ "eval_runtime": 14.7931,
1050
+ "eval_samples_per_second": 16.9,
1051
+ "eval_steps_per_second": 4.259,
1052
+ "step": 700
1053
+ }
1054
+ ],
1055
+ "logging_steps": 5,
1056
+ "max_steps": 784,
1057
+ "num_input_tokens_seen": 0,
1058
+ "num_train_epochs": 2,
1059
+ "save_steps": 100,
1060
+ "total_flos": 5.578286899711181e+16,
1061
+ "train_batch_size": 4,
1062
+ "trial_name": null,
1063
+ "trial_params": null
1064
+ }
checkpoint-700/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:698e3be45e384522b83865d39a95c98084e8feccafe7e17ac91b0853d9a956a4
3
+ size 5176
checkpoint-700/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 0,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.40.1"
7
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:293c092a8f8799e6300d498d53a2ca8e779bea567aebb76936ccee19e8207577
3
  size 671684224
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0f05c2c379e62ba1f6aefd3101cab184a44bf05d79ac51787305f79f74706cb
3
  size 671684224