BaoLocTown committed on
Commit
a491beb
1 Parent(s): beaefdd

Model save

Browse files
README.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: hllj/meta-math-mistral-vi-math
3
+ tags:
4
+ - generated_from_trainer
5
+ model-index:
6
+ - name: sft-metamath-mistral-7b-vi-v1
7
+ results: []
8
+ ---
9
+
10
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
11
+ should probably proofread and complete it, then remove this comment. -->
12
+
13
+ # sft-metamath-mistral-7b-vi-v1
14
+
15
+ This model is a fine-tuned version of [hllj/meta-math-mistral-vi-math](https://huggingface.co/hllj/meta-math-mistral-vi-math) on an unknown dataset.
16
+ It achieves the following results on the evaluation set:
17
+ - Loss: 0.4947
18
+
19
+ ## Model description
20
+
21
+ More information needed
22
+
23
+ ## Intended uses & limitations
24
+
25
+ More information needed
26
+
27
+ ## Training and evaluation data
28
+
29
+ More information needed
30
+
31
+ ## Training procedure
32
+
33
+ ### Training hyperparameters
34
+
35
+ The following hyperparameters were used during training:
36
+ - learning_rate: 5e-05
37
+ - train_batch_size: 4
38
+ - eval_batch_size: 4
39
+ - seed: 42
40
+ - distributed_type: multi-GPU
41
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
42
+ - lr_scheduler_type: cosine
43
+ - lr_scheduler_warmup_ratio: 0.05
44
+ - num_epochs: 2
45
+ - mixed_precision_training: Native AMP
46
+
47
+ ### Training results
48
+
49
+ | Training Loss | Epoch | Step | Validation Loss |
50
+ |:-------------:|:-----:|:----:|:---------------:|
51
+ | 0.3128 | 0.26 | 500 | 0.5093 |
52
+ | 0.2751 | 1.07 | 1000 | 0.4884 |
53
+ | 0.2585 | 1.33 | 1500 | 0.4943 |
54
+
55
+
56
+ ### Framework versions
57
+
58
+ - Transformers 4.35.2
59
+ - Pytorch 2.1.0
60
+ - Datasets 2.15.0
61
+ - Tokenizers 0.15.0
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4abd5ee97c7a0ed272cd2f5d2f4dcf3a0f28cb4dd1101d257b353b71eac73e62
3
  size 872450448
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c1c6d38aaa8b73288201d28f58545d8b6f81996152fdc798b7f18efbfca3814
3
  size 872450448
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.45,
3
+ "eval_loss": 0.494733989238739,
4
+ "eval_runtime": 107.6123,
5
+ "eval_samples": 852,
6
+ "eval_samples_per_second": 7.917,
7
+ "eval_steps_per_second": 1.979,
8
+ "train_loss": 0.30544669220139453,
9
+ "train_runtime": 6733.7325,
10
+ "train_samples": 7665,
11
+ "train_samples_per_second": 2.277,
12
+ "train_steps_per_second": 0.569
13
+ }
config_argument.yaml ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ cache_dir: ./cache
2
+ ddp_find_unused_parameters: false
3
+ ddp_timeout: 30000
4
+ device_map: auto
5
+ do_eval: true
6
+ do_train: true
7
+ eval_steps: 500
8
+ evaluation_strategy: steps
9
+ fp16: true
10
+ gradient_accumulation_steps: 1
11
+ gradient_checkpointing: true
12
+ gradient_checkpointing_kwargs:
13
+ use_reentrant: false
14
+ hub_model_id: BaoLocTown/sft-metamath-mistral-7b-vi-v1
15
+ hub_strategy: every_save
16
+ learning_rate: 5.0e-05
17
+ log_level: info
18
+ logging_first_step: true
19
+ logging_steps: 10
20
+ logging_strategy: steps
21
+ lora_alpha: 128
22
+ lora_dropout: 0.05
23
+ lora_r: 256
24
+ lora_target_modules:
25
+ - q_proj
26
+ - k_proj
27
+ - v_proj
28
+ - o_proj
29
+ lr_scheduler_type: cosine
30
+ max_seq_length: 1024
31
+ model_name_or_path: hllj/meta-math-mistral-vi-math
32
+ model_type: auto
33
+ num_train_epochs: 2
34
+ output_dir: outputs-sft-metamath-mistral-7b-vi-v1
35
+ overwrite_output_dir: true
36
+ per_device_eval_batch_size: 4
37
+ per_device_train_batch_size: 4
38
+ preprocessing_num_workers: 4
39
+ push_to_hub: true
40
+ report_to: wandb
41
+ run_name: sft-metamath-mistral-7b-vi-v1
42
+ save_steps: 500
43
+ save_strategy: steps
44
+ save_total_limit: 13
45
+ seed: 42
46
+ torch_dtype: float16
47
+ train_file_dir: datasets/finetune
48
+ use_peft: true
49
+ warmup_ratio: 0.05
50
+ weight_decay: 0.05
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.45,
3
+ "eval_loss": 0.494733989238739,
4
+ "eval_runtime": 107.6123,
5
+ "eval_samples": 852,
6
+ "eval_samples_per_second": 7.917,
7
+ "eval_steps_per_second": 1.979
8
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.45,
3
+ "train_loss": 0.30544669220139453,
4
+ "train_runtime": 6733.7325,
5
+ "train_samples": 7665,
6
+ "train_samples_per_second": 2.277,
7
+ "train_steps_per_second": 0.569
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1084 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.4480959833072509,
5
+ "eval_steps": 500,
6
+ "global_step": 1718,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 2.604166666666667e-07,
14
+ "loss": 0.7721,
15
+ "step": 1
16
+ },
17
+ {
18
+ "epoch": 0.01,
19
+ "learning_rate": 2.604166666666667e-06,
20
+ "loss": 0.7229,
21
+ "step": 10
22
+ },
23
+ {
24
+ "epoch": 0.01,
25
+ "learning_rate": 5.208333333333334e-06,
26
+ "loss": 0.6469,
27
+ "step": 20
28
+ },
29
+ {
30
+ "epoch": 0.02,
31
+ "learning_rate": 7.8125e-06,
32
+ "loss": 0.5906,
33
+ "step": 30
34
+ },
35
+ {
36
+ "epoch": 0.02,
37
+ "learning_rate": 1.0416666666666668e-05,
38
+ "loss": 0.5379,
39
+ "step": 40
40
+ },
41
+ {
42
+ "epoch": 0.03,
43
+ "learning_rate": 1.3020833333333334e-05,
44
+ "loss": 0.4474,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.03,
49
+ "learning_rate": 1.5625e-05,
50
+ "loss": 0.4287,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.04,
55
+ "learning_rate": 1.8229166666666668e-05,
56
+ "loss": 0.371,
57
+ "step": 70
58
+ },
59
+ {
60
+ "epoch": 0.04,
61
+ "learning_rate": 2.0833333333333336e-05,
62
+ "loss": 0.3793,
63
+ "step": 80
64
+ },
65
+ {
66
+ "epoch": 0.05,
67
+ "learning_rate": 2.34375e-05,
68
+ "loss": 0.3635,
69
+ "step": 90
70
+ },
71
+ {
72
+ "epoch": 0.05,
73
+ "learning_rate": 2.604166666666667e-05,
74
+ "loss": 0.3436,
75
+ "step": 100
76
+ },
77
+ {
78
+ "epoch": 0.06,
79
+ "learning_rate": 2.8645833333333333e-05,
80
+ "loss": 0.3837,
81
+ "step": 110
82
+ },
83
+ {
84
+ "epoch": 0.06,
85
+ "learning_rate": 3.125e-05,
86
+ "loss": 0.354,
87
+ "step": 120
88
+ },
89
+ {
90
+ "epoch": 0.07,
91
+ "learning_rate": 3.385416666666667e-05,
92
+ "loss": 0.3472,
93
+ "step": 130
94
+ },
95
+ {
96
+ "epoch": 0.07,
97
+ "learning_rate": 3.6458333333333336e-05,
98
+ "loss": 0.347,
99
+ "step": 140
100
+ },
101
+ {
102
+ "epoch": 0.08,
103
+ "learning_rate": 3.90625e-05,
104
+ "loss": 0.3444,
105
+ "step": 150
106
+ },
107
+ {
108
+ "epoch": 0.08,
109
+ "learning_rate": 4.166666666666667e-05,
110
+ "loss": 0.342,
111
+ "step": 160
112
+ },
113
+ {
114
+ "epoch": 0.09,
115
+ "learning_rate": 4.4270833333333337e-05,
116
+ "loss": 0.3392,
117
+ "step": 170
118
+ },
119
+ {
120
+ "epoch": 0.09,
121
+ "learning_rate": 4.6875e-05,
122
+ "loss": 0.3424,
123
+ "step": 180
124
+ },
125
+ {
126
+ "epoch": 0.1,
127
+ "learning_rate": 4.947916666666667e-05,
128
+ "loss": 0.3616,
129
+ "step": 190
130
+ },
131
+ {
132
+ "epoch": 0.1,
133
+ "learning_rate": 4.99994047380455e-05,
134
+ "loss": 0.3431,
135
+ "step": 200
136
+ },
137
+ {
138
+ "epoch": 0.11,
139
+ "learning_rate": 4.999698653493815e-05,
140
+ "loss": 0.3418,
141
+ "step": 210
142
+ },
143
+ {
144
+ "epoch": 0.11,
145
+ "learning_rate": 4.999270836660003e-05,
146
+ "loss": 0.3645,
147
+ "step": 220
148
+ },
149
+ {
150
+ "epoch": 0.12,
151
+ "learning_rate": 4.998657055135927e-05,
152
+ "loss": 0.3418,
153
+ "step": 230
154
+ },
155
+ {
156
+ "epoch": 0.13,
157
+ "learning_rate": 4.9978573545915854e-05,
158
+ "loss": 0.335,
159
+ "step": 240
160
+ },
161
+ {
162
+ "epoch": 0.13,
163
+ "learning_rate": 4.996871794530757e-05,
164
+ "loss": 0.3441,
165
+ "step": 250
166
+ },
167
+ {
168
+ "epoch": 0.14,
169
+ "learning_rate": 4.9957004482865796e-05,
170
+ "loss": 0.3463,
171
+ "step": 260
172
+ },
173
+ {
174
+ "epoch": 0.14,
175
+ "learning_rate": 4.994343403016093e-05,
176
+ "loss": 0.3284,
177
+ "step": 270
178
+ },
179
+ {
180
+ "epoch": 0.15,
181
+ "learning_rate": 4.992800759693746e-05,
182
+ "loss": 0.3232,
183
+ "step": 280
184
+ },
185
+ {
186
+ "epoch": 0.15,
187
+ "learning_rate": 4.9910726331038935e-05,
188
+ "loss": 0.3251,
189
+ "step": 290
190
+ },
191
+ {
192
+ "epoch": 0.16,
193
+ "learning_rate": 4.989159151832251e-05,
194
+ "loss": 0.3346,
195
+ "step": 300
196
+ },
197
+ {
198
+ "epoch": 0.16,
199
+ "learning_rate": 4.987060458256324e-05,
200
+ "loss": 0.3397,
201
+ "step": 310
202
+ },
203
+ {
204
+ "epoch": 0.17,
205
+ "learning_rate": 4.985013406298429e-05,
206
+ "loss": 0.3273,
207
+ "step": 320
208
+ },
209
+ {
210
+ "epoch": 0.17,
211
+ "learning_rate": 4.9825632509424134e-05,
212
+ "loss": 0.3227,
213
+ "step": 330
214
+ },
215
+ {
216
+ "epoch": 0.18,
217
+ "learning_rate": 4.979928374067127e-05,
218
+ "loss": 0.329,
219
+ "step": 340
220
+ },
221
+ {
222
+ "epoch": 0.18,
223
+ "learning_rate": 4.977108971727373e-05,
224
+ "loss": 0.3177,
225
+ "step": 350
226
+ },
227
+ {
228
+ "epoch": 0.19,
229
+ "learning_rate": 4.9741052537080565e-05,
230
+ "loss": 0.3428,
231
+ "step": 360
232
+ },
233
+ {
234
+ "epoch": 0.19,
235
+ "learning_rate": 4.970917443508558e-05,
236
+ "loss": 0.3077,
237
+ "step": 370
238
+ },
239
+ {
240
+ "epoch": 0.2,
241
+ "learning_rate": 4.967545778326114e-05,
242
+ "loss": 0.3248,
243
+ "step": 380
244
+ },
245
+ {
246
+ "epoch": 0.2,
247
+ "learning_rate": 4.963990509038167e-05,
248
+ "loss": 0.3073,
249
+ "step": 390
250
+ },
251
+ {
252
+ "epoch": 0.21,
253
+ "learning_rate": 4.9602519001836933e-05,
254
+ "loss": 0.3386,
255
+ "step": 400
256
+ },
257
+ {
258
+ "epoch": 0.21,
259
+ "learning_rate": 4.9563302299435246e-05,
260
+ "loss": 0.3272,
261
+ "step": 410
262
+ },
263
+ {
264
+ "epoch": 0.22,
265
+ "learning_rate": 4.952225790119644e-05,
266
+ "loss": 0.3355,
267
+ "step": 420
268
+ },
269
+ {
270
+ "epoch": 0.22,
271
+ "learning_rate": 4.947938886113482e-05,
272
+ "loss": 0.3166,
273
+ "step": 430
274
+ },
275
+ {
276
+ "epoch": 0.23,
277
+ "learning_rate": 4.943469836903181e-05,
278
+ "loss": 0.3289,
279
+ "step": 440
280
+ },
281
+ {
282
+ "epoch": 0.23,
283
+ "learning_rate": 4.938818975019871e-05,
284
+ "loss": 0.3369,
285
+ "step": 450
286
+ },
287
+ {
288
+ "epoch": 0.24,
289
+ "learning_rate": 4.933986646522921e-05,
290
+ "loss": 0.3161,
291
+ "step": 460
292
+ },
293
+ {
294
+ "epoch": 0.25,
295
+ "learning_rate": 4.928973210974192e-05,
296
+ "loss": 0.3296,
297
+ "step": 470
298
+ },
299
+ {
300
+ "epoch": 0.25,
301
+ "learning_rate": 4.9237790414112806e-05,
302
+ "loss": 0.2899,
303
+ "step": 480
304
+ },
305
+ {
306
+ "epoch": 0.26,
307
+ "learning_rate": 4.918404524319766e-05,
308
+ "loss": 0.3216,
309
+ "step": 490
310
+ },
311
+ {
312
+ "epoch": 0.26,
313
+ "learning_rate": 4.912850059604448e-05,
314
+ "loss": 0.3128,
315
+ "step": 500
316
+ },
317
+ {
318
+ "epoch": 0.26,
319
+ "eval_loss": 0.5092809200286865,
320
+ "eval_runtime": 107.6589,
321
+ "eval_samples_per_second": 7.914,
322
+ "eval_steps_per_second": 1.978,
323
+ "step": 500
324
+ },
325
+ {
326
+ "epoch": 0.27,
327
+ "learning_rate": 4.907116060559596e-05,
328
+ "loss": 0.3152,
329
+ "step": 510
330
+ },
331
+ {
332
+ "epoch": 0.27,
333
+ "learning_rate": 4.901202953838191e-05,
334
+ "loss": 0.3091,
335
+ "step": 520
336
+ },
337
+ {
338
+ "epoch": 0.28,
339
+ "learning_rate": 4.8951111794201845e-05,
340
+ "loss": 0.3238,
341
+ "step": 530
342
+ },
343
+ {
344
+ "epoch": 0.28,
345
+ "learning_rate": 4.8888411905797574e-05,
346
+ "loss": 0.3462,
347
+ "step": 540
348
+ },
349
+ {
350
+ "epoch": 0.29,
351
+ "learning_rate": 4.8823934538515946e-05,
352
+ "loss": 0.3253,
353
+ "step": 550
354
+ },
355
+ {
356
+ "epoch": 0.29,
357
+ "learning_rate": 4.875768448996173e-05,
358
+ "loss": 0.3321,
359
+ "step": 560
360
+ },
361
+ {
362
+ "epoch": 0.3,
363
+ "learning_rate": 4.868966668964057e-05,
364
+ "loss": 0.3204,
365
+ "step": 570
366
+ },
367
+ {
368
+ "epoch": 0.3,
369
+ "learning_rate": 4.8619886198592275e-05,
370
+ "loss": 0.3109,
371
+ "step": 580
372
+ },
373
+ {
374
+ "epoch": 0.31,
375
+ "learning_rate": 4.854834820901419e-05,
376
+ "loss": 0.3277,
377
+ "step": 590
378
+ },
379
+ {
380
+ "epoch": 0.31,
381
+ "learning_rate": 4.8475058043874875e-05,
382
+ "loss": 0.3081,
383
+ "step": 600
384
+ },
385
+ {
386
+ "epoch": 0.32,
387
+ "learning_rate": 4.840002115651802e-05,
388
+ "loss": 0.3314,
389
+ "step": 610
390
+ },
391
+ {
392
+ "epoch": 0.32,
393
+ "learning_rate": 4.832324313025669e-05,
394
+ "loss": 0.3164,
395
+ "step": 620
396
+ },
397
+ {
398
+ "epoch": 0.33,
399
+ "learning_rate": 4.824472967795788e-05,
400
+ "loss": 0.294,
401
+ "step": 630
402
+ },
403
+ {
404
+ "epoch": 0.33,
405
+ "learning_rate": 4.816448664161743e-05,
406
+ "loss": 0.3211,
407
+ "step": 640
408
+ },
409
+ {
410
+ "epoch": 0.34,
411
+ "learning_rate": 4.808251999192536e-05,
412
+ "loss": 0.3309,
413
+ "step": 650
414
+ },
415
+ {
416
+ "epoch": 0.34,
417
+ "learning_rate": 4.7998835827821536e-05,
418
+ "loss": 0.3374,
419
+ "step": 660
420
+ },
421
+ {
422
+ "epoch": 0.35,
423
+ "learning_rate": 4.7913440376041975e-05,
424
+ "loss": 0.3269,
425
+ "step": 670
426
+ },
427
+ {
428
+ "epoch": 0.35,
429
+ "learning_rate": 4.782633999065541e-05,
430
+ "loss": 0.3159,
431
+ "step": 680
432
+ },
433
+ {
434
+ "epoch": 0.36,
435
+ "learning_rate": 4.77375411525906e-05,
436
+ "loss": 0.3138,
437
+ "step": 690
438
+ },
439
+ {
440
+ "epoch": 0.37,
441
+ "learning_rate": 4.764705046915402e-05,
442
+ "loss": 0.3362,
443
+ "step": 700
444
+ },
445
+ {
446
+ "epoch": 0.37,
447
+ "learning_rate": 4.755487467353829e-05,
448
+ "loss": 0.3315,
449
+ "step": 710
450
+ },
451
+ {
452
+ "epoch": 0.38,
453
+ "learning_rate": 4.7461020624321104e-05,
454
+ "loss": 0.3344,
455
+ "step": 720
456
+ },
457
+ {
458
+ "epoch": 0.38,
459
+ "learning_rate": 4.7365495304955e-05,
460
+ "loss": 0.3148,
461
+ "step": 730
462
+ },
463
+ {
464
+ "epoch": 0.39,
465
+ "learning_rate": 4.7268305823247635e-05,
466
+ "loss": 0.3054,
467
+ "step": 740
468
+ },
469
+ {
470
+ "epoch": 0.39,
471
+ "learning_rate": 4.7169459410832986e-05,
472
+ "loss": 0.3213,
473
+ "step": 750
474
+ },
475
+ {
476
+ "epoch": 0.4,
477
+ "learning_rate": 4.706896342263319e-05,
478
+ "loss": 0.3165,
479
+ "step": 760
480
+ },
481
+ {
482
+ "epoch": 0.4,
483
+ "learning_rate": 4.6966825336311376e-05,
484
+ "loss": 0.3516,
485
+ "step": 770
486
+ },
487
+ {
488
+ "epoch": 0.41,
489
+ "learning_rate": 4.6863052751715175e-05,
490
+ "loss": 0.3083,
491
+ "step": 780
492
+ },
493
+ {
494
+ "epoch": 0.41,
495
+ "learning_rate": 4.67576533903113e-05,
496
+ "loss": 0.3044,
497
+ "step": 790
498
+ },
499
+ {
500
+ "epoch": 0.42,
501
+ "learning_rate": 4.665063509461097e-05,
502
+ "loss": 0.3207,
503
+ "step": 800
504
+ },
505
+ {
506
+ "epoch": 0.42,
507
+ "learning_rate": 4.65420058275864e-05,
508
+ "loss": 0.3172,
509
+ "step": 810
510
+ },
511
+ {
512
+ "epoch": 0.43,
513
+ "learning_rate": 4.643177367207827e-05,
514
+ "loss": 0.3177,
515
+ "step": 820
516
+ },
517
+ {
518
+ "epoch": 0.43,
519
+ "learning_rate": 4.6319946830194314e-05,
520
+ "loss": 0.3072,
521
+ "step": 830
522
+ },
523
+ {
524
+ "epoch": 0.44,
525
+ "learning_rate": 4.620653362269902e-05,
526
+ "loss": 0.3115,
527
+ "step": 840
528
+ },
529
+ {
530
+ "epoch": 0.44,
531
+ "learning_rate": 4.609154248839449e-05,
532
+ "loss": 0.2946,
533
+ "step": 850
534
+ },
535
+ {
536
+ "epoch": 1.0,
537
+ "learning_rate": 4.597498198349254e-05,
538
+ "loss": 0.2979,
539
+ "step": 860
540
+ },
541
+ {
542
+ "epoch": 1.01,
543
+ "learning_rate": 4.585686078097806e-05,
544
+ "loss": 0.2866,
545
+ "step": 870
546
+ },
547
+ {
548
+ "epoch": 1.01,
549
+ "learning_rate": 4.5737187669963675e-05,
550
+ "loss": 0.2918,
551
+ "step": 880
552
+ },
553
+ {
554
+ "epoch": 1.02,
555
+ "learning_rate": 4.561597155503574e-05,
556
+ "loss": 0.2801,
557
+ "step": 890
558
+ },
559
+ {
560
+ "epoch": 1.02,
561
+ "learning_rate": 4.549322145559181e-05,
562
+ "loss": 0.2839,
563
+ "step": 900
564
+ },
565
+ {
566
+ "epoch": 1.03,
567
+ "learning_rate": 4.536894650516952e-05,
568
+ "loss": 0.2815,
569
+ "step": 910
570
+ },
571
+ {
572
+ "epoch": 1.03,
573
+ "learning_rate": 4.524315595076695e-05,
574
+ "loss": 0.2825,
575
+ "step": 920
576
+ },
577
+ {
578
+ "epoch": 1.04,
579
+ "learning_rate": 4.511585915215462e-05,
580
+ "loss": 0.2788,
581
+ "step": 930
582
+ },
583
+ {
584
+ "epoch": 1.04,
585
+ "learning_rate": 4.498706558117903e-05,
586
+ "loss": 0.3051,
587
+ "step": 940
588
+ },
589
+ {
590
+ "epoch": 1.05,
591
+ "learning_rate": 4.485678482105789e-05,
592
+ "loss": 0.2595,
593
+ "step": 950
594
+ },
595
+ {
596
+ "epoch": 1.05,
597
+ "learning_rate": 4.4725026565667055e-05,
598
+ "loss": 0.2741,
599
+ "step": 960
600
+ },
601
+ {
602
+ "epoch": 1.06,
603
+ "learning_rate": 4.45918006188192e-05,
604
+ "loss": 0.2862,
605
+ "step": 970
606
+ },
607
+ {
608
+ "epoch": 1.06,
609
+ "learning_rate": 4.44571168935344e-05,
610
+ "loss": 0.2712,
611
+ "step": 980
612
+ },
613
+ {
614
+ "epoch": 1.07,
615
+ "learning_rate": 4.432098541130247e-05,
616
+ "loss": 0.2935,
617
+ "step": 990
618
+ },
619
+ {
620
+ "epoch": 1.07,
621
+ "learning_rate": 4.418341630133733e-05,
622
+ "loss": 0.2751,
623
+ "step": 1000
624
+ },
625
+ {
626
+ "epoch": 1.07,
627
+ "eval_loss": 0.4884406328201294,
628
+ "eval_runtime": 107.9712,
629
+ "eval_samples_per_second": 7.891,
630
+ "eval_steps_per_second": 1.973,
631
+ "step": 1000
632
+ },
633
+ {
634
+ "epoch": 1.08,
635
+ "learning_rate": 4.404441979982329e-05,
636
+ "loss": 0.2706,
637
+ "step": 1010
638
+ },
639
+ {
640
+ "epoch": 1.08,
641
+ "learning_rate": 4.3904006249153395e-05,
642
+ "loss": 0.2509,
643
+ "step": 1020
644
+ },
645
+ {
646
+ "epoch": 1.09,
647
+ "learning_rate": 4.376218609715992e-05,
648
+ "loss": 0.2774,
649
+ "step": 1030
650
+ },
651
+ {
652
+ "epoch": 1.09,
653
+ "learning_rate": 4.36189698963369e-05,
654
+ "loss": 0.2675,
655
+ "step": 1040
656
+ },
657
+ {
658
+ "epoch": 1.1,
659
+ "learning_rate": 4.3474368303055e-05,
660
+ "loss": 0.2946,
661
+ "step": 1050
662
+ },
663
+ {
664
+ "epoch": 1.1,
665
+ "learning_rate": 4.3328392076768597e-05,
666
+ "loss": 0.275,
667
+ "step": 1060
668
+ },
669
+ {
670
+ "epoch": 1.11,
671
+ "learning_rate": 4.318105207921515e-05,
672
+ "loss": 0.2635,
673
+ "step": 1070
674
+ },
675
+ {
676
+ "epoch": 1.12,
677
+ "learning_rate": 4.303235927360706e-05,
678
+ "loss": 0.2917,
679
+ "step": 1080
680
+ },
681
+ {
682
+ "epoch": 1.12,
683
+ "learning_rate": 4.288232472381589e-05,
684
+ "loss": 0.275,
685
+ "step": 1090
686
+ },
687
+ {
688
+ "epoch": 1.13,
689
+ "learning_rate": 4.273095959354914e-05,
690
+ "loss": 0.2551,
691
+ "step": 1100
692
+ },
693
+ {
694
+ "epoch": 1.13,
695
+ "learning_rate": 4.257827514551957e-05,
696
+ "loss": 0.2716,
697
+ "step": 1110
698
+ },
699
+ {
700
+ "epoch": 1.14,
701
+ "learning_rate": 4.2424282740607205e-05,
702
+ "loss": 0.2693,
703
+ "step": 1120
704
+ },
705
+ {
706
+ "epoch": 1.14,
707
+ "learning_rate": 4.2268993837013935e-05,
708
+ "loss": 0.2862,
709
+ "step": 1130
710
+ },
711
+ {
712
+ "epoch": 1.15,
713
+ "learning_rate": 4.2112419989411026e-05,
714
+ "loss": 0.2869,
715
+ "step": 1140
716
+ },
717
+ {
718
+ "epoch": 1.15,
719
+ "learning_rate": 4.195457284807927e-05,
720
+ "loss": 0.2698,
721
+ "step": 1150
722
+ },
723
+ {
724
+ "epoch": 1.16,
725
+ "learning_rate": 4.179546415804221e-05,
726
+ "loss": 0.275,
727
+ "step": 1160
728
+ },
729
+ {
730
+ "epoch": 1.16,
731
+ "learning_rate": 4.163510575819212e-05,
732
+ "loss": 0.2569,
733
+ "step": 1170
734
+ },
735
+ {
736
+ "epoch": 1.17,
737
+ "learning_rate": 4.1473509580409174e-05,
738
+ "loss": 0.249,
739
+ "step": 1180
740
+ },
741
+ {
742
+ "epoch": 1.17,
743
+ "learning_rate": 4.131068764867363e-05,
744
+ "loss": 0.283,
745
+ "step": 1190
746
+ },
747
+ {
748
+ "epoch": 1.18,
749
+ "learning_rate": 4.11466520781711e-05,
750
+ "loss": 0.2735,
751
+ "step": 1200
752
+ },
753
+ {
754
+ "epoch": 1.18,
755
+ "learning_rate": 4.0981415074391124e-05,
756
+ "loss": 0.2677,
757
+ "step": 1210
758
+ },
759
+ {
760
+ "epoch": 1.19,
761
+ "learning_rate": 4.081498893221899e-05,
762
+ "loss": 0.2928,
763
+ "step": 1220
764
+ },
765
+ {
766
+ "epoch": 1.19,
767
+ "learning_rate": 4.0647386035020884e-05,
768
+ "loss": 0.2819,
769
+ "step": 1230
770
+ },
771
+ {
772
+ "epoch": 1.2,
773
+ "learning_rate": 4.047861885372249e-05,
774
+ "loss": 0.2644,
775
+ "step": 1240
776
+ },
777
+ {
778
+ "epoch": 1.2,
779
+ "learning_rate": 4.0308699945881055e-05,
780
+ "loss": 0.2757,
781
+ "step": 1250
782
+ },
783
+ {
784
+ "epoch": 1.21,
785
+ "learning_rate": 4.013764195475101e-05,
786
+ "loss": 0.2782,
787
+ "step": 1260
788
+ },
789
+ {
790
+ "epoch": 1.21,
791
+ "learning_rate": 3.9965457608343194e-05,
792
+ "loss": 0.279,
793
+ "step": 1270
794
+ },
795
+ {
796
+ "epoch": 1.22,
797
+ "learning_rate": 3.979215971847783e-05,
798
+ "loss": 0.2726,
799
+ "step": 1280
800
+ },
801
+ {
802
+ "epoch": 1.22,
803
+ "learning_rate": 3.961776117983123e-05,
804
+ "loss": 0.2542,
805
+ "step": 1290
806
+ },
807
+ {
808
+ "epoch": 1.23,
809
+ "learning_rate": 3.944227496897629e-05,
810
+ "loss": 0.263,
811
+ "step": 1300
812
+ },
813
+ {
814
+ "epoch": 1.24,
815
+ "learning_rate": 3.9265714143416967e-05,
816
+ "loss": 0.2762,
817
+ "step": 1310
818
+ },
819
+ {
820
+ "epoch": 1.24,
821
+ "learning_rate": 3.9088091840616705e-05,
822
+ "loss": 0.2825,
823
+ "step": 1320
824
+ },
825
+ {
826
+ "epoch": 1.25,
827
+ "learning_rate": 3.890942127702089e-05,
828
+ "loss": 0.2862,
829
+ "step": 1330
830
+ },
831
+ {
832
+ "epoch": 1.25,
833
+ "learning_rate": 3.872971574707345e-05,
834
+ "loss": 0.2598,
835
+ "step": 1340
836
+ },
837
+ {
838
+ "epoch": 1.26,
839
+ "learning_rate": 3.8548988622227666e-05,
840
+ "loss": 0.2636,
841
+ "step": 1350
842
+ },
843
+ {
844
+ "epoch": 1.26,
845
+ "learning_rate": 3.836725334995123e-05,
846
+ "loss": 0.258,
847
+ "step": 1360
848
+ },
849
+ {
850
+ "epoch": 1.27,
851
+ "learning_rate": 3.81845234527256e-05,
852
+ "loss": 0.2562,
853
+ "step": 1370
854
+ },
855
+ {
856
+ "epoch": 1.27,
857
+ "learning_rate": 3.800081252703993e-05,
858
+ "loss": 0.2711,
859
+ "step": 1380
860
+ },
861
+ {
862
+ "epoch": 1.28,
863
+ "learning_rate": 3.781613424237926e-05,
864
+ "loss": 0.249,
865
+ "step": 1390
866
+ },
867
+ {
868
+ "epoch": 1.28,
869
+ "learning_rate": 3.763050234020752e-05,
870
+ "loss": 0.2386,
871
+ "step": 1400
872
+ },
873
+ {
874
+ "epoch": 1.29,
875
+ "learning_rate": 3.744393063294499e-05,
876
+ "loss": 0.2517,
877
+ "step": 1410
878
+ },
879
+ {
880
+ "epoch": 1.29,
881
+ "learning_rate": 3.725643300294056e-05,
882
+ "loss": 0.2762,
883
+ "step": 1420
884
+ },
885
+ {
886
+ "epoch": 1.3,
887
+ "learning_rate": 3.706802340143881e-05,
888
+ "loss": 0.2344,
889
+ "step": 1430
890
+ },
891
+ {
892
+ "epoch": 1.3,
893
+ "learning_rate": 3.68787158475419e-05,
894
+ "loss": 0.2555,
895
+ "step": 1440
896
+ },
897
+ {
898
+ "epoch": 1.31,
899
+ "learning_rate": 3.668852442716645e-05,
900
+ "loss": 0.2632,
901
+ "step": 1450
902
+ },
903
+ {
904
+ "epoch": 1.31,
905
+ "learning_rate": 3.649746329199545e-05,
906
+ "loss": 0.2422,
907
+ "step": 1460
908
+ },
909
+ {
910
+ "epoch": 1.32,
911
+ "learning_rate": 3.630554665842525e-05,
912
+ "loss": 0.2545,
913
+ "step": 1470
914
+ },
915
+ {
916
+ "epoch": 1.32,
917
+ "learning_rate": 3.611278880650779e-05,
918
+ "loss": 0.2518,
919
+ "step": 1480
920
+ },
921
+ {
922
+ "epoch": 1.33,
923
+ "learning_rate": 3.5919204078888004e-05,
924
+ "loss": 0.2467,
925
+ "step": 1490
926
+ },
927
+ {
928
+ "epoch": 1.33,
929
+ "learning_rate": 3.572480687973665e-05,
930
+ "loss": 0.2585,
931
+ "step": 1500
932
+ },
933
+ {
934
+ "epoch": 1.33,
935
+ "eval_loss": 0.49433010816574097,
936
+ "eval_runtime": 107.5959,
937
+ "eval_samples_per_second": 7.919,
938
+ "eval_steps_per_second": 1.98,
939
+ "step": 1500
940
+ },
941
+ {
942
+ "epoch": 1.34,
943
+ "learning_rate": 3.5529611673678543e-05,
944
+ "loss": 0.2635,
945
+ "step": 1510
946
+ },
947
+ {
948
+ "epoch": 1.34,
949
+ "learning_rate": 3.5333632984716226e-05,
950
+ "loss": 0.2351,
951
+ "step": 1520
952
+ },
953
+ {
954
+ "epoch": 1.35,
955
+ "learning_rate": 3.5136885395149345e-05,
956
+ "loss": 0.2663,
957
+ "step": 1530
958
+ },
959
+ {
960
+ "epoch": 1.36,
961
+ "learning_rate": 3.493938354448954e-05,
962
+ "loss": 0.2527,
963
+ "step": 1540
964
+ },
965
+ {
966
+ "epoch": 1.36,
967
+ "learning_rate": 3.474114212837123e-05,
968
+ "loss": 0.2453,
969
+ "step": 1550
970
+ },
971
+ {
972
+ "epoch": 1.37,
973
+ "learning_rate": 3.454217589745809e-05,
974
+ "loss": 0.2462,
975
+ "step": 1560
976
+ },
977
+ {
978
+ "epoch": 1.37,
979
+ "learning_rate": 3.43424996563455e-05,
980
+ "loss": 0.2612,
981
+ "step": 1570
982
+ },
983
+ {
984
+ "epoch": 1.38,
985
+ "learning_rate": 3.414212826245898e-05,
986
+ "loss": 0.2333,
987
+ "step": 1580
988
+ },
989
+ {
990
+ "epoch": 1.38,
991
+ "learning_rate": 3.394107662494872e-05,
992
+ "loss": 0.257,
993
+ "step": 1590
994
+ },
995
+ {
996
+ "epoch": 1.39,
997
+ "learning_rate": 3.3739359703580144e-05,
998
+ "loss": 0.249,
999
+ "step": 1600
1000
+ },
1001
+ {
1002
+ "epoch": 1.39,
1003
+ "learning_rate": 3.3536992507620854e-05,
1004
+ "loss": 0.2542,
1005
+ "step": 1610
1006
+ },
1007
+ {
1008
+ "epoch": 1.4,
1009
+ "learning_rate": 3.3333990094723826e-05,
1010
+ "loss": 0.2789,
1011
+ "step": 1620
1012
+ },
1013
+ {
1014
+ "epoch": 1.4,
1015
+ "learning_rate": 3.3130367569806965e-05,
1016
+ "loss": 0.2743,
1017
+ "step": 1630
1018
+ },
1019
+ {
1020
+ "epoch": 1.41,
1021
+ "learning_rate": 3.292614008392923e-05,
1022
+ "loss": 0.2592,
1023
+ "step": 1640
1024
+ },
1025
+ {
1026
+ "epoch": 1.41,
1027
+ "learning_rate": 3.272132283316324e-05,
1028
+ "loss": 0.2371,
1029
+ "step": 1650
1030
+ },
1031
+ {
1032
+ "epoch": 1.42,
1033
+ "learning_rate": 3.25159310574646e-05,
1034
+ "loss": 0.2574,
1035
+ "step": 1660
1036
+ },
1037
+ {
1038
+ "epoch": 1.42,
1039
+ "learning_rate": 3.230998003953793e-05,
1040
+ "loss": 0.2278,
1041
+ "step": 1670
1042
+ },
1043
+ {
1044
+ "epoch": 1.43,
1045
+ "learning_rate": 3.210348510369972e-05,
1046
+ "loss": 0.2496,
1047
+ "step": 1680
1048
+ },
1049
+ {
1050
+ "epoch": 1.43,
1051
+ "learning_rate": 3.1896461614738026e-05,
1052
+ "loss": 0.2349,
1053
+ "step": 1690
1054
+ },
1055
+ {
1056
+ "epoch": 1.44,
1057
+ "learning_rate": 3.1688924976769324e-05,
1058
+ "loss": 0.2339,
1059
+ "step": 1700
1060
+ },
1061
+ {
1062
+ "epoch": 1.44,
1063
+ "learning_rate": 3.1480890632092236e-05,
1064
+ "loss": 0.249,
1065
+ "step": 1710
1066
+ },
1067
+ {
1068
+ "epoch": 1.45,
1069
+ "step": 1718,
1070
+ "total_flos": 3.091619088653353e+17,
1071
+ "train_loss": 0.30544669220139453,
1072
+ "train_runtime": 6733.7325,
1073
+ "train_samples_per_second": 2.277,
1074
+ "train_steps_per_second": 0.569
1075
+ }
1076
+ ],
1077
+ "logging_steps": 10,
1078
+ "max_steps": 3834,
1079
+ "num_train_epochs": 2,
1080
+ "save_steps": 500,
1081
+ "total_flos": 3.091619088653353e+17,
1082
+ "trial_name": null,
1083
+ "trial_params": null
1084
+ }