hllj committed on
Commit ab64e51
1 Parent(s): a3cae6f

Model save

README.md ADDED
@@ -0,0 +1,61 @@
1
+ ---
2
+ base_model: hllj/mistral-vi-math
3
+ tags:
4
+ - generated_from_trainer
5
+ model-index:
6
+ - name: sft-mistral-v2-clean-valid
7
+ results: []
8
+ ---
9
+
10
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
11
+ should probably proofread and complete it, then remove this comment. -->
12
+
13
+ # sft-mistral-v2-clean-valid
14
+
15
+ This model is a fine-tuned version of [hllj/mistral-vi-math](https://huggingface.co/hllj/mistral-vi-math) on an unknown dataset.
16
+ It achieves the following results on the evaluation set:
17
+ - Loss: 0.3176
18
+
19
+ ## Model description
20
+
21
+ More information needed
22
+
23
+ ## Intended uses & limitations
24
+
25
+ More information needed
26
+
27
+ ## Training and evaluation data
28
+
29
+ More information needed
30
+
31
+ ## Training procedure
32
+
33
+ ### Training hyperparameters
34
+
35
+ The following hyperparameters were used during training:
36
+ - learning_rate: 3e-05
37
+ - train_batch_size: 4
38
+ - eval_batch_size: 4
39
+ - seed: 42
40
+ - distributed_type: multi-GPU
41
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
42
+ - lr_scheduler_type: cosine
43
+ - lr_scheduler_warmup_ratio: 0.05
44
+ - num_epochs: 2
45
+ - mixed_precision_training: Native AMP
46
+
47
+ ### Training results
48
+
49
+ | Training Loss | Epoch | Step | Validation Loss |
50
+ |:-------------:|:-----:|:----:|:---------------:|
51
+ | 0.3163 | 0.23 | 500 | 0.4199 |
52
+ | 0.2988 | 1.02 | 1000 | 0.3697 |
53
+ | 0.2716 | 1.25 | 1500 | 0.3408 |
54
+
55
+
56
+ ### Framework versions
57
+
58
+ - Transformers 4.35.2
59
+ - Pytorch 2.1.0
60
+ - Datasets 2.15.0
61
+ - Tokenizers 0.15.0
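
The card above does not show how to run the checkpoint. Since the commit ships an `adapter_model.safetensors` and the config below sets `use_peft: true`, the following is a minimal, untested sketch of loading it as a LoRA adapter on top of the base model; the prompt format used during fine-tuning is not documented in this commit, so the prompt is a placeholder.

```python
# Minimal sketch (assumption: this repo hosts a PEFT/LoRA adapter for hllj/mistral-vi-math).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "hllj/mistral-vi-math"
adapter_id = "hllj/sft-mistral-v2-clean-valid"

tokenizer = AutoTokenizer.from_pretrained(base_id)
base = AutoModelForCausalLM.from_pretrained(
    base_id, torch_dtype=torch.float16, device_map="auto"
)
model = PeftModel.from_pretrained(base, adapter_id)

prompt = "..."  # placeholder; the fine-tuning prompt format is not documented here
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```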
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:808ef63f4c3abf55366d6ffa00acbdb42c64d5d27a80eb417c0ee20020711520
3
  size 872450448
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b8f36ea0088edbb1a9d2ce2c84cf16e562021deec2051d522515f70b2562edf
3
  size 872450448
all_results.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "epoch": 1.44,
3
+ "eval_loss": 0.31756070256233215,
4
+ "eval_runtime": 77.5284,
5
+ "eval_samples": 1336,
6
+ "eval_samples_per_second": 17.232,
7
+ "eval_steps_per_second": 4.308,
8
+ "train_loss": 0.3273629436693876,
9
+ "train_runtime": 4032.5888,
10
+ "train_samples": 8657,
11
+ "train_samples_per_second": 4.294,
12
+ "train_steps_per_second": 1.074
13
+ }
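
For orientation, the evaluation throughput fields above are mutually consistent: samples per second is samples divided by runtime, and steps per second follows from the per-device eval batch size of 4 reported in the model card. A quick check, assuming evaluation ran in a single process:

```python
# Quick consistency check of the eval throughput fields in all_results.json.
# Assumption: a single evaluation process with per_device_eval_batch_size = 4.
import math

eval_samples = 1336
eval_runtime = 77.5284  # seconds
eval_batch_size = 4

print(round(eval_samples / eval_runtime, 3))                               # 17.232 -> eval_samples_per_second
print(round(math.ceil(eval_samples / eval_batch_size) / eval_runtime, 3))  # 4.308  -> eval_steps_per_second
```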
config_argument.yaml ADDED
@@ -0,0 +1,52 @@
1
+ cache_dir: ./cache
2
+ ddp_find_unused_parameters: false
3
+ ddp_timeout: 30000
4
+ device_map: auto
5
+ do_eval: true
6
+ do_train: true
7
+ eval_steps: 500
8
+ evaluation_strategy: steps
9
+ fp16: true
10
+ gradient_accumulation_steps: 1
11
+ gradient_checkpointing: true
12
+ gradient_checkpointing_kwargs:
13
+ use_reentrant: false
14
+ hub_model_id: hllj/sft-mistral-v2-clean-valid
15
+ hub_strategy: every_save
16
+ learning_rate: 3.0e-05
17
+ log_level: info
18
+ logging_first_step: true
19
+ logging_steps: 10
20
+ logging_strategy: steps
21
+ lora_alpha: 128
22
+ lora_dropout: 0.05
23
+ lora_r: 256
24
+ lora_target_modules:
25
+ - q_proj
26
+ - k_proj
27
+ - v_proj
28
+ - o_proj
29
+ lr_scheduler_type: cosine
30
+ max_seq_length: 1024
31
+ model_name_or_path: hllj/mistral-vi-math
32
+ model_type: auto
33
+ num_train_epochs: 2
34
+ output_dir: outputs-sft-mistral-v2-clean-valid
35
+ overwrite_output_dir: true
36
+ per_device_eval_batch_size: 4
37
+ per_device_train_batch_size: 4
38
+ preprocessing_num_workers: 4
39
+ push_to_hub: true
40
+ report_to: wandb
41
+ run_name: sft-mistral-v2-clean-valid
42
+ save_steps: 500
43
+ save_strategy: steps
44
+ save_total_limit: 13
45
+ seed: 42
46
+ token: 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
47
+ torch_dtype: float16
48
+ train_file_dir: datasets/finetune
49
+ use_peft: true
50
+ validation_file_dir: datasets/validation
51
+ warmup_ratio: 0.05
52
+ weight_decay: 0.05
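
The LoRA fields above (r=256, alpha=128, dropout 0.05, attention projections as targets) map directly onto a `peft` adapter configuration. The training script itself is not part of this commit, so the snippet below is only a reconstruction of what that configuration would look like:

```python
# Sketch: the LoRA settings from config_argument.yaml expressed as a peft LoraConfig.
# Assumed reconstruction; the actual training script is not included in this commit.
from peft import LoraConfig

lora_config = LoraConfig(
    r=256,              # lora_r
    lora_alpha=128,     # lora_alpha
    lora_dropout=0.05,  # lora_dropout
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)
```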
eval_results.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "epoch": 1.44,
3
+ "eval_loss": 0.31756070256233215,
4
+ "eval_runtime": 77.5284,
5
+ "eval_samples": 1336,
6
+ "eval_samples_per_second": 17.232,
7
+ "eval_steps_per_second": 4.308
8
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "epoch": 1.44,
3
+ "train_loss": 0.3273629436693876,
4
+ "train_runtime": 4032.5888,
5
+ "train_samples": 8657,
6
+ "train_samples_per_second": 4.294,
7
+ "train_steps_per_second": 1.074
8
+ }
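
One caveat when reading these numbers: the epoch field stops at 1.44 (step 1922 in trainer_state.json below), yet the throughput figures line up with the full requested run of 2 epochs / 4330 steps rather than with the steps actually completed. That is an inference from the arithmetic only, not from the training code:

```python
# Sketch: train_samples_per_second / train_steps_per_second above are consistent with
# being computed against the requested totals (2 epochs, 4330 steps), not the 1922
# steps actually logged. Inferred from the numbers only.
train_samples = 8657
num_train_epochs = 2
max_steps = 4330
train_runtime = 4032.5888  # seconds

print(round(train_samples * num_train_epochs / train_runtime, 3))  # 4.294
print(round(max_steps / train_runtime, 3))                         # 1.074
```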
trainer_state.json ADDED
@@ -0,0 +1,1210 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.4438799076212472,
5
+ "eval_steps": 500,
6
+ "global_step": 1922,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 1.3824884792626728e-07,
14
+ "loss": 0.7351,
15
+ "step": 1
16
+ },
17
+ {
18
+ "epoch": 0.0,
19
+ "learning_rate": 1.3824884792626729e-06,
20
+ "loss": 0.7455,
21
+ "step": 10
22
+ },
23
+ {
24
+ "epoch": 0.01,
25
+ "learning_rate": 2.7649769585253458e-06,
26
+ "loss": 0.7061,
27
+ "step": 20
28
+ },
29
+ {
30
+ "epoch": 0.01,
31
+ "learning_rate": 4.147465437788019e-06,
32
+ "loss": 0.6593,
33
+ "step": 30
34
+ },
35
+ {
36
+ "epoch": 0.02,
37
+ "learning_rate": 5.5299539170506915e-06,
38
+ "loss": 0.6177,
39
+ "step": 40
40
+ },
41
+ {
42
+ "epoch": 0.02,
43
+ "learning_rate": 6.912442396313364e-06,
44
+ "loss": 0.5817,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.03,
49
+ "learning_rate": 8.294930875576038e-06,
50
+ "loss": 0.5095,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.03,
55
+ "learning_rate": 9.67741935483871e-06,
56
+ "loss": 0.4615,
57
+ "step": 70
58
+ },
59
+ {
60
+ "epoch": 0.04,
61
+ "learning_rate": 1.1059907834101383e-05,
62
+ "loss": 0.4126,
63
+ "step": 80
64
+ },
65
+ {
66
+ "epoch": 0.04,
67
+ "learning_rate": 1.2442396313364056e-05,
68
+ "loss": 0.4067,
69
+ "step": 90
70
+ },
71
+ {
72
+ "epoch": 0.05,
73
+ "learning_rate": 1.3824884792626728e-05,
74
+ "loss": 0.3974,
75
+ "step": 100
76
+ },
77
+ {
78
+ "epoch": 0.05,
79
+ "learning_rate": 1.5207373271889403e-05,
80
+ "loss": 0.3864,
81
+ "step": 110
82
+ },
83
+ {
84
+ "epoch": 0.06,
85
+ "learning_rate": 1.6589861751152075e-05,
86
+ "loss": 0.4016,
87
+ "step": 120
88
+ },
89
+ {
90
+ "epoch": 0.06,
91
+ "learning_rate": 1.7972350230414745e-05,
92
+ "loss": 0.3873,
93
+ "step": 130
94
+ },
95
+ {
96
+ "epoch": 0.06,
97
+ "learning_rate": 1.935483870967742e-05,
98
+ "loss": 0.3837,
99
+ "step": 140
100
+ },
101
+ {
102
+ "epoch": 0.07,
103
+ "learning_rate": 2.0737327188940094e-05,
104
+ "loss": 0.3868,
105
+ "step": 150
106
+ },
107
+ {
108
+ "epoch": 0.07,
109
+ "learning_rate": 2.2119815668202766e-05,
110
+ "loss": 0.3817,
111
+ "step": 160
112
+ },
113
+ {
114
+ "epoch": 0.08,
115
+ "learning_rate": 2.350230414746544e-05,
116
+ "loss": 0.3593,
117
+ "step": 170
118
+ },
119
+ {
120
+ "epoch": 0.08,
121
+ "learning_rate": 2.488479262672811e-05,
122
+ "loss": 0.3641,
123
+ "step": 180
124
+ },
125
+ {
126
+ "epoch": 0.09,
127
+ "learning_rate": 2.6267281105990784e-05,
128
+ "loss": 0.3497,
129
+ "step": 190
130
+ },
131
+ {
132
+ "epoch": 0.09,
133
+ "learning_rate": 2.7649769585253457e-05,
134
+ "loss": 0.3679,
135
+ "step": 200
136
+ },
137
+ {
138
+ "epoch": 0.1,
139
+ "learning_rate": 2.903225806451613e-05,
140
+ "loss": 0.3751,
141
+ "step": 210
142
+ },
143
+ {
144
+ "epoch": 0.1,
145
+ "learning_rate": 2.9999960619075335e-05,
146
+ "loss": 0.3828,
147
+ "step": 220
148
+ },
149
+ {
150
+ "epoch": 0.11,
151
+ "learning_rate": 2.9999260519500367e-05,
152
+ "loss": 0.3763,
153
+ "step": 230
154
+ },
155
+ {
156
+ "epoch": 0.11,
157
+ "learning_rate": 2.9997685335280646e-05,
158
+ "loss": 0.3553,
159
+ "step": 240
160
+ },
161
+ {
162
+ "epoch": 0.12,
163
+ "learning_rate": 2.9995235158315353e-05,
164
+ "loss": 0.3589,
165
+ "step": 250
166
+ },
167
+ {
168
+ "epoch": 0.12,
169
+ "learning_rate": 2.999191013155234e-05,
170
+ "loss": 0.3585,
171
+ "step": 260
172
+ },
173
+ {
174
+ "epoch": 0.12,
175
+ "learning_rate": 2.998771044897983e-05,
176
+ "loss": 0.3529,
177
+ "step": 270
178
+ },
179
+ {
180
+ "epoch": 0.13,
181
+ "learning_rate": 2.9982636355615092e-05,
182
+ "loss": 0.3303,
183
+ "step": 280
184
+ },
185
+ {
186
+ "epoch": 0.13,
187
+ "learning_rate": 2.997668814749012e-05,
188
+ "loss": 0.3696,
189
+ "step": 290
190
+ },
191
+ {
192
+ "epoch": 0.14,
193
+ "learning_rate": 2.99698661716344e-05,
194
+ "loss": 0.3353,
195
+ "step": 300
196
+ },
197
+ {
198
+ "epoch": 0.14,
199
+ "learning_rate": 2.9962170826054645e-05,
200
+ "loss": 0.3562,
201
+ "step": 310
202
+ },
203
+ {
204
+ "epoch": 0.15,
205
+ "learning_rate": 2.995360255971157e-05,
206
+ "loss": 0.3652,
207
+ "step": 320
208
+ },
209
+ {
210
+ "epoch": 0.15,
211
+ "learning_rate": 2.994416187249371e-05,
212
+ "loss": 0.3522,
213
+ "step": 330
214
+ },
215
+ {
216
+ "epoch": 0.16,
217
+ "learning_rate": 2.9933849315188233e-05,
218
+ "loss": 0.3754,
219
+ "step": 340
220
+ },
221
+ {
222
+ "epoch": 0.16,
223
+ "learning_rate": 2.992266548944885e-05,
224
+ "loss": 0.3348,
225
+ "step": 350
226
+ },
227
+ {
228
+ "epoch": 0.17,
229
+ "learning_rate": 2.991061104776067e-05,
230
+ "loss": 0.3513,
231
+ "step": 360
232
+ },
233
+ {
234
+ "epoch": 0.17,
235
+ "learning_rate": 2.9897686693402138e-05,
236
+ "loss": 0.3214,
237
+ "step": 370
238
+ },
239
+ {
240
+ "epoch": 0.18,
241
+ "learning_rate": 2.9883893180404046e-05,
242
+ "loss": 0.3451,
243
+ "step": 380
244
+ },
245
+ {
246
+ "epoch": 0.18,
247
+ "learning_rate": 2.986923131350549e-05,
248
+ "loss": 0.3579,
249
+ "step": 390
250
+ },
251
+ {
252
+ "epoch": 0.18,
253
+ "learning_rate": 2.9853701948106944e-05,
254
+ "loss": 0.3353,
255
+ "step": 400
256
+ },
257
+ {
258
+ "epoch": 0.19,
259
+ "learning_rate": 2.9837305990220357e-05,
260
+ "loss": 0.3374,
261
+ "step": 410
262
+ },
263
+ {
264
+ "epoch": 0.19,
265
+ "learning_rate": 2.982004439641628e-05,
266
+ "loss": 0.3544,
267
+ "step": 420
268
+ },
269
+ {
270
+ "epoch": 0.2,
271
+ "learning_rate": 2.980191817376808e-05,
272
+ "loss": 0.3313,
273
+ "step": 430
274
+ },
275
+ {
276
+ "epoch": 0.2,
277
+ "learning_rate": 2.9782928379793154e-05,
278
+ "loss": 0.3488,
279
+ "step": 440
280
+ },
281
+ {
282
+ "epoch": 0.21,
283
+ "learning_rate": 2.976307612239127e-05,
284
+ "loss": 0.3384,
285
+ "step": 450
286
+ },
287
+ {
288
+ "epoch": 0.21,
289
+ "learning_rate": 2.97423625597799e-05,
290
+ "loss": 0.3359,
291
+ "step": 460
292
+ },
293
+ {
294
+ "epoch": 0.22,
295
+ "learning_rate": 2.9720788900426657e-05,
296
+ "loss": 0.3353,
297
+ "step": 470
298
+ },
299
+ {
300
+ "epoch": 0.22,
301
+ "learning_rate": 2.969835640297879e-05,
302
+ "loss": 0.356,
303
+ "step": 480
304
+ },
305
+ {
306
+ "epoch": 0.23,
307
+ "learning_rate": 2.967506637618976e-05,
308
+ "loss": 0.3575,
309
+ "step": 490
310
+ },
311
+ {
312
+ "epoch": 0.23,
313
+ "learning_rate": 2.9650920178842874e-05,
314
+ "loss": 0.3163,
315
+ "step": 500
316
+ },
317
+ {
318
+ "epoch": 0.23,
319
+ "eval_loss": 0.41985267400741577,
320
+ "eval_runtime": 77.3783,
321
+ "eval_samples_per_second": 17.266,
322
+ "eval_steps_per_second": 4.316,
323
+ "step": 500
324
+ },
325
+ {
326
+ "epoch": 0.24,
327
+ "learning_rate": 2.9625919219672017e-05,
328
+ "loss": 0.3489,
329
+ "step": 510
330
+ },
331
+ {
332
+ "epoch": 0.24,
333
+ "learning_rate": 2.960006495727946e-05,
334
+ "loss": 0.3237,
335
+ "step": 520
336
+ },
337
+ {
338
+ "epoch": 0.24,
339
+ "learning_rate": 2.9573358900050764e-05,
340
+ "loss": 0.3336,
341
+ "step": 530
342
+ },
343
+ {
344
+ "epoch": 0.25,
345
+ "learning_rate": 2.9545802606066778e-05,
346
+ "loss": 0.3247,
347
+ "step": 540
348
+ },
349
+ {
350
+ "epoch": 0.25,
351
+ "learning_rate": 2.9517397683012747e-05,
352
+ "loss": 0.3316,
353
+ "step": 550
354
+ },
355
+ {
356
+ "epoch": 0.26,
357
+ "learning_rate": 2.9488145788084502e-05,
358
+ "loss": 0.3504,
359
+ "step": 560
360
+ },
361
+ {
362
+ "epoch": 0.26,
363
+ "learning_rate": 2.945804862789178e-05,
364
+ "loss": 0.3387,
365
+ "step": 570
366
+ },
367
+ {
368
+ "epoch": 0.27,
369
+ "learning_rate": 2.942710795835866e-05,
370
+ "loss": 0.3407,
371
+ "step": 580
372
+ },
373
+ {
374
+ "epoch": 0.27,
375
+ "learning_rate": 2.9395325584621122e-05,
376
+ "loss": 0.34,
377
+ "step": 590
378
+ },
379
+ {
380
+ "epoch": 0.28,
381
+ "learning_rate": 2.9362703360921722e-05,
382
+ "loss": 0.3314,
383
+ "step": 600
384
+ },
385
+ {
386
+ "epoch": 0.28,
387
+ "learning_rate": 2.932924319050143e-05,
388
+ "loss": 0.3488,
389
+ "step": 610
390
+ },
391
+ {
392
+ "epoch": 0.29,
393
+ "learning_rate": 2.9294947025488568e-05,
394
+ "loss": 0.355,
395
+ "step": 620
396
+ },
397
+ {
398
+ "epoch": 0.29,
399
+ "learning_rate": 2.925981686678494e-05,
400
+ "loss": 0.3252,
401
+ "step": 630
402
+ },
403
+ {
404
+ "epoch": 0.3,
405
+ "learning_rate": 2.9223854763949082e-05,
406
+ "loss": 0.3424,
407
+ "step": 640
408
+ },
409
+ {
410
+ "epoch": 0.3,
411
+ "learning_rate": 2.9187062815076688e-05,
412
+ "loss": 0.3443,
413
+ "step": 650
414
+ },
415
+ {
416
+ "epoch": 0.3,
417
+ "learning_rate": 2.914944316667822e-05,
418
+ "loss": 0.3288,
419
+ "step": 660
420
+ },
421
+ {
422
+ "epoch": 0.31,
423
+ "learning_rate": 2.9110998013553653e-05,
424
+ "loss": 0.3194,
425
+ "step": 670
426
+ },
427
+ {
428
+ "epoch": 0.31,
429
+ "learning_rate": 2.9071729598664433e-05,
430
+ "loss": 0.3259,
431
+ "step": 680
432
+ },
433
+ {
434
+ "epoch": 0.32,
435
+ "learning_rate": 2.9031640213002638e-05,
436
+ "loss": 0.3348,
437
+ "step": 690
438
+ },
439
+ {
440
+ "epoch": 0.32,
441
+ "learning_rate": 2.899073219545729e-05,
442
+ "loss": 0.3595,
443
+ "step": 700
444
+ },
445
+ {
446
+ "epoch": 0.33,
447
+ "learning_rate": 2.8949007932677915e-05,
448
+ "loss": 0.3315,
449
+ "step": 710
450
+ },
451
+ {
452
+ "epoch": 0.33,
453
+ "learning_rate": 2.89064698589353e-05,
454
+ "loss": 0.3208,
455
+ "step": 720
456
+ },
457
+ {
458
+ "epoch": 0.34,
459
+ "learning_rate": 2.8863120455979458e-05,
460
+ "loss": 0.3311,
461
+ "step": 730
462
+ },
463
+ {
464
+ "epoch": 0.34,
465
+ "learning_rate": 2.8818962252894872e-05,
466
+ "loss": 0.3459,
467
+ "step": 740
468
+ },
469
+ {
470
+ "epoch": 0.35,
471
+ "learning_rate": 2.8773997825952914e-05,
472
+ "loss": 0.3296,
473
+ "step": 750
474
+ },
475
+ {
476
+ "epoch": 0.35,
477
+ "learning_rate": 2.872822979846154e-05,
478
+ "loss": 0.3248,
479
+ "step": 760
480
+ },
481
+ {
482
+ "epoch": 0.36,
483
+ "learning_rate": 2.8681660840612262e-05,
484
+ "loss": 0.3046,
485
+ "step": 770
486
+ },
487
+ {
488
+ "epoch": 0.36,
489
+ "learning_rate": 2.8634293669324353e-05,
490
+ "loss": 0.3172,
491
+ "step": 780
492
+ },
493
+ {
494
+ "epoch": 0.36,
495
+ "learning_rate": 2.8586131048086334e-05,
496
+ "loss": 0.3228,
497
+ "step": 790
498
+ },
499
+ {
500
+ "epoch": 0.37,
501
+ "learning_rate": 2.853717578679474e-05,
502
+ "loss": 0.3152,
503
+ "step": 800
504
+ },
505
+ {
506
+ "epoch": 0.37,
507
+ "learning_rate": 2.848743074159021e-05,
508
+ "loss": 0.3173,
509
+ "step": 810
510
+ },
511
+ {
512
+ "epoch": 0.38,
513
+ "learning_rate": 2.8436898814690837e-05,
514
+ "loss": 0.3046,
515
+ "step": 820
516
+ },
517
+ {
518
+ "epoch": 0.38,
519
+ "learning_rate": 2.838558295422284e-05,
520
+ "loss": 0.3427,
521
+ "step": 830
522
+ },
523
+ {
524
+ "epoch": 0.39,
525
+ "learning_rate": 2.833348615404859e-05,
526
+ "loss": 0.3371,
527
+ "step": 840
528
+ },
529
+ {
530
+ "epoch": 0.39,
531
+ "learning_rate": 2.8280611453591908e-05,
532
+ "loss": 0.32,
533
+ "step": 850
534
+ },
535
+ {
536
+ "epoch": 0.4,
537
+ "learning_rate": 2.8226961937660773e-05,
538
+ "loss": 0.3195,
539
+ "step": 860
540
+ },
541
+ {
542
+ "epoch": 0.4,
543
+ "learning_rate": 2.817254073626733e-05,
544
+ "loss": 0.3299,
545
+ "step": 870
546
+ },
547
+ {
548
+ "epoch": 0.41,
549
+ "learning_rate": 2.811735102444528e-05,
550
+ "loss": 0.3301,
551
+ "step": 880
552
+ },
553
+ {
554
+ "epoch": 0.41,
555
+ "learning_rate": 2.8061396022064657e-05,
556
+ "loss": 0.332,
557
+ "step": 890
558
+ },
559
+ {
560
+ "epoch": 0.42,
561
+ "learning_rate": 2.8004678993643952e-05,
562
+ "loss": 0.3514,
563
+ "step": 900
564
+ },
565
+ {
566
+ "epoch": 0.42,
567
+ "learning_rate": 2.7947203248159665e-05,
568
+ "loss": 0.3261,
569
+ "step": 910
570
+ },
571
+ {
572
+ "epoch": 0.42,
573
+ "learning_rate": 2.788897213885327e-05,
574
+ "loss": 0.3225,
575
+ "step": 920
576
+ },
577
+ {
578
+ "epoch": 0.43,
579
+ "learning_rate": 2.782998906303555e-05,
580
+ "loss": 0.3212,
581
+ "step": 930
582
+ },
583
+ {
584
+ "epoch": 0.43,
585
+ "learning_rate": 2.777025746188842e-05,
586
+ "loss": 0.3014,
587
+ "step": 940
588
+ },
589
+ {
590
+ "epoch": 0.44,
591
+ "learning_rate": 2.7709780820264147e-05,
592
+ "loss": 0.3244,
593
+ "step": 950
594
+ },
595
+ {
596
+ "epoch": 0.44,
597
+ "learning_rate": 2.764856266648202e-05,
598
+ "loss": 0.3466,
599
+ "step": 960
600
+ },
601
+ {
602
+ "epoch": 1.0,
603
+ "learning_rate": 2.758660657212255e-05,
604
+ "loss": 0.3227,
605
+ "step": 970
606
+ },
607
+ {
608
+ "epoch": 1.01,
609
+ "learning_rate": 2.7523916151819048e-05,
610
+ "loss": 0.34,
611
+ "step": 980
612
+ },
613
+ {
614
+ "epoch": 1.01,
615
+ "learning_rate": 2.746049506304678e-05,
616
+ "loss": 0.3104,
617
+ "step": 990
618
+ },
619
+ {
620
+ "epoch": 1.02,
621
+ "learning_rate": 2.7396347005909535e-05,
622
+ "loss": 0.2988,
623
+ "step": 1000
624
+ },
625
+ {
626
+ "epoch": 1.02,
627
+ "eval_loss": 0.3696992099285126,
628
+ "eval_runtime": 77.5023,
629
+ "eval_samples_per_second": 17.238,
630
+ "eval_steps_per_second": 4.31,
631
+ "step": 1000
632
+ },
633
+ {
634
+ "epoch": 1.02,
635
+ "learning_rate": 2.733147572292381e-05,
636
+ "loss": 0.3104,
637
+ "step": 1010
638
+ },
639
+ {
640
+ "epoch": 1.03,
641
+ "learning_rate": 2.7265884998800434e-05,
642
+ "loss": 0.2974,
643
+ "step": 1020
644
+ },
645
+ {
646
+ "epoch": 1.03,
647
+ "learning_rate": 2.7199578660223743e-05,
648
+ "loss": 0.3149,
649
+ "step": 1030
650
+ },
651
+ {
652
+ "epoch": 1.04,
653
+ "learning_rate": 2.7132560575628377e-05,
654
+ "loss": 0.308,
655
+ "step": 1040
656
+ },
657
+ {
658
+ "epoch": 1.04,
659
+ "learning_rate": 2.7064834654973534e-05,
660
+ "loss": 0.3029,
661
+ "step": 1050
662
+ },
663
+ {
664
+ "epoch": 1.05,
665
+ "learning_rate": 2.6996404849514885e-05,
666
+ "loss": 0.3118,
667
+ "step": 1060
668
+ },
669
+ {
670
+ "epoch": 1.05,
671
+ "learning_rate": 2.6927275151574053e-05,
672
+ "loss": 0.3032,
673
+ "step": 1070
674
+ },
675
+ {
676
+ "epoch": 1.05,
677
+ "learning_rate": 2.6857449594305674e-05,
678
+ "loss": 0.3234,
679
+ "step": 1080
680
+ },
681
+ {
682
+ "epoch": 1.06,
683
+ "learning_rate": 2.678693225146211e-05,
684
+ "loss": 0.2718,
685
+ "step": 1090
686
+ },
687
+ {
688
+ "epoch": 1.06,
689
+ "learning_rate": 2.6715727237155777e-05,
690
+ "loss": 0.3301,
691
+ "step": 1100
692
+ },
693
+ {
694
+ "epoch": 1.07,
695
+ "learning_rate": 2.6643838705619117e-05,
696
+ "loss": 0.31,
697
+ "step": 1110
698
+ },
699
+ {
700
+ "epoch": 1.07,
701
+ "learning_rate": 2.6571270850962234e-05,
702
+ "loss": 0.3058,
703
+ "step": 1120
704
+ },
705
+ {
706
+ "epoch": 1.08,
707
+ "learning_rate": 2.6498027906928195e-05,
708
+ "loss": 0.2993,
709
+ "step": 1130
710
+ },
711
+ {
712
+ "epoch": 1.08,
713
+ "learning_rate": 2.6424114146646043e-05,
714
+ "loss": 0.2877,
715
+ "step": 1140
716
+ },
717
+ {
718
+ "epoch": 1.09,
719
+ "learning_rate": 2.6349533882381475e-05,
720
+ "loss": 0.3052,
721
+ "step": 1150
722
+ },
723
+ {
724
+ "epoch": 1.09,
725
+ "learning_rate": 2.6274291465285266e-05,
726
+ "loss": 0.2991,
727
+ "step": 1160
728
+ },
729
+ {
730
+ "epoch": 1.1,
731
+ "learning_rate": 2.6198391285139417e-05,
732
+ "loss": 0.2869,
733
+ "step": 1170
734
+ },
735
+ {
736
+ "epoch": 1.1,
737
+ "learning_rate": 2.612183777010104e-05,
738
+ "loss": 0.3069,
739
+ "step": 1180
740
+ },
741
+ {
742
+ "epoch": 1.11,
743
+ "learning_rate": 2.6044635386444024e-05,
744
+ "loss": 0.3033,
745
+ "step": 1190
746
+ },
747
+ {
748
+ "epoch": 1.11,
749
+ "learning_rate": 2.5966788638298443e-05,
750
+ "loss": 0.3073,
751
+ "step": 1200
752
+ },
753
+ {
754
+ "epoch": 1.12,
755
+ "learning_rate": 2.5888302067387793e-05,
756
+ "loss": 0.2915,
757
+ "step": 1210
758
+ },
759
+ {
760
+ "epoch": 1.12,
761
+ "learning_rate": 2.5809180252764022e-05,
762
+ "loss": 0.3184,
763
+ "step": 1220
764
+ },
765
+ {
766
+ "epoch": 1.12,
767
+ "learning_rate": 2.572942781054036e-05,
768
+ "loss": 0.2882,
769
+ "step": 1230
770
+ },
771
+ {
772
+ "epoch": 1.13,
773
+ "learning_rate": 2.564904939362204e-05,
774
+ "loss": 0.3235,
775
+ "step": 1240
776
+ },
777
+ {
778
+ "epoch": 1.13,
779
+ "learning_rate": 2.5568049691434794e-05,
780
+ "loss": 0.2978,
781
+ "step": 1250
782
+ },
783
+ {
784
+ "epoch": 1.14,
785
+ "learning_rate": 2.5486433429651304e-05,
786
+ "loss": 0.3167,
787
+ "step": 1260
788
+ },
789
+ {
790
+ "epoch": 1.14,
791
+ "learning_rate": 2.5404205369915473e-05,
792
+ "loss": 0.3099,
793
+ "step": 1270
794
+ },
795
+ {
796
+ "epoch": 1.15,
797
+ "learning_rate": 2.532137030956464e-05,
798
+ "loss": 0.2853,
799
+ "step": 1280
800
+ },
801
+ {
802
+ "epoch": 1.15,
803
+ "learning_rate": 2.523793308134967e-05,
804
+ "loss": 0.3147,
805
+ "step": 1290
806
+ },
807
+ {
808
+ "epoch": 1.16,
809
+ "learning_rate": 2.5153898553153024e-05,
810
+ "loss": 0.2833,
811
+ "step": 1300
812
+ },
813
+ {
814
+ "epoch": 1.16,
815
+ "learning_rate": 2.506927162770475e-05,
816
+ "loss": 0.3133,
817
+ "step": 1310
818
+ },
819
+ {
820
+ "epoch": 1.17,
821
+ "learning_rate": 2.4984057242296464e-05,
822
+ "loss": 0.2826,
823
+ "step": 1320
824
+ },
825
+ {
826
+ "epoch": 1.17,
827
+ "learning_rate": 2.489826036849325e-05,
828
+ "loss": 0.2962,
829
+ "step": 1330
830
+ },
831
+ {
832
+ "epoch": 1.18,
833
+ "learning_rate": 2.4811886011843673e-05,
834
+ "loss": 0.2818,
835
+ "step": 1340
836
+ },
837
+ {
838
+ "epoch": 1.18,
839
+ "learning_rate": 2.4724939211587706e-05,
840
+ "loss": 0.3102,
841
+ "step": 1350
842
+ },
843
+ {
844
+ "epoch": 1.18,
845
+ "learning_rate": 2.4637425040362744e-05,
846
+ "loss": 0.301,
847
+ "step": 1360
848
+ },
849
+ {
850
+ "epoch": 1.19,
851
+ "learning_rate": 2.4549348603907658e-05,
852
+ "loss": 0.2799,
853
+ "step": 1370
854
+ },
855
+ {
856
+ "epoch": 1.19,
857
+ "learning_rate": 2.4460715040764916e-05,
858
+ "loss": 0.293,
859
+ "step": 1380
860
+ },
861
+ {
862
+ "epoch": 1.2,
863
+ "learning_rate": 2.4371529521980775e-05,
864
+ "loss": 0.2987,
865
+ "step": 1390
866
+ },
867
+ {
868
+ "epoch": 1.2,
869
+ "learning_rate": 2.428179725080362e-05,
870
+ "loss": 0.2762,
871
+ "step": 1400
872
+ },
873
+ {
874
+ "epoch": 1.21,
875
+ "learning_rate": 2.419152346238038e-05,
876
+ "loss": 0.2936,
877
+ "step": 1410
878
+ },
879
+ {
880
+ "epoch": 1.21,
881
+ "learning_rate": 2.410071342345111e-05,
882
+ "loss": 0.3099,
883
+ "step": 1420
884
+ },
885
+ {
886
+ "epoch": 1.22,
887
+ "learning_rate": 2.4009372432041702e-05,
888
+ "loss": 0.2863,
889
+ "step": 1430
890
+ },
891
+ {
892
+ "epoch": 1.22,
893
+ "learning_rate": 2.3917505817154795e-05,
894
+ "loss": 0.2977,
895
+ "step": 1440
896
+ },
897
+ {
898
+ "epoch": 1.23,
899
+ "learning_rate": 2.3825118938458894e-05,
900
+ "loss": 0.3138,
901
+ "step": 1450
902
+ },
903
+ {
904
+ "epoch": 1.23,
905
+ "learning_rate": 2.373221718597564e-05,
906
+ "loss": 0.2904,
907
+ "step": 1460
908
+ },
909
+ {
910
+ "epoch": 1.24,
911
+ "learning_rate": 2.3638805979765387e-05,
912
+ "loss": 0.2981,
913
+ "step": 1470
914
+ },
915
+ {
916
+ "epoch": 1.24,
917
+ "learning_rate": 2.3544890769610936e-05,
918
+ "loss": 0.266,
919
+ "step": 1480
920
+ },
921
+ {
922
+ "epoch": 1.24,
923
+ "learning_rate": 2.3450477034699632e-05,
924
+ "loss": 0.2885,
925
+ "step": 1490
926
+ },
927
+ {
928
+ "epoch": 1.25,
929
+ "learning_rate": 2.335557028330366e-05,
930
+ "loss": 0.2716,
931
+ "step": 1500
932
+ },
933
+ {
934
+ "epoch": 1.25,
935
+ "eval_loss": 0.34079140424728394,
936
+ "eval_runtime": 77.3896,
937
+ "eval_samples_per_second": 17.263,
938
+ "eval_steps_per_second": 4.316,
939
+ "step": 1500
940
+ },
941
+ {
942
+ "epoch": 1.25,
943
+ "learning_rate": 2.326017605245872e-05,
944
+ "loss": 0.2932,
945
+ "step": 1510
946
+ },
947
+ {
948
+ "epoch": 1.26,
949
+ "learning_rate": 2.3164299907640955e-05,
950
+ "loss": 0.2851,
951
+ "step": 1520
952
+ },
953
+ {
954
+ "epoch": 1.26,
955
+ "learning_rate": 2.3067947442442264e-05,
956
+ "loss": 0.2665,
957
+ "step": 1530
958
+ },
959
+ {
960
+ "epoch": 1.27,
961
+ "learning_rate": 2.2971124278243957e-05,
962
+ "loss": 0.2677,
963
+ "step": 1540
964
+ },
965
+ {
966
+ "epoch": 1.27,
967
+ "learning_rate": 2.28738360638888e-05,
968
+ "loss": 0.3013,
969
+ "step": 1550
970
+ },
971
+ {
972
+ "epoch": 1.28,
973
+ "learning_rate": 2.2776088475351445e-05,
974
+ "loss": 0.2815,
975
+ "step": 1560
976
+ },
977
+ {
978
+ "epoch": 1.28,
979
+ "learning_rate": 2.2677887215407278e-05,
980
+ "loss": 0.2724,
981
+ "step": 1570
982
+ },
983
+ {
984
+ "epoch": 1.29,
985
+ "learning_rate": 2.257923801329973e-05,
986
+ "loss": 0.2858,
987
+ "step": 1580
988
+ },
989
+ {
990
+ "epoch": 1.29,
991
+ "learning_rate": 2.248014662440599e-05,
992
+ "loss": 0.274,
993
+ "step": 1590
994
+ },
995
+ {
996
+ "epoch": 1.3,
997
+ "learning_rate": 2.238061882990126e-05,
998
+ "loss": 0.2817,
999
+ "step": 1600
1000
+ },
1001
+ {
1002
+ "epoch": 1.3,
1003
+ "learning_rate": 2.2280660436421443e-05,
1004
+ "loss": 0.3102,
1005
+ "step": 1610
1006
+ },
1007
+ {
1008
+ "epoch": 1.3,
1009
+ "learning_rate": 2.2180277275724385e-05,
1010
+ "loss": 0.2673,
1011
+ "step": 1620
1012
+ },
1013
+ {
1014
+ "epoch": 1.31,
1015
+ "learning_rate": 2.2079475204349645e-05,
1016
+ "loss": 0.287,
1017
+ "step": 1630
1018
+ },
1019
+ {
1020
+ "epoch": 1.31,
1021
+ "learning_rate": 2.1978260103276796e-05,
1022
+ "loss": 0.2712,
1023
+ "step": 1640
1024
+ },
1025
+ {
1026
+ "epoch": 1.32,
1027
+ "learning_rate": 2.187663787758234e-05,
1028
+ "loss": 0.2763,
1029
+ "step": 1650
1030
+ },
1031
+ {
1032
+ "epoch": 1.32,
1033
+ "learning_rate": 2.177461445609518e-05,
1034
+ "loss": 0.2729,
1035
+ "step": 1660
1036
+ },
1037
+ {
1038
+ "epoch": 1.33,
1039
+ "learning_rate": 2.1672195791050712e-05,
1040
+ "loss": 0.2853,
1041
+ "step": 1670
1042
+ },
1043
+ {
1044
+ "epoch": 1.33,
1045
+ "learning_rate": 2.1569387857743596e-05,
1046
+ "loss": 0.2773,
1047
+ "step": 1680
1048
+ },
1049
+ {
1050
+ "epoch": 1.34,
1051
+ "learning_rate": 2.1466196654179107e-05,
1052
+ "loss": 0.2882,
1053
+ "step": 1690
1054
+ },
1055
+ {
1056
+ "epoch": 1.34,
1057
+ "learning_rate": 2.1362628200723228e-05,
1058
+ "loss": 0.274,
1059
+ "step": 1700
1060
+ },
1061
+ {
1062
+ "epoch": 1.35,
1063
+ "learning_rate": 2.1258688539751387e-05,
1064
+ "loss": 0.2785,
1065
+ "step": 1710
1066
+ },
1067
+ {
1068
+ "epoch": 1.35,
1069
+ "learning_rate": 2.115438373529596e-05,
1070
+ "loss": 0.2738,
1071
+ "step": 1720
1072
+ },
1073
+ {
1074
+ "epoch": 1.36,
1075
+ "learning_rate": 2.104971987269245e-05,
1076
+ "loss": 0.2569,
1077
+ "step": 1730
1078
+ },
1079
+ {
1080
+ "epoch": 1.36,
1081
+ "learning_rate": 2.0944703058224504e-05,
1082
+ "loss": 0.2423,
1083
+ "step": 1740
1084
+ },
1085
+ {
1086
+ "epoch": 1.36,
1087
+ "learning_rate": 2.0839339418767616e-05,
1088
+ "loss": 0.2721,
1089
+ "step": 1750
1090
+ },
1091
+ {
1092
+ "epoch": 1.37,
1093
+ "learning_rate": 2.0733635101431694e-05,
1094
+ "loss": 0.2858,
1095
+ "step": 1760
1096
+ },
1097
+ {
1098
+ "epoch": 1.37,
1099
+ "learning_rate": 2.0627596273202435e-05,
1100
+ "loss": 0.2793,
1101
+ "step": 1770
1102
+ },
1103
+ {
1104
+ "epoch": 1.38,
1105
+ "learning_rate": 2.05212291205815e-05,
1106
+ "loss": 0.277,
1107
+ "step": 1780
1108
+ },
1109
+ {
1110
+ "epoch": 1.38,
1111
+ "learning_rate": 2.0414539849225637e-05,
1112
+ "loss": 0.2676,
1113
+ "step": 1790
1114
+ },
1115
+ {
1116
+ "epoch": 1.39,
1117
+ "learning_rate": 2.0307534683584565e-05,
1118
+ "loss": 0.26,
1119
+ "step": 1800
1120
+ },
1121
+ {
1122
+ "epoch": 1.39,
1123
+ "learning_rate": 2.0200219866537882e-05,
1124
+ "loss": 0.2772,
1125
+ "step": 1810
1126
+ },
1127
+ {
1128
+ "epoch": 1.4,
1129
+ "learning_rate": 2.0092601659030807e-05,
1130
+ "loss": 0.2988,
1131
+ "step": 1820
1132
+ },
1133
+ {
1134
+ "epoch": 1.4,
1135
+ "learning_rate": 1.9984686339708927e-05,
1136
+ "loss": 0.2611,
1137
+ "step": 1830
1138
+ },
1139
+ {
1140
+ "epoch": 1.41,
1141
+ "learning_rate": 1.9876480204551894e-05,
1142
+ "loss": 0.2727,
1143
+ "step": 1840
1144
+ },
1145
+ {
1146
+ "epoch": 1.41,
1147
+ "learning_rate": 1.976798956650607e-05,
1148
+ "loss": 0.2732,
1149
+ "step": 1850
1150
+ },
1151
+ {
1152
+ "epoch": 1.42,
1153
+ "learning_rate": 1.9659220755116277e-05,
1154
+ "loss": 0.2686,
1155
+ "step": 1860
1156
+ },
1157
+ {
1158
+ "epoch": 1.42,
1159
+ "learning_rate": 1.9550180116156447e-05,
1160
+ "loss": 0.2731,
1161
+ "step": 1870
1162
+ },
1163
+ {
1164
+ "epoch": 1.42,
1165
+ "learning_rate": 1.9440874011259458e-05,
1166
+ "loss": 0.2945,
1167
+ "step": 1880
1168
+ },
1169
+ {
1170
+ "epoch": 1.43,
1171
+ "learning_rate": 1.9331308817545963e-05,
1172
+ "loss": 0.2941,
1173
+ "step": 1890
1174
+ },
1175
+ {
1176
+ "epoch": 1.43,
1177
+ "learning_rate": 1.922149092725233e-05,
1178
+ "loss": 0.2803,
1179
+ "step": 1900
1180
+ },
1181
+ {
1182
+ "epoch": 1.44,
1183
+ "learning_rate": 1.911142674735771e-05,
1184
+ "loss": 0.2759,
1185
+ "step": 1910
1186
+ },
1187
+ {
1188
+ "epoch": 1.44,
1189
+ "learning_rate": 1.900112269921026e-05,
1190
+ "loss": 0.2689,
1191
+ "step": 1920
1192
+ },
1193
+ {
1194
+ "epoch": 1.44,
1195
+ "step": 1922,
1196
+ "total_flos": 3.4599459174219776e+17,
1197
+ "train_loss": 0.3273629436693876,
1198
+ "train_runtime": 4032.5888,
1199
+ "train_samples_per_second": 4.294,
1200
+ "train_steps_per_second": 1.074
1201
+ }
1202
+ ],
1203
+ "logging_steps": 10,
1204
+ "max_steps": 4330,
1205
+ "num_train_epochs": 2,
1206
+ "save_steps": 500,
1207
+ "total_flos": 3.4599459174219776e+17,
1208
+ "trial_name": null,
1209
+ "trial_params": null
1210
+ }
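
The logged learning rates trace a linear warmup followed by cosine decay, consistent with `lr_scheduler_type: cosine` and `warmup_ratio: 0.05` over the scheduler's 4330 max steps (about 217 warmup steps). A small sketch of that shape, reproducing a few of the logged values:

```python
# Sketch of the warmup + cosine learning-rate shape implied by the log above.
# Assumptions: peak lr 3e-5, max_steps 4330, warmup_ratio 0.05 (~217 warmup steps),
# following the usual linear-warmup-then-cosine-decay formula.
import math

peak_lr = 3e-5
max_steps = 4330
warmup_steps = math.ceil(0.05 * max_steps)  # 217

def lr_at(step: int) -> float:
    if step < warmup_steps:
        return peak_lr * step / warmup_steps
    progress = (step - warmup_steps) / (max_steps - warmup_steps)
    return peak_lr * 0.5 * (1.0 + math.cos(math.pi * progress))

print(lr_at(1))    # ~1.3825e-07, cf. the first logged learning_rate
print(lr_at(500))  # ~2.9651e-05, cf. the learning_rate logged at step 500
```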