chansung committed on
Commit
9073cb0
1 Parent(s): 90add56

Model save

Browse files
README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: gemma
3
+ library_name: peft
4
+ tags:
5
+ - trl
6
+ - sft
7
+ - generated_from_trainer
8
+ base_model: google/gemma-2b
9
+ datasets:
10
+ - generator
11
+ model-index:
12
+ - name: gemma2b-summarize-gpt4o-16k
13
+ results: []
14
+ ---
15
+
16
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
+ should probably proofread and complete it, then remove this comment. -->
18
+
19
+ # gemma2b-summarize-gpt4o-16k
20
+
21
+ This model is a fine-tuned version of [google/gemma-2b](https://huggingface.co/google/gemma-2b) on the generator dataset.
22
+ It achieves the following results on the evaluation set:
23
+ - Loss: 2.5758
24
+
25
+ ## Model description
26
+
27
+ More information needed
28
+
29
+ ## Intended uses & limitations
30
+
31
+ More information needed
32
+
33
+ ## Training and evaluation data
34
+
35
+ More information needed
36
+
37
+ ## Training procedure
38
+
39
+ ### Training hyperparameters
40
+
41
+ The following hyperparameters were used during training:
42
+ - learning_rate: 0.0002
43
+ - train_batch_size: 8
44
+ - eval_batch_size: 8
45
+ - seed: 42
46
+ - distributed_type: multi-GPU
47
+ - num_devices: 3
48
+ - gradient_accumulation_steps: 2
49
+ - total_train_batch_size: 48
50
+ - total_eval_batch_size: 24
51
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
52
+ - lr_scheduler_type: cosine
53
+ - lr_scheduler_warmup_ratio: 0.1
54
+ - num_epochs: 10
55
+
56
+ ### Training results
57
+
58
+ | Training Loss | Epoch | Step | Validation Loss |
59
+ |:-------------:|:------:|:----:|:---------------:|
60
+ | 1.8806 | 0.9863 | 36 | 2.6094 |
61
+ | 1.3239 | 2.0 | 73 | 2.5358 |
62
+ | 1.2327 | 2.9863 | 109 | 2.5192 |
63
+ | 1.1735 | 4.0 | 146 | 2.5203 |
64
+ | 1.1354 | 4.9863 | 182 | 2.5467 |
65
+ | 1.1015 | 6.0 | 219 | 2.5496 |
66
+ | 1.0858 | 6.9863 | 255 | 2.5680 |
67
+ | 1.0624 | 8.0 | 292 | 2.5723 |
68
+ | 1.0546 | 8.9863 | 328 | 2.5756 |
69
+ | 1.0623 | 9.8630 | 360 | 2.5758 |
70
+
71
+
72
+ ### Framework versions
73
+
74
+ - PEFT 0.11.1
75
+ - Transformers 4.41.2
76
+ - Pytorch 2.3.0+cu121
77
+ - Datasets 2.19.2
78
+ - Tokenizers 0.19.1
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c48f0f67e6c56e7f4a77ace382ef4a25b0d3cfd16bc4f0768d3c391e324b105f
3
  size 19644912
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:283585d02f26a343c41d2c2dc9db52a2669fe9904094a966bd813779d536c6a6
3
  size 19644912
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.863013698630137,
3
+ "total_flos": 2.1145440197646746e+17,
4
+ "train_loss": 1.288370986117257,
5
+ "train_runtime": 1930.7273,
6
+ "train_samples": 16152,
7
+ "train_samples_per_second": 9.074,
8
+ "train_steps_per_second": 0.186
9
+ }
runs/Jun05_14-12-12_user-HP-Z8-Fury-G5-Workstation-Desktop-PC/events.out.tfevents.1717564350.user-HP-Z8-Fury-G5-Workstation-Desktop-PC.25863.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d04a49d96f138319c7c368833a1c125b2a75916bd0d10c3f434420f96ab7bf08
3
- size 20517
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8d1755fec369e9a066da439dc77107fe925342b73586691d4f238fccae494d8
3
+ size 23945
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 9.863013698630137,
3
+ "total_flos": 2.1145440197646746e+17,
4
+ "train_loss": 1.288370986117257,
5
+ "train_runtime": 1930.7273,
6
+ "train_samples": 16152,
7
+ "train_samples_per_second": 9.074,
8
+ "train_steps_per_second": 0.186
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,633 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 9.863013698630137,
5
+ "eval_steps": 500,
6
+ "global_step": 360,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0273972602739726,
13
+ "grad_norm": 1.8984375,
14
+ "learning_rate": 5.555555555555556e-06,
15
+ "loss": 3.0637,
16
+ "step": 1
17
+ },
18
+ {
19
+ "epoch": 0.136986301369863,
20
+ "grad_norm": 2.265625,
21
+ "learning_rate": 2.777777777777778e-05,
22
+ "loss": 3.056,
23
+ "step": 5
24
+ },
25
+ {
26
+ "epoch": 0.273972602739726,
27
+ "grad_norm": 2.015625,
28
+ "learning_rate": 5.555555555555556e-05,
29
+ "loss": 3.0171,
30
+ "step": 10
31
+ },
32
+ {
33
+ "epoch": 0.410958904109589,
34
+ "grad_norm": 1.9609375,
35
+ "learning_rate": 8.333333333333334e-05,
36
+ "loss": 2.8054,
37
+ "step": 15
38
+ },
39
+ {
40
+ "epoch": 0.547945205479452,
41
+ "grad_norm": 2.1875,
42
+ "learning_rate": 0.00011111111111111112,
43
+ "loss": 2.4802,
44
+ "step": 20
45
+ },
46
+ {
47
+ "epoch": 0.684931506849315,
48
+ "grad_norm": 2.546875,
49
+ "learning_rate": 0.0001388888888888889,
50
+ "loss": 2.2946,
51
+ "step": 25
52
+ },
53
+ {
54
+ "epoch": 0.821917808219178,
55
+ "grad_norm": 2.109375,
56
+ "learning_rate": 0.0001666666666666667,
57
+ "loss": 2.0875,
58
+ "step": 30
59
+ },
60
+ {
61
+ "epoch": 0.958904109589041,
62
+ "grad_norm": 0.57421875,
63
+ "learning_rate": 0.00019444444444444446,
64
+ "loss": 1.8806,
65
+ "step": 35
66
+ },
67
+ {
68
+ "epoch": 0.9863013698630136,
69
+ "eval_loss": 2.6093997955322266,
70
+ "eval_runtime": 0.5633,
71
+ "eval_samples_per_second": 17.753,
72
+ "eval_steps_per_second": 1.775,
73
+ "step": 36
74
+ },
75
+ {
76
+ "epoch": 1.095890410958904,
77
+ "grad_norm": 44.0,
78
+ "learning_rate": 0.00019992479525042303,
79
+ "loss": 1.6979,
80
+ "step": 40
81
+ },
82
+ {
83
+ "epoch": 1.2328767123287672,
84
+ "grad_norm": 0.62890625,
85
+ "learning_rate": 0.00019961946980917456,
86
+ "loss": 1.5925,
87
+ "step": 45
88
+ },
89
+ {
90
+ "epoch": 1.36986301369863,
91
+ "grad_norm": 0.4765625,
92
+ "learning_rate": 0.00019908004033648453,
93
+ "loss": 1.5152,
94
+ "step": 50
95
+ },
96
+ {
97
+ "epoch": 1.5068493150684932,
98
+ "grad_norm": 0.4765625,
99
+ "learning_rate": 0.00019830777448228603,
100
+ "loss": 1.4565,
101
+ "step": 55
102
+ },
103
+ {
104
+ "epoch": 1.643835616438356,
105
+ "grad_norm": 0.310546875,
106
+ "learning_rate": 0.00019730448705798239,
107
+ "loss": 1.3866,
108
+ "step": 60
109
+ },
110
+ {
111
+ "epoch": 1.7808219178082192,
112
+ "grad_norm": 0.31640625,
113
+ "learning_rate": 0.00019607253577167205,
114
+ "loss": 1.3522,
115
+ "step": 65
116
+ },
117
+ {
118
+ "epoch": 1.9178082191780823,
119
+ "grad_norm": 0.369140625,
120
+ "learning_rate": 0.00019461481568757506,
121
+ "loss": 1.3239,
122
+ "step": 70
123
+ },
124
+ {
125
+ "epoch": 2.0,
126
+ "eval_loss": 2.535839080810547,
127
+ "eval_runtime": 0.5617,
128
+ "eval_samples_per_second": 17.804,
129
+ "eval_steps_per_second": 1.78,
130
+ "step": 73
131
+ },
132
+ {
133
+ "epoch": 2.0547945205479454,
134
+ "grad_norm": 0.26953125,
135
+ "learning_rate": 0.00019293475242268223,
136
+ "loss": 1.3035,
137
+ "step": 75
138
+ },
139
+ {
140
+ "epoch": 2.191780821917808,
141
+ "grad_norm": 0.2275390625,
142
+ "learning_rate": 0.0001910362940966147,
143
+ "loss": 1.2854,
144
+ "step": 80
145
+ },
146
+ {
147
+ "epoch": 2.328767123287671,
148
+ "grad_norm": 0.306640625,
149
+ "learning_rate": 0.00018892390205361062,
150
+ "loss": 1.2525,
151
+ "step": 85
152
+ },
153
+ {
154
+ "epoch": 2.4657534246575343,
155
+ "grad_norm": 0.578125,
156
+ "learning_rate": 0.00018660254037844388,
157
+ "loss": 1.26,
158
+ "step": 90
159
+ },
160
+ {
161
+ "epoch": 2.602739726027397,
162
+ "grad_norm": 0.54296875,
163
+ "learning_rate": 0.00018407766423091034,
164
+ "loss": 1.2323,
165
+ "step": 95
166
+ },
167
+ {
168
+ "epoch": 2.73972602739726,
169
+ "grad_norm": 0.28515625,
170
+ "learning_rate": 0.00018135520702629675,
171
+ "loss": 1.2267,
172
+ "step": 100
173
+ },
174
+ {
175
+ "epoch": 2.8767123287671232,
176
+ "grad_norm": 0.267578125,
177
+ "learning_rate": 0.00017844156649195759,
178
+ "loss": 1.2327,
179
+ "step": 105
180
+ },
181
+ {
182
+ "epoch": 2.9863013698630136,
183
+ "eval_loss": 2.519155979156494,
184
+ "eval_runtime": 0.5663,
185
+ "eval_samples_per_second": 17.659,
186
+ "eval_steps_per_second": 1.766,
187
+ "step": 109
188
+ },
189
+ {
190
+ "epoch": 3.0136986301369864,
191
+ "grad_norm": 0.275390625,
192
+ "learning_rate": 0.00017534358963276607,
193
+ "loss": 1.2242,
194
+ "step": 110
195
+ },
196
+ {
197
+ "epoch": 3.1506849315068495,
198
+ "grad_norm": 0.30078125,
199
+ "learning_rate": 0.00017206855664077147,
200
+ "loss": 1.1938,
201
+ "step": 115
202
+ },
203
+ {
204
+ "epoch": 3.287671232876712,
205
+ "grad_norm": 0.51171875,
206
+ "learning_rate": 0.0001686241637868734,
207
+ "loss": 1.1879,
208
+ "step": 120
209
+ },
210
+ {
211
+ "epoch": 3.4246575342465753,
212
+ "grad_norm": 0.48828125,
213
+ "learning_rate": 0.00016501850533471836,
214
+ "loss": 1.1748,
215
+ "step": 125
216
+ },
217
+ {
218
+ "epoch": 3.5616438356164384,
219
+ "grad_norm": 0.29296875,
220
+ "learning_rate": 0.0001612600545193203,
221
+ "loss": 1.1704,
222
+ "step": 130
223
+ },
224
+ {
225
+ "epoch": 3.6986301369863015,
226
+ "grad_norm": 0.6015625,
227
+ "learning_rate": 0.0001573576436351046,
228
+ "loss": 1.1679,
229
+ "step": 135
230
+ },
231
+ {
232
+ "epoch": 3.8356164383561646,
233
+ "grad_norm": 0.392578125,
234
+ "learning_rate": 0.00015332044328016914,
235
+ "loss": 1.1663,
236
+ "step": 140
237
+ },
238
+ {
239
+ "epoch": 3.9726027397260273,
240
+ "grad_norm": 0.33203125,
241
+ "learning_rate": 0.00014915794080553707,
242
+ "loss": 1.1735,
243
+ "step": 145
244
+ },
245
+ {
246
+ "epoch": 4.0,
247
+ "eval_loss": 2.520301103591919,
248
+ "eval_runtime": 0.5527,
249
+ "eval_samples_per_second": 18.093,
250
+ "eval_steps_per_second": 1.809,
251
+ "step": 146
252
+ },
253
+ {
254
+ "epoch": 4.109589041095891,
255
+ "grad_norm": 0.302734375,
256
+ "learning_rate": 0.00014487991802004623,
257
+ "loss": 1.144,
258
+ "step": 150
259
+ },
260
+ {
261
+ "epoch": 4.2465753424657535,
262
+ "grad_norm": 0.310546875,
263
+ "learning_rate": 0.00014049642820326735,
264
+ "loss": 1.1273,
265
+ "step": 155
266
+ },
267
+ {
268
+ "epoch": 4.383561643835616,
269
+ "grad_norm": 0.51953125,
270
+ "learning_rate": 0.00013601777248047105,
271
+ "loss": 1.1344,
272
+ "step": 160
273
+ },
274
+ {
275
+ "epoch": 4.52054794520548,
276
+ "grad_norm": 0.6171875,
277
+ "learning_rate": 0.00013145447561516138,
278
+ "loss": 1.1364,
279
+ "step": 165
280
+ },
281
+ {
282
+ "epoch": 4.657534246575342,
283
+ "grad_norm": 0.50390625,
284
+ "learning_rate": 0.00012681726127606376,
285
+ "loss": 1.1344,
286
+ "step": 170
287
+ },
288
+ {
289
+ "epoch": 4.794520547945205,
290
+ "grad_norm": 0.453125,
291
+ "learning_rate": 0.00012211702683668878,
292
+ "loss": 1.1252,
293
+ "step": 175
294
+ },
295
+ {
296
+ "epoch": 4.931506849315069,
297
+ "grad_norm": 0.310546875,
298
+ "learning_rate": 0.00011736481776669306,
299
+ "loss": 1.1354,
300
+ "step": 180
301
+ },
302
+ {
303
+ "epoch": 4.986301369863014,
304
+ "eval_loss": 2.546712875366211,
305
+ "eval_runtime": 0.5597,
306
+ "eval_samples_per_second": 17.868,
307
+ "eval_steps_per_second": 1.787,
308
+ "step": 182
309
+ },
310
+ {
311
+ "epoch": 5.068493150684931,
312
+ "grad_norm": 0.419921875,
313
+ "learning_rate": 0.00011257180167521629,
314
+ "loss": 1.1069,
315
+ "step": 185
316
+ },
317
+ {
318
+ "epoch": 5.205479452054795,
319
+ "grad_norm": 0.515625,
320
+ "learning_rate": 0.0001077492420671931,
321
+ "loss": 1.0983,
322
+ "step": 190
323
+ },
324
+ {
325
+ "epoch": 5.342465753424658,
326
+ "grad_norm": 0.373046875,
327
+ "learning_rate": 0.00010290847187431113,
328
+ "loss": 1.0952,
329
+ "step": 195
330
+ },
331
+ {
332
+ "epoch": 5.47945205479452,
333
+ "grad_norm": 0.50390625,
334
+ "learning_rate": 9.806086682281758e-05,
335
+ "loss": 1.1105,
336
+ "step": 200
337
+ },
338
+ {
339
+ "epoch": 5.616438356164384,
340
+ "grad_norm": 0.470703125,
341
+ "learning_rate": 9.321781870075908e-05,
342
+ "loss": 1.1041,
343
+ "step": 205
344
+ },
345
+ {
346
+ "epoch": 5.7534246575342465,
347
+ "grad_norm": 0.4296875,
348
+ "learning_rate": 8.839070858747697e-05,
349
+ "loss": 1.0954,
350
+ "step": 210
351
+ },
352
+ {
353
+ "epoch": 5.890410958904109,
354
+ "grad_norm": 0.361328125,
355
+ "learning_rate": 8.35908801082676e-05,
356
+ "loss": 1.1015,
357
+ "step": 215
358
+ },
359
+ {
360
+ "epoch": 6.0,
361
+ "eval_loss": 2.5496411323547363,
362
+ "eval_runtime": 0.5525,
363
+ "eval_samples_per_second": 18.099,
364
+ "eval_steps_per_second": 1.81,
365
+ "step": 219
366
+ },
367
+ {
368
+ "epoch": 6.027397260273973,
369
+ "grad_norm": 0.35546875,
370
+ "learning_rate": 7.882961277705895e-05,
371
+ "loss": 1.0967,
372
+ "step": 220
373
+ },
374
+ {
375
+ "epoch": 6.164383561643835,
376
+ "grad_norm": 0.369140625,
377
+ "learning_rate": 7.411809548974792e-05,
378
+ "loss": 1.0792,
379
+ "step": 225
380
+ },
381
+ {
382
+ "epoch": 6.301369863013699,
383
+ "grad_norm": 0.4140625,
384
+ "learning_rate": 6.94674002304887e-05,
385
+ "loss": 1.0787,
386
+ "step": 230
387
+ },
388
+ {
389
+ "epoch": 6.438356164383562,
390
+ "grad_norm": 0.357421875,
391
+ "learning_rate": 6.488845605272113e-05,
392
+ "loss": 1.0699,
393
+ "step": 235
394
+ },
395
+ {
396
+ "epoch": 6.575342465753424,
397
+ "grad_norm": 0.384765625,
398
+ "learning_rate": 6.039202339608432e-05,
399
+ "loss": 1.0875,
400
+ "step": 240
401
+ },
402
+ {
403
+ "epoch": 6.712328767123288,
404
+ "grad_norm": 0.39453125,
405
+ "learning_rate": 5.5988668799569545e-05,
406
+ "loss": 1.0721,
407
+ "step": 245
408
+ },
409
+ {
410
+ "epoch": 6.8493150684931505,
411
+ "grad_norm": 0.33984375,
412
+ "learning_rate": 5.168874007033615e-05,
413
+ "loss": 1.0769,
414
+ "step": 250
415
+ },
416
+ {
417
+ "epoch": 6.986301369863014,
418
+ "grad_norm": 0.408203125,
419
+ "learning_rate": 4.7502341966544e-05,
420
+ "loss": 1.0858,
421
+ "step": 255
422
+ },
423
+ {
424
+ "epoch": 6.986301369863014,
425
+ "eval_loss": 2.568040609359741,
426
+ "eval_runtime": 0.5489,
427
+ "eval_samples_per_second": 18.22,
428
+ "eval_steps_per_second": 1.822,
429
+ "step": 255
430
+ },
431
+ {
432
+ "epoch": 7.123287671232877,
433
+ "grad_norm": 0.36328125,
434
+ "learning_rate": 4.343931245134616e-05,
435
+ "loss": 1.0577,
436
+ "step": 260
437
+ },
438
+ {
439
+ "epoch": 7.260273972602739,
440
+ "grad_norm": 0.515625,
441
+ "learning_rate": 3.950919957384582e-05,
442
+ "loss": 1.0614,
443
+ "step": 265
444
+ },
445
+ {
446
+ "epoch": 7.397260273972603,
447
+ "grad_norm": 0.388671875,
448
+ "learning_rate": 3.5721239031346066e-05,
449
+ "loss": 1.0698,
450
+ "step": 270
451
+ },
452
+ {
453
+ "epoch": 7.534246575342466,
454
+ "grad_norm": 0.42578125,
455
+ "learning_rate": 3.2084332465620694e-05,
456
+ "loss": 1.0695,
457
+ "step": 275
458
+ },
459
+ {
460
+ "epoch": 7.671232876712329,
461
+ "grad_norm": 0.408203125,
462
+ "learning_rate": 2.8607026544210114e-05,
463
+ "loss": 1.0702,
464
+ "step": 280
465
+ },
466
+ {
467
+ "epoch": 7.808219178082192,
468
+ "grad_norm": 0.392578125,
469
+ "learning_rate": 2.529749287590042e-05,
470
+ "loss": 1.0632,
471
+ "step": 285
472
+ },
473
+ {
474
+ "epoch": 7.945205479452055,
475
+ "grad_norm": 0.40625,
476
+ "learning_rate": 2.2163508807583998e-05,
477
+ "loss": 1.0624,
478
+ "step": 290
479
+ },
480
+ {
481
+ "epoch": 8.0,
482
+ "eval_loss": 2.572319746017456,
483
+ "eval_runtime": 0.5428,
484
+ "eval_samples_per_second": 18.424,
485
+ "eval_steps_per_second": 1.842,
486
+ "step": 292
487
+ },
488
+ {
489
+ "epoch": 8.082191780821917,
490
+ "grad_norm": 0.359375,
491
+ "learning_rate": 1.921243914762889e-05,
492
+ "loss": 1.0583,
493
+ "step": 295
494
+ },
495
+ {
496
+ "epoch": 8.219178082191782,
497
+ "grad_norm": 0.33203125,
498
+ "learning_rate": 1.6451218858706374e-05,
499
+ "loss": 1.0536,
500
+ "step": 300
501
+ },
502
+ {
503
+ "epoch": 8.356164383561644,
504
+ "grad_norm": 0.359375,
505
+ "learning_rate": 1.388633676074862e-05,
506
+ "loss": 1.0613,
507
+ "step": 305
508
+ },
509
+ {
510
+ "epoch": 8.493150684931507,
511
+ "grad_norm": 0.353515625,
512
+ "learning_rate": 1.1523820282334219e-05,
513
+ "loss": 1.0589,
514
+ "step": 310
515
+ },
516
+ {
517
+ "epoch": 8.63013698630137,
518
+ "grad_norm": 0.337890625,
519
+ "learning_rate": 9.369221296335006e-06,
520
+ "loss": 1.0664,
521
+ "step": 315
522
+ },
523
+ {
524
+ "epoch": 8.767123287671232,
525
+ "grad_norm": 0.337890625,
526
+ "learning_rate": 7.427603073110967e-06,
527
+ "loss": 1.0656,
528
+ "step": 320
529
+ },
530
+ {
531
+ "epoch": 8.904109589041095,
532
+ "grad_norm": 0.330078125,
533
+ "learning_rate": 5.7035283819124155e-06,
534
+ "loss": 1.0546,
535
+ "step": 325
536
+ },
537
+ {
538
+ "epoch": 8.986301369863014,
539
+ "eval_loss": 2.5756430625915527,
540
+ "eval_runtime": 0.5591,
541
+ "eval_samples_per_second": 17.886,
542
+ "eval_steps_per_second": 1.789,
543
+ "step": 328
544
+ },
545
+ {
546
+ "epoch": 9.04109589041096,
547
+ "grad_norm": 0.33203125,
548
+ "learning_rate": 4.20104876845111e-06,
549
+ "loss": 1.0515,
550
+ "step": 330
551
+ },
552
+ {
553
+ "epoch": 9.178082191780822,
554
+ "grad_norm": 0.3515625,
555
+ "learning_rate": 2.9236950338380033e-06,
556
+ "loss": 1.0521,
557
+ "step": 335
558
+ },
559
+ {
560
+ "epoch": 9.315068493150685,
561
+ "grad_norm": 0.3359375,
562
+ "learning_rate": 1.874468937261531e-06,
563
+ "loss": 1.0563,
564
+ "step": 340
565
+ },
566
+ {
567
+ "epoch": 9.452054794520548,
568
+ "grad_norm": 0.341796875,
569
+ "learning_rate": 1.055836141905553e-06,
570
+ "loss": 1.0535,
571
+ "step": 345
572
+ },
573
+ {
574
+ "epoch": 9.58904109589041,
575
+ "grad_norm": 0.349609375,
576
+ "learning_rate": 4.6972042068341714e-07,
577
+ "loss": 1.0521,
578
+ "step": 350
579
+ },
580
+ {
581
+ "epoch": 9.726027397260275,
582
+ "grad_norm": 0.341796875,
583
+ "learning_rate": 1.1749913540496371e-07,
584
+ "loss": 1.0664,
585
+ "step": 355
586
+ },
587
+ {
588
+ "epoch": 9.863013698630137,
589
+ "grad_norm": 0.345703125,
590
+ "learning_rate": 0.0,
591
+ "loss": 1.0623,
592
+ "step": 360
593
+ },
594
+ {
595
+ "epoch": 9.863013698630137,
596
+ "eval_loss": 2.5758092403411865,
597
+ "eval_runtime": 0.5499,
598
+ "eval_samples_per_second": 18.186,
599
+ "eval_steps_per_second": 1.819,
600
+ "step": 360
601
+ },
602
+ {
603
+ "epoch": 9.863013698630137,
604
+ "step": 360,
605
+ "total_flos": 2.1145440197646746e+17,
606
+ "train_loss": 1.288370986117257,
607
+ "train_runtime": 1930.7273,
608
+ "train_samples_per_second": 9.074,
609
+ "train_steps_per_second": 0.186
610
+ }
611
+ ],
612
+ "logging_steps": 5,
613
+ "max_steps": 360,
614
+ "num_input_tokens_seen": 0,
615
+ "num_train_epochs": 10,
616
+ "save_steps": 100,
617
+ "stateful_callbacks": {
618
+ "TrainerControl": {
619
+ "args": {
620
+ "should_epoch_stop": false,
621
+ "should_evaluate": false,
622
+ "should_log": false,
623
+ "should_save": true,
624
+ "should_training_stop": false
625
+ },
626
+ "attributes": {}
627
+ }
628
+ },
629
+ "total_flos": 2.1145440197646746e+17,
630
+ "train_batch_size": 8,
631
+ "trial_name": null,
632
+ "trial_params": null
633
+ }