chansung committed on
Commit
7295abf
1 Parent(s): 33e1318

Model save

Browse files
README.md CHANGED
@@ -2,13 +2,12 @@
2
  license: gemma
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
  - trl
7
  - sft
8
  - generated_from_trainer
9
  base_model: google/gemma-7b
10
  datasets:
11
- - chansung/no_robots_only_coding
12
  model-index:
13
  - name: gemma-7b-sft-qlora-1
14
  results: []
@@ -19,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  # gemma-7b-sft-qlora-1
21
 
22
- This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the chansung/no_robots_only_coding dataset.
23
  It achieves the following results on the evaluation set:
24
- - Loss: 2.2095
25
 
26
  ## Model description
27
 
@@ -58,29 +57,29 @@ The following hyperparameters were used during training:
58
 
59
  | Training Loss | Epoch | Step | Validation Loss |
60
  |:-------------:|:-----:|:----:|:---------------:|
61
- | 23.6212 | 0.91 | 5 | 8.0020 |
62
- | 14.6688 | 2.0 | 11 | 6.8099 |
63
- | 10.8277 | 2.91 | 16 | 6.4585 |
64
- | 10.965 | 4.0 | 22 | 5.2759 |
65
- | 8.3233 | 4.91 | 27 | 1.6939 |
66
- | 2.2795 | 6.0 | 33 | 1.4540 |
67
- | 1.5047 | 6.91 | 38 | 1.3612 |
68
- | 1.3243 | 8.0 | 44 | 1.2886 |
69
- | 1.1264 | 8.91 | 49 | 1.2783 |
70
- | 0.9122 | 10.0 | 55 | 1.2740 |
71
- | 0.8184 | 10.91 | 60 | 1.2854 |
72
- | 0.6918 | 12.0 | 66 | 1.3135 |
73
- | 0.6194 | 12.91 | 71 | 1.3431 |
74
- | 0.5176 | 14.0 | 77 | 1.4737 |
75
- | 0.4514 | 14.91 | 82 | 1.7112 |
76
- | 0.3759 | 16.0 | 88 | 1.8429 |
77
- | 0.3464 | 16.91 | 93 | 1.8994 |
78
- | 0.2681 | 18.0 | 99 | 1.9583 |
79
- | 0.2487 | 18.91 | 104 | 2.1623 |
80
- | 0.2122 | 20.0 | 110 | 2.2136 |
81
- | 0.2036 | 20.91 | 115 | 2.2150 |
82
- | 0.2098 | 22.0 | 121 | 2.2189 |
83
- | 0.1955 | 22.73 | 125 | 2.2095 |
84
 
85
 
86
  ### Framework versions
 
2
  license: gemma
3
  library_name: peft
4
  tags:
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
8
  base_model: google/gemma-7b
9
  datasets:
10
+ - generator
11
  model-index:
12
  - name: gemma-7b-sft-qlora-1
13
  results: []
 
18
 
19
  # gemma-7b-sft-qlora-1
20
 
21
+ This model is a fine-tuned version of [google/gemma-7b](https://huggingface.co/google/gemma-7b) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 2.1615
24
 
25
  ## Model description
26
 
 
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:-----:|:----:|:---------------:|
60
+ | 23.7344 | 0.91 | 5 | 7.9584 |
61
+ | 14.6026 | 2.0 | 11 | 6.8289 |
62
+ | 10.8118 | 2.91 | 16 | 6.4185 |
63
+ | 10.8598 | 4.0 | 22 | 5.1061 |
64
+ | 7.9354 | 4.91 | 27 | 1.7011 |
65
+ | 2.0354 | 6.0 | 33 | 1.4461 |
66
+ | 1.4855 | 6.91 | 38 | 1.3565 |
67
+ | 1.326 | 8.0 | 44 | 1.2935 |
68
+ | 1.1375 | 8.91 | 49 | 1.2696 |
69
+ | 0.9091 | 10.0 | 55 | 1.2716 |
70
+ | 0.8111 | 10.91 | 60 | 1.2861 |
71
+ | 0.689 | 12.0 | 66 | 1.3148 |
72
+ | 0.6341 | 12.91 | 71 | 1.3391 |
73
+ | 0.5359 | 14.0 | 77 | 1.4232 |
74
+ | 0.4664 | 14.91 | 82 | 1.5107 |
75
+ | 0.3951 | 16.0 | 88 | 1.6597 |
76
+ | 0.3593 | 16.91 | 93 | 1.9377 |
77
+ | 0.2802 | 18.0 | 99 | 1.9024 |
78
+ | 0.2613 | 18.91 | 104 | 2.0981 |
79
+ | 0.2262 | 20.0 | 110 | 2.1472 |
80
+ | 0.2169 | 20.91 | 115 | 2.1633 |
81
+ | 0.2232 | 22.0 | 121 | 2.1595 |
82
+ | 0.2096 | 22.73 | 125 | 2.1615 |
83
 
84
 
85
  ### Framework versions
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d3f3e71a4da3a39a6cf7fdcb8dd60e4b87c3164ba403028d7d427b1b50a08331
3
  size 200068904
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4214980a9ececfd4df813d29a61cd992cbd3f80251df558e237efd88cd5a3d15
3
  size 200068904
all_results.json CHANGED
@@ -1,13 +1,8 @@
1
  {
2
  "epoch": 22.73,
3
- "eval_loss": 2.209489345550537,
4
- "eval_runtime": 0.6001,
5
- "eval_samples": 16,
6
- "eval_samples_per_second": 3.333,
7
- "eval_steps_per_second": 1.667,
8
- "train_loss": 3.2900945229530336,
9
- "train_runtime": 472.0874,
10
  "train_samples": 926,
11
- "train_samples_per_second": 4.66,
12
- "train_steps_per_second": 0.265
13
  }
 
1
  {
2
  "epoch": 22.73,
3
+ "train_loss": 3.26698664855957,
4
+ "train_runtime": 470.6237,
 
 
 
 
 
5
  "train_samples": 926,
6
+ "train_samples_per_second": 4.675,
7
+ "train_steps_per_second": 0.266
8
  }
runs/Apr11_22-49-38_deep-diver-main-rare-husky-1-0-0/events.out.tfevents.1712890281.deep-diver-main-rare-husky-1-0-0.520.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f9533a1e5b09043cf2b202b59d59cf89a93c9b47849c11735a807e0c6b99d65d
3
- size 14482
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98dd6fb036556bf6cfa6a7991cb4a6cea6a01d3353d76e98ff8c87e2de6ee7e7
3
+ size 17195
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 22.73,
3
- "train_loss": 3.2900945229530336,
4
- "train_runtime": 472.0874,
5
  "train_samples": 926,
6
- "train_samples_per_second": 4.66,
7
- "train_steps_per_second": 0.265
8
  }
 
1
  {
2
  "epoch": 22.73,
3
+ "train_loss": 3.26698664855957,
4
+ "train_runtime": 470.6237,
5
  "train_samples": 926,
6
+ "train_samples_per_second": 4.675,
7
+ "train_steps_per_second": 0.266
8
  }
trainer_state.json CHANGED
@@ -10,378 +10,378 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.18,
13
- "grad_norm": 220.0,
14
  "learning_rate": 1.5384615384615387e-05,
15
  "loss": 24.9691,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.91,
20
- "grad_norm": 90.0,
21
  "learning_rate": 7.692307692307693e-05,
22
- "loss": 23.6212,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.91,
27
- "eval_loss": 8.00199031829834,
28
- "eval_runtime": 0.5608,
29
- "eval_samples_per_second": 3.566,
30
- "eval_steps_per_second": 1.783,
31
  "step": 5
32
  },
33
  {
34
  "epoch": 1.82,
35
- "grad_norm": 14.8125,
36
  "learning_rate": 0.00015384615384615385,
37
- "loss": 14.6688,
38
  "step": 10
39
  },
40
  {
41
  "epoch": 2.0,
42
- "eval_loss": 6.809861183166504,
43
- "eval_runtime": 0.5624,
44
- "eval_samples_per_second": 3.556,
45
- "eval_steps_per_second": 1.778,
46
  "step": 11
47
  },
48
  {
49
  "epoch": 2.73,
50
- "grad_norm": 5.625,
51
  "learning_rate": 0.00019984268150178167,
52
- "loss": 10.8277,
53
  "step": 15
54
  },
55
  {
56
  "epoch": 2.91,
57
- "eval_loss": 6.458514213562012,
58
- "eval_runtime": 0.5773,
59
- "eval_samples_per_second": 3.465,
60
  "eval_steps_per_second": 1.732,
61
  "step": 16
62
  },
63
  {
64
  "epoch": 3.64,
65
- "grad_norm": 7.65625,
66
  "learning_rate": 0.00019807852804032305,
67
- "loss": 10.965,
68
  "step": 20
69
  },
70
  {
71
  "epoch": 4.0,
72
- "eval_loss": 5.275852203369141,
73
- "eval_runtime": 0.5667,
74
- "eval_samples_per_second": 3.529,
75
- "eval_steps_per_second": 1.765,
76
  "step": 22
77
  },
78
  {
79
  "epoch": 4.55,
80
- "grad_norm": 32.75,
81
  "learning_rate": 0.00019438833303083678,
82
- "loss": 8.3233,
83
  "step": 25
84
  },
85
  {
86
  "epoch": 4.91,
87
- "eval_loss": 1.6938855648040771,
88
- "eval_runtime": 0.5862,
89
- "eval_samples_per_second": 3.412,
90
- "eval_steps_per_second": 1.706,
91
  "step": 27
92
  },
93
  {
94
  "epoch": 5.45,
95
- "grad_norm": 2.890625,
96
  "learning_rate": 0.00018884456359788724,
97
- "loss": 2.2795,
98
  "step": 30
99
  },
100
  {
101
  "epoch": 6.0,
102
- "eval_loss": 1.4539892673492432,
103
- "eval_runtime": 0.5675,
104
- "eval_samples_per_second": 3.524,
105
- "eval_steps_per_second": 1.762,
106
  "step": 33
107
  },
108
  {
109
  "epoch": 6.36,
110
- "grad_norm": 35.25,
111
  "learning_rate": 0.00018155608689592604,
112
- "loss": 1.5047,
113
  "step": 35
114
  },
115
  {
116
  "epoch": 6.91,
117
- "eval_loss": 1.3612087965011597,
118
- "eval_runtime": 0.5882,
119
- "eval_samples_per_second": 3.4,
120
- "eval_steps_per_second": 1.7,
121
  "step": 38
122
  },
123
  {
124
  "epoch": 7.27,
125
- "grad_norm": 1.65625,
126
  "learning_rate": 0.0001726660322034027,
127
- "loss": 1.3243,
128
  "step": 40
129
  },
130
  {
131
  "epoch": 8.0,
132
- "eval_loss": 1.288640022277832,
133
- "eval_runtime": 0.5698,
134
- "eval_samples_per_second": 3.51,
135
- "eval_steps_per_second": 1.755,
136
  "step": 44
137
  },
138
  {
139
  "epoch": 8.18,
140
- "grad_norm": 1.015625,
141
  "learning_rate": 0.00016234898018587337,
142
- "loss": 1.1264,
143
  "step": 45
144
  },
145
  {
146
  "epoch": 8.91,
147
- "eval_loss": 1.2782788276672363,
148
- "eval_runtime": 0.59,
149
- "eval_samples_per_second": 3.39,
150
- "eval_steps_per_second": 1.695,
151
  "step": 49
152
  },
153
  {
154
  "epoch": 9.09,
155
- "grad_norm": 2.1875,
156
  "learning_rate": 0.00015080753452465296,
157
- "loss": 1.0387,
158
  "step": 50
159
  },
160
  {
161
  "epoch": 10.0,
162
- "grad_norm": 0.9765625,
163
  "learning_rate": 0.000138268343236509,
164
- "loss": 0.9122,
165
  "step": 55
166
  },
167
  {
168
  "epoch": 10.0,
169
- "eval_loss": 1.2739648818969727,
170
- "eval_runtime": 0.5659,
171
- "eval_samples_per_second": 3.534,
172
- "eval_steps_per_second": 1.767,
173
  "step": 55
174
  },
175
  {
176
  "epoch": 10.91,
177
- "grad_norm": 0.6953125,
178
  "learning_rate": 0.0001249776478167227,
179
- "loss": 0.8184,
180
  "step": 60
181
  },
182
  {
183
  "epoch": 10.91,
184
- "eval_loss": 1.2853541374206543,
185
- "eval_runtime": 0.5892,
186
- "eval_samples_per_second": 3.395,
187
- "eval_steps_per_second": 1.697,
188
  "step": 60
189
  },
190
  {
191
  "epoch": 11.82,
192
- "grad_norm": 0.96875,
193
  "learning_rate": 0.00011119644761033078,
194
- "loss": 0.6918,
195
  "step": 65
196
  },
197
  {
198
  "epoch": 12.0,
199
- "eval_loss": 1.31352698802948,
200
- "eval_runtime": 0.5674,
201
- "eval_samples_per_second": 3.525,
202
- "eval_steps_per_second": 1.762,
203
  "step": 66
204
  },
205
  {
206
  "epoch": 12.73,
207
- "grad_norm": 0.92578125,
208
  "learning_rate": 9.719537437241312e-05,
209
- "loss": 0.6194,
210
  "step": 70
211
  },
212
  {
213
  "epoch": 12.91,
214
- "eval_loss": 1.343058705329895,
215
- "eval_runtime": 0.7364,
216
- "eval_samples_per_second": 2.716,
217
- "eval_steps_per_second": 1.358,
218
  "step": 71
219
  },
220
  {
221
  "epoch": 13.64,
222
- "grad_norm": 1.1875,
223
  "learning_rate": 8.324937766952638e-05,
224
- "loss": 0.5176,
225
  "step": 75
226
  },
227
  {
228
  "epoch": 14.0,
229
- "eval_loss": 1.4736580848693848,
230
- "eval_runtime": 0.5693,
231
- "eval_samples_per_second": 3.513,
232
- "eval_steps_per_second": 1.757,
233
  "step": 77
234
  },
235
  {
236
  "epoch": 14.55,
237
- "grad_norm": 0.9296875,
238
  "learning_rate": 6.963232548903853e-05,
239
- "loss": 0.4514,
240
  "step": 80
241
  },
242
  {
243
  "epoch": 14.91,
244
- "eval_loss": 1.7112184762954712,
245
- "eval_runtime": 0.6242,
246
- "eval_samples_per_second": 3.204,
247
- "eval_steps_per_second": 1.602,
248
  "step": 82
249
  },
250
  {
251
  "epoch": 15.45,
252
- "grad_norm": 1.34375,
253
  "learning_rate": 5.6611626088244194e-05,
254
- "loss": 0.3759,
255
  "step": 85
256
  },
257
  {
258
  "epoch": 16.0,
259
- "eval_loss": 1.8429330587387085,
260
- "eval_runtime": 0.5694,
261
- "eval_samples_per_second": 3.513,
262
- "eval_steps_per_second": 1.756,
263
  "step": 88
264
  },
265
  {
266
  "epoch": 16.36,
267
- "grad_norm": 0.84375,
268
  "learning_rate": 4.444297669803981e-05,
269
- "loss": 0.3464,
270
  "step": 90
271
  },
272
  {
273
  "epoch": 16.91,
274
- "eval_loss": 1.899384617805481,
275
- "eval_runtime": 0.5863,
276
- "eval_samples_per_second": 3.411,
277
- "eval_steps_per_second": 1.706,
278
  "step": 93
279
  },
280
  {
281
  "epoch": 17.27,
282
- "grad_norm": 0.8203125,
283
  "learning_rate": 3.336534220479961e-05,
284
- "loss": 0.2681,
285
  "step": 95
286
  },
287
  {
288
  "epoch": 18.0,
289
- "eval_loss": 1.9583137035369873,
290
- "eval_runtime": 0.5683,
291
- "eval_samples_per_second": 3.519,
292
- "eval_steps_per_second": 1.76,
293
  "step": 99
294
  },
295
  {
296
  "epoch": 18.18,
297
- "grad_norm": 0.73828125,
298
  "learning_rate": 2.3596262417839255e-05,
299
- "loss": 0.2487,
300
  "step": 100
301
  },
302
  {
303
  "epoch": 18.91,
304
- "eval_loss": 2.162316083908081,
305
- "eval_runtime": 0.5845,
306
- "eval_samples_per_second": 3.422,
307
- "eval_steps_per_second": 1.711,
308
  "step": 104
309
  },
310
  {
311
  "epoch": 19.09,
312
- "grad_norm": 0.875,
313
  "learning_rate": 1.5327580077171587e-05,
314
- "loss": 0.2322,
315
  "step": 105
316
  },
317
  {
318
  "epoch": 20.0,
319
- "grad_norm": 0.6484375,
320
  "learning_rate": 8.72167349386811e-06,
321
- "loss": 0.2122,
322
  "step": 110
323
  },
324
  {
325
  "epoch": 20.0,
326
- "eval_loss": 2.213620901107788,
327
- "eval_runtime": 0.5651,
328
- "eval_samples_per_second": 3.539,
329
- "eval_steps_per_second": 1.769,
330
  "step": 110
331
  },
332
  {
333
  "epoch": 20.91,
334
- "grad_norm": 0.50390625,
335
  "learning_rate": 3.908267805490051e-06,
336
- "loss": 0.2036,
337
  "step": 115
338
  },
339
  {
340
  "epoch": 20.91,
341
- "eval_loss": 2.2149863243103027,
342
- "eval_runtime": 0.5812,
343
- "eval_samples_per_second": 3.441,
344
- "eval_steps_per_second": 1.721,
345
  "step": 115
346
  },
347
  {
348
  "epoch": 21.82,
349
- "grad_norm": 0.458984375,
350
  "learning_rate": 9.818874663554357e-07,
351
- "loss": 0.2098,
352
  "step": 120
353
  },
354
  {
355
  "epoch": 22.0,
356
- "eval_loss": 2.2188880443573,
357
- "eval_runtime": 0.5673,
358
- "eval_samples_per_second": 3.525,
359
- "eval_steps_per_second": 1.763,
360
  "step": 121
361
  },
362
  {
363
  "epoch": 22.73,
364
- "grad_norm": 0.6171875,
365
  "learning_rate": 0.0,
366
- "loss": 0.1955,
367
  "step": 125
368
  },
369
  {
370
  "epoch": 22.73,
371
- "eval_loss": 2.209489345550537,
372
- "eval_runtime": 0.5657,
373
- "eval_samples_per_second": 3.536,
374
- "eval_steps_per_second": 1.768,
375
  "step": 125
376
  },
377
  {
378
  "epoch": 22.73,
379
  "step": 125,
380
- "total_flos": 1.917235948819579e+17,
381
- "train_loss": 3.2900945229530336,
382
- "train_runtime": 472.0874,
383
- "train_samples_per_second": 4.66,
384
- "train_steps_per_second": 0.265
385
  }
386
  ],
387
  "logging_steps": 5,
@@ -389,7 +389,7 @@
389
  "num_input_tokens_seen": 0,
390
  "num_train_epochs": 25,
391
  "save_steps": 100,
392
- "total_flos": 1.917235948819579e+17,
393
  "train_batch_size": 2,
394
  "trial_name": null,
395
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.18,
13
+ "grad_norm": 159.0,
14
  "learning_rate": 1.5384615384615387e-05,
15
  "loss": 24.9691,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.91,
20
+ "grad_norm": 70.5,
21
  "learning_rate": 7.692307692307693e-05,
22
+ "loss": 23.7344,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.91,
27
+ "eval_loss": 7.958380222320557,
28
+ "eval_runtime": 0.5623,
29
+ "eval_samples_per_second": 3.557,
30
+ "eval_steps_per_second": 1.778,
31
  "step": 5
32
  },
33
  {
34
  "epoch": 1.82,
35
+ "grad_norm": 10.75,
36
  "learning_rate": 0.00015384615384615385,
37
+ "loss": 14.6026,
38
  "step": 10
39
  },
40
  {
41
  "epoch": 2.0,
42
+ "eval_loss": 6.82885217666626,
43
+ "eval_runtime": 0.5598,
44
+ "eval_samples_per_second": 3.573,
45
+ "eval_steps_per_second": 1.786,
46
  "step": 11
47
  },
48
  {
49
  "epoch": 2.73,
50
+ "grad_norm": 2.90625,
51
  "learning_rate": 0.00019984268150178167,
52
+ "loss": 10.8118,
53
  "step": 15
54
  },
55
  {
56
  "epoch": 2.91,
57
+ "eval_loss": 6.418484687805176,
58
+ "eval_runtime": 0.5774,
59
+ "eval_samples_per_second": 3.464,
60
  "eval_steps_per_second": 1.732,
61
  "step": 16
62
  },
63
  {
64
  "epoch": 3.64,
65
+ "grad_norm": 4.8125,
66
  "learning_rate": 0.00019807852804032305,
67
+ "loss": 10.8598,
68
  "step": 20
69
  },
70
  {
71
  "epoch": 4.0,
72
+ "eval_loss": 5.106105327606201,
73
+ "eval_runtime": 0.5656,
74
+ "eval_samples_per_second": 3.536,
75
+ "eval_steps_per_second": 1.768,
76
  "step": 22
77
  },
78
  {
79
  "epoch": 4.55,
80
+ "grad_norm": 16.625,
81
  "learning_rate": 0.00019438833303083678,
82
+ "loss": 7.9354,
83
  "step": 25
84
  },
85
  {
86
  "epoch": 4.91,
87
+ "eval_loss": 1.7010552883148193,
88
+ "eval_runtime": 0.5782,
89
+ "eval_samples_per_second": 3.459,
90
+ "eval_steps_per_second": 1.73,
91
  "step": 27
92
  },
93
  {
94
  "epoch": 5.45,
95
+ "grad_norm": 2.21875,
96
  "learning_rate": 0.00018884456359788724,
97
+ "loss": 2.0354,
98
  "step": 30
99
  },
100
  {
101
  "epoch": 6.0,
102
+ "eval_loss": 1.4460557699203491,
103
+ "eval_runtime": 0.5682,
104
+ "eval_samples_per_second": 3.52,
105
+ "eval_steps_per_second": 1.76,
106
  "step": 33
107
  },
108
  {
109
  "epoch": 6.36,
110
+ "grad_norm": 0.9140625,
111
  "learning_rate": 0.00018155608689592604,
112
+ "loss": 1.4855,
113
  "step": 35
114
  },
115
  {
116
  "epoch": 6.91,
117
+ "eval_loss": 1.3564677238464355,
118
+ "eval_runtime": 0.5812,
119
+ "eval_samples_per_second": 3.441,
120
+ "eval_steps_per_second": 1.721,
121
  "step": 38
122
  },
123
  {
124
  "epoch": 7.27,
125
+ "grad_norm": 1.203125,
126
  "learning_rate": 0.0001726660322034027,
127
+ "loss": 1.326,
128
  "step": 40
129
  },
130
  {
131
  "epoch": 8.0,
132
+ "eval_loss": 1.2935034036636353,
133
+ "eval_runtime": 0.5674,
134
+ "eval_samples_per_second": 3.525,
135
+ "eval_steps_per_second": 1.762,
136
  "step": 44
137
  },
138
  {
139
  "epoch": 8.18,
140
+ "grad_norm": 0.9296875,
141
  "learning_rate": 0.00016234898018587337,
142
+ "loss": 1.1375,
143
  "step": 45
144
  },
145
  {
146
  "epoch": 8.91,
147
+ "eval_loss": 1.269553542137146,
148
+ "eval_runtime": 0.5872,
149
+ "eval_samples_per_second": 3.406,
150
+ "eval_steps_per_second": 1.703,
151
  "step": 49
152
  },
153
  {
154
  "epoch": 9.09,
155
+ "grad_norm": 1.0625,
156
  "learning_rate": 0.00015080753452465296,
157
+ "loss": 1.0376,
158
  "step": 50
159
  },
160
  {
161
  "epoch": 10.0,
162
+ "grad_norm": 1.5390625,
163
  "learning_rate": 0.000138268343236509,
164
+ "loss": 0.9091,
165
  "step": 55
166
  },
167
  {
168
  "epoch": 10.0,
169
+ "eval_loss": 1.2716257572174072,
170
+ "eval_runtime": 0.5653,
171
+ "eval_samples_per_second": 3.538,
172
+ "eval_steps_per_second": 1.769,
173
  "step": 55
174
  },
175
  {
176
  "epoch": 10.91,
177
+ "grad_norm": 0.59375,
178
  "learning_rate": 0.0001249776478167227,
179
+ "loss": 0.8111,
180
  "step": 60
181
  },
182
  {
183
  "epoch": 10.91,
184
+ "eval_loss": 1.2860848903656006,
185
+ "eval_runtime": 0.5837,
186
+ "eval_samples_per_second": 3.427,
187
+ "eval_steps_per_second": 1.713,
188
  "step": 60
189
  },
190
  {
191
  "epoch": 11.82,
192
+ "grad_norm": 0.69140625,
193
  "learning_rate": 0.00011119644761033078,
194
+ "loss": 0.689,
195
  "step": 65
196
  },
197
  {
198
  "epoch": 12.0,
199
+ "eval_loss": 1.3148236274719238,
200
+ "eval_runtime": 0.5682,
201
+ "eval_samples_per_second": 3.52,
202
+ "eval_steps_per_second": 1.76,
203
  "step": 66
204
  },
205
  {
206
  "epoch": 12.73,
207
+ "grad_norm": 0.59375,
208
  "learning_rate": 9.719537437241312e-05,
209
+ "loss": 0.6341,
210
  "step": 70
211
  },
212
  {
213
  "epoch": 12.91,
214
+ "eval_loss": 1.3391039371490479,
215
+ "eval_runtime": 0.7408,
216
+ "eval_samples_per_second": 2.7,
217
+ "eval_steps_per_second": 1.35,
218
  "step": 71
219
  },
220
  {
221
  "epoch": 13.64,
222
+ "grad_norm": 0.8984375,
223
  "learning_rate": 8.324937766952638e-05,
224
+ "loss": 0.5359,
225
  "step": 75
226
  },
227
  {
228
  "epoch": 14.0,
229
+ "eval_loss": 1.4231812953948975,
230
+ "eval_runtime": 0.5681,
231
+ "eval_samples_per_second": 3.521,
232
+ "eval_steps_per_second": 1.76,
233
  "step": 77
234
  },
235
  {
236
  "epoch": 14.55,
237
+ "grad_norm": 1.0078125,
238
  "learning_rate": 6.963232548903853e-05,
239
+ "loss": 0.4664,
240
  "step": 80
241
  },
242
  {
243
  "epoch": 14.91,
244
+ "eval_loss": 1.510708212852478,
245
+ "eval_runtime": 0.6308,
246
+ "eval_samples_per_second": 3.171,
247
+ "eval_steps_per_second": 1.585,
248
  "step": 82
249
  },
250
  {
251
  "epoch": 15.45,
252
+ "grad_norm": 0.68359375,
253
  "learning_rate": 5.6611626088244194e-05,
254
+ "loss": 0.3951,
255
  "step": 85
256
  },
257
  {
258
  "epoch": 16.0,
259
+ "eval_loss": 1.6597082614898682,
260
+ "eval_runtime": 0.5686,
261
+ "eval_samples_per_second": 3.517,
262
+ "eval_steps_per_second": 1.759,
263
  "step": 88
264
  },
265
  {
266
  "epoch": 16.36,
267
+ "grad_norm": 0.67578125,
268
  "learning_rate": 4.444297669803981e-05,
269
+ "loss": 0.3593,
270
  "step": 90
271
  },
272
  {
273
  "epoch": 16.91,
274
+ "eval_loss": 1.9376537799835205,
275
+ "eval_runtime": 0.5859,
276
+ "eval_samples_per_second": 3.413,
277
+ "eval_steps_per_second": 1.707,
278
  "step": 93
279
  },
280
  {
281
  "epoch": 17.27,
282
+ "grad_norm": 0.53125,
283
  "learning_rate": 3.336534220479961e-05,
284
+ "loss": 0.2802,
285
  "step": 95
286
  },
287
  {
288
  "epoch": 18.0,
289
+ "eval_loss": 1.9024397134780884,
290
+ "eval_runtime": 0.5686,
291
+ "eval_samples_per_second": 3.518,
292
+ "eval_steps_per_second": 1.759,
293
  "step": 99
294
  },
295
  {
296
  "epoch": 18.18,
297
+ "grad_norm": 0.466796875,
298
  "learning_rate": 2.3596262417839255e-05,
299
+ "loss": 0.2613,
300
  "step": 100
301
  },
302
  {
303
  "epoch": 18.91,
304
+ "eval_loss": 2.098067283630371,
305
+ "eval_runtime": 0.5848,
306
+ "eval_samples_per_second": 3.42,
307
+ "eval_steps_per_second": 1.71,
308
  "step": 104
309
  },
310
  {
311
  "epoch": 19.09,
312
+ "grad_norm": 0.63671875,
313
  "learning_rate": 1.5327580077171587e-05,
314
+ "loss": 0.2442,
315
  "step": 105
316
  },
317
  {
318
  "epoch": 20.0,
319
+ "grad_norm": 0.51171875,
320
  "learning_rate": 8.72167349386811e-06,
321
+ "loss": 0.2262,
322
  "step": 110
323
  },
324
  {
325
  "epoch": 20.0,
326
+ "eval_loss": 2.1472132205963135,
327
+ "eval_runtime": 0.5636,
328
+ "eval_samples_per_second": 3.548,
329
+ "eval_steps_per_second": 1.774,
330
  "step": 110
331
  },
332
  {
333
  "epoch": 20.91,
334
+ "grad_norm": 0.34375,
335
  "learning_rate": 3.908267805490051e-06,
336
+ "loss": 0.2169,
337
  "step": 115
338
  },
339
  {
340
  "epoch": 20.91,
341
+ "eval_loss": 2.1632509231567383,
342
+ "eval_runtime": 0.5774,
343
+ "eval_samples_per_second": 3.464,
344
+ "eval_steps_per_second": 1.732,
345
  "step": 115
346
  },
347
  {
348
  "epoch": 21.82,
349
+ "grad_norm": 0.333984375,
350
  "learning_rate": 9.818874663554357e-07,
351
+ "loss": 0.2232,
352
  "step": 120
353
  },
354
  {
355
  "epoch": 22.0,
356
+ "eval_loss": 2.159508466720581,
357
+ "eval_runtime": 0.5656,
358
+ "eval_samples_per_second": 3.536,
359
+ "eval_steps_per_second": 1.768,
360
  "step": 121
361
  },
362
  {
363
  "epoch": 22.73,
364
+ "grad_norm": 0.46875,
365
  "learning_rate": 0.0,
366
+ "loss": 0.2096,
367
  "step": 125
368
  },
369
  {
370
  "epoch": 22.73,
371
+ "eval_loss": 2.161546468734741,
372
+ "eval_runtime": 0.563,
373
+ "eval_samples_per_second": 3.553,
374
+ "eval_steps_per_second": 1.776,
375
  "step": 125
376
  },
377
  {
378
  "epoch": 22.73,
379
  "step": 125,
380
+ "total_flos": 1.929524923995259e+17,
381
+ "train_loss": 3.26698664855957,
382
+ "train_runtime": 470.6237,
383
+ "train_samples_per_second": 4.675,
384
+ "train_steps_per_second": 0.266
385
  }
386
  ],
387
  "logging_steps": 5,
 
389
  "num_input_tokens_seen": 0,
390
  "num_train_epochs": 25,
391
  "save_steps": 100,
392
+ "total_flos": 1.929524923995259e+17,
393
  "train_batch_size": 2,
394
  "trial_name": null,
395
  "trial_params": null