Commit 696b7a3
tyzhu committed
1 Parent(s): 56da1c8

End of training

Files changed (6)
  1. README.md +14 -2
  2. all_results.json +16 -0
  3. eval_results.json +10 -0
  4. tokenizer.json +1 -6
  5. train_results.json +9 -0
  6. trainer_state.json +686 -0
README.md CHANGED
@@ -3,11 +3,23 @@ license: other
 base_model: Qwen/Qwen1.5-4B
 tags:
 - generated_from_trainer
+datasets:
+- tyzhu/lmind_nq_train6000_eval6489_v1_docidx_v3
 metrics:
 - accuracy
 model-index:
 - name: lmind_nq_train6000_eval6489_v1_docidx_v3_Qwen_Qwen1.5-4B_5e-5_lora2
-  results: []
+  results:
+  - task:
+      name: Causal Language Modeling
+      type: text-generation
+    dataset:
+      name: tyzhu/lmind_nq_train6000_eval6489_v1_docidx_v3
+      type: tyzhu/lmind_nq_train6000_eval6489_v1_docidx_v3
+    metrics:
+    - name: Accuracy
+      type: accuracy
+      value: 0.42728205128205127
 library_name: peft
 ---
 
@@ -16,7 +28,7 @@ should probably proofread and complete it, then remove this comment. -->
 
 # lmind_nq_train6000_eval6489_v1_docidx_v3_Qwen_Qwen1.5-4B_5e-5_lora2
 
-This model is a fine-tuned version of [Qwen/Qwen1.5-4B](https://huggingface.co/Qwen/Qwen1.5-4B) on an unknown dataset.
+This model is a fine-tuned version of [Qwen/Qwen1.5-4B](https://huggingface.co/Qwen/Qwen1.5-4B) on the tyzhu/lmind_nq_train6000_eval6489_v1_docidx_v3 dataset.
 It achieves the following results on the evaluation set:
 - Loss: 5.0355
 - Accuracy: 0.4273
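
The card lists `library_name: peft`, so the repository contains a LoRA adapter rather than full model weights. A minimal loading sketch; the Hub repo id is assumed from the model name above, and the prompt is purely illustrative:

```python
from transformers import AutoTokenizer
from peft import AutoPeftModelForCausalLM

# Assumed Hub repo id for the adapter, inferred from the model name in the card.
adapter_id = "tyzhu/lmind_nq_train6000_eval6489_v1_docidx_v3_Qwen_Qwen1.5-4B_5e-5_lora2"

# AutoPeftModelForCausalLM reads adapter_config.json, downloads the base model
# (Qwen/Qwen1.5-4B) and attaches the LoRA weights on top of it.
model = AutoPeftModelForCausalLM.from_pretrained(adapter_id, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-4B")

prompt = "where is the eiffel tower located?"  # illustrative prompt only
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```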
all_results.json ADDED
@@ -0,0 +1,16 @@
+{
+  "epoch": 19.970717423133237,
+  "eval_accuracy": 0.42728205128205127,
+  "eval_loss": 5.035463809967041,
+  "eval_runtime": 5.6207,
+  "eval_samples": 500,
+  "eval_samples_per_second": 88.957,
+  "eval_steps_per_second": 11.209,
+  "perplexity": 153.77089654928625,
+  "total_flos": 5.856471132500132e+17,
+  "train_loss": 1.188865839770812,
+  "train_runtime": 15402.4164,
+  "train_samples": 10925,
+  "train_samples_per_second": 14.186,
+  "train_steps_per_second": 0.443
+}
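
The `perplexity` field in all_results.json is simply `exp(eval_loss)`, and the throughput fields follow from the sample counts and runtimes. A quick sanity check on the numbers above:

```python
import math

eval_loss = 5.035463809967041
eval_runtime = 5.6207          # seconds
eval_samples = 500
train_runtime = 15402.4164     # seconds

print(math.exp(eval_loss))          # ~153.77, matches "perplexity"
print(eval_samples / eval_runtime)  # ~88.96, matches "eval_samples_per_second"
print(train_runtime / 3600)         # ~4.3 hours of training
```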
eval_results.json ADDED
@@ -0,0 +1,10 @@
+{
+  "epoch": 19.970717423133237,
+  "eval_accuracy": 0.42728205128205127,
+  "eval_loss": 5.035463809967041,
+  "eval_runtime": 5.6207,
+  "eval_samples": 500,
+  "eval_samples_per_second": 88.957,
+  "eval_steps_per_second": 11.209,
+  "perplexity": 153.77089654928625
+}
tokenizer.json CHANGED
@@ -1,11 +1,6 @@
 {
   "version": "1.0",
-  "truncation": {
-    "direction": "Right",
-    "max_length": 1024,
-    "strategy": "LongestFirst",
-    "stride": 0
-  },
+  "truncation": null,
   "padding": null,
   "added_tokens": [
     {
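
The only change to tokenizer.json is that the truncation rule saved during training (right-side, max_length 1024) is reset to `null`, so the exported tokenizer no longer truncates inputs by default. A sketch with the `tokenizers` library, assuming a local copy of the file:

```python
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
print(tok.truncation)  # None after this commit; previously a dict with max_length=1024

# Re-enable the old behaviour explicitly if inputs must be capped at 1024 tokens:
tok.enable_truncation(max_length=1024, direction="right", strategy="longest_first")
print(len(tok.encode("word " * 5000).ids))  # at most 1024 once truncation is enabled
```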
train_results.json ADDED
@@ -0,0 +1,9 @@
+{
+  "epoch": 19.970717423133237,
+  "total_flos": 5.856471132500132e+17,
+  "train_loss": 1.188865839770812,
+  "train_runtime": 15402.4164,
+  "train_samples": 10925,
+  "train_samples_per_second": 14.186,
+  "train_steps_per_second": 0.443
+}
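
Together with trainer_state.json below (`train_batch_size: 1`, `max_steps: 6820`, 20 epochs over 10925 samples), these numbers imply an effective batch size of roughly 32 sequences per optimizer step; how that splits between gradient accumulation and data parallelism is not recorded here. A back-of-the-envelope check:

```python
train_samples = 10925
num_train_epochs = 20
max_steps = 6820

# Samples seen per optimizer step, assuming every sample is used once per epoch.
effective_batch_size = train_samples * num_train_epochs / max_steps
print(round(effective_batch_size, 2))  # ~32.04 -> effective batch size of about 32
```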
trainer_state.json ADDED
@@ -0,0 +1,686 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 19.970717423133237,
+  "eval_steps": 500,
+  "global_step": 6820,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.29282576866764276,
+      "grad_norm": 0.25126561522483826,
+      "learning_rate": 5e-05,
+      "loss": 2.0212,
+      "step": 100
+    },
+    {
+      "epoch": 0.5856515373352855,
+      "grad_norm": 0.2594548463821411,
+      "learning_rate": 5e-05,
+      "loss": 1.9601,
+      "step": 200
+    },
+    {
+      "epoch": 0.8784773060029283,
+      "grad_norm": 0.267731249332428,
+      "learning_rate": 5e-05,
+      "loss": 1.9626,
+      "step": 300
+    },
+    {
+      "epoch": 0.9985358711566618,
+      "eval_accuracy": 0.4726666666666667,
+      "eval_loss": 2.9919469356536865,
+      "eval_runtime": 5.796,
+      "eval_samples_per_second": 86.266,
+      "eval_steps_per_second": 10.87,
+      "step": 341
+    },
+    {
+      "epoch": 1.171303074670571,
+      "grad_norm": 0.3525784909725189,
+      "learning_rate": 5e-05,
+      "loss": 1.9477,
+      "step": 400
+    },
+    {
+      "epoch": 1.4641288433382138,
+      "grad_norm": 0.3649675250053406,
+      "learning_rate": 5e-05,
+      "loss": 1.9044,
+      "step": 500
+    },
+    {
+      "epoch": 1.7569546120058566,
+      "grad_norm": 0.4195611774921417,
+      "learning_rate": 5e-05,
+      "loss": 1.9158,
+      "step": 600
+    },
+    {
+      "epoch": 2.0,
+      "eval_accuracy": 0.47374358974358977,
+      "eval_loss": 2.9863882064819336,
+      "eval_runtime": 5.61,
+      "eval_samples_per_second": 89.126,
+      "eval_steps_per_second": 11.23,
+      "step": 683
+    },
+    {
+      "epoch": 2.049780380673499,
+      "grad_norm": 0.41343557834625244,
+      "learning_rate": 5e-05,
+      "loss": 1.9046,
+      "step": 700
+    },
+    {
+      "epoch": 2.342606149341142,
+      "grad_norm": 0.49672290682792664,
+      "learning_rate": 5e-05,
+      "loss": 1.8367,
+      "step": 800
+    },
+    {
+      "epoch": 2.6354319180087846,
+      "grad_norm": 0.5728709697723389,
+      "learning_rate": 5e-05,
+      "loss": 1.8573,
+      "step": 900
+    },
+    {
+      "epoch": 2.9282576866764276,
+      "grad_norm": 0.5571523904800415,
+      "learning_rate": 5e-05,
+      "loss": 1.8622,
+      "step": 1000
+    },
+    {
+      "epoch": 2.998535871156662,
+      "eval_accuracy": 0.47102564102564104,
+      "eval_loss": 3.0419609546661377,
+      "eval_runtime": 5.7173,
+      "eval_samples_per_second": 87.454,
+      "eval_steps_per_second": 11.019,
+      "step": 1024
+    },
+    {
+      "epoch": 3.22108345534407,
+      "grad_norm": 0.5878937840461731,
+      "learning_rate": 5e-05,
+      "loss": 1.7839,
+      "step": 1100
+    },
+    {
+      "epoch": 3.513909224011713,
+      "grad_norm": 0.6718343496322632,
+      "learning_rate": 5e-05,
+      "loss": 1.7601,
+      "step": 1200
+    },
+    {
+      "epoch": 3.8067349926793557,
+      "grad_norm": 0.7238633632659912,
+      "learning_rate": 5e-05,
+      "loss": 1.786,
+      "step": 1300
+    },
+    {
+      "epoch": 4.0,
+      "eval_accuracy": 0.46615384615384614,
+      "eval_loss": 3.1526739597320557,
+      "eval_runtime": 5.5502,
+      "eval_samples_per_second": 90.087,
+      "eval_steps_per_second": 11.351,
+      "step": 1366
+    },
+    {
+      "epoch": 4.099560761346998,
+      "grad_norm": 0.8255831599235535,
+      "learning_rate": 5e-05,
+      "loss": 1.7459,
+      "step": 1400
+    },
+    {
+      "epoch": 4.392386530014641,
+      "grad_norm": 0.7857334613800049,
+      "learning_rate": 5e-05,
+      "loss": 1.6799,
+      "step": 1500
+    },
+    {
+      "epoch": 4.685212298682284,
+      "grad_norm": 0.8550590872764587,
+      "learning_rate": 5e-05,
+      "loss": 1.68,
+      "step": 1600
+    },
+    {
+      "epoch": 4.978038067349927,
+      "grad_norm": 0.9140918254852295,
+      "learning_rate": 5e-05,
+      "loss": 1.7019,
+      "step": 1700
+    },
+    {
+      "epoch": 4.998535871156662,
+      "eval_accuracy": 0.4634358974358974,
+      "eval_loss": 3.381866693496704,
+      "eval_runtime": 5.9878,
+      "eval_samples_per_second": 83.503,
+      "eval_steps_per_second": 10.521,
+      "step": 1707
+    },
+    {
+      "epoch": 5.270863836017569,
+      "grad_norm": 0.958633542060852,
+      "learning_rate": 5e-05,
+      "loss": 1.6064,
+      "step": 1800
+    },
+    {
+      "epoch": 5.563689604685212,
+      "grad_norm": 0.9240352511405945,
+      "learning_rate": 5e-05,
+      "loss": 1.5911,
+      "step": 1900
+    },
+    {
+      "epoch": 5.856515373352855,
+      "grad_norm": 0.971266508102417,
+      "learning_rate": 5e-05,
+      "loss": 1.6036,
+      "step": 2000
+    },
+    {
+      "epoch": 6.0,
+      "eval_accuracy": 0.45887179487179486,
+      "eval_loss": 3.4968838691711426,
+      "eval_runtime": 5.9643,
+      "eval_samples_per_second": 83.832,
+      "eval_steps_per_second": 10.563,
+      "step": 2049
+    },
+    {
+      "epoch": 6.149341142020498,
+      "grad_norm": 1.107942819595337,
+      "learning_rate": 5e-05,
+      "loss": 1.5305,
+      "step": 2100
+    },
+    {
+      "epoch": 6.44216691068814,
+      "grad_norm": 1.133771300315857,
+      "learning_rate": 5e-05,
+      "loss": 1.4897,
+      "step": 2200
+    },
+    {
+      "epoch": 6.734992679355783,
+      "grad_norm": 1.037837266921997,
+      "learning_rate": 5e-05,
+      "loss": 1.5175,
+      "step": 2300
+    },
+    {
+      "epoch": 6.998535871156662,
+      "eval_accuracy": 0.45774358974358975,
+      "eval_loss": 3.64119553565979,
+      "eval_runtime": 5.7326,
+      "eval_samples_per_second": 87.22,
+      "eval_steps_per_second": 10.99,
+      "step": 2390
+    },
+    {
+      "epoch": 7.027818448023426,
+      "grad_norm": 1.2089684009552002,
+      "learning_rate": 5e-05,
+      "loss": 1.4929,
+      "step": 2400
+    },
+    {
+      "epoch": 7.320644216691068,
+      "grad_norm": 1.2794983386993408,
+      "learning_rate": 5e-05,
+      "loss": 1.3851,
+      "step": 2500
+    },
+    {
+      "epoch": 7.613469985358711,
+      "grad_norm": 1.2079399824142456,
+      "learning_rate": 5e-05,
+      "loss": 1.4075,
+      "step": 2600
+    },
+    {
+      "epoch": 7.906295754026354,
+      "grad_norm": 1.1294775009155273,
+      "learning_rate": 5e-05,
+      "loss": 1.4007,
+      "step": 2700
+    },
+    {
+      "epoch": 8.0,
+      "eval_accuracy": 0.45374358974358975,
+      "eval_loss": 3.831012487411499,
+      "eval_runtime": 5.871,
+      "eval_samples_per_second": 85.164,
+      "eval_steps_per_second": 10.731,
+      "step": 2732
+    },
+    {
+      "epoch": 8.199121522693996,
+      "grad_norm": 1.3208889961242676,
+      "learning_rate": 5e-05,
+      "loss": 1.3076,
+      "step": 2800
+    },
+    {
+      "epoch": 8.49194729136164,
+      "grad_norm": 1.345077395439148,
+      "learning_rate": 5e-05,
+      "loss": 1.2867,
+      "step": 2900
+    },
+    {
+      "epoch": 8.784773060029282,
+      "grad_norm": 1.4141809940338135,
+      "learning_rate": 5e-05,
+      "loss": 1.326,
+      "step": 3000
+    },
+    {
+      "epoch": 8.998535871156662,
+      "eval_accuracy": 0.44866666666666666,
+      "eval_loss": 3.9176931381225586,
+      "eval_runtime": 5.7802,
+      "eval_samples_per_second": 86.502,
+      "eval_steps_per_second": 10.899,
+      "step": 3073
+    },
+    {
+      "epoch": 9.077598828696924,
+      "grad_norm": 1.3622578382492065,
+      "learning_rate": 5e-05,
+      "loss": 1.2725,
+      "step": 3100
+    },
+    {
+      "epoch": 9.370424597364568,
+      "grad_norm": 1.632681965827942,
+      "learning_rate": 5e-05,
+      "loss": 1.1894,
+      "step": 3200
+    },
+    {
+      "epoch": 9.66325036603221,
+      "grad_norm": 1.5909069776535034,
+      "learning_rate": 5e-05,
+      "loss": 1.1959,
+      "step": 3300
+    },
+    {
+      "epoch": 9.956076134699854,
+      "grad_norm": 1.5544390678405762,
+      "learning_rate": 5e-05,
+      "loss": 1.231,
+      "step": 3400
+    },
+    {
+      "epoch": 10.0,
+      "eval_accuracy": 0.4450769230769231,
+      "eval_loss": 4.066500186920166,
+      "eval_runtime": 5.7535,
+      "eval_samples_per_second": 86.904,
+      "eval_steps_per_second": 10.95,
+      "step": 3415
+    },
+    {
+      "epoch": 10.248901903367496,
+      "grad_norm": 1.678530216217041,
+      "learning_rate": 5e-05,
+      "loss": 1.1034,
+      "step": 3500
+    },
+    {
+      "epoch": 10.541727672035138,
+      "grad_norm": 1.7260808944702148,
+      "learning_rate": 5e-05,
+      "loss": 1.0997,
+      "step": 3600
+    },
+    {
+      "epoch": 10.834553440702782,
+      "grad_norm": 1.8344779014587402,
+      "learning_rate": 5e-05,
+      "loss": 1.1298,
+      "step": 3700
+    },
+    {
+      "epoch": 10.998535871156662,
+      "eval_accuracy": 0.44,
+      "eval_loss": 4.177348613739014,
+      "eval_runtime": 5.7468,
+      "eval_samples_per_second": 87.005,
+      "eval_steps_per_second": 10.963,
+      "step": 3756
+    },
+    {
+      "epoch": 11.127379209370424,
+      "grad_norm": 1.9434887170791626,
+      "learning_rate": 5e-05,
+      "loss": 1.0734,
+      "step": 3800
+    },
+    {
+      "epoch": 11.420204978038067,
+      "grad_norm": 1.9285024404525757,
+      "learning_rate": 5e-05,
+      "loss": 1.0046,
+      "step": 3900
+    },
+    {
+      "epoch": 11.71303074670571,
+      "grad_norm": 1.9000359773635864,
+      "learning_rate": 5e-05,
+      "loss": 1.0276,
+      "step": 4000
+    },
+    {
+      "epoch": 12.0,
+      "eval_accuracy": 0.43784615384615383,
+      "eval_loss": 4.287516117095947,
+      "eval_runtime": 5.6257,
+      "eval_samples_per_second": 88.878,
+      "eval_steps_per_second": 11.199,
+      "step": 4098
+    },
+    {
+      "epoch": 12.005856515373353,
+      "grad_norm": 1.707350492477417,
+      "learning_rate": 5e-05,
+      "loss": 1.0392,
+      "step": 4100
+    },
+    {
+      "epoch": 12.298682284040996,
+      "grad_norm": 1.9523829221725464,
+      "learning_rate": 5e-05,
+      "loss": 0.9029,
+      "step": 4200
+    },
+    {
+      "epoch": 12.591508052708638,
+      "grad_norm": 1.9846230745315552,
+      "learning_rate": 5e-05,
+      "loss": 0.9324,
+      "step": 4300
+    },
+    {
+      "epoch": 12.88433382137628,
+      "grad_norm": 2.4833240509033203,
+      "learning_rate": 5e-05,
+      "loss": 0.9525,
+      "step": 4400
+    },
+    {
+      "epoch": 12.998535871156662,
+      "eval_accuracy": 0.4351794871794872,
+      "eval_loss": 4.42730188369751,
+      "eval_runtime": 5.835,
+      "eval_samples_per_second": 85.69,
+      "eval_steps_per_second": 10.797,
+      "step": 4439
+    },
+    {
+      "epoch": 13.177159590043924,
+      "grad_norm": 1.8055790662765503,
+      "learning_rate": 5e-05,
+      "loss": 0.8664,
+      "step": 4500
+    },
+    {
+      "epoch": 13.469985358711567,
+      "grad_norm": 2.5267982482910156,
+      "learning_rate": 5e-05,
+      "loss": 0.8457,
+      "step": 4600
+    },
+    {
+      "epoch": 13.762811127379209,
+      "grad_norm": 2.2086236476898193,
+      "learning_rate": 5e-05,
+      "loss": 0.8616,
+      "step": 4700
+    },
+    {
+      "epoch": 14.0,
+      "eval_accuracy": 0.43241025641025643,
+      "eval_loss": 4.44843053817749,
+      "eval_runtime": 5.7935,
+      "eval_samples_per_second": 86.303,
+      "eval_steps_per_second": 10.874,
+      "step": 4781
+    },
+    {
+      "epoch": 14.055636896046853,
+      "grad_norm": 2.44354510307312,
+      "learning_rate": 5e-05,
+      "loss": 0.8402,
+      "step": 4800
+    },
+    {
+      "epoch": 14.348462664714495,
+      "grad_norm": 2.1125502586364746,
+      "learning_rate": 5e-05,
+      "loss": 0.7608,
+      "step": 4900
+    },
+    {
+      "epoch": 14.641288433382137,
+      "grad_norm": 2.1573476791381836,
+      "learning_rate": 5e-05,
+      "loss": 0.7736,
+      "step": 5000
+    },
+    {
+      "epoch": 14.93411420204978,
+      "grad_norm": 2.4622254371643066,
+      "learning_rate": 5e-05,
+      "loss": 0.7799,
+      "step": 5100
+    },
+    {
+      "epoch": 14.998535871156662,
+      "eval_accuracy": 0.43133333333333335,
+      "eval_loss": 4.622844219207764,
+      "eval_runtime": 5.87,
+      "eval_samples_per_second": 85.178,
+      "eval_steps_per_second": 10.732,
+      "step": 5122
+    },
+    {
+      "epoch": 15.226939970717423,
+      "grad_norm": 2.4932661056518555,
+      "learning_rate": 5e-05,
+      "loss": 0.7057,
+      "step": 5200
+    },
+    {
+      "epoch": 15.519765739385067,
+      "grad_norm": 2.3488550186157227,
+      "learning_rate": 5e-05,
+      "loss": 0.686,
+      "step": 5300
+    },
+    {
+      "epoch": 15.812591508052709,
+      "grad_norm": 2.2296109199523926,
+      "learning_rate": 5e-05,
+      "loss": 0.7084,
+      "step": 5400
+    },
+    {
+      "epoch": 16.0,
+      "eval_accuracy": 0.4303076923076923,
+      "eval_loss": 4.723867416381836,
+      "eval_runtime": 5.6093,
+      "eval_samples_per_second": 89.137,
+      "eval_steps_per_second": 11.231,
+      "step": 5464
+    },
+    {
+      "epoch": 16.105417276720353,
+      "grad_norm": 2.141963243484497,
+      "learning_rate": 5e-05,
+      "loss": 0.6755,
+      "step": 5500
+    },
+    {
+      "epoch": 16.398243045387993,
+      "grad_norm": 2.679950475692749,
+      "learning_rate": 5e-05,
+      "loss": 0.6229,
+      "step": 5600
+    },
+    {
+      "epoch": 16.691068814055637,
+      "grad_norm": 2.506779193878174,
+      "learning_rate": 5e-05,
+      "loss": 0.6377,
+      "step": 5700
+    },
+    {
+      "epoch": 16.98389458272328,
+      "grad_norm": 3.019353151321411,
+      "learning_rate": 5e-05,
+      "loss": 0.6478,
+      "step": 5800
+    },
+    {
+      "epoch": 16.998535871156662,
+      "eval_accuracy": 0.430974358974359,
+      "eval_loss": 4.816666126251221,
+      "eval_runtime": 5.9538,
+      "eval_samples_per_second": 83.98,
+      "eval_steps_per_second": 10.581,
+      "step": 5805
+    },
+    {
+      "epoch": 17.27672035139092,
+      "grad_norm": 2.6447300910949707,
+      "learning_rate": 5e-05,
+      "loss": 0.5511,
+      "step": 5900
+    },
+    {
+      "epoch": 17.569546120058565,
+      "grad_norm": 2.720766067504883,
+      "learning_rate": 5e-05,
+      "loss": 0.5638,
+      "step": 6000
+    },
+    {
+      "epoch": 17.86237188872621,
+      "grad_norm": 2.4070911407470703,
+      "learning_rate": 5e-05,
+      "loss": 0.5862,
+      "step": 6100
+    },
+    {
+      "epoch": 18.0,
+      "eval_accuracy": 0.43025641025641026,
+      "eval_loss": 4.85101842880249,
+      "eval_runtime": 5.6529,
+      "eval_samples_per_second": 88.45,
+      "eval_steps_per_second": 11.145,
+      "step": 6147
+    },
+    {
+      "epoch": 18.15519765739385,
+      "grad_norm": 3.0029456615448,
+      "learning_rate": 5e-05,
+      "loss": 0.5335,
+      "step": 6200
+    },
+    {
+      "epoch": 18.448023426061493,
+      "grad_norm": 2.6631345748901367,
+      "learning_rate": 5e-05,
+      "loss": 0.5018,
+      "step": 6300
+    },
+    {
+      "epoch": 18.740849194729137,
+      "grad_norm": 2.603055477142334,
+      "learning_rate": 5e-05,
+      "loss": 0.5189,
+      "step": 6400
+    },
+    {
+      "epoch": 18.998535871156662,
+      "eval_accuracy": 0.42425641025641025,
+      "eval_loss": 4.926539897918701,
+      "eval_runtime": 5.8773,
+      "eval_samples_per_second": 85.073,
+      "eval_steps_per_second": 10.719,
+      "step": 6488
+    },
+    {
+      "epoch": 19.03367496339678,
+      "grad_norm": 2.3171300888061523,
+      "learning_rate": 5e-05,
+      "loss": 0.5225,
+      "step": 6500
+    },
+    {
+      "epoch": 19.32650073206442,
+      "grad_norm": 3.008113384246826,
+      "learning_rate": 5e-05,
+      "loss": 0.4485,
+      "step": 6600
+    },
+    {
+      "epoch": 19.619326500732065,
+      "grad_norm": 2.5204029083251953,
+      "learning_rate": 5e-05,
+      "loss": 0.4623,
+      "step": 6700
+    },
+    {
+      "epoch": 19.91215226939971,
+      "grad_norm": 2.9562718868255615,
+      "learning_rate": 5e-05,
+      "loss": 0.4767,
+      "step": 6800
+    },
+    {
+      "epoch": 19.970717423133237,
+      "eval_accuracy": 0.42728205128205127,
+      "eval_loss": 5.035463809967041,
+      "eval_runtime": 5.7429,
+      "eval_samples_per_second": 87.064,
+      "eval_steps_per_second": 10.97,
+      "step": 6820
+    },
+    {
+      "epoch": 19.970717423133237,
+      "step": 6820,
+      "total_flos": 5.856471132500132e+17,
+      "train_loss": 1.188865839770812,
+      "train_runtime": 15402.4164,
+      "train_samples_per_second": 14.186,
+      "train_steps_per_second": 0.443
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 6820,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 20,
+  "save_steps": 500,
+  "total_flos": 5.856471132500132e+17,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
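
`log_history` records a training loss every 100 steps and one evaluation per epoch: training loss falls from 2.02 to 0.48 while eval loss climbs from 2.99 to 5.04 over the 20 epochs. A small sketch for extracting both curves, assuming a local copy of trainer_state.json:

```python
import json

# Assumed local path to the file added in this commit.
with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train_curve = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_curve = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(train_curve[0], train_curve[-1])  # (100, 2.0212) ... (6800, 0.4767)
print(eval_curve[0], eval_curve[-1])    # (341, 2.99...) ... (6820, 5.035...)
```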