augustocsc commited on
Commit
81c82d2
1 Parent(s): 19b4ee6

Training in progress, step 4000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f37df316feb4858f2a440118f10c706a92a144096e75b10321afd1c2e0cc6f94
3
  size 497780352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f0f8979da33f748917d049cc5ff11a0545fa1c5d17f233f6f3c1ec1146a9914
3
  size 497780352
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e0027210dfbc20af719b86b497c1d99b69cc8dd187bd6e90ecd9da520fef05f0
3
  size 995617914
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bbeda76dd05ca37fe45ede5f34534dd782ec62fe3c26ad951df6b0614b972ad
3
  size 995617914
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:75596e794538906fc3dbf131d0568e28de9e681be900571db24920286dbfef63
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ff133e609b5f5bc20eed4a2e7ac89b3c0a8b88a47081e6214c0ce30a0d557e2
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:25ae61e837be3ada9de530496f3f4fe1da31f4699408459fe2a0b05c3e210f0d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef451fce60281d5913091edc9ce429895db044ce88d47b2d438038fa76c50883
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,290 +1,236 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 5.0,
5
  "eval_steps": 200,
6
- "global_step": 5000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.2,
13
- "eval_loss": 0.07517029345035553,
14
- "eval_runtime": 37.992,
15
- "eval_samples_per_second": 421.142,
16
- "eval_steps_per_second": 6.58,
17
  "step": 200
18
  },
19
  {
20
  "epoch": 0.4,
21
- "eval_loss": 0.21885745227336884,
22
- "eval_runtime": 37.9737,
23
- "eval_samples_per_second": 421.344,
24
- "eval_steps_per_second": 6.583,
25
  "step": 400
26
  },
27
  {
28
  "epoch": 0.5,
29
- "grad_norm": 205354.109375,
30
- "learning_rate": 4.877641290737884e-05,
31
- "loss": 0.3926,
32
  "step": 500
33
  },
34
  {
35
  "epoch": 0.6,
36
- "eval_loss": 1.307199478149414,
37
- "eval_runtime": 37.7639,
38
- "eval_samples_per_second": 423.685,
39
- "eval_steps_per_second": 6.62,
40
  "step": 600
41
  },
42
  {
43
  "epoch": 0.8,
44
- "eval_loss": 0.062136366963386536,
45
- "eval_runtime": 37.7282,
46
- "eval_samples_per_second": 424.086,
47
- "eval_steps_per_second": 6.626,
48
  "step": 800
49
  },
50
  {
51
  "epoch": 1.0,
52
- "grad_norm": 0.6423090696334839,
53
- "learning_rate": 4.522542485937369e-05,
54
- "loss": 0.9884,
55
  "step": 1000
56
  },
57
  {
58
  "epoch": 1.0,
59
- "eval_loss": 0.031570661813020706,
60
- "eval_runtime": 37.8801,
61
- "eval_samples_per_second": 422.385,
62
- "eval_steps_per_second": 6.6,
63
  "step": 1000
64
  },
65
  {
66
  "epoch": 1.2,
67
- "eval_loss": 0.02908959612250328,
68
- "eval_runtime": 37.7451,
69
- "eval_samples_per_second": 423.896,
70
- "eval_steps_per_second": 6.623,
71
  "step": 1200
72
  },
73
  {
74
  "epoch": 1.4,
75
- "eval_loss": 0.028545573353767395,
76
- "eval_runtime": 37.8679,
77
- "eval_samples_per_second": 422.522,
78
- "eval_steps_per_second": 6.602,
79
  "step": 1400
80
  },
81
  {
82
  "epoch": 1.5,
83
- "grad_norm": 0.2492751181125641,
84
- "learning_rate": 3.969463130731183e-05,
85
- "loss": 0.0322,
86
  "step": 1500
87
  },
88
  {
89
  "epoch": 1.6,
90
- "eval_loss": 0.027546165511012077,
91
- "eval_runtime": 37.9026,
92
- "eval_samples_per_second": 422.135,
93
- "eval_steps_per_second": 6.596,
94
  "step": 1600
95
  },
96
  {
97
  "epoch": 1.8,
98
- "eval_loss": 0.027202017605304718,
99
- "eval_runtime": 37.8488,
100
- "eval_samples_per_second": 422.735,
101
- "eval_steps_per_second": 6.605,
102
  "step": 1800
103
  },
104
  {
105
  "epoch": 2.0,
106
- "grad_norm": 0.06460036337375641,
107
- "learning_rate": 3.272542485937369e-05,
108
- "loss": 0.0297,
109
  "step": 2000
110
  },
111
  {
112
  "epoch": 2.0,
113
- "eval_loss": 0.026708217337727547,
114
- "eval_runtime": 37.6675,
115
- "eval_samples_per_second": 424.77,
116
- "eval_steps_per_second": 6.637,
117
  "step": 2000
118
  },
119
  {
120
  "epoch": 2.2,
121
- "eval_loss": 0.026396282017230988,
122
- "eval_runtime": 37.8265,
123
- "eval_samples_per_second": 422.984,
124
- "eval_steps_per_second": 6.609,
125
  "step": 2200
126
  },
127
  {
128
  "epoch": 2.4,
129
- "eval_loss": 0.026144716888666153,
130
- "eval_runtime": 37.693,
131
- "eval_samples_per_second": 424.482,
132
- "eval_steps_per_second": 6.633,
133
  "step": 2400
134
  },
135
  {
136
  "epoch": 2.5,
137
- "grad_norm": 0.17030462622642517,
138
- "learning_rate": 2.5e-05,
139
- "loss": 0.028,
140
  "step": 2500
141
  },
142
  {
143
  "epoch": 2.6,
144
- "eval_loss": 0.02573326788842678,
145
- "eval_runtime": 37.6502,
146
- "eval_samples_per_second": 424.964,
147
- "eval_steps_per_second": 6.64,
148
  "step": 2600
149
  },
150
  {
151
  "epoch": 2.8,
152
- "eval_loss": 0.025387177243828773,
153
- "eval_runtime": 37.6666,
154
- "eval_samples_per_second": 424.779,
155
- "eval_steps_per_second": 6.637,
156
  "step": 2800
157
  },
158
  {
159
  "epoch": 3.0,
160
- "grad_norm": 0.13623261451721191,
161
- "learning_rate": 1.7274575140626318e-05,
162
- "loss": 0.0268,
163
  "step": 3000
164
  },
165
  {
166
  "epoch": 3.0,
167
- "eval_loss": 0.02545003592967987,
168
- "eval_runtime": 37.6642,
169
- "eval_samples_per_second": 424.806,
170
- "eval_steps_per_second": 6.638,
171
  "step": 3000
172
  },
173
  {
174
  "epoch": 3.2,
175
- "eval_loss": 0.02509310096502304,
176
- "eval_runtime": 37.675,
177
- "eval_samples_per_second": 424.685,
178
- "eval_steps_per_second": 6.636,
179
  "step": 3200
180
  },
181
  {
182
  "epoch": 3.4,
183
- "eval_loss": 0.025246502831578255,
184
- "eval_runtime": 37.7858,
185
- "eval_samples_per_second": 423.439,
186
- "eval_steps_per_second": 6.616,
187
  "step": 3400
188
  },
189
  {
190
  "epoch": 3.5,
191
- "grad_norm": 0.12589260935783386,
192
- "learning_rate": 1.0305368692688174e-05,
193
- "loss": 0.0263,
194
  "step": 3500
195
  },
196
  {
197
  "epoch": 3.6,
198
- "eval_loss": 0.0253463052213192,
199
- "eval_runtime": 37.879,
200
- "eval_samples_per_second": 422.398,
201
- "eval_steps_per_second": 6.6,
202
  "step": 3600
203
  },
204
  {
205
  "epoch": 3.8,
206
- "eval_loss": 0.024967506527900696,
207
- "eval_runtime": 82.198,
208
- "eval_samples_per_second": 194.652,
209
- "eval_steps_per_second": 3.041,
210
  "step": 3800
211
  },
212
  {
213
  "epoch": 4.0,
214
- "grad_norm": 0.10492005199193954,
215
- "learning_rate": 4.7745751406263165e-06,
216
- "loss": 0.0264,
217
  "step": 4000
218
  },
219
  {
220
  "epoch": 4.0,
221
- "eval_loss": 0.02466404065489769,
222
- "eval_runtime": 82.3457,
223
- "eval_samples_per_second": 194.303,
224
- "eval_steps_per_second": 3.036,
225
  "step": 4000
226
- },
227
- {
228
- "epoch": 4.2,
229
- "eval_loss": 0.02478621155023575,
230
- "eval_runtime": 76.6082,
231
- "eval_samples_per_second": 208.855,
232
- "eval_steps_per_second": 3.263,
233
- "step": 4200
234
- },
235
- {
236
- "epoch": 4.4,
237
- "eval_loss": 0.024716826155781746,
238
- "eval_runtime": 86.8007,
239
- "eval_samples_per_second": 184.33,
240
- "eval_steps_per_second": 2.88,
241
- "step": 4400
242
- },
243
- {
244
- "epoch": 4.5,
245
- "grad_norm": 0.1692284792661667,
246
- "learning_rate": 1.2235870926211619e-06,
247
- "loss": 0.026,
248
- "step": 4500
249
- },
250
- {
251
- "epoch": 4.6,
252
- "eval_loss": 0.024610303342342377,
253
- "eval_runtime": 82.2894,
254
- "eval_samples_per_second": 194.436,
255
- "eval_steps_per_second": 3.038,
256
- "step": 4600
257
- },
258
- {
259
- "epoch": 4.8,
260
- "eval_loss": 0.024647582322359085,
261
- "eval_runtime": 83.9599,
262
- "eval_samples_per_second": 190.567,
263
- "eval_steps_per_second": 2.978,
264
- "step": 4800
265
- },
266
- {
267
- "epoch": 5.0,
268
- "grad_norm": 0.16388757526874542,
269
- "learning_rate": 0.0,
270
- "loss": 0.0258,
271
- "step": 5000
272
- },
273
- {
274
- "epoch": 5.0,
275
- "eval_loss": 0.02461400255560875,
276
- "eval_runtime": 73.1695,
277
- "eval_samples_per_second": 218.67,
278
- "eval_steps_per_second": 3.417,
279
- "step": 5000
280
  }
281
  ],
282
  "logging_steps": 500,
283
- "max_steps": 5000,
284
  "num_input_tokens_seen": 0,
285
- "num_train_epochs": 5,
286
  "save_steps": 1000,
287
- "total_flos": 2.090336256e+16,
288
  "train_batch_size": 64,
289
  "trial_name": null,
290
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 4.0,
5
  "eval_steps": 200,
6
+ "global_step": 4000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.2,
13
+ "eval_loss": 0.03862990066409111,
14
+ "eval_runtime": 37.7121,
15
+ "eval_samples_per_second": 424.267,
16
+ "eval_steps_per_second": 6.629,
17
  "step": 200
18
  },
19
  {
20
  "epoch": 0.4,
21
+ "eval_loss": 0.0523061603307724,
22
+ "eval_runtime": 37.8838,
23
+ "eval_samples_per_second": 422.344,
24
+ "eval_steps_per_second": 6.599,
25
  "step": 400
26
  },
27
  {
28
  "epoch": 0.5,
29
+ "grad_norm": 72862.21875,
30
+ "learning_rate": 4.9692208514878444e-05,
31
+ "loss": 0.1425,
32
  "step": 500
33
  },
34
  {
35
  "epoch": 0.6,
36
+ "eval_loss": 1.0542395114898682,
37
+ "eval_runtime": 37.8634,
38
+ "eval_samples_per_second": 422.571,
39
+ "eval_steps_per_second": 6.603,
40
  "step": 600
41
  },
42
  {
43
  "epoch": 0.8,
44
+ "eval_loss": 1.0459295511245728,
45
+ "eval_runtime": 37.8174,
46
+ "eval_samples_per_second": 423.086,
47
+ "eval_steps_per_second": 6.611,
48
  "step": 800
49
  },
50
  {
51
  "epoch": 1.0,
52
+ "grad_norm": 4.641770839691162,
53
+ "learning_rate": 4.877641290737884e-05,
54
+ "loss": 0.9177,
55
  "step": 1000
56
  },
57
  {
58
  "epoch": 1.0,
59
+ "eval_loss": 0.3677258789539337,
60
+ "eval_runtime": 37.6734,
61
+ "eval_samples_per_second": 424.703,
62
+ "eval_steps_per_second": 6.636,
63
  "step": 1000
64
  },
65
  {
66
  "epoch": 1.2,
67
+ "eval_loss": 0.02963975816965103,
68
+ "eval_runtime": 37.6097,
69
+ "eval_samples_per_second": 425.422,
70
+ "eval_steps_per_second": 6.647,
71
  "step": 1200
72
  },
73
  {
74
  "epoch": 1.4,
75
+ "eval_loss": 0.028412258252501488,
76
+ "eval_runtime": 37.7795,
77
+ "eval_samples_per_second": 423.51,
78
+ "eval_steps_per_second": 6.617,
79
  "step": 1400
80
  },
81
  {
82
  "epoch": 1.5,
83
+ "grad_norm": 0.17703795433044434,
84
+ "learning_rate": 4.72751631047092e-05,
85
+ "loss": 0.0421,
86
  "step": 1500
87
  },
88
  {
89
  "epoch": 1.6,
90
+ "eval_loss": 0.02752041630446911,
91
+ "eval_runtime": 37.7317,
92
+ "eval_samples_per_second": 424.046,
93
+ "eval_steps_per_second": 6.626,
94
  "step": 1600
95
  },
96
  {
97
  "epoch": 1.8,
98
+ "eval_loss": 0.027357231825590134,
99
+ "eval_runtime": 37.9192,
100
+ "eval_samples_per_second": 421.95,
101
+ "eval_steps_per_second": 6.593,
102
  "step": 1800
103
  },
104
  {
105
  "epoch": 2.0,
106
+ "grad_norm": 0.14987020194530487,
107
+ "learning_rate": 4.522542485937369e-05,
108
+ "loss": 0.029,
109
  "step": 2000
110
  },
111
  {
112
  "epoch": 2.0,
113
+ "eval_loss": 0.026410279795527458,
114
+ "eval_runtime": 37.883,
115
+ "eval_samples_per_second": 422.353,
116
+ "eval_steps_per_second": 6.599,
117
  "step": 2000
118
  },
119
  {
120
  "epoch": 2.2,
121
+ "eval_loss": 0.025838496163487434,
122
+ "eval_runtime": 37.6759,
123
+ "eval_samples_per_second": 424.674,
124
+ "eval_steps_per_second": 6.636,
125
  "step": 2200
126
  },
127
  {
128
  "epoch": 2.4,
129
+ "eval_loss": 0.025557253509759903,
130
+ "eval_runtime": 37.6691,
131
+ "eval_samples_per_second": 424.751,
132
+ "eval_steps_per_second": 6.637,
133
  "step": 2400
134
  },
135
  {
136
  "epoch": 2.5,
137
+ "grad_norm": 0.21350397169589996,
138
+ "learning_rate": 4.267766952966369e-05,
139
+ "loss": 0.0276,
140
  "step": 2500
141
  },
142
  {
143
  "epoch": 2.6,
144
+ "eval_loss": 0.025391312316060066,
145
+ "eval_runtime": 37.9185,
146
+ "eval_samples_per_second": 421.958,
147
+ "eval_steps_per_second": 6.593,
148
  "step": 2600
149
  },
150
  {
151
  "epoch": 2.8,
152
+ "eval_loss": 0.025234265252947807,
153
+ "eval_runtime": 37.6569,
154
+ "eval_samples_per_second": 424.889,
155
+ "eval_steps_per_second": 6.639,
156
  "step": 2800
157
  },
158
  {
159
  "epoch": 3.0,
160
+ "grad_norm": 0.09507149457931519,
161
+ "learning_rate": 3.969463130731183e-05,
162
+ "loss": 0.0265,
163
  "step": 3000
164
  },
165
  {
166
  "epoch": 3.0,
167
+ "eval_loss": 0.025119660422205925,
168
+ "eval_runtime": 37.7071,
169
+ "eval_samples_per_second": 424.323,
170
+ "eval_steps_per_second": 6.63,
171
  "step": 3000
172
  },
173
  {
174
  "epoch": 3.2,
175
+ "eval_loss": 0.02474472112953663,
176
+ "eval_runtime": 37.8472,
177
+ "eval_samples_per_second": 422.753,
178
+ "eval_steps_per_second": 6.606,
179
  "step": 3200
180
  },
181
  {
182
  "epoch": 3.4,
183
+ "eval_loss": 0.02474530041217804,
184
+ "eval_runtime": 37.964,
185
+ "eval_samples_per_second": 421.452,
186
+ "eval_steps_per_second": 6.585,
187
  "step": 3400
188
  },
189
  {
190
  "epoch": 3.5,
191
+ "grad_norm": 0.21493718028068542,
192
+ "learning_rate": 3.634976249348867e-05,
193
+ "loss": 0.0256,
194
  "step": 3500
195
  },
196
  {
197
  "epoch": 3.6,
198
+ "eval_loss": 0.024641884490847588,
199
+ "eval_runtime": 37.7028,
200
+ "eval_samples_per_second": 424.372,
201
+ "eval_steps_per_second": 6.631,
202
  "step": 3600
203
  },
204
  {
205
  "epoch": 3.8,
206
+ "eval_loss": 0.02516881749033928,
207
+ "eval_runtime": 37.6697,
208
+ "eval_samples_per_second": 424.745,
209
+ "eval_steps_per_second": 6.637,
210
  "step": 3800
211
  },
212
  {
213
  "epoch": 4.0,
214
+ "grad_norm": 0.11537094414234161,
215
+ "learning_rate": 3.272542485937369e-05,
216
+ "loss": 0.0262,
217
  "step": 4000
218
  },
219
  {
220
  "epoch": 4.0,
221
+ "eval_loss": 0.024907398968935013,
222
+ "eval_runtime": 37.8134,
223
+ "eval_samples_per_second": 423.13,
224
+ "eval_steps_per_second": 6.611,
225
  "step": 4000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  }
227
  ],
228
  "logging_steps": 500,
229
+ "max_steps": 10000,
230
  "num_input_tokens_seen": 0,
231
+ "num_train_epochs": 10,
232
  "save_steps": 1000,
233
+ "total_flos": 1.6722690048e+16,
234
  "train_batch_size": 64,
235
  "trial_name": null,
236
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d1f0fcad3e74386ffcf1917836bd60c8d585b4f88d87c9181c320cc9fefea09b
3
  size 4984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac099e73b43ec4fb0ff55120cb27fe4c7253b82e028a96ded709dbe7962ba7d3
3
  size 4984