habibi26 commited on
Commit
10cb604
1 Parent(s): a854a04

Training in progress, epoch 0

Browse files
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 19.2,
3
- "total_flos": 6.063680104641331e+17,
4
- "train_loss": 0.059417286076692714,
5
- "train_runtime": 389.1501,
6
- "train_samples_per_second": 20.558,
7
- "train_steps_per_second": 0.308
8
  }
 
1
  {
2
+ "epoch": 18.46153846153846,
3
+ "total_flos": 6.035309694497341e+17,
4
+ "train_loss": 0.09444785690047866,
5
+ "train_runtime": 374.1698,
6
+ "train_samples_per_second": 22.129,
7
+ "train_steps_per_second": 0.321
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2838fadb867240e176028870f536aa974297d04766f71557ecd1fe9ab88467e2
3
  size 349857196
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:285b88f17873b74a93a630cf43553eacf28648e1c373dbccbea976e9a04636a6
3
  size 349857196
runs/Jul03_12-38-35_e0133a370a2e/events.out.tfevents.1720010332.e0133a370a2e.34.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82ba278461f45a4f201bb17c5aff573cb89f27ed28dba6f5b725949e8705223f
3
+ size 5170
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 19.2,
3
- "total_flos": 6.063680104641331e+17,
4
- "train_loss": 0.059417286076692714,
5
- "train_runtime": 389.1501,
6
- "train_samples_per_second": 20.558,
7
- "train_steps_per_second": 0.308
8
  }
 
1
  {
2
+ "epoch": 18.46153846153846,
3
+ "total_flos": 6.035309694497341e+17,
4
+ "train_loss": 0.09444785690047866,
5
+ "train_runtime": 374.1698,
6
+ "train_samples_per_second": 22.129,
7
+ "train_steps_per_second": 0.321
8
  }
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": 1.0,
3
- "best_model_checkpoint": "ktp-not-ktp-clip/checkpoint-37",
4
- "epoch": 19.2,
5
  "eval_steps": 500,
6
  "global_step": 120,
7
  "is_hyper_param_search": false,
@@ -9,277 +9,268 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.96,
13
- "eval_accuracy": 0.8829787234042553,
14
- "eval_loss": 0.40863433480262756,
15
- "eval_runtime": 5.5907,
16
- "eval_samples_per_second": 33.627,
17
- "eval_steps_per_second": 2.146,
18
  "step": 6
19
  },
20
  {
21
- "epoch": 1.6,
22
- "grad_norm": 9.341768264770508,
23
  "learning_rate": 4.166666666666667e-05,
24
- "loss": 0.4605,
25
  "step": 10
26
  },
27
  {
28
- "epoch": 1.92,
29
- "eval_accuracy": 0.9680851063829787,
30
- "eval_loss": 0.05953093245625496,
31
- "eval_runtime": 4.4357,
32
- "eval_samples_per_second": 42.384,
33
- "eval_steps_per_second": 2.705,
34
- "step": 12
35
  },
36
  {
37
- "epoch": 2.88,
38
- "eval_accuracy": 0.9946808510638298,
39
- "eval_loss": 0.03500603511929512,
40
- "eval_runtime": 4.3384,
41
- "eval_samples_per_second": 43.334,
42
- "eval_steps_per_second": 2.766,
43
- "step": 18
44
  },
45
  {
46
- "epoch": 3.2,
47
- "grad_norm": 30.609560012817383,
48
  "learning_rate": 4.62962962962963e-05,
49
- "loss": 0.1012,
50
  "step": 20
51
  },
52
  {
53
  "epoch": 4.0,
54
- "eval_accuracy": 0.9787234042553191,
55
- "eval_loss": 0.05293251574039459,
56
- "eval_runtime": 4.4455,
57
- "eval_samples_per_second": 42.29,
58
- "eval_steps_per_second": 2.699,
59
- "step": 25
60
  },
61
  {
62
- "epoch": 4.8,
63
- "grad_norm": 15.118265151977539,
64
  "learning_rate": 4.166666666666667e-05,
65
- "loss": 0.0975,
66
  "step": 30
67
  },
68
  {
69
- "epoch": 4.96,
70
- "eval_accuracy": 0.9946808510638298,
71
- "eval_loss": 0.016444995999336243,
72
- "eval_runtime": 4.5421,
73
- "eval_samples_per_second": 41.39,
74
- "eval_steps_per_second": 2.642,
75
- "step": 31
76
  },
77
  {
78
- "epoch": 5.92,
79
- "eval_accuracy": 1.0,
80
- "eval_loss": 0.002412277739495039,
81
- "eval_runtime": 4.4217,
82
- "eval_samples_per_second": 42.518,
83
- "eval_steps_per_second": 2.714,
84
- "step": 37
85
  },
86
  {
87
- "epoch": 6.4,
88
- "grad_norm": 0.07197532057762146,
89
  "learning_rate": 3.7037037037037037e-05,
90
- "loss": 0.0107,
91
  "step": 40
92
  },
93
  {
94
- "epoch": 6.88,
95
- "eval_accuracy": 0.9840425531914894,
96
- "eval_loss": 0.06681745499372482,
97
- "eval_runtime": 4.4511,
98
- "eval_samples_per_second": 42.237,
99
- "eval_steps_per_second": 2.696,
100
- "step": 43
101
  },
102
  {
103
- "epoch": 8.0,
104
- "grad_norm": 0.005246018059551716,
105
  "learning_rate": 3.240740740740741e-05,
106
- "loss": 0.0415,
107
  "step": 50
108
  },
109
  {
110
  "epoch": 8.0,
111
  "eval_accuracy": 1.0,
112
- "eval_loss": 0.002033823635429144,
113
- "eval_runtime": 4.4719,
114
- "eval_samples_per_second": 42.041,
115
- "eval_steps_per_second": 2.683,
116
- "step": 50
117
  },
118
  {
119
- "epoch": 8.96,
120
- "eval_accuracy": 0.9946808510638298,
121
- "eval_loss": 0.021679291501641273,
122
- "eval_runtime": 4.4307,
123
- "eval_samples_per_second": 42.432,
124
- "eval_steps_per_second": 2.708,
125
- "step": 56
126
  },
127
  {
128
- "epoch": 9.6,
129
- "grad_norm": 30.30540657043457,
130
  "learning_rate": 2.777777777777778e-05,
131
- "loss": 0.0016,
132
  "step": 60
133
  },
134
  {
135
- "epoch": 9.92,
136
- "eval_accuracy": 1.0,
137
- "eval_loss": 0.005527271423488855,
138
- "eval_runtime": 4.4668,
139
- "eval_samples_per_second": 42.089,
140
- "eval_steps_per_second": 2.687,
141
- "step": 62
142
- },
143
- {
144
- "epoch": 10.88,
145
- "eval_accuracy": 0.9893617021276596,
146
- "eval_loss": 0.040382999926805496,
147
- "eval_runtime": 4.5274,
148
- "eval_samples_per_second": 41.525,
149
- "eval_steps_per_second": 2.651,
150
- "step": 68
151
  },
152
  {
153
- "epoch": 11.2,
154
- "grad_norm": 0.0002783833770081401,
155
  "learning_rate": 2.314814814814815e-05,
156
- "loss": 0.0,
157
  "step": 70
158
  },
 
 
 
 
 
 
 
 
 
159
  {
160
  "epoch": 12.0,
161
- "eval_accuracy": 1.0,
162
- "eval_loss": 0.003910040948539972,
163
- "eval_runtime": 4.4428,
164
- "eval_samples_per_second": 42.315,
165
- "eval_steps_per_second": 2.701,
166
- "step": 75
167
  },
168
  {
169
- "epoch": 12.8,
170
- "grad_norm": 0.0002681456971913576,
171
  "learning_rate": 1.8518518518518518e-05,
172
- "loss": 0.0,
173
  "step": 80
174
  },
175
  {
176
- "epoch": 12.96,
177
- "eval_accuracy": 0.9946808510638298,
178
- "eval_loss": 0.006880749948322773,
179
- "eval_runtime": 4.5249,
180
- "eval_samples_per_second": 41.548,
181
- "eval_steps_per_second": 2.652,
182
- "step": 81
183
- },
184
- {
185
- "epoch": 13.92,
186
- "eval_accuracy": 0.9946808510638298,
187
- "eval_loss": 0.01297041680663824,
188
- "eval_runtime": 4.4227,
189
- "eval_samples_per_second": 42.508,
190
- "eval_steps_per_second": 2.713,
191
- "step": 87
192
  },
193
  {
194
- "epoch": 14.4,
195
- "grad_norm": 0.00011915920185856521,
196
  "learning_rate": 1.388888888888889e-05,
197
- "loss": 0.0,
198
  "step": 90
199
  },
200
  {
201
- "epoch": 14.88,
202
- "eval_accuracy": 0.9946808510638298,
203
- "eval_loss": 0.015453271567821503,
204
- "eval_runtime": 4.4271,
205
- "eval_samples_per_second": 42.465,
206
- "eval_steps_per_second": 2.711,
207
- "step": 93
208
  },
209
  {
210
- "epoch": 16.0,
211
- "grad_norm": 0.00199418468400836,
212
- "learning_rate": 9.259259259259259e-06,
213
- "loss": 0.0,
214
- "step": 100
 
 
215
  },
216
  {
217
- "epoch": 16.0,
218
- "eval_accuracy": 0.9946808510638298,
219
- "eval_loss": 0.015377724543213844,
220
- "eval_runtime": 4.4729,
221
- "eval_samples_per_second": 42.03,
222
- "eval_steps_per_second": 2.683,
223
  "step": 100
224
  },
225
  {
226
- "epoch": 16.96,
227
- "eval_accuracy": 0.9946808510638298,
228
- "eval_loss": 0.01379456277936697,
229
- "eval_runtime": 4.3656,
230
- "eval_samples_per_second": 43.064,
231
- "eval_steps_per_second": 2.749,
232
- "step": 106
233
  },
234
  {
235
- "epoch": 17.6,
236
- "grad_norm": 3.829578054137528e-05,
237
  "learning_rate": 4.6296296296296296e-06,
238
  "loss": 0.0,
239
  "step": 110
240
  },
241
  {
242
- "epoch": 17.92,
243
- "eval_accuracy": 0.9946808510638298,
244
- "eval_loss": 0.013120871968567371,
245
- "eval_runtime": 4.534,
246
- "eval_samples_per_second": 41.464,
247
- "eval_steps_per_second": 2.647,
248
- "step": 112
249
  },
250
  {
251
- "epoch": 18.88,
252
- "eval_accuracy": 0.9946808510638298,
253
- "eval_loss": 0.012892493978142738,
254
- "eval_runtime": 4.5207,
255
- "eval_samples_per_second": 41.586,
256
- "eval_steps_per_second": 2.654,
257
- "step": 118
258
  },
259
  {
260
- "epoch": 19.2,
261
- "grad_norm": 0.00024270155699923635,
262
  "learning_rate": 0.0,
263
  "loss": 0.0,
264
  "step": 120
265
  },
266
  {
267
- "epoch": 19.2,
268
- "eval_accuracy": 0.9946808510638298,
269
- "eval_loss": 0.012876511551439762,
270
- "eval_runtime": 4.5819,
271
- "eval_samples_per_second": 41.031,
272
- "eval_steps_per_second": 2.619,
273
  "step": 120
274
  },
275
  {
276
- "epoch": 19.2,
277
  "step": 120,
278
- "total_flos": 6.063680104641331e+17,
279
- "train_loss": 0.059417286076692714,
280
- "train_runtime": 389.1501,
281
- "train_samples_per_second": 20.558,
282
- "train_steps_per_second": 0.308
283
  }
284
  ],
285
  "logging_steps": 10,
@@ -299,7 +290,7 @@
299
  "attributes": {}
300
  }
301
  },
302
- "total_flos": 6.063680104641331e+17,
303
  "train_batch_size": 16,
304
  "trial_name": null,
305
  "trial_params": null
 
1
  {
2
  "best_metric": 1.0,
3
+ "best_model_checkpoint": "ktp-not-ktp-clip/checkpoint-52",
4
+ "epoch": 18.46153846153846,
5
  "eval_steps": 500,
6
  "global_step": 120,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.9230769230769231,
13
+ "eval_accuracy": 0.897196261682243,
14
+ "eval_loss": 0.4882933795452118,
15
+ "eval_runtime": 6.4817,
16
+ "eval_samples_per_second": 33.016,
17
+ "eval_steps_per_second": 2.16,
18
  "step": 6
19
  },
20
  {
21
+ "epoch": 1.5384615384615383,
22
+ "grad_norm": 8.5100679397583,
23
  "learning_rate": 4.166666666666667e-05,
24
+ "loss": 0.6748,
25
  "step": 10
26
  },
27
  {
28
+ "epoch": 2.0,
29
+ "eval_accuracy": 0.9719626168224299,
30
+ "eval_loss": 0.07545875012874603,
31
+ "eval_runtime": 4.9236,
32
+ "eval_samples_per_second": 43.464,
33
+ "eval_steps_per_second": 2.843,
34
+ "step": 13
35
  },
36
  {
37
+ "epoch": 2.9230769230769234,
38
+ "eval_accuracy": 0.9345794392523364,
39
+ "eval_loss": 0.13731282949447632,
40
+ "eval_runtime": 4.9997,
41
+ "eval_samples_per_second": 42.803,
42
+ "eval_steps_per_second": 2.8,
43
+ "step": 19
44
  },
45
  {
46
+ "epoch": 3.076923076923077,
47
+ "grad_norm": 55.80315399169922,
48
  "learning_rate": 4.62962962962963e-05,
49
+ "loss": 0.1779,
50
  "step": 20
51
  },
52
  {
53
  "epoch": 4.0,
54
+ "eval_accuracy": 0.9719626168224299,
55
+ "eval_loss": 0.06819602847099304,
56
+ "eval_runtime": 5.1079,
57
+ "eval_samples_per_second": 41.896,
58
+ "eval_steps_per_second": 2.741,
59
+ "step": 26
60
  },
61
  {
62
+ "epoch": 4.615384615384615,
63
+ "grad_norm": 5.013700008392334,
64
  "learning_rate": 4.166666666666667e-05,
65
+ "loss": 0.1511,
66
  "step": 30
67
  },
68
  {
69
+ "epoch": 4.923076923076923,
70
+ "eval_accuracy": 0.9766355140186916,
71
+ "eval_loss": 0.03988885134458542,
72
+ "eval_runtime": 5.0626,
73
+ "eval_samples_per_second": 42.271,
74
+ "eval_steps_per_second": 2.765,
75
+ "step": 32
76
  },
77
  {
78
+ "epoch": 6.0,
79
+ "eval_accuracy": 0.9953271028037384,
80
+ "eval_loss": 0.011155444197356701,
81
+ "eval_runtime": 5.129,
82
+ "eval_samples_per_second": 41.724,
83
+ "eval_steps_per_second": 2.73,
84
+ "step": 39
85
  },
86
  {
87
+ "epoch": 6.153846153846154,
88
+ "grad_norm": 22.189523696899414,
89
  "learning_rate": 3.7037037037037037e-05,
90
+ "loss": 0.0248,
91
  "step": 40
92
  },
93
  {
94
+ "epoch": 6.923076923076923,
95
+ "eval_accuracy": 0.9766355140186916,
96
+ "eval_loss": 0.05554972589015961,
97
+ "eval_runtime": 5.1142,
98
+ "eval_samples_per_second": 41.844,
99
+ "eval_steps_per_second": 2.737,
100
+ "step": 45
101
  },
102
  {
103
+ "epoch": 7.6923076923076925,
104
+ "grad_norm": 49.896671295166016,
105
  "learning_rate": 3.240740740740741e-05,
106
+ "loss": 0.057,
107
  "step": 50
108
  },
109
  {
110
  "epoch": 8.0,
111
  "eval_accuracy": 1.0,
112
+ "eval_loss": 0.005110082216560841,
113
+ "eval_runtime": 5.1693,
114
+ "eval_samples_per_second": 41.398,
115
+ "eval_steps_per_second": 2.708,
116
+ "step": 52
117
  },
118
  {
119
+ "epoch": 8.923076923076923,
120
+ "eval_accuracy": 0.985981308411215,
121
+ "eval_loss": 0.029344480484724045,
122
+ "eval_runtime": 5.1265,
123
+ "eval_samples_per_second": 41.744,
124
+ "eval_steps_per_second": 2.731,
125
+ "step": 58
126
  },
127
  {
128
+ "epoch": 9.23076923076923,
129
+ "grad_norm": 22.167015075683594,
130
  "learning_rate": 2.777777777777778e-05,
131
+ "loss": 0.0361,
132
  "step": 60
133
  },
134
  {
135
+ "epoch": 10.0,
136
+ "eval_accuracy": 0.985981308411215,
137
+ "eval_loss": 0.027151916176080704,
138
+ "eval_runtime": 5.1474,
139
+ "eval_samples_per_second": 41.574,
140
+ "eval_steps_per_second": 2.72,
141
+ "step": 65
 
 
 
 
 
 
 
 
 
142
  },
143
  {
144
+ "epoch": 10.76923076923077,
145
+ "grad_norm": 0.32469525933265686,
146
  "learning_rate": 2.314814814814815e-05,
147
+ "loss": 0.011,
148
  "step": 70
149
  },
150
+ {
151
+ "epoch": 10.923076923076923,
152
+ "eval_accuracy": 0.9906542056074766,
153
+ "eval_loss": 0.01700090989470482,
154
+ "eval_runtime": 5.0367,
155
+ "eval_samples_per_second": 42.488,
156
+ "eval_steps_per_second": 2.78,
157
+ "step": 71
158
+ },
159
  {
160
  "epoch": 12.0,
161
+ "eval_accuracy": 0.985981308411215,
162
+ "eval_loss": 0.07101369649171829,
163
+ "eval_runtime": 4.9654,
164
+ "eval_samples_per_second": 43.099,
165
+ "eval_steps_per_second": 2.82,
166
+ "step": 78
167
  },
168
  {
169
+ "epoch": 12.307692307692308,
170
+ "grad_norm": 0.06220458447933197,
171
  "learning_rate": 1.8518518518518518e-05,
172
+ "loss": 0.0006,
173
  "step": 80
174
  },
175
  {
176
+ "epoch": 12.923076923076923,
177
+ "eval_accuracy": 0.9813084112149533,
178
+ "eval_loss": 0.07214021682739258,
179
+ "eval_runtime": 4.9054,
180
+ "eval_samples_per_second": 43.625,
181
+ "eval_steps_per_second": 2.854,
182
+ "step": 84
 
 
 
 
 
 
 
 
 
183
  },
184
  {
185
+ "epoch": 13.846153846153847,
186
+ "grad_norm": 0.0023754581343382597,
187
  "learning_rate": 1.388888888888889e-05,
188
+ "loss": 0.0001,
189
  "step": 90
190
  },
191
  {
192
+ "epoch": 14.0,
193
+ "eval_accuracy": 0.9906542056074766,
194
+ "eval_loss": 0.02770264819264412,
195
+ "eval_runtime": 4.9176,
196
+ "eval_samples_per_second": 43.517,
197
+ "eval_steps_per_second": 2.847,
198
+ "step": 91
199
  },
200
  {
201
+ "epoch": 14.923076923076923,
202
+ "eval_accuracy": 0.9953271028037384,
203
+ "eval_loss": 0.022401457652449608,
204
+ "eval_runtime": 4.8851,
205
+ "eval_samples_per_second": 43.806,
206
+ "eval_steps_per_second": 2.866,
207
+ "step": 97
208
  },
209
  {
210
+ "epoch": 15.384615384615385,
211
+ "grad_norm": 0.0014737015590071678,
212
+ "learning_rate": 9.259259259259259e-06,
213
+ "loss": 0.0001,
 
 
214
  "step": 100
215
  },
216
  {
217
+ "epoch": 16.0,
218
+ "eval_accuracy": 0.9953271028037384,
219
+ "eval_loss": 0.02016393281519413,
220
+ "eval_runtime": 5.2559,
221
+ "eval_samples_per_second": 40.716,
222
+ "eval_steps_per_second": 2.664,
223
+ "step": 104
224
  },
225
  {
226
+ "epoch": 16.923076923076923,
227
+ "grad_norm": 0.0013604339910671115,
228
  "learning_rate": 4.6296296296296296e-06,
229
  "loss": 0.0,
230
  "step": 110
231
  },
232
  {
233
+ "epoch": 16.923076923076923,
234
+ "eval_accuracy": 0.9953271028037384,
235
+ "eval_loss": 0.023425478488206863,
236
+ "eval_runtime": 5.0917,
237
+ "eval_samples_per_second": 42.029,
238
+ "eval_steps_per_second": 2.75,
239
+ "step": 110
240
  },
241
  {
242
+ "epoch": 18.0,
243
+ "eval_accuracy": 0.9953271028037384,
244
+ "eval_loss": 0.02447943389415741,
245
+ "eval_runtime": 4.9857,
246
+ "eval_samples_per_second": 42.923,
247
+ "eval_steps_per_second": 2.808,
248
+ "step": 117
249
  },
250
  {
251
+ "epoch": 18.46153846153846,
252
+ "grad_norm": 0.009677406400442123,
253
  "learning_rate": 0.0,
254
  "loss": 0.0,
255
  "step": 120
256
  },
257
  {
258
+ "epoch": 18.46153846153846,
259
+ "eval_accuracy": 0.9953271028037384,
260
+ "eval_loss": 0.024529017508029938,
261
+ "eval_runtime": 5.1559,
262
+ "eval_samples_per_second": 41.506,
263
+ "eval_steps_per_second": 2.715,
264
  "step": 120
265
  },
266
  {
267
+ "epoch": 18.46153846153846,
268
  "step": 120,
269
+ "total_flos": 6.035309694497341e+17,
270
+ "train_loss": 0.09444785690047866,
271
+ "train_runtime": 374.1698,
272
+ "train_samples_per_second": 22.129,
273
+ "train_steps_per_second": 0.321
274
  }
275
  ],
276
  "logging_steps": 10,
 
290
  "attributes": {}
291
  }
292
  },
293
+ "total_flos": 6.035309694497341e+17,
294
  "train_batch_size": 16,
295
  "trial_name": null,
296
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:234698a21a3ee38e3970458d513d29bf92da7ee921a19acdb43031f7f8dec258
3
  size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06137faeaa484e835b530ab7917f824549233122bd7a33c6728f43780ec71c5c
3
  size 5112