bansilp committed on
Commit
7ffb264
1 Parent(s): 840617e

Model save

Browse files
README.md CHANGED
@@ -40,7 +40,7 @@ The following hyperparameters were used during training:
40
  - seed: 42
41
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
42
  - lr_scheduler_type: linear
43
- - num_epochs: 4
44
  - mixed_precision_training: Native AMP
45
 
46
  ### Training results
 
40
  - seed: 42
41
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
42
  - lr_scheduler_type: linear
43
+ - num_epochs: 10
44
  - mixed_precision_training: Native AMP
45
 
46
  ### Training results
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 2.0,
3
- "eval_accuracy": 0.8812949640287769,
4
- "eval_loss": 0.4103582799434662,
5
- "eval_runtime": 15.1855,
6
- "eval_samples_per_second": 73.228,
7
- "eval_steps_per_second": 9.153,
8
- "total_flos": 6.887981879958897e+17,
9
- "train_loss": 0.8073481788738168,
10
- "train_runtime": 211.1785,
11
- "train_samples_per_second": 42.088,
12
- "train_steps_per_second": 2.633
13
  }
 
1
  {
2
+ "epoch": 4.0,
3
+ "eval_accuracy": 0.8712962962962963,
4
+ "eval_loss": 0.45238634943962097,
5
+ "eval_runtime": 15.5817,
6
+ "eval_samples_per_second": 69.312,
7
+ "eval_steps_per_second": 8.664,
8
+ "total_flos": 1.339145591637934e+18,
9
+ "train_loss": 0.6042599819324634,
10
+ "train_runtime": 412.4773,
11
+ "train_samples_per_second": 41.893,
12
+ "train_steps_per_second": 2.618
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.0,
3
- "eval_accuracy": 0.8812949640287769,
4
- "eval_loss": 0.4103582799434662,
5
- "eval_runtime": 15.1855,
6
- "eval_samples_per_second": 73.228,
7
- "eval_steps_per_second": 9.153
8
  }
 
1
  {
2
+ "epoch": 4.0,
3
+ "eval_accuracy": 0.8712962962962963,
4
+ "eval_loss": 0.45238634943962097,
5
+ "eval_runtime": 15.5817,
6
+ "eval_samples_per_second": 69.312,
7
+ "eval_steps_per_second": 8.664
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5cd19c50b656e731cb9fd57b204546bedc536a7b2f2cddd11c22d2ee4078094b
3
  size 343245508
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19e9e589da45f267b74543fb3cf71a3e64ae7573f9ffbf643d2128ab848f036b
3
  size 343245508
runs/Dec13_17-25-51_f86c45587747/events.out.tfevents.1702489100.f86c45587747.2060.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ce540e1558284f9793fdfb4eff509c6495bb11a69321047444e8790d82a4996
3
+ size 411
runs/Dec13_17-40-47_f86c45587747/events.out.tfevents.1702489257.f86c45587747.2060.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95f23499f1e81cdfb1efd1a6cc7bd92a67c13ff9980cacd5bbdd4bcc1a14fa17
3
+ size 47230
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.0,
3
- "total_flos": 6.887981879958897e+17,
4
- "train_loss": 0.8073481788738168,
5
- "train_runtime": 211.1785,
6
- "train_samples_per_second": 42.088,
7
- "train_steps_per_second": 2.633
8
  }
 
1
  {
2
+ "epoch": 4.0,
3
+ "total_flos": 1.339145591637934e+18,
4
+ "train_loss": 0.6042599819324634,
5
+ "train_runtime": 412.4773,
6
+ "train_samples_per_second": 41.893,
7
+ "train_steps_per_second": 2.618
8
  }
trainer_state.json CHANGED
@@ -1,358 +1,676 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.0,
5
  "eval_steps": 3000,
6
- "global_step": 556,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.04,
13
- "learning_rate": 0.00019640287769784174,
14
- "loss": 2.1468,
15
  "step": 10
16
  },
17
  {
18
  "epoch": 0.07,
19
- "learning_rate": 0.00019280575539568347,
20
- "loss": 1.9709,
21
  "step": 20
22
  },
23
  {
24
  "epoch": 0.11,
25
- "learning_rate": 0.00018920863309352518,
26
- "loss": 1.8468,
27
  "step": 30
28
  },
29
  {
30
- "epoch": 0.14,
31
- "learning_rate": 0.0001856115107913669,
32
- "loss": 1.6942,
33
  "step": 40
34
  },
35
  {
36
- "epoch": 0.18,
37
- "learning_rate": 0.00018201438848920864,
38
- "loss": 1.4989,
39
  "step": 50
40
  },
41
  {
42
  "epoch": 0.22,
43
- "learning_rate": 0.00017841726618705037,
44
- "loss": 1.3514,
45
  "step": 60
46
  },
47
  {
48
- "epoch": 0.25,
49
- "learning_rate": 0.0001748201438848921,
50
- "loss": 1.3165,
51
  "step": 70
52
  },
53
  {
54
- "epoch": 0.29,
55
- "learning_rate": 0.00017122302158273383,
56
- "loss": 1.2268,
57
  "step": 80
58
  },
59
  {
60
- "epoch": 0.32,
61
- "learning_rate": 0.00016762589928057554,
62
- "loss": 1.3164,
63
  "step": 90
64
  },
65
  {
66
- "epoch": 0.36,
67
- "learning_rate": 0.00016402877697841727,
68
- "loss": 1.3179,
69
  "step": 100
70
  },
71
  {
72
- "epoch": 0.4,
73
- "learning_rate": 0.000160431654676259,
74
- "loss": 1.2648,
75
  "step": 110
76
  },
77
  {
78
- "epoch": 0.43,
79
- "learning_rate": 0.00015683453237410073,
80
- "loss": 1.178,
81
  "step": 120
82
  },
83
  {
84
- "epoch": 0.47,
85
- "learning_rate": 0.00015323741007194246,
86
- "loss": 1.1558,
87
  "step": 130
88
  },
89
  {
90
- "epoch": 0.5,
91
- "learning_rate": 0.0001496402877697842,
92
- "loss": 1.0114,
93
  "step": 140
94
  },
95
  {
96
- "epoch": 0.54,
97
- "learning_rate": 0.0001460431654676259,
98
- "loss": 0.8844,
99
  "step": 150
100
  },
101
  {
102
- "epoch": 0.58,
103
- "learning_rate": 0.00014244604316546763,
104
- "loss": 0.9118,
105
  "step": 160
106
  },
107
  {
108
- "epoch": 0.61,
109
- "learning_rate": 0.00013884892086330936,
110
- "loss": 1.0269,
111
  "step": 170
112
  },
113
  {
114
- "epoch": 0.65,
115
- "learning_rate": 0.0001352517985611511,
116
- "loss": 0.9542,
117
  "step": 180
118
  },
119
  {
120
- "epoch": 0.68,
121
- "learning_rate": 0.00013165467625899283,
122
- "loss": 0.8281,
123
  "step": 190
124
  },
125
  {
126
- "epoch": 0.72,
127
- "learning_rate": 0.00012805755395683453,
128
- "loss": 0.8024,
129
  "step": 200
130
  },
131
  {
132
- "epoch": 0.76,
133
- "learning_rate": 0.00012446043165467626,
134
- "loss": 0.8185,
135
  "step": 210
136
  },
137
  {
138
- "epoch": 0.79,
139
- "learning_rate": 0.00012086330935251799,
140
- "loss": 1.099,
141
  "step": 220
142
  },
143
  {
144
- "epoch": 0.83,
145
- "learning_rate": 0.00011726618705035972,
146
- "loss": 0.8726,
147
  "step": 230
148
  },
149
  {
150
- "epoch": 0.86,
151
- "learning_rate": 0.00011366906474820144,
152
- "loss": 0.7907,
153
  "step": 240
154
  },
155
  {
156
- "epoch": 0.9,
157
- "learning_rate": 0.00011007194244604317,
158
- "loss": 0.9099,
159
  "step": 250
160
  },
161
  {
162
- "epoch": 0.94,
163
- "learning_rate": 0.0001064748201438849,
164
- "loss": 0.7301,
165
  "step": 260
166
  },
167
  {
168
- "epoch": 0.97,
169
- "learning_rate": 0.00010287769784172662,
170
- "loss": 0.772,
171
  "step": 270
172
  },
173
  {
174
- "epoch": 1.01,
175
- "learning_rate": 9.928057553956835e-05,
176
- "loss": 0.7936,
177
  "step": 280
178
  },
179
  {
180
- "epoch": 1.04,
181
- "learning_rate": 9.568345323741009e-05,
182
- "loss": 0.6623,
183
  "step": 290
184
  },
185
  {
186
- "epoch": 1.08,
187
- "learning_rate": 9.20863309352518e-05,
188
- "loss": 0.5091,
189
  "step": 300
190
  },
191
  {
192
- "epoch": 1.12,
193
- "learning_rate": 8.848920863309353e-05,
194
- "loss": 0.4996,
195
  "step": 310
196
  },
197
  {
198
- "epoch": 1.15,
199
- "learning_rate": 8.489208633093527e-05,
200
- "loss": 0.5529,
201
  "step": 320
202
  },
203
  {
204
- "epoch": 1.19,
205
- "learning_rate": 8.129496402877698e-05,
206
- "loss": 0.6094,
207
  "step": 330
208
  },
209
  {
210
- "epoch": 1.22,
211
- "learning_rate": 7.769784172661872e-05,
212
- "loss": 0.4422,
213
  "step": 340
214
  },
215
  {
216
- "epoch": 1.26,
217
- "learning_rate": 7.410071942446043e-05,
218
- "loss": 0.4468,
219
  "step": 350
220
  },
221
  {
222
- "epoch": 1.29,
223
- "learning_rate": 7.050359712230215e-05,
224
- "loss": 0.4673,
225
  "step": 360
226
  },
227
  {
228
- "epoch": 1.33,
229
- "learning_rate": 6.690647482014388e-05,
230
- "loss": 0.4746,
231
  "step": 370
232
  },
233
  {
234
- "epoch": 1.37,
235
- "learning_rate": 6.366906474820145e-05,
236
- "loss": 0.4601,
237
  "step": 380
238
  },
239
  {
240
- "epoch": 1.4,
241
- "learning_rate": 6.007194244604317e-05,
242
- "loss": 0.4793,
243
  "step": 390
244
  },
245
  {
246
- "epoch": 1.44,
247
- "learning_rate": 5.64748201438849e-05,
248
- "loss": 0.5919,
249
  "step": 400
250
  },
251
  {
252
- "epoch": 1.47,
253
- "learning_rate": 5.287769784172663e-05,
254
- "loss": 0.4313,
255
  "step": 410
256
  },
257
  {
258
- "epoch": 1.51,
259
- "learning_rate": 4.9280575539568345e-05,
260
- "loss": 0.4715,
261
  "step": 420
262
  },
263
  {
264
- "epoch": 1.55,
265
- "learning_rate": 4.5683453237410076e-05,
266
- "loss": 0.3263,
267
  "step": 430
268
  },
269
  {
270
- "epoch": 1.58,
271
- "learning_rate": 4.20863309352518e-05,
272
- "loss": 0.4526,
273
  "step": 440
274
  },
275
  {
276
- "epoch": 1.62,
277
- "learning_rate": 3.8489208633093525e-05,
278
- "loss": 0.4379,
279
  "step": 450
280
  },
281
  {
282
- "epoch": 1.65,
283
- "learning_rate": 3.489208633093525e-05,
284
- "loss": 0.5083,
285
  "step": 460
286
  },
287
  {
288
- "epoch": 1.69,
289
- "learning_rate": 3.129496402877698e-05,
290
- "loss": 0.5373,
291
  "step": 470
292
  },
293
  {
294
- "epoch": 1.73,
295
- "learning_rate": 2.7697841726618706e-05,
296
- "loss": 0.3303,
297
  "step": 480
298
  },
299
  {
300
- "epoch": 1.76,
301
- "learning_rate": 2.4100719424460434e-05,
302
- "loss": 0.3214,
303
  "step": 490
304
  },
305
  {
306
- "epoch": 1.8,
307
- "learning_rate": 2.050359712230216e-05,
308
- "loss": 0.4343,
309
  "step": 500
310
  },
311
  {
312
- "epoch": 1.83,
313
- "learning_rate": 1.6906474820143887e-05,
314
- "loss": 0.314,
315
  "step": 510
316
  },
317
  {
318
- "epoch": 1.87,
319
- "learning_rate": 1.3309352517985613e-05,
320
- "loss": 0.3727,
321
  "step": 520
322
  },
323
  {
324
- "epoch": 1.91,
325
- "learning_rate": 9.71223021582734e-06,
326
- "loss": 0.2798,
327
  "step": 530
328
  },
329
  {
330
- "epoch": 1.94,
331
- "learning_rate": 6.115107913669065e-06,
332
- "loss": 0.4031,
333
  "step": 540
334
  },
335
  {
336
- "epoch": 1.98,
337
- "learning_rate": 2.5179856115107916e-06,
338
- "loss": 0.3542,
339
  "step": 550
340
  },
341
  {
342
- "epoch": 2.0,
343
- "step": 556,
344
- "total_flos": 6.887981879958897e+17,
345
- "train_loss": 0.8073481788738168,
346
- "train_runtime": 211.1785,
347
- "train_samples_per_second": 42.088,
348
- "train_steps_per_second": 2.633
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
  }
350
  ],
351
  "logging_steps": 10,
352
- "max_steps": 556,
353
- "num_train_epochs": 2,
354
  "save_steps": 3000,
355
- "total_flos": 6.887981879958897e+17,
356
  "trial_name": null,
357
  "trial_params": null
358
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 4.0,
5
  "eval_steps": 3000,
6
+ "global_step": 1080,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.04,
13
+ "learning_rate": 0.00019814814814814814,
14
+ "loss": 2.1983,
15
  "step": 10
16
  },
17
  {
18
  "epoch": 0.07,
19
+ "learning_rate": 0.0001962962962962963,
20
+ "loss": 2.1208,
21
  "step": 20
22
  },
23
  {
24
  "epoch": 0.11,
25
+ "learning_rate": 0.00019444444444444446,
26
+ "loss": 1.8985,
27
  "step": 30
28
  },
29
  {
30
+ "epoch": 0.15,
31
+ "learning_rate": 0.0001925925925925926,
32
+ "loss": 1.726,
33
  "step": 40
34
  },
35
  {
36
+ "epoch": 0.19,
37
+ "learning_rate": 0.00019074074074074075,
38
+ "loss": 1.6799,
39
  "step": 50
40
  },
41
  {
42
  "epoch": 0.22,
43
+ "learning_rate": 0.00018888888888888888,
44
+ "loss": 1.5807,
45
  "step": 60
46
  },
47
  {
48
+ "epoch": 0.26,
49
+ "learning_rate": 0.00018703703703703704,
50
+ "loss": 1.5165,
51
  "step": 70
52
  },
53
  {
54
+ "epoch": 0.3,
55
+ "learning_rate": 0.0001851851851851852,
56
+ "loss": 1.4893,
57
  "step": 80
58
  },
59
  {
60
+ "epoch": 0.33,
61
+ "learning_rate": 0.00018333333333333334,
62
+ "loss": 1.24,
63
  "step": 90
64
  },
65
  {
66
+ "epoch": 0.37,
67
+ "learning_rate": 0.0001814814814814815,
68
+ "loss": 1.2977,
69
  "step": 100
70
  },
71
  {
72
+ "epoch": 0.41,
73
+ "learning_rate": 0.00017962962962962963,
74
+ "loss": 1.2492,
75
  "step": 110
76
  },
77
  {
78
+ "epoch": 0.44,
79
+ "learning_rate": 0.00017777777777777779,
80
+ "loss": 1.1566,
81
  "step": 120
82
  },
83
  {
84
+ "epoch": 0.48,
85
+ "learning_rate": 0.00017592592592592595,
86
+ "loss": 1.3443,
87
  "step": 130
88
  },
89
  {
90
+ "epoch": 0.52,
91
+ "learning_rate": 0.00017407407407407408,
92
+ "loss": 1.2112,
93
  "step": 140
94
  },
95
  {
96
+ "epoch": 0.56,
97
+ "learning_rate": 0.00017222222222222224,
98
+ "loss": 1.0302,
99
  "step": 150
100
  },
101
  {
102
+ "epoch": 0.59,
103
+ "learning_rate": 0.00017037037037037037,
104
+ "loss": 1.1856,
105
  "step": 160
106
  },
107
  {
108
+ "epoch": 0.63,
109
+ "learning_rate": 0.00016851851851851853,
110
+ "loss": 1.1745,
111
  "step": 170
112
  },
113
  {
114
+ "epoch": 0.67,
115
+ "learning_rate": 0.0001666666666666667,
116
+ "loss": 1.0659,
117
  "step": 180
118
  },
119
  {
120
+ "epoch": 0.7,
121
+ "learning_rate": 0.00016481481481481482,
122
+ "loss": 1.0134,
123
  "step": 190
124
  },
125
  {
126
+ "epoch": 0.74,
127
+ "learning_rate": 0.00016296296296296295,
128
+ "loss": 0.9692,
129
  "step": 200
130
  },
131
  {
132
+ "epoch": 0.78,
133
+ "learning_rate": 0.0001611111111111111,
134
+ "loss": 0.8877,
135
  "step": 210
136
  },
137
  {
138
+ "epoch": 0.81,
139
+ "learning_rate": 0.00015925925925925927,
140
+ "loss": 0.8419,
141
  "step": 220
142
  },
143
  {
144
+ "epoch": 0.85,
145
+ "learning_rate": 0.00015740740740740743,
146
+ "loss": 0.9147,
147
  "step": 230
148
  },
149
  {
150
+ "epoch": 0.89,
151
+ "learning_rate": 0.00015555555555555556,
152
+ "loss": 1.0111,
153
  "step": 240
154
  },
155
  {
156
+ "epoch": 0.93,
157
+ "learning_rate": 0.0001537037037037037,
158
+ "loss": 0.9457,
159
  "step": 250
160
  },
161
  {
162
+ "epoch": 0.96,
163
+ "learning_rate": 0.00015185185185185185,
164
+ "loss": 0.9867,
165
  "step": 260
166
  },
167
  {
168
+ "epoch": 1.0,
169
+ "learning_rate": 0.00015000000000000001,
170
+ "loss": 1.1366,
171
  "step": 270
172
  },
173
  {
174
+ "epoch": 1.04,
175
+ "learning_rate": 0.00014814814814814815,
176
+ "loss": 0.7648,
177
  "step": 280
178
  },
179
  {
180
+ "epoch": 1.07,
181
+ "learning_rate": 0.0001462962962962963,
182
+ "loss": 0.8693,
183
  "step": 290
184
  },
185
  {
186
+ "epoch": 1.11,
187
+ "learning_rate": 0.00014444444444444444,
188
+ "loss": 0.8179,
189
  "step": 300
190
  },
191
  {
192
+ "epoch": 1.15,
193
+ "learning_rate": 0.0001425925925925926,
194
+ "loss": 0.8077,
195
  "step": 310
196
  },
197
  {
198
+ "epoch": 1.19,
199
+ "learning_rate": 0.00014074074074074076,
200
+ "loss": 0.941,
201
  "step": 320
202
  },
203
  {
204
+ "epoch": 1.22,
205
+ "learning_rate": 0.0001388888888888889,
206
+ "loss": 0.8906,
207
  "step": 330
208
  },
209
  {
210
+ "epoch": 1.26,
211
+ "learning_rate": 0.00013703703703703705,
212
+ "loss": 0.8433,
213
  "step": 340
214
  },
215
  {
216
+ "epoch": 1.3,
217
+ "learning_rate": 0.00013518518518518518,
218
+ "loss": 0.7654,
219
  "step": 350
220
  },
221
  {
222
+ "epoch": 1.33,
223
+ "learning_rate": 0.00013333333333333334,
224
+ "loss": 0.7217,
225
  "step": 360
226
  },
227
  {
228
+ "epoch": 1.37,
229
+ "learning_rate": 0.0001314814814814815,
230
+ "loss": 0.8744,
231
  "step": 370
232
  },
233
  {
234
+ "epoch": 1.41,
235
+ "learning_rate": 0.00012962962962962963,
236
+ "loss": 0.6337,
237
  "step": 380
238
  },
239
  {
240
+ "epoch": 1.44,
241
+ "learning_rate": 0.00012777777777777776,
242
+ "loss": 0.6369,
243
  "step": 390
244
  },
245
  {
246
+ "epoch": 1.48,
247
+ "learning_rate": 0.00012592592592592592,
248
+ "loss": 0.5268,
249
  "step": 400
250
  },
251
  {
252
+ "epoch": 1.52,
253
+ "learning_rate": 0.00012407407407407408,
254
+ "loss": 0.8015,
255
  "step": 410
256
  },
257
  {
258
+ "epoch": 1.56,
259
+ "learning_rate": 0.00012222222222222224,
260
+ "loss": 0.5013,
261
  "step": 420
262
  },
263
  {
264
+ "epoch": 1.59,
265
+ "learning_rate": 0.00012037037037037037,
266
+ "loss": 0.6063,
267
  "step": 430
268
  },
269
  {
270
+ "epoch": 1.63,
271
+ "learning_rate": 0.00011851851851851852,
272
+ "loss": 0.7767,
273
  "step": 440
274
  },
275
  {
276
+ "epoch": 1.67,
277
+ "learning_rate": 0.00011666666666666668,
278
+ "loss": 0.5174,
279
  "step": 450
280
  },
281
  {
282
+ "epoch": 1.7,
283
+ "learning_rate": 0.00011481481481481482,
284
+ "loss": 0.6391,
285
  "step": 460
286
  },
287
  {
288
+ "epoch": 1.74,
289
+ "learning_rate": 0.00011296296296296296,
290
+ "loss": 0.4966,
291
  "step": 470
292
  },
293
  {
294
+ "epoch": 1.78,
295
+ "learning_rate": 0.00011111111111111112,
296
+ "loss": 0.5991,
297
  "step": 480
298
  },
299
  {
300
+ "epoch": 1.81,
301
+ "learning_rate": 0.00010925925925925926,
302
+ "loss": 0.5499,
303
  "step": 490
304
  },
305
  {
306
+ "epoch": 1.85,
307
+ "learning_rate": 0.00010740740740740742,
308
+ "loss": 0.5488,
309
  "step": 500
310
  },
311
  {
312
+ "epoch": 1.89,
313
+ "learning_rate": 0.00010555555555555557,
314
+ "loss": 0.5834,
315
  "step": 510
316
  },
317
  {
318
+ "epoch": 1.93,
319
+ "learning_rate": 0.0001037037037037037,
320
+ "loss": 0.6238,
321
  "step": 520
322
  },
323
  {
324
+ "epoch": 1.96,
325
+ "learning_rate": 0.00010185185185185186,
326
+ "loss": 0.6365,
327
  "step": 530
328
  },
329
  {
330
+ "epoch": 2.0,
331
+ "learning_rate": 0.0001,
332
+ "loss": 0.5337,
333
  "step": 540
334
  },
335
  {
336
+ "epoch": 2.04,
337
+ "learning_rate": 9.814814814814815e-05,
338
+ "loss": 0.4326,
339
  "step": 550
340
  },
341
  {
342
+ "epoch": 2.07,
343
+ "learning_rate": 9.62962962962963e-05,
344
+ "loss": 0.4197,
345
+ "step": 560
346
+ },
347
+ {
348
+ "epoch": 2.11,
349
+ "learning_rate": 9.444444444444444e-05,
350
+ "loss": 0.3268,
351
+ "step": 570
352
+ },
353
+ {
354
+ "epoch": 2.15,
355
+ "learning_rate": 9.25925925925926e-05,
356
+ "loss": 0.3066,
357
+ "step": 580
358
+ },
359
+ {
360
+ "epoch": 2.19,
361
+ "learning_rate": 9.074074074074075e-05,
362
+ "loss": 0.4737,
363
+ "step": 590
364
+ },
365
+ {
366
+ "epoch": 2.22,
367
+ "learning_rate": 8.888888888888889e-05,
368
+ "loss": 0.3185,
369
+ "step": 600
370
+ },
371
+ {
372
+ "epoch": 2.26,
373
+ "learning_rate": 8.703703703703704e-05,
374
+ "loss": 0.4233,
375
+ "step": 610
376
+ },
377
+ {
378
+ "epoch": 2.3,
379
+ "learning_rate": 8.518518518518518e-05,
380
+ "loss": 0.3377,
381
+ "step": 620
382
+ },
383
+ {
384
+ "epoch": 2.33,
385
+ "learning_rate": 8.333333333333334e-05,
386
+ "loss": 0.3957,
387
+ "step": 630
388
+ },
389
+ {
390
+ "epoch": 2.37,
391
+ "learning_rate": 8.148148148148148e-05,
392
+ "loss": 0.3915,
393
+ "step": 640
394
+ },
395
+ {
396
+ "epoch": 2.41,
397
+ "learning_rate": 7.962962962962964e-05,
398
+ "loss": 0.3025,
399
+ "step": 650
400
+ },
401
+ {
402
+ "epoch": 2.44,
403
+ "learning_rate": 7.777777777777778e-05,
404
+ "loss": 0.2896,
405
+ "step": 660
406
+ },
407
+ {
408
+ "epoch": 2.48,
409
+ "learning_rate": 7.592592592592593e-05,
410
+ "loss": 0.2558,
411
+ "step": 670
412
+ },
413
+ {
414
+ "epoch": 2.52,
415
+ "learning_rate": 7.407407407407407e-05,
416
+ "loss": 0.3477,
417
+ "step": 680
418
+ },
419
+ {
420
+ "epoch": 2.56,
421
+ "learning_rate": 7.222222222222222e-05,
422
+ "loss": 0.2111,
423
+ "step": 690
424
+ },
425
+ {
426
+ "epoch": 2.59,
427
+ "learning_rate": 7.037037037037038e-05,
428
+ "loss": 0.2885,
429
+ "step": 700
430
+ },
431
+ {
432
+ "epoch": 2.63,
433
+ "learning_rate": 6.851851851851852e-05,
434
+ "loss": 0.2953,
435
+ "step": 710
436
+ },
437
+ {
438
+ "epoch": 2.67,
439
+ "learning_rate": 6.666666666666667e-05,
440
+ "loss": 0.2415,
441
+ "step": 720
442
+ },
443
+ {
444
+ "epoch": 2.7,
445
+ "learning_rate": 6.481481481481482e-05,
446
+ "loss": 0.3242,
447
+ "step": 730
448
+ },
449
+ {
450
+ "epoch": 2.74,
451
+ "learning_rate": 6.296296296296296e-05,
452
+ "loss": 0.2616,
453
+ "step": 740
454
+ },
455
+ {
456
+ "epoch": 2.78,
457
+ "learning_rate": 6.111111111111112e-05,
458
+ "loss": 0.2853,
459
+ "step": 750
460
+ },
461
+ {
462
+ "epoch": 2.81,
463
+ "learning_rate": 5.925925925925926e-05,
464
+ "loss": 0.2828,
465
+ "step": 760
466
+ },
467
+ {
468
+ "epoch": 2.85,
469
+ "learning_rate": 5.740740740740741e-05,
470
+ "loss": 0.2382,
471
+ "step": 770
472
+ },
473
+ {
474
+ "epoch": 2.89,
475
+ "learning_rate": 5.555555555555556e-05,
476
+ "loss": 0.3508,
477
+ "step": 780
478
+ },
479
+ {
480
+ "epoch": 2.93,
481
+ "learning_rate": 5.370370370370371e-05,
482
+ "loss": 0.2794,
483
+ "step": 790
484
+ },
485
+ {
486
+ "epoch": 2.96,
487
+ "learning_rate": 5.185185185185185e-05,
488
+ "loss": 0.3247,
489
+ "step": 800
490
+ },
491
+ {
492
+ "epoch": 3.0,
493
+ "learning_rate": 5e-05,
494
+ "loss": 0.2753,
495
+ "step": 810
496
+ },
497
+ {
498
+ "epoch": 3.04,
499
+ "learning_rate": 4.814814814814815e-05,
500
+ "loss": 0.1453,
501
+ "step": 820
502
+ },
503
+ {
504
+ "epoch": 3.07,
505
+ "learning_rate": 4.62962962962963e-05,
506
+ "loss": 0.1666,
507
+ "step": 830
508
+ },
509
+ {
510
+ "epoch": 3.11,
511
+ "learning_rate": 4.4444444444444447e-05,
512
+ "loss": 0.1369,
513
+ "step": 840
514
+ },
515
+ {
516
+ "epoch": 3.15,
517
+ "learning_rate": 4.259259259259259e-05,
518
+ "loss": 0.1086,
519
+ "step": 850
520
+ },
521
+ {
522
+ "epoch": 3.19,
523
+ "learning_rate": 4.074074074074074e-05,
524
+ "loss": 0.0967,
525
+ "step": 860
526
+ },
527
+ {
528
+ "epoch": 3.22,
529
+ "learning_rate": 3.888888888888889e-05,
530
+ "loss": 0.1327,
531
+ "step": 870
532
+ },
533
+ {
534
+ "epoch": 3.26,
535
+ "learning_rate": 3.7037037037037037e-05,
536
+ "loss": 0.0848,
537
+ "step": 880
538
+ },
539
+ {
540
+ "epoch": 3.3,
541
+ "learning_rate": 3.518518518518519e-05,
542
+ "loss": 0.1173,
543
+ "step": 890
544
+ },
545
+ {
546
+ "epoch": 3.33,
547
+ "learning_rate": 3.3333333333333335e-05,
548
+ "loss": 0.135,
549
+ "step": 900
550
+ },
551
+ {
552
+ "epoch": 3.37,
553
+ "learning_rate": 3.148148148148148e-05,
554
+ "loss": 0.1979,
555
+ "step": 910
556
+ },
557
+ {
558
+ "epoch": 3.41,
559
+ "learning_rate": 2.962962962962963e-05,
560
+ "loss": 0.1181,
561
+ "step": 920
562
+ },
563
+ {
564
+ "epoch": 3.44,
565
+ "learning_rate": 2.777777777777778e-05,
566
+ "loss": 0.0957,
567
+ "step": 930
568
+ },
569
+ {
570
+ "epoch": 3.48,
571
+ "learning_rate": 2.5925925925925925e-05,
572
+ "loss": 0.0927,
573
+ "step": 940
574
+ },
575
+ {
576
+ "epoch": 3.52,
577
+ "learning_rate": 2.4074074074074074e-05,
578
+ "loss": 0.094,
579
+ "step": 950
580
+ },
581
+ {
582
+ "epoch": 3.56,
583
+ "learning_rate": 2.2222222222222223e-05,
584
+ "loss": 0.1197,
585
+ "step": 960
586
+ },
587
+ {
588
+ "epoch": 3.59,
589
+ "learning_rate": 2.037037037037037e-05,
590
+ "loss": 0.0927,
591
+ "step": 970
592
+ },
593
+ {
594
+ "epoch": 3.63,
595
+ "learning_rate": 1.8518518518518518e-05,
596
+ "loss": 0.1523,
597
+ "step": 980
598
+ },
599
+ {
600
+ "epoch": 3.67,
601
+ "learning_rate": 1.6666666666666667e-05,
602
+ "loss": 0.2582,
603
+ "step": 990
604
+ },
605
+ {
606
+ "epoch": 3.7,
607
+ "learning_rate": 1.4814814814814815e-05,
608
+ "loss": 0.1101,
609
+ "step": 1000
610
+ },
611
+ {
612
+ "epoch": 3.74,
613
+ "learning_rate": 1.2962962962962962e-05,
614
+ "loss": 0.1582,
615
+ "step": 1010
616
+ },
617
+ {
618
+ "epoch": 3.78,
619
+ "learning_rate": 1.1111111111111112e-05,
620
+ "loss": 0.0504,
621
+ "step": 1020
622
+ },
623
+ {
624
+ "epoch": 3.81,
625
+ "learning_rate": 9.259259259259259e-06,
626
+ "loss": 0.0788,
627
+ "step": 1030
628
+ },
629
+ {
630
+ "epoch": 3.85,
631
+ "learning_rate": 7.4074074074074075e-06,
632
+ "loss": 0.0607,
633
+ "step": 1040
634
+ },
635
+ {
636
+ "epoch": 3.89,
637
+ "learning_rate": 5.555555555555556e-06,
638
+ "loss": 0.1061,
639
+ "step": 1050
640
+ },
641
+ {
642
+ "epoch": 3.93,
643
+ "learning_rate": 3.7037037037037037e-06,
644
+ "loss": 0.0645,
645
+ "step": 1060
646
+ },
647
+ {
648
+ "epoch": 3.96,
649
+ "learning_rate": 1.8518518518518519e-06,
650
+ "loss": 0.0971,
651
+ "step": 1070
652
+ },
653
+ {
654
+ "epoch": 4.0,
655
+ "learning_rate": 0.0,
656
+ "loss": 0.128,
657
+ "step": 1080
658
+ },
659
+ {
660
+ "epoch": 4.0,
661
+ "step": 1080,
662
+ "total_flos": 1.339145591637934e+18,
663
+ "train_loss": 0.6042599819324634,
664
+ "train_runtime": 412.4773,
665
+ "train_samples_per_second": 41.893,
666
+ "train_steps_per_second": 2.618
667
  }
668
  ],
669
  "logging_steps": 10,
670
+ "max_steps": 1080,
671
+ "num_train_epochs": 4,
672
  "save_steps": 3000,
673
+ "total_flos": 1.339145591637934e+18,
674
  "trial_name": null,
675
  "trial_params": null
676
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:238f025878ae397afe0eea0dd655a189bf936105b7e4ad96f34b4cef925b5a61
3
  size 4536
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce8c248f033e489c055bf6bb31f2e28906b79dad7e44578cd117fc71cea6f0e4
3
  size 4536