chidamnat2002 commited on
Commit
86ac113
·
verified ·
1 Parent(s): 2cf9728

Upload 14 files

Browse files

trained with 10 epochs

adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0959b4b79c70f7f0f6f8ea6096fc88b75ffb66c687e1f532f27516fb6d4e03d1
3
  size 2536568
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83c32cb7d669775f075fb0a87c953cade6c10b0747873eb4ceebef2cb2f2892f
3
  size 2536568
config.json CHANGED
@@ -39,6 +39,6 @@
39
  "sinusoidal_pos_embds": false,
40
  "tie_weights_": true,
41
  "torch_dtype": "float32",
42
- "transformers_version": "4.45.0",
43
  "vocab_size": 30522
44
  }
 
39
  "sinusoidal_pos_embds": false,
40
  "tie_weights_": true,
41
  "torch_dtype": "float32",
42
+ "transformers_version": "4.44.2",
43
  "vocab_size": 30522
44
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:72815fa37d94fe560665dc778ac50a1606e00f3cb03133aafb4b10784be08543
3
- size 267851024
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8e2d7fc59b1fbb2a3ca19bdc93c9e8b7b3237e5f3609c4b59e9ba257e3e4ec0
3
+ size 214037161
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d437e51fed1f120e9b341f85d3bb2a02f70a8e251fc4cf0da979dc3fe761ca4b
3
+ size 5082798
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:468a0fc8a2aa5f71deed2da1be3649a479efb234844a546ab87a69c682ebf27e
3
+ size 14244
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7948e301d82ada6e7683ed3d39239b5c72d60691247f70984c929351115adbea
3
+ size 1064
trainer_state.json ADDED
@@ -0,0 +1,1086 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.2595302164554596,
3
+ "best_model_checkpoint": "distilbert-base-uncased-lora-intent-classification-v2/checkpoint-67716",
4
+ "epoch": 9.0,
5
+ "eval_steps": 500,
6
+ "global_step": 67716,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.06645401382243488,
13
+ "grad_norm": 4.6704421043396,
14
+ "learning_rate": 0.0009933545986177566,
15
+ "loss": 0.6675,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.13290802764486975,
20
+ "grad_norm": 2.3022220134735107,
21
+ "learning_rate": 0.000986709197235513,
22
+ "loss": 0.4718,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.19936204146730463,
27
+ "grad_norm": 0.44215622544288635,
28
+ "learning_rate": 0.0009800637958532696,
29
+ "loss": 0.4146,
30
+ "step": 1500
31
+ },
32
+ {
33
+ "epoch": 0.2658160552897395,
34
+ "grad_norm": 0.08581192046403885,
35
+ "learning_rate": 0.0009734183944710261,
36
+ "loss": 0.4297,
37
+ "step": 2000
38
+ },
39
+ {
40
+ "epoch": 0.3322700691121744,
41
+ "grad_norm": 13.087315559387207,
42
+ "learning_rate": 0.0009667729930887826,
43
+ "loss": 0.3776,
44
+ "step": 2500
45
+ },
46
+ {
47
+ "epoch": 0.39872408293460926,
48
+ "grad_norm": 15.066133499145508,
49
+ "learning_rate": 0.0009601275917065391,
50
+ "loss": 0.4233,
51
+ "step": 3000
52
+ },
53
+ {
54
+ "epoch": 0.46517809675704413,
55
+ "grad_norm": 0.23827387392520905,
56
+ "learning_rate": 0.0009534821903242956,
57
+ "loss": 0.3613,
58
+ "step": 3500
59
+ },
60
+ {
61
+ "epoch": 0.531632110579479,
62
+ "grad_norm": 0.009319925680756569,
63
+ "learning_rate": 0.0009468367889420521,
64
+ "loss": 0.4269,
65
+ "step": 4000
66
+ },
67
+ {
68
+ "epoch": 0.5980861244019139,
69
+ "grad_norm": 0.665321946144104,
70
+ "learning_rate": 0.0009401913875598086,
71
+ "loss": 0.3815,
72
+ "step": 4500
73
+ },
74
+ {
75
+ "epoch": 0.6645401382243488,
76
+ "grad_norm": 3.580693483352661,
77
+ "learning_rate": 0.0009335459861775651,
78
+ "loss": 0.3539,
79
+ "step": 5000
80
+ },
81
+ {
82
+ "epoch": 0.7309941520467836,
83
+ "grad_norm": 0.12289135903120041,
84
+ "learning_rate": 0.0009269005847953217,
85
+ "loss": 0.4112,
86
+ "step": 5500
87
+ },
88
+ {
89
+ "epoch": 0.7974481658692185,
90
+ "grad_norm": 1.3471044301986694,
91
+ "learning_rate": 0.0009202551834130782,
92
+ "loss": 0.4109,
93
+ "step": 6000
94
+ },
95
+ {
96
+ "epoch": 0.8639021796916534,
97
+ "grad_norm": 0.09887880831956863,
98
+ "learning_rate": 0.0009136097820308346,
99
+ "loss": 0.4508,
100
+ "step": 6500
101
+ },
102
+ {
103
+ "epoch": 0.9303561935140883,
104
+ "grad_norm": 0.005311007611453533,
105
+ "learning_rate": 0.0009069643806485912,
106
+ "loss": 0.4011,
107
+ "step": 7000
108
+ },
109
+ {
110
+ "epoch": 0.9968102073365231,
111
+ "grad_norm": 1.1049816608428955,
112
+ "learning_rate": 0.0009003189792663478,
113
+ "loss": 0.368,
114
+ "step": 7500
115
+ },
116
+ {
117
+ "epoch": 1.0,
118
+ "eval_accuracy": 0.9425867507886435,
119
+ "eval_f1": 0.9421244141375861,
120
+ "eval_loss": 0.3986539840698242,
121
+ "eval_precision": 0.9421379340931425,
122
+ "eval_recall": 0.9425867507886435,
123
+ "eval_runtime": 4.728,
124
+ "eval_samples_per_second": 335.238,
125
+ "eval_steps_per_second": 83.968,
126
+ "step": 7524
127
+ },
128
+ {
129
+ "epoch": 1.063264221158958,
130
+ "grad_norm": 70.09782409667969,
131
+ "learning_rate": 0.0008936735778841042,
132
+ "loss": 0.3306,
133
+ "step": 8000
134
+ },
135
+ {
136
+ "epoch": 1.1297182349813928,
137
+ "grad_norm": 0.7961419820785522,
138
+ "learning_rate": 0.0008870281765018608,
139
+ "loss": 0.3746,
140
+ "step": 8500
141
+ },
142
+ {
143
+ "epoch": 1.1961722488038278,
144
+ "grad_norm": 0.060738347470760345,
145
+ "learning_rate": 0.0008803827751196173,
146
+ "loss": 0.4045,
147
+ "step": 9000
148
+ },
149
+ {
150
+ "epoch": 1.2626262626262625,
151
+ "grad_norm": 0.20715029537677765,
152
+ "learning_rate": 0.0008737373737373737,
153
+ "loss": 0.4587,
154
+ "step": 9500
155
+ },
156
+ {
157
+ "epoch": 1.3290802764486975,
158
+ "grad_norm": 0.08913299441337585,
159
+ "learning_rate": 0.0008670919723551303,
160
+ "loss": 0.4504,
161
+ "step": 10000
162
+ },
163
+ {
164
+ "epoch": 1.3955342902711323,
165
+ "grad_norm": 0.14319421350955963,
166
+ "learning_rate": 0.0008604465709728868,
167
+ "loss": 0.3991,
168
+ "step": 10500
169
+ },
170
+ {
171
+ "epoch": 1.4619883040935673,
172
+ "grad_norm": 2.545884370803833,
173
+ "learning_rate": 0.0008538011695906432,
174
+ "loss": 0.4192,
175
+ "step": 11000
176
+ },
177
+ {
178
+ "epoch": 1.528442317916002,
179
+ "grad_norm": 0.12403066456317902,
180
+ "learning_rate": 0.0008471557682083998,
181
+ "loss": 0.3563,
182
+ "step": 11500
183
+ },
184
+ {
185
+ "epoch": 1.594896331738437,
186
+ "grad_norm": 41.519954681396484,
187
+ "learning_rate": 0.0008405103668261563,
188
+ "loss": 0.3435,
189
+ "step": 12000
190
+ },
191
+ {
192
+ "epoch": 1.661350345560872,
193
+ "grad_norm": 83.61852264404297,
194
+ "learning_rate": 0.0008338649654439129,
195
+ "loss": 0.3503,
196
+ "step": 12500
197
+ },
198
+ {
199
+ "epoch": 1.7278043593833068,
200
+ "grad_norm": 0.001769404741935432,
201
+ "learning_rate": 0.0008272195640616694,
202
+ "loss": 0.3238,
203
+ "step": 13000
204
+ },
205
+ {
206
+ "epoch": 1.7942583732057416,
207
+ "grad_norm": 1.7677043676376343,
208
+ "learning_rate": 0.0008205741626794258,
209
+ "loss": 0.38,
210
+ "step": 13500
211
+ },
212
+ {
213
+ "epoch": 1.8607123870281765,
214
+ "grad_norm": 1.0566127300262451,
215
+ "learning_rate": 0.0008139287612971824,
216
+ "loss": 0.4146,
217
+ "step": 14000
218
+ },
219
+ {
220
+ "epoch": 1.9271664008506115,
221
+ "grad_norm": 19.463109970092773,
222
+ "learning_rate": 0.0008072833599149389,
223
+ "loss": 0.4305,
224
+ "step": 14500
225
+ },
226
+ {
227
+ "epoch": 1.9936204146730463,
228
+ "grad_norm": 17.069889068603516,
229
+ "learning_rate": 0.0008006379585326954,
230
+ "loss": 0.3505,
231
+ "step": 15000
232
+ },
233
+ {
234
+ "epoch": 2.0,
235
+ "eval_accuracy": 0.9482649842271293,
236
+ "eval_f1": 0.9478124684113843,
237
+ "eval_loss": 0.3766539990901947,
238
+ "eval_precision": 0.9481744874506283,
239
+ "eval_recall": 0.9482649842271293,
240
+ "eval_runtime": 4.5607,
241
+ "eval_samples_per_second": 347.537,
242
+ "eval_steps_per_second": 87.049,
243
+ "step": 15048
244
+ },
245
+ {
246
+ "epoch": 2.060074428495481,
247
+ "grad_norm": 0.4118238389492035,
248
+ "learning_rate": 0.000793992557150452,
249
+ "loss": 0.3021,
250
+ "step": 15500
251
+ },
252
+ {
253
+ "epoch": 2.126528442317916,
254
+ "grad_norm": 0.4119320213794708,
255
+ "learning_rate": 0.0007873471557682083,
256
+ "loss": 0.3166,
257
+ "step": 16000
258
+ },
259
+ {
260
+ "epoch": 2.192982456140351,
261
+ "grad_norm": 10.00361442565918,
262
+ "learning_rate": 0.0007807017543859649,
263
+ "loss": 0.374,
264
+ "step": 16500
265
+ },
266
+ {
267
+ "epoch": 2.2594364699627856,
268
+ "grad_norm": 44.608726501464844,
269
+ "learning_rate": 0.0007740563530037215,
270
+ "loss": 0.4748,
271
+ "step": 17000
272
+ },
273
+ {
274
+ "epoch": 2.3258904837852206,
275
+ "grad_norm": 0.09617531299591064,
276
+ "learning_rate": 0.000767410951621478,
277
+ "loss": 0.3771,
278
+ "step": 17500
279
+ },
280
+ {
281
+ "epoch": 2.3923444976076556,
282
+ "grad_norm": 26.71993064880371,
283
+ "learning_rate": 0.0007607655502392344,
284
+ "loss": 0.4181,
285
+ "step": 18000
286
+ },
287
+ {
288
+ "epoch": 2.4587985114300905,
289
+ "grad_norm": 0.003970532212406397,
290
+ "learning_rate": 0.000754120148856991,
291
+ "loss": 0.3365,
292
+ "step": 18500
293
+ },
294
+ {
295
+ "epoch": 2.525252525252525,
296
+ "grad_norm": 0.023912647739052773,
297
+ "learning_rate": 0.0007474747474747475,
298
+ "loss": 0.3731,
299
+ "step": 19000
300
+ },
301
+ {
302
+ "epoch": 2.59170653907496,
303
+ "grad_norm": 0.08333996683359146,
304
+ "learning_rate": 0.000740829346092504,
305
+ "loss": 0.4489,
306
+ "step": 19500
307
+ },
308
+ {
309
+ "epoch": 2.658160552897395,
310
+ "grad_norm": 0.01645304262638092,
311
+ "learning_rate": 0.0007341839447102606,
312
+ "loss": 0.4246,
313
+ "step": 20000
314
+ },
315
+ {
316
+ "epoch": 2.72461456671983,
317
+ "grad_norm": 0.08779849112033844,
318
+ "learning_rate": 0.000727538543328017,
319
+ "loss": 0.4556,
320
+ "step": 20500
321
+ },
322
+ {
323
+ "epoch": 2.7910685805422646,
324
+ "grad_norm": 52.66293716430664,
325
+ "learning_rate": 0.0007208931419457735,
326
+ "loss": 0.3538,
327
+ "step": 21000
328
+ },
329
+ {
330
+ "epoch": 2.8575225943646996,
331
+ "grad_norm": 0.028336428105831146,
332
+ "learning_rate": 0.00071424774056353,
333
+ "loss": 0.3813,
334
+ "step": 21500
335
+ },
336
+ {
337
+ "epoch": 2.9239766081871346,
338
+ "grad_norm": 0.30558499693870544,
339
+ "learning_rate": 0.0007076023391812866,
340
+ "loss": 0.4138,
341
+ "step": 22000
342
+ },
343
+ {
344
+ "epoch": 2.990430622009569,
345
+ "grad_norm": 30.89914321899414,
346
+ "learning_rate": 0.0007009569377990431,
347
+ "loss": 0.3391,
348
+ "step": 22500
349
+ },
350
+ {
351
+ "epoch": 3.0,
352
+ "eval_accuracy": 0.9539432176656152,
353
+ "eval_f1": 0.95367799565447,
354
+ "eval_loss": 0.34262794256210327,
355
+ "eval_precision": 0.9535465559361256,
356
+ "eval_recall": 0.9539432176656152,
357
+ "eval_runtime": 4.5296,
358
+ "eval_samples_per_second": 349.923,
359
+ "eval_steps_per_second": 87.646,
360
+ "step": 22572
361
+ },
362
+ {
363
+ "epoch": 3.056884635832004,
364
+ "grad_norm": 280.99310302734375,
365
+ "learning_rate": 0.0006943115364167995,
366
+ "loss": 0.3269,
367
+ "step": 23000
368
+ },
369
+ {
370
+ "epoch": 3.123338649654439,
371
+ "grad_norm": 0.030926929786801338,
372
+ "learning_rate": 0.0006876661350345561,
373
+ "loss": 0.3015,
374
+ "step": 23500
375
+ },
376
+ {
377
+ "epoch": 3.189792663476874,
378
+ "grad_norm": 0.1642533391714096,
379
+ "learning_rate": 0.0006810207336523127,
380
+ "loss": 0.3959,
381
+ "step": 24000
382
+ },
383
+ {
384
+ "epoch": 3.256246677299309,
385
+ "grad_norm": 4.198115825653076,
386
+ "learning_rate": 0.000674375332270069,
387
+ "loss": 0.4014,
388
+ "step": 24500
389
+ },
390
+ {
391
+ "epoch": 3.3227006911217436,
392
+ "grad_norm": 0.007642796263098717,
393
+ "learning_rate": 0.0006677299308878256,
394
+ "loss": 0.3203,
395
+ "step": 25000
396
+ },
397
+ {
398
+ "epoch": 3.3891547049441786,
399
+ "grad_norm": 0.018859192728996277,
400
+ "learning_rate": 0.0006610845295055822,
401
+ "loss": 0.3617,
402
+ "step": 25500
403
+ },
404
+ {
405
+ "epoch": 3.4556087187666136,
406
+ "grad_norm": 0.1555991768836975,
407
+ "learning_rate": 0.0006544391281233386,
408
+ "loss": 0.34,
409
+ "step": 26000
410
+ },
411
+ {
412
+ "epoch": 3.522062732589048,
413
+ "grad_norm": 0.03736409544944763,
414
+ "learning_rate": 0.0006477937267410952,
415
+ "loss": 0.3342,
416
+ "step": 26500
417
+ },
418
+ {
419
+ "epoch": 3.588516746411483,
420
+ "grad_norm": 0.0046156104654073715,
421
+ "learning_rate": 0.0006411483253588518,
422
+ "loss": 0.3961,
423
+ "step": 27000
424
+ },
425
+ {
426
+ "epoch": 3.654970760233918,
427
+ "grad_norm": 27.846786499023438,
428
+ "learning_rate": 0.0006345029239766082,
429
+ "loss": 0.2895,
430
+ "step": 27500
431
+ },
432
+ {
433
+ "epoch": 3.721424774056353,
434
+ "grad_norm": 19.202760696411133,
435
+ "learning_rate": 0.0006278575225943647,
436
+ "loss": 0.4071,
437
+ "step": 28000
438
+ },
439
+ {
440
+ "epoch": 3.787878787878788,
441
+ "grad_norm": 0.007552656345069408,
442
+ "learning_rate": 0.0006212121212121212,
443
+ "loss": 0.3859,
444
+ "step": 28500
445
+ },
446
+ {
447
+ "epoch": 3.8543328017012226,
448
+ "grad_norm": 0.029448220506310463,
449
+ "learning_rate": 0.0006145667198298778,
450
+ "loss": 0.3642,
451
+ "step": 29000
452
+ },
453
+ {
454
+ "epoch": 3.9207868155236576,
455
+ "grad_norm": 2.9489197731018066,
456
+ "learning_rate": 0.0006079213184476342,
457
+ "loss": 0.3331,
458
+ "step": 29500
459
+ },
460
+ {
461
+ "epoch": 3.9872408293460926,
462
+ "grad_norm": 0.13416582345962524,
463
+ "learning_rate": 0.0006012759170653907,
464
+ "loss": 0.3399,
465
+ "step": 30000
466
+ },
467
+ {
468
+ "epoch": 4.0,
469
+ "eval_accuracy": 0.9533123028391167,
470
+ "eval_f1": 0.9528581216338866,
471
+ "eval_loss": 0.36635637283325195,
472
+ "eval_precision": 0.9528819559731596,
473
+ "eval_recall": 0.9533123028391167,
474
+ "eval_runtime": 4.1925,
475
+ "eval_samples_per_second": 378.06,
476
+ "eval_steps_per_second": 94.694,
477
+ "step": 30096
478
+ },
479
+ {
480
+ "epoch": 4.053694843168527,
481
+ "grad_norm": 28.457218170166016,
482
+ "learning_rate": 0.0005946305156831473,
483
+ "loss": 0.3025,
484
+ "step": 30500
485
+ },
486
+ {
487
+ "epoch": 4.120148856990962,
488
+ "grad_norm": 6.5367112159729,
489
+ "learning_rate": 0.0005879851143009038,
490
+ "loss": 0.314,
491
+ "step": 31000
492
+ },
493
+ {
494
+ "epoch": 4.186602870813397,
495
+ "grad_norm": 393.4518737792969,
496
+ "learning_rate": 0.0005813397129186602,
497
+ "loss": 0.3436,
498
+ "step": 31500
499
+ },
500
+ {
501
+ "epoch": 4.253056884635832,
502
+ "grad_norm": 0.9848179221153259,
503
+ "learning_rate": 0.0005746943115364168,
504
+ "loss": 0.2768,
505
+ "step": 32000
506
+ },
507
+ {
508
+ "epoch": 4.319510898458267,
509
+ "grad_norm": 2.0531139373779297,
510
+ "learning_rate": 0.0005680489101541734,
511
+ "loss": 0.3134,
512
+ "step": 32500
513
+ },
514
+ {
515
+ "epoch": 4.385964912280702,
516
+ "grad_norm": 0.055749546736478806,
517
+ "learning_rate": 0.0005614035087719298,
518
+ "loss": 0.3532,
519
+ "step": 33000
520
+ },
521
+ {
522
+ "epoch": 4.452418926103137,
523
+ "grad_norm": 0.4778645634651184,
524
+ "learning_rate": 0.0005547581073896864,
525
+ "loss": 0.3622,
526
+ "step": 33500
527
+ },
528
+ {
529
+ "epoch": 4.518872939925571,
530
+ "grad_norm": 0.061856046319007874,
531
+ "learning_rate": 0.0005481127060074428,
532
+ "loss": 0.3426,
533
+ "step": 34000
534
+ },
535
+ {
536
+ "epoch": 4.585326953748006,
537
+ "grad_norm": 0.026136351749300957,
538
+ "learning_rate": 0.0005414673046251993,
539
+ "loss": 0.3795,
540
+ "step": 34500
541
+ },
542
+ {
543
+ "epoch": 4.651780967570441,
544
+ "grad_norm": 0.03556622937321663,
545
+ "learning_rate": 0.0005348219032429559,
546
+ "loss": 0.3322,
547
+ "step": 35000
548
+ },
549
+ {
550
+ "epoch": 4.718234981392876,
551
+ "grad_norm": 0.14081618189811707,
552
+ "learning_rate": 0.0005281765018607124,
553
+ "loss": 0.3722,
554
+ "step": 35500
555
+ },
556
+ {
557
+ "epoch": 4.784688995215311,
558
+ "grad_norm": 100.0813217163086,
559
+ "learning_rate": 0.0005215311004784689,
560
+ "loss": 0.3467,
561
+ "step": 36000
562
+ },
563
+ {
564
+ "epoch": 4.851143009037746,
565
+ "grad_norm": 9.537514686584473,
566
+ "learning_rate": 0.0005148856990962254,
567
+ "loss": 0.3484,
568
+ "step": 36500
569
+ },
570
+ {
571
+ "epoch": 4.917597022860181,
572
+ "grad_norm": 0.048729896545410156,
573
+ "learning_rate": 0.0005082402977139819,
574
+ "loss": 0.3439,
575
+ "step": 37000
576
+ },
577
+ {
578
+ "epoch": 4.984051036682615,
579
+ "grad_norm": 0.005286164116114378,
580
+ "learning_rate": 0.0005015948963317385,
581
+ "loss": 0.3023,
582
+ "step": 37500
583
+ },
584
+ {
585
+ "epoch": 5.0,
586
+ "eval_accuracy": 0.9570977917981073,
587
+ "eval_f1": 0.9568038885748729,
588
+ "eval_loss": 0.3057607114315033,
589
+ "eval_precision": 0.9566095910966326,
590
+ "eval_recall": 0.9570977917981073,
591
+ "eval_runtime": 4.2904,
592
+ "eval_samples_per_second": 369.428,
593
+ "eval_steps_per_second": 92.532,
594
+ "step": 37620
595
+ },
596
+ {
597
+ "epoch": 5.05050505050505,
598
+ "grad_norm": 123.33903503417969,
599
+ "learning_rate": 0.000494949494949495,
600
+ "loss": 0.3801,
601
+ "step": 38000
602
+ },
603
+ {
604
+ "epoch": 5.116959064327485,
605
+ "grad_norm": 0.005817115306854248,
606
+ "learning_rate": 0.0004883040935672514,
607
+ "loss": 0.3047,
608
+ "step": 38500
609
+ },
610
+ {
611
+ "epoch": 5.18341307814992,
612
+ "grad_norm": 0.16751976311206818,
613
+ "learning_rate": 0.000481658692185008,
614
+ "loss": 0.4044,
615
+ "step": 39000
616
+ },
617
+ {
618
+ "epoch": 5.249867091972355,
619
+ "grad_norm": 60.48826599121094,
620
+ "learning_rate": 0.0004750132908027645,
621
+ "loss": 0.3485,
622
+ "step": 39500
623
+ },
624
+ {
625
+ "epoch": 5.31632110579479,
626
+ "grad_norm": 157.16188049316406,
627
+ "learning_rate": 0.000468367889420521,
628
+ "loss": 0.3368,
629
+ "step": 40000
630
+ },
631
+ {
632
+ "epoch": 5.382775119617225,
633
+ "grad_norm": 45.994049072265625,
634
+ "learning_rate": 0.00046172248803827756,
635
+ "loss": 0.3816,
636
+ "step": 40500
637
+ },
638
+ {
639
+ "epoch": 5.44922913343966,
640
+ "grad_norm": 15.62516975402832,
641
+ "learning_rate": 0.00045507708665603404,
642
+ "loss": 0.324,
643
+ "step": 41000
644
+ },
645
+ {
646
+ "epoch": 5.515683147262095,
647
+ "grad_norm": 289.2982177734375,
648
+ "learning_rate": 0.0004484316852737905,
649
+ "loss": 0.3031,
650
+ "step": 41500
651
+ },
652
+ {
653
+ "epoch": 5.582137161084529,
654
+ "grad_norm": 0.027738776057958603,
655
+ "learning_rate": 0.00044178628389154705,
656
+ "loss": 0.3392,
657
+ "step": 42000
658
+ },
659
+ {
660
+ "epoch": 5.648591174906964,
661
+ "grad_norm": 0.02977157197892666,
662
+ "learning_rate": 0.0004351408825093036,
663
+ "loss": 0.3477,
664
+ "step": 42500
665
+ },
666
+ {
667
+ "epoch": 5.715045188729399,
668
+ "grad_norm": 1.663713812828064,
669
+ "learning_rate": 0.0004284954811270601,
670
+ "loss": 0.3993,
671
+ "step": 43000
672
+ },
673
+ {
674
+ "epoch": 5.781499202551834,
675
+ "grad_norm": 2.4411869049072266,
676
+ "learning_rate": 0.0004218500797448166,
677
+ "loss": 0.422,
678
+ "step": 43500
679
+ },
680
+ {
681
+ "epoch": 5.847953216374269,
682
+ "grad_norm": 12.378539085388184,
683
+ "learning_rate": 0.0004152046783625731,
684
+ "loss": 0.3649,
685
+ "step": 44000
686
+ },
687
+ {
688
+ "epoch": 5.914407230196704,
689
+ "grad_norm": 82.05158996582031,
690
+ "learning_rate": 0.00040855927698032964,
691
+ "loss": 0.4191,
692
+ "step": 44500
693
+ },
694
+ {
695
+ "epoch": 5.980861244019139,
696
+ "grad_norm": 0.008256383240222931,
697
+ "learning_rate": 0.0004019138755980861,
698
+ "loss": 0.3437,
699
+ "step": 45000
700
+ },
701
+ {
702
+ "epoch": 6.0,
703
+ "eval_accuracy": 0.9501577287066246,
704
+ "eval_f1": 0.9497240205967022,
705
+ "eval_loss": 0.31248244643211365,
706
+ "eval_precision": 0.949826651119135,
707
+ "eval_recall": 0.9501577287066246,
708
+ "eval_runtime": 4.1272,
709
+ "eval_samples_per_second": 384.034,
710
+ "eval_steps_per_second": 96.19,
711
+ "step": 45144
712
+ },
713
+ {
714
+ "epoch": 6.047315257841573,
715
+ "grad_norm": 0.22720667719841003,
716
+ "learning_rate": 0.00039526847421584264,
717
+ "loss": 0.3774,
718
+ "step": 45500
719
+ },
720
+ {
721
+ "epoch": 6.113769271664008,
722
+ "grad_norm": 0.1796969771385193,
723
+ "learning_rate": 0.0003886230728335992,
724
+ "loss": 0.3625,
725
+ "step": 46000
726
+ },
727
+ {
728
+ "epoch": 6.180223285486443,
729
+ "grad_norm": 0.06664836406707764,
730
+ "learning_rate": 0.00038197767145135565,
731
+ "loss": 0.3096,
732
+ "step": 46500
733
+ },
734
+ {
735
+ "epoch": 6.246677299308878,
736
+ "grad_norm": 52.87346267700195,
737
+ "learning_rate": 0.0003753322700691122,
738
+ "loss": 0.324,
739
+ "step": 47000
740
+ },
741
+ {
742
+ "epoch": 6.313131313131313,
743
+ "grad_norm": 0.13641533255577087,
744
+ "learning_rate": 0.0003686868686868687,
745
+ "loss": 0.3824,
746
+ "step": 47500
747
+ },
748
+ {
749
+ "epoch": 6.379585326953748,
750
+ "grad_norm": 0.014752733521163464,
751
+ "learning_rate": 0.00036204146730462524,
752
+ "loss": 0.3576,
753
+ "step": 48000
754
+ },
755
+ {
756
+ "epoch": 6.446039340776183,
757
+ "grad_norm": 0.07991009950637817,
758
+ "learning_rate": 0.0003553960659223817,
759
+ "loss": 0.2889,
760
+ "step": 48500
761
+ },
762
+ {
763
+ "epoch": 6.512493354598618,
764
+ "grad_norm": 0.0857154056429863,
765
+ "learning_rate": 0.0003487506645401382,
766
+ "loss": 0.3496,
767
+ "step": 49000
768
+ },
769
+ {
770
+ "epoch": 6.578947368421053,
771
+ "grad_norm": 22.04611587524414,
772
+ "learning_rate": 0.00034210526315789477,
773
+ "loss": 0.3456,
774
+ "step": 49500
775
+ },
776
+ {
777
+ "epoch": 6.645401382243487,
778
+ "grad_norm": 0.3360465466976166,
779
+ "learning_rate": 0.00033545986177565125,
780
+ "loss": 0.3113,
781
+ "step": 50000
782
+ },
783
+ {
784
+ "epoch": 6.711855396065922,
785
+ "grad_norm": 0.011091183871030807,
786
+ "learning_rate": 0.0003288144603934078,
787
+ "loss": 0.3085,
788
+ "step": 50500
789
+ },
790
+ {
791
+ "epoch": 6.778309409888357,
792
+ "grad_norm": 45.16307830810547,
793
+ "learning_rate": 0.00032216905901116425,
794
+ "loss": 0.261,
795
+ "step": 51000
796
+ },
797
+ {
798
+ "epoch": 6.844763423710792,
799
+ "grad_norm": 0.10898467898368835,
800
+ "learning_rate": 0.0003155236576289208,
801
+ "loss": 0.2772,
802
+ "step": 51500
803
+ },
804
+ {
805
+ "epoch": 6.911217437533227,
806
+ "grad_norm": 0.04280232638120651,
807
+ "learning_rate": 0.0003088782562466773,
808
+ "loss": 0.3664,
809
+ "step": 52000
810
+ },
811
+ {
812
+ "epoch": 6.977671451355662,
813
+ "grad_norm": 0.44427451491355896,
814
+ "learning_rate": 0.0003022328548644338,
815
+ "loss": 0.2981,
816
+ "step": 52500
817
+ },
818
+ {
819
+ "epoch": 7.0,
820
+ "eval_accuracy": 0.9570977917981073,
821
+ "eval_f1": 0.9567609606627793,
822
+ "eval_loss": 0.3381944000720978,
823
+ "eval_precision": 0.9567551880330806,
824
+ "eval_recall": 0.9570977917981073,
825
+ "eval_runtime": 4.1238,
826
+ "eval_samples_per_second": 384.357,
827
+ "eval_steps_per_second": 96.271,
828
+ "step": 52668
829
+ },
830
+ {
831
+ "epoch": 7.044125465178097,
832
+ "grad_norm": 12.310619354248047,
833
+ "learning_rate": 0.00029558745348219037,
834
+ "loss": 0.2961,
835
+ "step": 53000
836
+ },
837
+ {
838
+ "epoch": 7.110579479000531,
839
+ "grad_norm": 0.021439863368868828,
840
+ "learning_rate": 0.00028894205209994685,
841
+ "loss": 0.3132,
842
+ "step": 53500
843
+ },
844
+ {
845
+ "epoch": 7.177033492822966,
846
+ "grad_norm": 12.506621360778809,
847
+ "learning_rate": 0.0002822966507177033,
848
+ "loss": 0.3065,
849
+ "step": 54000
850
+ },
851
+ {
852
+ "epoch": 7.243487506645401,
853
+ "grad_norm": 40.974212646484375,
854
+ "learning_rate": 0.00027565124933545985,
855
+ "loss": 0.3052,
856
+ "step": 54500
857
+ },
858
+ {
859
+ "epoch": 7.309941520467836,
860
+ "grad_norm": 17.352012634277344,
861
+ "learning_rate": 0.0002690058479532164,
862
+ "loss": 0.3074,
863
+ "step": 55000
864
+ },
865
+ {
866
+ "epoch": 7.376395534290271,
867
+ "grad_norm": 7.186513423919678,
868
+ "learning_rate": 0.0002623604465709729,
869
+ "loss": 0.2944,
870
+ "step": 55500
871
+ },
872
+ {
873
+ "epoch": 7.442849548112706,
874
+ "grad_norm": 0.11422441154718399,
875
+ "learning_rate": 0.0002557150451887294,
876
+ "loss": 0.3277,
877
+ "step": 56000
878
+ },
879
+ {
880
+ "epoch": 7.509303561935141,
881
+ "grad_norm": 0.4097649157047272,
882
+ "learning_rate": 0.0002490696438064859,
883
+ "loss": 0.3314,
884
+ "step": 56500
885
+ },
886
+ {
887
+ "epoch": 7.575757575757576,
888
+ "grad_norm": 255.17686462402344,
889
+ "learning_rate": 0.00024242424242424245,
890
+ "loss": 0.3849,
891
+ "step": 57000
892
+ },
893
+ {
894
+ "epoch": 7.642211589580011,
895
+ "grad_norm": 0.11329037696123123,
896
+ "learning_rate": 0.00023577884104199895,
897
+ "loss": 0.3603,
898
+ "step": 57500
899
+ },
900
+ {
901
+ "epoch": 7.708665603402445,
902
+ "grad_norm": 0.04299360513687134,
903
+ "learning_rate": 0.00022913343965975545,
904
+ "loss": 0.3467,
905
+ "step": 58000
906
+ },
907
+ {
908
+ "epoch": 7.77511961722488,
909
+ "grad_norm": 0.04895203933119774,
910
+ "learning_rate": 0.00022248803827751195,
911
+ "loss": 0.3428,
912
+ "step": 58500
913
+ },
914
+ {
915
+ "epoch": 7.841573631047315,
916
+ "grad_norm": 0.07165663689374924,
917
+ "learning_rate": 0.00021584263689526848,
918
+ "loss": 0.2874,
919
+ "step": 59000
920
+ },
921
+ {
922
+ "epoch": 7.90802764486975,
923
+ "grad_norm": 0.10646966099739075,
924
+ "learning_rate": 0.00020919723551302499,
925
+ "loss": 0.2834,
926
+ "step": 59500
927
+ },
928
+ {
929
+ "epoch": 7.974481658692185,
930
+ "grad_norm": 0.022936491295695305,
931
+ "learning_rate": 0.00020255183413078152,
932
+ "loss": 0.2899,
933
+ "step": 60000
934
+ },
935
+ {
936
+ "epoch": 8.0,
937
+ "eval_accuracy": 0.9577287066246056,
938
+ "eval_f1": 0.9575092656624108,
939
+ "eval_loss": 0.30500882863998413,
940
+ "eval_precision": 0.9575766504306299,
941
+ "eval_recall": 0.9577287066246056,
942
+ "eval_runtime": 4.5012,
943
+ "eval_samples_per_second": 352.132,
944
+ "eval_steps_per_second": 88.2,
945
+ "step": 60192
946
+ },
947
+ {
948
+ "epoch": 8.04093567251462,
949
+ "grad_norm": 0.4371676743030548,
950
+ "learning_rate": 0.00019590643274853802,
951
+ "loss": 0.3231,
952
+ "step": 60500
953
+ },
954
+ {
955
+ "epoch": 8.107389686337054,
956
+ "grad_norm": 0.000947824795730412,
957
+ "learning_rate": 0.00018926103136629452,
958
+ "loss": 0.3014,
959
+ "step": 61000
960
+ },
961
+ {
962
+ "epoch": 8.17384370015949,
963
+ "grad_norm": 0.06363413482904434,
964
+ "learning_rate": 0.00018261562998405105,
965
+ "loss": 0.2293,
966
+ "step": 61500
967
+ },
968
+ {
969
+ "epoch": 8.240297713981924,
970
+ "grad_norm": 1.2114511728286743,
971
+ "learning_rate": 0.00017597022860180755,
972
+ "loss": 0.2808,
973
+ "step": 62000
974
+ },
975
+ {
976
+ "epoch": 8.30675172780436,
977
+ "grad_norm": 23.535938262939453,
978
+ "learning_rate": 0.00016932482721956408,
979
+ "loss": 0.2595,
980
+ "step": 62500
981
+ },
982
+ {
983
+ "epoch": 8.373205741626794,
984
+ "grad_norm": 60.49204635620117,
985
+ "learning_rate": 0.00016267942583732056,
986
+ "loss": 0.3388,
987
+ "step": 63000
988
+ },
989
+ {
990
+ "epoch": 8.43965975544923,
991
+ "grad_norm": 14.233682632446289,
992
+ "learning_rate": 0.0001560340244550771,
993
+ "loss": 0.3423,
994
+ "step": 63500
995
+ },
996
+ {
997
+ "epoch": 8.506113769271664,
998
+ "grad_norm": 0.015386885032057762,
999
+ "learning_rate": 0.0001493886230728336,
1000
+ "loss": 0.316,
1001
+ "step": 64000
1002
+ },
1003
+ {
1004
+ "epoch": 8.5725677830941,
1005
+ "grad_norm": 0.3906301259994507,
1006
+ "learning_rate": 0.00014274322169059012,
1007
+ "loss": 0.3165,
1008
+ "step": 64500
1009
+ },
1010
+ {
1011
+ "epoch": 8.639021796916534,
1012
+ "grad_norm": 0.0586216077208519,
1013
+ "learning_rate": 0.00013609782030834665,
1014
+ "loss": 0.3013,
1015
+ "step": 65000
1016
+ },
1017
+ {
1018
+ "epoch": 8.70547581073897,
1019
+ "grad_norm": 0.006104405503720045,
1020
+ "learning_rate": 0.00012945241892610312,
1021
+ "loss": 0.2352,
1022
+ "step": 65500
1023
+ },
1024
+ {
1025
+ "epoch": 8.771929824561404,
1026
+ "grad_norm": 0.02979845367372036,
1027
+ "learning_rate": 0.00012280701754385965,
1028
+ "loss": 0.2203,
1029
+ "step": 66000
1030
+ },
1031
+ {
1032
+ "epoch": 8.83838383838384,
1033
+ "grad_norm": 0.08639369904994965,
1034
+ "learning_rate": 0.00011616161616161616,
1035
+ "loss": 0.2643,
1036
+ "step": 66500
1037
+ },
1038
+ {
1039
+ "epoch": 8.904837852206274,
1040
+ "grad_norm": 32.0872802734375,
1041
+ "learning_rate": 0.00010951621477937269,
1042
+ "loss": 0.2658,
1043
+ "step": 67000
1044
+ },
1045
+ {
1046
+ "epoch": 8.971291866028707,
1047
+ "grad_norm": 0.011845378205180168,
1048
+ "learning_rate": 0.00010287081339712919,
1049
+ "loss": 0.2795,
1050
+ "step": 67500
1051
+ },
1052
+ {
1053
+ "epoch": 9.0,
1054
+ "eval_accuracy": 0.9646687697160883,
1055
+ "eval_f1": 0.9644253672098426,
1056
+ "eval_loss": 0.2595302164554596,
1057
+ "eval_precision": 0.9644475825303181,
1058
+ "eval_recall": 0.9646687697160883,
1059
+ "eval_runtime": 4.3195,
1060
+ "eval_samples_per_second": 366.941,
1061
+ "eval_steps_per_second": 91.909,
1062
+ "step": 67716
1063
+ }
1064
+ ],
1065
+ "logging_steps": 500,
1066
+ "max_steps": 75240,
1067
+ "num_input_tokens_seen": 0,
1068
+ "num_train_epochs": 10,
1069
+ "save_steps": 500,
1070
+ "stateful_callbacks": {
1071
+ "TrainerControl": {
1072
+ "args": {
1073
+ "should_epoch_stop": false,
1074
+ "should_evaluate": false,
1075
+ "should_log": false,
1076
+ "should_save": true,
1077
+ "should_training_stop": false
1078
+ },
1079
+ "attributes": {}
1080
+ }
1081
+ },
1082
+ "total_flos": 2551274670587520.0,
1083
+ "train_batch_size": 4,
1084
+ "trial_name": null,
1085
+ "trial_params": null
1086
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6102d63306e03ce9ccefa5c06382dfc3655d1c1d06d494553fcb81a751b526ef
3
+ size 5240