sharren committed on
Commit
45c0e8f
1 Parent(s): c831ad4

Training in progress, step 100

Browse files
all_results.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.61,
3
+ "eval_accuracy": 0.8401525658807212,
4
+ "eval_f1": 0.8300860373682543,
5
+ "eval_loss": 0.5152533650398254,
6
+ "eval_precision": 0.8326625217058278,
7
+ "eval_recall": 0.8401525658807212,
8
+ "eval_runtime": 41.5803,
9
+ "eval_samples_per_second": 69.36,
10
+ "eval_steps_per_second": 8.682,
11
+ "total_flos": 2.2287694956200755e+18,
12
+ "train_loss": 0.2732895821850333,
13
+ "train_runtime": 1429.5622,
14
+ "train_samples_per_second": 358.711,
15
+ "train_steps_per_second": 22.454
16
+ }
config.json CHANGED
@@ -38,5 +38,5 @@
38
  "problem_type": "single_label_classification",
39
  "qkv_bias": true,
40
  "torch_dtype": "float32",
41
- "transformers_version": "4.38.2"
42
  }
 
38
  "problem_type": "single_label_classification",
39
  "qkv_bias": true,
40
  "torch_dtype": "float32",
41
+ "transformers_version": "4.39.0.dev0"
42
  }
eval_results.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.61,
3
+ "eval_accuracy": 0.8401525658807212,
4
+ "eval_f1": 0.8300860373682543,
5
+ "eval_loss": 0.5152533650398254,
6
+ "eval_precision": 0.8326625217058278,
7
+ "eval_recall": 0.8401525658807212,
8
+ "eval_runtime": 41.5803,
9
+ "eval_samples_per_second": 69.36,
10
+ "eval_steps_per_second": 8.682
11
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b63414d64d58868f9f12778c6f4f12d4cc8636a13e329ba9be76845535c66c90
3
  size 343239356
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37533831bd4454eece4c9e58d37e99dbcb12d932787fa67126bf9fdb7eabae8f
3
  size 343239356
preprocessor_config.json CHANGED
@@ -1,4 +1,18 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "do_normalize": true,
3
  "do_rescale": true,
4
  "do_resize": true,
 
1
  {
2
+ "_valid_processor_keys": [
3
+ "images",
4
+ "do_resize",
5
+ "size",
6
+ "resample",
7
+ "do_rescale",
8
+ "rescale_factor",
9
+ "do_normalize",
10
+ "image_mean",
11
+ "image_std",
12
+ "return_tensors",
13
+ "data_format",
14
+ "input_data_format"
15
+ ],
16
  "do_normalize": true,
17
  "do_rescale": true,
18
  "do_resize": true,
runs/Mar18_15-27-23_9c311a5b3773/events.out.tfevents.1710777139.9c311a5b3773.3314.21 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf8da78b31ec82b4130ab6869cf4497bea3a268b69f7e03c1ab9efdf51e57e86
3
+ size 560
runs/Mar18_16-06-19_9c311a5b3773/events.out.tfevents.1710777980.9c311a5b3773.3314.22 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9d64a813e3ef537dcb26823e2bc3876e5441054327691f52e866098a8c85c4a
3
+ size 4700
runs/Mar18_16-51-36_9c311a5b3773/events.out.tfevents.1710780698.9c311a5b3773.175663.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e7dd54f595722d5a95266c3b948b19757260d44ead65a25472066167ac1409c
3
+ size 7264
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.61,
3
+ "total_flos": 2.2287694956200755e+18,
4
+ "train_loss": 0.2732895821850333,
5
+ "train_runtime": 1429.5622,
6
+ "train_samples_per_second": 358.711,
7
+ "train_steps_per_second": 22.454
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1506 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.5152533650398254,
3
+ "best_model_checkpoint": "./vit-lr-cosine-restarts/checkpoint-800",
4
+ "epoch": 5.607476635514018,
5
+ "eval_steps": 100,
6
+ "global_step": 1800,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.03,
13
+ "grad_norm": 5.076780319213867,
14
+ "learning_rate": 9.999998060388815e-05,
15
+ "loss": 1.3486,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.06,
20
+ "grad_norm": 4.030631065368652,
21
+ "learning_rate": 9.999991355561956e-05,
22
+ "loss": 0.7525,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.09,
27
+ "grad_norm": 4.877225875854492,
28
+ "learning_rate": 9.999979861580028e-05,
29
+ "loss": 0.833,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.12,
34
+ "grad_norm": 7.290777683258057,
35
+ "learning_rate": 9.99996357845404e-05,
36
+ "loss": 0.9935,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.16,
41
+ "grad_norm": 5.126965522766113,
42
+ "learning_rate": 9.999942506199588e-05,
43
+ "loss": 0.7271,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.19,
48
+ "grad_norm": 6.139286518096924,
49
+ "learning_rate": 9.999916644836857e-05,
50
+ "loss": 0.8015,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.22,
55
+ "grad_norm": 4.372274398803711,
56
+ "learning_rate": 9.999885994390619e-05,
57
+ "loss": 0.8235,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.25,
62
+ "grad_norm": 4.294188976287842,
63
+ "learning_rate": 9.999850554890226e-05,
64
+ "loss": 0.6745,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.28,
69
+ "grad_norm": 5.877817153930664,
70
+ "learning_rate": 9.999810326369631e-05,
71
+ "loss": 0.6976,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.31,
76
+ "grad_norm": 4.55079984664917,
77
+ "learning_rate": 9.999765308867361e-05,
78
+ "loss": 0.5734,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.31,
83
+ "eval_accuracy": 0.7884882108183079,
84
+ "eval_f1": 0.7654615392142581,
85
+ "eval_loss": 0.6007876992225647,
86
+ "eval_precision": 0.763065724803555,
87
+ "eval_recall": 0.7884882108183079,
88
+ "eval_runtime": 39.2669,
89
+ "eval_samples_per_second": 73.446,
90
+ "eval_steps_per_second": 9.193,
91
+ "step": 100
92
+ },
93
+ {
94
+ "epoch": 0.34,
95
+ "grad_norm": 3.842815399169922,
96
+ "learning_rate": 9.999715502426537e-05,
97
+ "loss": 0.6458,
98
+ "step": 110
99
+ },
100
+ {
101
+ "epoch": 0.37,
102
+ "grad_norm": 8.019635200500488,
103
+ "learning_rate": 9.999660907094863e-05,
104
+ "loss": 0.6384,
105
+ "step": 120
106
+ },
107
+ {
108
+ "epoch": 0.4,
109
+ "grad_norm": 7.878860950469971,
110
+ "learning_rate": 9.999601522924635e-05,
111
+ "loss": 0.7051,
112
+ "step": 130
113
+ },
114
+ {
115
+ "epoch": 0.44,
116
+ "grad_norm": 2.1701011657714844,
117
+ "learning_rate": 9.999537349972733e-05,
118
+ "loss": 0.5199,
119
+ "step": 140
120
+ },
121
+ {
122
+ "epoch": 0.47,
123
+ "grad_norm": 4.154609680175781,
124
+ "learning_rate": 9.999468388300622e-05,
125
+ "loss": 0.5967,
126
+ "step": 150
127
+ },
128
+ {
129
+ "epoch": 0.5,
130
+ "grad_norm": 5.633548259735107,
131
+ "learning_rate": 9.999394637974358e-05,
132
+ "loss": 0.5481,
133
+ "step": 160
134
+ },
135
+ {
136
+ "epoch": 0.53,
137
+ "grad_norm": 3.171057939529419,
138
+ "learning_rate": 9.99931609906458e-05,
139
+ "loss": 0.6302,
140
+ "step": 170
141
+ },
142
+ {
143
+ "epoch": 0.56,
144
+ "grad_norm": 5.892911911010742,
145
+ "learning_rate": 9.999232771646514e-05,
146
+ "loss": 0.655,
147
+ "step": 180
148
+ },
149
+ {
150
+ "epoch": 0.59,
151
+ "grad_norm": 3.430375576019287,
152
+ "learning_rate": 9.999144655799976e-05,
153
+ "loss": 0.5747,
154
+ "step": 190
155
+ },
156
+ {
157
+ "epoch": 0.62,
158
+ "grad_norm": 5.8416056632995605,
159
+ "learning_rate": 9.999051751609367e-05,
160
+ "loss": 0.5602,
161
+ "step": 200
162
+ },
163
+ {
164
+ "epoch": 0.62,
165
+ "eval_accuracy": 0.7541608876560333,
166
+ "eval_f1": 0.7003949849463896,
167
+ "eval_loss": 0.784325361251831,
168
+ "eval_precision": 0.742547935218919,
169
+ "eval_recall": 0.7541608876560333,
170
+ "eval_runtime": 38.9694,
171
+ "eval_samples_per_second": 74.007,
172
+ "eval_steps_per_second": 9.264,
173
+ "step": 200
174
+ },
175
+ {
176
+ "epoch": 0.65,
177
+ "grad_norm": 5.451653480529785,
178
+ "learning_rate": 9.998954059163672e-05,
179
+ "loss": 0.6803,
180
+ "step": 210
181
+ },
182
+ {
183
+ "epoch": 0.69,
184
+ "grad_norm": 6.584908485412598,
185
+ "learning_rate": 9.998851578556461e-05,
186
+ "loss": 0.7019,
187
+ "step": 220
188
+ },
189
+ {
190
+ "epoch": 0.72,
191
+ "grad_norm": 2.3256571292877197,
192
+ "learning_rate": 9.998744309885899e-05,
193
+ "loss": 0.4028,
194
+ "step": 230
195
+ },
196
+ {
197
+ "epoch": 0.75,
198
+ "grad_norm": 3.929255962371826,
199
+ "learning_rate": 9.998632253254729e-05,
200
+ "loss": 0.5235,
201
+ "step": 240
202
+ },
203
+ {
204
+ "epoch": 0.78,
205
+ "grad_norm": 4.65420389175415,
206
+ "learning_rate": 9.998515408770281e-05,
207
+ "loss": 0.6575,
208
+ "step": 250
209
+ },
210
+ {
211
+ "epoch": 0.81,
212
+ "grad_norm": 5.167401313781738,
213
+ "learning_rate": 9.998393776544475e-05,
214
+ "loss": 0.5393,
215
+ "step": 260
216
+ },
217
+ {
218
+ "epoch": 0.84,
219
+ "grad_norm": 3.1561028957366943,
220
+ "learning_rate": 9.998267356693811e-05,
221
+ "loss": 0.4725,
222
+ "step": 270
223
+ },
224
+ {
225
+ "epoch": 0.87,
226
+ "grad_norm": 4.633551597595215,
227
+ "learning_rate": 9.998136149339382e-05,
228
+ "loss": 0.4838,
229
+ "step": 280
230
+ },
231
+ {
232
+ "epoch": 0.9,
233
+ "grad_norm": 6.649106979370117,
234
+ "learning_rate": 9.99800015460686e-05,
235
+ "loss": 0.6252,
236
+ "step": 290
237
+ },
238
+ {
239
+ "epoch": 0.93,
240
+ "grad_norm": 6.892623424530029,
241
+ "learning_rate": 9.997859372626506e-05,
242
+ "loss": 0.7117,
243
+ "step": 300
244
+ },
245
+ {
246
+ "epoch": 0.93,
247
+ "eval_accuracy": 0.7659500693481276,
248
+ "eval_f1": 0.7754174880710092,
249
+ "eval_loss": 0.6221866011619568,
250
+ "eval_precision": 0.8158439426784209,
251
+ "eval_recall": 0.7659500693481276,
252
+ "eval_runtime": 38.9156,
253
+ "eval_samples_per_second": 74.109,
254
+ "eval_steps_per_second": 9.276,
255
+ "step": 300
256
+ },
257
+ {
258
+ "epoch": 0.97,
259
+ "grad_norm": 5.1511149406433105,
260
+ "learning_rate": 9.997713803533167e-05,
261
+ "loss": 0.5396,
262
+ "step": 310
263
+ },
264
+ {
265
+ "epoch": 1.0,
266
+ "grad_norm": 5.100913047790527,
267
+ "learning_rate": 9.997563447466271e-05,
268
+ "loss": 0.4868,
269
+ "step": 320
270
+ },
271
+ {
272
+ "epoch": 1.03,
273
+ "grad_norm": 5.780107498168945,
274
+ "learning_rate": 9.997408304569836e-05,
275
+ "loss": 0.4748,
276
+ "step": 330
277
+ },
278
+ {
279
+ "epoch": 1.06,
280
+ "grad_norm": 5.687930107116699,
281
+ "learning_rate": 9.997248374992462e-05,
282
+ "loss": 0.3881,
283
+ "step": 340
284
+ },
285
+ {
286
+ "epoch": 1.09,
287
+ "grad_norm": 3.1763672828674316,
288
+ "learning_rate": 9.997083658887336e-05,
289
+ "loss": 0.4268,
290
+ "step": 350
291
+ },
292
+ {
293
+ "epoch": 1.12,
294
+ "grad_norm": 2.4463164806365967,
295
+ "learning_rate": 9.996914156412227e-05,
296
+ "loss": 0.2806,
297
+ "step": 360
298
+ },
299
+ {
300
+ "epoch": 1.15,
301
+ "grad_norm": 3.662301778793335,
302
+ "learning_rate": 9.99673986772949e-05,
303
+ "loss": 0.4249,
304
+ "step": 370
305
+ },
306
+ {
307
+ "epoch": 1.18,
308
+ "grad_norm": 7.021162986755371,
309
+ "learning_rate": 9.996560793006067e-05,
310
+ "loss": 0.4093,
311
+ "step": 380
312
+ },
313
+ {
314
+ "epoch": 1.21,
315
+ "grad_norm": 4.5534539222717285,
316
+ "learning_rate": 9.996376932413478e-05,
317
+ "loss": 0.4531,
318
+ "step": 390
319
+ },
320
+ {
321
+ "epoch": 1.25,
322
+ "grad_norm": 2.38512921333313,
323
+ "learning_rate": 9.996188286127832e-05,
324
+ "loss": 0.4445,
325
+ "step": 400
326
+ },
327
+ {
328
+ "epoch": 1.25,
329
+ "eval_accuracy": 0.7923023578363384,
330
+ "eval_f1": 0.7998929576261109,
331
+ "eval_loss": 0.5481300950050354,
332
+ "eval_precision": 0.8180996676834356,
333
+ "eval_recall": 0.7923023578363384,
334
+ "eval_runtime": 39.7555,
335
+ "eval_samples_per_second": 72.543,
336
+ "eval_steps_per_second": 9.081,
337
+ "step": 400
338
+ },
339
+ {
340
+ "epoch": 1.28,
341
+ "grad_norm": 4.16684103012085,
342
+ "learning_rate": 9.995994854329822e-05,
343
+ "loss": 0.3825,
344
+ "step": 410
345
+ },
346
+ {
347
+ "epoch": 1.31,
348
+ "grad_norm": 4.1305155754089355,
349
+ "learning_rate": 9.995796637204721e-05,
350
+ "loss": 0.4733,
351
+ "step": 420
352
+ },
353
+ {
354
+ "epoch": 1.34,
355
+ "grad_norm": 5.665832042694092,
356
+ "learning_rate": 9.99559363494239e-05,
357
+ "loss": 0.5294,
358
+ "step": 430
359
+ },
360
+ {
361
+ "epoch": 1.37,
362
+ "grad_norm": 4.365192413330078,
363
+ "learning_rate": 9.995385847737268e-05,
364
+ "loss": 0.4245,
365
+ "step": 440
366
+ },
367
+ {
368
+ "epoch": 1.4,
369
+ "grad_norm": 4.794129371643066,
370
+ "learning_rate": 9.995173275788385e-05,
371
+ "loss": 0.3817,
372
+ "step": 450
373
+ },
374
+ {
375
+ "epoch": 1.43,
376
+ "grad_norm": 5.548172950744629,
377
+ "learning_rate": 9.994955919299347e-05,
378
+ "loss": 0.4014,
379
+ "step": 460
380
+ },
381
+ {
382
+ "epoch": 1.46,
383
+ "grad_norm": 3.7581427097320557,
384
+ "learning_rate": 9.994733778478344e-05,
385
+ "loss": 0.3768,
386
+ "step": 470
387
+ },
388
+ {
389
+ "epoch": 1.5,
390
+ "grad_norm": 6.855079174041748,
391
+ "learning_rate": 9.994506853538152e-05,
392
+ "loss": 0.4268,
393
+ "step": 480
394
+ },
395
+ {
396
+ "epoch": 1.53,
397
+ "grad_norm": 4.8678483963012695,
398
+ "learning_rate": 9.994275144696124e-05,
399
+ "loss": 0.4195,
400
+ "step": 490
401
+ },
402
+ {
403
+ "epoch": 1.56,
404
+ "grad_norm": 2.5210819244384766,
405
+ "learning_rate": 9.994038652174203e-05,
406
+ "loss": 0.3471,
407
+ "step": 500
408
+ },
409
+ {
410
+ "epoch": 1.56,
411
+ "eval_accuracy": 0.8217753120665742,
412
+ "eval_f1": 0.8047594572379952,
413
+ "eval_loss": 0.5284826159477234,
414
+ "eval_precision": 0.8158080424554234,
415
+ "eval_recall": 0.8217753120665742,
416
+ "eval_runtime": 39.5975,
417
+ "eval_samples_per_second": 72.833,
418
+ "eval_steps_per_second": 9.117,
419
+ "step": 500
420
+ },
421
+ {
422
+ "epoch": 1.59,
423
+ "grad_norm": 3.7428205013275146,
424
+ "learning_rate": 9.993797376198904e-05,
425
+ "loss": 0.4145,
426
+ "step": 510
427
+ },
428
+ {
429
+ "epoch": 1.62,
430
+ "grad_norm": 4.127384185791016,
431
+ "learning_rate": 9.993551317001332e-05,
432
+ "loss": 0.3791,
433
+ "step": 520
434
+ },
435
+ {
436
+ "epoch": 1.65,
437
+ "grad_norm": 5.904128074645996,
438
+ "learning_rate": 9.993300474817171e-05,
439
+ "loss": 0.4183,
440
+ "step": 530
441
+ },
442
+ {
443
+ "epoch": 1.68,
444
+ "grad_norm": 3.835289478302002,
445
+ "learning_rate": 9.993044849886683e-05,
446
+ "loss": 0.3489,
447
+ "step": 540
448
+ },
449
+ {
450
+ "epoch": 1.71,
451
+ "grad_norm": 4.441788673400879,
452
+ "learning_rate": 9.992784442454718e-05,
453
+ "loss": 0.52,
454
+ "step": 550
455
+ },
456
+ {
457
+ "epoch": 1.74,
458
+ "grad_norm": 4.608953475952148,
459
+ "learning_rate": 9.9925192527707e-05,
460
+ "loss": 0.4983,
461
+ "step": 560
462
+ },
463
+ {
464
+ "epoch": 1.78,
465
+ "grad_norm": 3.4798424243927,
466
+ "learning_rate": 9.992249281088636e-05,
467
+ "loss": 0.3766,
468
+ "step": 570
469
+ },
470
+ {
471
+ "epoch": 1.81,
472
+ "grad_norm": 3.5282986164093018,
473
+ "learning_rate": 9.991974527667115e-05,
474
+ "loss": 0.2979,
475
+ "step": 580
476
+ },
477
+ {
478
+ "epoch": 1.84,
479
+ "grad_norm": 6.50264310836792,
480
+ "learning_rate": 9.991694992769305e-05,
481
+ "loss": 0.4602,
482
+ "step": 590
483
+ },
484
+ {
485
+ "epoch": 1.87,
486
+ "grad_norm": 1.7083078622817993,
487
+ "learning_rate": 9.991410676662952e-05,
488
+ "loss": 0.3144,
489
+ "step": 600
490
+ },
491
+ {
492
+ "epoch": 1.87,
493
+ "eval_accuracy": 0.7961165048543689,
494
+ "eval_f1": 0.8022817302667706,
495
+ "eval_loss": 0.5565336346626282,
496
+ "eval_precision": 0.8311598226972493,
497
+ "eval_recall": 0.7961165048543689,
498
+ "eval_runtime": 38.759,
499
+ "eval_samples_per_second": 74.408,
500
+ "eval_steps_per_second": 9.314,
501
+ "step": 600
502
+ },
503
+ {
504
+ "epoch": 1.9,
505
+ "grad_norm": 5.729618549346924,
506
+ "learning_rate": 9.991121579620385e-05,
507
+ "loss": 0.4488,
508
+ "step": 610
509
+ },
510
+ {
511
+ "epoch": 1.93,
512
+ "grad_norm": 3.340083599090576,
513
+ "learning_rate": 9.99082770191851e-05,
514
+ "loss": 0.3636,
515
+ "step": 620
516
+ },
517
+ {
518
+ "epoch": 1.96,
519
+ "grad_norm": 2.685565710067749,
520
+ "learning_rate": 9.990529043838812e-05,
521
+ "loss": 0.2454,
522
+ "step": 630
523
+ },
524
+ {
525
+ "epoch": 1.99,
526
+ "grad_norm": 4.4097676277160645,
527
+ "learning_rate": 9.990225605667357e-05,
528
+ "loss": 0.413,
529
+ "step": 640
530
+ },
531
+ {
532
+ "epoch": 2.02,
533
+ "grad_norm": 3.5152175426483154,
534
+ "learning_rate": 9.989917387694786e-05,
535
+ "loss": 0.2442,
536
+ "step": 650
537
+ },
538
+ {
539
+ "epoch": 2.06,
540
+ "grad_norm": 3.326936960220337,
541
+ "learning_rate": 9.989604390216322e-05,
542
+ "loss": 0.206,
543
+ "step": 660
544
+ },
545
+ {
546
+ "epoch": 2.09,
547
+ "grad_norm": 5.301408767700195,
548
+ "learning_rate": 9.989286613531763e-05,
549
+ "loss": 0.305,
550
+ "step": 670
551
+ },
552
+ {
553
+ "epoch": 2.12,
554
+ "grad_norm": 4.104938507080078,
555
+ "learning_rate": 9.988964057945486e-05,
556
+ "loss": 0.2568,
557
+ "step": 680
558
+ },
559
+ {
560
+ "epoch": 2.15,
561
+ "grad_norm": 3.0700721740722656,
562
+ "learning_rate": 9.988636723766446e-05,
563
+ "loss": 0.2259,
564
+ "step": 690
565
+ },
566
+ {
567
+ "epoch": 2.18,
568
+ "grad_norm": 2.798845052719116,
569
+ "learning_rate": 9.988304611308174e-05,
570
+ "loss": 0.1702,
571
+ "step": 700
572
+ },
573
+ {
574
+ "epoch": 2.18,
575
+ "eval_accuracy": 0.8255894590846047,
576
+ "eval_f1": 0.8239856028376362,
577
+ "eval_loss": 0.5403878688812256,
578
+ "eval_precision": 0.8319658160543976,
579
+ "eval_recall": 0.8255894590846047,
580
+ "eval_runtime": 39.5591,
581
+ "eval_samples_per_second": 72.903,
582
+ "eval_steps_per_second": 9.126,
583
+ "step": 700
584
+ },
585
+ {
586
+ "epoch": 2.21,
587
+ "grad_norm": 0.7119998931884766,
588
+ "learning_rate": 9.987967720888777e-05,
589
+ "loss": 0.1882,
590
+ "step": 710
591
+ },
592
+ {
593
+ "epoch": 2.24,
594
+ "grad_norm": 4.365331649780273,
595
+ "learning_rate": 9.987626052830943e-05,
596
+ "loss": 0.3553,
597
+ "step": 720
598
+ },
599
+ {
600
+ "epoch": 2.27,
601
+ "grad_norm": 5.9306511878967285,
602
+ "learning_rate": 9.98727960746193e-05,
603
+ "loss": 0.3306,
604
+ "step": 730
605
+ },
606
+ {
607
+ "epoch": 2.31,
608
+ "grad_norm": 1.0953195095062256,
609
+ "learning_rate": 9.986928385113575e-05,
610
+ "loss": 0.1931,
611
+ "step": 740
612
+ },
613
+ {
614
+ "epoch": 2.34,
615
+ "grad_norm": 6.899849891662598,
616
+ "learning_rate": 9.986572386122291e-05,
617
+ "loss": 0.1806,
618
+ "step": 750
619
+ },
620
+ {
621
+ "epoch": 2.37,
622
+ "grad_norm": 1.6626615524291992,
623
+ "learning_rate": 9.986211610829065e-05,
624
+ "loss": 0.1701,
625
+ "step": 760
626
+ },
627
+ {
628
+ "epoch": 2.4,
629
+ "grad_norm": 4.0154709815979,
630
+ "learning_rate": 9.98584605957946e-05,
631
+ "loss": 0.2327,
632
+ "step": 770
633
+ },
634
+ {
635
+ "epoch": 2.43,
636
+ "grad_norm": 2.971966028213501,
637
+ "learning_rate": 9.98547573272361e-05,
638
+ "loss": 0.3,
639
+ "step": 780
640
+ },
641
+ {
642
+ "epoch": 2.46,
643
+ "grad_norm": 3.9327046871185303,
644
+ "learning_rate": 9.985100630616231e-05,
645
+ "loss": 0.3169,
646
+ "step": 790
647
+ },
648
+ {
649
+ "epoch": 2.49,
650
+ "grad_norm": 2.108839750289917,
651
+ "learning_rate": 9.984720753616604e-05,
652
+ "loss": 0.2557,
653
+ "step": 800
654
+ },
655
+ {
656
+ "epoch": 2.49,
657
+ "eval_accuracy": 0.8401525658807212,
658
+ "eval_f1": 0.8300860373682543,
659
+ "eval_loss": 0.5152533650398254,
660
+ "eval_precision": 0.8326625217058278,
661
+ "eval_recall": 0.8401525658807212,
662
+ "eval_runtime": 39.7502,
663
+ "eval_samples_per_second": 72.553,
664
+ "eval_steps_per_second": 9.082,
665
+ "step": 800
666
+ },
667
+ {
668
+ "epoch": 2.52,
669
+ "grad_norm": 2.910875082015991,
670
+ "learning_rate": 9.98433610208859e-05,
671
+ "loss": 0.4144,
672
+ "step": 810
673
+ },
674
+ {
675
+ "epoch": 2.55,
676
+ "grad_norm": 3.922912359237671,
677
+ "learning_rate": 9.98394667640062e-05,
678
+ "loss": 0.2054,
679
+ "step": 820
680
+ },
681
+ {
682
+ "epoch": 2.59,
683
+ "grad_norm": 4.223613739013672,
684
+ "learning_rate": 9.983552476925697e-05,
685
+ "loss": 0.2299,
686
+ "step": 830
687
+ },
688
+ {
689
+ "epoch": 2.62,
690
+ "grad_norm": 4.242312908172607,
691
+ "learning_rate": 9.983153504041402e-05,
692
+ "loss": 0.176,
693
+ "step": 840
694
+ },
695
+ {
696
+ "epoch": 2.65,
697
+ "grad_norm": 2.051708936691284,
698
+ "learning_rate": 9.98274975812988e-05,
699
+ "loss": 0.1621,
700
+ "step": 850
701
+ },
702
+ {
703
+ "epoch": 2.68,
704
+ "grad_norm": 4.030768871307373,
705
+ "learning_rate": 9.982341239577855e-05,
706
+ "loss": 0.2601,
707
+ "step": 860
708
+ },
709
+ {
710
+ "epoch": 2.71,
711
+ "grad_norm": 3.5723698139190674,
712
+ "learning_rate": 9.98192794877662e-05,
713
+ "loss": 0.2528,
714
+ "step": 870
715
+ },
716
+ {
717
+ "epoch": 2.74,
718
+ "grad_norm": 8.439871788024902,
719
+ "learning_rate": 9.981509886122034e-05,
720
+ "loss": 0.2787,
721
+ "step": 880
722
+ },
723
+ {
724
+ "epoch": 2.77,
725
+ "grad_norm": 1.1091142892837524,
726
+ "learning_rate": 9.981087052014534e-05,
727
+ "loss": 0.2204,
728
+ "step": 890
729
+ },
730
+ {
731
+ "epoch": 2.8,
732
+ "grad_norm": 6.232639312744141,
733
+ "learning_rate": 9.980659446859127e-05,
734
+ "loss": 0.1579,
735
+ "step": 900
736
+ },
737
+ {
738
+ "epoch": 2.8,
739
+ "eval_accuracy": 0.8217753120665742,
740
+ "eval_f1": 0.8250088481014233,
741
+ "eval_loss": 0.5866798162460327,
742
+ "eval_precision": 0.841958803508682,
743
+ "eval_recall": 0.8217753120665742,
744
+ "eval_runtime": 39.5043,
745
+ "eval_samples_per_second": 73.005,
746
+ "eval_steps_per_second": 9.138,
747
+ "step": 900
748
+ },
749
+ {
750
+ "epoch": 2.83,
751
+ "grad_norm": 7.044712543487549,
752
+ "learning_rate": 9.980227071065382e-05,
753
+ "loss": 0.3333,
754
+ "step": 910
755
+ },
756
+ {
757
+ "epoch": 2.87,
758
+ "grad_norm": 5.107402324676514,
759
+ "learning_rate": 9.979789925047447e-05,
760
+ "loss": 0.2788,
761
+ "step": 920
762
+ },
763
+ {
764
+ "epoch": 2.9,
765
+ "grad_norm": 3.1622753143310547,
766
+ "learning_rate": 9.979348009224032e-05,
767
+ "loss": 0.3464,
768
+ "step": 930
769
+ },
770
+ {
771
+ "epoch": 2.93,
772
+ "grad_norm": 6.17850399017334,
773
+ "learning_rate": 9.97890132401842e-05,
774
+ "loss": 0.3268,
775
+ "step": 940
776
+ },
777
+ {
778
+ "epoch": 2.96,
779
+ "grad_norm": 0.5498158931732178,
780
+ "learning_rate": 9.978449869858458e-05,
781
+ "loss": 0.2121,
782
+ "step": 950
783
+ },
784
+ {
785
+ "epoch": 2.99,
786
+ "grad_norm": 5.8213300704956055,
787
+ "learning_rate": 9.977993647176566e-05,
788
+ "loss": 0.1871,
789
+ "step": 960
790
+ },
791
+ {
792
+ "epoch": 3.02,
793
+ "grad_norm": 0.513097882270813,
794
+ "learning_rate": 9.97753265640973e-05,
795
+ "loss": 0.1037,
796
+ "step": 970
797
+ },
798
+ {
799
+ "epoch": 3.05,
800
+ "grad_norm": 5.008875846862793,
801
+ "learning_rate": 9.977066897999499e-05,
802
+ "loss": 0.1076,
803
+ "step": 980
804
+ },
805
+ {
806
+ "epoch": 3.08,
807
+ "grad_norm": 2.2303969860076904,
808
+ "learning_rate": 9.976596372391993e-05,
809
+ "loss": 0.0691,
810
+ "step": 990
811
+ },
812
+ {
813
+ "epoch": 3.12,
814
+ "grad_norm": 2.022919178009033,
815
+ "learning_rate": 9.976121080037899e-05,
816
+ "loss": 0.0815,
817
+ "step": 1000
818
+ },
819
+ {
820
+ "epoch": 3.12,
821
+ "eval_accuracy": 0.8401525658807212,
822
+ "eval_f1": 0.8350952876276385,
823
+ "eval_loss": 0.621809184551239,
824
+ "eval_precision": 0.8475972155785173,
825
+ "eval_recall": 0.8401525658807212,
826
+ "eval_runtime": 41.1855,
827
+ "eval_samples_per_second": 70.025,
828
+ "eval_steps_per_second": 8.765,
829
+ "step": 1000
830
+ },
831
+ {
832
+ "epoch": 3.15,
833
+ "grad_norm": 3.9770946502685547,
834
+ "learning_rate": 9.975641021392464e-05,
835
+ "loss": 0.0985,
836
+ "step": 1010
837
+ },
838
+ {
839
+ "epoch": 3.18,
840
+ "grad_norm": 1.0038492679595947,
841
+ "learning_rate": 9.975156196915505e-05,
842
+ "loss": 0.1406,
843
+ "step": 1020
844
+ },
845
+ {
846
+ "epoch": 3.21,
847
+ "grad_norm": 2.54162335395813,
848
+ "learning_rate": 9.974666607071404e-05,
849
+ "loss": 0.1524,
850
+ "step": 1030
851
+ },
852
+ {
853
+ "epoch": 3.24,
854
+ "grad_norm": 3.7266845703125,
855
+ "learning_rate": 9.974172252329104e-05,
856
+ "loss": 0.0771,
857
+ "step": 1040
858
+ },
859
+ {
860
+ "epoch": 3.27,
861
+ "grad_norm": 8.810432434082031,
862
+ "learning_rate": 9.973673133162115e-05,
863
+ "loss": 0.1663,
864
+ "step": 1050
865
+ },
866
+ {
867
+ "epoch": 3.3,
868
+ "grad_norm": 0.4533999562263489,
869
+ "learning_rate": 9.973169250048511e-05,
870
+ "loss": 0.0733,
871
+ "step": 1060
872
+ },
873
+ {
874
+ "epoch": 3.33,
875
+ "grad_norm": 0.25851675868034363,
876
+ "learning_rate": 9.972660603470927e-05,
877
+ "loss": 0.1261,
878
+ "step": 1070
879
+ },
880
+ {
881
+ "epoch": 3.36,
882
+ "grad_norm": 4.003190994262695,
883
+ "learning_rate": 9.97214719391656e-05,
884
+ "loss": 0.1786,
885
+ "step": 1080
886
+ },
887
+ {
888
+ "epoch": 3.4,
889
+ "grad_norm": 3.595621109008789,
890
+ "learning_rate": 9.971629021877172e-05,
891
+ "loss": 0.0972,
892
+ "step": 1090
893
+ },
894
+ {
895
+ "epoch": 3.43,
896
+ "grad_norm": 2.7062487602233887,
897
+ "learning_rate": 9.971106087849084e-05,
898
+ "loss": 0.1075,
899
+ "step": 1100
900
+ },
901
+ {
902
+ "epoch": 3.43,
903
+ "eval_accuracy": 0.8429264909847434,
904
+ "eval_f1": 0.8341860243029758,
905
+ "eval_loss": 0.6122580170631409,
906
+ "eval_precision": 0.8456371193220292,
907
+ "eval_recall": 0.8429264909847434,
908
+ "eval_runtime": 39.1852,
909
+ "eval_samples_per_second": 73.599,
910
+ "eval_steps_per_second": 9.213,
911
+ "step": 1100
912
+ },
913
+ {
914
+ "epoch": 3.46,
915
+ "grad_norm": 4.331242561340332,
916
+ "learning_rate": 9.97057839233318e-05,
917
+ "loss": 0.0529,
918
+ "step": 1110
919
+ },
920
+ {
921
+ "epoch": 3.49,
922
+ "grad_norm": 4.688290119171143,
923
+ "learning_rate": 9.970045935834904e-05,
924
+ "loss": 0.1567,
925
+ "step": 1120
926
+ },
927
+ {
928
+ "epoch": 3.52,
929
+ "grad_norm": 6.66792106628418,
930
+ "learning_rate": 9.96950871886426e-05,
931
+ "loss": 0.053,
932
+ "step": 1130
933
+ },
934
+ {
935
+ "epoch": 3.55,
936
+ "grad_norm": 13.5020170211792,
937
+ "learning_rate": 9.968966741935813e-05,
938
+ "loss": 0.1816,
939
+ "step": 1140
940
+ },
941
+ {
942
+ "epoch": 3.58,
943
+ "grad_norm": 2.9301021099090576,
944
+ "learning_rate": 9.968420005568684e-05,
945
+ "loss": 0.1387,
946
+ "step": 1150
947
+ },
948
+ {
949
+ "epoch": 3.61,
950
+ "grad_norm": 0.7428218126296997,
951
+ "learning_rate": 9.967868510286557e-05,
952
+ "loss": 0.2021,
953
+ "step": 1160
954
+ },
955
+ {
956
+ "epoch": 3.64,
957
+ "grad_norm": 5.887143611907959,
958
+ "learning_rate": 9.967312256617671e-05,
959
+ "loss": 0.1325,
960
+ "step": 1170
961
+ },
962
+ {
963
+ "epoch": 3.68,
964
+ "grad_norm": 0.15455959737300873,
965
+ "learning_rate": 9.966751245094823e-05,
966
+ "loss": 0.0623,
967
+ "step": 1180
968
+ },
969
+ {
970
+ "epoch": 3.71,
971
+ "grad_norm": 11.433218002319336,
972
+ "learning_rate": 9.966185476255371e-05,
973
+ "loss": 0.099,
974
+ "step": 1190
975
+ },
976
+ {
977
+ "epoch": 3.74,
978
+ "grad_norm": 1.5371123552322388,
979
+ "learning_rate": 9.965614950641225e-05,
980
+ "loss": 0.161,
981
+ "step": 1200
982
+ },
983
+ {
984
+ "epoch": 3.74,
985
+ "eval_accuracy": 0.8509015256588072,
986
+ "eval_f1": 0.8419411480198592,
987
+ "eval_loss": 0.6438868045806885,
988
+ "eval_precision": 0.8478228582682292,
989
+ "eval_recall": 0.8509015256588072,
990
+ "eval_runtime": 39.7116,
991
+ "eval_samples_per_second": 72.624,
992
+ "eval_steps_per_second": 9.091,
993
+ "step": 1200
994
+ },
995
+ {
996
+ "epoch": 3.77,
997
+ "grad_norm": 5.211598873138428,
998
+ "learning_rate": 9.965039668798855e-05,
999
+ "loss": 0.1748,
1000
+ "step": 1210
1001
+ },
1002
+ {
1003
+ "epoch": 3.8,
1004
+ "grad_norm": 6.897021770477295,
1005
+ "learning_rate": 9.96445963127928e-05,
1006
+ "loss": 0.2178,
1007
+ "step": 1220
1008
+ },
1009
+ {
1010
+ "epoch": 3.83,
1011
+ "grad_norm": 2.0479114055633545,
1012
+ "learning_rate": 9.963874838638084e-05,
1013
+ "loss": 0.0631,
1014
+ "step": 1230
1015
+ },
1016
+ {
1017
+ "epoch": 3.86,
1018
+ "grad_norm": 0.7689093351364136,
1019
+ "learning_rate": 9.963285291435398e-05,
1020
+ "loss": 0.1147,
1021
+ "step": 1240
1022
+ },
1023
+ {
1024
+ "epoch": 3.89,
1025
+ "grad_norm": 0.16739872097969055,
1026
+ "learning_rate": 9.96269099023591e-05,
1027
+ "loss": 0.1705,
1028
+ "step": 1250
1029
+ },
1030
+ {
1031
+ "epoch": 3.93,
1032
+ "grad_norm": 0.712565004825592,
1033
+ "learning_rate": 9.962091935608861e-05,
1034
+ "loss": 0.0937,
1035
+ "step": 1260
1036
+ },
1037
+ {
1038
+ "epoch": 3.96,
1039
+ "grad_norm": 9.288028717041016,
1040
+ "learning_rate": 9.961488128128047e-05,
1041
+ "loss": 0.1466,
1042
+ "step": 1270
1043
+ },
1044
+ {
1045
+ "epoch": 3.99,
1046
+ "grad_norm": 1.646752119064331,
1047
+ "learning_rate": 9.960879568371813e-05,
1048
+ "loss": 0.0806,
1049
+ "step": 1280
1050
+ },
1051
+ {
1052
+ "epoch": 4.02,
1053
+ "grad_norm": 0.14091795682907104,
1054
+ "learning_rate": 9.960266256923055e-05,
1055
+ "loss": 0.0882,
1056
+ "step": 1290
1057
+ },
1058
+ {
1059
+ "epoch": 4.05,
1060
+ "grad_norm": 0.6831271052360535,
1061
+ "learning_rate": 9.959648194369227e-05,
1062
+ "loss": 0.0446,
1063
+ "step": 1300
1064
+ },
1065
+ {
1066
+ "epoch": 4.05,
1067
+ "eval_accuracy": 0.8561026352288488,
1068
+ "eval_f1": 0.8516177334392149,
1069
+ "eval_loss": 0.6347180008888245,
1070
+ "eval_precision": 0.8514533351685835,
1071
+ "eval_recall": 0.8561026352288488,
1072
+ "eval_runtime": 39.1555,
1073
+ "eval_samples_per_second": 73.655,
1074
+ "eval_steps_per_second": 9.22,
1075
+ "step": 1300
1076
+ },
1077
+ {
1078
+ "epoch": 4.08,
1079
+ "grad_norm": 0.6896274089813232,
1080
+ "learning_rate": 9.959025381302325e-05,
1081
+ "loss": 0.0551,
1082
+ "step": 1310
1083
+ },
1084
+ {
1085
+ "epoch": 4.11,
1086
+ "grad_norm": 1.7469240427017212,
1087
+ "learning_rate": 9.958397818318904e-05,
1088
+ "loss": 0.0172,
1089
+ "step": 1320
1090
+ },
1091
+ {
1092
+ "epoch": 4.14,
1093
+ "grad_norm": 1.5375361442565918,
1094
+ "learning_rate": 9.957765506020062e-05,
1095
+ "loss": 0.1115,
1096
+ "step": 1330
1097
+ },
1098
+ {
1099
+ "epoch": 4.17,
1100
+ "grad_norm": 0.022675570100545883,
1101
+ "learning_rate": 9.95712844501145e-05,
1102
+ "loss": 0.0172,
1103
+ "step": 1340
1104
+ },
1105
+ {
1106
+ "epoch": 4.21,
1107
+ "grad_norm": 0.012827737256884575,
1108
+ "learning_rate": 9.956486635903263e-05,
1109
+ "loss": 0.0473,
1110
+ "step": 1350
1111
+ },
1112
+ {
1113
+ "epoch": 4.24,
1114
+ "grad_norm": 9.727991104125977,
1115
+ "learning_rate": 9.955840079310251e-05,
1116
+ "loss": 0.0425,
1117
+ "step": 1360
1118
+ },
1119
+ {
1120
+ "epoch": 4.27,
1121
+ "grad_norm": 0.1388218253850937,
1122
+ "learning_rate": 9.955188775851703e-05,
1123
+ "loss": 0.0799,
1124
+ "step": 1370
1125
+ },
1126
+ {
1127
+ "epoch": 4.3,
1128
+ "grad_norm": 6.434169769287109,
1129
+ "learning_rate": 9.95453272615146e-05,
1130
+ "loss": 0.1215,
1131
+ "step": 1380
1132
+ },
1133
+ {
1134
+ "epoch": 4.33,
1135
+ "grad_norm": 0.04131891950964928,
1136
+ "learning_rate": 9.953871930837908e-05,
1137
+ "loss": 0.2098,
1138
+ "step": 1390
1139
+ },
1140
+ {
1141
+ "epoch": 4.36,
1142
+ "grad_norm": 5.073083400726318,
1143
+ "learning_rate": 9.953206390543979e-05,
1144
+ "loss": 0.1209,
1145
+ "step": 1400
1146
+ },
1147
+ {
1148
+ "epoch": 4.36,
1149
+ "eval_accuracy": 0.8453536754507628,
1150
+ "eval_f1": 0.8453570551603792,
1151
+ "eval_loss": 0.6838424801826477,
1152
+ "eval_precision": 0.8481646167581471,
1153
+ "eval_recall": 0.8453536754507628,
1154
+ "eval_runtime": 40.1161,
1155
+ "eval_samples_per_second": 71.891,
1156
+ "eval_steps_per_second": 8.999,
1157
+ "step": 1400
1158
+ },
1159
+ {
1160
+ "epoch": 4.39,
1161
+ "grad_norm": 0.1268097162246704,
1162
+ "learning_rate": 9.952536105907148e-05,
1163
+ "loss": 0.0401,
1164
+ "step": 1410
1165
+ },
1166
+ {
1167
+ "epoch": 4.42,
1168
+ "grad_norm": 0.3008650839328766,
1169
+ "learning_rate": 9.951861077569438e-05,
1170
+ "loss": 0.0539,
1171
+ "step": 1420
1172
+ },
1173
+ {
1174
+ "epoch": 4.45,
1175
+ "grad_norm": 1.8736006021499634,
1176
+ "learning_rate": 9.951181306177408e-05,
1177
+ "loss": 0.0375,
1178
+ "step": 1430
1179
+ },
1180
+ {
1181
+ "epoch": 4.49,
1182
+ "grad_norm": 0.02439166232943535,
1183
+ "learning_rate": 9.950496792382172e-05,
1184
+ "loss": 0.0783,
1185
+ "step": 1440
1186
+ },
1187
+ {
1188
+ "epoch": 4.52,
1189
+ "grad_norm": 2.347705841064453,
1190
+ "learning_rate": 9.949807536839375e-05,
1191
+ "loss": 0.0744,
1192
+ "step": 1450
1193
+ },
1194
+ {
1195
+ "epoch": 4.55,
1196
+ "grad_norm": 0.04086657613515854,
1197
+ "learning_rate": 9.94911354020921e-05,
1198
+ "loss": 0.0466,
1199
+ "step": 1460
1200
+ },
1201
+ {
1202
+ "epoch": 4.58,
1203
+ "grad_norm": 1.9723145961761475,
1204
+ "learning_rate": 9.94841480315641e-05,
1205
+ "loss": 0.0459,
1206
+ "step": 1470
1207
+ },
1208
+ {
1209
+ "epoch": 4.61,
1210
+ "grad_norm": 0.05683187022805214,
1211
+ "learning_rate": 9.947711326350247e-05,
1212
+ "loss": 0.1431,
1213
+ "step": 1480
1214
+ },
1215
+ {
1216
+ "epoch": 4.64,
1217
+ "grad_norm": 3.9203383922576904,
1218
+ "learning_rate": 9.947003110464533e-05,
1219
+ "loss": 0.0488,
1220
+ "step": 1490
1221
+ },
1222
+ {
1223
+ "epoch": 4.67,
1224
+ "grad_norm": 1.1523628234863281,
1225
+ "learning_rate": 9.946290156177625e-05,
1226
+ "loss": 0.006,
1227
+ "step": 1500
1228
+ },
1229
+ {
1230
+ "epoch": 4.67,
1231
+ "eval_accuracy": 0.8394590846047156,
1232
+ "eval_f1": 0.8362644621553696,
1233
+ "eval_loss": 0.7756162285804749,
1234
+ "eval_precision": 0.8375320979339962,
1235
+ "eval_recall": 0.8394590846047156,
1236
+ "eval_runtime": 40.2425,
1237
+ "eval_samples_per_second": 71.666,
1238
+ "eval_steps_per_second": 8.971,
1239
+ "step": 1500
1240
+ },
1241
+ {
1242
+ "epoch": 4.7,
1243
+ "grad_norm": 10.209505081176758,
1244
+ "learning_rate": 9.945572464172408e-05,
1245
+ "loss": 0.0775,
1246
+ "step": 1510
1247
+ },
1248
+ {
1249
+ "epoch": 4.74,
1250
+ "grad_norm": 1.9617162942886353,
1251
+ "learning_rate": 9.944850035136317e-05,
1252
+ "loss": 0.0089,
1253
+ "step": 1520
1254
+ },
1255
+ {
1256
+ "epoch": 4.77,
1257
+ "grad_norm": 4.741919994354248,
1258
+ "learning_rate": 9.944122869761312e-05,
1259
+ "loss": 0.0835,
1260
+ "step": 1530
1261
+ },
1262
+ {
1263
+ "epoch": 4.8,
1264
+ "grad_norm": 8.549369812011719,
1265
+ "learning_rate": 9.943390968743899e-05,
1266
+ "loss": 0.0569,
1267
+ "step": 1540
1268
+ },
1269
+ {
1270
+ "epoch": 4.83,
1271
+ "grad_norm": 0.6695631146430969,
1272
+ "learning_rate": 9.942654332785117e-05,
1273
+ "loss": 0.1164,
1274
+ "step": 1550
1275
+ },
1276
+ {
1277
+ "epoch": 4.86,
1278
+ "grad_norm": 4.1318278312683105,
1279
+ "learning_rate": 9.94191296259054e-05,
1280
+ "loss": 0.042,
1281
+ "step": 1560
1282
+ },
1283
+ {
1284
+ "epoch": 4.89,
1285
+ "grad_norm": 6.556665897369385,
1286
+ "learning_rate": 9.941166858870275e-05,
1287
+ "loss": 0.045,
1288
+ "step": 1570
1289
+ },
1290
+ {
1291
+ "epoch": 4.92,
1292
+ "grad_norm": 13.292597770690918,
1293
+ "learning_rate": 9.940416022338966e-05,
1294
+ "loss": 0.0682,
1295
+ "step": 1580
1296
+ },
1297
+ {
1298
+ "epoch": 4.95,
1299
+ "grad_norm": 7.545276165008545,
1300
+ "learning_rate": 9.939660453715789e-05,
1301
+ "loss": 0.1476,
1302
+ "step": 1590
1303
+ },
1304
+ {
1305
+ "epoch": 4.98,
1306
+ "grad_norm": 4.052639961242676,
1307
+ "learning_rate": 9.93890015372445e-05,
1308
+ "loss": 0.0219,
1309
+ "step": 1600
1310
+ },
1311
+ {
1312
+ "epoch": 4.98,
1313
+ "eval_accuracy": 0.8280166435506241,
1314
+ "eval_f1": 0.8270572625886861,
1315
+ "eval_loss": 0.8814780116081238,
1316
+ "eval_precision": 0.8368468785461194,
1317
+ "eval_recall": 0.8280166435506241,
1318
+ "eval_runtime": 39.3962,
1319
+ "eval_samples_per_second": 73.205,
1320
+ "eval_steps_per_second": 9.163,
1321
+ "step": 1600
1322
+ },
1323
+ {
1324
+ "epoch": 5.02,
1325
+ "grad_norm": 0.7045537829399109,
1326
+ "learning_rate": 9.93813512309319e-05,
1327
+ "loss": 0.102,
1328
+ "step": 1610
1329
+ },
1330
+ {
1331
+ "epoch": 5.05,
1332
+ "grad_norm": 0.47862136363983154,
1333
+ "learning_rate": 9.937365362554782e-05,
1334
+ "loss": 0.0198,
1335
+ "step": 1620
1336
+ },
1337
+ {
1338
+ "epoch": 5.08,
1339
+ "grad_norm": 0.06051962450146675,
1340
+ "learning_rate": 9.936590872846529e-05,
1341
+ "loss": 0.029,
1342
+ "step": 1630
1343
+ },
1344
+ {
1345
+ "epoch": 5.11,
1346
+ "grad_norm": 8.44522762298584,
1347
+ "learning_rate": 9.935811654710257e-05,
1348
+ "loss": 0.0077,
1349
+ "step": 1640
1350
+ },
1351
+ {
1352
+ "epoch": 5.14,
1353
+ "grad_norm": 0.15062321722507477,
1354
+ "learning_rate": 9.935027708892333e-05,
1355
+ "loss": 0.011,
1356
+ "step": 1650
1357
+ },
1358
+ {
1359
+ "epoch": 5.17,
1360
+ "grad_norm": 0.006027880124747753,
1361
+ "learning_rate": 9.93423903614364e-05,
1362
+ "loss": 0.0524,
1363
+ "step": 1660
1364
+ },
1365
+ {
1366
+ "epoch": 5.2,
1367
+ "grad_norm": 0.09458424896001816,
1368
+ "learning_rate": 9.933445637219598e-05,
1369
+ "loss": 0.074,
1370
+ "step": 1670
1371
+ },
1372
+ {
1373
+ "epoch": 5.23,
1374
+ "grad_norm": 4.973755836486816,
1375
+ "learning_rate": 9.932647512880151e-05,
1376
+ "loss": 0.1273,
1377
+ "step": 1680
1378
+ },
1379
+ {
1380
+ "epoch": 5.26,
1381
+ "grad_norm": 5.976691246032715,
1382
+ "learning_rate": 9.931844663889766e-05,
1383
+ "loss": 0.007,
1384
+ "step": 1690
1385
+ },
1386
+ {
1387
+ "epoch": 5.3,
1388
+ "grad_norm": 14.914401054382324,
1389
+ "learning_rate": 9.931037091017441e-05,
1390
+ "loss": 0.0616,
1391
+ "step": 1700
1392
+ },
1393
+ {
1394
+ "epoch": 5.3,
1395
+ "eval_accuracy": 0.8155339805825242,
1396
+ "eval_f1": 0.7863981529949345,
1397
+ "eval_loss": 1.0824644565582275,
1398
+ "eval_precision": 0.8128168319440793,
1399
+ "eval_recall": 0.8155339805825242,
1400
+ "eval_runtime": 38.9135,
1401
+ "eval_samples_per_second": 74.113,
1402
+ "eval_steps_per_second": 9.277,
1403
+ "step": 1700
1404
+ },
1405
+ {
1406
+ "epoch": 5.33,
1407
+ "grad_norm": 0.02390890195965767,
1408
+ "learning_rate": 9.930224795036694e-05,
1409
+ "loss": 0.009,
1410
+ "step": 1710
1411
+ },
1412
+ {
1413
+ "epoch": 5.36,
1414
+ "grad_norm": 6.92083740234375,
1415
+ "learning_rate": 9.929407776725565e-05,
1416
+ "loss": 0.118,
1417
+ "step": 1720
1418
+ },
1419
+ {
1420
+ "epoch": 5.39,
1421
+ "grad_norm": 0.024119729176163673,
1422
+ "learning_rate": 9.928586036866628e-05,
1423
+ "loss": 0.0051,
1424
+ "step": 1730
1425
+ },
1426
+ {
1427
+ "epoch": 5.42,
1428
+ "grad_norm": 8.526777267456055,
1429
+ "learning_rate": 9.927759576246969e-05,
1430
+ "loss": 0.1098,
1431
+ "step": 1740
1432
+ },
1433
+ {
1434
+ "epoch": 5.45,
1435
+ "grad_norm": 2.77993106842041,
1436
+ "learning_rate": 9.926928395658198e-05,
1437
+ "loss": 0.1047,
1438
+ "step": 1750
1439
+ },
1440
+ {
1441
+ "epoch": 5.48,
1442
+ "grad_norm": 0.030404316261410713,
1443
+ "learning_rate": 9.926092495896446e-05,
1444
+ "loss": 0.0495,
1445
+ "step": 1760
1446
+ },
1447
+ {
1448
+ "epoch": 5.51,
1449
+ "grad_norm": 0.07882791757583618,
1450
+ "learning_rate": 9.925251877762369e-05,
1451
+ "loss": 0.0609,
1452
+ "step": 1770
1453
+ },
1454
+ {
1455
+ "epoch": 5.55,
1456
+ "grad_norm": 1.615186095237732,
1457
+ "learning_rate": 9.924406542061135e-05,
1458
+ "loss": 0.0399,
1459
+ "step": 1780
1460
+ },
1461
+ {
1462
+ "epoch": 5.58,
1463
+ "grad_norm": 5.359433650970459,
1464
+ "learning_rate": 9.923556489602436e-05,
1465
+ "loss": 0.032,
1466
+ "step": 1790
1467
+ },
1468
+ {
1469
+ "epoch": 5.61,
1470
+ "grad_norm": 1.139506459236145,
1471
+ "learning_rate": 9.922701721200479e-05,
1472
+ "loss": 0.0305,
1473
+ "step": 1800
1474
+ },
1475
+ {
1476
+ "epoch": 5.61,
1477
+ "eval_accuracy": 0.840499306518724,
1478
+ "eval_f1": 0.8366786621950907,
1479
+ "eval_loss": 0.8289232850074768,
1480
+ "eval_precision": 0.8427815878781533,
1481
+ "eval_recall": 0.840499306518724,
1482
+ "eval_runtime": 40.1613,
1483
+ "eval_samples_per_second": 71.81,
1484
+ "eval_steps_per_second": 8.989,
1485
+ "step": 1800
1486
+ },
1487
+ {
1488
+ "epoch": 5.61,
1489
+ "step": 1800,
1490
+ "total_flos": 2.2287694956200755e+18,
1491
+ "train_loss": 0.2732895821850333,
1492
+ "train_runtime": 1429.5622,
1493
+ "train_samples_per_second": 358.711,
1494
+ "train_steps_per_second": 22.454
1495
+ }
1496
+ ],
1497
+ "logging_steps": 10,
1498
+ "max_steps": 32100,
1499
+ "num_input_tokens_seen": 0,
1500
+ "num_train_epochs": 100,
1501
+ "save_steps": 100,
1502
+ "total_flos": 2.2287694956200755e+18,
1503
+ "train_batch_size": 16,
1504
+ "trial_name": null,
1505
+ "trial_params": null
1506
+ }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d2a635317bae1c238244ca6e5e6c1f043e7908565728a9b0686c08020db030df
3
  size 4920
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aab637bd8e8c36001550bf35ab0a6ca3623a95000dfa39cb3d55b23c77bd97fc
3
  size 4920