dhritic99 committed on
Commit
acc3019
1 Parent(s): f28558f

dhritic99/model99123

Browse files
README.md ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: google/vit-base-patch16-224-in21k
4
+ tags:
5
+ - generated_from_trainer
6
+ metrics:
7
+ - accuracy
8
+ model-index:
9
+ - name: vit-base-brain-tumor-detection3
10
+ results: []
11
+ ---
12
+
13
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
14
+ should probably proofread and complete it, then remove this comment. -->
15
+
16
+ # vit-base-brain-tumor-detection3
17
+
18
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on an unknown dataset.
19
+ It achieves the following results on the evaluation set:
20
+ - Loss: 0.2613
21
+ - Accuracy: 0.9508
22
+
23
+ ## Model description
24
+
25
+ More information needed
26
+
27
+ ## Intended uses & limitations
28
+
29
+ More information needed
30
+
31
+ ## Training and evaluation data
32
+
33
+ More information needed
34
+
35
+ ## Training procedure
36
+
37
+ ### Training hyperparameters
38
+
39
+ The following hyperparameters were used during training:
40
+ - learning_rate: 3e-05
41
+ - train_batch_size: 16
42
+ - eval_batch_size: 8
43
+ - seed: 42
44
+ - gradient_accumulation_steps: 4
45
+ - total_train_batch_size: 64
46
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
47
+ - lr_scheduler_type: linear
48
+ - lr_scheduler_warmup_steps: 1000
49
+ - num_epochs: 60
50
+ - mixed_precision_training: Native AMP
51
+
52
+ ### Training results
53
+
54
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
55
+ |:-------------:|:-------:|:----:|:---------------:|:--------:|
56
+ | 0.4758 | 7.8125 | 500 | 0.5695 | 0.7939 |
57
+ | 0.1137 | 15.625 | 1000 | 0.4398 | 0.8711 |
58
+ | 0.0466 | 23.4375 | 1500 | 0.4086 | 0.9023 |
59
+ | 0.0086 | 31.25 | 2000 | 0.2433 | 0.9463 |
60
+ | 0.0034 | 39.0625 | 2500 | 0.1636 | 0.9688 |
61
+ | 0.002 | 46.875 | 3000 | 0.1739 | 0.9707 |
62
+ | 0.0014 | 54.6875 | 3500 | 0.1818 | 0.9707 |
63
+
64
+
65
+ ### Framework versions
66
+
67
+ - Transformers 4.42.4
68
+ - Pytorch 2.3.1+cu121
69
+ - Datasets 2.20.0
70
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 60.0,
3
+ "eval_accuracy": 0.95078125,
4
+ "eval_loss": 0.26134949922561646,
5
+ "eval_runtime": 6.7187,
6
+ "eval_samples_per_second": 190.513,
7
+ "eval_steps_per_second": 23.814,
8
+ "total_flos": 1.904477274611122e+19,
9
+ "train_loss": 0.14902369955088943,
10
+ "train_runtime": 2785.8591,
11
+ "train_samples_per_second": 88.217,
12
+ "train_steps_per_second": 1.378
13
+ }
config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/vit-base-patch16-224-in21k",
3
+ "architectures": [
4
+ "ViTForImageClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "encoder_stride": 16,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.0,
10
+ "hidden_size": 768,
11
+ "id2label": {
12
+ "0": "LABEL_0",
13
+ "1": "LABEL_1",
14
+ "2": "LABEL_2",
15
+ "3": "LABEL_3"
16
+ },
17
+ "image_size": 224,
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 3072,
20
+ "label2id": {
21
+ "LABEL_0": 0,
22
+ "LABEL_1": 1,
23
+ "LABEL_2": 2,
24
+ "LABEL_3": 3
25
+ },
26
+ "layer_norm_eps": 1e-12,
27
+ "model_type": "vit",
28
+ "num_attention_heads": 12,
29
+ "num_channels": 3,
30
+ "num_hidden_layers": 12,
31
+ "patch_size": 16,
32
+ "problem_type": "single_label_classification",
33
+ "qkv_bias": true,
34
+ "torch_dtype": "float32",
35
+ "transformers_version": "4.42.4"
36
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 60.0,
3
+ "eval_accuracy": 0.95078125,
4
+ "eval_loss": 0.26134949922561646,
5
+ "eval_runtime": 6.7187,
6
+ "eval_samples_per_second": 190.513,
7
+ "eval_steps_per_second": 23.814
8
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3158ce4921cc7de61a99fa5043ead77c62382fbae720ec9b3a76b25a7120d00c
3
+ size 343230128
preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "do_rescale": true,
4
+ "do_resize": true,
5
+ "image_mean": [
6
+ 0.5,
7
+ 0.5,
8
+ 0.5
9
+ ],
10
+ "image_processor_type": "ViTImageProcessor",
11
+ "image_std": [
12
+ 0.5,
13
+ 0.5,
14
+ 0.5
15
+ ],
16
+ "resample": 2,
17
+ "rescale_factor": 0.00392156862745098,
18
+ "size": {
19
+ "height": 224,
20
+ "width": 224
21
+ }
22
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 60.0,
3
+ "total_flos": 1.904477274611122e+19,
4
+ "train_loss": 0.14902369955088943,
5
+ "train_runtime": 2785.8591,
6
+ "train_samples_per_second": 88.217,
7
+ "train_steps_per_second": 1.378
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,646 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.16363227367401123,
3
+ "best_model_checkpoint": "./vit-base-brain-tumor-detection3/checkpoint-2500",
4
+ "epoch": 60.0,
5
+ "eval_steps": 500,
6
+ "global_step": 3840,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.78125,
13
+ "grad_norm": 1.8165547847747803,
14
+ "learning_rate": 1.5e-06,
15
+ "loss": 1.3842,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 1.5625,
20
+ "grad_norm": 1.1670353412628174,
21
+ "learning_rate": 3e-06,
22
+ "loss": 1.2128,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 2.34375,
27
+ "grad_norm": 1.4207779169082642,
28
+ "learning_rate": 4.5e-06,
29
+ "loss": 1.0365,
30
+ "step": 150
31
+ },
32
+ {
33
+ "epoch": 3.125,
34
+ "grad_norm": 3.359316825866699,
35
+ "learning_rate": 6e-06,
36
+ "loss": 0.9528,
37
+ "step": 200
38
+ },
39
+ {
40
+ "epoch": 3.90625,
41
+ "grad_norm": 1.4744346141815186,
42
+ "learning_rate": 7.5e-06,
43
+ "loss": 0.8693,
44
+ "step": 250
45
+ },
46
+ {
47
+ "epoch": 4.6875,
48
+ "grad_norm": 2.3251893520355225,
49
+ "learning_rate": 9e-06,
50
+ "loss": 0.8014,
51
+ "step": 300
52
+ },
53
+ {
54
+ "epoch": 5.46875,
55
+ "grad_norm": 3.080569267272949,
56
+ "learning_rate": 1.05e-05,
57
+ "loss": 0.7156,
58
+ "step": 350
59
+ },
60
+ {
61
+ "epoch": 6.25,
62
+ "grad_norm": 2.417529582977295,
63
+ "learning_rate": 1.2e-05,
64
+ "loss": 0.6215,
65
+ "step": 400
66
+ },
67
+ {
68
+ "epoch": 7.03125,
69
+ "grad_norm": 2.6332898139953613,
70
+ "learning_rate": 1.3500000000000001e-05,
71
+ "loss": 0.5781,
72
+ "step": 450
73
+ },
74
+ {
75
+ "epoch": 7.8125,
76
+ "grad_norm": 4.613404750823975,
77
+ "learning_rate": 1.5e-05,
78
+ "loss": 0.4758,
79
+ "step": 500
80
+ },
81
+ {
82
+ "epoch": 7.8125,
83
+ "eval_accuracy": 0.7939453125,
84
+ "eval_loss": 0.5694867372512817,
85
+ "eval_runtime": 6.0709,
86
+ "eval_samples_per_second": 168.674,
87
+ "eval_steps_per_second": 21.084,
88
+ "step": 500
89
+ },
90
+ {
91
+ "epoch": 8.59375,
92
+ "grad_norm": 3.2593910694122314,
93
+ "learning_rate": 1.65e-05,
94
+ "loss": 0.3951,
95
+ "step": 550
96
+ },
97
+ {
98
+ "epoch": 9.375,
99
+ "grad_norm": 4.835229396820068,
100
+ "learning_rate": 1.8e-05,
101
+ "loss": 0.278,
102
+ "step": 600
103
+ },
104
+ {
105
+ "epoch": 10.15625,
106
+ "grad_norm": 2.339912176132202,
107
+ "learning_rate": 1.95e-05,
108
+ "loss": 0.2564,
109
+ "step": 650
110
+ },
111
+ {
112
+ "epoch": 10.9375,
113
+ "grad_norm": 1.6154627799987793,
114
+ "learning_rate": 2.1e-05,
115
+ "loss": 0.2094,
116
+ "step": 700
117
+ },
118
+ {
119
+ "epoch": 11.71875,
120
+ "grad_norm": 3.8827977180480957,
121
+ "learning_rate": 2.25e-05,
122
+ "loss": 0.1495,
123
+ "step": 750
124
+ },
125
+ {
126
+ "epoch": 12.5,
127
+ "grad_norm": 6.941565036773682,
128
+ "learning_rate": 2.4e-05,
129
+ "loss": 0.1284,
130
+ "step": 800
131
+ },
132
+ {
133
+ "epoch": 13.28125,
134
+ "grad_norm": 2.415818691253662,
135
+ "learning_rate": 2.55e-05,
136
+ "loss": 0.1136,
137
+ "step": 850
138
+ },
139
+ {
140
+ "epoch": 14.0625,
141
+ "grad_norm": 1.2879343032836914,
142
+ "learning_rate": 2.7000000000000002e-05,
143
+ "loss": 0.0863,
144
+ "step": 900
145
+ },
146
+ {
147
+ "epoch": 14.84375,
148
+ "grad_norm": 14.791470527648926,
149
+ "learning_rate": 2.8499999999999998e-05,
150
+ "loss": 0.1258,
151
+ "step": 950
152
+ },
153
+ {
154
+ "epoch": 15.625,
155
+ "grad_norm": 13.995768547058105,
156
+ "learning_rate": 3e-05,
157
+ "loss": 0.1137,
158
+ "step": 1000
159
+ },
160
+ {
161
+ "epoch": 15.625,
162
+ "eval_accuracy": 0.87109375,
163
+ "eval_loss": 0.43984174728393555,
164
+ "eval_runtime": 5.4017,
165
+ "eval_samples_per_second": 189.569,
166
+ "eval_steps_per_second": 23.696,
167
+ "step": 1000
168
+ },
169
+ {
170
+ "epoch": 16.40625,
171
+ "grad_norm": 24.01154327392578,
172
+ "learning_rate": 2.989554317548747e-05,
173
+ "loss": 0.0756,
174
+ "step": 1050
175
+ },
176
+ {
177
+ "epoch": 17.1875,
178
+ "grad_norm": 0.24175554513931274,
179
+ "learning_rate": 2.979108635097493e-05,
180
+ "loss": 0.0933,
181
+ "step": 1100
182
+ },
183
+ {
184
+ "epoch": 17.96875,
185
+ "grad_norm": 0.25062623620033264,
186
+ "learning_rate": 2.96866295264624e-05,
187
+ "loss": 0.0676,
188
+ "step": 1150
189
+ },
190
+ {
191
+ "epoch": 18.75,
192
+ "grad_norm": 0.775455892086029,
193
+ "learning_rate": 2.958217270194986e-05,
194
+ "loss": 0.081,
195
+ "step": 1200
196
+ },
197
+ {
198
+ "epoch": 19.53125,
199
+ "grad_norm": 0.12767118215560913,
200
+ "learning_rate": 2.947771587743733e-05,
201
+ "loss": 0.0756,
202
+ "step": 1250
203
+ },
204
+ {
205
+ "epoch": 20.3125,
206
+ "grad_norm": 0.17824232578277588,
207
+ "learning_rate": 2.937325905292479e-05,
208
+ "loss": 0.0571,
209
+ "step": 1300
210
+ },
211
+ {
212
+ "epoch": 21.09375,
213
+ "grad_norm": 0.1250143200159073,
214
+ "learning_rate": 2.926880222841226e-05,
215
+ "loss": 0.0462,
216
+ "step": 1350
217
+ },
218
+ {
219
+ "epoch": 21.875,
220
+ "grad_norm": 1.6465438604354858,
221
+ "learning_rate": 2.916434540389972e-05,
222
+ "loss": 0.0346,
223
+ "step": 1400
224
+ },
225
+ {
226
+ "epoch": 22.65625,
227
+ "grad_norm": 9.335956573486328,
228
+ "learning_rate": 2.905988857938719e-05,
229
+ "loss": 0.046,
230
+ "step": 1450
231
+ },
232
+ {
233
+ "epoch": 23.4375,
234
+ "grad_norm": 0.20395609736442566,
235
+ "learning_rate": 2.895543175487465e-05,
236
+ "loss": 0.0466,
237
+ "step": 1500
238
+ },
239
+ {
240
+ "epoch": 23.4375,
241
+ "eval_accuracy": 0.90234375,
242
+ "eval_loss": 0.4086352288722992,
243
+ "eval_runtime": 5.3678,
244
+ "eval_samples_per_second": 190.768,
245
+ "eval_steps_per_second": 23.846,
246
+ "step": 1500
247
+ },
248
+ {
249
+ "epoch": 24.21875,
250
+ "grad_norm": 0.06884710490703583,
251
+ "learning_rate": 2.885097493036212e-05,
252
+ "loss": 0.044,
253
+ "step": 1550
254
+ },
255
+ {
256
+ "epoch": 25.0,
257
+ "grad_norm": 0.2089349329471588,
258
+ "learning_rate": 2.8746518105849583e-05,
259
+ "loss": 0.0382,
260
+ "step": 1600
261
+ },
262
+ {
263
+ "epoch": 25.78125,
264
+ "grad_norm": 0.09039656072854996,
265
+ "learning_rate": 2.8642061281337048e-05,
266
+ "loss": 0.0279,
267
+ "step": 1650
268
+ },
269
+ {
270
+ "epoch": 26.5625,
271
+ "grad_norm": 0.05787573382258415,
272
+ "learning_rate": 2.8537604456824513e-05,
273
+ "loss": 0.0257,
274
+ "step": 1700
275
+ },
276
+ {
277
+ "epoch": 27.34375,
278
+ "grad_norm": 0.04917814955115318,
279
+ "learning_rate": 2.8433147632311978e-05,
280
+ "loss": 0.0229,
281
+ "step": 1750
282
+ },
283
+ {
284
+ "epoch": 28.125,
285
+ "grad_norm": 0.06560017913579941,
286
+ "learning_rate": 2.8328690807799443e-05,
287
+ "loss": 0.0191,
288
+ "step": 1800
289
+ },
290
+ {
291
+ "epoch": 28.90625,
292
+ "grad_norm": 0.03920649737119675,
293
+ "learning_rate": 2.8224233983286908e-05,
294
+ "loss": 0.0114,
295
+ "step": 1850
296
+ },
297
+ {
298
+ "epoch": 29.6875,
299
+ "grad_norm": 0.9064533114433289,
300
+ "learning_rate": 2.8119777158774373e-05,
301
+ "loss": 0.0165,
302
+ "step": 1900
303
+ },
304
+ {
305
+ "epoch": 30.46875,
306
+ "grad_norm": 0.03491423651576042,
307
+ "learning_rate": 2.8015320334261838e-05,
308
+ "loss": 0.0078,
309
+ "step": 1950
310
+ },
311
+ {
312
+ "epoch": 31.25,
313
+ "grad_norm": 0.029768764972686768,
314
+ "learning_rate": 2.7910863509749306e-05,
315
+ "loss": 0.0086,
316
+ "step": 2000
317
+ },
318
+ {
319
+ "epoch": 31.25,
320
+ "eval_accuracy": 0.9462890625,
321
+ "eval_loss": 0.2432650774717331,
322
+ "eval_runtime": 5.6297,
323
+ "eval_samples_per_second": 181.891,
324
+ "eval_steps_per_second": 22.736,
325
+ "step": 2000
326
+ },
327
+ {
328
+ "epoch": 32.03125,
329
+ "grad_norm": 0.02953988127410412,
330
+ "learning_rate": 2.780640668523677e-05,
331
+ "loss": 0.0062,
332
+ "step": 2050
333
+ },
334
+ {
335
+ "epoch": 32.8125,
336
+ "grad_norm": 0.025788016617298126,
337
+ "learning_rate": 2.7701949860724236e-05,
338
+ "loss": 0.0057,
339
+ "step": 2100
340
+ },
341
+ {
342
+ "epoch": 33.59375,
343
+ "grad_norm": 0.03053743578493595,
344
+ "learning_rate": 2.75974930362117e-05,
345
+ "loss": 0.0053,
346
+ "step": 2150
347
+ },
348
+ {
349
+ "epoch": 34.375,
350
+ "grad_norm": 0.021916454657912254,
351
+ "learning_rate": 2.7493036211699166e-05,
352
+ "loss": 0.0049,
353
+ "step": 2200
354
+ },
355
+ {
356
+ "epoch": 35.15625,
357
+ "grad_norm": 0.021212272346019745,
358
+ "learning_rate": 2.738857938718663e-05,
359
+ "loss": 0.0045,
360
+ "step": 2250
361
+ },
362
+ {
363
+ "epoch": 35.9375,
364
+ "grad_norm": 0.020344305783510208,
365
+ "learning_rate": 2.7284122562674096e-05,
366
+ "loss": 0.0043,
367
+ "step": 2300
368
+ },
369
+ {
370
+ "epoch": 36.71875,
371
+ "grad_norm": 0.018891936168074608,
372
+ "learning_rate": 2.717966573816156e-05,
373
+ "loss": 0.004,
374
+ "step": 2350
375
+ },
376
+ {
377
+ "epoch": 37.5,
378
+ "grad_norm": 0.017234979197382927,
379
+ "learning_rate": 2.7075208913649025e-05,
380
+ "loss": 0.0038,
381
+ "step": 2400
382
+ },
383
+ {
384
+ "epoch": 38.28125,
385
+ "grad_norm": 0.016466792672872543,
386
+ "learning_rate": 2.697075208913649e-05,
387
+ "loss": 0.0035,
388
+ "step": 2450
389
+ },
390
+ {
391
+ "epoch": 39.0625,
392
+ "grad_norm": 0.017314311116933823,
393
+ "learning_rate": 2.6866295264623955e-05,
394
+ "loss": 0.0034,
395
+ "step": 2500
396
+ },
397
+ {
398
+ "epoch": 39.0625,
399
+ "eval_accuracy": 0.96875,
400
+ "eval_loss": 0.16363227367401123,
401
+ "eval_runtime": 6.1162,
402
+ "eval_samples_per_second": 167.424,
403
+ "eval_steps_per_second": 20.928,
404
+ "step": 2500
405
+ },
406
+ {
407
+ "epoch": 39.84375,
408
+ "grad_norm": 0.01642526686191559,
409
+ "learning_rate": 2.676183844011142e-05,
410
+ "loss": 0.0032,
411
+ "step": 2550
412
+ },
413
+ {
414
+ "epoch": 40.625,
415
+ "grad_norm": 0.013880325481295586,
416
+ "learning_rate": 2.665738161559889e-05,
417
+ "loss": 0.003,
418
+ "step": 2600
419
+ },
420
+ {
421
+ "epoch": 41.40625,
422
+ "grad_norm": 0.01303493045270443,
423
+ "learning_rate": 2.655292479108635e-05,
424
+ "loss": 0.0028,
425
+ "step": 2650
426
+ },
427
+ {
428
+ "epoch": 42.1875,
429
+ "grad_norm": 0.013205628842115402,
430
+ "learning_rate": 2.644846796657382e-05,
431
+ "loss": 0.0027,
432
+ "step": 2700
433
+ },
434
+ {
435
+ "epoch": 42.96875,
436
+ "grad_norm": 0.011895690113306046,
437
+ "learning_rate": 2.634401114206128e-05,
438
+ "loss": 0.0026,
439
+ "step": 2750
440
+ },
441
+ {
442
+ "epoch": 43.75,
443
+ "grad_norm": 0.011271192692220211,
444
+ "learning_rate": 2.6239554317548748e-05,
445
+ "loss": 0.0024,
446
+ "step": 2800
447
+ },
448
+ {
449
+ "epoch": 44.53125,
450
+ "grad_norm": 0.011179978027939796,
451
+ "learning_rate": 2.613509749303621e-05,
452
+ "loss": 0.0024,
453
+ "step": 2850
454
+ },
455
+ {
456
+ "epoch": 45.3125,
457
+ "grad_norm": 0.010614069178700447,
458
+ "learning_rate": 2.6030640668523678e-05,
459
+ "loss": 0.0022,
460
+ "step": 2900
461
+ },
462
+ {
463
+ "epoch": 46.09375,
464
+ "grad_norm": 0.009998313151299953,
465
+ "learning_rate": 2.5926183844011143e-05,
466
+ "loss": 0.0021,
467
+ "step": 2950
468
+ },
469
+ {
470
+ "epoch": 46.875,
471
+ "grad_norm": 0.009446458891034126,
472
+ "learning_rate": 2.5821727019498608e-05,
473
+ "loss": 0.002,
474
+ "step": 3000
475
+ },
476
+ {
477
+ "epoch": 46.875,
478
+ "eval_accuracy": 0.970703125,
479
+ "eval_loss": 0.17385585606098175,
480
+ "eval_runtime": 5.8806,
481
+ "eval_samples_per_second": 174.131,
482
+ "eval_steps_per_second": 21.766,
483
+ "step": 3000
484
+ },
485
+ {
486
+ "epoch": 47.65625,
487
+ "grad_norm": 0.009701834060251713,
488
+ "learning_rate": 2.5717270194986073e-05,
489
+ "loss": 0.002,
490
+ "step": 3050
491
+ },
492
+ {
493
+ "epoch": 48.4375,
494
+ "grad_norm": 0.009180723689496517,
495
+ "learning_rate": 2.5612813370473538e-05,
496
+ "loss": 0.0019,
497
+ "step": 3100
498
+ },
499
+ {
500
+ "epoch": 49.21875,
501
+ "grad_norm": 0.008364294655621052,
502
+ "learning_rate": 2.5508356545961006e-05,
503
+ "loss": 0.0018,
504
+ "step": 3150
505
+ },
506
+ {
507
+ "epoch": 50.0,
508
+ "grad_norm": 0.008195644244551659,
509
+ "learning_rate": 2.5403899721448468e-05,
510
+ "loss": 0.0017,
511
+ "step": 3200
512
+ },
513
+ {
514
+ "epoch": 50.78125,
515
+ "grad_norm": 0.008112799376249313,
516
+ "learning_rate": 2.5299442896935936e-05,
517
+ "loss": 0.0016,
518
+ "step": 3250
519
+ },
520
+ {
521
+ "epoch": 51.5625,
522
+ "grad_norm": 0.007568549830466509,
523
+ "learning_rate": 2.5194986072423398e-05,
524
+ "loss": 0.0016,
525
+ "step": 3300
526
+ },
527
+ {
528
+ "epoch": 52.34375,
529
+ "grad_norm": 0.007013232912868261,
530
+ "learning_rate": 2.5090529247910866e-05,
531
+ "loss": 0.0015,
532
+ "step": 3350
533
+ },
534
+ {
535
+ "epoch": 53.125,
536
+ "grad_norm": 0.006883300840854645,
537
+ "learning_rate": 2.4986072423398327e-05,
538
+ "loss": 0.0014,
539
+ "step": 3400
540
+ },
541
+ {
542
+ "epoch": 53.90625,
543
+ "grad_norm": 0.006791520398110151,
544
+ "learning_rate": 2.4881615598885796e-05,
545
+ "loss": 0.0014,
546
+ "step": 3450
547
+ },
548
+ {
549
+ "epoch": 54.6875,
550
+ "grad_norm": 0.008187716826796532,
551
+ "learning_rate": 2.4777158774373257e-05,
552
+ "loss": 0.0014,
553
+ "step": 3500
554
+ },
555
+ {
556
+ "epoch": 54.6875,
557
+ "eval_accuracy": 0.970703125,
558
+ "eval_loss": 0.1817573606967926,
559
+ "eval_runtime": 5.8237,
560
+ "eval_samples_per_second": 175.833,
561
+ "eval_steps_per_second": 21.979,
562
+ "step": 3500
563
+ },
564
+ {
565
+ "epoch": 55.46875,
566
+ "grad_norm": 0.0065140994265675545,
567
+ "learning_rate": 2.4672701949860726e-05,
568
+ "loss": 0.0013,
569
+ "step": 3550
570
+ },
571
+ {
572
+ "epoch": 56.25,
573
+ "grad_norm": 0.007060408126562834,
574
+ "learning_rate": 2.456824512534819e-05,
575
+ "loss": 0.0012,
576
+ "step": 3600
577
+ },
578
+ {
579
+ "epoch": 57.03125,
580
+ "grad_norm": 0.0056546530686318874,
581
+ "learning_rate": 2.4463788300835655e-05,
582
+ "loss": 0.0012,
583
+ "step": 3650
584
+ },
585
+ {
586
+ "epoch": 57.8125,
587
+ "grad_norm": 0.006707963068038225,
588
+ "learning_rate": 2.435933147632312e-05,
589
+ "loss": 0.0012,
590
+ "step": 3700
591
+ },
592
+ {
593
+ "epoch": 58.59375,
594
+ "grad_norm": 0.006301193963736296,
595
+ "learning_rate": 2.4254874651810585e-05,
596
+ "loss": 0.0011,
597
+ "step": 3750
598
+ },
599
+ {
600
+ "epoch": 59.375,
601
+ "grad_norm": 0.005066621117293835,
602
+ "learning_rate": 2.415041782729805e-05,
603
+ "loss": 0.0011,
604
+ "step": 3800
605
+ },
606
+ {
607
+ "epoch": 60.0,
608
+ "step": 3840,
609
+ "total_flos": 1.904477274611122e+19,
610
+ "train_loss": 0.14902369955088943,
611
+ "train_runtime": 2785.8591,
612
+ "train_samples_per_second": 88.217,
613
+ "train_steps_per_second": 1.378
614
+ }
615
+ ],
616
+ "logging_steps": 50,
617
+ "max_steps": 3840,
618
+ "num_input_tokens_seen": 0,
619
+ "num_train_epochs": 60,
620
+ "save_steps": 500,
621
+ "stateful_callbacks": {
622
+ "EarlyStoppingCallback": {
623
+ "args": {
624
+ "early_stopping_patience": 5,
625
+ "early_stopping_threshold": 0.0
626
+ },
627
+ "attributes": {
628
+ "early_stopping_patience_counter": 0
629
+ }
630
+ },
631
+ "TrainerControl": {
632
+ "args": {
633
+ "should_epoch_stop": false,
634
+ "should_evaluate": false,
635
+ "should_log": false,
636
+ "should_save": true,
637
+ "should_training_stop": true
638
+ },
639
+ "attributes": {}
640
+ }
641
+ },
642
+ "total_flos": 1.904477274611122e+19,
643
+ "train_batch_size": 16,
644
+ "trial_name": null,
645
+ "trial_params": null
646
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84173542b25ab0ff865d0db2a4e5d9b4838d6312ff6d86d248cd2347a190daf4
3
+ size 5112