m-faraz-ali committed
Commit eb05fde
Parent(s): e03bbf1

End of training

README.md ADDED
@@ -0,0 +1,87 @@
+ ---
+ license: apache-2.0
+ base_model: google/vit-base-patch16-224-in21k
+ tags:
+ - generated_from_trainer
+ datasets:
+ - imagefolder
+ metrics:
+ - accuracy
+ model-index:
+ - name: Tb_Dataset
+   results:
+   - task:
+       name: Image Classification
+       type: image-classification
+     dataset:
+       name: imagefolder
+       type: imagefolder
+       config: default
+       split: validation
+       args: default
+     metrics:
+     - name: Accuracy
+       type: accuracy
+       value: 0.875
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # Tb_Dataset
+
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on the imagefolder dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.4037
+ - Accuracy: 0.875
+
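+ The snippet below is a minimal inference sketch for this checkpoint. The repo id `m-faraz-ali/Tb_Dataset` and the image path are assumptions, not values taken from this commit; substitute the actual repository name (or a local checkpoint directory) and your own image.
+
+ ```python
+ # Minimal inference sketch (not part of the original training code).
+ # Assumes the checkpoint is published as "m-faraz-ali/Tb_Dataset" (hypothetical id).
+ from transformers import pipeline
+
+ classifier = pipeline(
+     "image-classification",
+     model="m-faraz-ali/Tb_Dataset",  # assumption: replace with the real repo id or a local path
+ )
+
+ predictions = classifier("example.jpg")  # "example.jpg" is a placeholder image path
+ print(predictions)  # [{"label": ..., "score": ...}, ...]
+ ```
+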
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training (a configuration sketch follows the list):
+ - learning_rate: 0.0002
+ - train_batch_size: 16
+ - eval_batch_size: 8
+ - seed: 42
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: linear
+ - num_epochs: 4
+
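+ A minimal `TrainingArguments` sketch that matches the values above; the step cadence and output directory are taken from `trainer_state.json` in this commit, and anything not shown is left at the library defaults (an assumption, not a record of the exact arguments used):
+
+ ```python
+ # Sketch only: reconstructs the listed hyperparameters, not the author's exact script.
+ from transformers import TrainingArguments
+
+ training_args = TrainingArguments(
+     output_dir="./Tb_Dataset",   # from trainer_state.json (best_model_checkpoint path)
+     learning_rate=2e-4,
+     per_device_train_batch_size=16,
+     per_device_eval_batch_size=8,
+     seed=42,
+     num_train_epochs=4,
+     lr_scheduler_type="linear",
+     adam_beta1=0.9,
+     adam_beta2=0.999,
+     adam_epsilon=1e-8,
+     logging_steps=10,            # from trainer_state.json
+     eval_strategy="steps",       # evaluate every eval_steps (Transformers 4.41+ argument name)
+     eval_steps=100,              # from trainer_state.json
+     save_steps=100,              # from trainer_state.json
+ )
+ ```
+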
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
+ |:-------------:|:------:|:----:|:---------------:|:--------:|
+ | 0.0996 | 0.3067 | 100 | 1.0429 | 0.5625 |
+ | 0.0481 | 0.6135 | 200 | 0.5665 | 0.8125 |
+ | 0.0391 | 0.9202 | 300 | 1.0037 | 0.6875 |
+ | 0.0711 | 1.2270 | 400 | 0.5200 | 0.875 |
+ | 0.0258 | 1.5337 | 500 | 0.3818 | 0.9375 |
+ | 0.0547 | 1.8405 | 600 | 0.3415 | 0.9375 |
+ | 0.0029 | 2.1472 | 700 | 0.0637 | 0.9375 |
+ | 0.0543 | 2.4540 | 800 | 0.7362 | 0.8125 |
+ | 0.0265 | 2.7607 | 900 | 1.0917 | 0.75 |
+ | 0.0017 | 3.0675 | 1000 | 0.0030 | 1.0 |
+ | 0.0054 | 3.3742 | 1100 | 0.0364 | 1.0 |
+ | 0.0234 | 3.6810 | 1200 | 0.2310 | 0.875 |
+ | 0.0076 | 3.9877 | 1300 | 0.4037 | 0.875 |
+
+
+ ### Framework versions
+
+ - Transformers 4.41.2
+ - Pytorch 2.3.0+cu121
+ - Datasets 2.20.0
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 4.0,
+ "total_flos": 1.6167928713188475e+18,
+ "train_loss": 0.04957773063711799,
+ "train_runtime": 1223.9715,
+ "train_samples_per_second": 17.046,
+ "train_steps_per_second": 1.065
+ }
config.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "_name_or_path": "google/vit-base-patch16-224-in21k",
+ "architectures": [
+ "ViTForImageClassification"
+ ],
+ "attention_probs_dropout_prob": 0.0,
+ "encoder_stride": 16,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.0,
+ "hidden_size": 768,
+ "image_size": 224,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_norm_eps": 1e-12,
+ "model_type": "vit",
+ "num_attention_heads": 12,
+ "num_channels": 3,
+ "num_hidden_layers": 12,
+ "patch_size": 16,
+ "problem_type": "single_label_classification",
+ "qkv_bias": true,
+ "torch_dtype": "float32",
+ "transformers_version": "4.41.2"
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c000d83dacd9d1c86d4db5fe15c77b65deb78f7e66f525ea8c4f647e18844f76
+ size 343223968
preprocessor_config.json ADDED
@@ -0,0 +1,36 @@
+ {
+ "_valid_processor_keys": [
+ "images",
+ "do_resize",
+ "size",
+ "resample",
+ "do_rescale",
+ "rescale_factor",
+ "do_normalize",
+ "image_mean",
+ "image_std",
+ "return_tensors",
+ "data_format",
+ "input_data_format"
+ ],
+ "do_normalize": true,
+ "do_rescale": true,
+ "do_resize": true,
+ "image_mean": [
+ 0.5,
+ 0.5,
+ 0.5
+ ],
+ "image_processor_type": "ViTFeatureExtractor",
+ "image_std": [
+ 0.5,
+ 0.5,
+ 0.5
+ ],
+ "resample": 2,
+ "rescale_factor": 0.00392156862745098,
+ "size": {
+ "height": 224,
+ "width": 224
+ }
+ }
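
The stored preprocessing (resize to 224×224, rescale by 1/255, normalize each channel with mean and std 0.5) is applied automatically when the image processor is loaded from the repository. A minimal sketch, assuming the hypothetical repo id `m-faraz-ali/Tb_Dataset` and a placeholder image path:

```python
# Sketch: load the saved image processor and model, preprocess one image, and classify it.
from PIL import Image
from transformers import AutoImageProcessor, AutoModelForImageClassification

repo_id = "m-faraz-ali/Tb_Dataset"  # assumption: replace with the real repo id or a local path
processor = AutoImageProcessor.from_pretrained(repo_id)
model = AutoModelForImageClassification.from_pretrained(repo_id)

image = Image.open("example.jpg").convert("RGB")        # placeholder path
inputs = processor(images=image, return_tensors="pt")   # resize, rescale, normalize per this config
logits = model(**inputs).logits
idx = logits.argmax(-1).item()
print(idx, model.config.id2label.get(idx))  # label names may be generic if not stored in config.json
```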
runs/Jul12_09-58-17_d5ffb3757aea/events.out.tfevents.1720778368.d5ffb3757aea.195.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8d1b38c488d13ca2dda4bd1ac303a0ce1fe46fb89480abaa746b7fc7e0ca0339
+ size 36601
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+ "epoch": 4.0,
+ "total_flos": 1.6167928713188475e+18,
+ "train_loss": 0.04957773063711799,
+ "train_runtime": 1223.9715,
+ "train_samples_per_second": 17.046,
+ "train_steps_per_second": 1.065
+ }
trainer_state.json ADDED
@@ -0,0 +1,1069 @@
1
+ {
2
+ "best_metric": 0.0030439873225986958,
3
+ "best_model_checkpoint": "./Tb_Dataset/checkpoint-1000",
4
+ "epoch": 4.0,
5
+ "eval_steps": 100,
6
+ "global_step": 1304,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.03067484662576687,
13
+ "grad_norm": 1.666305661201477,
14
+ "learning_rate": 0.00019846625766871168,
15
+ "loss": 0.4847,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.06134969325153374,
20
+ "grad_norm": 1.4896280765533447,
21
+ "learning_rate": 0.00019693251533742332,
22
+ "loss": 0.278,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.09202453987730061,
27
+ "grad_norm": 1.3254238367080688,
28
+ "learning_rate": 0.000195398773006135,
29
+ "loss": 0.2401,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.12269938650306748,
34
+ "grad_norm": 1.1399273872375488,
35
+ "learning_rate": 0.00019386503067484663,
36
+ "loss": 0.188,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.15337423312883436,
41
+ "grad_norm": 2.320486307144165,
42
+ "learning_rate": 0.0001923312883435583,
43
+ "loss": 0.1944,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.18404907975460122,
48
+ "grad_norm": 0.24434129893779755,
49
+ "learning_rate": 0.00019079754601226997,
50
+ "loss": 0.0738,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.2147239263803681,
55
+ "grad_norm": 0.7523425221443176,
56
+ "learning_rate": 0.0001892638036809816,
57
+ "loss": 0.171,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.24539877300613497,
62
+ "grad_norm": 3.6861462593078613,
63
+ "learning_rate": 0.00018773006134969328,
64
+ "loss": 0.178,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.27607361963190186,
69
+ "grad_norm": 2.111417770385742,
70
+ "learning_rate": 0.00018619631901840492,
71
+ "loss": 0.2488,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.3067484662576687,
76
+ "grad_norm": 0.8211548328399658,
77
+ "learning_rate": 0.00018466257668711656,
78
+ "loss": 0.0996,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.3067484662576687,
83
+ "eval_accuracy": 0.5625,
84
+ "eval_loss": 1.0429491996765137,
85
+ "eval_runtime": 0.6744,
86
+ "eval_samples_per_second": 23.723,
87
+ "eval_steps_per_second": 2.965,
88
+ "step": 100
89
+ },
90
+ {
91
+ "epoch": 0.3374233128834356,
92
+ "grad_norm": 0.12294389307498932,
93
+ "learning_rate": 0.00018312883435582823,
94
+ "loss": 0.1403,
95
+ "step": 110
96
+ },
97
+ {
98
+ "epoch": 0.36809815950920244,
99
+ "grad_norm": 1.0773956775665283,
100
+ "learning_rate": 0.00018159509202453987,
101
+ "loss": 0.0801,
102
+ "step": 120
103
+ },
104
+ {
105
+ "epoch": 0.3987730061349693,
106
+ "grad_norm": 1.2252161502838135,
107
+ "learning_rate": 0.00018006134969325154,
108
+ "loss": 0.1667,
109
+ "step": 130
110
+ },
111
+ {
112
+ "epoch": 0.4294478527607362,
113
+ "grad_norm": 0.3200875520706177,
114
+ "learning_rate": 0.00017852760736196318,
115
+ "loss": 0.1365,
116
+ "step": 140
117
+ },
118
+ {
119
+ "epoch": 0.4601226993865031,
120
+ "grad_norm": 0.4385143518447876,
121
+ "learning_rate": 0.00017699386503067485,
122
+ "loss": 0.1371,
123
+ "step": 150
124
+ },
125
+ {
126
+ "epoch": 0.49079754601226994,
127
+ "grad_norm": 0.11226026713848114,
128
+ "learning_rate": 0.00017546012269938652,
129
+ "loss": 0.042,
130
+ "step": 160
131
+ },
132
+ {
133
+ "epoch": 0.5214723926380368,
134
+ "grad_norm": 1.9924014806747437,
135
+ "learning_rate": 0.00017392638036809816,
136
+ "loss": 0.1138,
137
+ "step": 170
138
+ },
139
+ {
140
+ "epoch": 0.5521472392638037,
141
+ "grad_norm": 2.601227045059204,
142
+ "learning_rate": 0.00017239263803680983,
143
+ "loss": 0.1729,
144
+ "step": 180
145
+ },
146
+ {
147
+ "epoch": 0.5828220858895705,
148
+ "grad_norm": 5.278203964233398,
149
+ "learning_rate": 0.00017085889570552147,
150
+ "loss": 0.1036,
151
+ "step": 190
152
+ },
153
+ {
154
+ "epoch": 0.6134969325153374,
155
+ "grad_norm": 1.20023512840271,
156
+ "learning_rate": 0.00016932515337423314,
157
+ "loss": 0.0481,
158
+ "step": 200
159
+ },
160
+ {
161
+ "epoch": 0.6134969325153374,
162
+ "eval_accuracy": 0.8125,
163
+ "eval_loss": 0.5665359497070312,
164
+ "eval_runtime": 0.4807,
165
+ "eval_samples_per_second": 33.285,
166
+ "eval_steps_per_second": 4.161,
167
+ "step": 200
168
+ },
169
+ {
170
+ "epoch": 0.6441717791411042,
171
+ "grad_norm": 0.7395023107528687,
172
+ "learning_rate": 0.0001677914110429448,
173
+ "loss": 0.1225,
174
+ "step": 210
175
+ },
176
+ {
177
+ "epoch": 0.6748466257668712,
178
+ "grad_norm": 0.05845380574464798,
179
+ "learning_rate": 0.00016625766871165645,
180
+ "loss": 0.1034,
181
+ "step": 220
182
+ },
183
+ {
184
+ "epoch": 0.7055214723926381,
185
+ "grad_norm": 2.7569024562835693,
186
+ "learning_rate": 0.00016472392638036812,
187
+ "loss": 0.0538,
188
+ "step": 230
189
+ },
190
+ {
191
+ "epoch": 0.7361963190184049,
192
+ "grad_norm": 0.06222417205572128,
193
+ "learning_rate": 0.00016319018404907976,
194
+ "loss": 0.0958,
195
+ "step": 240
196
+ },
197
+ {
198
+ "epoch": 0.7668711656441718,
199
+ "grad_norm": 1.9317396879196167,
200
+ "learning_rate": 0.00016165644171779143,
201
+ "loss": 0.09,
202
+ "step": 250
203
+ },
204
+ {
205
+ "epoch": 0.7975460122699386,
206
+ "grad_norm": 3.952806234359741,
207
+ "learning_rate": 0.0001601226993865031,
208
+ "loss": 0.0878,
209
+ "step": 260
210
+ },
211
+ {
212
+ "epoch": 0.8282208588957055,
213
+ "grad_norm": 1.5617386102676392,
214
+ "learning_rate": 0.00015858895705521474,
215
+ "loss": 0.1304,
216
+ "step": 270
217
+ },
218
+ {
219
+ "epoch": 0.8588957055214724,
220
+ "grad_norm": 0.32101425528526306,
221
+ "learning_rate": 0.0001570552147239264,
222
+ "loss": 0.0684,
223
+ "step": 280
224
+ },
225
+ {
226
+ "epoch": 0.8895705521472392,
227
+ "grad_norm": 0.6578021049499512,
228
+ "learning_rate": 0.00015552147239263805,
229
+ "loss": 0.0917,
230
+ "step": 290
231
+ },
232
+ {
233
+ "epoch": 0.9202453987730062,
234
+ "grad_norm": 0.03468528762459755,
235
+ "learning_rate": 0.0001539877300613497,
236
+ "loss": 0.0391,
237
+ "step": 300
238
+ },
239
+ {
240
+ "epoch": 0.9202453987730062,
241
+ "eval_accuracy": 0.6875,
242
+ "eval_loss": 1.0036612749099731,
243
+ "eval_runtime": 0.4639,
244
+ "eval_samples_per_second": 34.488,
245
+ "eval_steps_per_second": 4.311,
246
+ "step": 300
247
+ },
248
+ {
249
+ "epoch": 0.950920245398773,
250
+ "grad_norm": 0.04961508885025978,
251
+ "learning_rate": 0.00015245398773006136,
252
+ "loss": 0.0435,
253
+ "step": 310
254
+ },
255
+ {
256
+ "epoch": 0.9815950920245399,
257
+ "grad_norm": 0.7167016863822937,
258
+ "learning_rate": 0.000150920245398773,
259
+ "loss": 0.0667,
260
+ "step": 320
261
+ },
262
+ {
263
+ "epoch": 1.0122699386503067,
264
+ "grad_norm": 0.3984834551811218,
265
+ "learning_rate": 0.00014938650306748467,
266
+ "loss": 0.1054,
267
+ "step": 330
268
+ },
269
+ {
270
+ "epoch": 1.0429447852760736,
271
+ "grad_norm": 0.09521777182817459,
272
+ "learning_rate": 0.0001478527607361963,
273
+ "loss": 0.0565,
274
+ "step": 340
275
+ },
276
+ {
277
+ "epoch": 1.0736196319018405,
278
+ "grad_norm": 3.836611032485962,
279
+ "learning_rate": 0.00014631901840490798,
280
+ "loss": 0.0498,
281
+ "step": 350
282
+ },
283
+ {
284
+ "epoch": 1.1042944785276074,
285
+ "grad_norm": 0.6689714789390564,
286
+ "learning_rate": 0.00014478527607361964,
287
+ "loss": 0.0693,
288
+ "step": 360
289
+ },
290
+ {
291
+ "epoch": 1.1349693251533743,
292
+ "grad_norm": 0.42697030305862427,
293
+ "learning_rate": 0.00014325153374233129,
294
+ "loss": 0.1017,
295
+ "step": 370
296
+ },
297
+ {
298
+ "epoch": 1.165644171779141,
299
+ "grad_norm": 0.28322863578796387,
300
+ "learning_rate": 0.00014171779141104295,
301
+ "loss": 0.0072,
302
+ "step": 380
303
+ },
304
+ {
305
+ "epoch": 1.196319018404908,
306
+ "grad_norm": 0.02922147326171398,
307
+ "learning_rate": 0.0001401840490797546,
308
+ "loss": 0.0355,
309
+ "step": 390
310
+ },
311
+ {
312
+ "epoch": 1.2269938650306749,
313
+ "grad_norm": 0.4956642687320709,
314
+ "learning_rate": 0.00013865030674846626,
315
+ "loss": 0.0711,
316
+ "step": 400
317
+ },
318
+ {
319
+ "epoch": 1.2269938650306749,
320
+ "eval_accuracy": 0.875,
321
+ "eval_loss": 0.5200170874595642,
322
+ "eval_runtime": 0.4602,
323
+ "eval_samples_per_second": 34.769,
324
+ "eval_steps_per_second": 4.346,
325
+ "step": 400
326
+ },
327
+ {
328
+ "epoch": 1.2576687116564418,
329
+ "grad_norm": 0.35631048679351807,
330
+ "learning_rate": 0.00013711656441717793,
331
+ "loss": 0.018,
332
+ "step": 410
333
+ },
334
+ {
335
+ "epoch": 1.2883435582822087,
336
+ "grad_norm": 0.053142815828323364,
337
+ "learning_rate": 0.00013558282208588957,
338
+ "loss": 0.0114,
339
+ "step": 420
340
+ },
341
+ {
342
+ "epoch": 1.3190184049079754,
343
+ "grad_norm": 3.590878963470459,
344
+ "learning_rate": 0.00013404907975460124,
345
+ "loss": 0.0385,
346
+ "step": 430
347
+ },
348
+ {
349
+ "epoch": 1.3496932515337423,
350
+ "grad_norm": 0.0230648685246706,
351
+ "learning_rate": 0.00013251533742331288,
352
+ "loss": 0.0333,
353
+ "step": 440
354
+ },
355
+ {
356
+ "epoch": 1.3803680981595092,
357
+ "grad_norm": 3.86190128326416,
358
+ "learning_rate": 0.00013098159509202455,
359
+ "loss": 0.0393,
360
+ "step": 450
361
+ },
362
+ {
363
+ "epoch": 1.4110429447852761,
364
+ "grad_norm": 0.9012386798858643,
365
+ "learning_rate": 0.00012944785276073622,
366
+ "loss": 0.0571,
367
+ "step": 460
368
+ },
369
+ {
370
+ "epoch": 1.441717791411043,
371
+ "grad_norm": 0.07151665538549423,
372
+ "learning_rate": 0.00012791411042944786,
373
+ "loss": 0.0104,
374
+ "step": 470
375
+ },
376
+ {
377
+ "epoch": 1.4723926380368098,
378
+ "grad_norm": 0.783393919467926,
379
+ "learning_rate": 0.00012638036809815953,
380
+ "loss": 0.101,
381
+ "step": 480
382
+ },
383
+ {
384
+ "epoch": 1.5030674846625767,
385
+ "grad_norm": 0.11429934948682785,
386
+ "learning_rate": 0.00012484662576687117,
387
+ "loss": 0.0861,
388
+ "step": 490
389
+ },
390
+ {
391
+ "epoch": 1.5337423312883436,
392
+ "grad_norm": 0.22566761076450348,
393
+ "learning_rate": 0.00012331288343558281,
394
+ "loss": 0.0258,
395
+ "step": 500
396
+ },
397
+ {
398
+ "epoch": 1.5337423312883436,
399
+ "eval_accuracy": 0.9375,
400
+ "eval_loss": 0.38183438777923584,
401
+ "eval_runtime": 0.6415,
402
+ "eval_samples_per_second": 24.943,
403
+ "eval_steps_per_second": 3.118,
404
+ "step": 500
405
+ },
406
+ {
407
+ "epoch": 1.5644171779141103,
408
+ "grad_norm": 0.0231198500841856,
409
+ "learning_rate": 0.0001217791411042945,
410
+ "loss": 0.0112,
411
+ "step": 510
412
+ },
413
+ {
414
+ "epoch": 1.5950920245398774,
415
+ "grad_norm": 0.07680622488260269,
416
+ "learning_rate": 0.00012024539877300614,
417
+ "loss": 0.0332,
418
+ "step": 520
419
+ },
420
+ {
421
+ "epoch": 1.6257668711656441,
422
+ "grad_norm": 4.2723798751831055,
423
+ "learning_rate": 0.0001187116564417178,
424
+ "loss": 0.0615,
425
+ "step": 530
426
+ },
427
+ {
428
+ "epoch": 1.656441717791411,
429
+ "grad_norm": 0.020270884037017822,
430
+ "learning_rate": 0.00011717791411042945,
431
+ "loss": 0.0025,
432
+ "step": 540
433
+ },
434
+ {
435
+ "epoch": 1.687116564417178,
436
+ "grad_norm": 0.017487134784460068,
437
+ "learning_rate": 0.0001156441717791411,
438
+ "loss": 0.0032,
439
+ "step": 550
440
+ },
441
+ {
442
+ "epoch": 1.7177914110429446,
443
+ "grad_norm": 3.106546401977539,
444
+ "learning_rate": 0.00011411042944785277,
445
+ "loss": 0.0776,
446
+ "step": 560
447
+ },
448
+ {
449
+ "epoch": 1.7484662576687118,
450
+ "grad_norm": 0.2955380380153656,
451
+ "learning_rate": 0.00011257668711656441,
452
+ "loss": 0.046,
453
+ "step": 570
454
+ },
455
+ {
456
+ "epoch": 1.7791411042944785,
457
+ "grad_norm": 0.03838299587368965,
458
+ "learning_rate": 0.00011104294478527608,
459
+ "loss": 0.0039,
460
+ "step": 580
461
+ },
462
+ {
463
+ "epoch": 1.8098159509202454,
464
+ "grad_norm": 0.02066592499613762,
465
+ "learning_rate": 0.00010950920245398772,
466
+ "loss": 0.0411,
467
+ "step": 590
468
+ },
469
+ {
470
+ "epoch": 1.8404907975460123,
471
+ "grad_norm": 0.029740285128355026,
472
+ "learning_rate": 0.00010797546012269939,
473
+ "loss": 0.0547,
474
+ "step": 600
475
+ },
476
+ {
477
+ "epoch": 1.8404907975460123,
478
+ "eval_accuracy": 0.9375,
479
+ "eval_loss": 0.3414860665798187,
480
+ "eval_runtime": 0.6402,
481
+ "eval_samples_per_second": 24.991,
482
+ "eval_steps_per_second": 3.124,
483
+ "step": 600
484
+ },
485
+ {
486
+ "epoch": 1.871165644171779,
487
+ "grad_norm": 0.06896264851093292,
488
+ "learning_rate": 0.00010644171779141106,
489
+ "loss": 0.0412,
490
+ "step": 610
491
+ },
492
+ {
493
+ "epoch": 1.9018404907975461,
494
+ "grad_norm": 0.17898960411548615,
495
+ "learning_rate": 0.0001049079754601227,
496
+ "loss": 0.0059,
497
+ "step": 620
498
+ },
499
+ {
500
+ "epoch": 1.9325153374233128,
501
+ "grad_norm": 0.02462293580174446,
502
+ "learning_rate": 0.00010337423312883437,
503
+ "loss": 0.0334,
504
+ "step": 630
505
+ },
506
+ {
507
+ "epoch": 1.9631901840490797,
508
+ "grad_norm": 0.15255212783813477,
509
+ "learning_rate": 0.00010184049079754601,
510
+ "loss": 0.0484,
511
+ "step": 640
512
+ },
513
+ {
514
+ "epoch": 1.9938650306748467,
515
+ "grad_norm": 1.2095363140106201,
516
+ "learning_rate": 0.00010030674846625767,
517
+ "loss": 0.0409,
518
+ "step": 650
519
+ },
520
+ {
521
+ "epoch": 2.0245398773006134,
522
+ "grad_norm": 0.05067542567849159,
523
+ "learning_rate": 9.877300613496932e-05,
524
+ "loss": 0.0239,
525
+ "step": 660
526
+ },
527
+ {
528
+ "epoch": 2.0552147239263805,
529
+ "grad_norm": 0.048488594591617584,
530
+ "learning_rate": 9.723926380368099e-05,
531
+ "loss": 0.0092,
532
+ "step": 670
533
+ },
534
+ {
535
+ "epoch": 2.085889570552147,
536
+ "grad_norm": 0.06358503550291061,
537
+ "learning_rate": 9.570552147239264e-05,
538
+ "loss": 0.0284,
539
+ "step": 680
540
+ },
541
+ {
542
+ "epoch": 2.116564417177914,
543
+ "grad_norm": 0.02291274443268776,
544
+ "learning_rate": 9.41717791411043e-05,
545
+ "loss": 0.0312,
546
+ "step": 690
547
+ },
548
+ {
549
+ "epoch": 2.147239263803681,
550
+ "grad_norm": 0.01959369145333767,
551
+ "learning_rate": 9.263803680981595e-05,
552
+ "loss": 0.0029,
553
+ "step": 700
554
+ },
555
+ {
556
+ "epoch": 2.147239263803681,
557
+ "eval_accuracy": 0.9375,
558
+ "eval_loss": 0.06373238563537598,
559
+ "eval_runtime": 1.4073,
560
+ "eval_samples_per_second": 11.369,
561
+ "eval_steps_per_second": 1.421,
562
+ "step": 700
563
+ },
564
+ {
565
+ "epoch": 2.1779141104294477,
566
+ "grad_norm": 0.015473265200853348,
567
+ "learning_rate": 9.110429447852761e-05,
568
+ "loss": 0.0052,
569
+ "step": 710
570
+ },
571
+ {
572
+ "epoch": 2.208588957055215,
573
+ "grad_norm": 6.127484321594238,
574
+ "learning_rate": 8.957055214723928e-05,
575
+ "loss": 0.0215,
576
+ "step": 720
577
+ },
578
+ {
579
+ "epoch": 2.2392638036809815,
580
+ "grad_norm": 0.014825492165982723,
581
+ "learning_rate": 8.803680981595093e-05,
582
+ "loss": 0.0051,
583
+ "step": 730
584
+ },
585
+ {
586
+ "epoch": 2.2699386503067487,
587
+ "grad_norm": 0.09054987877607346,
588
+ "learning_rate": 8.650306748466259e-05,
589
+ "loss": 0.0089,
590
+ "step": 740
591
+ },
592
+ {
593
+ "epoch": 2.3006134969325154,
594
+ "grad_norm": 0.012495328672230244,
595
+ "learning_rate": 8.496932515337423e-05,
596
+ "loss": 0.0035,
597
+ "step": 750
598
+ },
599
+ {
600
+ "epoch": 2.331288343558282,
601
+ "grad_norm": 0.012604707852005959,
602
+ "learning_rate": 8.343558282208588e-05,
603
+ "loss": 0.0046,
604
+ "step": 760
605
+ },
606
+ {
607
+ "epoch": 2.361963190184049,
608
+ "grad_norm": 0.012834266759455204,
609
+ "learning_rate": 8.190184049079755e-05,
610
+ "loss": 0.0685,
611
+ "step": 770
612
+ },
613
+ {
614
+ "epoch": 2.392638036809816,
615
+ "grad_norm": 1.8484301567077637,
616
+ "learning_rate": 8.036809815950921e-05,
617
+ "loss": 0.0058,
618
+ "step": 780
619
+ },
620
+ {
621
+ "epoch": 2.4233128834355826,
622
+ "grad_norm": 0.01303753349930048,
623
+ "learning_rate": 7.883435582822086e-05,
624
+ "loss": 0.0201,
625
+ "step": 790
626
+ },
627
+ {
628
+ "epoch": 2.4539877300613497,
629
+ "grad_norm": 0.011059868149459362,
630
+ "learning_rate": 7.730061349693252e-05,
631
+ "loss": 0.0543,
632
+ "step": 800
633
+ },
634
+ {
635
+ "epoch": 2.4539877300613497,
636
+ "eval_accuracy": 0.8125,
637
+ "eval_loss": 0.7362374067306519,
638
+ "eval_runtime": 0.6252,
639
+ "eval_samples_per_second": 25.594,
640
+ "eval_steps_per_second": 3.199,
641
+ "step": 800
642
+ },
643
+ {
644
+ "epoch": 2.4846625766871164,
645
+ "grad_norm": 0.013651982881128788,
646
+ "learning_rate": 7.576687116564417e-05,
647
+ "loss": 0.0023,
648
+ "step": 810
649
+ },
650
+ {
651
+ "epoch": 2.5153374233128836,
652
+ "grad_norm": 0.013891194015741348,
653
+ "learning_rate": 7.423312883435584e-05,
654
+ "loss": 0.0035,
655
+ "step": 820
656
+ },
657
+ {
658
+ "epoch": 2.5460122699386503,
659
+ "grad_norm": 0.022767823189496994,
660
+ "learning_rate": 7.26993865030675e-05,
661
+ "loss": 0.0023,
662
+ "step": 830
663
+ },
664
+ {
665
+ "epoch": 2.5766871165644174,
666
+ "grad_norm": 0.01230280939489603,
667
+ "learning_rate": 7.116564417177914e-05,
668
+ "loss": 0.0163,
669
+ "step": 840
670
+ },
671
+ {
672
+ "epoch": 2.607361963190184,
673
+ "grad_norm": 0.01010560616850853,
674
+ "learning_rate": 6.963190184049079e-05,
675
+ "loss": 0.0016,
676
+ "step": 850
677
+ },
678
+ {
679
+ "epoch": 2.638036809815951,
680
+ "grad_norm": 0.010164570063352585,
681
+ "learning_rate": 6.809815950920245e-05,
682
+ "loss": 0.0287,
683
+ "step": 860
684
+ },
685
+ {
686
+ "epoch": 2.668711656441718,
687
+ "grad_norm": 0.010293642990291119,
688
+ "learning_rate": 6.656441717791412e-05,
689
+ "loss": 0.0307,
690
+ "step": 870
691
+ },
692
+ {
693
+ "epoch": 2.6993865030674846,
694
+ "grad_norm": 0.01078992523252964,
695
+ "learning_rate": 6.503067484662577e-05,
696
+ "loss": 0.0106,
697
+ "step": 880
698
+ },
699
+ {
700
+ "epoch": 2.7300613496932513,
701
+ "grad_norm": 0.01032053492963314,
702
+ "learning_rate": 6.349693251533743e-05,
703
+ "loss": 0.0014,
704
+ "step": 890
705
+ },
706
+ {
707
+ "epoch": 2.7607361963190185,
708
+ "grad_norm": 0.009690840728580952,
709
+ "learning_rate": 6.196319018404908e-05,
710
+ "loss": 0.0265,
711
+ "step": 900
712
+ },
713
+ {
714
+ "epoch": 2.7607361963190185,
715
+ "eval_accuracy": 0.75,
716
+ "eval_loss": 1.0916930437088013,
717
+ "eval_runtime": 0.4595,
718
+ "eval_samples_per_second": 34.82,
719
+ "eval_steps_per_second": 4.352,
720
+ "step": 900
721
+ },
722
+ {
723
+ "epoch": 2.791411042944785,
724
+ "grad_norm": 0.014224858023226261,
725
+ "learning_rate": 6.0429447852760736e-05,
726
+ "loss": 0.0622,
727
+ "step": 910
728
+ },
729
+ {
730
+ "epoch": 2.8220858895705523,
731
+ "grad_norm": 0.013891954906284809,
732
+ "learning_rate": 5.88957055214724e-05,
733
+ "loss": 0.0153,
734
+ "step": 920
735
+ },
736
+ {
737
+ "epoch": 2.852760736196319,
738
+ "grad_norm": 0.0167331974953413,
739
+ "learning_rate": 5.736196319018405e-05,
740
+ "loss": 0.0026,
741
+ "step": 930
742
+ },
743
+ {
744
+ "epoch": 2.883435582822086,
745
+ "grad_norm": 0.03341999277472496,
746
+ "learning_rate": 5.582822085889571e-05,
747
+ "loss": 0.0119,
748
+ "step": 940
749
+ },
750
+ {
751
+ "epoch": 2.914110429447853,
752
+ "grad_norm": 0.012064835987985134,
753
+ "learning_rate": 5.429447852760736e-05,
754
+ "loss": 0.0149,
755
+ "step": 950
756
+ },
757
+ {
758
+ "epoch": 2.9447852760736195,
759
+ "grad_norm": 0.012997813522815704,
760
+ "learning_rate": 5.276073619631902e-05,
761
+ "loss": 0.0034,
762
+ "step": 960
763
+ },
764
+ {
765
+ "epoch": 2.9754601226993866,
766
+ "grad_norm": 0.014833126217126846,
767
+ "learning_rate": 5.122699386503068e-05,
768
+ "loss": 0.0209,
769
+ "step": 970
770
+ },
771
+ {
772
+ "epoch": 3.0061349693251533,
773
+ "grad_norm": 0.010445049963891506,
774
+ "learning_rate": 4.9693251533742335e-05,
775
+ "loss": 0.0434,
776
+ "step": 980
777
+ },
778
+ {
779
+ "epoch": 3.03680981595092,
780
+ "grad_norm": 0.012170241214334965,
781
+ "learning_rate": 4.815950920245399e-05,
782
+ "loss": 0.0058,
783
+ "step": 990
784
+ },
785
+ {
786
+ "epoch": 3.067484662576687,
787
+ "grad_norm": 0.010554679669439793,
788
+ "learning_rate": 4.6625766871165645e-05,
789
+ "loss": 0.0017,
790
+ "step": 1000
791
+ },
792
+ {
793
+ "epoch": 3.067484662576687,
794
+ "eval_accuracy": 1.0,
795
+ "eval_loss": 0.0030439873225986958,
796
+ "eval_runtime": 0.4699,
797
+ "eval_samples_per_second": 34.049,
798
+ "eval_steps_per_second": 4.256,
799
+ "step": 1000
800
+ },
801
+ {
802
+ "epoch": 3.098159509202454,
803
+ "grad_norm": 0.010963929817080498,
804
+ "learning_rate": 4.5092024539877307e-05,
805
+ "loss": 0.0241,
806
+ "step": 1010
807
+ },
808
+ {
809
+ "epoch": 3.128834355828221,
810
+ "grad_norm": 0.010549047961831093,
811
+ "learning_rate": 4.3558282208588955e-05,
812
+ "loss": 0.0015,
813
+ "step": 1020
814
+ },
815
+ {
816
+ "epoch": 3.1595092024539877,
817
+ "grad_norm": 0.012895047664642334,
818
+ "learning_rate": 4.2024539877300617e-05,
819
+ "loss": 0.0021,
820
+ "step": 1030
821
+ },
822
+ {
823
+ "epoch": 3.190184049079755,
824
+ "grad_norm": 0.009695703163743019,
825
+ "learning_rate": 4.049079754601227e-05,
826
+ "loss": 0.0013,
827
+ "step": 1040
828
+ },
829
+ {
830
+ "epoch": 3.2208588957055215,
831
+ "grad_norm": 0.01083831675350666,
832
+ "learning_rate": 3.895705521472393e-05,
833
+ "loss": 0.0018,
834
+ "step": 1050
835
+ },
836
+ {
837
+ "epoch": 3.2515337423312882,
838
+ "grad_norm": 0.030923034995794296,
839
+ "learning_rate": 3.742331288343559e-05,
840
+ "loss": 0.0012,
841
+ "step": 1060
842
+ },
843
+ {
844
+ "epoch": 3.2822085889570554,
845
+ "grad_norm": 0.009053406305611134,
846
+ "learning_rate": 3.5889570552147236e-05,
847
+ "loss": 0.0011,
848
+ "step": 1070
849
+ },
850
+ {
851
+ "epoch": 3.312883435582822,
852
+ "grad_norm": 0.009371621534228325,
853
+ "learning_rate": 3.43558282208589e-05,
854
+ "loss": 0.0011,
855
+ "step": 1080
856
+ },
857
+ {
858
+ "epoch": 3.3435582822085887,
859
+ "grad_norm": 0.009119446389377117,
860
+ "learning_rate": 3.282208588957055e-05,
861
+ "loss": 0.0011,
862
+ "step": 1090
863
+ },
864
+ {
865
+ "epoch": 3.374233128834356,
866
+ "grad_norm": 0.009182000532746315,
867
+ "learning_rate": 3.1288343558282215e-05,
868
+ "loss": 0.0054,
869
+ "step": 1100
870
+ },
871
+ {
872
+ "epoch": 3.374233128834356,
873
+ "eval_accuracy": 1.0,
874
+ "eval_loss": 0.036368854343891144,
875
+ "eval_runtime": 0.4537,
876
+ "eval_samples_per_second": 35.267,
877
+ "eval_steps_per_second": 4.408,
878
+ "step": 1100
879
+ },
880
+ {
881
+ "epoch": 3.4049079754601226,
882
+ "grad_norm": 0.00895740371197462,
883
+ "learning_rate": 2.9754601226993867e-05,
884
+ "loss": 0.0017,
885
+ "step": 1110
886
+ },
887
+ {
888
+ "epoch": 3.4355828220858897,
889
+ "grad_norm": 0.20501597225666046,
890
+ "learning_rate": 2.822085889570552e-05,
891
+ "loss": 0.0013,
892
+ "step": 1120
893
+ },
894
+ {
895
+ "epoch": 3.4662576687116564,
896
+ "grad_norm": 0.009406461380422115,
897
+ "learning_rate": 2.668711656441718e-05,
898
+ "loss": 0.0011,
899
+ "step": 1130
900
+ },
901
+ {
902
+ "epoch": 3.4969325153374236,
903
+ "grad_norm": 0.009192903526127338,
904
+ "learning_rate": 2.5153374233128835e-05,
905
+ "loss": 0.0012,
906
+ "step": 1140
907
+ },
908
+ {
909
+ "epoch": 3.5276073619631902,
910
+ "grad_norm": 0.009342888370156288,
911
+ "learning_rate": 2.361963190184049e-05,
912
+ "loss": 0.0362,
913
+ "step": 1150
914
+ },
915
+ {
916
+ "epoch": 3.558282208588957,
917
+ "grad_norm": 0.01002499833703041,
918
+ "learning_rate": 2.208588957055215e-05,
919
+ "loss": 0.0011,
920
+ "step": 1160
921
+ },
922
+ {
923
+ "epoch": 3.588957055214724,
924
+ "grad_norm": 0.01044855359941721,
925
+ "learning_rate": 2.0552147239263807e-05,
926
+ "loss": 0.0011,
927
+ "step": 1170
928
+ },
929
+ {
930
+ "epoch": 3.6196319018404908,
931
+ "grad_norm": 0.030983537435531616,
932
+ "learning_rate": 1.9018404907975462e-05,
933
+ "loss": 0.0011,
934
+ "step": 1180
935
+ },
936
+ {
937
+ "epoch": 3.6503067484662575,
938
+ "grad_norm": 0.009104466997087002,
939
+ "learning_rate": 1.7484662576687117e-05,
940
+ "loss": 0.0012,
941
+ "step": 1190
942
+ },
943
+ {
944
+ "epoch": 3.6809815950920246,
945
+ "grad_norm": 0.009420313872396946,
946
+ "learning_rate": 1.5950920245398772e-05,
947
+ "loss": 0.0234,
948
+ "step": 1200
949
+ },
950
+ {
951
+ "epoch": 3.6809815950920246,
952
+ "eval_accuracy": 0.875,
953
+ "eval_loss": 0.23103433847427368,
954
+ "eval_runtime": 0.4704,
955
+ "eval_samples_per_second": 34.012,
956
+ "eval_steps_per_second": 4.252,
957
+ "step": 1200
958
+ },
959
+ {
960
+ "epoch": 3.7116564417177913,
961
+ "grad_norm": 0.00850239023566246,
962
+ "learning_rate": 1.441717791411043e-05,
963
+ "loss": 0.001,
964
+ "step": 1210
965
+ },
966
+ {
967
+ "epoch": 3.7423312883435584,
968
+ "grad_norm": 0.008601406589150429,
969
+ "learning_rate": 1.2883435582822087e-05,
970
+ "loss": 0.0013,
971
+ "step": 1220
972
+ },
973
+ {
974
+ "epoch": 3.773006134969325,
975
+ "grad_norm": 0.011089220643043518,
976
+ "learning_rate": 1.1349693251533742e-05,
977
+ "loss": 0.0011,
978
+ "step": 1230
979
+ },
980
+ {
981
+ "epoch": 3.8036809815950923,
982
+ "grad_norm": 0.008406179025769234,
983
+ "learning_rate": 9.8159509202454e-06,
984
+ "loss": 0.001,
985
+ "step": 1240
986
+ },
987
+ {
988
+ "epoch": 3.834355828220859,
989
+ "grad_norm": 0.009516136720776558,
990
+ "learning_rate": 8.282208588957055e-06,
991
+ "loss": 0.0012,
992
+ "step": 1250
993
+ },
994
+ {
995
+ "epoch": 3.8650306748466257,
996
+ "grad_norm": 0.011050822213292122,
997
+ "learning_rate": 6.748466257668712e-06,
998
+ "loss": 0.0012,
999
+ "step": 1260
1000
+ },
1001
+ {
1002
+ "epoch": 3.895705521472393,
1003
+ "grad_norm": 0.008980349637567997,
1004
+ "learning_rate": 5.214723926380368e-06,
1005
+ "loss": 0.0012,
1006
+ "step": 1270
1007
+ },
1008
+ {
1009
+ "epoch": 3.9263803680981595,
1010
+ "grad_norm": 0.00903311651200056,
1011
+ "learning_rate": 3.680981595092025e-06,
1012
+ "loss": 0.0014,
1013
+ "step": 1280
1014
+ },
1015
+ {
1016
+ "epoch": 3.957055214723926,
1017
+ "grad_norm": 0.00838613323867321,
1018
+ "learning_rate": 2.147239263803681e-06,
1019
+ "loss": 0.0253,
1020
+ "step": 1290
1021
+ },
1022
+ {
1023
+ "epoch": 3.9877300613496933,
1024
+ "grad_norm": 0.008292277343571186,
1025
+ "learning_rate": 6.134969325153375e-07,
1026
+ "loss": 0.0076,
1027
+ "step": 1300
1028
+ },
1029
+ {
1030
+ "epoch": 3.9877300613496933,
1031
+ "eval_accuracy": 0.875,
1032
+ "eval_loss": 0.40373101830482483,
1033
+ "eval_runtime": 0.4908,
1034
+ "eval_samples_per_second": 32.599,
1035
+ "eval_steps_per_second": 4.075,
1036
+ "step": 1300
1037
+ },
1038
+ {
1039
+ "epoch": 4.0,
1040
+ "step": 1304,
1041
+ "total_flos": 1.6167928713188475e+18,
1042
+ "train_loss": 0.04957773063711799,
1043
+ "train_runtime": 1223.9715,
1044
+ "train_samples_per_second": 17.046,
1045
+ "train_steps_per_second": 1.065
1046
+ }
1047
+ ],
1048
+ "logging_steps": 10,
1049
+ "max_steps": 1304,
1050
+ "num_input_tokens_seen": 0,
1051
+ "num_train_epochs": 4,
1052
+ "save_steps": 100,
1053
+ "stateful_callbacks": {
1054
+ "TrainerControl": {
1055
+ "args": {
1056
+ "should_epoch_stop": false,
1057
+ "should_evaluate": false,
1058
+ "should_log": false,
1059
+ "should_save": true,
1060
+ "should_training_stop": false
1061
+ },
1062
+ "attributes": {}
1063
+ }
1064
+ },
1065
+ "total_flos": 1.6167928713188475e+18,
1066
+ "train_batch_size": 16,
1067
+ "trial_name": null,
1068
+ "trial_params": null
1069
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3e1265e8e0fae3dd397db275a1e4bc01e7818b66ee6cc859974880cd008cab48
+ size 5112