Mariofm02 committed on
Commit b397482
1 Parent(s): a587258

Model save

README.md CHANGED
@@ -17,8 +17,8 @@ should probably proofread and complete it, then remove this comment. -->
 
  This model is a fine-tuned version of [google/vit-base-patch16-224-in21K](https://huggingface.co/google/vit-base-patch16-224-in21K) on an unknown dataset.
  It achieves the following results on the evaluation set:
- - Loss: 0.9121
- - Accuracy: 0.7710
+ - Loss: 0.5081
+ - Accuracy: 0.8696
 
  ## Model description
 
@@ -43,29 +43,41 @@ The following hyperparameters were used during training:
  - seed: 42
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
  - lr_scheduler_type: linear
- - num_epochs: 4
+ - num_epochs: 7
  - mixed_precision_training: Native AMP
 
  ### Training results
 
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
- | 3.1307 | 0.24 | 100 | 3.0180 | 0.2093 |
- | 2.7372 | 0.48 | 200 | 2.5301 | 0.2762 |
- | 2.4969 | 0.73 | 300 | 2.1760 | 0.3439 |
- | 2.1973 | 0.97 | 400 | 2.0103 | 0.3756 |
- | 1.8847 | 1.21 | 500 | 1.8402 | 0.4108 |
- | 1.746 | 1.45 | 600 | 1.7051 | 0.4803 |
- | 1.8698 | 1.69 | 700 | 1.5985 | 0.4889 |
- | 1.7261 | 1.94 | 800 | 1.4312 | 0.5840 |
- | 1.7385 | 2.18 | 900 | 1.3585 | 0.6286 |
- | 1.5873 | 2.42 | 1000 | 1.2374 | 0.6758 |
- | 1.4775 | 2.66 | 1100 | 1.1352 | 0.7024 |
- | 1.2697 | 2.91 | 1200 | 1.1044 | 0.7093 |
- | 1.2137 | 3.15 | 1300 | 1.0006 | 0.7616 |
- | 1.423 | 3.39 | 1400 | 0.9589 | 0.7744 |
- | 1.0098 | 3.63 | 1500 | 0.9360 | 0.7684 |
- | 1.1325 | 3.87 | 1600 | 0.9121 | 0.7710 |
+ | 1.3563 | 0.24 | 100 | 1.1495 | 0.6750 |
+ | 1.3393 | 0.48 | 200 | 1.0388 | 0.7204 |
+ | 1.2033 | 0.73 | 300 | 0.9324 | 0.7547 |
+ | 0.9672 | 0.97 | 400 | 0.8558 | 0.7659 |
+ | 0.8674 | 1.21 | 500 | 0.8456 | 0.7616 |
+ | 0.8277 | 1.45 | 600 | 0.7563 | 0.7959 |
+ | 0.8703 | 1.69 | 700 | 0.8465 | 0.7539 |
+ | 0.893 | 1.94 | 800 | 0.6881 | 0.8002 |
+ | 0.9454 | 2.18 | 900 | 0.7211 | 0.8027 |
+ | 0.8109 | 2.42 | 1000 | 0.6369 | 0.8285 |
+ | 0.8762 | 2.66 | 1100 | 0.6336 | 0.8396 |
+ | 0.8034 | 2.91 | 1200 | 0.6580 | 0.8165 |
+ | 0.5833 | 3.15 | 1300 | 0.5828 | 0.8439 |
+ | 0.8811 | 3.39 | 1400 | 0.6564 | 0.8259 |
+ | 0.5639 | 3.63 | 1500 | 0.5737 | 0.8439 |
+ | 0.639 | 3.87 | 1600 | 0.5609 | 0.8379 |
+ | 0.6455 | 4.12 | 1700 | 0.5820 | 0.8370 |
+ | 0.5402 | 4.36 | 1800 | 0.5797 | 0.8345 |
+ | 0.5311 | 4.6 | 1900 | 0.5511 | 0.8456 |
+ | 0.5734 | 4.84 | 2000 | 0.5444 | 0.8508 |
+ | 0.5206 | 5.08 | 2100 | 0.5326 | 0.8636 |
+ | 0.6272 | 5.33 | 2200 | 0.5478 | 0.8525 |
+ | 0.5124 | 5.57 | 2300 | 0.5296 | 0.8688 |
+ | 0.5659 | 5.81 | 2400 | 0.5181 | 0.8705 |
+ | 0.4212 | 6.05 | 2500 | 0.5200 | 0.8611 |
+ | 0.4338 | 6.3 | 2600 | 0.5135 | 0.8731 |
+ | 0.3407 | 6.54 | 2700 | 0.5147 | 0.8722 |
+ | 0.4043 | 6.78 | 2800 | 0.5081 | 0.8696 |
 
 
  ### Framework versions
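The hyperparameter bullets in the card above map almost one-to-one onto Hugging Face `TrainingArguments`. Below is a minimal sketch of the configuration implied by this card; the learning rate, batch sizes, eval/save cadence, and best-model tracking are not listed in the README excerpt and are inferred from the `trainer_state.json` added in this same commit, so treat them as assumptions about the original run rather than the author's exact command.

```python
from transformers import TrainingArguments

# Sketch of the training setup described in the model card (assumptions flagged inline).
training_args = TrainingArguments(
    output_dir="finetuned-cards-blackjack",  # from best_model_checkpoint in trainer_state.json
    seed=42,                                 # seed: 42
    learning_rate=2e-4,                      # inferred from the logged linear lr schedule
    per_device_train_batch_size=16,          # "train_batch_size": 16 in trainer_state.json
    per_device_eval_batch_size=8,            # inferred from eval samples/steps per second
    num_train_epochs=7,                      # num_epochs: 7 (new value in this diff)
    lr_scheduler_type="linear",              # lr_scheduler_type: linear
    adam_beta1=0.9,                          # Adam betas=(0.9, 0.999)
    adam_beta2=0.999,
    adam_epsilon=1e-8,                       # epsilon=1e-08
    fp16=True,                               # mixed_precision_training: Native AMP
    evaluation_strategy="steps",
    eval_steps=100,                          # "eval_steps": 100
    save_steps=100,                          # "save_steps": 100
    logging_steps=10,                        # "logging_steps": 10
    load_best_model_at_end=True,             # implied by best_metric tracking; assumption
)
```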
all_results.json ADDED
@@ -0,0 +1,13 @@
+ {
+     "epoch": 4.0,
+     "eval_accuracy": 0.7710120068610634,
+     "eval_loss": 0.912144124507904,
+     "eval_runtime": 6.7842,
+     "eval_samples_per_second": 171.871,
+     "eval_steps_per_second": 21.521,
+     "total_flos": 2.047635634195759e+18,
+     "train_loss": 1.8191430680543978,
+     "train_runtime": 594.0822,
+     "train_samples_per_second": 44.458,
+     "train_steps_per_second": 2.781
+ }
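As a quick sanity check, the runtime and throughput fields in `all_results.json` are mutually consistent; the sketch below just multiplies them out. The implied eval-set size of roughly 1,166 images and train-split size of roughly 6,600 images are inferences from these numbers, not values stated anywhere in the repo.

```python
# Cross-checking the throughput fields reported in all_results.json (4-epoch run).
eval_runtime = 6.7842                  # seconds
eval_samples_per_second = 171.871
train_runtime = 594.0822               # seconds
train_samples_per_second = 44.458
train_steps_per_second = 2.781
epochs = 4.0

eval_samples = eval_runtime * eval_samples_per_second          # ~1166 images in the eval split
train_steps = train_runtime * train_steps_per_second           # ~1652 steps, matches trainer_state.json
train_samples_seen = train_runtime * train_samples_per_second  # ~26,400 images over 4 epochs
train_split_size = train_samples_seen / epochs                 # ~6,600 images per epoch

print(round(eval_samples), round(train_steps), round(train_split_size))
```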
eval_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "epoch": 4.0,
+     "eval_accuracy": 0.7710120068610634,
+     "eval_loss": 0.912144124507904,
+     "eval_runtime": 6.7842,
+     "eval_samples_per_second": 171.871,
+     "eval_steps_per_second": 21.521
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:808ea45f936d03a7fc517107e6638c9c74952f79ce7f1f2082bf4a417f12605c
+ oid sha256:eb9b70ee282c426c2a645c30197368de15a5908c06f2c905de4194ed11a5c4ed
  size 343377784
runs/Mar29_19-44-52_44990517b672/events.out.tfevents.1711742478.44990517b672.3784.2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3636374ba0e100e35ba985b7def150b74b2d498b9b67d3dfddd2dc95add95a23
+ size 411
runs/Mar29_20-03-14_44990517b672/events.out.tfevents.1711742611.44990517b672.3784.3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3c50db23ca1c40405186375018a2238b4a607bb59484fe5484ecc67db25f05dc
+ size 77831
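The two files added under `runs/` are ordinary TensorBoard event logs for this training run. A minimal sketch of reading them back with the `tensorboard` Python API follows; the scalar tag name used below is an assumption, so list `acc.Tags()` first to see what the Trainer actually logged.

```python
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

# Load the larger event file added in this commit (the ~77 KB one holds the scalar curves).
acc = EventAccumulator("runs/Mar29_20-03-14_44990517b672")
acc.Reload()

print(acc.Tags()["scalars"])                 # e.g. ["train/loss", "eval/accuracy", ...]; names vary
for event in acc.Scalars("eval/accuracy"):   # assumed tag; pick one printed above
    print(event.step, event.value)
```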
train_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+     "epoch": 4.0,
+     "total_flos": 2.047635634195759e+18,
+     "train_loss": 1.8191430680543978,
+     "train_runtime": 594.0822,
+     "train_samples_per_second": 44.458,
+     "train_steps_per_second": 2.781
+ }
trainer_state.json ADDED
@@ -0,0 +1,1329 @@
1
+ {
2
+ "best_metric": 0.912144124507904,
3
+ "best_model_checkpoint": "finetuned-cards-blackjack/checkpoint-1600",
4
+ "epoch": 4.0,
5
+ "eval_steps": 100,
6
+ "global_step": 1652,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.02,
13
+ "grad_norm": 1.466597557067871,
14
+ "learning_rate": 0.0001987893462469734,
15
+ "loss": 3.9543,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.05,
20
+ "grad_norm": 1.9476360082626343,
21
+ "learning_rate": 0.00019757869249394675,
22
+ "loss": 3.8868,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.07,
27
+ "grad_norm": 1.6487232446670532,
28
+ "learning_rate": 0.0001963680387409201,
29
+ "loss": 3.8185,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.1,
34
+ "grad_norm": 1.8101606369018555,
35
+ "learning_rate": 0.00019515738498789345,
36
+ "loss": 3.6559,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.12,
41
+ "grad_norm": 1.7900973558425903,
42
+ "learning_rate": 0.00019394673123486684,
43
+ "loss": 3.559,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.15,
48
+ "grad_norm": 1.7922214269638062,
49
+ "learning_rate": 0.0001927360774818402,
50
+ "loss": 3.4135,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.17,
55
+ "grad_norm": 1.9818700551986694,
56
+ "learning_rate": 0.00019152542372881357,
57
+ "loss": 3.3906,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.19,
62
+ "grad_norm": 1.9315565824508667,
63
+ "learning_rate": 0.00019031476997578695,
64
+ "loss": 3.3191,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.22,
69
+ "grad_norm": 1.9850099086761475,
70
+ "learning_rate": 0.0001891041162227603,
71
+ "loss": 3.2122,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.24,
76
+ "grad_norm": 1.9584887027740479,
77
+ "learning_rate": 0.00018789346246973366,
78
+ "loss": 3.1307,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.24,
83
+ "eval_accuracy": 0.20926243567753003,
84
+ "eval_loss": 3.017998456954956,
85
+ "eval_runtime": 6.19,
86
+ "eval_samples_per_second": 188.367,
87
+ "eval_steps_per_second": 23.586,
88
+ "step": 100
89
+ },
90
+ {
91
+ "epoch": 0.27,
92
+ "grad_norm": 2.39744234085083,
93
+ "learning_rate": 0.00018668280871670702,
94
+ "loss": 3.0667,
95
+ "step": 110
96
+ },
97
+ {
98
+ "epoch": 0.29,
99
+ "grad_norm": 1.891518473625183,
100
+ "learning_rate": 0.0001854721549636804,
101
+ "loss": 2.984,
102
+ "step": 120
103
+ },
104
+ {
105
+ "epoch": 0.31,
106
+ "grad_norm": 1.9065883159637451,
107
+ "learning_rate": 0.00018426150121065375,
108
+ "loss": 2.8457,
109
+ "step": 130
110
+ },
111
+ {
112
+ "epoch": 0.34,
113
+ "grad_norm": 2.126429796218872,
114
+ "learning_rate": 0.00018305084745762714,
115
+ "loss": 2.9638,
116
+ "step": 140
117
+ },
118
+ {
119
+ "epoch": 0.36,
120
+ "grad_norm": 1.9387011528015137,
121
+ "learning_rate": 0.00018184019370460052,
122
+ "loss": 2.7348,
123
+ "step": 150
124
+ },
125
+ {
126
+ "epoch": 0.39,
127
+ "grad_norm": 1.923202633857727,
128
+ "learning_rate": 0.00018062953995157384,
129
+ "loss": 2.8489,
130
+ "step": 160
131
+ },
132
+ {
133
+ "epoch": 0.41,
134
+ "grad_norm": 2.581446409225464,
135
+ "learning_rate": 0.00017941888619854723,
136
+ "loss": 2.7744,
137
+ "step": 170
138
+ },
139
+ {
140
+ "epoch": 0.44,
141
+ "grad_norm": 1.6987770795822144,
142
+ "learning_rate": 0.00017820823244552058,
143
+ "loss": 2.6428,
144
+ "step": 180
145
+ },
146
+ {
147
+ "epoch": 0.46,
148
+ "grad_norm": 1.9667104482650757,
149
+ "learning_rate": 0.00017699757869249396,
150
+ "loss": 2.6952,
151
+ "step": 190
152
+ },
153
+ {
154
+ "epoch": 0.48,
155
+ "grad_norm": 3.4282023906707764,
156
+ "learning_rate": 0.00017578692493946732,
157
+ "loss": 2.7372,
158
+ "step": 200
159
+ },
160
+ {
161
+ "epoch": 0.48,
162
+ "eval_accuracy": 0.27615780445969124,
163
+ "eval_loss": 2.530054807662964,
164
+ "eval_runtime": 6.2338,
165
+ "eval_samples_per_second": 187.045,
166
+ "eval_steps_per_second": 23.421,
167
+ "step": 200
168
+ },
169
+ {
170
+ "epoch": 0.51,
171
+ "grad_norm": 1.9124583005905151,
172
+ "learning_rate": 0.0001745762711864407,
173
+ "loss": 2.6423,
174
+ "step": 210
175
+ },
176
+ {
177
+ "epoch": 0.53,
178
+ "grad_norm": 2.4269683361053467,
179
+ "learning_rate": 0.00017336561743341405,
180
+ "loss": 2.668,
181
+ "step": 220
182
+ },
183
+ {
184
+ "epoch": 0.56,
185
+ "grad_norm": 1.9838333129882812,
186
+ "learning_rate": 0.0001721549636803874,
187
+ "loss": 2.5786,
188
+ "step": 230
189
+ },
190
+ {
191
+ "epoch": 0.58,
192
+ "grad_norm": 3.200087070465088,
193
+ "learning_rate": 0.0001709443099273608,
194
+ "loss": 2.5938,
195
+ "step": 240
196
+ },
197
+ {
198
+ "epoch": 0.61,
199
+ "grad_norm": 2.93118953704834,
200
+ "learning_rate": 0.00016973365617433414,
201
+ "loss": 2.4526,
202
+ "step": 250
203
+ },
204
+ {
205
+ "epoch": 0.63,
206
+ "grad_norm": 2.555947780609131,
207
+ "learning_rate": 0.00016852300242130752,
208
+ "loss": 2.41,
209
+ "step": 260
210
+ },
211
+ {
212
+ "epoch": 0.65,
213
+ "grad_norm": 2.9446065425872803,
214
+ "learning_rate": 0.00016731234866828088,
215
+ "loss": 2.4537,
216
+ "step": 270
217
+ },
218
+ {
219
+ "epoch": 0.68,
220
+ "grad_norm": 3.393993377685547,
221
+ "learning_rate": 0.00016610169491525423,
222
+ "loss": 2.4256,
223
+ "step": 280
224
+ },
225
+ {
226
+ "epoch": 0.7,
227
+ "grad_norm": 2.721825122833252,
228
+ "learning_rate": 0.00016489104116222762,
229
+ "loss": 2.4719,
230
+ "step": 290
231
+ },
232
+ {
233
+ "epoch": 0.73,
234
+ "grad_norm": 3.2610368728637695,
235
+ "learning_rate": 0.00016368038740920097,
236
+ "loss": 2.4969,
237
+ "step": 300
238
+ },
239
+ {
240
+ "epoch": 0.73,
241
+ "eval_accuracy": 0.3439108061749571,
242
+ "eval_loss": 2.175961971282959,
243
+ "eval_runtime": 6.2327,
244
+ "eval_samples_per_second": 187.079,
245
+ "eval_steps_per_second": 23.425,
246
+ "step": 300
247
+ },
248
+ {
249
+ "epoch": 0.75,
250
+ "grad_norm": 3.067995309829712,
251
+ "learning_rate": 0.00016246973365617435,
252
+ "loss": 2.4904,
253
+ "step": 310
254
+ },
255
+ {
256
+ "epoch": 0.77,
257
+ "grad_norm": 2.7957141399383545,
258
+ "learning_rate": 0.0001612590799031477,
259
+ "loss": 2.3913,
260
+ "step": 320
261
+ },
262
+ {
263
+ "epoch": 0.8,
264
+ "grad_norm": 2.281586170196533,
265
+ "learning_rate": 0.0001600484261501211,
266
+ "loss": 2.1749,
267
+ "step": 330
268
+ },
269
+ {
270
+ "epoch": 0.82,
271
+ "grad_norm": 2.4833972454071045,
272
+ "learning_rate": 0.00015883777239709444,
273
+ "loss": 2.4058,
274
+ "step": 340
275
+ },
276
+ {
277
+ "epoch": 0.85,
278
+ "grad_norm": 2.5052073001861572,
279
+ "learning_rate": 0.0001576271186440678,
280
+ "loss": 2.3236,
281
+ "step": 350
282
+ },
283
+ {
284
+ "epoch": 0.87,
285
+ "grad_norm": 2.479684352874756,
286
+ "learning_rate": 0.00015641646489104115,
287
+ "loss": 2.373,
288
+ "step": 360
289
+ },
290
+ {
291
+ "epoch": 0.9,
292
+ "grad_norm": 3.6352992057800293,
293
+ "learning_rate": 0.00015520581113801453,
294
+ "loss": 2.3282,
295
+ "step": 370
296
+ },
297
+ {
298
+ "epoch": 0.92,
299
+ "grad_norm": 2.748934030532837,
300
+ "learning_rate": 0.00015399515738498791,
301
+ "loss": 2.2062,
302
+ "step": 380
303
+ },
304
+ {
305
+ "epoch": 0.94,
306
+ "grad_norm": 2.0645978450775146,
307
+ "learning_rate": 0.00015278450363196127,
308
+ "loss": 2.1253,
309
+ "step": 390
310
+ },
311
+ {
312
+ "epoch": 0.97,
313
+ "grad_norm": 2.2856009006500244,
314
+ "learning_rate": 0.00015157384987893465,
315
+ "loss": 2.1973,
316
+ "step": 400
317
+ },
318
+ {
319
+ "epoch": 0.97,
320
+ "eval_accuracy": 0.37564322469982847,
321
+ "eval_loss": 2.0102577209472656,
322
+ "eval_runtime": 5.9741,
323
+ "eval_samples_per_second": 195.175,
324
+ "eval_steps_per_second": 24.439,
325
+ "step": 400
326
+ },
327
+ {
328
+ "epoch": 0.99,
329
+ "grad_norm": 2.866960048675537,
330
+ "learning_rate": 0.00015036319612590798,
331
+ "loss": 2.214,
332
+ "step": 410
333
+ },
334
+ {
335
+ "epoch": 1.02,
336
+ "grad_norm": 3.171844482421875,
337
+ "learning_rate": 0.00014915254237288136,
338
+ "loss": 2.0948,
339
+ "step": 420
340
+ },
341
+ {
342
+ "epoch": 1.04,
343
+ "grad_norm": 3.6916253566741943,
344
+ "learning_rate": 0.00014794188861985471,
345
+ "loss": 2.0649,
346
+ "step": 430
347
+ },
348
+ {
349
+ "epoch": 1.07,
350
+ "grad_norm": 2.3281314373016357,
351
+ "learning_rate": 0.0001467312348668281,
352
+ "loss": 2.0633,
353
+ "step": 440
354
+ },
355
+ {
356
+ "epoch": 1.09,
357
+ "grad_norm": 3.370180368423462,
358
+ "learning_rate": 0.00014552058111380148,
359
+ "loss": 1.9949,
360
+ "step": 450
361
+ },
362
+ {
363
+ "epoch": 1.11,
364
+ "grad_norm": 2.5389626026153564,
365
+ "learning_rate": 0.00014430992736077483,
366
+ "loss": 2.086,
367
+ "step": 460
368
+ },
369
+ {
370
+ "epoch": 1.14,
371
+ "grad_norm": 2.47526216506958,
372
+ "learning_rate": 0.00014309927360774819,
373
+ "loss": 2.0443,
374
+ "step": 470
375
+ },
376
+ {
377
+ "epoch": 1.16,
378
+ "grad_norm": 2.821577548980713,
379
+ "learning_rate": 0.00014188861985472154,
380
+ "loss": 2.0808,
381
+ "step": 480
382
+ },
383
+ {
384
+ "epoch": 1.19,
385
+ "grad_norm": 2.978994369506836,
386
+ "learning_rate": 0.00014067796610169492,
387
+ "loss": 2.1278,
388
+ "step": 490
389
+ },
390
+ {
391
+ "epoch": 1.21,
392
+ "grad_norm": 3.1431379318237305,
393
+ "learning_rate": 0.00013946731234866828,
394
+ "loss": 1.8847,
395
+ "step": 500
396
+ },
397
+ {
398
+ "epoch": 1.21,
399
+ "eval_accuracy": 0.41080617495711835,
400
+ "eval_loss": 1.8402307033538818,
401
+ "eval_runtime": 6.2119,
402
+ "eval_samples_per_second": 187.705,
403
+ "eval_steps_per_second": 23.503,
404
+ "step": 500
405
+ },
406
+ {
407
+ "epoch": 1.23,
408
+ "grad_norm": 3.1350502967834473,
409
+ "learning_rate": 0.00013825665859564166,
410
+ "loss": 2.02,
411
+ "step": 510
412
+ },
413
+ {
414
+ "epoch": 1.26,
415
+ "grad_norm": 2.63952374458313,
416
+ "learning_rate": 0.00013704600484261504,
417
+ "loss": 2.1684,
418
+ "step": 520
419
+ },
420
+ {
421
+ "epoch": 1.28,
422
+ "grad_norm": 2.7914199829101562,
423
+ "learning_rate": 0.00013583535108958837,
424
+ "loss": 1.8532,
425
+ "step": 530
426
+ },
427
+ {
428
+ "epoch": 1.31,
429
+ "grad_norm": 4.124698638916016,
430
+ "learning_rate": 0.00013462469733656175,
431
+ "loss": 1.9593,
432
+ "step": 540
433
+ },
434
+ {
435
+ "epoch": 1.33,
436
+ "grad_norm": 3.0953214168548584,
437
+ "learning_rate": 0.0001334140435835351,
438
+ "loss": 2.0143,
439
+ "step": 550
440
+ },
441
+ {
442
+ "epoch": 1.36,
443
+ "grad_norm": 3.626241683959961,
444
+ "learning_rate": 0.00013220338983050849,
445
+ "loss": 2.0349,
446
+ "step": 560
447
+ },
448
+ {
449
+ "epoch": 1.38,
450
+ "grad_norm": 3.22306752204895,
451
+ "learning_rate": 0.00013099273607748184,
452
+ "loss": 1.9283,
453
+ "step": 570
454
+ },
455
+ {
456
+ "epoch": 1.4,
457
+ "grad_norm": 2.6860299110412598,
458
+ "learning_rate": 0.00012978208232445522,
459
+ "loss": 1.9022,
460
+ "step": 580
461
+ },
462
+ {
463
+ "epoch": 1.43,
464
+ "grad_norm": 3.2099533081054688,
465
+ "learning_rate": 0.00012857142857142858,
466
+ "loss": 1.9102,
467
+ "step": 590
468
+ },
469
+ {
470
+ "epoch": 1.45,
471
+ "grad_norm": 2.5889129638671875,
472
+ "learning_rate": 0.00012736077481840193,
473
+ "loss": 1.746,
474
+ "step": 600
475
+ },
476
+ {
477
+ "epoch": 1.45,
478
+ "eval_accuracy": 0.48027444253859347,
479
+ "eval_loss": 1.7051318883895874,
480
+ "eval_runtime": 6.0914,
481
+ "eval_samples_per_second": 191.418,
482
+ "eval_steps_per_second": 23.968,
483
+ "step": 600
484
+ },
485
+ {
486
+ "epoch": 1.48,
487
+ "grad_norm": 2.6496353149414062,
488
+ "learning_rate": 0.0001261501210653753,
489
+ "loss": 1.8087,
490
+ "step": 610
491
+ },
492
+ {
493
+ "epoch": 1.5,
494
+ "grad_norm": 2.2695322036743164,
495
+ "learning_rate": 0.00012493946731234867,
496
+ "loss": 1.9172,
497
+ "step": 620
498
+ },
499
+ {
500
+ "epoch": 1.53,
501
+ "grad_norm": 3.144073724746704,
502
+ "learning_rate": 0.00012372881355932205,
503
+ "loss": 1.8943,
504
+ "step": 630
505
+ },
506
+ {
507
+ "epoch": 1.55,
508
+ "grad_norm": 2.9001333713531494,
509
+ "learning_rate": 0.0001225181598062954,
510
+ "loss": 1.9463,
511
+ "step": 640
512
+ },
513
+ {
514
+ "epoch": 1.57,
515
+ "grad_norm": 2.5096278190612793,
516
+ "learning_rate": 0.00012130750605326877,
517
+ "loss": 1.8045,
518
+ "step": 650
519
+ },
520
+ {
521
+ "epoch": 1.6,
522
+ "grad_norm": 2.2238059043884277,
523
+ "learning_rate": 0.00012009685230024215,
524
+ "loss": 1.9322,
525
+ "step": 660
526
+ },
527
+ {
528
+ "epoch": 1.62,
529
+ "grad_norm": 2.7545368671417236,
530
+ "learning_rate": 0.00011888619854721549,
531
+ "loss": 1.7305,
532
+ "step": 670
533
+ },
534
+ {
535
+ "epoch": 1.65,
536
+ "grad_norm": 2.8309366703033447,
537
+ "learning_rate": 0.00011767554479418887,
538
+ "loss": 1.8587,
539
+ "step": 680
540
+ },
541
+ {
542
+ "epoch": 1.67,
543
+ "grad_norm": 5.093832492828369,
544
+ "learning_rate": 0.00011646489104116223,
545
+ "loss": 1.8362,
546
+ "step": 690
547
+ },
548
+ {
549
+ "epoch": 1.69,
550
+ "grad_norm": 2.4374847412109375,
551
+ "learning_rate": 0.0001152542372881356,
552
+ "loss": 1.8698,
553
+ "step": 700
554
+ },
555
+ {
556
+ "epoch": 1.69,
557
+ "eval_accuracy": 0.4888507718696398,
558
+ "eval_loss": 1.5985045433044434,
559
+ "eval_runtime": 6.4332,
560
+ "eval_samples_per_second": 181.249,
561
+ "eval_steps_per_second": 22.695,
562
+ "step": 700
563
+ },
564
+ {
565
+ "epoch": 1.72,
566
+ "grad_norm": 2.8519837856292725,
567
+ "learning_rate": 0.00011404358353510895,
568
+ "loss": 1.8736,
569
+ "step": 710
570
+ },
571
+ {
572
+ "epoch": 1.74,
573
+ "grad_norm": 2.8379719257354736,
574
+ "learning_rate": 0.00011283292978208233,
575
+ "loss": 1.6395,
576
+ "step": 720
577
+ },
578
+ {
579
+ "epoch": 1.77,
580
+ "grad_norm": 3.884648323059082,
581
+ "learning_rate": 0.00011174334140435836,
582
+ "loss": 1.7938,
583
+ "step": 730
584
+ },
585
+ {
586
+ "epoch": 1.79,
587
+ "grad_norm": 3.2592883110046387,
588
+ "learning_rate": 0.00011053268765133173,
589
+ "loss": 1.6813,
590
+ "step": 740
591
+ },
592
+ {
593
+ "epoch": 1.82,
594
+ "grad_norm": 5.118261337280273,
595
+ "learning_rate": 0.00010932203389830508,
596
+ "loss": 1.9414,
597
+ "step": 750
598
+ },
599
+ {
600
+ "epoch": 1.84,
601
+ "grad_norm": 2.822026491165161,
602
+ "learning_rate": 0.00010811138014527846,
603
+ "loss": 1.7598,
604
+ "step": 760
605
+ },
606
+ {
607
+ "epoch": 1.86,
608
+ "grad_norm": 2.8540070056915283,
609
+ "learning_rate": 0.00010690072639225182,
610
+ "loss": 1.7024,
611
+ "step": 770
612
+ },
613
+ {
614
+ "epoch": 1.89,
615
+ "grad_norm": 4.354470252990723,
616
+ "learning_rate": 0.00010569007263922519,
617
+ "loss": 1.8987,
618
+ "step": 780
619
+ },
620
+ {
621
+ "epoch": 1.91,
622
+ "grad_norm": 3.528857707977295,
623
+ "learning_rate": 0.00010447941888619854,
624
+ "loss": 1.7933,
625
+ "step": 790
626
+ },
627
+ {
628
+ "epoch": 1.94,
629
+ "grad_norm": 2.76985764503479,
630
+ "learning_rate": 0.00010326876513317192,
631
+ "loss": 1.7261,
632
+ "step": 800
633
+ },
634
+ {
635
+ "epoch": 1.94,
636
+ "eval_accuracy": 0.5840480274442539,
637
+ "eval_loss": 1.4311938285827637,
638
+ "eval_runtime": 6.2955,
639
+ "eval_samples_per_second": 185.213,
640
+ "eval_steps_per_second": 23.191,
641
+ "step": 800
642
+ },
643
+ {
644
+ "epoch": 1.96,
645
+ "grad_norm": 3.15104079246521,
646
+ "learning_rate": 0.00010205811138014529,
647
+ "loss": 1.8079,
648
+ "step": 810
649
+ },
650
+ {
651
+ "epoch": 1.99,
652
+ "grad_norm": 3.0211942195892334,
653
+ "learning_rate": 0.00010084745762711865,
654
+ "loss": 1.611,
655
+ "step": 820
656
+ },
657
+ {
658
+ "epoch": 2.01,
659
+ "grad_norm": 2.527198076248169,
660
+ "learning_rate": 9.963680387409201e-05,
661
+ "loss": 1.7344,
662
+ "step": 830
663
+ },
664
+ {
665
+ "epoch": 2.03,
666
+ "grad_norm": 3.654705762863159,
667
+ "learning_rate": 9.842615012106537e-05,
668
+ "loss": 1.5921,
669
+ "step": 840
670
+ },
671
+ {
672
+ "epoch": 2.06,
673
+ "grad_norm": 2.6901042461395264,
674
+ "learning_rate": 9.721549636803875e-05,
675
+ "loss": 1.5688,
676
+ "step": 850
677
+ },
678
+ {
679
+ "epoch": 2.08,
680
+ "grad_norm": 2.830200672149658,
681
+ "learning_rate": 9.600484261501212e-05,
682
+ "loss": 1.5546,
683
+ "step": 860
684
+ },
685
+ {
686
+ "epoch": 2.11,
687
+ "grad_norm": 3.2287344932556152,
688
+ "learning_rate": 9.479418886198547e-05,
689
+ "loss": 1.5714,
690
+ "step": 870
691
+ },
692
+ {
693
+ "epoch": 2.13,
694
+ "grad_norm": 3.661449432373047,
695
+ "learning_rate": 9.358353510895884e-05,
696
+ "loss": 1.4973,
697
+ "step": 880
698
+ },
699
+ {
700
+ "epoch": 2.15,
701
+ "grad_norm": 6.353243827819824,
702
+ "learning_rate": 9.237288135593221e-05,
703
+ "loss": 1.5294,
704
+ "step": 890
705
+ },
706
+ {
707
+ "epoch": 2.18,
708
+ "grad_norm": 3.703733444213867,
709
+ "learning_rate": 9.116222760290558e-05,
710
+ "loss": 1.7385,
711
+ "step": 900
712
+ },
713
+ {
714
+ "epoch": 2.18,
715
+ "eval_accuracy": 0.6286449399656947,
716
+ "eval_loss": 1.3585376739501953,
717
+ "eval_runtime": 5.9781,
718
+ "eval_samples_per_second": 195.046,
719
+ "eval_steps_per_second": 24.423,
720
+ "step": 900
721
+ },
722
+ {
723
+ "epoch": 2.2,
724
+ "grad_norm": 2.730365514755249,
725
+ "learning_rate": 8.995157384987893e-05,
726
+ "loss": 1.626,
727
+ "step": 910
728
+ },
729
+ {
730
+ "epoch": 2.23,
731
+ "grad_norm": 4.335669040679932,
732
+ "learning_rate": 8.874092009685231e-05,
733
+ "loss": 1.5823,
734
+ "step": 920
735
+ },
736
+ {
737
+ "epoch": 2.25,
738
+ "grad_norm": 2.272915840148926,
739
+ "learning_rate": 8.753026634382567e-05,
740
+ "loss": 1.47,
741
+ "step": 930
742
+ },
743
+ {
744
+ "epoch": 2.28,
745
+ "grad_norm": 3.335453510284424,
746
+ "learning_rate": 8.631961259079904e-05,
747
+ "loss": 1.4733,
748
+ "step": 940
749
+ },
750
+ {
751
+ "epoch": 2.3,
752
+ "grad_norm": 5.18184232711792,
753
+ "learning_rate": 8.51089588377724e-05,
754
+ "loss": 1.3798,
755
+ "step": 950
756
+ },
757
+ {
758
+ "epoch": 2.32,
759
+ "grad_norm": 3.79761004447937,
760
+ "learning_rate": 8.389830508474577e-05,
761
+ "loss": 1.5103,
762
+ "step": 960
763
+ },
764
+ {
765
+ "epoch": 2.35,
766
+ "grad_norm": 2.568056344985962,
767
+ "learning_rate": 8.268765133171913e-05,
768
+ "loss": 1.5016,
769
+ "step": 970
770
+ },
771
+ {
772
+ "epoch": 2.37,
773
+ "grad_norm": 4.231459140777588,
774
+ "learning_rate": 8.14769975786925e-05,
775
+ "loss": 1.4617,
776
+ "step": 980
777
+ },
778
+ {
779
+ "epoch": 2.4,
780
+ "grad_norm": 3.2914044857025146,
781
+ "learning_rate": 8.026634382566586e-05,
782
+ "loss": 1.5527,
783
+ "step": 990
784
+ },
785
+ {
786
+ "epoch": 2.42,
787
+ "grad_norm": 2.967702627182007,
788
+ "learning_rate": 7.905569007263923e-05,
789
+ "loss": 1.5873,
790
+ "step": 1000
791
+ },
792
+ {
793
+ "epoch": 2.42,
794
+ "eval_accuracy": 0.6758147512864494,
795
+ "eval_loss": 1.2374264001846313,
796
+ "eval_runtime": 6.2974,
797
+ "eval_samples_per_second": 185.155,
798
+ "eval_steps_per_second": 23.184,
799
+ "step": 1000
800
+ },
801
+ {
802
+ "epoch": 2.45,
803
+ "grad_norm": 2.7834739685058594,
804
+ "learning_rate": 7.78450363196126e-05,
805
+ "loss": 1.4255,
806
+ "step": 1010
807
+ },
808
+ {
809
+ "epoch": 2.47,
810
+ "grad_norm": 3.380810260772705,
811
+ "learning_rate": 7.663438256658597e-05,
812
+ "loss": 1.4528,
813
+ "step": 1020
814
+ },
815
+ {
816
+ "epoch": 2.49,
817
+ "grad_norm": 3.3973748683929443,
818
+ "learning_rate": 7.542372881355932e-05,
819
+ "loss": 1.5726,
820
+ "step": 1030
821
+ },
822
+ {
823
+ "epoch": 2.52,
824
+ "grad_norm": 2.9069502353668213,
825
+ "learning_rate": 7.421307506053269e-05,
826
+ "loss": 1.2987,
827
+ "step": 1040
828
+ },
829
+ {
830
+ "epoch": 2.54,
831
+ "grad_norm": 2.8832297325134277,
832
+ "learning_rate": 7.300242130750606e-05,
833
+ "loss": 1.437,
834
+ "step": 1050
835
+ },
836
+ {
837
+ "epoch": 2.57,
838
+ "grad_norm": 3.137310743331909,
839
+ "learning_rate": 7.179176755447942e-05,
840
+ "loss": 1.5,
841
+ "step": 1060
842
+ },
843
+ {
844
+ "epoch": 2.59,
845
+ "grad_norm": 3.156430244445801,
846
+ "learning_rate": 7.058111380145279e-05,
847
+ "loss": 1.341,
848
+ "step": 1070
849
+ },
850
+ {
851
+ "epoch": 2.62,
852
+ "grad_norm": 3.470303535461426,
853
+ "learning_rate": 6.937046004842616e-05,
854
+ "loss": 1.3986,
855
+ "step": 1080
856
+ },
857
+ {
858
+ "epoch": 2.64,
859
+ "grad_norm": 3.426010847091675,
860
+ "learning_rate": 6.815980629539952e-05,
861
+ "loss": 1.3874,
862
+ "step": 1090
863
+ },
864
+ {
865
+ "epoch": 2.66,
866
+ "grad_norm": 3.8181042671203613,
867
+ "learning_rate": 6.694915254237288e-05,
868
+ "loss": 1.4775,
869
+ "step": 1100
870
+ },
871
+ {
872
+ "epoch": 2.66,
873
+ "eval_accuracy": 0.7024013722126929,
874
+ "eval_loss": 1.1351556777954102,
875
+ "eval_runtime": 6.2887,
876
+ "eval_samples_per_second": 185.412,
877
+ "eval_steps_per_second": 23.216,
878
+ "step": 1100
879
+ },
880
+ {
881
+ "epoch": 2.69,
882
+ "grad_norm": 3.4228086471557617,
883
+ "learning_rate": 6.573849878934625e-05,
884
+ "loss": 1.4804,
885
+ "step": 1110
886
+ },
887
+ {
888
+ "epoch": 2.71,
889
+ "grad_norm": 4.945833206176758,
890
+ "learning_rate": 6.45278450363196e-05,
891
+ "loss": 1.2617,
892
+ "step": 1120
893
+ },
894
+ {
895
+ "epoch": 2.74,
896
+ "grad_norm": 2.712095022201538,
897
+ "learning_rate": 6.331719128329297e-05,
898
+ "loss": 1.4254,
899
+ "step": 1130
900
+ },
901
+ {
902
+ "epoch": 2.76,
903
+ "grad_norm": 3.2312748432159424,
904
+ "learning_rate": 6.210653753026636e-05,
905
+ "loss": 1.4141,
906
+ "step": 1140
907
+ },
908
+ {
909
+ "epoch": 2.78,
910
+ "grad_norm": 2.4630300998687744,
911
+ "learning_rate": 6.089588377723972e-05,
912
+ "loss": 1.3438,
913
+ "step": 1150
914
+ },
915
+ {
916
+ "epoch": 2.81,
917
+ "grad_norm": 2.9009976387023926,
918
+ "learning_rate": 5.968523002421308e-05,
919
+ "loss": 1.3625,
920
+ "step": 1160
921
+ },
922
+ {
923
+ "epoch": 2.83,
924
+ "grad_norm": 5.364362716674805,
925
+ "learning_rate": 5.8474576271186446e-05,
926
+ "loss": 1.4056,
927
+ "step": 1170
928
+ },
929
+ {
930
+ "epoch": 2.86,
931
+ "grad_norm": 3.0310747623443604,
932
+ "learning_rate": 5.726392251815981e-05,
933
+ "loss": 1.2943,
934
+ "step": 1180
935
+ },
936
+ {
937
+ "epoch": 2.88,
938
+ "grad_norm": 2.7472984790802,
939
+ "learning_rate": 5.605326876513317e-05,
940
+ "loss": 1.4934,
941
+ "step": 1190
942
+ },
943
+ {
944
+ "epoch": 2.91,
945
+ "grad_norm": 2.9528918266296387,
946
+ "learning_rate": 5.484261501210654e-05,
947
+ "loss": 1.2697,
948
+ "step": 1200
949
+ },
950
+ {
951
+ "epoch": 2.91,
952
+ "eval_accuracy": 0.70926243567753,
953
+ "eval_loss": 1.104396104812622,
954
+ "eval_runtime": 6.0071,
955
+ "eval_samples_per_second": 194.105,
956
+ "eval_steps_per_second": 24.305,
957
+ "step": 1200
958
+ },
959
+ {
960
+ "epoch": 2.93,
961
+ "grad_norm": 2.5816805362701416,
962
+ "learning_rate": 5.363196125907991e-05,
963
+ "loss": 1.3362,
964
+ "step": 1210
965
+ },
966
+ {
967
+ "epoch": 2.95,
968
+ "grad_norm": 3.5116188526153564,
969
+ "learning_rate": 5.242130750605327e-05,
970
+ "loss": 1.3128,
971
+ "step": 1220
972
+ },
973
+ {
974
+ "epoch": 2.98,
975
+ "grad_norm": 2.873042583465576,
976
+ "learning_rate": 5.121065375302664e-05,
977
+ "loss": 1.3257,
978
+ "step": 1230
979
+ },
980
+ {
981
+ "epoch": 3.0,
982
+ "grad_norm": 6.232132434844971,
983
+ "learning_rate": 5e-05,
984
+ "loss": 1.256,
985
+ "step": 1240
986
+ },
987
+ {
988
+ "epoch": 3.03,
989
+ "grad_norm": 2.3054957389831543,
990
+ "learning_rate": 4.8789346246973364e-05,
991
+ "loss": 1.1805,
992
+ "step": 1250
993
+ },
994
+ {
995
+ "epoch": 3.05,
996
+ "grad_norm": 3.0687952041625977,
997
+ "learning_rate": 4.757869249394674e-05,
998
+ "loss": 1.0767,
999
+ "step": 1260
1000
+ },
1001
+ {
1002
+ "epoch": 3.08,
1003
+ "grad_norm": 3.774822235107422,
1004
+ "learning_rate": 4.63680387409201e-05,
1005
+ "loss": 1.311,
1006
+ "step": 1270
1007
+ },
1008
+ {
1009
+ "epoch": 3.1,
1010
+ "grad_norm": 4.785544395446777,
1011
+ "learning_rate": 4.515738498789346e-05,
1012
+ "loss": 1.2997,
1013
+ "step": 1280
1014
+ },
1015
+ {
1016
+ "epoch": 3.12,
1017
+ "grad_norm": 3.4525294303894043,
1018
+ "learning_rate": 4.394673123486683e-05,
1019
+ "loss": 1.2039,
1020
+ "step": 1290
1021
+ },
1022
+ {
1023
+ "epoch": 3.15,
1024
+ "grad_norm": 3.312502861022949,
1025
+ "learning_rate": 4.27360774818402e-05,
1026
+ "loss": 1.2137,
1027
+ "step": 1300
1028
+ },
1029
+ {
1030
+ "epoch": 3.15,
1031
+ "eval_accuracy": 0.7615780445969125,
1032
+ "eval_loss": 1.0005759000778198,
1033
+ "eval_runtime": 6.3563,
1034
+ "eval_samples_per_second": 183.44,
1035
+ "eval_steps_per_second": 22.969,
1036
+ "step": 1300
1037
+ },
1038
+ {
1039
+ "epoch": 3.17,
1040
+ "grad_norm": 3.375433921813965,
1041
+ "learning_rate": 4.152542372881356e-05,
1042
+ "loss": 1.2714,
1043
+ "step": 1310
1044
+ },
1045
+ {
1046
+ "epoch": 3.2,
1047
+ "grad_norm": 2.5909006595611572,
1048
+ "learning_rate": 4.0314769975786926e-05,
1049
+ "loss": 1.3154,
1050
+ "step": 1320
1051
+ },
1052
+ {
1053
+ "epoch": 3.22,
1054
+ "grad_norm": 3.0990185737609863,
1055
+ "learning_rate": 3.910411622276029e-05,
1056
+ "loss": 1.144,
1057
+ "step": 1330
1058
+ },
1059
+ {
1060
+ "epoch": 3.24,
1061
+ "grad_norm": 1.911260962486267,
1062
+ "learning_rate": 3.789346246973366e-05,
1063
+ "loss": 1.0008,
1064
+ "step": 1340
1065
+ },
1066
+ {
1067
+ "epoch": 3.27,
1068
+ "grad_norm": 2.93192458152771,
1069
+ "learning_rate": 3.6682808716707024e-05,
1070
+ "loss": 1.0603,
1071
+ "step": 1350
1072
+ },
1073
+ {
1074
+ "epoch": 3.29,
1075
+ "grad_norm": 3.3576924800872803,
1076
+ "learning_rate": 3.5472154963680385e-05,
1077
+ "loss": 1.2791,
1078
+ "step": 1360
1079
+ },
1080
+ {
1081
+ "epoch": 3.32,
1082
+ "grad_norm": 2.8567686080932617,
1083
+ "learning_rate": 3.426150121065376e-05,
1084
+ "loss": 1.1223,
1085
+ "step": 1370
1086
+ },
1087
+ {
1088
+ "epoch": 3.34,
1089
+ "grad_norm": 2.735358953475952,
1090
+ "learning_rate": 3.305084745762712e-05,
1091
+ "loss": 1.2043,
1092
+ "step": 1380
1093
+ },
1094
+ {
1095
+ "epoch": 3.37,
1096
+ "grad_norm": 3.374582529067993,
1097
+ "learning_rate": 3.184019370460048e-05,
1098
+ "loss": 1.0495,
1099
+ "step": 1390
1100
+ },
1101
+ {
1102
+ "epoch": 3.39,
1103
+ "grad_norm": 4.9084792137146,
1104
+ "learning_rate": 3.062953995157385e-05,
1105
+ "loss": 1.423,
1106
+ "step": 1400
1107
+ },
1108
+ {
1109
+ "epoch": 3.39,
1110
+ "eval_accuracy": 0.774442538593482,
1111
+ "eval_loss": 0.9588848352432251,
1112
+ "eval_runtime": 6.2494,
1113
+ "eval_samples_per_second": 186.579,
1114
+ "eval_steps_per_second": 23.362,
1115
+ "step": 1400
1116
+ },
1117
+ {
1118
+ "epoch": 3.41,
1119
+ "grad_norm": 4.47416353225708,
1120
+ "learning_rate": 2.941888619854722e-05,
1121
+ "loss": 1.2965,
1122
+ "step": 1410
1123
+ },
1124
+ {
1125
+ "epoch": 3.44,
1126
+ "grad_norm": 2.692729949951172,
1127
+ "learning_rate": 2.8208232445520583e-05,
1128
+ "loss": 1.1812,
1129
+ "step": 1420
1130
+ },
1131
+ {
1132
+ "epoch": 3.46,
1133
+ "grad_norm": 3.5278244018554688,
1134
+ "learning_rate": 2.6997578692493948e-05,
1135
+ "loss": 1.2515,
1136
+ "step": 1430
1137
+ },
1138
+ {
1139
+ "epoch": 3.49,
1140
+ "grad_norm": 2.9056203365325928,
1141
+ "learning_rate": 2.5786924939467316e-05,
1142
+ "loss": 1.0617,
1143
+ "step": 1440
1144
+ },
1145
+ {
1146
+ "epoch": 3.51,
1147
+ "grad_norm": 2.6366896629333496,
1148
+ "learning_rate": 2.457627118644068e-05,
1149
+ "loss": 1.0449,
1150
+ "step": 1450
1151
+ },
1152
+ {
1153
+ "epoch": 3.54,
1154
+ "grad_norm": 3.593003034591675,
1155
+ "learning_rate": 2.3365617433414045e-05,
1156
+ "loss": 1.1273,
1157
+ "step": 1460
1158
+ },
1159
+ {
1160
+ "epoch": 3.56,
1161
+ "grad_norm": 3.5506863594055176,
1162
+ "learning_rate": 2.215496368038741e-05,
1163
+ "loss": 1.2625,
1164
+ "step": 1470
1165
+ },
1166
+ {
1167
+ "epoch": 3.58,
1168
+ "grad_norm": 4.686192989349365,
1169
+ "learning_rate": 2.0944309927360775e-05,
1170
+ "loss": 1.1439,
1171
+ "step": 1480
1172
+ },
1173
+ {
1174
+ "epoch": 3.61,
1175
+ "grad_norm": 3.072838068008423,
1176
+ "learning_rate": 1.9733656174334143e-05,
1177
+ "loss": 1.2008,
1178
+ "step": 1490
1179
+ },
1180
+ {
1181
+ "epoch": 3.63,
1182
+ "grad_norm": 4.130647659301758,
1183
+ "learning_rate": 1.8523002421307507e-05,
1184
+ "loss": 1.0098,
1185
+ "step": 1500
1186
+ },
1187
+ {
1188
+ "epoch": 3.63,
1189
+ "eval_accuracy": 0.7684391080617495,
1190
+ "eval_loss": 0.9360153675079346,
1191
+ "eval_runtime": 5.9954,
1192
+ "eval_samples_per_second": 194.481,
1193
+ "eval_steps_per_second": 24.352,
1194
+ "step": 1500
1195
+ },
1196
+ {
1197
+ "epoch": 3.66,
1198
+ "grad_norm": 2.432633638381958,
1199
+ "learning_rate": 1.7312348668280872e-05,
1200
+ "loss": 1.0802,
1201
+ "step": 1510
1202
+ },
1203
+ {
1204
+ "epoch": 3.68,
1205
+ "grad_norm": 3.6661131381988525,
1206
+ "learning_rate": 1.6101694915254237e-05,
1207
+ "loss": 1.0655,
1208
+ "step": 1520
1209
+ },
1210
+ {
1211
+ "epoch": 3.7,
1212
+ "grad_norm": 3.967733860015869,
1213
+ "learning_rate": 1.4891041162227603e-05,
1214
+ "loss": 1.1482,
1215
+ "step": 1530
1216
+ },
1217
+ {
1218
+ "epoch": 3.73,
1219
+ "grad_norm": 3.776456832885742,
1220
+ "learning_rate": 1.3680387409200971e-05,
1221
+ "loss": 1.2236,
1222
+ "step": 1540
1223
+ },
1224
+ {
1225
+ "epoch": 3.75,
1226
+ "grad_norm": 3.1570096015930176,
1227
+ "learning_rate": 1.2469733656174334e-05,
1228
+ "loss": 1.0433,
1229
+ "step": 1550
1230
+ },
1231
+ {
1232
+ "epoch": 3.78,
1233
+ "grad_norm": 3.3112399578094482,
1234
+ "learning_rate": 1.12590799031477e-05,
1235
+ "loss": 1.1766,
1236
+ "step": 1560
1237
+ },
1238
+ {
1239
+ "epoch": 3.8,
1240
+ "grad_norm": 3.405649185180664,
1241
+ "learning_rate": 1.0048426150121065e-05,
1242
+ "loss": 1.1755,
1243
+ "step": 1570
1244
+ },
1245
+ {
1246
+ "epoch": 3.83,
1247
+ "grad_norm": 2.6833651065826416,
1248
+ "learning_rate": 8.837772397094432e-06,
1249
+ "loss": 1.0593,
1250
+ "step": 1580
1251
+ },
1252
+ {
1253
+ "epoch": 3.85,
1254
+ "grad_norm": 3.3236443996429443,
1255
+ "learning_rate": 7.627118644067798e-06,
1256
+ "loss": 1.1001,
1257
+ "step": 1590
1258
+ },
1259
+ {
1260
+ "epoch": 3.87,
1261
+ "grad_norm": 3.5733933448791504,
1262
+ "learning_rate": 6.4164648910411625e-06,
1263
+ "loss": 1.1325,
1264
+ "step": 1600
1265
+ },
1266
+ {
1267
+ "epoch": 3.87,
1268
+ "eval_accuracy": 0.7710120068610634,
1269
+ "eval_loss": 0.912144124507904,
1270
+ "eval_runtime": 6.3188,
1271
+ "eval_samples_per_second": 184.528,
1272
+ "eval_steps_per_second": 23.106,
1273
+ "step": 1600
1274
+ },
1275
+ {
1276
+ "epoch": 3.9,
1277
+ "grad_norm": 3.3235766887664795,
1278
+ "learning_rate": 5.205811138014528e-06,
1279
+ "loss": 1.1434,
1280
+ "step": 1610
1281
+ },
1282
+ {
1283
+ "epoch": 3.92,
1284
+ "grad_norm": 5.47670841217041,
1285
+ "learning_rate": 3.9951573849878936e-06,
1286
+ "loss": 1.0415,
1287
+ "step": 1620
1288
+ },
1289
+ {
1290
+ "epoch": 3.95,
1291
+ "grad_norm": 2.83181095123291,
1292
+ "learning_rate": 2.784503631961259e-06,
1293
+ "loss": 1.0888,
1294
+ "step": 1630
1295
+ },
1296
+ {
1297
+ "epoch": 3.97,
1298
+ "grad_norm": 4.57571268081665,
1299
+ "learning_rate": 1.5738498789346248e-06,
1300
+ "loss": 1.0908,
1301
+ "step": 1640
1302
+ },
1303
+ {
1304
+ "epoch": 4.0,
1305
+ "grad_norm": 3.1416895389556885,
1306
+ "learning_rate": 3.6319612590799036e-07,
1307
+ "loss": 0.9855,
1308
+ "step": 1650
1309
+ },
1310
+ {
1311
+ "epoch": 4.0,
1312
+ "step": 1652,
1313
+ "total_flos": 2.047635634195759e+18,
1314
+ "train_loss": 1.8191430680543978,
1315
+ "train_runtime": 594.0822,
1316
+ "train_samples_per_second": 44.458,
1317
+ "train_steps_per_second": 2.781
1318
+ }
1319
+ ],
1320
+ "logging_steps": 10,
1321
+ "max_steps": 1652,
1322
+ "num_input_tokens_seen": 0,
1323
+ "num_train_epochs": 4,
1324
+ "save_steps": 100,
1325
+ "total_flos": 2.047635634195759e+18,
1326
+ "train_batch_size": 16,
1327
+ "trial_name": null,
1328
+ "trial_params": null
1329
+ }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cbb4d0502cb6aa8e763c1e6b3bea2a272af2e0cd58d51af575190057bab553e7
+ oid sha256:52755d98ad2bd9ec55bf7137c74905f11d5f04d9a40c982b4c6e4d07c6bb986d
  size 4920
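Finally, a minimal sketch of loading the checkpoint saved by this commit for inference. The repo id below is an assumption pieced together from the committer name and the `finetuned-cards-blackjack` output directory seen in `trainer_state.json`; substitute the actual model id.

```python
from transformers import pipeline

# Image-classification inference with the fine-tuned ViT saved in this commit.
# NOTE: the model id is an assumption (committer name + output_dir), not confirmed by the diff.
classifier = pipeline(
    task="image-classification",
    model="Mariofm02/finetuned-cards-blackjack",
)

predictions = classifier("playing_card.jpg")   # any local image path or PIL.Image works
for p in predictions[:3]:
    print(p["label"], round(p["score"], 3))
```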