thomnis commited on
Commit
53f7683
·
verified ·
1 Parent(s): e4a7887

Training in progress, step 3180

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f616f630746d04c9175206b323f3efc1fba028a9d9ae24628f0eb46c4b4f6d09
3
  size 268290900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3af5b47ba404b3ef2f87defb7e11fd5376d55cadf07c6b8c59e1c82fd72748ba
3
  size 268290900
run-0/checkpoint-4770/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:65e5543a4e295e69fdfb44843883da15cfd3b9bf17821bac59341fd39ee8bde7
3
  size 268290900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc29d6af29939b29f91bdbace98d3cdc3a61f370227ddd284844628979bfd7c2
3
  size 268290900
run-0/checkpoint-4770/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f9b6945fd2fa9e8dbe19a9f55c88b893d51903ad9dfef37fcbdf9ed901342acf
3
  size 536643898
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1df0e8342d5f260a38e69245d8f2ffc780a567abe7bc1be4281fa5a82d7b9111
3
  size 536643898
run-0/checkpoint-4770/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7f8801c763f9120e5f53ce15acda99946f2cd7188301cb6528dceee25f4d1655
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8840c149b61925bd5e83fa32c5310aec7035d57589d5f1f11c8325abd3ac0bf
3
  size 14244
run-0/checkpoint-4770/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b7cafc595af315fbc075588bd6ada31386ccf077c11a4eaa4be1c5fa21a079e9
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d06c54e3d86a082332e59d2e96766e07aa99bd4ff2d81ee9db09bd2d10dd61b
3
  size 1064
run-0/checkpoint-4770/trainer_state.json CHANGED
@@ -10,233 +10,233 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.9968553459119497,
13
- "grad_norm": 0.8087115287780762,
14
- "learning_rate": 1.916642112888053e-05,
15
- "loss": 0.6426,
16
  "step": 317
17
  },
18
  {
19
  "epoch": 1.0,
20
- "eval_accuracy": 0.4332258064516129,
21
- "eval_loss": 0.3768153786659241,
22
- "eval_runtime": 2.7098,
23
- "eval_samples_per_second": 1144.007,
24
- "eval_steps_per_second": 23.987,
25
  "step": 318
26
  },
27
  {
28
  "epoch": 1.9937106918238994,
29
- "grad_norm": 0.5955724120140076,
30
- "learning_rate": 2.817448208700141e-05,
31
- "loss": 0.2402,
32
  "step": 634
33
  },
34
  {
35
  "epoch": 2.0,
36
- "eval_accuracy": 0.8554838709677419,
37
- "eval_loss": 0.09723836928606033,
38
- "eval_runtime": 2.6882,
39
- "eval_samples_per_second": 1153.187,
40
- "eval_steps_per_second": 24.18,
41
  "step": 636
42
  },
43
  {
44
  "epoch": 2.990566037735849,
45
- "grad_norm": 0.3765904903411865,
46
- "learning_rate": 2.6015074248128236e-05,
47
- "loss": 0.0949,
48
  "step": 951
49
  },
50
  {
51
  "epoch": 3.0,
52
- "eval_accuracy": 0.9080645161290323,
53
- "eval_loss": 0.04660297930240631,
54
- "eval_runtime": 2.6607,
55
- "eval_samples_per_second": 1165.117,
56
- "eval_steps_per_second": 24.43,
57
  "step": 954
58
  },
59
  {
60
  "epoch": 3.9874213836477987,
61
- "grad_norm": 0.26715776324272156,
62
- "learning_rate": 2.385566640925506e-05,
63
- "loss": 0.0599,
64
  "step": 1268
65
  },
66
  {
67
  "epoch": 4.0,
68
- "eval_accuracy": 0.9283870967741935,
69
- "eval_loss": 0.03252074867486954,
70
- "eval_runtime": 2.6662,
71
- "eval_samples_per_second": 1162.716,
72
- "eval_steps_per_second": 24.38,
73
  "step": 1272
74
  },
75
  {
76
  "epoch": 4.984276729559748,
77
- "grad_norm": 0.20171727240085602,
78
- "learning_rate": 2.1696258570381886e-05,
79
- "loss": 0.0462,
80
  "step": 1585
81
  },
82
  {
83
  "epoch": 5.0,
84
- "eval_accuracy": 0.9309677419354838,
85
- "eval_loss": 0.026936793699860573,
86
- "eval_runtime": 2.6975,
87
- "eval_samples_per_second": 1149.215,
88
- "eval_steps_per_second": 24.096,
89
  "step": 1590
90
  },
91
  {
92
  "epoch": 5.981132075471698,
93
- "grad_norm": 0.21952152252197266,
94
- "learning_rate": 1.9536850731508715e-05,
95
- "loss": 0.0395,
96
  "step": 1902
97
  },
98
  {
99
  "epoch": 6.0,
100
- "eval_accuracy": 0.9345161290322581,
101
- "eval_loss": 0.023858336731791496,
102
- "eval_runtime": 2.6565,
103
- "eval_samples_per_second": 1166.971,
104
- "eval_steps_per_second": 24.469,
105
  "step": 1908
106
  },
107
  {
108
  "epoch": 6.977987421383648,
109
- "grad_norm": 0.30627548694610596,
110
- "learning_rate": 1.737744289263554e-05,
111
- "loss": 0.0356,
112
  "step": 2219
113
  },
114
  {
115
  "epoch": 7.0,
116
- "eval_accuracy": 0.9354838709677419,
117
- "eval_loss": 0.022071754559874535,
118
- "eval_runtime": 2.6587,
119
- "eval_samples_per_second": 1165.985,
120
- "eval_steps_per_second": 24.448,
121
  "step": 2226
122
  },
123
  {
124
  "epoch": 7.9748427672955975,
125
- "grad_norm": 0.16566617786884308,
126
- "learning_rate": 1.5218035053762365e-05,
127
- "loss": 0.0328,
128
  "step": 2536
129
  },
130
  {
131
  "epoch": 8.0,
132
- "eval_accuracy": 0.9370967741935484,
133
- "eval_loss": 0.021090181544423103,
134
- "eval_runtime": 2.6745,
135
- "eval_samples_per_second": 1159.074,
136
- "eval_steps_per_second": 24.303,
137
  "step": 2544
138
  },
139
  {
140
  "epoch": 8.971698113207546,
141
- "grad_norm": 0.16504672169685364,
142
- "learning_rate": 1.3058627214889192e-05,
143
- "loss": 0.0308,
144
  "step": 2853
145
  },
146
  {
147
  "epoch": 9.0,
148
- "eval_accuracy": 0.9348387096774193,
149
- "eval_loss": 0.019647156819701195,
150
- "eval_runtime": 2.6634,
151
- "eval_samples_per_second": 1163.916,
152
- "eval_steps_per_second": 24.405,
153
  "step": 2862
154
  },
155
  {
156
  "epoch": 9.968553459119496,
157
- "grad_norm": 0.1753920316696167,
158
- "learning_rate": 1.0899219376016019e-05,
159
- "loss": 0.0293,
160
  "step": 3170
161
  },
162
  {
163
  "epoch": 10.0,
164
- "eval_accuracy": 0.9367741935483871,
165
- "eval_loss": 0.019323358312249184,
166
- "eval_runtime": 2.6766,
167
- "eval_samples_per_second": 1158.172,
168
- "eval_steps_per_second": 24.284,
169
  "step": 3180
170
  },
171
  {
172
  "epoch": 10.965408805031446,
173
- "grad_norm": 0.16472382843494415,
174
- "learning_rate": 8.739811537142844e-06,
175
- "loss": 0.028,
176
  "step": 3487
177
  },
178
  {
179
  "epoch": 11.0,
180
- "eval_accuracy": 0.9338709677419355,
181
- "eval_loss": 0.019101083278656006,
182
- "eval_runtime": 2.6594,
183
- "eval_samples_per_second": 1165.659,
184
- "eval_steps_per_second": 24.441,
185
  "step": 3498
186
  },
187
  {
188
  "epoch": 11.962264150943396,
189
- "grad_norm": 0.1276603639125824,
190
- "learning_rate": 6.580403698269671e-06,
191
- "loss": 0.0271,
192
  "step": 3804
193
  },
194
  {
195
  "epoch": 12.0,
196
- "eval_accuracy": 0.9341935483870968,
197
- "eval_loss": 0.018606653437018394,
198
- "eval_runtime": 2.6418,
199
- "eval_samples_per_second": 1173.436,
200
- "eval_steps_per_second": 24.604,
201
  "step": 3816
202
  },
203
  {
204
  "epoch": 12.959119496855346,
205
- "grad_norm": 0.13854487240314484,
206
- "learning_rate": 4.420995859396498e-06,
207
- "loss": 0.0264,
208
  "step": 4121
209
  },
210
  {
211
  "epoch": 13.0,
212
- "eval_accuracy": 0.9358064516129032,
213
- "eval_loss": 0.018243877217173576,
214
- "eval_runtime": 2.6544,
215
- "eval_samples_per_second": 1167.882,
216
- "eval_steps_per_second": 24.488,
217
  "step": 4134
218
  },
219
  {
220
  "epoch": 13.955974842767295,
221
- "grad_norm": 0.1205814927816391,
222
- "learning_rate": 2.2615880205233243e-06,
223
- "loss": 0.0259,
224
  "step": 4438
225
  },
226
  {
227
  "epoch": 14.0,
228
- "eval_accuracy": 0.9361290322580645,
229
- "eval_loss": 0.01793498359620571,
230
- "eval_runtime": 2.6531,
231
- "eval_samples_per_second": 1168.457,
232
- "eval_steps_per_second": 24.5,
233
  "step": 4452
234
  },
235
  {
236
  "epoch": 14.952830188679245,
237
- "grad_norm": 0.13082493841648102,
238
- "learning_rate": 1.0218018165015018e-07,
239
- "loss": 0.0256,
240
  "step": 4755
241
  }
242
  ],
@@ -257,17 +257,15 @@
257
  "attributes": {}
258
  }
259
  },
260
- "total_flos": 1259981299661700.0,
261
  "train_batch_size": 48,
262
  "trial_name": null,
263
  "trial_params": {
264
- "alpha": 0.41037073052052975,
265
- "fp16": false,
266
- "learning_rate": 2.920309591561292e-05,
267
- "lr_scheduler": "cosine",
268
  "num_train_epochs": 15,
269
- "temperature": 4,
270
- "warmup_steps": 483,
271
- "weight_decay": 0.1243517366819557
272
  }
273
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 0.9968553459119497,
13
+ "grad_norm": 0.8092947006225586,
14
+ "learning_rate": 0.0004835195985179114,
15
+ "loss": 0.2576,
16
  "step": 317
17
  },
18
  {
19
  "epoch": 1.0,
20
+ "eval_accuracy": 0.7422580645161291,
21
+ "eval_loss": 0.11835014075040817,
22
+ "eval_runtime": 5.4376,
23
+ "eval_samples_per_second": 570.107,
24
+ "eval_steps_per_second": 11.954,
25
  "step": 318
26
  },
27
  {
28
  "epoch": 1.9937106918238994,
29
+ "grad_norm": 0.4659373164176941,
30
+ "learning_rate": 0.0004678272940120885,
31
+ "loss": 0.0912,
32
  "step": 634
33
  },
34
  {
35
  "epoch": 2.0,
36
+ "eval_accuracy": 0.8132258064516129,
37
+ "eval_loss": 0.08229727298021317,
38
+ "eval_runtime": 5.3212,
39
+ "eval_samples_per_second": 582.581,
40
+ "eval_steps_per_second": 12.215,
41
  "step": 636
42
  },
43
  {
44
  "epoch": 2.990566037735849,
45
+ "grad_norm": 0.39092108607292175,
46
+ "learning_rate": 0.0004424318300788979,
47
+ "loss": 0.058,
48
  "step": 951
49
  },
50
  {
51
  "epoch": 3.0,
52
+ "eval_accuracy": 0.853225806451613,
53
+ "eval_loss": 0.07101583480834961,
54
+ "eval_runtime": 5.4193,
55
+ "eval_samples_per_second": 572.031,
56
+ "eval_steps_per_second": 11.994,
57
  "step": 954
58
  },
59
  {
60
  "epoch": 3.9874213836477987,
61
+ "grad_norm": 0.18997839093208313,
62
+ "learning_rate": 0.00040843616613818045,
63
+ "loss": 0.0468,
64
  "step": 1268
65
  },
66
  {
67
  "epoch": 4.0,
68
+ "eval_accuracy": 0.8619354838709677,
69
+ "eval_loss": 0.061981625854969025,
70
+ "eval_runtime": 5.4233,
71
+ "eval_samples_per_second": 571.613,
72
+ "eval_steps_per_second": 11.985,
73
  "step": 1272
74
  },
75
  {
76
  "epoch": 4.984276729559748,
77
+ "grad_norm": 0.57562655210495,
78
+ "learning_rate": 0.00036731677995288685,
79
+ "loss": 0.0404,
80
  "step": 1585
81
  },
82
  {
83
  "epoch": 5.0,
84
+ "eval_accuracy": 0.8870967741935484,
85
+ "eval_loss": 0.05836557596921921,
86
+ "eval_runtime": 5.4115,
87
+ "eval_samples_per_second": 572.852,
88
+ "eval_steps_per_second": 12.011,
89
  "step": 1590
90
  },
91
  {
92
  "epoch": 5.981132075471698,
93
+ "grad_norm": 0.656391978263855,
94
+ "learning_rate": 0.0003208595421986017,
95
+ "loss": 0.0363,
96
  "step": 1902
97
  },
98
  {
99
  "epoch": 6.0,
100
+ "eval_accuracy": 0.8767741935483871,
101
+ "eval_loss": 0.057680290192365646,
102
+ "eval_runtime": 5.4159,
103
+ "eval_samples_per_second": 572.384,
104
+ "eval_steps_per_second": 12.002,
105
  "step": 1908
106
  },
107
  {
108
  "epoch": 6.977987421383648,
109
+ "grad_norm": 0.21630945801734924,
110
+ "learning_rate": 0.00027108215367951916,
111
+ "loss": 0.0284,
112
  "step": 2219
113
  },
114
  {
115
  "epoch": 7.0,
116
+ "eval_accuracy": 0.8964516129032258,
117
+ "eval_loss": 0.05861014127731323,
118
+ "eval_runtime": 5.4131,
119
+ "eval_samples_per_second": 572.683,
120
+ "eval_steps_per_second": 12.008,
121
  "step": 2226
122
  },
123
  {
124
  "epoch": 7.9748427672955975,
125
+ "grad_norm": 0.07545796036720276,
126
+ "learning_rate": 0.00022014651384770874,
127
+ "loss": 0.0245,
128
  "step": 2536
129
  },
130
  {
131
  "epoch": 8.0,
132
+ "eval_accuracy": 0.9048387096774193,
133
+ "eval_loss": 0.04624096304178238,
134
+ "eval_runtime": 5.409,
135
+ "eval_samples_per_second": 573.114,
136
+ "eval_steps_per_second": 12.017,
137
  "step": 2544
138
  },
139
  {
140
  "epoch": 8.971698113207546,
141
+ "grad_norm": 0.08013833314180374,
142
+ "learning_rate": 0.000170264826579655,
143
+ "loss": 0.0209,
144
  "step": 2853
145
  },
146
  {
147
  "epoch": 9.0,
148
+ "eval_accuracy": 0.9106451612903226,
149
+ "eval_loss": 0.042855095118284225,
150
+ "eval_runtime": 5.3995,
151
+ "eval_samples_per_second": 574.13,
152
+ "eval_steps_per_second": 12.038,
153
  "step": 2862
154
  },
155
  {
156
  "epoch": 9.968553459119496,
157
+ "grad_norm": 0.14883077144622803,
158
+ "learning_rate": 0.00012360352116346234,
159
+ "loss": 0.0192,
160
  "step": 3170
161
  },
162
  {
163
  "epoch": 10.0,
164
+ "eval_accuracy": 0.9132258064516129,
165
+ "eval_loss": 0.04050453379750252,
166
+ "eval_runtime": 5.3574,
167
+ "eval_samples_per_second": 578.637,
168
+ "eval_steps_per_second": 12.133,
169
  "step": 3180
170
  },
171
  {
172
  "epoch": 10.965408805031446,
173
+ "grad_norm": 0.19123013317584991,
174
+ "learning_rate": 8.218916133847685e-05,
175
+ "loss": 0.0179,
176
  "step": 3487
177
  },
178
  {
179
  "epoch": 11.0,
180
+ "eval_accuracy": 0.9125806451612903,
181
+ "eval_loss": 0.04078555479645729,
182
+ "eval_runtime": 5.4382,
183
+ "eval_samples_per_second": 570.045,
184
+ "eval_steps_per_second": 11.953,
185
  "step": 3498
186
  },
187
  {
188
  "epoch": 11.962264150943396,
189
+ "grad_norm": 0.04665813222527504,
190
+ "learning_rate": 4.782042888526468e-05,
191
+ "loss": 0.0168,
192
  "step": 3804
193
  },
194
  {
195
  "epoch": 12.0,
196
+ "eval_accuracy": 0.9141935483870968,
197
+ "eval_loss": 0.039393454790115356,
198
+ "eval_runtime": 5.4122,
199
+ "eval_samples_per_second": 572.776,
200
+ "eval_steps_per_second": 12.01,
201
  "step": 3816
202
  },
203
  {
204
  "epoch": 12.959119496855346,
205
+ "grad_norm": 0.05185122787952423,
206
+ "learning_rate": 2.1990004437934068e-05,
207
+ "loss": 0.016,
208
  "step": 4121
209
  },
210
  {
211
  "epoch": 13.0,
212
+ "eval_accuracy": 0.9141935483870968,
213
+ "eval_loss": 0.038763027638196945,
214
+ "eval_runtime": 5.3902,
215
+ "eval_samples_per_second": 575.122,
216
+ "eval_steps_per_second": 12.059,
217
  "step": 4134
218
  },
219
  {
220
  "epoch": 13.955974842767295,
221
+ "grad_norm": 0.04909258708357811,
222
+ "learning_rate": 5.819738341004267e-06,
223
+ "loss": 0.0156,
224
  "step": 4438
225
  },
226
  {
227
  "epoch": 14.0,
228
+ "eval_accuracy": 0.912258064516129,
229
+ "eval_loss": 0.03870353475213051,
230
+ "eval_runtime": 5.403,
231
+ "eval_samples_per_second": 573.76,
232
+ "eval_steps_per_second": 12.03,
233
  "step": 4452
234
  },
235
  {
236
  "epoch": 14.952830188679245,
237
+ "grad_norm": 0.04776826128363609,
238
+ "learning_rate": 1.1927168377902104e-08,
239
+ "loss": 0.0153,
240
  "step": 4755
241
  }
242
  ],
 
257
  "attributes": {}
258
  }
259
  },
260
+ "total_flos": 1236646073993904.0,
261
  "train_batch_size": 48,
262
  "trial_name": null,
263
  "trial_params": {
264
+ "alpha": 0.6530130862589958,
265
+ "learning_rate": 0.0004888272055421989,
266
+ "lr_scheduler_type": "cosine",
 
267
  "num_train_epochs": 15,
268
+ "temperature": 12.988061249662817,
269
+ "weight_decay": 0.12001185240531148
 
270
  }
271
  }
run-0/checkpoint-4770/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:98549b52b22df474fbbc0748be6e1194cec163ce38b8fb481e05fa237fc1202a
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcb2de51440170a957f85acebd5913a80a242d1ed0e9f3a20c96e7877c9cde03
3
  size 5240
run-1/checkpoint-3180/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ea7db98346b2c0cd208ac3ebdf7072aba54c51a3fc69db5503e7b871d7dedfce
3
  size 268290900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3af5b47ba404b3ef2f87defb7e11fd5376d55cadf07c6b8c59e1c82fd72748ba
3
  size 268290900
run-1/checkpoint-3180/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a6cd76751c4db51ae15207a0b853b539e0e6d654f068b2b256c7097dbc214512
3
  size 536643898
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:405119682f9239707233567dd5d9a60d32a1ea6f44d37d2a5571862ca57d2ead
3
  size 536643898
run-1/checkpoint-3180/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e1c7efb9327cc6cac3837eb529885b0c020016a0b54b94de8e8090f5dcf4e6f3
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9817e48018a891540f49bc64fd6d77915ab61cfea36550e9faed889a7f70bc14
3
  size 1064
run-1/checkpoint-3180/trainer_state.json CHANGED
@@ -10,153 +10,153 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.9968553459119497,
13
- "grad_norm": 0.6745762825012207,
14
- "learning_rate": 2.7360787331269834e-05,
15
- "loss": 0.357,
16
  "step": 317
17
  },
18
  {
19
  "epoch": 1.0,
20
- "eval_accuracy": 0.7412903225806452,
21
- "eval_loss": 0.14044521749019623,
22
- "eval_runtime": 5.4763,
23
- "eval_samples_per_second": 566.072,
24
- "eval_steps_per_second": 11.869,
25
  "step": 318
26
  },
27
  {
28
  "epoch": 1.9937106918238994,
29
- "grad_norm": 0.4204491376876831,
30
- "learning_rate": 2.433131838819874e-05,
31
- "loss": 0.1207,
32
  "step": 634
33
  },
34
  {
35
  "epoch": 2.0,
36
- "eval_accuracy": 0.8770967741935484,
37
- "eval_loss": 0.06287968158721924,
38
- "eval_runtime": 5.5283,
39
- "eval_samples_per_second": 560.754,
40
- "eval_steps_per_second": 11.758,
41
  "step": 636
42
  },
43
  {
44
  "epoch": 2.990566037735849,
45
- "grad_norm": 0.4077504873275757,
46
- "learning_rate": 2.130184944512765e-05,
47
- "loss": 0.0718,
48
  "step": 951
49
  },
50
  {
51
  "epoch": 3.0,
52
- "eval_accuracy": 0.9116129032258065,
53
- "eval_loss": 0.04054585471749306,
54
- "eval_runtime": 5.4053,
55
- "eval_samples_per_second": 573.506,
56
- "eval_steps_per_second": 12.025,
57
  "step": 954
58
  },
59
  {
60
  "epoch": 3.9874213836477987,
61
- "grad_norm": 0.2433169037103653,
62
- "learning_rate": 1.8272380502056557e-05,
63
- "loss": 0.0534,
64
  "step": 1268
65
  },
66
  {
67
  "epoch": 4.0,
68
- "eval_accuracy": 0.9274193548387096,
69
- "eval_loss": 0.03159501776099205,
70
- "eval_runtime": 5.4668,
71
- "eval_samples_per_second": 567.056,
72
- "eval_steps_per_second": 11.89,
73
  "step": 1272
74
  },
75
  {
76
  "epoch": 4.984276729559748,
77
- "grad_norm": 0.21281147003173828,
78
- "learning_rate": 1.5242911558985466e-05,
79
- "loss": 0.0442,
80
  "step": 1585
81
  },
82
  {
83
  "epoch": 5.0,
84
  "eval_accuracy": 0.9309677419354838,
85
- "eval_loss": 0.02665688283741474,
86
- "eval_runtime": 5.5204,
87
- "eval_samples_per_second": 561.552,
88
- "eval_steps_per_second": 11.774,
89
  "step": 1590
90
  },
91
  {
92
  "epoch": 5.981132075471698,
93
- "grad_norm": 0.23512092232704163,
94
- "learning_rate": 1.2213442615914374e-05,
95
- "loss": 0.0389,
96
  "step": 1902
97
  },
98
  {
99
  "epoch": 6.0,
100
- "eval_accuracy": 0.9329032258064516,
101
- "eval_loss": 0.024243181571364403,
102
- "eval_runtime": 5.4419,
103
- "eval_samples_per_second": 569.649,
104
- "eval_steps_per_second": 11.944,
105
  "step": 1908
106
  },
107
  {
108
  "epoch": 6.977987421383648,
109
- "grad_norm": 0.2756204605102539,
110
- "learning_rate": 9.18397367284328e-06,
111
- "loss": 0.0356,
112
  "step": 2219
113
  },
114
  {
115
  "epoch": 7.0,
116
- "eval_accuracy": 0.9361290322580645,
117
- "eval_loss": 0.02270686812698841,
118
- "eval_runtime": 5.4205,
119
- "eval_samples_per_second": 571.9,
120
- "eval_steps_per_second": 11.991,
121
  "step": 2226
122
  },
123
  {
124
  "epoch": 7.9748427672955975,
125
- "grad_norm": 0.1520700752735138,
126
- "learning_rate": 6.154504729772188e-06,
127
- "loss": 0.0336,
128
  "step": 2536
129
  },
130
  {
131
  "epoch": 8.0,
132
- "eval_accuracy": 0.9354838709677419,
133
- "eval_loss": 0.021294621750712395,
134
- "eval_runtime": 5.4685,
135
- "eval_samples_per_second": 566.883,
136
- "eval_steps_per_second": 11.886,
137
  "step": 2544
138
  },
139
  {
140
  "epoch": 8.971698113207546,
141
- "grad_norm": 0.2087375968694687,
142
- "learning_rate": 3.1250357867010953e-06,
143
- "loss": 0.0322,
144
  "step": 2853
145
  },
146
  {
147
  "epoch": 9.0,
148
- "eval_accuracy": 0.9354838709677419,
149
- "eval_loss": 0.020687058568000793,
150
- "eval_runtime": 5.4663,
151
- "eval_samples_per_second": 567.112,
152
- "eval_steps_per_second": 11.891,
153
  "step": 2862
154
  },
155
  {
156
  "epoch": 9.968553459119496,
157
- "grad_norm": 0.14469225704669952,
158
- "learning_rate": 9.556684363000292e-08,
159
- "loss": 0.0314,
160
  "step": 3170
161
  }
162
  ],
@@ -181,11 +181,11 @@
181
  "train_batch_size": 48,
182
  "trial_name": null,
183
  "trial_params": {
184
- "alpha": 0.0478722237588074,
185
- "learning_rate": 3.0390256274340926e-05,
186
- "lr_scheduler_type": "linear",
187
  "num_train_epochs": 10,
188
- "temperature": 7.533742821161418,
189
- "weight_decay": 0.25236012891933407
190
  }
191
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 0.9968553459119497,
13
+ "grad_norm": 0.316487580537796,
14
+ "learning_rate": 0.0003250399324920357,
15
+ "loss": 0.2047,
16
  "step": 317
17
  },
18
  {
19
  "epoch": 1.0,
20
+ "eval_accuracy": 0.867741935483871,
21
+ "eval_loss": 0.06543365120887756,
22
+ "eval_runtime": 5.3433,
23
+ "eval_samples_per_second": 580.166,
24
+ "eval_steps_per_second": 12.165,
25
  "step": 318
26
  },
27
  {
28
  "epoch": 1.9937106918238994,
29
+ "grad_norm": 0.5043902397155762,
30
+ "learning_rate": 0.00030152270079867525,
31
+ "loss": 0.0555,
32
  "step": 634
33
  },
34
  {
35
  "epoch": 2.0,
36
+ "eval_accuracy": 0.9038709677419355,
37
+ "eval_loss": 0.05041499063372612,
38
+ "eval_runtime": 5.404,
39
+ "eval_samples_per_second": 573.648,
40
+ "eval_steps_per_second": 12.028,
41
  "step": 636
42
  },
43
  {
44
  "epoch": 2.990566037735849,
45
+ "grad_norm": 0.3954188823699951,
46
+ "learning_rate": 0.0002648777146860182,
47
+ "loss": 0.038,
48
  "step": 951
49
  },
50
  {
51
  "epoch": 3.0,
52
+ "eval_accuracy": 0.9180645161290323,
53
+ "eval_loss": 0.04048455134034157,
54
+ "eval_runtime": 5.3969,
55
+ "eval_samples_per_second": 574.404,
56
+ "eval_steps_per_second": 12.044,
57
  "step": 954
58
  },
59
  {
60
  "epoch": 3.9874213836477987,
61
+ "grad_norm": 0.5388094186782837,
62
+ "learning_rate": 0.00021866970042254042,
63
+ "loss": 0.029,
64
  "step": 1268
65
  },
66
  {
67
  "epoch": 4.0,
68
+ "eval_accuracy": 0.9296774193548387,
69
+ "eval_loss": 0.03541301190853119,
70
+ "eval_runtime": 5.4403,
71
+ "eval_samples_per_second": 569.818,
72
+ "eval_steps_per_second": 11.948,
73
  "step": 1272
74
  },
75
  {
76
  "epoch": 4.984276729559748,
77
+ "grad_norm": 0.07852072268724442,
78
+ "learning_rate": 0.00016739365008581398,
79
+ "loss": 0.0235,
80
  "step": 1585
81
  },
82
  {
83
  "epoch": 5.0,
84
  "eval_accuracy": 0.9309677419354838,
85
+ "eval_loss": 0.03130786865949631,
86
+ "eval_runtime": 5.3733,
87
+ "eval_samples_per_second": 576.931,
88
+ "eval_steps_per_second": 12.097,
89
  "step": 1590
90
  },
91
  {
92
  "epoch": 5.981132075471698,
93
+ "grad_norm": 0.08189109712839127,
94
+ "learning_rate": 0.0001160375607518124,
95
+ "loss": 0.0202,
96
  "step": 1902
97
  },
98
  {
99
  "epoch": 6.0,
100
+ "eval_accuracy": 0.9364516129032258,
101
+ "eval_loss": 0.02977406606078148,
102
+ "eval_runtime": 5.37,
103
+ "eval_samples_per_second": 577.282,
104
+ "eval_steps_per_second": 12.104,
105
  "step": 1908
106
  },
107
  {
108
  "epoch": 6.977987421383648,
109
+ "grad_norm": 0.07439889013767242,
110
+ "learning_rate": 6.959721547615756e-05,
111
+ "loss": 0.0181,
112
  "step": 2219
113
  },
114
  {
115
  "epoch": 7.0,
116
+ "eval_accuracy": 0.94,
117
+ "eval_loss": 0.02745823562145233,
118
+ "eval_runtime": 5.4422,
119
+ "eval_samples_per_second": 569.621,
120
+ "eval_steps_per_second": 11.944,
121
  "step": 2226
122
  },
123
  {
124
  "epoch": 7.9748427672955975,
125
+ "grad_norm": 0.06082445755600929,
126
+ "learning_rate": 3.25902068760846e-05,
127
+ "loss": 0.0164,
128
  "step": 2536
129
  },
130
  {
131
  "epoch": 8.0,
132
+ "eval_accuracy": 0.9409677419354838,
133
+ "eval_loss": 0.0263341274112463,
134
+ "eval_runtime": 5.4487,
135
+ "eval_samples_per_second": 568.945,
136
+ "eval_steps_per_second": 11.929,
137
  "step": 2544
138
  },
139
  {
140
  "epoch": 8.971698113207546,
141
+ "grad_norm": 0.060345038771629333,
142
+ "learning_rate": 8.616477799677371e-06,
143
+ "loss": 0.0157,
144
  "step": 2853
145
  },
146
  {
147
  "epoch": 9.0,
148
+ "eval_accuracy": 0.9403225806451613,
149
+ "eval_loss": 0.025894558057188988,
150
+ "eval_runtime": 5.3968,
151
+ "eval_samples_per_second": 574.416,
152
+ "eval_steps_per_second": 12.044,
153
  "step": 2862
154
  },
155
  {
156
  "epoch": 9.968553459119496,
157
+ "grad_norm": 0.05505794286727905,
158
+ "learning_rate": 8.128511055690263e-09,
159
+ "loss": 0.0154,
160
  "step": 3170
161
  }
162
  ],
 
181
  "train_batch_size": 48,
182
  "trial_name": null,
183
  "trial_params": {
184
+ "alpha": 0.7646743801285832,
185
+ "learning_rate": 0.00033314171634682974,
186
+ "lr_scheduler_type": "cosine",
187
  "num_train_epochs": 10,
188
+ "temperature": 5.485237170675724,
189
+ "weight_decay": 0.1816702846280333
190
  }
191
  }
run-1/checkpoint-3180/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bea033762087a013bcaa4855bf7aa0df6974842e0d16f839e7b485dbf4ad1b90
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8997ba64c78b786621c16831c31cf39c03bf94132fb6b567c38b51563d00871
3
  size 5240
runs/Oct20_13-24-54_87443764e281/events.out.tfevents.1729430716.87443764e281.307.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb6900bdfa3d451111dd92904bbb5c6458b693a6f5700b92ca4667aa36b2e01d
3
+ size 20862
runs/Oct20_13-24-54_87443764e281/events.out.tfevents.1729431620.87443764e281.307.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65101625d9bfcf5ad22ec451c8698810ccd3415b2f61a9476a74da5e9eded3f3
3
+ size 18155
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c744f9b5cd82a802cdd7073267f096e67cf946c7be55e97ab845307ca43ebf7
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8997ba64c78b786621c16831c31cf39c03bf94132fb6b567c38b51563d00871
3
  size 5240