jaime-epoch-metrics committed
Commit 0b16855 · verified · 1 parent: 3c5c868

epochmetrics/task-embedder

README.md CHANGED
@@ -13,14 +13,14 @@ model-index:
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment. -->
 
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/epoch-metrics/fine-tuning/runs/cpytiehg)
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/epoch-metrics/fine-tuning/runs/cpytiehg)
 # task-embedder
 
 This model is a fine-tuned version of [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) on an unknown dataset.
 It achieves the following results on the evaluation set:
- - Loss: 2.0565
- - Accuracy: 0.6332
 
 ## Model description
 
@@ -51,21 +51,21 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss | Accuracy |
 |:-------------:|:-----:|:----:|:---------------:|:--------:|
- | 5.6287 | 1.0 | 171 | 4.0112 | 0.3787 |
- | 3.7578 | 2.0 | 342 | 3.2693 | 0.4648 |
- | 3.2266 | 3.0 | 513 | 2.9233 | 0.5083 |
- | 2.9062 | 4.0 | 684 | 2.6422 | 0.5454 |
- | 2.7046 | 5.0 | 855 | 2.5057 | 0.5657 |
- | 2.5462 | 6.0 | 1026 | 2.3794 | 0.5850 |
- | 2.4348 | 7.0 | 1197 | 2.2906 | 0.5981 |
- | 2.3406 | 8.0 | 1368 | 2.2580 | 0.6043 |
- | 2.2544 | 9.0 | 1539 | 2.1751 | 0.6137 |
- | 2.2031 | 10.0 | 1710 | 2.1368 | 0.6225 |
- | 2.1693 | 11.0 | 1881 | 2.1410 | 0.6185 |
- | 2.1243 | 12.0 | 2052 | 2.0609 | 0.6291 |
- | 2.086 | 13.0 | 2223 | 2.0226 | 0.6354 |
- | 2.0771 | 14.0 | 2394 | 2.0461 | 0.6358 |
- | 2.0692 | 15.0 | 2565 | 2.0071 | 0.6430 |
 
 
 ### Framework versions
 
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment. -->
 
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/epoch-metrics/fine-tuning/runs/dnlz2u2m)
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/epoch-metrics/fine-tuning/runs/dnlz2u2m)
 # task-embedder
 
 This model is a fine-tuned version of [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) on an unknown dataset.
 It achieves the following results on the evaluation set:
+ - Loss: 2.2775
+ - Accuracy: 0.5753
 
 ## Model description
 
 
 | Training Loss | Epoch | Step | Validation Loss | Accuracy |
 |:-------------:|:-----:|:----:|:---------------:|:--------:|
+ | 5.6652 | 1.0 | 83 | 4.2850 | 0.3649 |
+ | 3.9443 | 2.0 | 166 | 3.5407 | 0.4281 |
+ | 3.3575 | 3.0 | 249 | 3.1092 | 0.4710 |
+ | 3.084 | 4.0 | 332 | 2.8743 | 0.4962 |
+ | 2.8764 | 5.0 | 415 | 2.7020 | 0.5211 |
+ | 2.7367 | 6.0 | 498 | 2.6699 | 0.5188 |
+ | 2.6275 | 7.0 | 581 | 2.5638 | 0.5404 |
+ | 2.5257 | 8.0 | 664 | 2.5348 | 0.5430 |
+ | 2.4742 | 9.0 | 747 | 2.4302 | 0.5591 |
+ | 2.4238 | 10.0 | 830 | 2.4159 | 0.5577 |
+ | 2.3516 | 11.0 | 913 | 2.3461 | 0.5741 |
+ | 2.3115 | 12.0 | 996 | 2.3291 | 0.5728 |
+ | 2.29 | 13.0 | 1079 | 2.3577 | 0.5698 |
+ | 2.2412 | 14.0 | 1162 | 2.3473 | 0.5674 |
+ | 2.245 | 15.0 | 1245 | 2.3113 | 0.5720 |
 
 
 ### Framework versions
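
> For readers who want to try the checkpoint, a minimal, hypothetical loading sketch with `transformers` is shown below. It assumes the repository id `epochmetrics/task-embedder` from this commit and that the Trainer saved a sequence-classification head on top of the MPNet encoder (the card only confirms the base model and the loss/accuracy metrics), so adjust the auto-class if the actual head differs.

```python
# Hypothetical usage sketch: assumes "epochmetrics/task-embedder" is a
# sequence-classification fine-tune of all-mpnet-base-v2, as the accuracy
# metric in the card suggests. Swap the auto-class if the architecture differs.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

repo_id = "epochmetrics/task-embedder"  # repository shown in this commit
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForSequenceClassification.from_pretrained(repo_id)
model.eval()

inputs = tokenizer("Summarize the quarterly sales report.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
predicted_class_id = int(logits.argmax(dim=-1))
print(predicted_class_id, model.config.id2label.get(predicted_class_id, "unknown"))
```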
all_results.json CHANGED
@@ -1,16 +1,16 @@
 {
 "epoch": 15.0,
- "eval_accuracy": 0.6331559235166568,
- "eval_loss": 2.056515693664551,
- "eval_runtime": 8.1808,
- "eval_samples": 350,
- "eval_samples_per_second": 42.783,
- "eval_steps_per_second": 5.378,
- "perplexity": 7.818679619302798,
- "total_flos": 5397015001420800.0,
- "train_loss": 2.701929186845151,
- "train_runtime": 4016.0696,
- "train_samples": 1367,
- "train_samples_per_second": 5.106,
- "train_steps_per_second": 0.639
 }
 
 {
 "epoch": 15.0,
+ "eval_accuracy": 0.575332866152768,
+ "eval_loss": 2.2775449752807617,
+ "eval_runtime": 3.8167,
+ "eval_samples": 164,
+ "eval_samples_per_second": 42.969,
+ "eval_steps_per_second": 5.502,
+ "perplexity": 9.752707856097555,
+ "total_flos": 2605727798784000.0,
+ "train_loss": 2.876972158654148,
+ "train_runtime": 3151.4946,
+ "train_samples": 660,
+ "train_samples_per_second": 3.141,
+ "train_steps_per_second": 0.395
 }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
 {
 "epoch": 15.0,
- "eval_accuracy": 0.6331559235166568,
- "eval_loss": 2.056515693664551,
- "eval_runtime": 8.1808,
- "eval_samples": 350,
- "eval_samples_per_second": 42.783,
- "eval_steps_per_second": 5.378,
- "perplexity": 7.818679619302798
 }
 
 {
 "epoch": 15.0,
+ "eval_accuracy": 0.575332866152768,
+ "eval_loss": 2.2775449752807617,
+ "eval_runtime": 3.8167,
+ "eval_samples": 164,
+ "eval_samples_per_second": 42.969,
+ "eval_steps_per_second": 5.502,
+ "perplexity": 9.752707856097555
 }
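
> The `perplexity` field in these result files is simply the exponential of `eval_loss`; a one-line check in Python reproduces both the old and the new stored values.

```python
import math

# perplexity reported alongside the evaluation results is exp(eval_loss)
old_eval_loss, new_eval_loss = 2.056515693664551, 2.2775449752807617
print(math.exp(old_eval_loss))  # ~7.818679619302798, the old perplexity
print(math.exp(new_eval_loss))  # ~9.752707856097555, the new perplexity
```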
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:2dcd47ba207a3f860f8ae2d2805dcf6163dae73d4d41393602f3b1985f1fa65a
 size 438097372
 
 version https://git-lfs.github.com/spec/v1
+ oid sha256:eb843bd317c02b0ffe6ea2191b10bf635670e51473572909500f0a010125d309
 size 438097372
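
> The `model.safetensors` entry is a Git LFS pointer, so only the `oid sha256:` and `size` fields change here while the weight blob itself lives in LFS storage. A small, hypothetical check like the one below can confirm a locally downloaded file against the pointer shown above.

```python
import hashlib
from pathlib import Path

# Hypothetical check: verify a local model.safetensors against the sha256 oid
# and byte size recorded in the new LFS pointer above.
expected_oid = "eb843bd317c02b0ffe6ea2191b10bf635670e51473572909500f0a010125d309"
expected_size = 438097372

path = Path("model.safetensors")  # assumed local download location
digest = hashlib.sha256()
with path.open("rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        digest.update(chunk)

assert path.stat().st_size == expected_size, "size mismatch with LFS pointer"
assert digest.hexdigest() == expected_oid, "sha256 mismatch with LFS pointer"
print("model.safetensors matches the LFS pointer")
```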
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
 "epoch": 15.0,
- "total_flos": 5397015001420800.0,
- "train_loss": 2.701929186845151,
- "train_runtime": 4016.0696,
- "train_samples": 1367,
- "train_samples_per_second": 5.106,
- "train_steps_per_second": 0.639
 }
 
 {
 "epoch": 15.0,
+ "total_flos": 2605727798784000.0,
+ "train_loss": 2.876972158654148,
+ "train_runtime": 3151.4946,
+ "train_samples": 660,
+ "train_samples_per_second": 3.141,
+ "train_steps_per_second": 0.395
 }
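
> The throughput fields in `train_results.json` follow directly from the other values: samples per second is `train_samples × num_epochs ÷ train_runtime`, and steps per second is total optimizer steps ÷ runtime (1245 steps for the new run, per `trainer_state.json` below). A quick check against the new numbers:

```python
# Sanity-check the reported throughput for the new run:
# 660 samples, 15 epochs, 1245 optimizer steps, 3151.4946 s runtime.
train_samples, num_epochs, steps, runtime = 660, 15.0, 1245, 3151.4946

samples_per_second = train_samples * num_epochs / runtime
steps_per_second = steps / runtime
print(round(samples_per_second, 3))  # 3.141, as reported
print(round(steps_per_second, 3))    # 0.395, as reported
```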
trainer_state.json CHANGED
@@ -1,265 +1,265 @@
 {
- "best_metric": 2.0070760250091553,
- "best_model_checkpoint": "epochmetrics/task-embedder/checkpoint-2565",
 "epoch": 15.0,
 "eval_steps": 1,
- "global_step": 2565,
 "is_hyper_param_search": false,
 "is_local_process_zero": true,
 "is_world_process_zero": true,
 "log_history": [
 {
 "epoch": 1.0,
- "grad_norm": 6.0819196701049805,
 "learning_rate": 4.666666666666667e-05,
- "loss": 5.6287,
- "step": 171
 },
 {
 "epoch": 1.0,
- "eval_accuracy": 0.3786586320237909,
- "eval_loss": 4.011183738708496,
- "eval_runtime": 13.9675,
- "eval_samples_per_second": 25.058,
- "eval_steps_per_second": 3.15,
- "step": 171
 },
 {
 "epoch": 2.0,
- "grad_norm": 5.902393341064453,
 "learning_rate": 4.3333333333333334e-05,
- "loss": 3.7578,
- "step": 342
 },
 {
 "epoch": 2.0,
- "eval_accuracy": 0.4647582296424634,
- "eval_loss": 3.269301414489746,
- "eval_runtime": 14.2546,
- "eval_samples_per_second": 24.554,
- "eval_steps_per_second": 3.087,
- "step": 342
 },
 {
 "epoch": 3.0,
- "grad_norm": 6.104726314544678,
 "learning_rate": 4e-05,
- "loss": 3.2266,
- "step": 513
 },
 {
 "epoch": 3.0,
- "eval_accuracy": 0.508273029532564,
- "eval_loss": 2.9233286380767822,
- "eval_runtime": 8.5864,
- "eval_samples_per_second": 40.762,
- "eval_steps_per_second": 5.124,
- "step": 513
 },
 {
 "epoch": 4.0,
- "grad_norm": 5.560727119445801,
 "learning_rate": 3.6666666666666666e-05,
- "loss": 2.9062,
- "step": 684
 },
 {
 "epoch": 4.0,
- "eval_accuracy": 0.5453717994380466,
- "eval_loss": 2.642239809036255,
- "eval_runtime": 8.5093,
- "eval_samples_per_second": 41.131,
- "eval_steps_per_second": 5.171,
- "step": 684
 },
 {
 "epoch": 5.0,
- "grad_norm": 5.642611980438232,
 "learning_rate": 3.3333333333333335e-05,
- "loss": 2.7046,
- "step": 855
 },
 {
 "epoch": 5.0,
- "eval_accuracy": 0.5657459867799811,
- "eval_loss": 2.505657434463501,
- "eval_runtime": 8.4686,
- "eval_samples_per_second": 41.329,
- "eval_steps_per_second": 5.196,
- "step": 855
 },
 {
 "epoch": 6.0,
- "grad_norm": 5.382541656494141,
 "learning_rate": 3e-05,
- "loss": 2.5462,
- "step": 1026
 },
 {
 "epoch": 6.0,
- "eval_accuracy": 0.5850446167634338,
- "eval_loss": 2.3794305324554443,
- "eval_runtime": 8.6097,
- "eval_samples_per_second": 40.652,
- "eval_steps_per_second": 5.111,
- "step": 1026
 },
 {
 "epoch": 7.0,
- "grad_norm": 5.883482933044434,
 "learning_rate": 2.6666666666666667e-05,
- "loss": 2.4348,
- "step": 1197
 },
 {
 "epoch": 7.0,
- "eval_accuracy": 0.5981030022732617,
- "eval_loss": 2.290560722351074,
- "eval_runtime": 14.2401,
- "eval_samples_per_second": 24.579,
- "eval_steps_per_second": 3.09,
- "step": 1197
 },
 {
 "epoch": 8.0,
- "grad_norm": 4.892796993255615,
 "learning_rate": 2.3333333333333336e-05,
- "loss": 2.3406,
- "step": 1368
 },
 {
 "epoch": 8.0,
- "eval_accuracy": 0.6042614409580466,
- "eval_loss": 2.2579710483551025,
- "eval_runtime": 14.1291,
- "eval_samples_per_second": 24.772,
- "eval_steps_per_second": 3.114,
- "step": 1368
 },
 {
 "epoch": 9.0,
- "grad_norm": 5.444692611694336,
 "learning_rate": 2e-05,
- "loss": 2.2544,
- "step": 1539
 },
 {
 "epoch": 9.0,
- "eval_accuracy": 0.6137461398368833,
- "eval_loss": 2.1750903129577637,
- "eval_runtime": 18.0466,
- "eval_samples_per_second": 19.394,
- "eval_steps_per_second": 2.438,
- "step": 1539
 },
 {
 "epoch": 10.0,
- "grad_norm": 5.572258949279785,
 "learning_rate": 1.6666666666666667e-05,
- "loss": 2.2031,
- "step": 1710
 },
 {
 "epoch": 10.0,
- "eval_accuracy": 0.6225209429183713,
- "eval_loss": 2.136831045150757,
- "eval_runtime": 8.6198,
- "eval_samples_per_second": 40.604,
- "eval_steps_per_second": 5.105,
- "step": 1710
 },
 {
 "epoch": 11.0,
- "grad_norm": 5.637876510620117,
 "learning_rate": 1.3333333333333333e-05,
- "loss": 2.1693,
- "step": 1881
 },
 {
 "epoch": 11.0,
- "eval_accuracy": 0.6184971098265896,
- "eval_loss": 2.140977144241333,
- "eval_runtime": 8.6333,
- "eval_samples_per_second": 40.541,
- "eval_steps_per_second": 5.097,
- "step": 1881
 },
 {
 "epoch": 12.0,
- "grad_norm": 5.16227388381958,
 "learning_rate": 1e-05,
- "loss": 2.1243,
- "step": 2052
 },
 {
 "epoch": 12.0,
- "eval_accuracy": 0.6290763561437572,
- "eval_loss": 2.0609424114227295,
- "eval_runtime": 8.8768,
- "eval_samples_per_second": 39.429,
- "eval_steps_per_second": 4.957,
- "step": 2052
 },
 {
 "epoch": 13.0,
- "grad_norm": 5.17201566696167,
 "learning_rate": 6.666666666666667e-06,
- "loss": 2.086,
- "step": 2223
 },
 {
 "epoch": 13.0,
- "eval_accuracy": 0.6354386788761055,
- "eval_loss": 2.0226352214813232,
- "eval_runtime": 9.1353,
- "eval_samples_per_second": 38.313,
- "eval_steps_per_second": 4.817,
- "step": 2223
 },
 {
 "epoch": 14.0,
- "grad_norm": 5.382483959197998,
 "learning_rate": 3.3333333333333333e-06,
- "loss": 2.0771,
- "step": 2394
 },
 {
 "epoch": 14.0,
- "eval_accuracy": 0.6357628841792445,
- "eval_loss": 2.046103000640869,
- "eval_runtime": 15.2839,
- "eval_samples_per_second": 22.9,
- "eval_steps_per_second": 2.879,
- "step": 2394
 },
 {
 "epoch": 15.0,
- "grad_norm": 5.431705474853516,
 "learning_rate": 0.0,
- "loss": 2.0692,
- "step": 2565
 },
 {
 "epoch": 15.0,
- "eval_accuracy": 0.6430424528301887,
- "eval_loss": 2.0070760250091553,
- "eval_runtime": 13.9029,
- "eval_samples_per_second": 25.175,
- "eval_steps_per_second": 3.165,
- "step": 2565
 },
 {
 "epoch": 15.0,
- "step": 2565,
- "total_flos": 5397015001420800.0,
- "train_loss": 2.701929186845151,
- "train_runtime": 4016.0696,
- "train_samples_per_second": 5.106,
- "train_steps_per_second": 0.639
 }
 ],
 "logging_steps": 1,
- "max_steps": 2565,
 "num_input_tokens_seen": 0,
 "num_train_epochs": 15,
 "save_steps": 1,
@@ -275,7 +275,7 @@
 "attributes": {}
 }
 },
- "total_flos": 5397015001420800.0,
 "train_batch_size": 8,
 "trial_name": null,
 "trial_params": null
 
 {
+ "best_metric": 2.3113090991973877,
+ "best_model_checkpoint": "epochmetrics/task-embedder/checkpoint-1245",
 "epoch": 15.0,
 "eval_steps": 1,
+ "global_step": 1245,
 "is_hyper_param_search": false,
 "is_local_process_zero": true,
 "is_world_process_zero": true,
 "log_history": [
 {
 "epoch": 1.0,
+ "grad_norm": 6.888462066650391,
 "learning_rate": 4.666666666666667e-05,
+ "loss": 5.6652,
+ "step": 83
 },
 {
 "epoch": 1.0,
+ "eval_accuracy": 0.36489910020092603,
+ "eval_loss": 4.285048484802246,
+ "eval_runtime": 9.4538,
+ "eval_samples_per_second": 17.348,
+ "eval_steps_per_second": 2.221,
+ "step": 83
 },
 {
 "epoch": 2.0,
+ "grad_norm": 7.599656581878662,
 "learning_rate": 4.3333333333333334e-05,
+ "loss": 3.9443,
+ "step": 166
 },
 {
 "epoch": 2.0,
+ "eval_accuracy": 0.4281380027739251,
+ "eval_loss": 3.5406606197357178,
+ "eval_runtime": 9.4735,
+ "eval_samples_per_second": 17.311,
+ "eval_steps_per_second": 2.217,
+ "step": 166
 },
 {
 "epoch": 3.0,
+ "grad_norm": 7.9922637939453125,
 "learning_rate": 4e-05,
+ "loss": 3.3575,
+ "step": 249
 },
 {
 "epoch": 3.0,
+ "eval_accuracy": 0.4709654748308882,
+ "eval_loss": 3.109172821044922,
+ "eval_runtime": 10.2035,
+ "eval_samples_per_second": 16.073,
+ "eval_steps_per_second": 2.058,
+ "step": 249
 },
 {
 "epoch": 4.0,
+ "grad_norm": 7.56485652923584,
 "learning_rate": 3.6666666666666666e-05,
+ "loss": 3.084,
+ "step": 332
 },
 {
 "epoch": 4.0,
+ "eval_accuracy": 0.4962124510230736,
+ "eval_loss": 2.8743200302124023,
+ "eval_runtime": 9.3602,
+ "eval_samples_per_second": 17.521,
+ "eval_steps_per_second": 2.244,
+ "step": 332
 },
 {
 "epoch": 5.0,
+ "grad_norm": 7.547119617462158,
 "learning_rate": 3.3333333333333335e-05,
+ "loss": 2.8764,
+ "step": 415
 },
 {
 "epoch": 5.0,
+ "eval_accuracy": 0.5210972307154713,
+ "eval_loss": 2.7019972801208496,
+ "eval_runtime": 10.277,
+ "eval_samples_per_second": 15.958,
+ "eval_steps_per_second": 2.043,
+ "step": 415
 },
 {
 "epoch": 6.0,
+ "grad_norm": 6.975924968719482,
 "learning_rate": 3e-05,
+ "loss": 2.7367,
+ "step": 498
 },
 {
 "epoch": 6.0,
+ "eval_accuracy": 0.5187872505830526,
+ "eval_loss": 2.669877767562866,
+ "eval_runtime": 3.8263,
+ "eval_samples_per_second": 42.861,
+ "eval_steps_per_second": 5.488,
+ "step": 498
 },
 {
 "epoch": 7.0,
+ "grad_norm": 7.427117347717285,
 "learning_rate": 2.6666666666666667e-05,
+ "loss": 2.6275,
+ "step": 581
 },
 {
 "epoch": 7.0,
+ "eval_accuracy": 0.5403870967741935,
+ "eval_loss": 2.5638315677642822,
+ "eval_runtime": 3.776,
+ "eval_samples_per_second": 43.433,
+ "eval_steps_per_second": 5.562,
+ "step": 581
 },
 {
 "epoch": 8.0,
+ "grad_norm": 7.4915266036987305,
 "learning_rate": 2.3333333333333336e-05,
+ "loss": 2.5257,
+ "step": 664
 },
 {
 "epoch": 8.0,
+ "eval_accuracy": 0.5430055462628752,
+ "eval_loss": 2.5348384380340576,
+ "eval_runtime": 3.924,
+ "eval_samples_per_second": 41.794,
+ "eval_steps_per_second": 5.352,
+ "step": 664
 },
 {
 "epoch": 9.0,
+ "grad_norm": 7.47868013381958,
 "learning_rate": 2e-05,
+ "loss": 2.4742,
+ "step": 747
 },
 {
 "epoch": 9.0,
+ "eval_accuracy": 0.5590811583839829,
+ "eval_loss": 2.4301819801330566,
+ "eval_runtime": 3.7824,
+ "eval_samples_per_second": 43.359,
+ "eval_steps_per_second": 5.552,
+ "step": 747
 },
 {
 "epoch": 10.0,
+ "grad_norm": 7.228312015533447,
 "learning_rate": 1.6666666666666667e-05,
+ "loss": 2.4238,
+ "step": 830
 },
 {
 "epoch": 10.0,
+ "eval_accuracy": 0.5576721426074799,
+ "eval_loss": 2.4159433841705322,
+ "eval_runtime": 3.7919,
+ "eval_samples_per_second": 43.251,
+ "eval_steps_per_second": 5.538,
+ "step": 830
 },
 {
 "epoch": 11.0,
+ "grad_norm": 7.564913272857666,
 "learning_rate": 1.3333333333333333e-05,
+ "loss": 2.3516,
+ "step": 913
 },
 {
 "epoch": 11.0,
+ "eval_accuracy": 0.5740578439964943,
+ "eval_loss": 2.3461461067199707,
+ "eval_runtime": 3.8232,
+ "eval_samples_per_second": 42.896,
+ "eval_steps_per_second": 5.493,
+ "step": 913
 },
 {
 "epoch": 12.0,
+ "grad_norm": 7.104005336761475,
 "learning_rate": 1e-05,
+ "loss": 2.3115,
+ "step": 996
 },
 {
 "epoch": 12.0,
+ "eval_accuracy": 0.572778166550035,
+ "eval_loss": 2.329103469848633,
+ "eval_runtime": 3.8201,
+ "eval_samples_per_second": 42.93,
+ "eval_steps_per_second": 5.497,
+ "step": 996
 },
 {
 "epoch": 13.0,
+ "grad_norm": 7.211333751678467,
 "learning_rate": 6.666666666666667e-06,
+ "loss": 2.29,
+ "step": 1079
 },
 {
 "epoch": 13.0,
+ "eval_accuracy": 0.5698073370282396,
+ "eval_loss": 2.3577311038970947,
+ "eval_runtime": 3.8954,
+ "eval_samples_per_second": 42.101,
+ "eval_steps_per_second": 5.391,
+ "step": 1079
 },
 {
 "epoch": 14.0,
+ "grad_norm": 7.1609063148498535,
 "learning_rate": 3.3333333333333333e-06,
+ "loss": 2.2412,
+ "step": 1162
 },
 {
 "epoch": 14.0,
+ "eval_accuracy": 0.5673802421477452,
+ "eval_loss": 2.347292423248291,
+ "eval_runtime": 3.8169,
+ "eval_samples_per_second": 42.967,
+ "eval_steps_per_second": 5.502,
+ "step": 1162
 },
 {
 "epoch": 15.0,
+ "grad_norm": 6.575444221496582,
 "learning_rate": 0.0,
+ "loss": 2.245,
+ "step": 1245
 },
 {
 "epoch": 15.0,
+ "eval_accuracy": 0.5719677022994558,
+ "eval_loss": 2.3113090991973877,
+ "eval_runtime": 3.8203,
+ "eval_samples_per_second": 42.928,
+ "eval_steps_per_second": 5.497,
+ "step": 1245
 },
 {
 "epoch": 15.0,
+ "step": 1245,
+ "total_flos": 2605727798784000.0,
+ "train_loss": 2.876972158654148,
+ "train_runtime": 3151.4946,
+ "train_samples_per_second": 3.141,
+ "train_steps_per_second": 0.395
 }
 ],
 "logging_steps": 1,
+ "max_steps": 1245,
 "num_input_tokens_seen": 0,
 "num_train_epochs": 15,
 "save_steps": 1,

 "attributes": {}
 }
 },
+ "total_flos": 2605727798784000.0,
 "train_batch_size": 8,
 "trial_name": null,
 "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:a4fb11e754eb9b0cbb02a9fe5c043c7d02b1a27c6cb263e621b2865fa7ba734c
 size 5176
 
 version https://git-lfs.github.com/spec/v1
+ oid sha256:efb5343939497c2638a2c01c51241b6b8bee97a03d13857293cf84a9fe7f70e3
 size 5176