AbrahamSanders committed on
Commit
3cbed44
1 Parent(s): 2f95de8

Model update

README.md CHANGED
@@ -5,19 +5,19 @@ tags:
5
  metrics:
6
  - accuracy
7
  model-index:
8
- - name: rtchat-2.7b-no-anchor
9
  results: []
10
  ---
11
 
12
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
  should probably proofread and complete it, then remove this comment. -->
14
 
15
- # rtchat-2.7b-no-anchor
16
 
17
  This model is a fine-tuned version of [facebook/opt-2.7b](https://huggingface.co/facebook/opt-2.7b) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 2.0337
20
- - Accuracy: 0.7355
21
 
22
  ## Model description
23
 
@@ -36,33 +36,32 @@ More information needed
36
  ### Training hyperparameters
37
 
38
  The following hyperparameters were used during training:
39
- - learning_rate: 5e-05
40
  - train_batch_size: 1
41
  - eval_batch_size: 1
42
  - seed: 42
43
- - gradient_accumulation_steps: 32
44
- - total_train_batch_size: 32
45
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
46
  - lr_scheduler_type: linear
47
  - lr_scheduler_warmup_ratio: 0.1
48
- - num_epochs: 4.0
49
 
50
  ### Training results
51
 
52
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
53
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
54
- | 2.1457 | 0.5 | 163 | 2.1289 | 0.7255 |
55
- | 2.077 | 1.0 | 326 | 2.0780 | 0.7301 |
56
- | 1.8545 | 1.5 | 489 | 2.0484 | 0.7333 |
57
- | 1.852 | 2.0 | 652 | 2.0337 | 0.7355 |
58
- | 1.5892 | 2.51 | 815 | 2.0437 | 0.7366 |
59
- | 1.549 | 3.01 | 978 | 2.0590 | 0.7367 |
60
- | 1.3551 | 3.51 | 1141 | 2.0624 | 0.7373 |
61
 
62
 
63
  ### Framework versions
64
 
65
- - Transformers 4.27.2
66
- - Pytorch 1.13.1+cu117
67
  - Datasets 2.7.1
68
  - Tokenizers 0.12.1
 
5
  metrics:
6
  - accuracy
7
  model-index:
8
+ - name: opt-2.7b-realtime-chat-v2
9
  results: []
10
  ---
11
 
12
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
  should probably proofread and complete it, then remove this comment. -->
14
 
15
+ # opt-2.7b-realtime-chat-v2
16
 
17
  This model is a fine-tuned version of [facebook/opt-2.7b](https://huggingface.co/facebook/opt-2.7b) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 2.0888
20
+ - Accuracy: 0.6870
21
 
22
  ## Model description
23
 
 
36
  ### Training hyperparameters
37
 
38
  The following hyperparameters were used during training:
39
+ - learning_rate: 3e-05
40
  - train_batch_size: 1
41
  - eval_batch_size: 1
42
  - seed: 42
43
+ - gradient_accumulation_steps: 128
44
+ - total_train_batch_size: 128
45
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
46
  - lr_scheduler_type: linear
47
  - lr_scheduler_warmup_ratio: 0.1
48
+ - num_epochs: 3.0
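
The values above map one-to-one onto `transformers.TrainingArguments`. A minimal sketch under that assumption is shown below; the `output_dir`, the evaluation/save strategy, and the rest of the training script are illustrative placeholders, not taken from this commit.

```python
# Minimal sketch only: expresses the listed hyperparameters as TrainingArguments.
# output_dir and anything not listed in the card are assumptions.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="opt-2.7b-realtime-chat-v2",
    learning_rate=3e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=128,  # total train batch size = 1 * 128 = 128
    num_train_epochs=3.0,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    seed=42,
)
```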
49
 
50
  ### Training results
51
 
52
  | Training Loss | Epoch | Step | Validation Loss | Accuracy |
53
  |:-------------:|:-----:|:----:|:---------------:|:--------:|
54
+ | 2.0974 | 0.5 | 51 | 2.1267 | 0.6826 |
55
+ | 2.0842 | 1.0 | 102 | 2.0968 | 0.6859 |
56
+ | 1.9624 | 1.49 | 153 | 2.0936 | 0.6863 |
57
+ | 1.9476 | 1.99 | 204 | 2.0888 | 0.6870 |
58
+ | 1.888 | 2.49 | 255 | 2.0993 | 0.6864 |
59
+ | 1.8687 | 2.99 | 306 | 2.0994 | 0.6865 |
 
60
 
61
 
62
  ### Framework versions
63
 
64
+ - Transformers 4.28.1
65
+ - Pytorch 2.0.1+cu118
66
  - Datasets 2.7.1
67
  - Tokenizers 0.12.1
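
The updated model card contains no usage example; assuming the checkpoint is published under the committer's namespace as `AbrahamSanders/opt-2.7b-realtime-chat-v2` (the repo id is not stated in this diff), a minimal loading sketch looks like this:

```python
# Minimal usage sketch; the repo id is an assumption based on the committer's
# username and the model name in the card.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "AbrahamSanders/opt-2.7b-realtime-chat-v2"  # assumed Hub repo id
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id)  # ~11 GB of float32 weights per the shard sizes below

prompt = "Hello, how are you today?"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=40, do_sample=True, top_p=0.9)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```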
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "epoch": 4.0,
3
- "eval_accuracy": 0.7355063417557176,
4
- "eval_loss": 2.033745288848877,
5
- "eval_runtime": 355.7749,
6
- "eval_samples": 1178,
7
- "eval_samples_per_second": 3.311,
8
- "eval_steps_per_second": 3.311,
9
- "perplexity": 7.6426567803576875,
10
- "train_loss": 1.7419609642028808,
11
- "train_runtime": 52862.2699,
12
- "train_samples": 10407,
13
- "train_samples_per_second": 0.787,
14
- "train_steps_per_second": 0.025
15
  }
 
1
  {
2
+ "epoch": 2.99,
3
+ "eval_accuracy": 0.6870071488415761,
4
+ "eval_loss": 2.0888001918792725,
5
+ "eval_runtime": 509.5975,
6
+ "eval_samples": 1796,
7
+ "eval_samples_per_second": 3.524,
8
+ "eval_steps_per_second": 3.524,
9
+ "perplexity": 8.075220634386367,
10
+ "train_loss": 2.010221361334807,
11
+ "train_runtime": 46544.0603,
12
+ "train_samples": 13111,
13
+ "train_samples_per_second": 0.845,
14
+ "train_steps_per_second": 0.007
15
  }
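
The reported `perplexity` is just `exp(eval_loss)`, and the throughput figures follow from the sample counts and runtimes above; a quick arithmetic check (values copied from the new `all_results.json`):

```python
# Consistency check of the reported metrics.
import math

print(math.exp(2.0888001918792725))  # ~8.0752 -> matches "perplexity"
print(13111 * 3 / 46544.0603)        # ~0.845  -> matches "train_samples_per_second"
print(1796 / 509.5975)               # ~3.524  -> matches "eval_samples_per_second"
```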
config.json CHANGED
@@ -24,7 +24,7 @@
24
  "pad_token_id": 1,
25
  "prefix": "</s>",
26
  "torch_dtype": "float32",
27
- "transformers_version": "4.27.2",
28
  "use_cache": true,
29
  "vocab_size": 50265,
30
  "word_embed_proj_dim": 2560
 
24
  "pad_token_id": 1,
25
  "prefix": "</s>",
26
  "torch_dtype": "float32",
27
+ "transformers_version": "4.28.1",
28
  "use_cache": true,
29
  "vocab_size": 50265,
30
  "word_embed_proj_dim": 2560
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "epoch": 4.0,
3
- "eval_accuracy": 0.7355063417557176,
4
- "eval_loss": 2.033745288848877,
5
- "eval_runtime": 355.7749,
6
- "eval_samples": 1178,
7
- "eval_samples_per_second": 3.311,
8
- "eval_steps_per_second": 3.311,
9
- "perplexity": 7.6426567803576875
10
  }
 
1
  {
2
+ "epoch": 2.99,
3
+ "eval_accuracy": 0.6870071488415761,
4
+ "eval_loss": 2.0888001918792725,
5
+ "eval_runtime": 509.5975,
6
+ "eval_samples": 1796,
7
+ "eval_samples_per_second": 3.524,
8
+ "eval_steps_per_second": 3.524,
9
+ "perplexity": 8.075220634386367
10
  }
generation_config.json CHANGED
@@ -3,5 +3,5 @@
3
  "bos_token_id": 2,
4
  "eos_token_id": 2,
5
  "pad_token_id": 1,
6
- "transformers_version": "4.27.2"
7
  }
 
3
  "bos_token_id": 2,
4
  "eos_token_id": 2,
5
  "pad_token_id": 1,
6
+ "transformers_version": "4.28.1"
7
  }
pytorch_model-00001-of-00002.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:158d05894a42474f304f89cfa57df7893c9b91a103bfbc7dbf142da9d41d6959
3
  size 9977078723
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5663e8b1081fbaf6440fd6ab38f293ffa5eb67c0af62a2e7775192557c0eefed
3
  size 9977078723
pytorch_model-00002-of-00002.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6d50bb9b69f7eae4bae41a94273925ab8cafb75e501b4b4877851699fb027be
3
  size 1144136929
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e483afa16baf460b6d548b4fcefe15a95b96faca2a1d00300508694bf9535f90
3
  size 1144136929
runs/{Mar26_04-01-41_panacea/1679817763.2045138/events.out.tfevents.1679817763.panacea.358302.1 → May20_16-39-41_panacea/1684615241.5386765/events.out.tfevents.1684615241.panacea.344900.1} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f21c361bf973c104dcd1ca1e92c6ad1e7c1d6f9302342da0611f163dfd86cc57
3
- size 5793
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72486d16f01267fe4033873a7d5e8848b301f38f611bbcf77b5a2ffc8b55e3ae
3
+ size 5859
runs/{Mar26_04-01-41_panacea/events.out.tfevents.1679817763.panacea.358302.0 → May20_16-39-41_panacea/events.out.tfevents.1684615241.panacea.344900.0} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:739947ffe2812a4602ecc005867e78512fb2667b55613308d47adc3b60b5246e
3
- size 27016
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eba7fbc4b3b305a059bd4a5914673a837e6936dc8d13dcb4a52646a703572f66
3
+ size 30178
runs/{Mar26_04-01-41_panacea/events.out.tfevents.1679871090.panacea.358302.2 → May20_16-39-41_panacea/events.out.tfevents.1684662404.panacea.344900.2} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:abb82ba7863914d6c96a00d5375704549095fd0d11be2b1c99e4cb10041b7225
3
  size 363
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0dfc715017543caa195a56d8608c6c94bc4548f9b212793816cb9569e4dd3179
3
  size 363
tokenizer_config.json CHANGED
@@ -9,6 +9,7 @@
9
  "rstrip": false,
10
  "single_word": false
11
  },
 
12
  "eos_token": {
13
  "__type": "AddedToken",
14
  "content": "</s>",
@@ -27,7 +28,6 @@
27
  "rstrip": false,
28
  "single_word": false
29
  },
30
- "special_tokens_map_file": null,
31
  "tokenizer_class": "GPT2Tokenizer",
32
  "unk_token": {
33
  "__type": "AddedToken",
 
9
  "rstrip": false,
10
  "single_word": false
11
  },
12
+ "clean_up_tokenization_spaces": true,
13
  "eos_token": {
14
  "__type": "AddedToken",
15
  "content": "</s>",
 
28
  "rstrip": false,
29
  "single_word": false
30
  },
 
31
  "tokenizer_class": "GPT2Tokenizer",
32
  "unk_token": {
33
  "__type": "AddedToken",
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 4.0,
3
- "train_loss": 1.7419609642028808,
4
- "train_runtime": 52862.2699,
5
- "train_samples": 10407,
6
- "train_samples_per_second": 0.787,
7
- "train_steps_per_second": 0.025
8
  }
 
1
  {
2
+ "epoch": 2.99,
3
+ "train_loss": 2.010221361334807,
4
+ "train_runtime": 46544.0603,
5
+ "train_samples": 13111,
6
+ "train_samples_per_second": 0.845,
7
+ "train_steps_per_second": 0.007
8
  }
trainer_state.json CHANGED
@@ -1,868 +1,997 @@
1
  {
2
- "best_metric": 2.033745288848877,
3
- "best_model_checkpoint": "rtchat-2.7b-no-anchor/checkpoint-652",
4
- "epoch": 3.9973095032189874,
5
- "global_step": 1300,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
- "epoch": 0.03,
12
- "learning_rate": 3.846153846153847e-06,
13
- "loss": 2.5777,
14
- "step": 10
 
 
 
 
 
 
15
  },
16
  {
17
  "epoch": 0.06,
18
- "learning_rate": 7.692307692307694e-06,
19
- "loss": 2.3741,
20
- "step": 20
21
  },
22
  {
23
- "epoch": 0.09,
24
- "learning_rate": 1.153846153846154e-05,
25
- "loss": 2.2854,
26
- "step": 30
 
 
 
 
 
 
27
  },
28
  {
29
  "epoch": 0.12,
30
- "learning_rate": 1.5384615384615387e-05,
31
- "loss": 2.2656,
32
- "step": 40
33
  },
34
  {
35
- "epoch": 0.15,
36
- "learning_rate": 1.923076923076923e-05,
37
- "loss": 2.2325,
38
- "step": 50
 
 
 
 
 
 
39
  },
40
  {
41
  "epoch": 0.18,
42
- "learning_rate": 2.307692307692308e-05,
43
- "loss": 2.1996,
44
- "step": 60
45
  },
46
  {
47
- "epoch": 0.22,
48
- "learning_rate": 2.6923076923076923e-05,
49
- "loss": 2.1988,
50
- "step": 70
 
 
 
 
 
 
 
 
 
 
 
 
51
  },
52
  {
53
  "epoch": 0.25,
54
- "learning_rate": 3.0769230769230774e-05,
55
- "loss": 2.1614,
56
- "step": 80
57
  },
58
  {
59
- "epoch": 0.28,
60
- "learning_rate": 3.461538461538462e-05,
61
- "loss": 2.1778,
62
- "step": 90
 
 
 
 
 
 
63
  },
64
  {
65
  "epoch": 0.31,
66
- "learning_rate": 3.846153846153846e-05,
67
- "loss": 2.1483,
68
- "step": 100
69
  },
70
  {
71
- "epoch": 0.34,
72
- "learning_rate": 4.230769230769231e-05,
73
- "loss": 2.1579,
74
- "step": 110
 
 
 
 
 
 
75
  },
76
  {
77
  "epoch": 0.37,
78
- "learning_rate": 4.615384615384616e-05,
79
- "loss": 2.1716,
80
- "step": 120
81
  },
82
  {
83
- "epoch": 0.4,
84
- "learning_rate": 5e-05,
85
- "loss": 2.1204,
86
- "step": 130
 
 
 
 
 
 
87
  },
88
  {
89
  "epoch": 0.43,
90
- "learning_rate": 4.9572649572649575e-05,
91
- "loss": 2.1383,
92
- "step": 140
93
  },
94
  {
95
- "epoch": 0.46,
96
- "learning_rate": 4.9145299145299147e-05,
97
- "loss": 2.1599,
98
- "step": 150
 
 
 
 
 
 
99
  },
100
  {
101
  "epoch": 0.49,
102
- "learning_rate": 4.871794871794872e-05,
103
- "loss": 2.1457,
104
- "step": 160
105
  },
106
  {
107
  "epoch": 0.5,
108
- "eval_accuracy": 0.7255127544718182,
109
- "eval_loss": 2.128948450088501,
110
- "eval_runtime": 356.4721,
111
- "eval_samples_per_second": 3.305,
112
- "eval_steps_per_second": 3.305,
113
- "step": 163
114
  },
115
  {
116
- "epoch": 0.52,
117
- "learning_rate": 4.829059829059829e-05,
118
- "loss": 2.1584,
119
- "step": 170
 
 
 
 
 
 
120
  },
121
  {
122
  "epoch": 0.55,
123
- "learning_rate": 4.786324786324787e-05,
124
- "loss": 2.156,
125
- "step": 180
126
  },
127
  {
128
- "epoch": 0.58,
129
- "learning_rate": 4.7435897435897435e-05,
130
- "loss": 2.1439,
131
- "step": 190
 
 
 
 
 
 
132
  },
133
  {
134
  "epoch": 0.61,
135
- "learning_rate": 4.700854700854701e-05,
136
- "loss": 2.1457,
137
- "step": 200
138
  },
139
  {
140
- "epoch": 0.65,
141
- "learning_rate": 4.6581196581196586e-05,
142
- "loss": 2.1178,
143
- "step": 210
 
 
 
 
 
 
 
 
 
 
 
 
144
  },
145
  {
146
  "epoch": 0.68,
147
- "learning_rate": 4.615384615384616e-05,
148
- "loss": 2.1221,
149
- "step": 220
150
  },
151
  {
152
- "epoch": 0.71,
153
- "learning_rate": 4.572649572649573e-05,
154
- "loss": 2.1152,
155
- "step": 230
 
 
 
 
 
 
156
  },
157
  {
158
  "epoch": 0.74,
159
- "learning_rate": 4.52991452991453e-05,
160
- "loss": 2.0846,
161
- "step": 240
162
  },
163
  {
164
- "epoch": 0.77,
165
- "learning_rate": 4.4871794871794874e-05,
166
- "loss": 2.1165,
167
- "step": 250
 
 
 
 
 
 
168
  },
169
  {
170
  "epoch": 0.8,
171
- "learning_rate": 4.4444444444444447e-05,
172
- "loss": 2.0798,
173
- "step": 260
174
  },
175
  {
176
- "epoch": 0.83,
177
- "learning_rate": 4.401709401709402e-05,
178
- "loss": 2.1048,
179
- "step": 270
 
 
 
 
 
 
180
  },
181
  {
182
  "epoch": 0.86,
183
- "learning_rate": 4.358974358974359e-05,
184
- "loss": 2.0861,
185
- "step": 280
186
  },
187
  {
188
- "epoch": 0.89,
189
- "learning_rate": 4.316239316239317e-05,
190
- "loss": 2.1209,
191
- "step": 290
 
 
 
 
 
 
192
  },
193
  {
194
  "epoch": 0.92,
195
- "learning_rate": 4.2735042735042735e-05,
196
- "loss": 2.1218,
197
- "step": 300
 
 
 
 
 
 
198
  },
199
  {
200
- "epoch": 0.95,
201
- "learning_rate": 4.230769230769231e-05,
202
- "loss": 2.0771,
203
- "step": 310
204
  },
205
  {
206
  "epoch": 0.98,
207
- "learning_rate": 4.1880341880341886e-05,
208
- "loss": 2.077,
209
- "step": 320
210
  },
211
  {
212
  "epoch": 1.0,
213
- "eval_accuracy": 0.7300988977487162,
214
- "eval_loss": 2.078002452850342,
215
- "eval_runtime": 356.6324,
216
- "eval_samples_per_second": 3.303,
217
- "eval_steps_per_second": 3.303,
218
- "step": 326
 
 
 
 
 
 
 
 
 
 
 
 
219
  },
220
  {
221
- "epoch": 1.01,
222
- "learning_rate": 4.145299145299146e-05,
223
- "loss": 1.9599,
224
- "step": 330
225
  },
226
  {
227
  "epoch": 1.05,
228
- "learning_rate": 4.1025641025641023e-05,
229
- "loss": 1.8939,
230
- "step": 340
231
  },
232
  {
233
- "epoch": 1.08,
234
- "learning_rate": 4.05982905982906e-05,
235
- "loss": 1.8455,
236
- "step": 350
237
  },
238
  {
239
- "epoch": 1.11,
240
- "learning_rate": 4.0170940170940174e-05,
241
- "loss": 1.8533,
242
- "step": 360
243
  },
244
  {
245
- "epoch": 1.14,
246
- "learning_rate": 3.974358974358974e-05,
247
- "loss": 1.8971,
248
- "step": 370
249
  },
250
  {
251
- "epoch": 1.17,
252
- "learning_rate": 3.931623931623932e-05,
253
- "loss": 1.8823,
254
- "step": 380
255
  },
256
  {
257
- "epoch": 1.2,
258
- "learning_rate": 3.888888888888889e-05,
259
- "loss": 1.8622,
260
- "step": 390
261
  },
262
  {
263
- "epoch": 1.23,
264
- "learning_rate": 3.846153846153846e-05,
265
- "loss": 1.8613,
266
- "step": 400
267
  },
268
  {
269
- "epoch": 1.26,
270
- "learning_rate": 3.8034188034188035e-05,
271
- "loss": 1.8606,
272
- "step": 410
273
  },
274
  {
275
- "epoch": 1.29,
276
- "learning_rate": 3.760683760683761e-05,
277
- "loss": 1.8624,
278
- "step": 420
279
  },
280
  {
281
- "epoch": 1.32,
282
- "learning_rate": 3.717948717948718e-05,
283
- "loss": 1.8538,
284
- "step": 430
285
  },
286
  {
287
- "epoch": 1.35,
288
- "learning_rate": 3.675213675213676e-05,
289
- "loss": 1.8756,
290
- "step": 440
291
  },
292
  {
293
- "epoch": 1.38,
294
- "learning_rate": 3.6324786324786323e-05,
295
- "loss": 1.8423,
296
- "step": 450
297
  },
298
  {
299
- "epoch": 1.41,
300
- "learning_rate": 3.58974358974359e-05,
301
- "loss": 1.8616,
302
- "step": 460
303
  },
304
  {
305
- "epoch": 1.45,
306
- "learning_rate": 3.5470085470085474e-05,
307
- "loss": 1.8915,
308
- "step": 470
309
  },
310
  {
311
- "epoch": 1.48,
312
- "learning_rate": 3.504273504273504e-05,
313
- "loss": 1.8545,
314
- "step": 480
315
  },
316
  {
317
- "epoch": 1.5,
318
- "eval_accuracy": 0.733327595178899,
319
- "eval_loss": 2.048430919647217,
320
- "eval_runtime": 356.6932,
321
- "eval_samples_per_second": 3.303,
322
- "eval_steps_per_second": 3.303,
323
- "step": 489
324
  },
325
  {
326
- "epoch": 1.51,
327
- "learning_rate": 3.461538461538462e-05,
328
- "loss": 1.8426,
329
- "step": 490
330
  },
331
  {
332
- "epoch": 1.54,
333
- "learning_rate": 3.418803418803419e-05,
334
- "loss": 1.8433,
335
- "step": 500
336
  },
337
  {
338
- "epoch": 1.57,
339
- "learning_rate": 3.376068376068376e-05,
340
- "loss": 1.8482,
341
- "step": 510
342
  },
343
  {
344
- "epoch": 1.6,
345
- "learning_rate": 3.3333333333333335e-05,
346
- "loss": 1.8368,
347
- "step": 520
348
  },
349
  {
350
- "epoch": 1.63,
351
- "learning_rate": 3.290598290598291e-05,
352
- "loss": 1.861,
353
- "step": 530
354
  },
355
  {
356
- "epoch": 1.66,
357
- "learning_rate": 3.247863247863248e-05,
358
- "loss": 1.8525,
359
- "step": 540
360
  },
361
  {
362
- "epoch": 1.69,
363
- "learning_rate": 3.205128205128206e-05,
364
- "loss": 1.8455,
365
- "step": 550
366
  },
367
  {
368
- "epoch": 1.72,
369
- "learning_rate": 3.162393162393162e-05,
370
- "loss": 1.8529,
371
- "step": 560
 
 
 
372
  },
373
  {
374
- "epoch": 1.75,
375
- "learning_rate": 3.1196581196581195e-05,
376
- "loss": 1.8744,
377
- "step": 570
378
  },
379
  {
380
- "epoch": 1.78,
381
- "learning_rate": 3.0769230769230774e-05,
382
- "loss": 1.8321,
383
- "step": 580
384
  },
385
  {
386
- "epoch": 1.81,
387
- "learning_rate": 3.034188034188034e-05,
388
- "loss": 1.8503,
389
- "step": 590
390
  },
391
  {
392
- "epoch": 1.84,
393
- "learning_rate": 2.9914529914529915e-05,
394
- "loss": 1.8699,
395
- "step": 600
396
  },
397
  {
398
- "epoch": 1.88,
399
- "learning_rate": 2.948717948717949e-05,
400
- "loss": 1.8421,
401
- "step": 610
402
  },
403
  {
404
- "epoch": 1.91,
405
- "learning_rate": 2.9059829059829063e-05,
406
- "loss": 1.812,
407
- "step": 620
408
  },
409
  {
410
- "epoch": 1.94,
411
- "learning_rate": 2.863247863247863e-05,
412
- "loss": 1.823,
413
- "step": 630
414
  },
415
  {
416
- "epoch": 1.97,
417
- "learning_rate": 2.8205128205128207e-05,
418
- "loss": 1.8191,
419
- "step": 640
420
  },
421
  {
422
- "epoch": 2.0,
423
- "learning_rate": 2.777777777777778e-05,
424
- "loss": 1.852,
425
- "step": 650
426
  },
427
  {
428
- "epoch": 2.0,
429
- "eval_accuracy": 0.7355063417557176,
430
- "eval_loss": 2.033745288848877,
431
- "eval_runtime": 356.5312,
432
- "eval_samples_per_second": 3.304,
433
- "eval_steps_per_second": 3.304,
434
- "step": 652
435
  },
436
  {
437
- "epoch": 2.03,
438
- "learning_rate": 2.7350427350427355e-05,
439
- "loss": 1.5592,
440
- "step": 660
441
  },
442
  {
443
- "epoch": 2.06,
444
- "learning_rate": 2.6923076923076923e-05,
445
- "loss": 1.5729,
446
- "step": 670
447
  },
448
  {
449
- "epoch": 2.09,
450
- "learning_rate": 2.64957264957265e-05,
451
- "loss": 1.5714,
452
- "step": 680
453
  },
454
  {
455
- "epoch": 2.12,
456
- "learning_rate": 2.606837606837607e-05,
457
- "loss": 1.5907,
458
- "step": 690
459
  },
460
  {
461
- "epoch": 2.15,
462
- "learning_rate": 2.564102564102564e-05,
463
- "loss": 1.5564,
464
- "step": 700
465
  },
466
  {
467
- "epoch": 2.18,
468
- "learning_rate": 2.5213675213675215e-05,
469
- "loss": 1.5733,
470
- "step": 710
471
  },
472
  {
473
- "epoch": 2.21,
474
- "learning_rate": 2.4786324786324787e-05,
475
- "loss": 1.5873,
476
- "step": 720
477
  },
478
  {
479
- "epoch": 2.24,
480
- "learning_rate": 2.435897435897436e-05,
481
- "loss": 1.5907,
482
- "step": 730
483
  },
484
  {
485
- "epoch": 2.28,
486
- "learning_rate": 2.3931623931623935e-05,
487
- "loss": 1.5778,
488
- "step": 740
489
  },
490
  {
491
- "epoch": 2.31,
492
- "learning_rate": 2.3504273504273504e-05,
493
- "loss": 1.6039,
494
- "step": 750
495
  },
496
  {
497
- "epoch": 2.34,
498
- "learning_rate": 2.307692307692308e-05,
499
- "loss": 1.5579,
500
- "step": 760
501
  },
502
  {
503
- "epoch": 2.37,
504
- "learning_rate": 2.264957264957265e-05,
505
- "loss": 1.5854,
506
- "step": 770
507
  },
508
  {
509
- "epoch": 2.4,
510
- "learning_rate": 2.2222222222222223e-05,
511
- "loss": 1.5834,
512
- "step": 780
513
  },
514
  {
515
- "epoch": 2.43,
516
- "learning_rate": 2.1794871794871795e-05,
517
- "loss": 1.5733,
518
- "step": 790
519
  },
520
  {
521
- "epoch": 2.46,
522
- "learning_rate": 2.1367521367521368e-05,
523
- "loss": 1.5799,
524
- "step": 800
525
  },
526
  {
527
- "epoch": 2.49,
528
- "learning_rate": 2.0940170940170943e-05,
529
- "loss": 1.5892,
530
- "step": 810
531
  },
532
  {
533
- "epoch": 2.51,
534
- "eval_accuracy": 0.736644336047349,
535
- "eval_loss": 2.043654441833496,
536
- "eval_runtime": 357.9372,
537
- "eval_samples_per_second": 3.291,
538
- "eval_steps_per_second": 3.291,
539
- "step": 815
540
  },
541
  {
542
- "epoch": 2.52,
543
- "learning_rate": 2.0512820512820512e-05,
544
- "loss": 1.5754,
545
- "step": 820
546
  },
547
  {
548
- "epoch": 2.55,
549
- "learning_rate": 2.0085470085470087e-05,
550
- "loss": 1.5797,
551
- "step": 830
552
  },
553
  {
554
- "epoch": 2.58,
555
- "learning_rate": 1.965811965811966e-05,
556
- "loss": 1.59,
557
- "step": 840
558
  },
559
  {
560
- "epoch": 2.61,
561
- "learning_rate": 1.923076923076923e-05,
562
- "loss": 1.5638,
563
- "step": 850
564
  },
565
  {
566
- "epoch": 2.64,
567
- "learning_rate": 1.8803418803418804e-05,
568
- "loss": 1.5584,
569
- "step": 860
570
  },
571
  {
572
- "epoch": 2.68,
573
- "learning_rate": 1.837606837606838e-05,
574
- "loss": 1.6119,
575
- "step": 870
576
  },
577
  {
578
- "epoch": 2.71,
579
- "learning_rate": 1.794871794871795e-05,
580
- "loss": 1.5748,
581
- "step": 880
582
  },
583
  {
584
- "epoch": 2.74,
585
- "learning_rate": 1.752136752136752e-05,
586
- "loss": 1.5728,
587
- "step": 890
588
  },
589
  {
590
- "epoch": 2.77,
591
- "learning_rate": 1.7094017094017095e-05,
592
- "loss": 1.597,
593
- "step": 900
594
  },
595
  {
596
- "epoch": 2.8,
597
- "learning_rate": 1.6666666666666667e-05,
598
- "loss": 1.53,
599
- "step": 910
600
  },
601
  {
602
- "epoch": 2.83,
603
- "learning_rate": 1.623931623931624e-05,
604
- "loss": 1.5694,
605
- "step": 920
606
  },
607
  {
608
- "epoch": 2.86,
609
- "learning_rate": 1.581196581196581e-05,
610
- "loss": 1.5615,
611
- "step": 930
612
  },
613
  {
614
- "epoch": 2.89,
615
- "learning_rate": 1.5384615384615387e-05,
616
- "loss": 1.5645,
617
- "step": 940
618
  },
619
  {
620
- "epoch": 2.92,
621
- "learning_rate": 1.4957264957264958e-05,
622
- "loss": 1.5877,
623
- "step": 950
624
  },
625
  {
626
- "epoch": 2.95,
627
- "learning_rate": 1.4529914529914531e-05,
628
- "loss": 1.5585,
629
- "step": 960
630
  },
631
  {
632
- "epoch": 2.98,
633
- "learning_rate": 1.4102564102564104e-05,
634
- "loss": 1.549,
635
- "step": 970
636
  },
637
  {
638
- "epoch": 3.01,
639
- "eval_accuracy": 0.7367106971463413,
640
- "eval_loss": 2.059032678604126,
641
- "eval_runtime": 357.6465,
642
- "eval_samples_per_second": 3.294,
643
- "eval_steps_per_second": 3.294,
644
- "step": 978
645
  },
646
  {
647
- "epoch": 3.01,
648
- "learning_rate": 1.3675213675213677e-05,
649
- "loss": 1.4812,
650
- "step": 980
651
  },
652
  {
653
- "epoch": 3.04,
654
- "learning_rate": 1.324786324786325e-05,
655
- "loss": 1.395,
656
- "step": 990
657
  },
658
  {
659
- "epoch": 3.07,
660
- "learning_rate": 1.282051282051282e-05,
661
- "loss": 1.3595,
662
- "step": 1000
663
  },
664
  {
665
- "epoch": 3.11,
666
- "learning_rate": 1.2393162393162394e-05,
667
- "loss": 1.3567,
668
- "step": 1010
669
  },
670
  {
671
- "epoch": 3.14,
672
- "learning_rate": 1.1965811965811967e-05,
673
- "loss": 1.3633,
674
- "step": 1020
675
  },
676
  {
677
- "epoch": 3.17,
678
- "learning_rate": 1.153846153846154e-05,
679
- "loss": 1.4042,
680
- "step": 1030
681
  },
682
  {
683
- "epoch": 3.2,
684
- "learning_rate": 1.1111111111111112e-05,
685
- "loss": 1.3838,
686
- "step": 1040
687
  },
688
  {
689
- "epoch": 3.23,
690
- "learning_rate": 1.0683760683760684e-05,
691
- "loss": 1.3808,
692
- "step": 1050
693
  },
694
  {
695
- "epoch": 3.26,
696
- "learning_rate": 1.0256410256410256e-05,
697
- "loss": 1.3894,
698
- "step": 1060
 
 
 
699
  },
700
  {
701
- "epoch": 3.29,
702
- "learning_rate": 9.82905982905983e-06,
703
- "loss": 1.3941,
704
- "step": 1070
705
  },
706
  {
707
- "epoch": 3.32,
708
- "learning_rate": 9.401709401709402e-06,
709
- "loss": 1.3586,
710
- "step": 1080
711
  },
712
  {
713
- "epoch": 3.35,
714
- "learning_rate": 8.974358974358976e-06,
715
- "loss": 1.3731,
716
- "step": 1090
717
  },
718
  {
719
- "epoch": 3.38,
720
- "learning_rate": 8.547008547008548e-06,
721
- "loss": 1.3742,
722
- "step": 1100
723
  },
724
  {
725
- "epoch": 3.41,
726
- "learning_rate": 8.11965811965812e-06,
727
- "loss": 1.3785,
728
- "step": 1110
729
  },
730
  {
731
- "epoch": 3.44,
732
- "learning_rate": 7.692307692307694e-06,
733
- "loss": 1.3669,
734
- "step": 1120
735
  },
736
  {
737
- "epoch": 3.47,
738
- "learning_rate": 7.264957264957266e-06,
739
- "loss": 1.3435,
740
- "step": 1130
741
  },
742
  {
743
- "epoch": 3.51,
744
- "learning_rate": 6.837606837606839e-06,
745
- "loss": 1.3551,
746
- "step": 1140
747
  },
748
  {
749
- "epoch": 3.51,
750
- "eval_accuracy": 0.7373401420258927,
751
- "eval_loss": 2.0624454021453857,
752
- "eval_runtime": 356.4154,
753
- "eval_samples_per_second": 3.305,
754
- "eval_steps_per_second": 3.305,
755
- "step": 1141
756
  },
757
  {
758
- "epoch": 3.54,
759
- "learning_rate": 6.41025641025641e-06,
760
- "loss": 1.3672,
761
- "step": 1150
762
  },
763
  {
764
- "epoch": 3.57,
765
- "learning_rate": 5.982905982905984e-06,
766
- "loss": 1.3426,
767
- "step": 1160
768
  },
769
  {
770
- "epoch": 3.6,
771
- "learning_rate": 5.555555555555556e-06,
772
- "loss": 1.3789,
773
- "step": 1170
774
  },
775
  {
776
- "epoch": 3.63,
777
- "learning_rate": 5.128205128205128e-06,
778
- "loss": 1.3926,
779
- "step": 1180
780
  },
781
  {
782
- "epoch": 3.66,
783
- "learning_rate": 4.700854700854701e-06,
784
- "loss": 1.383,
785
- "step": 1190
786
  },
787
  {
788
- "epoch": 3.69,
789
- "learning_rate": 4.273504273504274e-06,
790
- "loss": 1.3808,
791
- "step": 1200
792
  },
793
  {
794
- "epoch": 3.72,
795
- "learning_rate": 3.846153846153847e-06,
796
- "loss": 1.3616,
797
- "step": 1210
798
  },
799
  {
800
- "epoch": 3.75,
801
- "learning_rate": 3.4188034188034193e-06,
802
- "loss": 1.3928,
803
- "step": 1220
804
  },
805
  {
806
- "epoch": 3.78,
807
- "learning_rate": 2.991452991452992e-06,
808
- "loss": 1.3687,
809
- "step": 1230
810
  },
811
  {
812
- "epoch": 3.81,
813
- "learning_rate": 2.564102564102564e-06,
814
- "loss": 1.3664,
815
- "step": 1240
816
  },
817
  {
818
- "epoch": 3.84,
819
- "learning_rate": 2.136752136752137e-06,
820
- "loss": 1.3667,
821
- "step": 1250
822
  },
823
  {
824
- "epoch": 3.87,
825
- "learning_rate": 1.7094017094017097e-06,
826
- "loss": 1.3415,
827
- "step": 1260
828
  },
829
  {
830
- "epoch": 3.91,
831
- "learning_rate": 1.282051282051282e-06,
832
- "loss": 1.3677,
833
- "step": 1270
834
  },
835
  {
836
- "epoch": 3.94,
837
- "learning_rate": 8.547008547008548e-07,
838
- "loss": 1.3587,
839
- "step": 1280
840
  },
841
  {
842
- "epoch": 3.97,
843
- "learning_rate": 4.273504273504274e-07,
844
- "loss": 1.4172,
845
- "step": 1290
846
  },
847
  {
848
- "epoch": 4.0,
849
- "learning_rate": 0.0,
850
- "loss": 1.3551,
851
- "step": 1300
852
  },
853
  {
854
- "epoch": 4.0,
855
- "step": 1300,
856
- "total_flos": 4.918117451664077e+17,
857
- "train_loss": 1.7419609642028808,
858
- "train_runtime": 52862.2699,
859
- "train_samples_per_second": 0.787,
860
- "train_steps_per_second": 0.025
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
861
  }
862
  ],
863
- "max_steps": 1300,
864
- "num_train_epochs": 4,
865
- "total_flos": 4.918117451664077e+17,
866
  "trial_name": null,
867
  "trial_params": null
868
  }
 
1
  {
2
+ "best_metric": 0.6870071488415761,
3
+ "best_model_checkpoint": "opt-2.7b-realtime-chat-v2/checkpoint-204",
4
+ "epoch": 2.9874151475859962,
5
+ "global_step": 306,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
9
  "log_history": [
10
  {
11
+ "epoch": 0.02,
12
+ "learning_rate": 1.935483870967742e-06,
13
+ "loss": 2.6432,
14
+ "step": 2
15
+ },
16
+ {
17
+ "epoch": 0.04,
18
+ "learning_rate": 3.870967741935484e-06,
19
+ "loss": 2.6152,
20
+ "step": 4
21
  },
22
  {
23
  "epoch": 0.06,
24
+ "learning_rate": 5.8064516129032256e-06,
25
+ "loss": 2.5393,
26
+ "step": 6
27
  },
28
  {
29
+ "epoch": 0.08,
30
+ "learning_rate": 7.741935483870968e-06,
31
+ "loss": 2.4402,
32
+ "step": 8
33
+ },
34
+ {
35
+ "epoch": 0.1,
36
+ "learning_rate": 9.67741935483871e-06,
37
+ "loss": 2.3797,
38
+ "step": 10
39
  },
40
  {
41
  "epoch": 0.12,
42
+ "learning_rate": 1.1612903225806451e-05,
43
+ "loss": 2.3533,
44
+ "step": 12
45
  },
46
  {
47
+ "epoch": 0.14,
48
+ "learning_rate": 1.3548387096774194e-05,
49
+ "loss": 2.307,
50
+ "step": 14
51
+ },
52
+ {
53
+ "epoch": 0.16,
54
+ "learning_rate": 1.5483870967741936e-05,
55
+ "loss": 2.266,
56
+ "step": 16
57
  },
58
  {
59
  "epoch": 0.18,
60
+ "learning_rate": 1.741935483870968e-05,
61
+ "loss": 2.2859,
62
+ "step": 18
63
  },
64
  {
65
+ "epoch": 0.2,
66
+ "learning_rate": 1.935483870967742e-05,
67
+ "loss": 2.2434,
68
+ "step": 20
69
+ },
70
+ {
71
+ "epoch": 0.21,
72
+ "learning_rate": 2.1290322580645163e-05,
73
+ "loss": 2.2224,
74
+ "step": 22
75
+ },
76
+ {
77
+ "epoch": 0.23,
78
+ "learning_rate": 2.3225806451612902e-05,
79
+ "loss": 2.2518,
80
+ "step": 24
81
  },
82
  {
83
  "epoch": 0.25,
84
+ "learning_rate": 2.5161290322580648e-05,
85
+ "loss": 2.232,
86
+ "step": 26
87
  },
88
  {
89
+ "epoch": 0.27,
90
+ "learning_rate": 2.7096774193548387e-05,
91
+ "loss": 2.2121,
92
+ "step": 28
93
+ },
94
+ {
95
+ "epoch": 0.29,
96
+ "learning_rate": 2.903225806451613e-05,
97
+ "loss": 2.2042,
98
+ "step": 30
99
  },
100
  {
101
  "epoch": 0.31,
102
+ "learning_rate": 2.9890909090909092e-05,
103
+ "loss": 2.1655,
104
+ "step": 32
105
  },
106
  {
107
+ "epoch": 0.33,
108
+ "learning_rate": 2.9672727272727274e-05,
109
+ "loss": 2.197,
110
+ "step": 34
111
+ },
112
+ {
113
+ "epoch": 0.35,
114
+ "learning_rate": 2.9454545454545456e-05,
115
+ "loss": 2.2383,
116
+ "step": 36
117
  },
118
  {
119
  "epoch": 0.37,
120
+ "learning_rate": 2.9236363636363635e-05,
121
+ "loss": 2.1827,
122
+ "step": 38
123
  },
124
  {
125
+ "epoch": 0.39,
126
+ "learning_rate": 2.901818181818182e-05,
127
+ "loss": 2.1667,
128
+ "step": 40
129
+ },
130
+ {
131
+ "epoch": 0.41,
132
+ "learning_rate": 2.88e-05,
133
+ "loss": 2.1688,
134
+ "step": 42
135
  },
136
  {
137
  "epoch": 0.43,
138
+ "learning_rate": 2.8581818181818184e-05,
139
+ "loss": 2.1722,
140
+ "step": 44
141
  },
142
  {
143
+ "epoch": 0.45,
144
+ "learning_rate": 2.8363636363636363e-05,
145
+ "loss": 2.1771,
146
+ "step": 46
147
+ },
148
+ {
149
+ "epoch": 0.47,
150
+ "learning_rate": 2.8145454545454548e-05,
151
+ "loss": 2.1681,
152
+ "step": 48
153
  },
154
  {
155
  "epoch": 0.49,
156
+ "learning_rate": 2.7927272727272727e-05,
157
+ "loss": 2.0974,
158
+ "step": 50
159
  },
160
  {
161
  "epoch": 0.5,
162
+ "eval_accuracy": 0.6825691202041083,
163
+ "eval_loss": 2.1267497539520264,
164
+ "eval_runtime": 510.4375,
165
+ "eval_samples_per_second": 3.519,
166
+ "eval_steps_per_second": 3.519,
167
+ "step": 51
168
  },
169
  {
170
+ "epoch": 0.51,
171
+ "learning_rate": 2.770909090909091e-05,
172
+ "loss": 2.1322,
173
+ "step": 52
174
+ },
175
+ {
176
+ "epoch": 0.53,
177
+ "learning_rate": 2.749090909090909e-05,
178
+ "loss": 2.1157,
179
+ "step": 54
180
  },
181
  {
182
  "epoch": 0.55,
183
+ "learning_rate": 2.7272727272727273e-05,
184
+ "loss": 2.1325,
185
+ "step": 56
186
  },
187
  {
188
+ "epoch": 0.57,
189
+ "learning_rate": 2.7054545454545455e-05,
190
+ "loss": 2.1346,
191
+ "step": 58
192
+ },
193
+ {
194
+ "epoch": 0.59,
195
+ "learning_rate": 2.6836363636363637e-05,
196
+ "loss": 2.143,
197
+ "step": 60
198
  },
199
  {
200
  "epoch": 0.61,
201
+ "learning_rate": 2.661818181818182e-05,
202
+ "loss": 2.143,
203
+ "step": 62
204
  },
205
  {
206
+ "epoch": 0.62,
207
+ "learning_rate": 2.64e-05,
208
+ "loss": 2.1353,
209
+ "step": 64
210
+ },
211
+ {
212
+ "epoch": 0.64,
213
+ "learning_rate": 2.618181818181818e-05,
214
+ "loss": 2.1296,
215
+ "step": 66
216
+ },
217
+ {
218
+ "epoch": 0.66,
219
+ "learning_rate": 2.5963636363636365e-05,
220
+ "loss": 2.117,
221
+ "step": 68
222
  },
223
  {
224
  "epoch": 0.68,
225
+ "learning_rate": 2.5745454545454544e-05,
226
+ "loss": 2.1093,
227
+ "step": 70
228
  },
229
  {
230
+ "epoch": 0.7,
231
+ "learning_rate": 2.552727272727273e-05,
232
+ "loss": 2.1206,
233
+ "step": 72
234
+ },
235
+ {
236
+ "epoch": 0.72,
237
+ "learning_rate": 2.5309090909090908e-05,
238
+ "loss": 2.1308,
239
+ "step": 74
240
  },
241
  {
242
  "epoch": 0.74,
243
+ "learning_rate": 2.509090909090909e-05,
244
+ "loss": 2.1258,
245
+ "step": 76
246
  },
247
  {
248
+ "epoch": 0.76,
249
+ "learning_rate": 2.4872727272727272e-05,
250
+ "loss": 2.1113,
251
+ "step": 78
252
+ },
253
+ {
254
+ "epoch": 0.78,
255
+ "learning_rate": 2.4654545454545454e-05,
256
+ "loss": 2.0913,
257
+ "step": 80
258
  },
259
  {
260
  "epoch": 0.8,
261
+ "learning_rate": 2.4436363636363636e-05,
262
+ "loss": 2.1067,
263
+ "step": 82
264
  },
265
  {
266
+ "epoch": 0.82,
267
+ "learning_rate": 2.421818181818182e-05,
268
+ "loss": 2.1034,
269
+ "step": 84
270
+ },
271
+ {
272
+ "epoch": 0.84,
273
+ "learning_rate": 2.4e-05,
274
+ "loss": 2.0804,
275
+ "step": 86
276
  },
277
  {
278
  "epoch": 0.86,
279
+ "learning_rate": 2.3781818181818183e-05,
280
+ "loss": 2.1228,
281
+ "step": 88
282
  },
283
  {
284
+ "epoch": 0.88,
285
+ "learning_rate": 2.356363636363636e-05,
286
+ "loss": 2.1044,
287
+ "step": 90
288
+ },
289
+ {
290
+ "epoch": 0.9,
291
+ "learning_rate": 2.3345454545454547e-05,
292
+ "loss": 2.1045,
293
+ "step": 92
294
  },
295
  {
296
  "epoch": 0.92,
297
+ "learning_rate": 2.3127272727272725e-05,
298
+ "loss": 2.0758,
299
+ "step": 94
300
+ },
301
+ {
302
+ "epoch": 0.94,
303
+ "learning_rate": 2.290909090909091e-05,
304
+ "loss": 2.1185,
305
+ "step": 96
306
  },
307
  {
308
+ "epoch": 0.96,
309
+ "learning_rate": 2.269090909090909e-05,
310
+ "loss": 2.0844,
311
+ "step": 98
312
  },
313
  {
314
  "epoch": 0.98,
315
+ "learning_rate": 2.2472727272727275e-05,
316
+ "loss": 2.0817,
317
+ "step": 100
318
  },
319
  {
320
  "epoch": 1.0,
321
+ "learning_rate": 2.2254545454545454e-05,
322
+ "loss": 2.0842,
323
+ "step": 102
324
+ },
325
+ {
326
+ "epoch": 1.0,
327
+ "eval_accuracy": 0.6858646182863831,
328
+ "eval_loss": 2.096801519393921,
329
+ "eval_runtime": 510.9612,
330
+ "eval_samples_per_second": 3.515,
331
+ "eval_steps_per_second": 3.515,
332
+ "step": 102
333
+ },
334
+ {
335
+ "epoch": 1.02,
336
+ "learning_rate": 2.2036363636363636e-05,
337
+ "loss": 2.0166,
338
+ "step": 104
339
  },
340
  {
341
+ "epoch": 1.03,
342
+ "learning_rate": 2.1818181818181818e-05,
343
+ "loss": 1.9987,
344
+ "step": 106
345
  },
346
  {
347
  "epoch": 1.05,
348
+ "learning_rate": 2.16e-05,
349
+ "loss": 1.9678,
350
+ "step": 108
351
  },
352
  {
353
+ "epoch": 1.07,
354
+ "learning_rate": 2.1381818181818182e-05,
355
+ "loss": 2.0026,
356
+ "step": 110
357
  },
358
  {
359
+ "epoch": 1.09,
360
+ "learning_rate": 2.1163636363636364e-05,
361
+ "loss": 1.9867,
362
+ "step": 112
363
  },
364
  {
365
+ "epoch": 1.11,
366
+ "learning_rate": 2.0945454545454546e-05,
367
+ "loss": 2.0086,
368
+ "step": 114
369
  },
370
  {
371
+ "epoch": 1.13,
372
+ "learning_rate": 2.0727272727272728e-05,
373
+ "loss": 1.9611,
374
+ "step": 116
375
  },
376
  {
377
+ "epoch": 1.15,
378
+ "learning_rate": 2.0509090909090907e-05,
379
+ "loss": 1.9681,
380
+ "step": 118
381
  },
382
  {
383
+ "epoch": 1.17,
384
+ "learning_rate": 2.0290909090909092e-05,
385
+ "loss": 1.9623,
386
+ "step": 120
387
  },
388
  {
389
+ "epoch": 1.19,
390
+ "learning_rate": 2.0072727272727274e-05,
391
+ "loss": 1.9776,
392
+ "step": 122
393
  },
394
  {
395
+ "epoch": 1.21,
396
+ "learning_rate": 1.9854545454545456e-05,
397
+ "loss": 1.999,
398
+ "step": 124
399
  },
400
  {
401
+ "epoch": 1.23,
402
+ "learning_rate": 1.963636363636364e-05,
403
+ "loss": 1.9827,
404
+ "step": 126
405
  },
406
  {
407
+ "epoch": 1.25,
408
+ "learning_rate": 1.9418181818181817e-05,
409
+ "loss": 1.9945,
410
+ "step": 128
411
  },
412
  {
413
+ "epoch": 1.27,
414
+ "learning_rate": 1.9200000000000003e-05,
415
+ "loss": 1.9692,
416
+ "step": 130
417
  },
418
  {
419
+ "epoch": 1.29,
420
+ "learning_rate": 1.898181818181818e-05,
421
+ "loss": 1.9519,
422
+ "step": 132
423
  },
424
  {
425
+ "epoch": 1.31,
426
+ "learning_rate": 1.8763636363636367e-05,
427
+ "loss": 1.9432,
428
+ "step": 134
429
  },
430
  {
431
+ "epoch": 1.33,
432
+ "learning_rate": 1.8545454545454545e-05,
433
+ "loss": 1.9454,
434
+ "step": 136
435
  },
436
  {
437
+ "epoch": 1.35,
438
+ "learning_rate": 1.832727272727273e-05,
439
+ "loss": 1.9759,
440
+ "step": 138
 
 
 
441
  },
442
  {
443
+ "epoch": 1.37,
444
+ "learning_rate": 1.810909090909091e-05,
445
+ "loss": 1.9672,
446
+ "step": 140
447
  },
448
  {
449
+ "epoch": 1.39,
450
+ "learning_rate": 1.789090909090909e-05,
451
+ "loss": 1.9527,
452
+ "step": 142
453
  },
454
  {
455
+ "epoch": 1.41,
456
+ "learning_rate": 1.7672727272727274e-05,
457
+ "loss": 1.9465,
458
+ "step": 144
459
  },
460
  {
461
+ "epoch": 1.43,
462
+ "learning_rate": 1.7454545454545456e-05,
463
+ "loss": 1.9948,
464
+ "step": 146
465
  },
466
  {
467
+ "epoch": 1.44,
468
+ "learning_rate": 1.7236363636363638e-05,
469
+ "loss": 1.9685,
470
+ "step": 148
471
  },
472
  {
473
+ "epoch": 1.46,
474
+ "learning_rate": 1.701818181818182e-05,
475
+ "loss": 1.9518,
476
+ "step": 150
477
  },
478
  {
479
+ "epoch": 1.48,
480
+ "learning_rate": 1.6800000000000002e-05,
481
+ "loss": 1.9624,
482
+ "step": 152
483
  },
484
  {
485
+ "epoch": 1.49,
486
+ "eval_accuracy": 0.6862566888822462,
487
+ "eval_loss": 2.0935797691345215,
488
+ "eval_runtime": 510.8965,
489
+ "eval_samples_per_second": 3.515,
490
+ "eval_steps_per_second": 3.515,
491
+ "step": 153
492
  },
493
  {
494
+ "epoch": 1.5,
495
+ "learning_rate": 1.6581818181818184e-05,
496
+ "loss": 1.9587,
497
+ "step": 154
498
  },
499
  {
500
+ "epoch": 1.52,
501
+ "learning_rate": 1.6363636363636363e-05,
502
+ "loss": 1.9752,
503
+ "step": 156
504
  },
505
  {
506
+ "epoch": 1.54,
507
+ "learning_rate": 1.6145454545454548e-05,
508
+ "loss": 1.9559,
509
+ "step": 158
510
  },
511
  {
512
+ "epoch": 1.56,
513
+ "learning_rate": 1.5927272727272727e-05,
514
+ "loss": 1.9707,
515
+ "step": 160
516
  },
517
  {
518
+ "epoch": 1.58,
519
+ "learning_rate": 1.5709090909090912e-05,
520
+ "loss": 1.9844,
521
+ "step": 162
522
  },
523
  {
524
+ "epoch": 1.6,
525
+ "learning_rate": 1.549090909090909e-05,
526
+ "loss": 1.9884,
527
+ "step": 164
528
  },
529
  {
530
+ "epoch": 1.62,
531
+ "learning_rate": 1.5272727272727273e-05,
532
+ "loss": 1.9799,
533
+ "step": 166
534
  },
535
  {
536
+ "epoch": 1.64,
537
+ "learning_rate": 1.5054545454545455e-05,
538
+ "loss": 1.9582,
539
+ "step": 168
540
  },
541
  {
542
+ "epoch": 1.66,
543
+ "learning_rate": 1.4836363636363637e-05,
544
+ "loss": 1.9677,
545
+ "step": 170
546
  },
547
  {
548
+ "epoch": 1.68,
549
+ "learning_rate": 1.4618181818181817e-05,
550
+ "loss": 1.9804,
551
+ "step": 172
 
 
 
552
  },
553
  {
554
+ "epoch": 1.7,
555
+ "learning_rate": 1.44e-05,
556
+ "loss": 1.9816,
557
+ "step": 174
558
  },
559
  {
560
+ "epoch": 1.72,
561
+ "learning_rate": 1.4181818181818181e-05,
562
+ "loss": 1.969,
563
+ "step": 176
564
  },
565
  {
566
+ "epoch": 1.74,
567
+ "learning_rate": 1.3963636363636363e-05,
568
+ "loss": 1.9602,
569
+ "step": 178
570
  },
571
  {
572
+ "epoch": 1.76,
573
+ "learning_rate": 1.3745454545454546e-05,
574
+ "loss": 1.9926,
575
+ "step": 180
576
  },
577
  {
578
+ "epoch": 1.78,
579
+ "learning_rate": 1.3527272727272728e-05,
580
+ "loss": 1.9378,
581
+ "step": 182
582
  },
583
  {
584
+ "epoch": 1.8,
585
+ "learning_rate": 1.330909090909091e-05,
586
+ "loss": 1.9567,
587
+ "step": 184
588
  },
589
  {
590
+ "epoch": 1.82,
591
+ "learning_rate": 1.309090909090909e-05,
592
+ "loss": 1.9849,
593
+ "step": 186
594
  },
595
  {
596
+ "epoch": 1.84,
597
+ "learning_rate": 1.2872727272727272e-05,
598
+ "loss": 1.9622,
599
+ "step": 188
600
  },
601
  {
602
+ "epoch": 1.85,
603
+ "learning_rate": 1.2654545454545454e-05,
604
+ "loss": 1.9447,
605
+ "step": 190
606
  },
607
  {
608
+ "epoch": 1.87,
609
+ "learning_rate": 1.2436363636363636e-05,
610
+ "loss": 1.958,
611
+ "step": 192
612
  },
613
  {
614
+ "epoch": 1.89,
615
+ "learning_rate": 1.2218181818181818e-05,
616
+ "loss": 1.9729,
617
+ "step": 194
618
  },
619
  {
620
+ "epoch": 1.91,
621
+ "learning_rate": 1.2e-05,
622
+ "loss": 1.9402,
623
+ "step": 196
624
  },
625
  {
626
+ "epoch": 1.93,
627
+ "learning_rate": 1.178181818181818e-05,
628
+ "loss": 1.977,
629
+ "step": 198
630
  },
631
  {
632
+ "epoch": 1.95,
633
+ "learning_rate": 1.1563636363636363e-05,
634
+ "loss": 1.9561,
635
+ "step": 200
636
  },
637
  {
638
+ "epoch": 1.97,
639
+ "learning_rate": 1.1345454545454545e-05,
640
+ "loss": 1.9576,
641
+ "step": 202
642
  },
643
  {
644
+ "epoch": 1.99,
645
+ "learning_rate": 1.1127272727272727e-05,
646
+ "loss": 1.9476,
647
+ "step": 204
648
  },
649
  {
650
+ "epoch": 1.99,
651
+ "eval_accuracy": 0.6870071488415761,
652
+ "eval_loss": 2.0888001918792725,
653
+ "eval_runtime": 510.9696,
654
+ "eval_samples_per_second": 3.515,
655
+ "eval_steps_per_second": 3.515,
656
+ "step": 204
657
  },
658
  {
659
+ "epoch": 2.01,
660
+ "learning_rate": 1.0909090909090909e-05,
661
+ "loss": 1.9169,
662
+ "step": 206
663
  },
664
  {
665
+ "epoch": 2.03,
666
+ "learning_rate": 1.0690909090909091e-05,
667
+ "loss": 1.8484,
668
+ "step": 208
669
  },
670
  {
671
+ "epoch": 2.05,
672
+ "learning_rate": 1.0472727272727273e-05,
673
+ "loss": 1.8525,
674
+ "step": 210
675
  },
676
  {
677
+ "epoch": 2.07,
678
+ "learning_rate": 1.0254545454545453e-05,
679
+ "loss": 1.8692,
680
+ "step": 212
681
  },
682
  {
683
+ "epoch": 2.09,
684
+ "learning_rate": 1.0036363636363637e-05,
685
+ "loss": 1.8738,
686
+ "step": 214
687
  },
688
  {
689
+ "epoch": 2.11,
690
+ "learning_rate": 9.81818181818182e-06,
691
+ "loss": 1.8889,
692
+ "step": 216
693
  },
694
  {
695
+ "epoch": 2.13,
696
+ "learning_rate": 9.600000000000001e-06,
697
+ "loss": 1.8661,
698
+ "step": 218
699
  },
700
  {
701
+ "epoch": 2.15,
702
+ "learning_rate": 9.381818181818183e-06,
703
+ "loss": 1.8699,
704
+ "step": 220
705
  },
706
  {
707
+ "epoch": 2.17,
708
+ "learning_rate": 9.163636363636365e-06,
709
+ "loss": 1.8902,
710
+ "step": 222
711
  },
712
  {
713
+ "epoch": 2.19,
714
+ "learning_rate": 8.945454545454546e-06,
715
+ "loss": 1.8766,
716
+ "step": 224
717
  },
718
  {
719
+ "epoch": 2.21,
720
+ "learning_rate": 8.727272727272728e-06,
721
+ "loss": 1.8634,
722
+ "step": 226
723
  },
724
  {
725
+ "epoch": 2.23,
726
+ "learning_rate": 8.50909090909091e-06,
727
+ "loss": 1.8549,
728
+ "step": 228
729
  },
730
  {
731
+ "epoch": 2.25,
732
+ "learning_rate": 8.290909090909092e-06,
733
+ "loss": 1.8592,
734
+ "step": 230
735
  },
736
  {
737
+ "epoch": 2.26,
738
+ "learning_rate": 8.072727272727274e-06,
739
+ "loss": 1.8695,
740
+ "step": 232
741
  },
742
  {
743
+ "epoch": 2.28,
744
+ "learning_rate": 7.854545454545456e-06,
745
+ "loss": 1.8753,
746
+ "step": 234
747
  },
748
  {
749
+ "epoch": 2.3,
750
+ "learning_rate": 7.636363636363636e-06,
751
+ "loss": 1.8599,
752
+ "step": 236
753
  },
754
  {
755
+ "epoch": 2.32,
756
+ "learning_rate": 7.4181818181818185e-06,
757
+ "loss": 1.8847,
758
+ "step": 238
 
 
 
759
  },
760
  {
761
+ "epoch": 2.34,
762
+ "learning_rate": 7.2e-06,
763
+ "loss": 1.8712,
764
+ "step": 240
765
  },
766
  {
767
+ "epoch": 2.36,
768
+ "learning_rate": 6.981818181818182e-06,
769
+ "loss": 1.8673,
770
+ "step": 242
771
  },
772
  {
773
+ "epoch": 2.38,
774
+ "learning_rate": 6.763636363636364e-06,
775
+ "loss": 1.8617,
776
+ "step": 244
777
  },
778
  {
779
+ "epoch": 2.4,
780
+ "learning_rate": 6.545454545454545e-06,
781
+ "loss": 1.8297,
782
+ "step": 246
783
  },
784
  {
785
+ "epoch": 2.42,
786
+ "learning_rate": 6.327272727272727e-06,
787
+ "loss": 1.8624,
788
+ "step": 248
789
  },
790
  {
791
+ "epoch": 2.44,
792
+ "learning_rate": 6.109090909090909e-06,
793
+ "loss": 1.8753,
794
+ "step": 250
795
  },
796
  {
797
+ "epoch": 2.46,
798
+ "learning_rate": 5.89090909090909e-06,
799
+ "loss": 1.8789,
800
+ "step": 252
801
  },
802
  {
803
+ "epoch": 2.48,
804
+ "learning_rate": 5.672727272727272e-06,
805
+ "loss": 1.888,
806
+ "step": 254
807
  },
808
  {
809
+ "epoch": 2.49,
810
+ "eval_accuracy": 0.6863803621574514,
811
+ "eval_loss": 2.0992937088012695,
812
+ "eval_runtime": 511.1737,
813
+ "eval_samples_per_second": 3.513,
814
+ "eval_steps_per_second": 3.513,
815
+ "step": 255
816
  },
817
  {
818
+ "epoch": 2.5,
819
+ "learning_rate": 5.4545454545454545e-06,
820
+ "loss": 1.8462,
821
+ "step": 256
822
  },
823
  {
824
+ "epoch": 2.52,
825
+ "learning_rate": 5.2363636363636365e-06,
826
+ "loss": 1.8716,
827
+ "step": 258
828
  },
829
  {
830
+ "epoch": 2.54,
831
+ "learning_rate": 5.0181818181818186e-06,
832
+ "loss": 1.8823,
833
+ "step": 260
834
  },
835
  {
836
+ "epoch": 2.56,
837
+ "learning_rate": 4.800000000000001e-06,
838
+ "loss": 1.8718,
839
+ "step": 262
840
  },
841
  {
842
+ "epoch": 2.58,
843
+ "learning_rate": 4.581818181818183e-06,
844
+ "loss": 1.8745,
845
+ "step": 264
846
  },
847
  {
848
+ "epoch": 2.6,
849
+ "learning_rate": 4.363636363636364e-06,
850
+ "loss": 1.908,
851
+ "step": 266
852
  },
853
  {
854
+ "epoch": 2.62,
855
+ "learning_rate": 4.145454545454546e-06,
856
+ "loss": 1.8412,
857
+ "step": 268
858
  },
859
  {
860
+ "epoch": 2.64,
861
+ "learning_rate": 3.927272727272728e-06,
862
+ "loss": 1.8462,
863
+ "step": 270
864
  },
865
  {
866
+ "epoch": 2.66,
867
+ "learning_rate": 3.7090909090909092e-06,
868
+ "loss": 1.8683,
869
+ "step": 272
 
 
 
870
  },
871
  {
872
+ "epoch": 2.68,
873
+ "learning_rate": 3.490909090909091e-06,
874
+ "loss": 1.872,
875
+ "step": 274
876
  },
877
  {
878
+ "epoch": 2.69,
879
+ "learning_rate": 3.2727272727272725e-06,
880
+ "loss": 1.852,
881
+ "step": 276
882
  },
883
  {
884
+ "epoch": 2.71,
885
+ "learning_rate": 3.0545454545454546e-06,
886
+ "loss": 1.8619,
887
+ "step": 278
888
  },
889
  {
890
+ "epoch": 2.73,
891
+ "learning_rate": 2.836363636363636e-06,
892
+ "loss": 1.8542,
893
+ "step": 280
894
  },
895
  {
896
+ "epoch": 2.75,
897
+ "learning_rate": 2.6181818181818183e-06,
898
+ "loss": 1.8953,
899
+ "step": 282
900
  },
901
  {
902
+ "epoch": 2.77,
903
+ "learning_rate": 2.4000000000000003e-06,
904
+ "loss": 1.8341,
905
+ "step": 284
906
  },
907
  {
908
+ "epoch": 2.79,
909
+ "learning_rate": 2.181818181818182e-06,
910
+ "loss": 1.8484,
911
+ "step": 286
912
  },
913
  {
914
+ "epoch": 2.81,
915
+ "learning_rate": 1.963636363636364e-06,
916
+ "loss": 1.8473,
917
+ "step": 288
918
  },
919
  {
920
+ "epoch": 2.83,
921
+ "learning_rate": 1.7454545454545454e-06,
922
+ "loss": 1.8827,
923
+ "step": 290
924
  },
925
  {
926
+ "epoch": 2.85,
927
+ "learning_rate": 1.5272727272727273e-06,
928
+ "loss": 1.8572,
929
+ "step": 292
930
  },
931
  {
932
+ "epoch": 2.87,
933
+ "learning_rate": 1.3090909090909091e-06,
934
+ "loss": 1.8721,
935
+ "step": 294
936
  },
937
  {
938
+ "epoch": 2.89,
939
+ "learning_rate": 1.090909090909091e-06,
940
+ "loss": 1.8363,
941
+ "step": 296
942
  },
943
  {
944
+ "epoch": 2.91,
945
+ "learning_rate": 8.727272727272727e-07,
946
+ "loss": 1.8762,
947
+ "step": 298
948
  },
949
  {
950
+ "epoch": 2.93,
951
+ "learning_rate": 6.545454545454546e-07,
952
+ "loss": 1.8749,
953
+ "step": 300
954
  },
955
  {
956
+ "epoch": 2.95,
957
+ "learning_rate": 4.3636363636363636e-07,
958
+ "loss": 1.8522,
959
+ "step": 302
960
  },
961
  {
962
+ "epoch": 2.97,
963
+ "learning_rate": 2.1818181818181818e-07,
964
+ "loss": 1.8716,
965
+ "step": 304
966
  },
967
  {
968
+ "epoch": 2.99,
969
+ "learning_rate": 0.0,
970
+ "loss": 1.8687,
971
+ "step": 306
972
+ },
973
+ {
974
+ "epoch": 2.99,
975
+ "eval_accuracy": 0.6864508822377811,
976
+ "eval_loss": 2.0993993282318115,
977
+ "eval_runtime": 511.0772,
978
+ "eval_samples_per_second": 3.514,
979
+ "eval_steps_per_second": 3.514,
980
+ "step": 306
981
+ },
982
+ {
983
+ "epoch": 2.99,
984
+ "step": 306,
985
+ "total_flos": 4.2741192183760896e+17,
986
+ "train_loss": 2.010221361334807,
987
+ "train_runtime": 46544.0603,
988
+ "train_samples_per_second": 0.845,
989
+ "train_steps_per_second": 0.007
990
  }
991
  ],
992
+ "max_steps": 306,
993
+ "num_train_epochs": 3,
994
+ "total_flos": 4.2741192183760896e+17,
995
  "trial_name": null,
996
  "trial_params": null
997
  }
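
The step counts in the new `trainer_state.json` are consistent with the hyperparameters: with a per-device batch of 1 and 128 gradient-accumulation steps, one optimizer step consumes 128 samples, so 13,111 training samples give about 102 steps per epoch and 306 steps over 3 epochs, with evaluations logged every half epoch (steps 51, 102, ..., 306). A quick check:

```python
# Relates the training hyperparameters to the step counts logged above.
train_samples = 13111             # from train_results.json
total_train_batch_size = 1 * 128  # per-device batch * gradient accumulation
steps_per_epoch = train_samples // total_train_batch_size
print(steps_per_epoch)            # 102 -> eval logged at steps 51, 102, ...
print(3 * steps_per_epoch)        # 306 -> matches "max_steps" and "global_step"
```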
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:418b5fa25c2039e356036e9aca24417d092f2a76c689cd926136a74b8a063dc2
3
  size 3579
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a71ef2368d7982b99f77d7631603c4ef031cdc2d890bf705f3324ac76a0bcffc
3
  size 3579