theblackcat102 commited on
Commit
03c9cb9
1 Parent(s): 05b1d74

Upload 8 files

Browse files
added_tokens.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "<|assistant|>": 128002,
3
- "<|prefix_begin|>": 128001,
4
  "<|prefix_end|>": 128004,
5
- "<|prompter|>": 128005,
6
  "<|system|>": 128003,
7
  "[MASK]": 128000
8
  }
 
1
  {
2
+ "<|assistant|>": 128001,
3
+ "<|prefix_begin|>": 128005,
4
  "<|prefix_end|>": 128004,
5
+ "<|prompter|>": 128002,
6
  "<|system|>": 128003,
7
  "[MASK]": 128000
8
  }
config.json CHANGED
@@ -14,22 +14,26 @@
14
  "3": "sexual_content",
15
  "4": "quality",
16
  "5": "toxicity",
17
- "6": "humor",
18
- "7": "creativity",
19
- "8": "violence"
 
 
20
  },
21
  "initializer_range": 0.02,
22
  "intermediate_size": 4096,
23
  "label2id": {
24
  "reward": 0,
25
  "not_appropriate": 1,
 
26
  "hate_speech": 2,
27
  "sexual_content": 3,
28
  "quality": 4,
29
  "toxicity": 5,
30
- "humor": 6,
31
- "creativity": 7,
32
- "violence": 8
 
33
  },
34
  "layer_norm_eps": 1e-07,
35
  "max_position_embeddings": 512,
@@ -51,7 +55,7 @@
51
  "relative_attention": true,
52
  "share_att_key": true,
53
  "torch_dtype": "float16",
54
- "transformers_version": "4.28.0.dev0",
55
  "type_vocab_size": 0,
56
  "vocab_size": 128100
57
  }
 
14
  "3": "sexual_content",
15
  "4": "quality",
16
  "5": "toxicity",
17
+ "6": "fails_task",
18
+ "7": "humor",
19
+ "8": "creativity",
20
+ "9": "violence",
21
+ "10": "spam"
22
  },
23
  "initializer_range": 0.02,
24
  "intermediate_size": 4096,
25
  "label2id": {
26
  "reward": 0,
27
  "not_appropriate": 1,
28
+ "spam": 10,
29
  "hate_speech": 2,
30
  "sexual_content": 3,
31
  "quality": 4,
32
  "toxicity": 5,
33
+ "fails_task": 6,
34
+ "humor": 7,
35
+ "creativity": 8,
36
+ "violence": 9
37
  },
38
  "layer_norm_eps": 1e-07,
39
  "max_position_embeddings": 512,
 
55
  "relative_attention": true,
56
  "share_att_key": true,
57
  "torch_dtype": "float16",
58
+ "transformers_version": "4.28.1",
59
  "type_vocab_size": 0,
60
  "vocab_size": 128100
61
  }
special_tokens_map.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "additional_special_tokens": [
3
- "<|prefix_begin|>",
4
  "<|assistant|>",
 
5
  "<|system|>",
6
  "<|prefix_end|>",
7
- "<|prompter|>"
8
  ],
9
  "bos_token": "[CLS]",
10
  "cls_token": "[CLS]",
 
1
  {
2
  "additional_special_tokens": [
 
3
  "<|assistant|>",
4
+ "<|prompter|>",
5
  "<|system|>",
6
  "<|prefix_end|>",
7
+ "<|prefix_begin|>"
8
  ],
9
  "bos_token": "[CLS]",
10
  "cls_token": "[CLS]",
tokenizer.json CHANGED
@@ -50,7 +50,7 @@
50
  },
51
  {
52
  "id": 128001,
53
- "content": "<|prefix_begin|>",
54
  "single_word": false,
55
  "lstrip": false,
56
  "rstrip": false,
@@ -59,7 +59,7 @@
59
  },
60
  {
61
  "id": 128002,
62
- "content": "<|assistant|>",
63
  "single_word": false,
64
  "lstrip": false,
65
  "rstrip": false,
@@ -86,7 +86,7 @@
86
  },
87
  {
88
  "id": 128005,
89
- "content": "<|prompter|>",
90
  "single_word": false,
91
  "lstrip": false,
92
  "rstrip": false,
 
50
  },
51
  {
52
  "id": 128001,
53
+ "content": "<|assistant|>",
54
  "single_word": false,
55
  "lstrip": false,
56
  "rstrip": false,
 
59
  },
60
  {
61
  "id": 128002,
62
+ "content": "<|prompter|>",
63
  "single_word": false,
64
  "lstrip": false,
65
  "rstrip": false,
 
86
  },
87
  {
88
  "id": 128005,
89
+ "content": "<|prefix_begin|>",
90
  "single_word": false,
91
  "lstrip": false,
92
  "rstrip": false,
tokenizer_config.json CHANGED
@@ -9,7 +9,6 @@
9
  "pad_token": "[PAD]",
10
  "sep_token": "[SEP]",
11
  "sp_model_kwargs": {},
12
- "special_tokens_map_file": null,
13
  "split_by_punct": false,
14
  "tokenizer_class": "DebertaV2Tokenizer",
15
  "unk_token": "[UNK]",
 
9
  "pad_token": "[PAD]",
10
  "sep_token": "[SEP]",
11
  "sp_model_kwargs": {},
 
12
  "split_by_punct": false,
13
  "tokenizer_class": "DebertaV2Tokenizer",
14
  "unk_token": "[UNK]",
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.8449049873931491,
5
  "global_step": 1500,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
@@ -9,967 +9,941 @@
9
  "log_history": [
10
  {
11
  "epoch": 0.01,
12
- "learning_rate": 4.5801353078493935e-06,
13
- "loss": 0.8726,
14
  "step": 10
15
  },
16
  {
17
  "epoch": 0.02,
18
- "learning_rate": 7.087352805422317e-06,
19
- "loss": 0.8672,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 0.04,
24
- "learning_rate": 8.228161798644422e-06,
25
- "loss": 0.8565,
26
  "step": 30
27
  },
28
  {
29
  "epoch": 0.05,
30
- "learning_rate": 9.014161010104347e-06,
31
- "loss": 0.8213,
32
  "step": 40
33
  },
34
  {
35
  "epoch": 0.06,
36
- "learning_rate": 9.673229499590088e-06,
37
- "loss": 0.7977,
38
  "step": 50
39
  },
40
  {
41
  "epoch": 0.07,
42
- "learning_rate": 9.987442444537463e-06,
43
- "loss": 0.7521,
44
  "step": 60
45
  },
46
  {
47
  "epoch": 0.09,
48
- "learning_rate": 9.94558392632901e-06,
49
- "loss": 0.7423,
50
  "step": 70
51
  },
52
  {
53
  "epoch": 0.1,
54
- "learning_rate": 9.903725408120554e-06,
55
- "loss": 0.7026,
56
  "step": 80
57
  },
58
  {
59
  "epoch": 0.11,
60
- "learning_rate": 9.861866889912098e-06,
61
- "loss": 0.6808,
62
  "step": 90
63
  },
64
  {
65
  "epoch": 0.12,
66
- "learning_rate": 9.820008371703642e-06,
67
- "loss": 0.6901,
68
  "step": 100
69
  },
70
  {
71
  "epoch": 0.14,
72
- "learning_rate": 9.778149853495186e-06,
73
- "loss": 0.6514,
74
  "step": 110
75
  },
76
  {
77
  "epoch": 0.15,
78
- "learning_rate": 9.736291335286732e-06,
79
- "loss": 0.6467,
80
  "step": 120
81
  },
82
  {
83
  "epoch": 0.16,
84
- "learning_rate": 9.694432817078277e-06,
85
- "loss": 0.6641,
86
  "step": 130
87
  },
88
  {
89
  "epoch": 0.17,
90
- "learning_rate": 9.652574298869821e-06,
91
- "loss": 0.6616,
92
  "step": 140
93
  },
94
  {
95
  "epoch": 0.18,
96
- "learning_rate": 9.610715780661365e-06,
97
- "loss": 0.6617,
98
  "step": 150
99
  },
100
  {
101
  "epoch": 0.2,
102
- "learning_rate": 9.56885726245291e-06,
103
- "loss": 0.6295,
104
  "step": 160
105
  },
106
  {
107
  "epoch": 0.21,
108
- "learning_rate": 9.526998744244454e-06,
109
- "loss": 0.6071,
110
  "step": 170
111
  },
112
  {
113
  "epoch": 0.22,
114
- "learning_rate": 9.485140226036e-06,
115
- "loss": 0.6354,
116
  "step": 180
117
  },
118
  {
119
  "epoch": 0.23,
120
- "learning_rate": 9.443281707827544e-06,
121
- "loss": 0.6211,
122
  "step": 190
123
  },
124
  {
125
  "epoch": 0.25,
126
- "learning_rate": 9.401423189619088e-06,
127
- "loss": 0.6272,
128
  "step": 200
129
  },
130
  {
131
  "epoch": 0.26,
132
- "learning_rate": 9.359564671410633e-06,
133
- "loss": 0.641,
134
  "step": 210
135
  },
136
  {
137
  "epoch": 0.27,
138
- "learning_rate": 9.317706153202177e-06,
139
- "loss": 0.6205,
140
  "step": 220
141
  },
142
  {
143
  "epoch": 0.28,
144
- "learning_rate": 9.275847634993721e-06,
145
- "loss": 0.6263,
146
  "step": 230
147
  },
148
  {
149
  "epoch": 0.3,
150
- "learning_rate": 9.233989116785267e-06,
151
- "loss": 0.6148,
152
  "step": 240
153
  },
154
  {
155
  "epoch": 0.31,
156
- "learning_rate": 9.192130598576812e-06,
157
- "loss": 0.6521,
158
  "step": 250
159
  },
160
- {
161
- "epoch": 0.31,
162
- "eval_oasst_export_w_label_accuracy": 0.7346153846153847,
163
- "eval_oasst_export_w_label_kendalltau": 0.3859706959706959,
164
- "eval_oasst_export_w_label_loss": 0.626953125,
165
- "eval_oasst_export_w_label_neg_score": 0.2529296875,
166
- "eval_oasst_export_w_label_pos_score": 0.91064453125,
167
- "eval_oasst_export_w_label_runtime": 190.5015,
168
- "eval_oasst_export_w_label_samples_per_second": 9.554,
169
- "eval_oasst_export_w_label_score_diff": 0.65771484375,
170
- "eval_oasst_export_w_label_steps_per_second": 2.388,
171
- "step": 251
172
- },
173
  {
174
  "epoch": 0.32,
175
- "learning_rate": 9.150272080368356e-06,
176
- "loss": 0.602,
177
  "step": 260
178
  },
179
  {
180
  "epoch": 0.33,
181
- "learning_rate": 9.1084135621599e-06,
182
- "loss": 0.5637,
183
  "step": 270
184
  },
185
  {
186
  "epoch": 0.34,
187
- "learning_rate": 9.066555043951444e-06,
188
- "loss": 0.5948,
189
  "step": 280
190
  },
191
  {
192
  "epoch": 0.36,
193
- "learning_rate": 9.02469652574299e-06,
194
- "loss": 0.6245,
195
  "step": 290
196
  },
197
  {
198
  "epoch": 0.37,
199
- "learning_rate": 8.982838007534535e-06,
200
- "loss": 0.5943,
201
  "step": 300
202
  },
203
  {
204
  "epoch": 0.38,
205
- "learning_rate": 8.940979489326079e-06,
206
- "loss": 0.5934,
207
  "step": 310
208
  },
209
  {
210
  "epoch": 0.39,
211
- "learning_rate": 8.899120971117623e-06,
212
- "loss": 0.5426,
213
  "step": 320
214
  },
215
  {
216
  "epoch": 0.41,
217
- "learning_rate": 8.857262452909168e-06,
218
- "loss": 0.6191,
219
  "step": 330
220
  },
221
  {
222
  "epoch": 0.42,
223
- "learning_rate": 8.815403934700712e-06,
224
- "loss": 0.6316,
225
  "step": 340
226
  },
227
  {
228
  "epoch": 0.43,
229
- "learning_rate": 8.773545416492258e-06,
230
- "loss": 0.611,
231
  "step": 350
232
  },
233
  {
234
  "epoch": 0.44,
235
- "learning_rate": 8.7316868982838e-06,
236
- "loss": 0.6434,
237
  "step": 360
238
  },
239
  {
240
  "epoch": 0.46,
241
- "learning_rate": 8.689828380075346e-06,
242
- "loss": 0.6327,
243
  "step": 370
244
  },
245
  {
246
  "epoch": 0.47,
247
- "learning_rate": 8.64796986186689e-06,
248
- "loss": 0.6021,
249
  "step": 380
250
  },
251
  {
252
  "epoch": 0.48,
253
- "learning_rate": 8.606111343658435e-06,
254
- "loss": 0.5986,
255
  "step": 390
256
  },
257
  {
258
  "epoch": 0.49,
259
- "learning_rate": 8.564252825449981e-06,
260
- "loss": 0.5798,
261
  "step": 400
262
  },
263
  {
264
  "epoch": 0.5,
265
- "learning_rate": 8.522394307241524e-06,
266
- "loss": 0.6055,
267
  "step": 410
268
  },
269
  {
270
  "epoch": 0.52,
271
- "learning_rate": 8.480535789033068e-06,
272
- "loss": 0.6172,
273
  "step": 420
274
  },
275
  {
276
  "epoch": 0.53,
277
- "learning_rate": 8.438677270824614e-06,
278
- "loss": 0.6657,
279
  "step": 430
280
  },
281
  {
282
  "epoch": 0.54,
283
- "learning_rate": 8.396818752616158e-06,
284
- "loss": 0.6206,
285
  "step": 440
286
  },
287
  {
288
  "epoch": 0.55,
289
- "learning_rate": 8.354960234407702e-06,
290
- "loss": 0.6177,
291
  "step": 450
292
  },
293
  {
294
  "epoch": 0.57,
295
- "learning_rate": 8.313101716199248e-06,
296
- "loss": 0.6201,
297
  "step": 460
298
  },
299
  {
300
  "epoch": 0.58,
301
- "learning_rate": 8.271243197990791e-06,
302
- "loss": 0.5299,
303
  "step": 470
304
  },
305
  {
306
  "epoch": 0.59,
307
- "learning_rate": 8.229384679782337e-06,
308
- "loss": 0.6733,
309
  "step": 480
310
  },
311
  {
312
  "epoch": 0.6,
313
- "learning_rate": 8.187526161573881e-06,
314
- "loss": 0.6546,
315
  "step": 490
316
  },
317
  {
318
  "epoch": 0.61,
319
- "learning_rate": 8.145667643365426e-06,
320
- "loss": 0.6234,
321
  "step": 500
322
  },
323
  {
324
- "epoch": 0.62,
325
- "eval_oasst_export_w_label_accuracy": 0.7423076923076923,
326
- "eval_oasst_export_w_label_kendalltau": 0.3931501831501834,
327
- "eval_oasst_export_w_label_loss": 0.60888671875,
328
- "eval_oasst_export_w_label_neg_score": 0.53515625,
329
- "eval_oasst_export_w_label_pos_score": 1.1513671875,
330
- "eval_oasst_export_w_label_runtime": 190.2556,
331
- "eval_oasst_export_w_label_samples_per_second": 9.566,
332
- "eval_oasst_export_w_label_score_diff": 0.6162109375,
333
- "eval_oasst_export_w_label_steps_per_second": 2.392,
334
- "step": 502
335
  },
336
  {
337
  "epoch": 0.63,
338
- "learning_rate": 8.10380912515697e-06,
339
- "loss": 0.6087,
340
  "step": 510
341
  },
342
  {
343
  "epoch": 0.64,
344
- "learning_rate": 8.061950606948514e-06,
345
- "loss": 0.6345,
346
  "step": 520
347
  },
348
  {
349
  "epoch": 0.65,
350
- "learning_rate": 8.020092088740058e-06,
351
- "loss": 0.5881,
352
  "step": 530
353
  },
354
  {
355
  "epoch": 0.66,
356
- "learning_rate": 7.978233570531604e-06,
357
- "loss": 0.5954,
358
  "step": 540
359
  },
360
  {
361
  "epoch": 0.68,
362
- "learning_rate": 7.936375052323149e-06,
363
- "loss": 0.61,
364
  "step": 550
365
  },
366
  {
367
  "epoch": 0.69,
368
- "learning_rate": 7.894516534114693e-06,
369
- "loss": 0.5808,
370
  "step": 560
371
  },
372
  {
373
  "epoch": 0.7,
374
- "learning_rate": 7.852658015906237e-06,
375
- "loss": 0.6579,
376
  "step": 570
377
  },
378
  {
379
  "epoch": 0.71,
380
- "learning_rate": 7.810799497697782e-06,
381
- "loss": 0.6155,
382
  "step": 580
383
  },
384
  {
385
  "epoch": 0.73,
386
- "learning_rate": 7.768940979489327e-06,
387
- "loss": 0.6049,
388
  "step": 590
389
  },
390
  {
391
  "epoch": 0.74,
392
- "learning_rate": 7.727082461280872e-06,
393
- "loss": 0.6135,
394
  "step": 600
395
  },
396
  {
397
  "epoch": 0.75,
398
- "learning_rate": 7.685223943072416e-06,
399
- "loss": 0.5793,
400
  "step": 610
401
  },
402
  {
403
  "epoch": 0.76,
404
- "learning_rate": 7.64336542486396e-06,
405
- "loss": 0.6091,
406
  "step": 620
407
  },
408
  {
409
  "epoch": 0.77,
410
- "learning_rate": 7.601506906655505e-06,
411
- "loss": 0.6196,
412
  "step": 630
413
  },
414
  {
415
  "epoch": 0.79,
416
- "learning_rate": 7.55964838844705e-06,
417
- "loss": 0.5951,
418
  "step": 640
419
  },
420
  {
421
  "epoch": 0.8,
422
- "learning_rate": 7.517789870238594e-06,
423
- "loss": 0.6366,
424
  "step": 650
425
  },
426
  {
427
  "epoch": 0.81,
428
- "learning_rate": 7.475931352030139e-06,
429
- "loss": 0.6109,
430
  "step": 660
431
  },
432
  {
433
  "epoch": 0.82,
434
- "learning_rate": 7.434072833821683e-06,
435
- "loss": 0.5869,
436
  "step": 670
437
  },
438
  {
439
  "epoch": 0.84,
440
- "learning_rate": 7.392214315613228e-06,
441
- "loss": 0.612,
442
  "step": 680
443
  },
444
  {
445
  "epoch": 0.85,
446
- "learning_rate": 7.350355797404772e-06,
447
- "loss": 0.6019,
448
  "step": 690
449
  },
450
  {
451
  "epoch": 0.86,
452
- "learning_rate": 7.308497279196317e-06,
453
- "loss": 0.6265,
454
  "step": 700
455
  },
456
  {
457
  "epoch": 0.87,
458
- "learning_rate": 7.2666387609878615e-06,
459
- "loss": 0.5994,
460
  "step": 710
461
  },
462
  {
463
  "epoch": 0.89,
464
- "learning_rate": 7.224780242779407e-06,
465
- "loss": 0.5607,
466
  "step": 720
467
  },
468
  {
469
  "epoch": 0.9,
470
- "learning_rate": 7.18292172457095e-06,
471
- "loss": 0.5985,
472
  "step": 730
473
  },
474
  {
475
  "epoch": 0.91,
476
- "learning_rate": 7.141063206362495e-06,
477
- "loss": 0.5915,
478
  "step": 740
479
  },
480
  {
481
  "epoch": 0.92,
482
- "learning_rate": 7.09920468815404e-06,
483
- "loss": 0.5442,
484
  "step": 750
485
  },
486
  {
487
  "epoch": 0.93,
488
- "eval_oasst_export_w_label_accuracy": 0.7478021978021978,
489
- "eval_oasst_export_w_label_kendalltau": 0.4016483516483517,
490
- "eval_oasst_export_w_label_loss": 0.62255859375,
491
- "eval_oasst_export_w_label_neg_score": -0.1964111328125,
492
- "eval_oasst_export_w_label_pos_score": 0.82470703125,
493
- "eval_oasst_export_w_label_runtime": 190.3739,
494
- "eval_oasst_export_w_label_samples_per_second": 9.56,
495
- "eval_oasst_export_w_label_score_diff": 1.021484375,
496
- "eval_oasst_export_w_label_steps_per_second": 2.39,
497
- "step": 753
498
- },
499
- {
500
- "epoch": 0.93,
501
- "learning_rate": 7.057346169945585e-06,
502
- "loss": 0.6559,
503
  "step": 760
504
  },
505
  {
506
  "epoch": 0.95,
507
- "learning_rate": 7.01548765173713e-06,
508
- "loss": 0.5909,
509
  "step": 770
510
  },
511
  {
512
  "epoch": 0.96,
513
- "learning_rate": 6.973629133528673e-06,
514
- "loss": 0.6263,
515
  "step": 780
516
  },
517
  {
518
  "epoch": 0.97,
519
- "learning_rate": 6.931770615320218e-06,
520
- "loss": 0.6288,
521
  "step": 790
522
  },
523
  {
524
  "epoch": 0.98,
525
- "learning_rate": 6.889912097111763e-06,
526
- "loss": 0.5627,
527
  "step": 800
528
  },
529
  {
530
  "epoch": 1.0,
531
- "learning_rate": 6.848053578903308e-06,
532
- "loss": 0.6076,
533
  "step": 810
534
  },
535
  {
536
  "epoch": 1.01,
537
- "learning_rate": 6.806195060694852e-06,
538
- "loss": 0.567,
539
  "step": 820
540
  },
541
  {
542
  "epoch": 1.02,
543
- "learning_rate": 6.764336542486396e-06,
544
- "loss": 0.565,
545
  "step": 830
546
  },
547
  {
548
  "epoch": 1.03,
549
- "learning_rate": 6.722478024277941e-06,
550
- "loss": 0.6091,
551
  "step": 840
552
  },
553
  {
554
  "epoch": 1.05,
555
- "learning_rate": 6.680619506069486e-06,
556
- "loss": 0.5579,
557
  "step": 850
558
  },
559
  {
560
  "epoch": 1.06,
561
- "learning_rate": 6.63876098786103e-06,
562
- "loss": 0.5943,
563
  "step": 860
564
  },
565
  {
566
  "epoch": 1.07,
567
- "learning_rate": 6.596902469652575e-06,
568
- "loss": 0.566,
569
  "step": 870
570
  },
571
  {
572
  "epoch": 1.08,
573
- "learning_rate": 6.555043951444119e-06,
574
- "loss": 0.5757,
575
  "step": 880
576
  },
577
  {
578
  "epoch": 1.09,
579
- "learning_rate": 6.513185433235664e-06,
580
- "loss": 0.5752,
581
  "step": 890
582
  },
583
  {
584
  "epoch": 1.11,
585
- "learning_rate": 6.471326915027208e-06,
586
- "loss": 0.5595,
587
  "step": 900
588
  },
589
  {
590
  "epoch": 1.12,
591
- "learning_rate": 6.429468396818753e-06,
592
- "loss": 0.6051,
593
  "step": 910
594
  },
595
  {
596
  "epoch": 1.13,
597
- "learning_rate": 6.387609878610298e-06,
598
- "loss": 0.5308,
599
  "step": 920
600
  },
601
  {
602
  "epoch": 1.14,
603
- "learning_rate": 6.345751360401843e-06,
604
- "loss": 0.5341,
605
  "step": 930
606
  },
607
  {
608
  "epoch": 1.16,
609
- "learning_rate": 6.303892842193386e-06,
610
- "loss": 0.5976,
611
  "step": 940
612
  },
613
  {
614
  "epoch": 1.17,
615
- "learning_rate": 6.262034323984931e-06,
616
- "loss": 0.568,
617
  "step": 950
618
  },
619
  {
620
  "epoch": 1.18,
621
- "learning_rate": 6.220175805776476e-06,
622
- "loss": 0.5865,
623
  "step": 960
624
  },
625
  {
626
  "epoch": 1.19,
627
- "learning_rate": 6.178317287568021e-06,
628
- "loss": 0.5486,
629
  "step": 970
630
  },
631
  {
632
  "epoch": 1.21,
633
- "learning_rate": 6.136458769359566e-06,
634
- "loss": 0.5699,
635
  "step": 980
636
  },
637
  {
638
  "epoch": 1.22,
639
- "learning_rate": 6.094600251151109e-06,
640
- "loss": 0.5831,
641
  "step": 990
642
  },
643
  {
644
  "epoch": 1.23,
645
- "learning_rate": 6.052741732942654e-06,
646
- "loss": 0.5558,
647
  "step": 1000
648
  },
649
  {
650
  "epoch": 1.23,
651
- "eval_oasst_export_w_label_accuracy": 0.743956043956044,
652
- "eval_oasst_export_w_label_kendalltau": 0.3847985347985353,
653
- "eval_oasst_export_w_label_loss": 0.61865234375,
654
- "eval_oasst_export_w_label_neg_score": -0.09515380859375,
655
- "eval_oasst_export_w_label_pos_score": 0.93212890625,
656
- "eval_oasst_export_w_label_runtime": 190.324,
657
- "eval_oasst_export_w_label_samples_per_second": 9.563,
658
- "eval_oasst_export_w_label_score_diff": 1.02734375,
659
- "eval_oasst_export_w_label_steps_per_second": 2.391,
660
- "step": 1004
661
  },
662
  {
663
  "epoch": 1.24,
664
- "learning_rate": 6.010883214734199e-06,
665
- "loss": 0.5286,
666
  "step": 1010
667
  },
668
  {
669
  "epoch": 1.25,
670
- "learning_rate": 5.969024696525744e-06,
671
- "loss": 0.5657,
672
  "step": 1020
673
  },
674
  {
675
  "epoch": 1.27,
676
- "learning_rate": 5.927166178317288e-06,
677
- "loss": 0.5376,
678
  "step": 1030
679
  },
680
  {
681
  "epoch": 1.28,
682
- "learning_rate": 5.885307660108832e-06,
683
- "loss": 0.5375,
684
  "step": 1040
685
  },
686
  {
687
  "epoch": 1.29,
688
- "learning_rate": 5.843449141900377e-06,
689
- "loss": 0.5513,
690
  "step": 1050
691
  },
692
  {
693
  "epoch": 1.3,
694
- "learning_rate": 5.801590623691922e-06,
695
- "loss": 0.5778,
696
  "step": 1060
697
  },
698
  {
699
  "epoch": 1.32,
700
- "learning_rate": 5.759732105483467e-06,
701
- "loss": 0.5445,
702
  "step": 1070
703
  },
704
  {
705
  "epoch": 1.33,
706
- "learning_rate": 5.717873587275011e-06,
707
- "loss": 0.5657,
708
  "step": 1080
709
  },
710
  {
711
  "epoch": 1.34,
712
- "learning_rate": 5.676015069066555e-06,
713
- "loss": 0.5808,
714
  "step": 1090
715
  },
716
  {
717
  "epoch": 1.35,
718
- "learning_rate": 5.6341565508581e-06,
719
- "loss": 0.5723,
720
  "step": 1100
721
  },
722
  {
723
  "epoch": 1.37,
724
- "learning_rate": 5.592298032649645e-06,
725
- "loss": 0.5413,
726
  "step": 1110
727
  },
728
  {
729
  "epoch": 1.38,
730
- "learning_rate": 5.550439514441189e-06,
731
- "loss": 0.6384,
732
  "step": 1120
733
  },
734
  {
735
  "epoch": 1.39,
736
- "learning_rate": 5.508580996232734e-06,
737
- "loss": 0.5336,
738
  "step": 1130
739
  },
740
  {
741
  "epoch": 1.4,
742
- "learning_rate": 5.466722478024279e-06,
743
- "loss": 0.5754,
744
  "step": 1140
745
  },
746
  {
747
  "epoch": 1.41,
748
- "learning_rate": 5.424863959815823e-06,
749
- "loss": 0.5171,
750
  "step": 1150
751
  },
752
  {
753
  "epoch": 1.43,
754
- "learning_rate": 5.383005441607367e-06,
755
- "loss": 0.5536,
756
  "step": 1160
757
  },
758
  {
759
  "epoch": 1.44,
760
- "learning_rate": 5.341146923398912e-06,
761
- "loss": 0.5812,
762
  "step": 1170
763
  },
764
  {
765
  "epoch": 1.45,
766
- "learning_rate": 5.299288405190457e-06,
767
- "loss": 0.5604,
768
  "step": 1180
769
  },
770
  {
771
  "epoch": 1.46,
772
- "learning_rate": 5.257429886982002e-06,
773
- "loss": 0.5433,
774
  "step": 1190
775
  },
776
  {
777
  "epoch": 1.48,
778
- "learning_rate": 5.215571368773545e-06,
779
- "loss": 0.5704,
780
  "step": 1200
781
  },
782
  {
783
  "epoch": 1.49,
784
- "learning_rate": 5.17371285056509e-06,
785
- "loss": 0.592,
786
  "step": 1210
787
  },
788
  {
789
  "epoch": 1.5,
790
- "learning_rate": 5.131854332356635e-06,
791
- "loss": 0.5406,
792
  "step": 1220
793
  },
794
  {
795
  "epoch": 1.51,
796
- "learning_rate": 5.08999581414818e-06,
797
- "loss": 0.5597,
798
  "step": 1230
799
  },
800
  {
801
  "epoch": 1.53,
802
- "learning_rate": 5.048137295939725e-06,
803
- "loss": 0.6011,
804
  "step": 1240
805
  },
806
  {
807
  "epoch": 1.54,
808
- "learning_rate": 5.006278777731268e-06,
809
- "loss": 0.5071,
810
  "step": 1250
811
  },
812
- {
813
- "epoch": 1.54,
814
- "eval_oasst_export_w_label_accuracy": 0.7478021978021978,
815
- "eval_oasst_export_w_label_kendalltau": 0.4045054945054951,
816
- "eval_oasst_export_w_label_loss": 0.61669921875,
817
- "eval_oasst_export_w_label_neg_score": -0.1842041015625,
818
- "eval_oasst_export_w_label_pos_score": 1.02734375,
819
- "eval_oasst_export_w_label_runtime": 190.3207,
820
- "eval_oasst_export_w_label_samples_per_second": 9.563,
821
- "eval_oasst_export_w_label_score_diff": 1.2109375,
822
- "eval_oasst_export_w_label_steps_per_second": 2.391,
823
- "step": 1255
824
- },
825
  {
826
  "epoch": 1.55,
827
- "learning_rate": 4.964420259522813e-06,
828
- "loss": 0.5374,
829
  "step": 1260
830
  },
831
  {
832
  "epoch": 1.56,
833
- "learning_rate": 4.922561741314358e-06,
834
- "loss": 0.5613,
835
  "step": 1270
836
  },
837
  {
838
  "epoch": 1.57,
839
- "learning_rate": 4.880703223105903e-06,
840
- "loss": 0.5643,
841
  "step": 1280
842
  },
843
  {
844
  "epoch": 1.59,
845
- "learning_rate": 4.838844704897447e-06,
846
- "loss": 0.5755,
847
  "step": 1290
848
  },
849
  {
850
  "epoch": 1.6,
851
- "learning_rate": 4.7969861866889915e-06,
852
- "loss": 0.5631,
853
  "step": 1300
854
  },
855
  {
856
  "epoch": 1.61,
857
- "learning_rate": 4.755127668480537e-06,
858
- "loss": 0.5557,
859
  "step": 1310
860
  },
861
  {
862
  "epoch": 1.62,
863
- "learning_rate": 4.713269150272081e-06,
864
- "loss": 0.5173,
865
  "step": 1320
866
  },
867
  {
868
  "epoch": 1.64,
869
- "learning_rate": 4.671410632063625e-06,
870
- "loss": 0.5938,
871
  "step": 1330
872
  },
873
  {
874
  "epoch": 1.65,
875
- "learning_rate": 4.6295521138551695e-06,
876
- "loss": 0.6386,
877
  "step": 1340
878
  },
879
  {
880
  "epoch": 1.66,
881
- "learning_rate": 4.587693595646715e-06,
882
- "loss": 0.5652,
883
  "step": 1350
884
  },
885
  {
886
  "epoch": 1.67,
887
- "learning_rate": 4.545835077438259e-06,
888
- "loss": 0.6097,
889
  "step": 1360
890
  },
891
  {
892
- "epoch": 1.69,
893
- "learning_rate": 4.503976559229803e-06,
894
- "loss": 0.5369,
895
  "step": 1370
896
  },
897
  {
898
  "epoch": 1.7,
899
- "learning_rate": 4.462118041021348e-06,
900
- "loss": 0.539,
901
  "step": 1380
902
  },
903
  {
904
  "epoch": 1.71,
905
- "learning_rate": 4.4202595228128935e-06,
906
- "loss": 0.5854,
907
  "step": 1390
908
  },
909
  {
910
  "epoch": 1.72,
911
- "learning_rate": 4.378401004604437e-06,
912
- "loss": 0.5574,
913
  "step": 1400
914
  },
915
  {
916
  "epoch": 1.73,
917
- "learning_rate": 4.336542486395982e-06,
918
- "loss": 0.5079,
919
  "step": 1410
920
  },
921
  {
922
  "epoch": 1.75,
923
- "learning_rate": 4.294683968187526e-06,
924
- "loss": 0.5889,
925
  "step": 1420
926
  },
927
  {
928
  "epoch": 1.76,
929
- "learning_rate": 4.2528254499790715e-06,
930
- "loss": 0.5908,
931
  "step": 1430
932
  },
933
  {
934
  "epoch": 1.77,
935
- "learning_rate": 4.210966931770616e-06,
936
- "loss": 0.5254,
937
  "step": 1440
938
  },
939
  {
940
  "epoch": 1.78,
941
- "learning_rate": 4.16910841356216e-06,
942
- "loss": 0.5566,
943
  "step": 1450
944
  },
945
  {
946
  "epoch": 1.8,
947
- "learning_rate": 4.127249895353705e-06,
948
- "loss": 0.5743,
949
  "step": 1460
950
  },
951
  {
952
  "epoch": 1.81,
953
- "learning_rate": 4.0853913771452495e-06,
954
- "loss": 0.5208,
955
  "step": 1470
956
  },
957
  {
958
  "epoch": 1.82,
959
- "learning_rate": 4.043532858936794e-06,
960
- "loss": 0.5486,
961
  "step": 1480
962
  },
963
  {
964
  "epoch": 1.83,
965
- "learning_rate": 4.001674340728339e-06,
966
- "loss": 0.587,
967
  "step": 1490
968
  },
969
  {
970
  "epoch": 1.84,
971
- "learning_rate": 3.959815822519883e-06,
972
- "loss": 0.5781,
 
 
 
 
 
 
 
 
 
 
 
 
 
973
  "step": 1500
974
  }
975
  ],
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.844791538556143,
5
  "global_step": 1500,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
 
9
  "log_history": [
10
  {
11
  "epoch": 0.01,
12
+ "learning_rate": 8e-08,
13
+ "loss": 1.1212,
14
  "step": 10
15
  },
16
  {
17
  "epoch": 0.02,
18
+ "learning_rate": 2.8e-07,
19
+ "loss": 1.1134,
20
  "step": 20
21
  },
22
  {
23
  "epoch": 0.04,
24
+ "learning_rate": 4.800000000000001e-07,
25
+ "loss": 1.1174,
26
  "step": 30
27
  },
28
  {
29
  "epoch": 0.05,
30
+ "learning_rate": 6.800000000000001e-07,
31
+ "loss": 1.1076,
32
  "step": 40
33
  },
34
  {
35
  "epoch": 0.06,
36
+ "learning_rate": 8.8e-07,
37
+ "loss": 1.1042,
38
  "step": 50
39
  },
40
  {
41
  "epoch": 0.07,
42
+ "learning_rate": 1.08e-06,
43
+ "loss": 1.1135,
44
  "step": 60
45
  },
46
  {
47
  "epoch": 0.09,
48
+ "learning_rate": 1.28e-06,
49
+ "loss": 1.1096,
50
  "step": 70
51
  },
52
  {
53
  "epoch": 0.1,
54
+ "learning_rate": 1.48e-06,
55
+ "loss": 1.1024,
56
  "step": 80
57
  },
58
  {
59
  "epoch": 0.11,
60
+ "learning_rate": 1.6800000000000002e-06,
61
+ "loss": 1.094,
62
  "step": 90
63
  },
64
  {
65
  "epoch": 0.12,
66
+ "learning_rate": 1.8600000000000002e-06,
67
+ "loss": 1.1022,
68
  "step": 100
69
  },
70
  {
71
  "epoch": 0.14,
72
+ "learning_rate": 2.06e-06,
73
+ "loss": 1.0981,
74
  "step": 110
75
  },
76
  {
77
  "epoch": 0.15,
78
+ "learning_rate": 2.2600000000000004e-06,
79
+ "loss": 1.0915,
80
  "step": 120
81
  },
82
  {
83
  "epoch": 0.16,
84
+ "learning_rate": 2.46e-06,
85
+ "loss": 1.0812,
86
  "step": 130
87
  },
88
  {
89
  "epoch": 0.17,
90
+ "learning_rate": 2.6600000000000004e-06,
91
+ "loss": 1.0832,
92
  "step": 140
93
  },
94
  {
95
  "epoch": 0.18,
96
+ "learning_rate": 2.86e-06,
97
+ "loss": 1.0731,
98
  "step": 150
99
  },
100
  {
101
  "epoch": 0.2,
102
+ "learning_rate": 3.0600000000000003e-06,
103
+ "loss": 1.0723,
104
  "step": 160
105
  },
106
  {
107
  "epoch": 0.21,
108
+ "learning_rate": 3.2600000000000006e-06,
109
+ "loss": 1.0505,
110
  "step": 170
111
  },
112
  {
113
  "epoch": 0.22,
114
+ "learning_rate": 3.46e-06,
115
+ "loss": 1.0352,
116
  "step": 180
117
  },
118
  {
119
  "epoch": 0.23,
120
+ "learning_rate": 3.66e-06,
121
+ "loss": 0.9904,
122
  "step": 190
123
  },
124
  {
125
  "epoch": 0.25,
126
+ "learning_rate": 3.86e-06,
127
+ "loss": 0.9888,
128
  "step": 200
129
  },
130
  {
131
  "epoch": 0.26,
132
+ "learning_rate": 4.060000000000001e-06,
133
+ "loss": 0.9637,
134
  "step": 210
135
  },
136
  {
137
  "epoch": 0.27,
138
+ "learning_rate": 4.26e-06,
139
+ "loss": 0.934,
140
  "step": 220
141
  },
142
  {
143
  "epoch": 0.28,
144
+ "learning_rate": 4.4600000000000005e-06,
145
+ "loss": 0.9193,
146
  "step": 230
147
  },
148
  {
149
  "epoch": 0.3,
150
+ "learning_rate": 4.66e-06,
151
+ "loss": 0.877,
152
  "step": 240
153
  },
154
  {
155
  "epoch": 0.31,
156
+ "learning_rate": 4.86e-06,
157
+ "loss": 0.8693,
158
  "step": 250
159
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  {
161
  "epoch": 0.32,
162
+ "learning_rate": 5.060000000000001e-06,
163
+ "loss": 0.862,
164
  "step": 260
165
  },
166
  {
167
  "epoch": 0.33,
168
+ "learning_rate": 5.2600000000000005e-06,
169
+ "loss": 0.7961,
170
  "step": 270
171
  },
172
  {
173
  "epoch": 0.34,
174
+ "learning_rate": 5.460000000000001e-06,
175
+ "loss": 0.82,
176
  "step": 280
177
  },
178
  {
179
  "epoch": 0.36,
180
+ "learning_rate": 5.66e-06,
181
+ "loss": 0.7964,
182
  "step": 290
183
  },
184
  {
185
  "epoch": 0.37,
186
+ "learning_rate": 5.86e-06,
187
+ "loss": 0.7721,
188
  "step": 300
189
  },
190
  {
191
  "epoch": 0.38,
192
+ "learning_rate": 6.0600000000000004e-06,
193
+ "loss": 0.7876,
194
  "step": 310
195
  },
196
  {
197
  "epoch": 0.39,
198
+ "learning_rate": 6.26e-06,
199
+ "loss": 0.7579,
200
  "step": 320
201
  },
202
  {
203
  "epoch": 0.41,
204
+ "learning_rate": 6.460000000000001e-06,
205
+ "loss": 0.7519,
206
  "step": 330
207
  },
208
  {
209
  "epoch": 0.42,
210
+ "learning_rate": 6.660000000000001e-06,
211
+ "loss": 0.741,
212
  "step": 340
213
  },
214
  {
215
  "epoch": 0.43,
216
+ "learning_rate": 6.860000000000001e-06,
217
+ "loss": 0.7496,
218
  "step": 350
219
  },
220
  {
221
  "epoch": 0.44,
222
+ "learning_rate": 7.06e-06,
223
+ "loss": 0.7587,
224
  "step": 360
225
  },
226
  {
227
  "epoch": 0.46,
228
+ "learning_rate": 7.260000000000001e-06,
229
+ "loss": 0.7373,
230
  "step": 370
231
  },
232
  {
233
  "epoch": 0.47,
234
+ "learning_rate": 7.4600000000000006e-06,
235
+ "loss": 0.695,
236
  "step": 380
237
  },
238
  {
239
  "epoch": 0.48,
240
+ "learning_rate": 7.660000000000001e-06,
241
+ "loss": 0.717,
242
  "step": 390
243
  },
244
  {
245
  "epoch": 0.49,
246
+ "learning_rate": 7.860000000000001e-06,
247
+ "loss": 0.6464,
248
  "step": 400
249
  },
250
  {
251
  "epoch": 0.5,
252
+ "learning_rate": 8.06e-06,
253
+ "loss": 0.7189,
254
  "step": 410
255
  },
256
  {
257
  "epoch": 0.52,
258
+ "learning_rate": 8.26e-06,
259
+ "loss": 0.7103,
260
  "step": 420
261
  },
262
  {
263
  "epoch": 0.53,
264
+ "learning_rate": 8.46e-06,
265
+ "loss": 0.7119,
266
  "step": 430
267
  },
268
  {
269
  "epoch": 0.54,
270
+ "learning_rate": 8.66e-06,
271
+ "loss": 0.6992,
272
  "step": 440
273
  },
274
  {
275
  "epoch": 0.55,
276
+ "learning_rate": 8.860000000000002e-06,
277
+ "loss": 0.7076,
278
  "step": 450
279
  },
280
  {
281
  "epoch": 0.57,
282
+ "learning_rate": 9.060000000000001e-06,
283
+ "loss": 0.6979,
284
  "step": 460
285
  },
286
  {
287
  "epoch": 0.58,
288
+ "learning_rate": 9.260000000000001e-06,
289
+ "loss": 0.7038,
290
  "step": 470
291
  },
292
  {
293
  "epoch": 0.59,
294
+ "learning_rate": 9.460000000000001e-06,
295
+ "loss": 0.6698,
296
  "step": 480
297
  },
298
  {
299
  "epoch": 0.6,
300
+ "learning_rate": 9.66e-06,
301
+ "loss": 0.6826,
302
  "step": 490
303
  },
304
  {
305
  "epoch": 0.61,
306
+ "learning_rate": 9.86e-06,
307
+ "loss": 0.7071,
308
  "step": 500
309
  },
310
  {
311
+ "epoch": 0.61,
312
+ "eval_oasst_export_w_label_accuracy": 0.75,
313
+ "eval_oasst_export_w_label_kendalltau": 0.4063736263736262,
314
+ "eval_oasst_export_w_label_loss": 0.67431640625,
315
+ "eval_oasst_export_w_label_neg_score": 0.7998046875,
316
+ "eval_oasst_export_w_label_pos_score": 1.5751953125,
317
+ "eval_oasst_export_w_label_runtime": 150.7481,
318
+ "eval_oasst_export_w_label_samples_per_second": 12.073,
319
+ "eval_oasst_export_w_label_score_diff": 0.775390625,
320
+ "eval_oasst_export_w_label_steps_per_second": 6.037,
321
+ "step": 500
322
  },
323
  {
324
  "epoch": 0.63,
325
+ "learning_rate": 9.98452810727179e-06,
326
+ "loss": 0.7206,
327
  "step": 510
328
  },
329
  {
330
  "epoch": 0.64,
331
+ "learning_rate": 9.932955131511089e-06,
332
+ "loss": 0.688,
333
  "step": 520
334
  },
335
  {
336
  "epoch": 0.65,
337
+ "learning_rate": 9.881382155750388e-06,
338
+ "loss": 0.6715,
339
  "step": 530
340
  },
341
  {
342
  "epoch": 0.66,
343
+ "learning_rate": 9.829809179989687e-06,
344
+ "loss": 0.6948,
345
  "step": 540
346
  },
347
  {
348
  "epoch": 0.68,
349
+ "learning_rate": 9.778236204228985e-06,
350
+ "loss": 0.6869,
351
  "step": 550
352
  },
353
  {
354
  "epoch": 0.69,
355
+ "learning_rate": 9.726663228468284e-06,
356
+ "loss": 0.6541,
357
  "step": 560
358
  },
359
  {
360
  "epoch": 0.7,
361
+ "learning_rate": 9.675090252707581e-06,
362
+ "loss": 0.7238,
363
  "step": 570
364
  },
365
  {
366
  "epoch": 0.71,
367
+ "learning_rate": 9.62351727694688e-06,
368
+ "loss": 0.6599,
369
  "step": 580
370
  },
371
  {
372
  "epoch": 0.73,
373
+ "learning_rate": 9.571944301186179e-06,
374
+ "loss": 0.6549,
375
  "step": 590
376
  },
377
  {
378
  "epoch": 0.74,
379
+ "learning_rate": 9.520371325425478e-06,
380
+ "loss": 0.6855,
381
  "step": 600
382
  },
383
  {
384
  "epoch": 0.75,
385
+ "learning_rate": 9.468798349664777e-06,
386
+ "loss": 0.6486,
387
  "step": 610
388
  },
389
  {
390
  "epoch": 0.76,
391
+ "learning_rate": 9.417225373904075e-06,
392
+ "loss": 0.64,
393
  "step": 620
394
  },
395
  {
396
  "epoch": 0.77,
397
+ "learning_rate": 9.365652398143374e-06,
398
+ "loss": 0.7051,
399
  "step": 630
400
  },
401
  {
402
  "epoch": 0.79,
403
+ "learning_rate": 9.314079422382673e-06,
404
+ "loss": 0.6982,
405
  "step": 640
406
  },
407
  {
408
  "epoch": 0.8,
409
+ "learning_rate": 9.26250644662197e-06,
410
+ "loss": 0.6508,
411
  "step": 650
412
  },
413
  {
414
  "epoch": 0.81,
415
+ "learning_rate": 9.210933470861269e-06,
416
+ "loss": 0.661,
417
  "step": 660
418
  },
419
  {
420
  "epoch": 0.82,
421
+ "learning_rate": 9.159360495100568e-06,
422
+ "loss": 0.6907,
423
  "step": 670
424
  },
425
  {
426
  "epoch": 0.84,
427
+ "learning_rate": 9.107787519339866e-06,
428
+ "loss": 0.6872,
429
  "step": 680
430
  },
431
  {
432
  "epoch": 0.85,
433
+ "learning_rate": 9.056214543579165e-06,
434
+ "loss": 0.6979,
435
  "step": 690
436
  },
437
  {
438
  "epoch": 0.86,
439
+ "learning_rate": 9.004641567818464e-06,
440
+ "loss": 0.6683,
441
  "step": 700
442
  },
443
  {
444
  "epoch": 0.87,
445
+ "learning_rate": 8.953068592057763e-06,
446
+ "loss": 0.6973,
447
  "step": 710
448
  },
449
  {
450
  "epoch": 0.89,
451
+ "learning_rate": 8.901495616297062e-06,
452
+ "loss": 0.6384,
453
  "step": 720
454
  },
455
  {
456
  "epoch": 0.9,
457
+ "learning_rate": 8.849922640536359e-06,
458
+ "loss": 0.6686,
459
  "step": 730
460
  },
461
  {
462
  "epoch": 0.91,
463
+ "learning_rate": 8.798349664775658e-06,
464
+ "loss": 0.6709,
465
  "step": 740
466
  },
467
  {
468
  "epoch": 0.92,
469
+ "learning_rate": 8.746776689014956e-06,
470
+ "loss": 0.6693,
471
  "step": 750
472
  },
473
  {
474
  "epoch": 0.93,
475
+ "learning_rate": 8.695203713254255e-06,
476
+ "loss": 0.6405,
 
 
 
 
 
 
 
 
 
 
 
 
 
477
  "step": 760
478
  },
479
  {
480
  "epoch": 0.95,
481
+ "learning_rate": 8.643630737493554e-06,
482
+ "loss": 0.6937,
483
  "step": 770
484
  },
485
  {
486
  "epoch": 0.96,
487
+ "learning_rate": 8.592057761732853e-06,
488
+ "loss": 0.6322,
489
  "step": 780
490
  },
491
  {
492
  "epoch": 0.97,
493
+ "learning_rate": 8.540484785972152e-06,
494
+ "loss": 0.6862,
495
  "step": 790
496
  },
497
  {
498
  "epoch": 0.98,
499
+ "learning_rate": 8.48891181021145e-06,
500
+ "loss": 0.6507,
501
  "step": 800
502
  },
503
  {
504
  "epoch": 1.0,
505
+ "learning_rate": 8.437338834450749e-06,
506
+ "loss": 0.6452,
507
  "step": 810
508
  },
509
  {
510
  "epoch": 1.01,
511
+ "learning_rate": 8.385765858690046e-06,
512
+ "loss": 0.6387,
513
  "step": 820
514
  },
515
  {
516
  "epoch": 1.02,
517
+ "learning_rate": 8.334192882929347e-06,
518
+ "loss": 0.6422,
519
  "step": 830
520
  },
521
  {
522
  "epoch": 1.03,
523
+ "learning_rate": 8.282619907168644e-06,
524
+ "loss": 0.6492,
525
  "step": 840
526
  },
527
  {
528
  "epoch": 1.05,
529
+ "learning_rate": 8.231046931407943e-06,
530
+ "loss": 0.6533,
531
  "step": 850
532
  },
533
  {
534
  "epoch": 1.06,
535
+ "learning_rate": 8.179473955647241e-06,
536
+ "loss": 0.6747,
537
  "step": 860
538
  },
539
  {
540
  "epoch": 1.07,
541
+ "learning_rate": 8.12790097988654e-06,
542
+ "loss": 0.67,
543
  "step": 870
544
  },
545
  {
546
  "epoch": 1.08,
547
+ "learning_rate": 8.076328004125839e-06,
548
+ "loss": 0.6337,
549
  "step": 880
550
  },
551
  {
552
  "epoch": 1.09,
553
+ "learning_rate": 8.024755028365138e-06,
554
+ "loss": 0.6612,
555
  "step": 890
556
  },
557
  {
558
  "epoch": 1.11,
559
+ "learning_rate": 7.973182052604435e-06,
560
+ "loss": 0.6354,
561
  "step": 900
562
  },
563
  {
564
  "epoch": 1.12,
565
+ "learning_rate": 7.921609076843735e-06,
566
+ "loss": 0.6236,
567
  "step": 910
568
  },
569
  {
570
  "epoch": 1.13,
571
+ "learning_rate": 7.870036101083033e-06,
572
+ "loss": 0.6394,
573
  "step": 920
574
  },
575
  {
576
  "epoch": 1.14,
577
+ "learning_rate": 7.818463125322331e-06,
578
+ "loss": 0.631,
579
  "step": 930
580
  },
581
  {
582
  "epoch": 1.16,
583
+ "learning_rate": 7.76689014956163e-06,
584
+ "loss": 0.6378,
585
  "step": 940
586
  },
587
  {
588
  "epoch": 1.17,
589
+ "learning_rate": 7.715317173800929e-06,
590
+ "loss": 0.6328,
591
  "step": 950
592
  },
593
  {
594
  "epoch": 1.18,
595
+ "learning_rate": 7.663744198040228e-06,
596
+ "loss": 0.6036,
597
  "step": 960
598
  },
599
  {
600
  "epoch": 1.19,
601
+ "learning_rate": 7.6121712222795265e-06,
602
+ "loss": 0.6193,
603
  "step": 970
604
  },
605
  {
606
  "epoch": 1.21,
607
+ "learning_rate": 7.5605982465188245e-06,
608
+ "loss": 0.6667,
609
  "step": 980
610
  },
611
  {
612
  "epoch": 1.22,
613
+ "learning_rate": 7.509025270758123e-06,
614
+ "loss": 0.6434,
615
  "step": 990
616
  },
617
  {
618
  "epoch": 1.23,
619
+ "learning_rate": 7.457452294997421e-06,
620
+ "loss": 0.5941,
621
  "step": 1000
622
  },
623
  {
624
  "epoch": 1.23,
625
+ "eval_oasst_export_w_label_accuracy": 0.7576923076923077,
626
+ "eval_oasst_export_w_label_kendalltau": 0.3999267399267405,
627
+ "eval_oasst_export_w_label_loss": 0.6787109375,
628
+ "eval_oasst_export_w_label_neg_score": -1.3857421875,
629
+ "eval_oasst_export_w_label_pos_score": -0.15380859375,
630
+ "eval_oasst_export_w_label_runtime": 150.4368,
631
+ "eval_oasst_export_w_label_samples_per_second": 12.098,
632
+ "eval_oasst_export_w_label_score_diff": 1.232421875,
633
+ "eval_oasst_export_w_label_steps_per_second": 6.049,
634
+ "step": 1000
635
  },
636
  {
637
  "epoch": 1.24,
638
+ "learning_rate": 7.405879319236721e-06,
639
+ "loss": 0.6572,
640
  "step": 1010
641
  },
642
  {
643
  "epoch": 1.25,
644
+ "learning_rate": 7.354306343476019e-06,
645
+ "loss": 0.6466,
646
  "step": 1020
647
  },
648
  {
649
  "epoch": 1.27,
650
+ "learning_rate": 7.302733367715318e-06,
651
+ "loss": 0.6361,
652
  "step": 1030
653
  },
654
  {
655
  "epoch": 1.28,
656
+ "learning_rate": 7.251160391954616e-06,
657
+ "loss": 0.6522,
658
  "step": 1040
659
  },
660
  {
661
  "epoch": 1.29,
662
+ "learning_rate": 7.199587416193915e-06,
663
+ "loss": 0.6014,
664
  "step": 1050
665
  },
666
  {
667
  "epoch": 1.3,
668
+ "learning_rate": 7.148014440433214e-06,
669
+ "loss": 0.593,
670
  "step": 1060
671
  },
672
  {
673
  "epoch": 1.32,
674
+ "learning_rate": 7.096441464672512e-06,
675
+ "loss": 0.643,
676
  "step": 1070
677
  },
678
  {
679
  "epoch": 1.33,
680
+ "learning_rate": 7.044868488911812e-06,
681
+ "loss": 0.6474,
682
  "step": 1080
683
  },
684
  {
685
  "epoch": 1.34,
686
+ "learning_rate": 6.9932955131511096e-06,
687
+ "loss": 0.6338,
688
  "step": 1090
689
  },
690
  {
691
  "epoch": 1.35,
692
+ "learning_rate": 6.941722537390408e-06,
693
+ "loss": 0.6268,
694
  "step": 1100
695
  },
696
  {
697
  "epoch": 1.37,
698
+ "learning_rate": 6.890149561629706e-06,
699
+ "loss": 0.6358,
700
  "step": 1110
701
  },
702
  {
703
  "epoch": 1.38,
704
+ "learning_rate": 6.838576585869006e-06,
705
+ "loss": 0.6247,
706
  "step": 1120
707
  },
708
  {
709
  "epoch": 1.39,
710
+ "learning_rate": 6.787003610108304e-06,
711
+ "loss": 0.5791,
712
  "step": 1130
713
  },
714
  {
715
  "epoch": 1.4,
716
+ "learning_rate": 6.735430634347603e-06,
717
+ "loss": 0.6074,
718
  "step": 1140
719
  },
720
  {
721
  "epoch": 1.41,
722
+ "learning_rate": 6.683857658586901e-06,
723
+ "loss": 0.61,
724
  "step": 1150
725
  },
726
  {
727
  "epoch": 1.43,
728
+ "learning_rate": 6.6322846828262e-06,
729
+ "loss": 0.6167,
730
  "step": 1160
731
  },
732
  {
733
  "epoch": 1.44,
734
+ "learning_rate": 6.580711707065498e-06,
735
+ "loss": 0.6437,
736
  "step": 1170
737
  },
738
  {
739
  "epoch": 1.45,
740
+ "learning_rate": 6.529138731304797e-06,
741
+ "loss": 0.6182,
742
  "step": 1180
743
  },
744
  {
745
  "epoch": 1.46,
746
+ "learning_rate": 6.477565755544095e-06,
747
+ "loss": 0.6184,
748
  "step": 1190
749
  },
750
  {
751
  "epoch": 1.48,
752
+ "learning_rate": 6.425992779783395e-06,
753
+ "loss": 0.6191,
754
  "step": 1200
755
  },
756
  {
757
  "epoch": 1.49,
758
+ "learning_rate": 6.374419804022693e-06,
759
+ "loss": 0.6136,
760
  "step": 1210
761
  },
762
  {
763
  "epoch": 1.5,
764
+ "learning_rate": 6.322846828261991e-06,
765
+ "loss": 0.6355,
766
  "step": 1220
767
  },
768
  {
769
  "epoch": 1.51,
770
+ "learning_rate": 6.271273852501289e-06,
771
+ "loss": 0.5557,
772
  "step": 1230
773
  },
774
  {
775
  "epoch": 1.53,
776
+ "learning_rate": 6.219700876740589e-06,
777
+ "loss": 0.6393,
778
  "step": 1240
779
  },
780
  {
781
  "epoch": 1.54,
782
+ "learning_rate": 6.168127900979887e-06,
783
+ "loss": 0.5874,
784
  "step": 1250
785
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
786
  {
787
  "epoch": 1.55,
788
+ "learning_rate": 6.116554925219186e-06,
789
+ "loss": 0.6041,
790
  "step": 1260
791
  },
792
  {
793
  "epoch": 1.56,
794
+ "learning_rate": 6.064981949458484e-06,
795
+ "loss": 0.6061,
796
  "step": 1270
797
  },
798
  {
799
  "epoch": 1.57,
800
+ "learning_rate": 6.013408973697783e-06,
801
+ "loss": 0.5858,
802
  "step": 1280
803
  },
804
  {
805
  "epoch": 1.59,
806
+ "learning_rate": 5.961835997937081e-06,
807
+ "loss": 0.6646,
808
  "step": 1290
809
  },
810
  {
811
  "epoch": 1.6,
812
+ "learning_rate": 5.91026302217638e-06,
813
+ "loss": 0.6314,
814
  "step": 1300
815
  },
816
  {
817
  "epoch": 1.61,
818
+ "learning_rate": 5.858690046415678e-06,
819
+ "loss": 0.6332,
820
  "step": 1310
821
  },
822
  {
823
  "epoch": 1.62,
824
+ "learning_rate": 5.807117070654978e-06,
825
+ "loss": 0.613,
826
  "step": 1320
827
  },
828
  {
829
  "epoch": 1.64,
830
+ "learning_rate": 5.755544094894276e-06,
831
+ "loss": 0.6441,
832
  "step": 1330
833
  },
834
  {
835
  "epoch": 1.65,
836
+ "learning_rate": 5.7039711191335744e-06,
837
+ "loss": 0.6369,
838
  "step": 1340
839
  },
840
  {
841
  "epoch": 1.66,
842
+ "learning_rate": 5.652398143372873e-06,
843
+ "loss": 0.6157,
844
  "step": 1350
845
  },
846
  {
847
  "epoch": 1.67,
848
+ "learning_rate": 5.600825167612172e-06,
849
+ "loss": 0.658,
850
  "step": 1360
851
  },
852
  {
853
+ "epoch": 1.68,
854
+ "learning_rate": 5.549252191851471e-06,
855
+ "loss": 0.621,
856
  "step": 1370
857
  },
858
  {
859
  "epoch": 1.7,
860
+ "learning_rate": 5.497679216090769e-06,
861
+ "loss": 0.6134,
862
  "step": 1380
863
  },
864
  {
865
  "epoch": 1.71,
866
+ "learning_rate": 5.446106240330068e-06,
867
+ "loss": 0.6128,
868
  "step": 1390
869
  },
870
  {
871
  "epoch": 1.72,
872
+ "learning_rate": 5.394533264569366e-06,
873
+ "loss": 0.6307,
874
  "step": 1400
875
  },
876
  {
877
  "epoch": 1.73,
878
+ "learning_rate": 5.342960288808665e-06,
879
+ "loss": 0.6131,
880
  "step": 1410
881
  },
882
  {
883
  "epoch": 1.75,
884
+ "learning_rate": 5.291387313047963e-06,
885
+ "loss": 0.5757,
886
  "step": 1420
887
  },
888
  {
889
  "epoch": 1.76,
890
+ "learning_rate": 5.239814337287262e-06,
891
+ "loss": 0.7221,
892
  "step": 1430
893
  },
894
  {
895
  "epoch": 1.77,
896
+ "learning_rate": 5.188241361526561e-06,
897
+ "loss": 0.6262,
898
  "step": 1440
899
  },
900
  {
901
  "epoch": 1.78,
902
+ "learning_rate": 5.1366683857658595e-06,
903
+ "loss": 0.6628,
904
  "step": 1450
905
  },
906
  {
907
  "epoch": 1.8,
908
+ "learning_rate": 5.0850954100051575e-06,
909
+ "loss": 0.6105,
910
  "step": 1460
911
  },
912
  {
913
  "epoch": 1.81,
914
+ "learning_rate": 5.033522434244456e-06,
915
+ "loss": 0.6422,
916
  "step": 1470
917
  },
918
  {
919
  "epoch": 1.82,
920
+ "learning_rate": 4.981949458483755e-06,
921
+ "loss": 0.6443,
922
  "step": 1480
923
  },
924
  {
925
  "epoch": 1.83,
926
+ "learning_rate": 4.930376482723053e-06,
927
+ "loss": 0.6067,
928
  "step": 1490
929
  },
930
  {
931
  "epoch": 1.84,
932
+ "learning_rate": 4.878803506962353e-06,
933
+ "loss": 0.6264,
934
+ "step": 1500
935
+ },
936
+ {
937
+ "epoch": 1.84,
938
+ "eval_oasst_export_w_label_accuracy": 0.7708791208791209,
939
+ "eval_oasst_export_w_label_kendalltau": 0.42186813186813227,
940
+ "eval_oasst_export_w_label_loss": 0.63671875,
941
+ "eval_oasst_export_w_label_neg_score": -0.201904296875,
942
+ "eval_oasst_export_w_label_pos_score": 1.01171875,
943
+ "eval_oasst_export_w_label_runtime": 150.6985,
944
+ "eval_oasst_export_w_label_samples_per_second": 12.077,
945
+ "eval_oasst_export_w_label_score_diff": 1.2138671875,
946
+ "eval_oasst_export_w_label_steps_per_second": 6.039,
947
  "step": 1500
948
  }
949
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5700a4afb75b927bce7ba70711d04ded78b4b39072e14a647f2798bcd9ea3f4
3
- size 4667
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:934466ec8fdbec43455be2dd56a20218b5f3598489dd22e7478cef97e1d62f6f
3
+ size 4859