wzhouad commited on
Commit
3a033a8
1 Parent(s): 24db621

Model save

Browse files
README.md CHANGED
@@ -32,10 +32,10 @@ More information needed
32
  ### Training hyperparameters
33
 
34
  The following hyperparameters were used during training:
35
- - learning_rate: 2e-06
36
  - train_batch_size: 2
37
  - eval_batch_size: 8
38
- - seed: 5
39
  - distributed_type: multi-GPU
40
  - num_devices: 8
41
  - gradient_accumulation_steps: 8
 
32
  ### Training hyperparameters
33
 
34
  The following hyperparameters were used during training:
35
+ - learning_rate: 1e-06
36
  - train_batch_size: 2
37
  - eval_batch_size: 8
38
+ - seed: 2
39
  - distributed_type: multi-GPU
40
  - num_devices: 8
41
  - gradient_accumulation_steps: 8
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.0,
3
- "train_loss": 0.21128464219435839,
4
- "train_runtime": 10587.5728,
5
  "train_samples": 45548,
6
- "train_samples_per_second": 8.604,
7
- "train_steps_per_second": 0.067
8
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "train_loss": 0.30386436640376774,
4
+ "train_runtime": 10428.2681,
5
  "train_samples": 45548,
6
+ "train_samples_per_second": 8.735,
7
+ "train_steps_per_second": 0.068
8
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8edcf20d9c2d026fabec9dd725ddcb99eb9468d119764f360eb50eb30cbd68da
3
  size 4976698672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fad56123ab40af80b65ee5b63fa2134010a944b41f9ae923d9221170956f7649
3
  size 4976698672
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9bf81822232bc24fae74426ebfdabfed75c1be44be206a3ef5b25a37f3bff5e3
3
  size 4999802720
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e97d5c4777c113244429fccde8dc290fc576953853257deac3d98bb33f36025d
3
  size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1597fd9f46a0099cfde0de1c18d44e1b23c4e2dfa75bb389ea71848dbbbb6af
3
  size 4915916176
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94e1fd4e76c84221a4e3f2fffd9f145b436a33225719831f05393709b2c48b8e
3
  size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b5e65c84cef0798a2d14db45a99b308a303d9843d5be5de1f10c03f8530dfe2d
3
  size 1168138808
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bd1298e2d78e75607e9514d3b106ce709ec2d57ea16b46c0f79bf9fc4a696d7
3
  size 1168138808
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 2.0,
3
- "train_loss": 0.21128464219435839,
4
- "train_runtime": 10587.5728,
5
  "train_samples": 45548,
6
- "train_samples_per_second": 8.604,
7
- "train_steps_per_second": 0.067
8
  }
 
1
  {
2
  "epoch": 2.0,
3
+ "train_loss": 0.30386436640376774,
4
+ "train_runtime": 10428.2681,
5
  "train_samples": 45548,
6
+ "train_samples_per_second": 8.735,
7
+ "train_steps_per_second": 0.068
8
  }
trainer_state.json CHANGED
@@ -10,1006 +10,1006 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.03,
13
- "learning_rate": 2.8169014084507043e-07,
14
- "logits/chosen": -0.01849319413304329,
15
- "logits/rejected": 0.04447399824857712,
16
- "logps/chosen": -322.30413818359375,
17
- "logps/rejected": -218.52719116210938,
18
- "loss": 0.5192,
19
- "rewards/accuracies": 0.4124999940395355,
20
- "rewards/chosen": 0.0008185860933735967,
21
- "rewards/margins": 0.0013399553718045354,
22
- "rewards/rejected": -0.0005213693948462605,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.06,
27
- "learning_rate": 5.633802816901409e-07,
28
- "logits/chosen": -0.04029911756515503,
29
- "logits/rejected": -0.037409596145153046,
30
- "logps/chosen": -334.2061767578125,
31
- "logps/rejected": -200.04428100585938,
32
- "loss": 0.5024,
33
- "rewards/accuracies": 0.543749988079071,
34
- "rewards/chosen": 0.001267000799998641,
35
- "rewards/margins": 0.0049448576755821705,
36
- "rewards/rejected": -0.0036778573412448168,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.08,
41
- "learning_rate": 8.450704225352112e-07,
42
- "logits/chosen": -0.09712442010641098,
43
- "logits/rejected": -0.016413463279604912,
44
- "logps/chosen": -428.060302734375,
45
- "logps/rejected": -255.72323608398438,
46
- "loss": 0.5169,
47
- "rewards/accuracies": 0.59375,
48
- "rewards/chosen": 0.025075193494558334,
49
- "rewards/margins": 0.06105799600481987,
50
- "rewards/rejected": -0.035982806235551834,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.11,
55
- "learning_rate": 1.1267605633802817e-06,
56
- "logits/chosen": 0.01622619666159153,
57
- "logits/rejected": 0.05923817679286003,
58
- "logps/chosen": -367.3197326660156,
59
- "logps/rejected": -274.74176025390625,
60
- "loss": 0.531,
61
- "rewards/accuracies": 0.675000011920929,
62
- "rewards/chosen": -0.02706316113471985,
63
- "rewards/margins": 0.09234372526407242,
64
- "rewards/rejected": -0.11940689384937286,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.14,
69
- "learning_rate": 1.408450704225352e-06,
70
- "logits/chosen": 0.048659004271030426,
71
- "logits/rejected": 0.06660661846399307,
72
- "logps/chosen": -283.6951599121094,
73
- "logps/rejected": -219.31021118164062,
74
- "loss": 0.5435,
75
- "rewards/accuracies": 0.59375,
76
- "rewards/chosen": -0.0874737948179245,
77
- "rewards/margins": 0.12040810286998749,
78
- "rewards/rejected": -0.207881897687912,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.17,
83
- "learning_rate": 1.6901408450704225e-06,
84
- "logits/chosen": 0.08613400161266327,
85
- "logits/rejected": 0.1546694040298462,
86
- "logps/chosen": -362.0628662109375,
87
- "logps/rejected": -212.6298828125,
88
- "loss": 0.5299,
89
- "rewards/accuracies": 0.675000011920929,
90
- "rewards/chosen": 0.0036375909112393856,
91
- "rewards/margins": 0.32788988947868347,
92
- "rewards/rejected": -0.32425227761268616,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.2,
97
- "learning_rate": 1.971830985915493e-06,
98
- "logits/chosen": 0.010642724111676216,
99
- "logits/rejected": 0.03363212198019028,
100
- "logps/chosen": -288.052978515625,
101
- "logps/rejected": -233.4365997314453,
102
- "loss": 0.5253,
103
- "rewards/accuracies": 0.612500011920929,
104
- "rewards/chosen": -0.11775505542755127,
105
- "rewards/margins": 0.16322237253189087,
106
- "rewards/rejected": -0.28097742795944214,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 0.22,
111
- "learning_rate": 1.9990212265199736e-06,
112
- "logits/chosen": -0.18143758177757263,
113
- "logits/rejected": -0.11922919750213623,
114
- "logps/chosen": -372.3688049316406,
115
- "logps/rejected": -212.8850555419922,
116
- "loss": 0.4977,
117
- "rewards/accuracies": 0.65625,
118
- "rewards/chosen": 0.05031196400523186,
119
- "rewards/margins": 0.2945536971092224,
120
- "rewards/rejected": -0.24424175918102264,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 0.25,
125
- "learning_rate": 1.995640271796129e-06,
126
- "logits/chosen": -0.10464553534984589,
127
- "logits/rejected": -0.10386872291564941,
128
- "logps/chosen": -271.26727294921875,
129
- "logps/rejected": -232.1707763671875,
130
- "loss": 0.4754,
131
- "rewards/accuracies": 0.581250011920929,
132
- "rewards/chosen": -0.046878136694431305,
133
- "rewards/margins": 0.11948816478252411,
134
- "rewards/rejected": -0.166366308927536,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 0.28,
139
- "learning_rate": 1.9898532207817787e-06,
140
- "logits/chosen": -0.10582619905471802,
141
- "logits/rejected": -0.06019941717386246,
142
- "logps/chosen": -338.7351989746094,
143
- "logps/rejected": -235.93240356445312,
144
- "loss": 0.4609,
145
- "rewards/accuracies": 0.612500011920929,
146
- "rewards/chosen": -0.09618537873029709,
147
- "rewards/margins": 0.24425363540649414,
148
- "rewards/rejected": -0.34043899178504944,
149
  "step": 100
150
  },
151
  {
152
  "epoch": 0.31,
153
- "learning_rate": 1.9816740586504575e-06,
154
- "logits/chosen": -0.15991339087486267,
155
- "logits/rejected": -0.08021946251392365,
156
- "logps/chosen": -347.1170959472656,
157
- "logps/rejected": -260.5694580078125,
158
- "loss": 0.4339,
159
- "rewards/accuracies": 0.625,
160
- "rewards/chosen": -0.1928514689207077,
161
- "rewards/margins": 0.27897369861602783,
162
- "rewards/rejected": -0.4718252122402191,
163
  "step": 110
164
  },
165
  {
166
  "epoch": 0.34,
167
- "learning_rate": 1.971122551428331e-06,
168
- "logits/chosen": -0.173538938164711,
169
- "logits/rejected": -0.14118380844593048,
170
- "logps/chosen": -322.6651306152344,
171
- "logps/rejected": -249.2017822265625,
172
- "loss": 0.4292,
173
- "rewards/accuracies": 0.5687500238418579,
174
- "rewards/chosen": -0.25639423727989197,
175
- "rewards/margins": 0.15481036901474,
176
- "rewards/rejected": -0.41120463609695435,
177
  "step": 120
178
  },
179
  {
180
  "epoch": 0.37,
181
- "learning_rate": 1.9582241982269803e-06,
182
- "logits/chosen": -0.171969935297966,
183
- "logits/rejected": -0.11264105141162872,
184
- "logps/chosen": -367.5609130859375,
185
- "logps/rejected": -256.74334716796875,
186
- "loss": 0.4099,
187
- "rewards/accuracies": 0.6312500238418579,
188
- "rewards/chosen": -0.23346960544586182,
189
- "rewards/margins": 0.23451845347881317,
190
- "rewards/rejected": -0.4679880142211914,
191
  "step": 130
192
  },
193
  {
194
  "epoch": 0.39,
195
- "learning_rate": 1.9430101696214336e-06,
196
- "logits/chosen": -0.2663022577762604,
197
- "logits/rejected": -0.17533616721630096,
198
- "logps/chosen": -351.01544189453125,
199
- "logps/rejected": -236.74423217773438,
200
- "loss": 0.4005,
201
- "rewards/accuracies": 0.637499988079071,
202
- "rewards/chosen": -0.3526380658149719,
203
- "rewards/margins": 0.29506126046180725,
204
- "rewards/rejected": -0.6476993560791016,
205
  "step": 140
206
  },
207
  {
208
  "epoch": 0.42,
209
- "learning_rate": 1.9255172323223463e-06,
210
- "logits/chosen": -0.2499375343322754,
211
- "logits/rejected": -0.2070433646440506,
212
- "logps/chosen": -352.9437561035156,
213
- "logps/rejected": -292.28277587890625,
214
- "loss": 0.3763,
215
- "rewards/accuracies": 0.5874999761581421,
216
- "rewards/chosen": -0.4557357430458069,
217
- "rewards/margins": 0.24104443192481995,
218
- "rewards/rejected": -0.6967801451683044,
219
  "step": 150
220
  },
221
  {
222
  "epoch": 0.45,
223
- "learning_rate": 1.905787660324391e-06,
224
- "logits/chosen": -0.29798978567123413,
225
- "logits/rejected": -0.23274393379688263,
226
- "logps/chosen": -361.73760986328125,
227
- "logps/rejected": -273.33648681640625,
228
- "loss": 0.358,
229
- "rewards/accuracies": 0.637499988079071,
230
- "rewards/chosen": -0.4945393204689026,
231
- "rewards/margins": 0.26065993309020996,
232
- "rewards/rejected": -0.7551992535591125,
233
  "step": 160
234
  },
235
  {
236
  "epoch": 0.48,
237
- "learning_rate": 1.8838691327455609e-06,
238
- "logits/chosen": -0.3041500151157379,
239
- "logits/rejected": -0.251315176486969,
240
- "logps/chosen": -417.890625,
241
- "logps/rejected": -283.72235107421875,
242
- "loss": 0.3654,
243
- "rewards/accuracies": 0.6937500238418579,
244
- "rewards/chosen": -0.44542360305786133,
245
- "rewards/margins": 0.34727293252944946,
246
- "rewards/rejected": -0.792696475982666,
247
  "step": 170
248
  },
249
  {
250
  "epoch": 0.51,
251
- "learning_rate": 1.8598146186042808e-06,
252
- "logits/chosen": -0.3496856689453125,
253
- "logits/rejected": -0.3132048547267914,
254
- "logps/chosen": -389.0525817871094,
255
- "logps/rejected": -277.38348388671875,
256
- "loss": 0.3465,
257
- "rewards/accuracies": 0.637499988079071,
258
- "rewards/chosen": -0.5126581788063049,
259
- "rewards/margins": 0.2432871311903,
260
- "rewards/rejected": -0.7559453248977661,
261
  "step": 180
262
  },
263
  {
264
  "epoch": 0.53,
265
- "learning_rate": 1.8336822488127723e-06,
266
- "logits/chosen": -0.31627652049064636,
267
- "logits/rejected": -0.2888433635234833,
268
- "logps/chosen": -325.4978942871094,
269
- "logps/rejected": -290.9458923339844,
270
- "loss": 0.3386,
271
- "rewards/accuracies": 0.5625,
272
- "rewards/chosen": -0.706427276134491,
273
- "rewards/margins": 0.11679848283529282,
274
- "rewards/rejected": -0.8232257962226868,
275
  "step": 190
276
  },
277
  {
278
  "epoch": 0.56,
279
- "learning_rate": 1.805535175696026e-06,
280
- "logits/chosen": -0.28905901312828064,
281
- "logits/rejected": -0.2691965699195862,
282
- "logps/chosen": -340.1316833496094,
283
- "logps/rejected": -311.392822265625,
284
- "loss": 0.3529,
285
- "rewards/accuracies": 0.612500011920929,
286
- "rewards/chosen": -0.6956604719161987,
287
- "rewards/margins": 0.2750135064125061,
288
- "rewards/rejected": -0.9706739187240601,
289
  "step": 200
290
  },
291
  {
292
  "epoch": 0.59,
293
- "learning_rate": 1.7754414203758602e-06,
294
- "logits/chosen": -0.3484232425689697,
295
- "logits/rejected": -0.29674482345581055,
296
- "logps/chosen": -359.6042785644531,
297
- "logps/rejected": -294.1127014160156,
298
- "loss": 0.3339,
299
- "rewards/accuracies": 0.643750011920929,
300
- "rewards/chosen": -0.667249858379364,
301
- "rewards/margins": 0.21498624980449677,
302
- "rewards/rejected": -0.882236123085022,
303
  "step": 210
304
  },
305
  {
306
  "epoch": 0.62,
307
- "learning_rate": 1.7434737083888904e-06,
308
- "logits/chosen": -0.3094923198223114,
309
- "logits/rejected": -0.27373385429382324,
310
- "logps/chosen": -407.7154846191406,
311
- "logps/rejected": -308.6554870605469,
312
- "loss": 0.3407,
313
- "rewards/accuracies": 0.625,
314
- "rewards/chosen": -0.6901127099990845,
315
- "rewards/margins": 0.24910616874694824,
316
- "rewards/rejected": -0.9392188787460327,
317
  "step": 220
318
  },
319
  {
320
  "epoch": 0.65,
321
- "learning_rate": 1.709709293935662e-06,
322
- "logits/chosen": -0.3457449972629547,
323
- "logits/rejected": -0.296016126871109,
324
- "logps/chosen": -364.3536682128906,
325
- "logps/rejected": -335.47021484375,
326
- "loss": 0.3523,
327
- "rewards/accuracies": 0.625,
328
- "rewards/chosen": -0.7736875414848328,
329
- "rewards/margins": 0.3423411250114441,
330
- "rewards/rejected": -1.1160286664962769,
331
  "step": 230
332
  },
333
  {
334
  "epoch": 0.67,
335
- "learning_rate": 1.6742297731856636e-06,
336
- "logits/chosen": -0.3264350891113281,
337
- "logits/rejected": -0.3357524275779724,
338
- "logps/chosen": -403.2978515625,
339
- "logps/rejected": -324.7033996582031,
340
- "loss": 0.3164,
341
- "rewards/accuracies": 0.625,
342
- "rewards/chosen": -0.7830929756164551,
343
- "rewards/margins": 0.2251376211643219,
344
- "rewards/rejected": -1.0082306861877441,
345
  "step": 240
346
  },
347
  {
348
  "epoch": 0.7,
349
- "learning_rate": 1.6371208870894001e-06,
350
- "logits/chosen": -0.39360159635543823,
351
- "logits/rejected": -0.3303549587726593,
352
- "logps/chosen": -440.0787658691406,
353
- "logps/rejected": -334.0160217285156,
354
- "loss": 0.3344,
355
- "rewards/accuracies": 0.59375,
356
- "rewards/chosen": -0.8457928895950317,
357
- "rewards/margins": 0.20620958507061005,
358
- "rewards/rejected": -1.0520025491714478,
359
  "step": 250
360
  },
361
  {
362
  "epoch": 0.73,
363
- "learning_rate": 1.5984723141740574e-06,
364
- "logits/chosen": -0.3018794655799866,
365
- "logits/rejected": -0.24929973483085632,
366
- "logps/chosen": -420.9669494628906,
367
- "logps/rejected": -344.3638000488281,
368
- "loss": 0.3565,
369
- "rewards/accuracies": 0.6312500238418579,
370
- "rewards/chosen": -0.7072012424468994,
371
- "rewards/margins": 0.3442048728466034,
372
- "rewards/rejected": -1.0514062643051147,
373
  "step": 260
374
  },
375
  {
376
  "epoch": 0.76,
377
- "learning_rate": 1.5583774538234882e-06,
378
- "logits/chosen": -0.26736167073249817,
379
- "logits/rejected": -0.21991169452667236,
380
- "logps/chosen": -406.1842346191406,
381
- "logps/rejected": -344.8274841308594,
382
- "loss": 0.3229,
383
- "rewards/accuracies": 0.606249988079071,
384
- "rewards/chosen": -0.8611620664596558,
385
- "rewards/margins": 0.3229338824748993,
386
- "rewards/rejected": -1.1840959787368774,
387
  "step": 270
388
  },
389
  {
390
  "epoch": 0.79,
391
- "learning_rate": 1.5169332005662589e-06,
392
- "logits/chosen": -0.33921122550964355,
393
- "logits/rejected": -0.2950724959373474,
394
- "logps/chosen": -423.4300231933594,
395
- "logps/rejected": -334.3074645996094,
396
- "loss": 0.3151,
397
- "rewards/accuracies": 0.737500011920929,
398
- "rewards/chosen": -0.9226363897323608,
399
- "rewards/margins": 0.36767420172691345,
400
- "rewards/rejected": -1.2903106212615967,
401
  "step": 280
402
  },
403
  {
404
  "epoch": 0.81,
405
- "learning_rate": 1.474239709917218e-06,
406
- "logits/chosen": -0.3765650689601898,
407
- "logits/rejected": -0.34539324045181274,
408
- "logps/chosen": -417.9461364746094,
409
- "logps/rejected": -340.34716796875,
410
- "loss": 0.2955,
411
- "rewards/accuracies": 0.71875,
412
- "rewards/chosen": -0.843783974647522,
413
- "rewards/margins": 0.4681572914123535,
414
- "rewards/rejected": -1.311941146850586,
415
  "step": 290
416
  },
417
  {
418
  "epoch": 0.84,
419
- "learning_rate": 1.430400156338457e-06,
420
- "logits/chosen": -0.45963913202285767,
421
- "logits/rejected": -0.3955768346786499,
422
- "logps/chosen": -441.5418395996094,
423
- "logps/rejected": -356.2896423339844,
424
- "loss": 0.2834,
425
- "rewards/accuracies": 0.6499999761581421,
426
- "rewards/chosen": -0.962236225605011,
427
- "rewards/margins": 0.41569775342941284,
428
- "rewards/rejected": -1.3779337406158447,
429
  "step": 300
430
  },
431
  {
432
  "epoch": 0.87,
433
- "learning_rate": 1.3855204839045892e-06,
434
- "logits/chosen": -0.48744335770606995,
435
- "logits/rejected": -0.43016400933265686,
436
- "logps/chosen": -453.83502197265625,
437
- "logps/rejected": -341.6754455566406,
438
- "loss": 0.2911,
439
- "rewards/accuracies": 0.6312500238418579,
440
- "rewards/chosen": -1.0828750133514404,
441
- "rewards/margins": 0.3926982283592224,
442
- "rewards/rejected": -1.4755733013153076,
443
  "step": 310
444
  },
445
  {
446
  "epoch": 0.9,
447
- "learning_rate": 1.3397091502748927e-06,
448
- "logits/chosen": -0.4381836950778961,
449
- "logits/rejected": -0.38170838356018066,
450
- "logps/chosen": -443.5067443847656,
451
- "logps/rejected": -344.43914794921875,
452
- "loss": 0.299,
453
- "rewards/accuracies": 0.6187499761581421,
454
- "rewards/chosen": -1.0263779163360596,
455
- "rewards/margins": 0.3381286859512329,
456
- "rewards/rejected": -1.3645066022872925,
457
  "step": 320
458
  },
459
  {
460
  "epoch": 0.93,
461
- "learning_rate": 1.2930768645910449e-06,
462
- "logits/chosen": -0.40465015172958374,
463
- "logits/rejected": -0.35638627409935,
464
- "logps/chosen": -396.4947204589844,
465
- "logps/rejected": -333.97650146484375,
466
- "loss": 0.3027,
467
- "rewards/accuracies": 0.606249988079071,
468
- "rewards/chosen": -0.8788374662399292,
469
- "rewards/margins": 0.32027482986450195,
470
- "rewards/rejected": -1.1991122961044312,
471
  "step": 330
472
  },
473
  {
474
  "epoch": 0.96,
475
- "learning_rate": 1.2457363199338495e-06,
476
- "logits/chosen": -0.46247321367263794,
477
- "logits/rejected": -0.3894230127334595,
478
- "logps/chosen": -367.4734802246094,
479
- "logps/rejected": -282.1875,
480
- "loss": 0.3023,
481
- "rewards/accuracies": 0.550000011920929,
482
- "rewards/chosen": -0.8268691301345825,
483
- "rewards/margins": 0.23292645812034607,
484
- "rewards/rejected": -1.059795618057251,
485
  "step": 340
486
  },
487
  {
488
  "epoch": 0.98,
489
- "learning_rate": 1.1978019209855173e-06,
490
- "logits/chosen": -0.46343177556991577,
491
- "logits/rejected": -0.4581407904624939,
492
- "logps/chosen": -406.019775390625,
493
- "logps/rejected": -358.7392883300781,
494
- "loss": 0.3244,
495
- "rewards/accuracies": 0.668749988079071,
496
- "rewards/chosen": -0.9359294772148132,
497
- "rewards/margins": 0.3501017987728119,
498
- "rewards/rejected": -1.2860312461853027,
499
  "step": 350
500
  },
501
  {
502
  "epoch": 1.01,
503
- "learning_rate": 1.14938950755563e-06,
504
- "logits/chosen": -0.5021234154701233,
505
- "logits/rejected": -0.4483606219291687,
506
- "logps/chosen": -400.4501037597656,
507
- "logps/rejected": -345.84906005859375,
508
- "loss": 0.2429,
509
  "rewards/accuracies": 0.737500011920929,
510
- "rewards/chosen": -0.8362014889717102,
511
- "rewards/margins": 0.6666104197502136,
512
- "rewards/rejected": -1.5028117895126343,
513
  "step": 360
514
  },
515
  {
516
  "epoch": 1.04,
517
- "learning_rate": 1.1006160746389332e-06,
518
- "logits/chosen": -0.48441916704177856,
519
- "logits/rejected": -0.4754874110221863,
520
- "logps/chosen": -414.5755310058594,
521
- "logps/rejected": -454.7470703125,
522
- "loss": 0.1045,
523
- "rewards/accuracies": 0.7875000238418579,
524
- "rewards/chosen": -1.2475640773773193,
525
- "rewards/margins": 1.0972559452056885,
526
- "rewards/rejected": -2.344820261001587,
527
  "step": 370
528
  },
529
  {
530
  "epoch": 1.07,
531
- "learning_rate": 1.0515994896814731e-06,
532
- "logits/chosen": -0.535969614982605,
533
- "logits/rejected": -0.5428508520126343,
534
- "logps/chosen": -626.1917724609375,
535
- "logps/rejected": -632.7891845703125,
536
- "loss": 0.0454,
537
- "rewards/accuracies": 0.824999988079071,
538
- "rewards/chosen": -2.6970226764678955,
539
- "rewards/margins": 1.3931865692138672,
540
- "rewards/rejected": -4.090209007263184,
541
  "step": 380
542
  },
543
  {
544
  "epoch": 1.1,
545
- "learning_rate": 1.002458207738333e-06,
546
- "logits/chosen": -0.3777693808078766,
547
- "logits/rejected": -0.3439493179321289,
548
- "logps/chosen": -622.327880859375,
549
- "logps/rejected": -571.1327514648438,
550
- "loss": 0.0574,
551
- "rewards/accuracies": 0.8687499761581421,
552
- "rewards/chosen": -2.0124690532684326,
553
- "rewards/margins": 1.4971076250076294,
554
- "rewards/rejected": -3.5095767974853516,
555
  "step": 390
556
  },
557
  {
558
  "epoch": 1.12,
559
- "learning_rate": 9.533109852113413e-07,
560
- "logits/chosen": -0.40328603982925415,
561
- "logits/rejected": -0.34989938139915466,
562
- "logps/chosen": -453.33563232421875,
563
- "logps/rejected": -487.29742431640625,
564
- "loss": 0.0638,
565
- "rewards/accuracies": 0.84375,
566
- "rewards/chosen": -1.598747968673706,
567
- "rewards/margins": 1.2498724460601807,
568
- "rewards/rejected": -2.8486204147338867,
569
  "step": 400
570
  },
571
  {
572
  "epoch": 1.15,
573
- "learning_rate": 9.042765928585326e-07,
574
- "logits/chosen": -0.3043842315673828,
575
- "logits/rejected": -0.31053781509399414,
576
- "logps/chosen": -483.6888122558594,
577
- "logps/rejected": -555.2555541992188,
578
- "loss": 0.0511,
579
- "rewards/accuracies": 0.8500000238418579,
580
- "rewards/chosen": -1.9158967733383179,
581
- "rewards/margins": 1.495742917060852,
582
- "rewards/rejected": -3.41163969039917,
583
  "step": 410
584
  },
585
  {
586
  "epoch": 1.18,
587
- "learning_rate": 8.554735287689148e-07,
588
- "logits/chosen": -0.2635629177093506,
589
- "logits/rejected": -0.22284087538719177,
590
- "logps/chosen": -522.8436279296875,
591
- "logps/rejected": -567.716552734375,
592
- "loss": 0.0467,
593
- "rewards/accuracies": 0.856249988079071,
594
- "rewards/chosen": -1.8879692554473877,
595
- "rewards/margins": 1.4974491596221924,
596
- "rewards/rejected": -3.385418653488159,
597
  "step": 420
598
  },
599
  {
600
  "epoch": 1.21,
601
- "learning_rate": 8.070197319961782e-07,
602
- "logits/chosen": -0.14604279398918152,
603
- "logits/rejected": -0.1439618021249771,
604
- "logps/chosen": -484.59967041015625,
605
- "logps/rejected": -506.89483642578125,
606
- "loss": 0.0408,
607
- "rewards/accuracies": 0.8500000238418579,
608
- "rewards/chosen": -1.8626312017440796,
609
- "rewards/margins": 1.5483224391937256,
610
- "rewards/rejected": -3.4109535217285156,
611
  "step": 430
612
  },
613
  {
614
  "epoch": 1.24,
615
- "learning_rate": 7.590322975433856e-07,
616
- "logits/chosen": -0.22616323828697205,
617
- "logits/rejected": -0.14803537726402283,
618
- "logps/chosen": -720.8937377929688,
619
- "logps/rejected": -669.2205200195312,
620
- "loss": 0.0367,
621
- "rewards/accuracies": 0.8500000238418579,
622
- "rewards/chosen": -2.2607662677764893,
623
- "rewards/margins": 1.8761478662490845,
624
- "rewards/rejected": -4.136914253234863,
625
  "step": 440
626
  },
627
  {
628
  "epoch": 1.26,
629
- "learning_rate": 7.116271933874245e-07,
630
- "logits/chosen": -0.16615112125873566,
631
- "logits/rejected": -0.06526105105876923,
632
- "logps/chosen": -621.0664672851562,
633
- "logps/rejected": -645.3672485351562,
634
- "loss": 0.03,
635
- "rewards/accuracies": 0.8187500238418579,
636
- "rewards/chosen": -2.6638753414154053,
637
- "rewards/margins": 1.425157904624939,
638
- "rewards/rejected": -4.089033603668213,
639
  "step": 450
640
  },
641
  {
642
  "epoch": 1.29,
643
- "learning_rate": 6.649189802270652e-07,
644
- "logits/chosen": -0.12236519157886505,
645
- "logits/rejected": -0.05993504449725151,
646
- "logps/chosen": -556.0875244140625,
647
- "logps/rejected": -621.5553588867188,
648
- "loss": 0.0311,
649
- "rewards/accuracies": 0.8500000238418579,
650
- "rewards/chosen": -2.585132122039795,
651
- "rewards/margins": 1.5961207151412964,
652
- "rewards/rejected": -4.181252479553223,
653
  "step": 460
654
  },
655
  {
656
  "epoch": 1.32,
657
- "learning_rate": 6.190205346318926e-07,
658
- "logits/chosen": -0.07673145830631256,
659
- "logits/rejected": -0.08171101659536362,
660
- "logps/chosen": -561.4466552734375,
661
- "logps/rejected": -633.3453369140625,
662
- "loss": 0.0306,
663
  "rewards/accuracies": 0.8374999761581421,
664
- "rewards/chosen": -2.4036738872528076,
665
- "rewards/margins": 1.636997938156128,
666
- "rewards/rejected": -4.040672302246094,
667
  "step": 470
668
  },
669
  {
670
  "epoch": 1.35,
671
- "learning_rate": 5.740427762611604e-07,
672
- "logits/chosen": -0.09908205270767212,
673
- "logits/rejected": -0.03128683939576149,
674
- "logps/chosen": -573.7949829101562,
675
- "logps/rejected": -645.1226806640625,
676
- "loss": 0.0314,
677
- "rewards/accuracies": 0.8125,
678
- "rewards/chosen": -2.471958637237549,
679
- "rewards/margins": 1.6115013360977173,
680
- "rewards/rejected": -4.083459377288818,
681
  "step": 480
682
  },
683
  {
684
  "epoch": 1.38,
685
- "learning_rate": 5.300943998117749e-07,
686
- "logits/chosen": 0.031324755400419235,
687
- "logits/rejected": 0.16325363516807556,
688
- "logps/chosen": -535.8555908203125,
689
- "logps/rejected": -618.23193359375,
690
- "loss": 0.0266,
691
- "rewards/accuracies": 0.831250011920929,
692
- "rewards/chosen": -2.483048915863037,
693
- "rewards/margins": 1.7440725564956665,
694
- "rewards/rejected": -4.227121829986572,
695
  "step": 490
696
  },
697
  {
698
  "epoch": 1.4,
699
- "learning_rate": 4.872816123431976e-07,
700
- "logits/chosen": -0.025389358401298523,
701
- "logits/rejected": 0.06323707848787308,
702
- "logps/chosen": -554.9032592773438,
703
- "logps/rejected": -576.2711181640625,
704
- "loss": 0.029,
705
- "rewards/accuracies": 0.8374999761581421,
706
- "rewards/chosen": -2.5237717628479004,
707
- "rewards/margins": 1.4061567783355713,
708
- "rewards/rejected": -3.9299285411834717,
709
  "step": 500
710
  },
711
  {
712
  "epoch": 1.43,
713
- "learning_rate": 4.4570787661405e-07,
714
- "logits/chosen": 0.053199104964733124,
715
- "logits/rejected": 0.13226789236068726,
716
- "logps/chosen": -601.7017822265625,
717
- "logps/rejected": -650.9783935546875,
718
- "loss": 0.0294,
719
- "rewards/accuracies": 0.8500000238418579,
720
- "rewards/chosen": -2.425492763519287,
721
- "rewards/margins": 1.747205138206482,
722
- "rewards/rejected": -4.172698020935059,
723
  "step": 510
724
  },
725
  {
726
  "epoch": 1.46,
727
- "learning_rate": 4.0547366105068347e-07,
728
- "logits/chosen": 0.018769674003124237,
729
- "logits/rejected": 0.1149587631225586,
730
- "logps/chosen": -578.7161865234375,
731
- "logps/rejected": -623.023193359375,
732
- "loss": 0.0286,
733
- "rewards/accuracies": 0.84375,
734
- "rewards/chosen": -2.5578720569610596,
735
- "rewards/margins": 1.705985426902771,
736
- "rewards/rejected": -4.263857841491699,
737
  "step": 520
738
  },
739
  {
740
  "epoch": 1.49,
741
- "learning_rate": 3.666761969519528e-07,
742
- "logits/chosen": 0.045023586601018906,
743
- "logits/rejected": 0.06367478519678116,
744
- "logps/chosen": -585.3010864257812,
745
- "logps/rejected": -640.3782958984375,
746
- "loss": 0.0262,
747
- "rewards/accuracies": 0.875,
748
- "rewards/chosen": -2.601722240447998,
749
- "rewards/margins": 1.804703950881958,
750
- "rewards/rejected": -4.406426429748535,
751
  "step": 530
752
  },
753
  {
754
  "epoch": 1.52,
755
- "learning_rate": 3.2940924351693213e-07,
756
- "logits/chosen": 0.11350098997354507,
757
- "logits/rejected": 0.18628571927547455,
758
- "logps/chosen": -568.986328125,
759
- "logps/rejected": -641.9813232421875,
760
- "loss": 0.0238,
761
- "rewards/accuracies": 0.793749988079071,
762
- "rewards/chosen": -2.5491175651550293,
763
- "rewards/margins": 1.643070936203003,
764
- "rewards/rejected": -4.192188262939453,
765
  "step": 540
766
  },
767
  {
768
  "epoch": 1.55,
769
- "learning_rate": 2.937628612634184e-07,
770
- "logits/chosen": 0.013865552842617035,
771
- "logits/rejected": 0.027750706300139427,
772
- "logps/chosen": -620.2090454101562,
773
- "logps/rejected": -650.0667724609375,
774
- "loss": 0.0239,
775
  "rewards/accuracies": 0.8500000238418579,
776
- "rewards/chosen": -2.666903018951416,
777
- "rewards/margins": 1.803750991821289,
778
- "rewards/rejected": -4.470653533935547,
779
  "step": 550
780
  },
781
  {
782
  "epoch": 1.57,
783
- "learning_rate": 2.598231943847916e-07,
784
- "logits/chosen": 0.07076794654130936,
785
- "logits/rejected": 0.15127721428871155,
786
- "logps/chosen": -636.1734008789062,
787
- "logps/rejected": -626.8192138671875,
788
- "loss": 0.0229,
789
- "rewards/accuracies": 0.831250011920929,
790
- "rewards/chosen": -2.7836403846740723,
791
- "rewards/margins": 1.6109874248504639,
792
- "rewards/rejected": -4.394627571105957,
793
  "step": 560
794
  },
795
  {
796
  "epoch": 1.6,
797
- "learning_rate": 2.276722625711861e-07,
798
- "logits/chosen": 0.008342927321791649,
799
- "logits/rejected": 0.10422797501087189,
800
- "logps/chosen": -604.8076782226562,
801
- "logps/rejected": -674.343505859375,
802
- "loss": 0.0205,
803
- "rewards/accuracies": 0.8125,
804
- "rewards/chosen": -2.9131510257720947,
805
- "rewards/margins": 1.7855117321014404,
806
- "rewards/rejected": -4.698662757873535,
807
  "step": 570
808
  },
809
  {
810
  "epoch": 1.63,
811
- "learning_rate": 1.973877627980699e-07,
812
- "logits/chosen": 0.03685791790485382,
813
- "logits/rejected": 0.02010912261903286,
814
- "logps/chosen": -591.6558227539062,
815
- "logps/rejected": -699.0025634765625,
816
- "loss": 0.0194,
817
- "rewards/accuracies": 0.800000011920929,
818
- "rewards/chosen": -3.0175013542175293,
819
- "rewards/margins": 1.4967237710952759,
820
- "rewards/rejected": -4.514225482940674,
821
  "step": 580
822
  },
823
  {
824
  "epoch": 1.66,
825
- "learning_rate": 1.6904288156123636e-07,
826
- "logits/chosen": 0.039042066782712936,
827
- "logits/rejected": 0.11644144356250763,
828
- "logps/chosen": -615.1423950195312,
829
- "logps/rejected": -660.23486328125,
830
- "loss": 0.0198,
831
- "rewards/accuracies": 0.793749988079071,
832
- "rewards/chosen": -2.8786990642547607,
833
- "rewards/margins": 1.8589880466461182,
834
- "rewards/rejected": -4.737687110900879,
835
  "step": 590
836
  },
837
  {
838
  "epoch": 1.69,
839
- "learning_rate": 1.4270611801196642e-07,
840
- "logits/chosen": 0.1002052053809166,
841
- "logits/rejected": 0.1534217894077301,
842
- "logps/chosen": -646.0867309570312,
843
- "logps/rejected": -681.2635498046875,
844
- "loss": 0.0202,
845
- "rewards/accuracies": 0.831250011920929,
846
- "rewards/chosen": -2.978548526763916,
847
- "rewards/margins": 1.8796141147613525,
848
- "rewards/rejected": -4.858162879943848,
849
  "step": 600
850
  },
851
  {
852
  "epoch": 1.71,
853
- "learning_rate": 1.1844111841977633e-07,
854
- "logits/chosen": 0.22774501144886017,
855
- "logits/rejected": 0.21065323054790497,
856
- "logps/chosen": -563.5538330078125,
857
- "logps/rejected": -615.5939331054688,
858
- "loss": 0.0205,
859
- "rewards/accuracies": 0.8062499761581421,
860
- "rewards/chosen": -2.7306342124938965,
861
- "rewards/margins": 1.688939094543457,
862
- "rewards/rejected": -4.4195733070373535,
863
  "step": 610
864
  },
865
  {
866
  "epoch": 1.74,
867
- "learning_rate": 9.630652236279625e-08,
868
- "logits/chosen": 0.14569208025932312,
869
- "logits/rejected": 0.17595478892326355,
870
- "logps/chosen": -612.38671875,
871
- "logps/rejected": -682.3800048828125,
872
- "loss": 0.0196,
873
- "rewards/accuracies": 0.862500011920929,
874
- "rewards/chosen": -2.906216859817505,
875
- "rewards/margins": 1.8156111240386963,
876
- "rewards/rejected": -4.721828460693359,
877
  "step": 620
878
  },
879
  {
880
  "epoch": 1.77,
881
- "learning_rate": 7.63558210174814e-08,
882
- "logits/chosen": 0.10530801862478256,
883
- "logits/rejected": 0.17252102494239807,
884
- "logps/chosen": -622.7254638671875,
885
- "logps/rejected": -673.253173828125,
886
- "loss": 0.0194,
887
- "rewards/accuracies": 0.856249988079071,
888
- "rewards/chosen": -2.9717555046081543,
889
- "rewards/margins": 1.84465754032135,
890
- "rewards/rejected": -4.816412925720215,
891
  "step": 630
892
  },
893
  {
894
  "epoch": 1.8,
895
- "learning_rate": 5.8637227890115273e-08,
896
- "logits/chosen": 0.05660830810666084,
897
- "logits/rejected": 0.15187661349773407,
898
- "logps/chosen": -602.2574462890625,
899
- "logps/rejected": -671.6920776367188,
900
- "loss": 0.0204,
901
- "rewards/accuracies": 0.856249988079071,
902
- "rewards/chosen": -2.8500285148620605,
903
- "rewards/margins": 1.8192436695098877,
904
- "rewards/rejected": -4.669272422790527,
905
  "step": 640
906
  },
907
  {
908
  "epoch": 1.83,
909
- "learning_rate": 4.3193562302499046e-08,
910
- "logits/chosen": 0.00826293509453535,
911
- "logits/rejected": -0.00432767765596509,
912
- "logps/chosen": -674.0474853515625,
913
- "logps/rejected": -713.0694580078125,
914
- "loss": 0.0175,
915
- "rewards/accuracies": 0.887499988079071,
916
- "rewards/chosen": -3.099088668823242,
917
- "rewards/margins": 1.819131851196289,
918
- "rewards/rejected": -4.918220520019531,
919
  "step": 650
920
  },
921
  {
922
  "epoch": 1.85,
923
- "learning_rate": 3.006214591340339e-08,
924
- "logits/chosen": 0.14368341863155365,
925
- "logits/rejected": 0.2905717194080353,
926
- "logps/chosen": -601.8983764648438,
927
- "logps/rejected": -663.1688232421875,
928
- "loss": 0.019,
929
- "rewards/accuracies": 0.7875000238418579,
930
- "rewards/chosen": -2.9037740230560303,
931
- "rewards/margins": 1.60433828830719,
932
- "rewards/rejected": -4.50811243057251,
933
  "step": 660
934
  },
935
  {
936
  "epoch": 1.88,
937
- "learning_rate": 1.9274712525847447e-08,
938
- "logits/chosen": 0.0023043565452098846,
939
- "logits/rejected": 0.013824631460011005,
940
- "logps/chosen": -618.7401123046875,
941
- "logps/rejected": -675.178466796875,
942
- "loss": 0.0212,
943
- "rewards/accuracies": 0.8187500238418579,
944
- "rewards/chosen": -2.900660276412964,
945
- "rewards/margins": 1.8997567892074585,
946
- "rewards/rejected": -4.800417423248291,
947
  "step": 670
948
  },
949
  {
950
  "epoch": 1.91,
951
- "learning_rate": 1.0857331398169577e-08,
952
- "logits/chosen": 0.062491677701473236,
953
- "logits/rejected": 0.17225809395313263,
954
- "logps/chosen": -628.8497314453125,
955
- "logps/rejected": -684.4832763671875,
956
- "loss": 0.019,
957
- "rewards/accuracies": 0.831250011920929,
958
- "rewards/chosen": -2.9637694358825684,
959
- "rewards/margins": 1.725229024887085,
960
- "rewards/rejected": -4.688998699188232,
961
  "step": 680
962
  },
963
  {
964
  "epoch": 1.94,
965
- "learning_rate": 4.830344244220686e-09,
966
- "logits/chosen": 0.13519100844860077,
967
- "logits/rejected": 0.12914660573005676,
968
- "logps/chosen": -621.2411499023438,
969
- "logps/rejected": -750.7539672851562,
970
- "loss": 0.0209,
971
- "rewards/accuracies": 0.8374999761581421,
972
- "rewards/chosen": -3.056195020675659,
973
- "rewards/margins": 1.9564968347549438,
974
- "rewards/rejected": -5.012692451477051,
975
  "step": 690
976
  },
977
  {
978
  "epoch": 1.97,
979
- "learning_rate": 1.2083160749236653e-09,
980
- "logits/chosen": 0.21589338779449463,
981
- "logits/rejected": 0.24260704219341278,
982
- "logps/chosen": -601.9577026367188,
983
- "logps/rejected": -672.21875,
984
- "loss": 0.0185,
985
- "rewards/accuracies": 0.862500011920929,
986
- "rewards/chosen": -3.022331476211548,
987
- "rewards/margins": 1.748324990272522,
988
- "rewards/rejected": -4.770656108856201,
989
  "step": 700
990
  },
991
  {
992
  "epoch": 2.0,
993
  "learning_rate": 0.0,
994
- "logits/chosen": 0.1099398136138916,
995
- "logits/rejected": 0.22721083462238312,
996
- "logps/chosen": -618.4613037109375,
997
- "logps/rejected": -655.667236328125,
998
- "loss": 0.0185,
999
- "rewards/accuracies": 0.8125,
1000
- "rewards/chosen": -2.7860031127929688,
1001
- "rewards/margins": 1.7313239574432373,
1002
- "rewards/rejected": -4.517327308654785,
1003
  "step": 710
1004
  },
1005
  {
1006
  "epoch": 2.0,
1007
  "step": 710,
1008
  "total_flos": 0.0,
1009
- "train_loss": 0.21128464219435839,
1010
- "train_runtime": 10587.5728,
1011
- "train_samples_per_second": 8.604,
1012
- "train_steps_per_second": 0.067
1013
  }
1014
  ],
1015
  "logging_steps": 10,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.03,
13
+ "learning_rate": 1.4084507042253522e-07,
14
+ "logits/chosen": -0.023548124358057976,
15
+ "logits/rejected": 0.04590621590614319,
16
+ "logps/chosen": -317.1582336425781,
17
+ "logps/rejected": -207.426513671875,
18
+ "loss": 0.5133,
19
+ "rewards/accuracies": 0.36250001192092896,
20
+ "rewards/chosen": -0.000816057319752872,
21
+ "rewards/margins": -0.0010325554758310318,
22
+ "rewards/rejected": 0.00021649803966283798,
23
  "step": 10
24
  },
25
  {
26
  "epoch": 0.06,
27
+ "learning_rate": 2.8169014084507043e-07,
28
+ "logits/chosen": -0.04698050394654274,
29
+ "logits/rejected": -0.011301965452730656,
30
+ "logps/chosen": -294.093994140625,
31
+ "logps/rejected": -201.62612915039062,
32
+ "loss": 0.5062,
33
+ "rewards/accuracies": 0.5562499761581421,
34
+ "rewards/chosen": -0.00026954649365507066,
35
+ "rewards/margins": 0.002362610073760152,
36
+ "rewards/rejected": -0.002632156480103731,
37
  "step": 20
38
  },
39
  {
40
  "epoch": 0.08,
41
+ "learning_rate": 4.225352112676056e-07,
42
+ "logits/chosen": -0.04627276584506035,
43
+ "logits/rejected": 0.00557746272534132,
44
+ "logps/chosen": -361.0042724609375,
45
+ "logps/rejected": -249.05715942382812,
46
+ "loss": 0.5083,
47
+ "rewards/accuracies": 0.5625,
48
+ "rewards/chosen": 0.00011544860899448395,
49
+ "rewards/margins": 0.010346856899559498,
50
+ "rewards/rejected": -0.01023140735924244,
51
  "step": 30
52
  },
53
  {
54
  "epoch": 0.11,
55
+ "learning_rate": 5.633802816901409e-07,
56
+ "logits/chosen": -0.052781809121370316,
57
+ "logits/rejected": -0.003899569856002927,
58
+ "logps/chosen": -290.2967834472656,
59
+ "logps/rejected": -194.12432861328125,
60
+ "loss": 0.5245,
61
+ "rewards/accuracies": 0.6187499761581421,
62
+ "rewards/chosen": -0.004079835955053568,
63
+ "rewards/margins": 0.03794458881020546,
64
+ "rewards/rejected": -0.042024414986371994,
65
  "step": 40
66
  },
67
  {
68
  "epoch": 0.14,
69
+ "learning_rate": 7.04225352112676e-07,
70
+ "logits/chosen": 0.019576847553253174,
71
+ "logits/rejected": 0.06315603107213974,
72
+ "logps/chosen": -370.23211669921875,
73
+ "logps/rejected": -221.93215942382812,
74
+ "loss": 0.5437,
75
+ "rewards/accuracies": 0.637499988079071,
76
+ "rewards/chosen": -0.02125033549964428,
77
+ "rewards/margins": 0.08855441212654114,
78
+ "rewards/rejected": -0.10980476438999176,
79
  "step": 50
80
  },
81
  {
82
  "epoch": 0.17,
83
+ "learning_rate": 8.450704225352112e-07,
84
+ "logits/chosen": -0.042748045176267624,
85
+ "logits/rejected": 0.01742837205529213,
86
+ "logps/chosen": -321.1835632324219,
87
+ "logps/rejected": -227.04336547851562,
88
+ "loss": 0.5284,
89
+ "rewards/accuracies": 0.574999988079071,
90
+ "rewards/chosen": -0.013542826287448406,
91
+ "rewards/margins": 0.09336896240711212,
92
+ "rewards/rejected": -0.10691177845001221,
93
  "step": 60
94
  },
95
  {
96
  "epoch": 0.2,
97
+ "learning_rate": 9.859154929577465e-07,
98
+ "logits/chosen": -0.0619073323905468,
99
+ "logits/rejected": -0.0089653879404068,
100
+ "logps/chosen": -356.95880126953125,
101
+ "logps/rejected": -257.0391845703125,
102
+ "loss": 0.5376,
103
+ "rewards/accuracies": 0.6000000238418579,
104
+ "rewards/chosen": -0.05503483861684799,
105
+ "rewards/margins": 0.16907523572444916,
106
+ "rewards/rejected": -0.22411008179187775,
107
  "step": 70
108
  },
109
  {
110
  "epoch": 0.22,
111
+ "learning_rate": 9.995106132599868e-07,
112
+ "logits/chosen": 0.04335876554250717,
113
+ "logits/rejected": 0.09913833439350128,
114
+ "logps/chosen": -323.42095947265625,
115
+ "logps/rejected": -238.894775390625,
116
+ "loss": 0.5539,
117
+ "rewards/accuracies": 0.543749988079071,
118
+ "rewards/chosen": -0.10817662626504898,
119
+ "rewards/margins": 0.1278904229402542,
120
+ "rewards/rejected": -0.2360670566558838,
121
  "step": 80
122
  },
123
  {
124
  "epoch": 0.25,
125
+ "learning_rate": 9.978201358980644e-07,
126
+ "logits/chosen": -0.012843596749007702,
127
+ "logits/rejected": 0.04404326155781746,
128
+ "logps/chosen": -363.57000732421875,
129
+ "logps/rejected": -281.42022705078125,
130
+ "loss": 0.5228,
131
+ "rewards/accuracies": 0.6312500238418579,
132
+ "rewards/chosen": 0.005774746648967266,
133
+ "rewards/margins": 0.13524329662322998,
134
+ "rewards/rejected": -0.12946854531764984,
135
  "step": 90
136
  },
137
  {
138
  "epoch": 0.28,
139
+ "learning_rate": 9.949266103908894e-07,
140
+ "logits/chosen": -0.006999261677265167,
141
+ "logits/rejected": 0.056761473417282104,
142
+ "logps/chosen": -276.5388488769531,
143
+ "logps/rejected": -212.882568359375,
144
+ "loss": 0.5063,
145
+ "rewards/accuracies": 0.518750011920929,
146
+ "rewards/chosen": 0.031200706958770752,
147
+ "rewards/margins": 0.1093803197145462,
148
+ "rewards/rejected": -0.07817960530519485,
149
  "step": 100
150
  },
151
  {
152
  "epoch": 0.31,
153
+ "learning_rate": 9.908370293252287e-07,
154
+ "logits/chosen": -0.0054739052429795265,
155
+ "logits/rejected": 0.018744127824902534,
156
+ "logps/chosen": -324.8765869140625,
157
+ "logps/rejected": -246.21212768554688,
158
+ "loss": 0.5016,
159
+ "rewards/accuracies": 0.5874999761581421,
160
+ "rewards/chosen": 0.05711355805397034,
161
+ "rewards/margins": 0.14952385425567627,
162
+ "rewards/rejected": -0.09241029620170593,
163
  "step": 110
164
  },
165
  {
166
  "epoch": 0.34,
167
+ "learning_rate": 9.855612757141654e-07,
168
+ "logits/chosen": -0.06719125807285309,
169
+ "logits/rejected": -0.008927728049457073,
170
+ "logps/chosen": -338.69720458984375,
171
+ "logps/rejected": -210.80349731445312,
172
+ "loss": 0.5033,
173
+ "rewards/accuracies": 0.637499988079071,
174
+ "rewards/chosen": 0.007685990538448095,
175
+ "rewards/margins": 0.2579048275947571,
176
+ "rewards/rejected": -0.25021880865097046,
177
  "step": 120
178
  },
179
  {
180
  "epoch": 0.37,
181
+ "learning_rate": 9.791120991134902e-07,
182
+ "logits/chosen": -0.10314974933862686,
183
+ "logits/rejected": -0.011623701080679893,
184
+ "logps/chosen": -380.0775451660156,
185
+ "logps/rejected": -255.47286987304688,
186
+ "loss": 0.5122,
187
+ "rewards/accuracies": 0.6499999761581421,
188
+ "rewards/chosen": -0.06153721362352371,
189
+ "rewards/margins": 0.2917006313800812,
190
+ "rewards/rejected": -0.3532378077507019,
191
  "step": 130
192
  },
193
  {
194
  "epoch": 0.39,
195
+ "learning_rate": 9.715050848107168e-07,
196
+ "logits/chosen": -0.17015981674194336,
197
+ "logits/rejected": -0.08929729461669922,
198
+ "logps/chosen": -344.5144958496094,
199
+ "logps/rejected": -232.58786010742188,
200
+ "loss": 0.4817,
201
+ "rewards/accuracies": 0.625,
202
+ "rewards/chosen": -0.038360703736543655,
203
+ "rewards/margins": 0.26252132654190063,
204
+ "rewards/rejected": -0.3008820414543152,
205
  "step": 140
206
  },
207
  {
208
  "epoch": 0.42,
209
+ "learning_rate": 9.627586161611731e-07,
210
+ "logits/chosen": -0.09996357560157776,
211
+ "logits/rejected": -0.0434463731944561,
212
+ "logps/chosen": -285.2746887207031,
213
+ "logps/rejected": -217.32177734375,
214
+ "loss": 0.479,
215
+ "rewards/accuracies": 0.6187499761581421,
216
+ "rewards/chosen": -0.06696902960538864,
217
+ "rewards/margins": 0.18583206832408905,
218
+ "rewards/rejected": -0.2528010904788971,
219
  "step": 150
220
  },
221
  {
222
  "epoch": 0.45,
223
+ "learning_rate": 9.528938301621955e-07,
224
+ "logits/chosen": -0.17547622323036194,
225
+ "logits/rejected": -0.11714746057987213,
226
+ "logps/chosen": -346.12451171875,
227
+ "logps/rejected": -225.3998565673828,
228
+ "loss": 0.4459,
229
+ "rewards/accuracies": 0.606249988079071,
230
+ "rewards/chosen": -0.02809586003422737,
231
+ "rewards/margins": 0.17596405744552612,
232
+ "rewards/rejected": -0.2040599286556244,
233
  "step": 160
234
  },
235
  {
236
  "epoch": 0.48,
237
+ "learning_rate": 9.419345663727804e-07,
238
+ "logits/chosen": -0.14170362055301666,
239
+ "logits/rejected": -0.08402098715305328,
240
+ "logps/chosen": -348.41986083984375,
241
+ "logps/rejected": -234.4894561767578,
242
+ "loss": 0.4488,
243
+ "rewards/accuracies": 0.637499988079071,
244
+ "rewards/chosen": -0.03395242244005203,
245
+ "rewards/margins": 0.22700819373130798,
246
+ "rewards/rejected": -0.2609606385231018,
247
  "step": 170
248
  },
249
  {
250
  "epoch": 0.51,
251
+ "learning_rate": 9.299073093021404e-07,
252
+ "logits/chosen": -0.07009784877300262,
253
+ "logits/rejected": -0.014887778088450432,
254
+ "logps/chosen": -359.72296142578125,
255
+ "logps/rejected": -259.1075439453125,
256
+ "loss": 0.472,
257
+ "rewards/accuracies": 0.643750011920929,
258
+ "rewards/chosen": -0.18398258090019226,
259
+ "rewards/margins": 0.25430697202682495,
260
+ "rewards/rejected": -0.4382895827293396,
261
  "step": 180
262
  },
263
  {
264
  "epoch": 0.53,
265
+ "learning_rate": 9.168411244063861e-07,
266
+ "logits/chosen": -0.15516254305839539,
267
+ "logits/rejected": -0.09202875196933746,
268
+ "logps/chosen": -376.22027587890625,
269
+ "logps/rejected": -267.2409362792969,
270
+ "loss": 0.4593,
271
+ "rewards/accuracies": 0.581250011920929,
272
+ "rewards/chosen": -0.2544471025466919,
273
+ "rewards/margins": 0.2352083921432495,
274
+ "rewards/rejected": -0.4896554946899414,
275
  "step": 190
276
  },
277
  {
278
  "epoch": 0.56,
279
+ "learning_rate": 9.02767587848013e-07,
280
+ "logits/chosen": -0.16307282447814941,
281
+ "logits/rejected": -0.11575134098529816,
282
+ "logps/chosen": -327.9281311035156,
283
+ "logps/rejected": -276.331787109375,
284
+ "loss": 0.448,
285
+ "rewards/accuracies": 0.581250011920929,
286
+ "rewards/chosen": -0.21112385392189026,
287
+ "rewards/margins": 0.1783614158630371,
288
+ "rewards/rejected": -0.38948526978492737,
289
  "step": 200
290
  },
291
  {
292
  "epoch": 0.59,
293
+ "learning_rate": 8.877207101879301e-07,
294
+ "logits/chosen": -0.11954480409622192,
295
+ "logits/rejected": -0.06289126724004745,
296
+ "logps/chosen": -306.99871826171875,
297
+ "logps/rejected": -228.0065460205078,
298
+ "loss": 0.4238,
299
+ "rewards/accuracies": 0.6499999761581421,
300
+ "rewards/chosen": -0.18940094113349915,
301
+ "rewards/margins": 0.19614948332309723,
302
+ "rewards/rejected": -0.3855504095554352,
303
  "step": 210
304
  },
305
  {
306
  "epoch": 0.62,
307
+ "learning_rate": 8.717368541944452e-07,
308
+ "logits/chosen": -0.14218950271606445,
309
+ "logits/rejected": -0.12026125192642212,
310
+ "logps/chosen": -317.53515625,
311
+ "logps/rejected": -235.4711456298828,
312
+ "loss": 0.4305,
313
+ "rewards/accuracies": 0.581250011920929,
314
+ "rewards/chosen": -0.2156684845685959,
315
+ "rewards/margins": 0.20358431339263916,
316
+ "rewards/rejected": -0.41925281286239624,
317
  "step": 220
318
  },
319
  {
320
  "epoch": 0.65,
321
+ "learning_rate": 8.54854646967831e-07,
322
+ "logits/chosen": -0.2029605209827423,
323
+ "logits/rejected": -0.1495995819568634,
324
+ "logps/chosen": -356.8483581542969,
325
+ "logps/rejected": -289.8570251464844,
326
+ "loss": 0.4198,
327
+ "rewards/accuracies": 0.6312500238418579,
328
+ "rewards/chosen": -0.2576315402984619,
329
+ "rewards/margins": 0.24145260453224182,
330
+ "rewards/rejected": -0.49908414483070374,
331
  "step": 230
332
  },
333
  {
334
  "epoch": 0.67,
335
+ "learning_rate": 8.371148865928318e-07,
336
+ "logits/chosen": -0.14190950989723206,
337
+ "logits/rejected": -0.10926549136638641,
338
+ "logps/chosen": -341.60406494140625,
339
+ "logps/rejected": -277.4010009765625,
340
+ "loss": 0.3943,
341
+ "rewards/accuracies": 0.5562499761581421,
342
+ "rewards/chosen": -0.26126688718795776,
343
+ "rewards/margins": 0.23097483813762665,
344
+ "rewards/rejected": -0.492241770029068,
345
  "step": 240
346
  },
347
  {
348
  "epoch": 0.7,
349
+ "learning_rate": 8.185604435447001e-07,
350
+ "logits/chosen": -0.11840251833200455,
351
+ "logits/rejected": -0.07302571833133698,
352
+ "logps/chosen": -352.67669677734375,
353
+ "logps/rejected": -278.2833557128906,
354
+ "loss": 0.4072,
355
+ "rewards/accuracies": 0.6187499761581421,
356
+ "rewards/chosen": -0.27463454008102417,
357
+ "rewards/margins": 0.2554408013820648,
358
+ "rewards/rejected": -0.5300754308700562,
359
  "step": 250
360
  },
361
  {
362
  "epoch": 0.73,
363
+ "learning_rate": 7.992361570870287e-07,
364
+ "logits/chosen": -0.20119109749794006,
365
+ "logits/rejected": -0.1608552783727646,
366
+ "logps/chosen": -312.8297424316406,
367
+ "logps/rejected": -253.52206420898438,
368
+ "loss": 0.4189,
369
+ "rewards/accuracies": 0.668749988079071,
370
+ "rewards/chosen": -0.3588634431362152,
371
+ "rewards/margins": 0.18340489268302917,
372
+ "rewards/rejected": -0.5422683358192444,
373
  "step": 260
374
  },
375
  {
376
  "epoch": 0.76,
377
+ "learning_rate": 7.791887269117441e-07,
378
+ "logits/chosen": -0.130401611328125,
379
+ "logits/rejected": -0.061286091804504395,
380
+ "logps/chosen": -357.2626953125,
381
+ "logps/rejected": -254.2506866455078,
382
+ "loss": 0.4325,
383
+ "rewards/accuracies": 0.65625,
384
+ "rewards/chosen": -0.39726734161376953,
385
+ "rewards/margins": 0.31203722953796387,
386
+ "rewards/rejected": -0.7093045711517334,
387
  "step": 270
388
  },
389
  {
390
  "epoch": 0.79,
391
+ "learning_rate": 7.584666002831294e-07,
392
+ "logits/chosen": -0.21146509051322937,
393
+ "logits/rejected": -0.15420952439308167,
394
+ "logps/chosen": -403.4113464355469,
395
+ "logps/rejected": -285.39385986328125,
396
+ "loss": 0.411,
397
+ "rewards/accuracies": 0.6937500238418579,
398
+ "rewards/chosen": -0.3059775233268738,
399
+ "rewards/margins": 0.321936696767807,
400
+ "rewards/rejected": -0.6279141902923584,
401
  "step": 280
402
  },
403
  {
404
  "epoch": 0.81,
405
+ "learning_rate": 7.37119854958609e-07,
406
+ "logits/chosen": -0.20524680614471436,
407
+ "logits/rejected": -0.16661730408668518,
408
+ "logps/chosen": -373.66961669921875,
409
+ "logps/rejected": -300.1796569824219,
410
+ "loss": 0.3785,
411
+ "rewards/accuracies": 0.668749988079071,
412
+ "rewards/chosen": -0.35920846462249756,
413
+ "rewards/margins": 0.23060360550880432,
414
+ "rewards/rejected": -0.5898120403289795,
415
  "step": 290
416
  },
417
  {
418
  "epoch": 0.84,
419
+ "learning_rate": 7.152000781692285e-07,
420
+ "logits/chosen": -0.2269669473171234,
421
+ "logits/rejected": -0.20886960625648499,
422
+ "logps/chosen": -376.932373046875,
423
+ "logps/rejected": -305.97137451171875,
424
+ "loss": 0.353,
425
+ "rewards/accuracies": 0.606249988079071,
426
+ "rewards/chosen": -0.34051477909088135,
427
+ "rewards/margins": 0.1766359806060791,
428
+ "rewards/rejected": -0.5171507596969604,
429
  "step": 300
430
  },
431
  {
432
  "epoch": 0.87,
433
+ "learning_rate": 6.927602419522946e-07,
434
+ "logits/chosen": -0.1920723021030426,
435
+ "logits/rejected": -0.19167286157608032,
436
+ "logps/chosen": -328.34930419921875,
437
+ "logps/rejected": -271.737548828125,
438
+ "loss": 0.3549,
439
+ "rewards/accuracies": 0.5687500238418579,
440
+ "rewards/chosen": -0.4537748396396637,
441
+ "rewards/margins": 0.15218928456306458,
442
+ "rewards/rejected": -0.6059640645980835,
443
  "step": 310
444
  },
445
  {
446
  "epoch": 0.9,
447
+ "learning_rate": 6.698545751374463e-07,
448
+ "logits/chosen": -0.2686254382133484,
449
+ "logits/rejected": -0.18076984584331512,
450
+ "logps/chosen": -424.98052978515625,
451
+ "logps/rejected": -309.3459777832031,
452
+ "loss": 0.38,
453
+ "rewards/accuracies": 0.6499999761581421,
454
+ "rewards/chosen": -0.4298086166381836,
455
+ "rewards/margins": 0.33044153451919556,
456
+ "rewards/rejected": -0.7602501511573792,
457
  "step": 320
458
  },
459
  {
460
  "epoch": 0.93,
461
+ "learning_rate": 6.465384322955224e-07,
462
+ "logits/chosen": -0.21811941266059875,
463
+ "logits/rejected": -0.1839132010936737,
464
+ "logps/chosen": -372.3172607421875,
465
+ "logps/rejected": -272.31597900390625,
466
+ "loss": 0.3957,
467
+ "rewards/accuracies": 0.699999988079071,
468
+ "rewards/chosen": -0.42889633774757385,
469
+ "rewards/margins": 0.3255406320095062,
470
+ "rewards/rejected": -0.7544369697570801,
471
  "step": 330
472
  },
473
  {
474
  "epoch": 0.96,
475
+ "learning_rate": 6.228681599669248e-07,
476
+ "logits/chosen": -0.19736522436141968,
477
+ "logits/rejected": -0.13541147112846375,
478
+ "logps/chosen": -430.74932861328125,
479
+ "logps/rejected": -293.2500915527344,
480
+ "loss": 0.3951,
481
+ "rewards/accuracies": 0.6937500238418579,
482
+ "rewards/chosen": -0.36328303813934326,
483
+ "rewards/margins": 0.36417144536972046,
484
+ "rewards/rejected": -0.7274545431137085,
485
  "step": 340
486
  },
487
  {
488
  "epoch": 0.98,
489
+ "learning_rate": 5.989009604927586e-07,
490
+ "logits/chosen": -0.18117669224739075,
491
+ "logits/rejected": -0.08865699172019958,
492
+ "logps/chosen": -387.6150207519531,
493
+ "logps/rejected": -295.8270263671875,
494
+ "loss": 0.3683,
495
+ "rewards/accuracies": 0.6625000238418579,
496
+ "rewards/chosen": -0.2865923345088959,
497
+ "rewards/margins": 0.37007588148117065,
498
+ "rewards/rejected": -0.6566681861877441,
499
  "step": 350
500
  },
501
  {
502
  "epoch": 1.01,
503
+ "learning_rate": 5.74694753777815e-07,
504
+ "logits/chosen": -0.18364325165748596,
505
+ "logits/rejected": -0.16212408244609833,
506
+ "logps/chosen": -325.68133544921875,
507
+ "logps/rejected": -274.33734130859375,
508
+ "loss": 0.3395,
509
  "rewards/accuracies": 0.737500011920929,
510
+ "rewards/chosen": -0.40283432602882385,
511
+ "rewards/margins": 0.3764493465423584,
512
+ "rewards/rejected": -0.7792836427688599,
513
  "step": 360
514
  },
515
  {
516
  "epoch": 1.04,
517
+ "learning_rate": 5.503080373194666e-07,
518
+ "logits/chosen": -0.28244373202323914,
519
+ "logits/rejected": -0.2197529822587967,
520
+ "logps/chosen": -375.904541015625,
521
+ "logps/rejected": -331.39263916015625,
522
+ "loss": 0.2837,
523
+ "rewards/accuracies": 0.8062499761581421,
524
+ "rewards/chosen": -0.45602947473526,
525
+ "rewards/margins": 0.6367942094802856,
526
+ "rewards/rejected": -1.0928236246109009,
527
  "step": 370
528
  },
529
  {
530
  "epoch": 1.07,
531
+ "learning_rate": 5.257997448407366e-07,
532
+ "logits/chosen": -0.24875327944755554,
533
+ "logits/rejected": -0.1415044665336609,
534
+ "logps/chosen": -409.04010009765625,
535
+ "logps/rejected": -306.00079345703125,
536
+ "loss": 0.2459,
537
+ "rewards/accuracies": 0.78125,
538
+ "rewards/chosen": -0.48139676451683044,
539
+ "rewards/margins": 0.6319422721862793,
540
+ "rewards/rejected": -1.1133390665054321,
541
  "step": 380
542
  },
543
  {
544
  "epoch": 1.1,
545
+ "learning_rate": 5.012291038691665e-07,
546
+ "logits/chosen": -0.2663780748844147,
547
+ "logits/rejected": -0.18359437584877014,
548
+ "logps/chosen": -421.8829650878906,
549
+ "logps/rejected": -379.4888916015625,
550
+ "loss": 0.2079,
551
+ "rewards/accuracies": 0.800000011920929,
552
+ "rewards/chosen": -0.7042829394340515,
553
+ "rewards/margins": 0.6811956167221069,
554
+ "rewards/rejected": -1.3854784965515137,
555
  "step": 390
556
  },
557
  {
558
  "epoch": 1.12,
559
+ "learning_rate": 4.7665549260567063e-07,
560
+ "logits/chosen": -0.30431362986564636,
561
+ "logits/rejected": -0.24137239158153534,
562
+ "logps/chosen": -431.7137145996094,
563
+ "logps/rejected": -361.01959228515625,
564
+ "loss": 0.1728,
565
+ "rewards/accuracies": 0.8500000238418579,
566
+ "rewards/chosen": -0.903692364692688,
567
+ "rewards/margins": 0.7888330221176147,
568
+ "rewards/rejected": -1.6925252676010132,
569
  "step": 400
570
  },
571
  {
572
  "epoch": 1.15,
573
+ "learning_rate": 4.521382964292663e-07,
574
+ "logits/chosen": -0.23668956756591797,
575
+ "logits/rejected": -0.19462040066719055,
576
+ "logps/chosen": -423.92254638671875,
577
+ "logps/rejected": -366.35723876953125,
578
+ "loss": 0.1768,
579
+ "rewards/accuracies": 0.7875000238418579,
580
+ "rewards/chosen": -0.9347349405288696,
581
+ "rewards/margins": 0.7054546475410461,
582
+ "rewards/rejected": -1.640189528465271,
583
  "step": 410
584
  },
585
  {
586
  "epoch": 1.18,
587
+ "learning_rate": 4.277367643844574e-07,
588
+ "logits/chosen": -0.2629498541355133,
589
+ "logits/rejected": -0.19311530888080597,
590
+ "logps/chosen": -465.82305908203125,
591
+ "logps/rejected": -373.01806640625,
592
+ "loss": 0.1767,
593
+ "rewards/accuracies": 0.75,
594
+ "rewards/chosen": -1.0273025035858154,
595
+ "rewards/margins": 0.6468140482902527,
596
+ "rewards/rejected": -1.6741164922714233,
597
  "step": 420
598
  },
599
  {
600
  "epoch": 1.21,
601
+ "learning_rate": 4.035098659980891e-07,
602
+ "logits/chosen": -0.24669210612773895,
603
+ "logits/rejected": -0.1399877965450287,
604
+ "logps/chosen": -415.6297912597656,
605
+ "logps/rejected": -384.90948486328125,
606
+ "loss": 0.1839,
607
+ "rewards/accuracies": 0.7749999761581421,
608
+ "rewards/chosen": -0.9914595484733582,
609
+ "rewards/margins": 0.6535197496414185,
610
+ "rewards/rejected": -1.6449792385101318,
611
  "step": 430
612
  },
613
  {
614
  "epoch": 1.24,
615
+ "learning_rate": 3.795161487716928e-07,
616
+ "logits/chosen": -0.22286108136177063,
617
+ "logits/rejected": -0.14977149665355682,
618
+ "logps/chosen": -483.51092529296875,
619
+ "logps/rejected": -409.53411865234375,
620
+ "loss": 0.1723,
621
+ "rewards/accuracies": 0.84375,
622
+ "rewards/chosen": -0.9548113942146301,
623
+ "rewards/margins": 0.8746234178543091,
624
+ "rewards/rejected": -1.8294346332550049,
625
  "step": 440
626
  },
627
  {
628
  "epoch": 1.26,
629
+ "learning_rate": 3.5581359669371223e-07,
630
+ "logits/chosen": -0.17988570034503937,
631
+ "logits/rejected": -0.17582079768180847,
632
+ "logps/chosen": -420.56048583984375,
633
+ "logps/rejected": -363.8465270996094,
634
+ "loss": 0.1609,
635
+ "rewards/accuracies": 0.7875000238418579,
636
+ "rewards/chosen": -1.0016785860061646,
637
+ "rewards/margins": 0.5783125758171082,
638
+ "rewards/rejected": -1.5799912214279175,
639
  "step": 450
640
  },
641
  {
642
  "epoch": 1.29,
643
+ "learning_rate": 3.324594901135326e-07,
644
+ "logits/chosen": -0.22410225868225098,
645
+ "logits/rejected": -0.16186970472335815,
646
+ "logps/chosen": -437.3802185058594,
647
+ "logps/rejected": -374.47247314453125,
648
+ "loss": 0.1541,
649
+ "rewards/accuracies": 0.768750011920929,
650
+ "rewards/chosen": -1.0312308073043823,
651
+ "rewards/margins": 0.6275131702423096,
652
+ "rewards/rejected": -1.6587440967559814,
653
  "step": 460
654
  },
655
  {
656
  "epoch": 1.32,
657
+ "learning_rate": 3.095102673159463e-07,
658
+ "logits/chosen": -0.18208977580070496,
659
+ "logits/rejected": -0.15096168220043182,
660
+ "logps/chosen": -473.5821228027344,
661
+ "logps/rejected": -409.6917419433594,
662
+ "loss": 0.1473,
663
  "rewards/accuracies": 0.8374999761581421,
664
+ "rewards/chosen": -1.1605236530303955,
665
+ "rewards/margins": 0.7620750069618225,
666
+ "rewards/rejected": -1.9225986003875732,
667
  "step": 470
668
  },
669
  {
670
  "epoch": 1.35,
671
+ "learning_rate": 2.870213881305802e-07,
672
+ "logits/chosen": -0.14899012446403503,
673
+ "logits/rejected": -0.05964149162173271,
674
+ "logps/chosen": -452.955078125,
675
+ "logps/rejected": -390.2185974121094,
676
+ "loss": 0.1482,
677
+ "rewards/accuracies": 0.8062499761581421,
678
+ "rewards/chosen": -1.1373943090438843,
679
+ "rewards/margins": 0.7015290856361389,
680
+ "rewards/rejected": -1.8389232158660889,
681
  "step": 480
682
  },
683
  {
684
  "epoch": 1.38,
685
+ "learning_rate": 2.6504719990588745e-07,
686
+ "logits/chosen": -0.14091524481773376,
687
+ "logits/rejected": -0.09124572575092316,
688
+ "logps/chosen": -451.630126953125,
689
+ "logps/rejected": -381.64276123046875,
690
+ "loss": 0.1474,
691
+ "rewards/accuracies": 0.768750011920929,
692
+ "rewards/chosen": -1.1757558584213257,
693
+ "rewards/margins": 0.7031680345535278,
694
+ "rewards/rejected": -1.878924012184143,
695
  "step": 490
696
  },
697
  {
698
  "epoch": 1.4,
699
+ "learning_rate": 2.436408061715988e-07,
700
+ "logits/chosen": -0.10461604595184326,
701
+ "logits/rejected": -0.0906451866030693,
702
+ "logps/chosen": -386.41705322265625,
703
+ "logps/rejected": -403.9093933105469,
704
+ "loss": 0.1412,
705
+ "rewards/accuracies": 0.78125,
706
+ "rewards/chosen": -1.1071223020553589,
707
+ "rewards/margins": 0.6409605145454407,
708
+ "rewards/rejected": -1.7480828762054443,
709
  "step": 500
710
  },
711
  {
712
  "epoch": 1.43,
713
+ "learning_rate": 2.22853938307025e-07,
714
+ "logits/chosen": -0.053080081939697266,
715
+ "logits/rejected": -0.0005340933566913009,
716
+ "logps/chosen": -396.68548583984375,
717
+ "logps/rejected": -341.5348815917969,
718
+ "loss": 0.1348,
719
+ "rewards/accuracies": 0.75,
720
+ "rewards/chosen": -1.1873186826705933,
721
+ "rewards/margins": 0.5248215794563293,
722
+ "rewards/rejected": -1.7121403217315674,
723
  "step": 510
724
  },
725
  {
726
  "epoch": 1.46,
727
+ "learning_rate": 2.0273683052534173e-07,
728
+ "logits/chosen": 0.018692368641495705,
729
+ "logits/rejected": 0.09040405601263046,
730
+ "logps/chosen": -455.23944091796875,
731
+ "logps/rejected": -418.3058166503906,
732
+ "loss": 0.14,
733
+ "rewards/accuracies": 0.768750011920929,
734
+ "rewards/chosen": -1.1594905853271484,
735
+ "rewards/margins": 0.8578866720199585,
736
+ "rewards/rejected": -2.0173771381378174,
737
  "step": 520
738
  },
739
  {
740
  "epoch": 1.49,
741
+ "learning_rate": 1.833380984759764e-07,
742
+ "logits/chosen": -0.05254416540265083,
743
+ "logits/rejected": 0.048647552728652954,
744
+ "logps/chosen": -407.8287048339844,
745
+ "logps/rejected": -425.9695739746094,
746
+ "loss": 0.1465,
747
+ "rewards/accuracies": 0.7562500238418579,
748
+ "rewards/chosen": -1.2748953104019165,
749
+ "rewards/margins": 0.7285683155059814,
750
+ "rewards/rejected": -2.0034632682800293,
751
  "step": 530
752
  },
753
  {
754
  "epoch": 1.52,
755
+ "learning_rate": 1.6470462175846606e-07,
756
+ "logits/chosen": -0.017660032957792282,
757
+ "logits/rejected": 0.017109563574194908,
758
+ "logps/chosen": -465.2491149902344,
759
+ "logps/rejected": -433.9029846191406,
760
+ "loss": 0.1469,
761
+ "rewards/accuracies": 0.7562500238418579,
762
+ "rewards/chosen": -1.0953867435455322,
763
+ "rewards/margins": 0.8007495999336243,
764
+ "rewards/rejected": -1.8961362838745117,
765
  "step": 540
766
  },
767
  {
768
  "epoch": 1.55,
769
+ "learning_rate": 1.468814306317092e-07,
770
+ "logits/chosen": 0.009503689594566822,
771
+ "logits/rejected": 0.04763117805123329,
772
+ "logps/chosen": -430.349609375,
773
+ "logps/rejected": -371.52532958984375,
774
+ "loss": 0.1389,
775
  "rewards/accuracies": 0.8500000238418579,
776
+ "rewards/chosen": -1.115271806716919,
777
+ "rewards/margins": 0.7877473831176758,
778
+ "rewards/rejected": -1.9030193090438843,
779
  "step": 550
780
  },
781
  {
782
  "epoch": 1.57,
783
+ "learning_rate": 1.299115971923958e-07,
784
+ "logits/chosen": -0.010196239687502384,
785
+ "logits/rejected": 0.055187441408634186,
786
+ "logps/chosen": -443.1453552246094,
787
+ "logps/rejected": -429.2059020996094,
788
+ "loss": 0.1395,
789
+ "rewards/accuracies": 0.84375,
790
+ "rewards/chosen": -1.179884910583496,
791
+ "rewards/margins": 0.8822008967399597,
792
+ "rewards/rejected": -2.0620856285095215,
793
  "step": 560
794
  },
795
  {
796
  "epoch": 1.6,
797
+ "learning_rate": 1.1383613128559305e-07,
798
+ "logits/chosen": -0.03149424493312836,
799
+ "logits/rejected": 0.04653029888868332,
800
+ "logps/chosen": -498.27069091796875,
801
+ "logps/rejected": -488.0616760253906,
802
+ "loss": 0.137,
803
+ "rewards/accuracies": 0.875,
804
+ "rewards/chosen": -1.2219698429107666,
805
+ "rewards/margins": 0.9878193140029907,
806
+ "rewards/rejected": -2.209789276123047,
807
  "step": 570
808
  },
809
  {
810
  "epoch": 1.63,
811
+ "learning_rate": 9.869388139903495e-08,
812
+ "logits/chosen": -0.0809461921453476,
813
+ "logits/rejected": -0.015049537643790245,
814
+ "logps/chosen": -539.2957763671875,
815
+ "logps/rejected": -481.3929748535156,
816
+ "loss": 0.132,
817
+ "rewards/accuracies": 0.8125,
818
+ "rewards/chosen": -1.3534362316131592,
819
+ "rewards/margins": 0.8015187978744507,
820
+ "rewards/rejected": -2.1549549102783203,
821
  "step": 580
822
  },
823
  {
824
  "epoch": 1.66,
825
+ "learning_rate": 8.452144078061818e-08,
826
+ "logits/chosen": 0.025172684341669083,
827
+ "logits/rejected": 0.056724805384874344,
828
+ "logps/chosen": -418.3292541503906,
829
+ "logps/rejected": -396.272216796875,
830
+ "loss": 0.1244,
831
+ "rewards/accuracies": 0.7875000238418579,
832
+ "rewards/chosen": -1.2570348978042603,
833
+ "rewards/margins": 0.773349940776825,
834
+ "rewards/rejected": -2.0303850173950195,
835
  "step": 590
836
  },
837
  {
838
  "epoch": 1.69,
839
+ "learning_rate": 7.135305900598321e-08,
840
+ "logits/chosen": -0.09145348519086838,
841
+ "logits/rejected": -0.024112572893500328,
842
+ "logps/chosen": -490.0101623535156,
843
+ "logps/rejected": -435.8873596191406,
844
+ "loss": 0.1211,
845
+ "rewards/accuracies": 0.800000011920929,
846
+ "rewards/chosen": -1.354788899421692,
847
+ "rewards/margins": 0.9178797006607056,
848
+ "rewards/rejected": -2.2726683616638184,
849
  "step": 600
850
  },
851
  {
852
  "epoch": 1.71,
853
+ "learning_rate": 5.9220559209888166e-08,
854
+ "logits/chosen": 0.052403099834918976,
855
+ "logits/rejected": 0.09970308840274811,
856
+ "logps/chosen": -431.0038146972656,
857
+ "logps/rejected": -452.14459228515625,
858
+ "loss": 0.1293,
859
+ "rewards/accuracies": 0.824999988079071,
860
+ "rewards/chosen": -1.2271394729614258,
861
+ "rewards/margins": 0.8939367532730103,
862
+ "rewards/rejected": -2.1210761070251465,
863
  "step": 610
864
  },
865
  {
866
  "epoch": 1.74,
867
+ "learning_rate": 4.815326118139812e-08,
868
+ "logits/chosen": 0.004505271557718515,
869
+ "logits/rejected": 0.142390176653862,
870
+ "logps/chosen": -400.72796630859375,
871
+ "logps/rejected": -388.5762634277344,
872
+ "loss": 0.135,
873
+ "rewards/accuracies": 0.762499988079071,
874
+ "rewards/chosen": -1.3616560697555542,
875
+ "rewards/margins": 0.6720155477523804,
876
+ "rewards/rejected": -2.0336716175079346,
877
  "step": 620
878
  },
879
  {
880
  "epoch": 1.77,
881
+ "learning_rate": 3.81779105087407e-08,
882
+ "logits/chosen": -0.03825841844081879,
883
+ "logits/rejected": 0.05763017386198044,
884
+ "logps/chosen": -464.5328063964844,
885
+ "logps/rejected": -442.33404541015625,
886
+ "loss": 0.1335,
887
+ "rewards/accuracies": 0.762499988079071,
888
+ "rewards/chosen": -1.4398839473724365,
889
+ "rewards/margins": 0.7831242680549622,
890
+ "rewards/rejected": -2.223008394241333,
891
  "step": 630
892
  },
893
  {
894
  "epoch": 1.8,
895
+ "learning_rate": 2.9318613945057637e-08,
896
+ "logits/chosen": 0.01526588760316372,
897
+ "logits/rejected": 0.07733525335788727,
898
+ "logps/chosen": -488.5757751464844,
899
+ "logps/rejected": -456.2373962402344,
900
+ "loss": 0.1244,
901
+ "rewards/accuracies": 0.824999988079071,
902
+ "rewards/chosen": -1.3201992511749268,
903
+ "rewards/margins": 0.9031252861022949,
904
+ "rewards/rejected": -2.2233245372772217,
905
  "step": 640
906
  },
907
  {
908
  "epoch": 1.83,
909
+ "learning_rate": 2.1596781151249523e-08,
910
+ "logits/chosen": -0.0026476040948182344,
911
+ "logits/rejected": 0.11588595062494278,
912
+ "logps/chosen": -447.08331298828125,
913
+ "logps/rejected": -423.4645080566406,
914
+ "loss": 0.1269,
915
+ "rewards/accuracies": 0.75,
916
+ "rewards/chosen": -1.3190840482711792,
917
+ "rewards/margins": 0.6764702796936035,
918
+ "rewards/rejected": -1.9955543279647827,
919
  "step": 650
920
  },
921
  {
922
  "epoch": 1.85,
923
+ "learning_rate": 1.5031072956701695e-08,
924
+ "logits/chosen": -0.013560554012656212,
925
+ "logits/rejected": 0.0541529655456543,
926
+ "logps/chosen": -485.8223571777344,
927
+ "logps/rejected": -464.66143798828125,
928
+ "loss": 0.1228,
929
+ "rewards/accuracies": 0.793749988079071,
930
+ "rewards/chosen": -1.3514947891235352,
931
+ "rewards/margins": 0.9442659616470337,
932
+ "rewards/rejected": -2.2957608699798584,
933
  "step": 660
934
  },
935
  {
936
  "epoch": 1.88,
937
+ "learning_rate": 9.637356262923723e-09,
938
+ "logits/chosen": 0.08311934769153595,
939
+ "logits/rejected": 0.10251543670892715,
940
+ "logps/chosen": -446.80938720703125,
941
+ "logps/rejected": -408.7090759277344,
942
+ "loss": 0.1266,
943
+ "rewards/accuracies": 0.8062499761581421,
944
+ "rewards/chosen": -1.3170959949493408,
945
+ "rewards/margins": 0.7009096145629883,
946
+ "rewards/rejected": -2.018005847930908,
947
  "step": 670
948
  },
949
  {
950
  "epoch": 1.91,
951
+ "learning_rate": 5.428665699084789e-09,
952
+ "logits/chosen": -0.018842682242393494,
953
+ "logits/rejected": -0.013545280322432518,
954
+ "logps/chosen": -475.67669677734375,
955
+ "logps/rejected": -463.5384826660156,
956
+ "loss": 0.1214,
957
+ "rewards/accuracies": 0.793749988079071,
958
+ "rewards/chosen": -1.2991926670074463,
959
+ "rewards/margins": 0.9007900357246399,
960
+ "rewards/rejected": -2.1999831199645996,
961
  "step": 680
962
  },
963
  {
964
  "epoch": 1.94,
965
+ "learning_rate": 2.415172122110343e-09,
966
+ "logits/chosen": 0.025611836463212967,
967
+ "logits/rejected": 0.10699748992919922,
968
+ "logps/chosen": -474.1285095214844,
969
+ "logps/rejected": -452.504150390625,
970
+ "loss": 0.125,
971
+ "rewards/accuracies": 0.800000011920929,
972
+ "rewards/chosen": -1.3146600723266602,
973
+ "rewards/margins": 0.7926696538925171,
974
+ "rewards/rejected": -2.107329845428467,
975
  "step": 690
976
  },
977
  {
978
  "epoch": 1.97,
979
+ "learning_rate": 6.041580374618327e-10,
980
+ "logits/chosen": -0.046818483620882034,
981
+ "logits/rejected": 0.026954257860779762,
982
+ "logps/chosen": -467.6886291503906,
983
+ "logps/rejected": -427.25994873046875,
984
+ "loss": 0.1285,
985
+ "rewards/accuracies": 0.7749999761581421,
986
+ "rewards/chosen": -1.3291774988174438,
987
+ "rewards/margins": 0.7668731212615967,
988
+ "rewards/rejected": -2.096050500869751,
989
  "step": 700
990
  },
991
  {
992
  "epoch": 2.0,
993
  "learning_rate": 0.0,
994
+ "logits/chosen": 0.03512246161699295,
995
+ "logits/rejected": 0.0718744620680809,
996
+ "logps/chosen": -398.6717224121094,
997
+ "logps/rejected": -408.44000244140625,
998
+ "loss": 0.1214,
999
+ "rewards/accuracies": 0.8062499761581421,
1000
+ "rewards/chosen": -1.3393633365631104,
1001
+ "rewards/margins": 0.6785662174224854,
1002
+ "rewards/rejected": -2.0179295539855957,
1003
  "step": 710
1004
  },
1005
  {
1006
  "epoch": 2.0,
1007
  "step": 710,
1008
  "total_flos": 0.0,
1009
+ "train_loss": 0.30386436640376774,
1010
+ "train_runtime": 10428.2681,
1011
+ "train_samples_per_second": 8.735,
1012
+ "train_steps_per_second": 0.068
1013
  }
1014
  ],
1015
  "logging_steps": 10,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c9da800f5e96dd69f198232ea91e3c7ca9805289c25d3d61903ec9fddff6d182
3
  size 6648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70fd862b96091c63a464a5db443d907f10b5d3ebcf6fc2771bb9babd8af279fa
3
  size 6648