aaabiao commited on
Commit
2c83d92
1 Parent(s): 79370db

Add files using large-upload tool

Browse files
Files changed (1) hide show
  1. trainer_state.json +732 -0
trainer_state.json ADDED
@@ -0,0 +1,732 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 800000000,
6
+ "global_step": 383,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "grad_norm": 0.84375,
14
+ "learning_rate": 1.282051282051282e-07,
15
+ "logits/chosen": -3.3797154426574707,
16
+ "logits/rejected": -3.440782070159912,
17
+ "logps/chosen": -244.57943725585938,
18
+ "logps/rejected": -168.14312744140625,
19
+ "loss": 0.6931,
20
+ "rewards/accuracies": 0.0,
21
+ "rewards/chosen": 0.0,
22
+ "rewards/diff": -0.5416666865348816,
23
+ "rewards/diff_abs": 0.5416666865348816,
24
+ "rewards/rejected": 0.0,
25
+ "rewards/student_margin": 0.0,
26
+ "rewards/teacher_margin": 0.5416666865348816,
27
+ "step": 1
28
+ },
29
+ {
30
+ "epoch": 0.03,
31
+ "grad_norm": 0.875,
32
+ "learning_rate": 1.282051282051282e-06,
33
+ "logits/chosen": -3.357137680053711,
34
+ "logits/rejected": -3.306201457977295,
35
+ "logps/chosen": -323.62054443359375,
36
+ "logps/rejected": -269.4796142578125,
37
+ "loss": 0.6938,
38
+ "rewards/accuracies": 0.48148149251937866,
39
+ "rewards/chosen": -0.002036293037235737,
40
+ "rewards/diff": -2.205963134765625,
41
+ "rewards/diff_abs": 2.205963134765625,
42
+ "rewards/rejected": 0.003695431165397167,
43
+ "rewards/student_margin": -0.0057317232713103294,
44
+ "rewards/teacher_margin": 2.2002317905426025,
45
+ "step": 10
46
+ },
47
+ {
48
+ "epoch": 0.05,
49
+ "grad_norm": 0.83984375,
50
+ "learning_rate": 2.564102564102564e-06,
51
+ "logits/chosen": -3.522096633911133,
52
+ "logits/rejected": -3.589216709136963,
53
+ "logps/chosen": -277.0390625,
54
+ "logps/rejected": -192.60079956054688,
55
+ "loss": 0.6932,
56
+ "rewards/accuracies": 0.6333333253860474,
57
+ "rewards/chosen": 0.004549483768641949,
58
+ "rewards/diff": -2.0932600498199463,
59
+ "rewards/diff_abs": 2.093954563140869,
60
+ "rewards/rejected": -0.0068780360743403435,
61
+ "rewards/student_margin": 0.011427519842982292,
62
+ "rewards/teacher_margin": 2.1046876907348633,
63
+ "step": 20
64
+ },
65
+ {
66
+ "epoch": 0.08,
67
+ "grad_norm": 0.84765625,
68
+ "learning_rate": 3.846153846153847e-06,
69
+ "logits/chosen": -3.415924072265625,
70
+ "logits/rejected": -3.5434460639953613,
71
+ "logps/chosen": -301.97265625,
72
+ "logps/rejected": -231.6956329345703,
73
+ "loss": 0.6919,
74
+ "rewards/accuracies": 0.36666664481163025,
75
+ "rewards/chosen": -0.010005621239542961,
76
+ "rewards/diff": -2.4664387702941895,
77
+ "rewards/diff_abs": 2.4664387702941895,
78
+ "rewards/rejected": 0.0007039735792204738,
79
+ "rewards/student_margin": -0.010709594003856182,
80
+ "rewards/teacher_margin": 2.4557292461395264,
81
+ "step": 30
82
+ },
83
+ {
84
+ "epoch": 0.1,
85
+ "grad_norm": 0.8203125,
86
+ "learning_rate": 4.99989574668946e-06,
87
+ "logits/chosen": -3.488649368286133,
88
+ "logits/rejected": -3.546332597732544,
89
+ "logps/chosen": -249.2800750732422,
90
+ "logps/rejected": -179.78993225097656,
91
+ "loss": 0.6901,
92
+ "rewards/accuracies": 0.30000001192092896,
93
+ "rewards/chosen": -0.007187096867710352,
94
+ "rewards/diff": -2.8658556938171387,
95
+ "rewards/diff_abs": 2.8658556938171387,
96
+ "rewards/rejected": 0.010752001777291298,
97
+ "rewards/student_margin": -0.017939096316695213,
98
+ "rewards/teacher_margin": 2.847916841506958,
99
+ "step": 40
100
+ },
101
+ {
102
+ "epoch": 0.13,
103
+ "grad_norm": 0.8125,
104
+ "learning_rate": 4.987395866955716e-06,
105
+ "logits/chosen": -3.38765025138855,
106
+ "logits/rejected": -3.5332159996032715,
107
+ "logps/chosen": -331.1888427734375,
108
+ "logps/rejected": -186.25814819335938,
109
+ "loss": 0.6859,
110
+ "rewards/accuracies": 0.6000000238418579,
111
+ "rewards/chosen": 0.018190687522292137,
112
+ "rewards/diff": -2.0240583419799805,
113
+ "rewards/diff_abs": 2.025913953781128,
114
+ "rewards/rejected": -0.0025426391512155533,
115
+ "rewards/student_margin": 0.02073333039879799,
116
+ "rewards/teacher_margin": 2.0447916984558105,
117
+ "step": 50
118
+ },
119
+ {
120
+ "epoch": 0.16,
121
+ "grad_norm": 0.8515625,
122
+ "learning_rate": 4.954164717534748e-06,
123
+ "logits/chosen": -3.347484588623047,
124
+ "logits/rejected": -3.380958080291748,
125
+ "logps/chosen": -327.35113525390625,
126
+ "logps/rejected": -350.645751953125,
127
+ "loss": 0.6851,
128
+ "rewards/accuracies": 0.5333333015441895,
129
+ "rewards/chosen": 0.0018952175742015243,
130
+ "rewards/diff": -1.218827486038208,
131
+ "rewards/diff_abs": 1.2245365381240845,
132
+ "rewards/rejected": 0.01603511907160282,
133
+ "rewards/student_margin": -0.014139902777969837,
134
+ "rewards/teacher_margin": 1.2046875953674316,
135
+ "step": 60
136
+ },
137
+ {
138
+ "epoch": 0.18,
139
+ "grad_norm": 0.8203125,
140
+ "learning_rate": 4.900479264361017e-06,
141
+ "logits/chosen": -3.400958299636841,
142
+ "logits/rejected": -3.4466145038604736,
143
+ "logps/chosen": -307.9151916503906,
144
+ "logps/rejected": -278.41619873046875,
145
+ "loss": 0.6795,
146
+ "rewards/accuracies": 0.5999999642372131,
147
+ "rewards/chosen": 0.03656778484582901,
148
+ "rewards/diff": -1.3170411586761475,
149
+ "rewards/diff_abs": 1.3170411586761475,
150
+ "rewards/rejected": 0.030171453952789307,
151
+ "rewards/student_margin": 0.0063963280990719795,
152
+ "rewards/teacher_margin": 1.3234374523162842,
153
+ "step": 70
154
+ },
155
+ {
156
+ "epoch": 0.21,
157
+ "grad_norm": 0.8359375,
158
+ "learning_rate": 4.826786950329646e-06,
159
+ "logits/chosen": -3.5235488414764404,
160
+ "logits/rejected": -3.580641269683838,
161
+ "logps/chosen": -283.22747802734375,
162
+ "logps/rejected": -180.744873046875,
163
+ "loss": 0.6766,
164
+ "rewards/accuracies": 0.6000000238418579,
165
+ "rewards/chosen": 0.0294550321996212,
166
+ "rewards/diff": -1.3980296850204468,
167
+ "rewards/diff_abs": 1.4047094583511353,
168
+ "rewards/rejected": 0.005609627813100815,
169
+ "rewards/student_margin": 0.023845406249165535,
170
+ "rewards/teacher_margin": 1.421875,
171
+ "step": 80
172
+ },
173
+ {
174
+ "epoch": 0.23,
175
+ "grad_norm": 0.828125,
176
+ "learning_rate": 4.733701966071226e-06,
177
+ "logits/chosen": -3.4635212421417236,
178
+ "logits/rejected": -3.515272617340088,
179
+ "logps/chosen": -335.2662048339844,
180
+ "logps/rejected": -171.05323791503906,
181
+ "loss": 0.6755,
182
+ "rewards/accuracies": 0.46666669845581055,
183
+ "rewards/chosen": -0.00026724563213065267,
184
+ "rewards/diff": -2.8980860710144043,
185
+ "rewards/diff_abs": 2.8980860710144043,
186
+ "rewards/rejected": 0.00875641219317913,
187
+ "rewards/student_margin": -0.009023657068610191,
188
+ "rewards/teacher_margin": 2.8890626430511475,
189
+ "step": 90
190
+ },
191
+ {
192
+ "epoch": 0.26,
193
+ "grad_norm": 0.82421875,
194
+ "learning_rate": 4.622000130963015e-06,
195
+ "logits/chosen": -3.5022895336151123,
196
+ "logits/rejected": -3.5655486583709717,
197
+ "logps/chosen": -305.0356750488281,
198
+ "logps/rejected": -202.9454803466797,
199
+ "loss": 0.6721,
200
+ "rewards/accuracies": 0.5333333611488342,
201
+ "rewards/chosen": 0.015491512604057789,
202
+ "rewards/diff": -2.5059962272644043,
203
+ "rewards/diff_abs": 2.5059962272644043,
204
+ "rewards/rejected": -0.03372037410736084,
205
+ "rewards/student_margin": 0.049211882054805756,
206
+ "rewards/teacher_margin": 2.555208444595337,
207
+ "step": 100
208
+ },
209
+ {
210
+ "epoch": 0.29,
211
+ "grad_norm": 0.796875,
212
+ "learning_rate": 4.492612427040864e-06,
213
+ "logits/chosen": -3.5529093742370605,
214
+ "logits/rejected": -3.6310908794403076,
215
+ "logps/chosen": -277.04351806640625,
216
+ "logps/rejected": -200.5059814453125,
217
+ "loss": 0.6662,
218
+ "rewards/accuracies": 0.7000000476837158,
219
+ "rewards/chosen": 0.0211379025131464,
220
+ "rewards/diff": -1.6043806076049805,
221
+ "rewards/diff_abs": 1.6043806076049805,
222
+ "rewards/rejected": -0.05364810302853584,
223
+ "rewards/student_margin": 0.07478600740432739,
224
+ "rewards/teacher_margin": 1.6791667938232422,
225
+ "step": 110
226
+ },
227
+ {
228
+ "epoch": 0.31,
229
+ "grad_norm": 0.8125,
230
+ "learning_rate": 4.346617239703676e-06,
231
+ "logits/chosen": -3.4835896492004395,
232
+ "logits/rejected": -3.6081409454345703,
233
+ "logps/chosen": -304.2657165527344,
234
+ "logps/rejected": -239.1654052734375,
235
+ "loss": 0.6626,
236
+ "rewards/accuracies": 0.7333333492279053,
237
+ "rewards/chosen": 0.052922703325748444,
238
+ "rewards/diff": -1.7108633518218994,
239
+ "rewards/diff_abs": 1.7417595386505127,
240
+ "rewards/rejected": 0.003890041261911392,
241
+ "rewards/student_margin": 0.04903266206383705,
242
+ "rewards/teacher_margin": 1.7598956823349,
243
+ "step": 120
244
+ },
245
+ {
246
+ "epoch": 0.34,
247
+ "grad_norm": 0.83203125,
248
+ "learning_rate": 4.185231369880461e-06,
249
+ "logits/chosen": -3.2226901054382324,
250
+ "logits/rejected": -3.4232337474823,
251
+ "logps/chosen": -324.42236328125,
252
+ "logps/rejected": -221.59872436523438,
253
+ "loss": 0.6601,
254
+ "rewards/accuracies": 0.6000000238418579,
255
+ "rewards/chosen": -0.002616771962493658,
256
+ "rewards/diff": -2.5576834678649902,
257
+ "rewards/diff_abs": 2.5576834678649902,
258
+ "rewards/rejected": -0.03972511366009712,
259
+ "rewards/student_margin": 0.03710833936929703,
260
+ "rewards/teacher_margin": 2.594791889190674,
261
+ "step": 130
262
+ },
263
+ {
264
+ "epoch": 0.37,
265
+ "grad_norm": 0.86328125,
266
+ "learning_rate": 4.009799892569317e-06,
267
+ "logits/chosen": -3.487971544265747,
268
+ "logits/rejected": -3.4927310943603516,
269
+ "logps/chosen": -294.6400451660156,
270
+ "logps/rejected": -235.9231719970703,
271
+ "loss": 0.6555,
272
+ "rewards/accuracies": 0.8666666746139526,
273
+ "rewards/chosen": 0.036725759506225586,
274
+ "rewards/diff": -2.0255935192108154,
275
+ "rewards/diff_abs": 2.0322105884552,
276
+ "rewards/rejected": -0.10851386934518814,
277
+ "rewards/student_margin": 0.14523963630199432,
278
+ "rewards/teacher_margin": 2.1708333492279053,
279
+ "step": 140
280
+ },
281
+ {
282
+ "epoch": 0.39,
283
+ "grad_norm": 0.796875,
284
+ "learning_rate": 3.8217849462726334e-06,
285
+ "logits/chosen": -3.6184945106506348,
286
+ "logits/rejected": -3.572862148284912,
287
+ "logps/chosen": -246.6650390625,
288
+ "logps/rejected": -221.8422088623047,
289
+ "loss": 0.6532,
290
+ "rewards/accuracies": 0.5,
291
+ "rewards/chosen": -0.019402923062443733,
292
+ "rewards/diff": -1.9371099472045898,
293
+ "rewards/diff_abs": 1.9371099472045898,
294
+ "rewards/rejected": -0.05625145882368088,
295
+ "rewards/student_margin": 0.03684854134917259,
296
+ "rewards/teacher_margin": 1.9739586114883423,
297
+ "step": 150
298
+ },
299
+ {
300
+ "epoch": 0.42,
301
+ "grad_norm": 0.83203125,
302
+ "learning_rate": 3.6227535467632873e-06,
303
+ "logits/chosen": -3.5052731037139893,
304
+ "logits/rejected": -3.6803977489471436,
305
+ "logps/chosen": -441.88018798828125,
306
+ "logps/rejected": -259.37799072265625,
307
+ "loss": 0.647,
308
+ "rewards/accuracies": 0.6333333253860474,
309
+ "rewards/chosen": 0.04087870568037033,
310
+ "rewards/diff": -1.968920350074768,
311
+ "rewards/diff_abs": 1.9809789657592773,
312
+ "rewards/rejected": -0.06728442758321762,
313
+ "rewards/student_margin": 0.10816313326358795,
314
+ "rewards/teacher_margin": 2.0770833492279053,
315
+ "step": 160
316
+ },
317
+ {
318
+ "epoch": 0.44,
319
+ "grad_norm": 0.8203125,
320
+ "learning_rate": 3.4143645267483144e-06,
321
+ "logits/chosen": -3.505716323852539,
322
+ "logits/rejected": -3.5521907806396484,
323
+ "logps/chosen": -318.2767333984375,
324
+ "logps/rejected": -263.3311767578125,
325
+ "loss": 0.6466,
326
+ "rewards/accuracies": 0.7333332896232605,
327
+ "rewards/chosen": -0.027162820100784302,
328
+ "rewards/diff": -2.4665005207061768,
329
+ "rewards/diff_abs": 2.5026025772094727,
330
+ "rewards/rejected": -0.1044122725725174,
331
+ "rewards/student_margin": 0.07724946737289429,
332
+ "rewards/teacher_margin": 2.543750047683716,
333
+ "step": 170
334
+ },
335
+ {
336
+ "epoch": 0.47,
337
+ "grad_norm": 0.8125,
338
+ "learning_rate": 3.1983547102818104e-06,
339
+ "logits/chosen": -3.4710662364959717,
340
+ "logits/rejected": -3.5504486560821533,
341
+ "logps/chosen": -357.35101318359375,
342
+ "logps/rejected": -294.84698486328125,
343
+ "loss": 0.6401,
344
+ "rewards/accuracies": 0.6333333253860474,
345
+ "rewards/chosen": -0.05695001035928726,
346
+ "rewards/diff": -1.7987518310546875,
347
+ "rewards/diff_abs": 1.833298683166504,
348
+ "rewards/rejected": -0.3196563422679901,
349
+ "rewards/student_margin": 0.26270633935928345,
350
+ "rewards/teacher_margin": 2.0614585876464844,
351
+ "step": 180
352
+ },
353
+ {
354
+ "epoch": 0.5,
355
+ "grad_norm": 0.81640625,
356
+ "learning_rate": 2.9765244371567873e-06,
357
+ "logits/chosen": -3.4952006340026855,
358
+ "logits/rejected": -3.588458299636841,
359
+ "logps/chosen": -280.5351257324219,
360
+ "logps/rejected": -209.8900909423828,
361
+ "loss": 0.638,
362
+ "rewards/accuracies": 0.5666667222976685,
363
+ "rewards/chosen": 0.002451175358146429,
364
+ "rewards/diff": -2.4403529167175293,
365
+ "rewards/diff_abs": 2.4539525508880615,
366
+ "rewards/rejected": -0.18167506158351898,
367
+ "rewards/student_margin": 0.18412622809410095,
368
+ "rewards/teacher_margin": 2.624479055404663,
369
+ "step": 190
370
+ },
371
+ {
372
+ "epoch": 0.52,
373
+ "grad_norm": 0.83984375,
374
+ "learning_rate": 2.7507225579233487e-06,
375
+ "logits/chosen": -3.7310454845428467,
376
+ "logits/rejected": -3.9183707237243652,
377
+ "logps/chosen": -268.8802795410156,
378
+ "logps/rejected": -197.14840698242188,
379
+ "loss": 0.6356,
380
+ "rewards/accuracies": 0.6999999284744263,
381
+ "rewards/chosen": 0.019510583952069283,
382
+ "rewards/diff": -2.137479305267334,
383
+ "rewards/diff_abs": 2.137479305267334,
384
+ "rewards/rejected": -0.06957288086414337,
385
+ "rewards/student_margin": 0.0890834629535675,
386
+ "rewards/teacher_margin": 2.2265625,
387
+ "step": 200
388
+ },
389
+ {
390
+ "epoch": 0.55,
391
+ "grad_norm": 0.875,
392
+ "learning_rate": 2.522831024592615e-06,
393
+ "logits/chosen": -3.594924211502075,
394
+ "logits/rejected": -3.7799072265625,
395
+ "logps/chosen": -307.20013427734375,
396
+ "logps/rejected": -242.7568817138672,
397
+ "loss": 0.6406,
398
+ "rewards/accuracies": 0.6666666269302368,
399
+ "rewards/chosen": -0.02844623290002346,
400
+ "rewards/diff": -2.2553000450134277,
401
+ "rewards/diff_abs": 2.259291172027588,
402
+ "rewards/rejected": -0.13147947192192078,
403
+ "rewards/student_margin": 0.10303322970867157,
404
+ "rewards/teacher_margin": 2.3583333492279053,
405
+ "step": 210
406
+ },
407
+ {
408
+ "epoch": 0.57,
409
+ "grad_norm": 0.8359375,
410
+ "learning_rate": 2.2947492054556075e-06,
411
+ "logits/chosen": -3.5902042388916016,
412
+ "logits/rejected": -3.788142681121826,
413
+ "logps/chosen": -323.9666442871094,
414
+ "logps/rejected": -212.97549438476562,
415
+ "loss": 0.6286,
416
+ "rewards/accuracies": 0.5666667222976685,
417
+ "rewards/chosen": -0.08455105125904083,
418
+ "rewards/diff": -1.5480363368988037,
419
+ "rewards/diff_abs": 1.552331566810608,
420
+ "rewards/rejected": -0.20005643367767334,
421
+ "rewards/student_margin": 0.11550538241863251,
422
+ "rewards/teacher_margin": 1.6635417938232422,
423
+ "step": 220
424
+ },
425
+ {
426
+ "epoch": 0.6,
427
+ "grad_norm": 0.8125,
428
+ "learning_rate": 2.0683780547456666e-06,
429
+ "logits/chosen": -3.5200886726379395,
430
+ "logits/rejected": -3.7031588554382324,
431
+ "logps/chosen": -314.7069396972656,
432
+ "logps/rejected": -296.3868408203125,
433
+ "loss": 0.6393,
434
+ "rewards/accuracies": 0.699999988079071,
435
+ "rewards/chosen": 0.02559809945523739,
436
+ "rewards/diff": -1.6750246286392212,
437
+ "rewards/diff_abs": 1.6955029964447021,
438
+ "rewards/rejected": -0.23323163390159607,
439
+ "rewards/student_margin": 0.2588297724723816,
440
+ "rewards/teacher_margin": 1.933854341506958,
441
+ "step": 230
442
+ },
443
+ {
444
+ "epoch": 0.63,
445
+ "grad_norm": 0.8046875,
446
+ "learning_rate": 1.845604269082787e-06,
447
+ "logits/chosen": -3.597382068634033,
448
+ "logits/rejected": -3.8518309593200684,
449
+ "logps/chosen": -327.3302917480469,
450
+ "logps/rejected": -229.868408203125,
451
+ "loss": 0.6288,
452
+ "rewards/accuracies": 0.5666666626930237,
453
+ "rewards/chosen": -0.04607323184609413,
454
+ "rewards/diff": -2.3076095581054688,
455
+ "rewards/diff_abs": 2.3076095581054688,
456
+ "rewards/rejected": -0.08013031631708145,
457
+ "rewards/student_margin": 0.03405708074569702,
458
+ "rewards/teacher_margin": 2.3416669368743896,
459
+ "step": 240
460
+ },
461
+ {
462
+ "epoch": 0.65,
463
+ "grad_norm": 0.81640625,
464
+ "learning_rate": 1.628284562748429e-06,
465
+ "logits/chosen": -3.5664756298065186,
466
+ "logits/rejected": -3.988274097442627,
467
+ "logps/chosen": -453.97540283203125,
468
+ "logps/rejected": -193.4569854736328,
469
+ "loss": 0.6248,
470
+ "rewards/accuracies": 0.7333333492279053,
471
+ "rewards/chosen": 0.06378475576639175,
472
+ "rewards/diff": -2.4481723308563232,
473
+ "rewards/diff_abs": 2.4774887561798096,
474
+ "rewards/rejected": -0.18335548043251038,
475
+ "rewards/student_margin": 0.24714021384716034,
476
+ "rewards/teacher_margin": 2.695312738418579,
477
+ "step": 250
478
+ },
479
+ {
480
+ "epoch": 0.68,
481
+ "grad_norm": 0.82421875,
482
+ "learning_rate": 1.4182301928489556e-06,
483
+ "logits/chosen": -3.7244060039520264,
484
+ "logits/rejected": -3.958916425704956,
485
+ "logps/chosen": -319.8840026855469,
486
+ "logps/rejected": -182.678955078125,
487
+ "loss": 0.6236,
488
+ "rewards/accuracies": 0.6333333253860474,
489
+ "rewards/chosen": -0.004236244596540928,
490
+ "rewards/diff": -2.6814849376678467,
491
+ "rewards/diff_abs": 2.6881275177001953,
492
+ "rewards/rejected": -0.15868888795375824,
493
+ "rewards/student_margin": 0.1544526368379593,
494
+ "rewards/teacher_margin": 2.835937738418579,
495
+ "step": 260
496
+ },
497
+ {
498
+ "epoch": 0.7,
499
+ "grad_norm": 0.7734375,
500
+ "learning_rate": 1.2171918633431623e-06,
501
+ "logits/chosen": -3.5800559520721436,
502
+ "logits/rejected": -3.4622998237609863,
503
+ "logps/chosen": -342.5908508300781,
504
+ "logps/rejected": -346.6748352050781,
505
+ "loss": 0.6325,
506
+ "rewards/accuracies": 0.6999999284744263,
507
+ "rewards/chosen": -0.052391983568668365,
508
+ "rewards/diff": -1.884416937828064,
509
+ "rewards/diff_abs": 1.9824146032333374,
510
+ "rewards/rejected": -0.28151676058769226,
511
+ "rewards/student_margin": 0.2291247844696045,
512
+ "rewards/teacher_margin": 2.113541603088379,
513
+ "step": 270
514
+ },
515
+ {
516
+ "epoch": 0.73,
517
+ "grad_norm": 0.8046875,
518
+ "learning_rate": 1.0268451337516774e-06,
519
+ "logits/chosen": -3.7309062480926514,
520
+ "logits/rejected": -3.933661699295044,
521
+ "logps/chosen": -308.80218505859375,
522
+ "logps/rejected": -169.55230712890625,
523
+ "loss": 0.6301,
524
+ "rewards/accuracies": 0.6000000238418579,
525
+ "rewards/chosen": -0.07487048208713531,
526
+ "rewards/diff": -2.276700735092163,
527
+ "rewards/diff_abs": 2.2792751789093018,
528
+ "rewards/rejected": -0.23931550979614258,
529
+ "rewards/student_margin": 0.16444504261016846,
530
+ "rewards/teacher_margin": 2.441145896911621,
531
+ "step": 280
532
+ },
533
+ {
534
+ "epoch": 0.76,
535
+ "grad_norm": 0.82421875,
536
+ "learning_rate": 8.487764541597765e-07,
537
+ "logits/chosen": -3.5284225940704346,
538
+ "logits/rejected": -3.898432493209839,
539
+ "logps/chosen": -268.7054748535156,
540
+ "logps/rejected": -169.67959594726562,
541
+ "loss": 0.6262,
542
+ "rewards/accuracies": 0.6000000238418579,
543
+ "rewards/chosen": -0.09594132751226425,
544
+ "rewards/diff": -2.3169074058532715,
545
+ "rewards/diff_abs": 2.3744282722473145,
546
+ "rewards/rejected": -0.22174236178398132,
547
+ "rewards/student_margin": 0.12580107152462006,
548
+ "rewards/teacher_margin": 2.4427084922790527,
549
+ "step": 290
550
+ },
551
+ {
552
+ "epoch": 0.78,
553
+ "grad_norm": 0.81640625,
554
+ "learning_rate": 6.844699429052377e-07,
555
+ "logits/chosen": -3.428218364715576,
556
+ "logits/rejected": -3.6805927753448486,
557
+ "logps/chosen": -411.22412109375,
558
+ "logps/rejected": -314.48919677734375,
559
+ "loss": 0.6265,
560
+ "rewards/accuracies": 0.699999988079071,
561
+ "rewards/chosen": -0.09454776346683502,
562
+ "rewards/diff": -1.8355839252471924,
563
+ "rewards/diff_abs": 1.851203203201294,
564
+ "rewards/rejected": -0.2600056231021881,
565
+ "rewards/student_margin": 0.16545787453651428,
566
+ "rewards/teacher_margin": 2.001041889190674,
567
+ "step": 300
568
+ },
569
+ {
570
+ "epoch": 0.81,
571
+ "grad_norm": 0.83203125,
572
+ "learning_rate": 5.352950171529928e-07,
573
+ "logits/chosen": -3.615025281906128,
574
+ "logits/rejected": -3.696945905685425,
575
+ "logps/chosen": -245.61856079101562,
576
+ "logps/rejected": -196.4246368408203,
577
+ "loss": 0.6258,
578
+ "rewards/accuracies": 0.800000011920929,
579
+ "rewards/chosen": -0.09576699882745743,
580
+ "rewards/diff": -2.4770545959472656,
581
+ "rewards/diff_abs": 2.491313934326172,
582
+ "rewards/rejected": -0.31142088770866394,
583
+ "rewards/student_margin": 0.2156539261341095,
584
+ "rewards/teacher_margin": 2.6927084922790527,
585
+ "step": 310
586
+ },
587
+ {
588
+ "epoch": 0.84,
589
+ "grad_norm": 0.8203125,
590
+ "learning_rate": 4.024949794498623e-07,
591
+ "logits/chosen": -3.5860378742218018,
592
+ "logits/rejected": -3.9259471893310547,
593
+ "logps/chosen": -254.970458984375,
594
+ "logps/rejected": -180.4410400390625,
595
+ "loss": 0.6223,
596
+ "rewards/accuracies": 0.7333333492279053,
597
+ "rewards/chosen": -0.07610569894313812,
598
+ "rewards/diff": -2.3922712802886963,
599
+ "rewards/diff_abs": 2.509066104888916,
600
+ "rewards/rejected": -0.33383452892303467,
601
+ "rewards/student_margin": 0.2577288746833801,
602
+ "rewards/teacher_margin": 2.6500000953674316,
603
+ "step": 320
604
+ },
605
+ {
606
+ "epoch": 0.86,
607
+ "grad_norm": 0.828125,
608
+ "learning_rate": 2.8717665538507965e-07,
609
+ "logits/chosen": -3.5933964252471924,
610
+ "logits/rejected": -3.603943347930908,
611
+ "logps/chosen": -280.0771789550781,
612
+ "logps/rejected": -201.8335723876953,
613
+ "loss": 0.6266,
614
+ "rewards/accuracies": 0.6333333253860474,
615
+ "rewards/chosen": -0.06104619428515434,
616
+ "rewards/diff": -1.3486114740371704,
617
+ "rewards/diff_abs": 1.5223140716552734,
618
+ "rewards/rejected": -0.4061849117279053,
619
+ "rewards/student_margin": 0.34513863921165466,
620
+ "rewards/teacher_margin": 1.693750023841858,
621
+ "step": 330
622
+ },
623
+ {
624
+ "epoch": 0.89,
625
+ "grad_norm": 0.83984375,
626
+ "learning_rate": 1.9030116872178317e-07,
627
+ "logits/chosen": -3.7776057720184326,
628
+ "logits/rejected": -3.7544455528259277,
629
+ "logps/chosen": -294.9410705566406,
630
+ "logps/rejected": -220.80538940429688,
631
+ "loss": 0.6294,
632
+ "rewards/accuracies": 0.6666666865348816,
633
+ "rewards/chosen": -0.010373707860708237,
634
+ "rewards/diff": -1.709843397140503,
635
+ "rewards/diff_abs": 1.8078594207763672,
636
+ "rewards/rejected": -0.30157214403152466,
637
+ "rewards/student_margin": 0.2911984324455261,
638
+ "rewards/teacher_margin": 2.001041889190674,
639
+ "step": 340
640
+ },
641
+ {
642
+ "epoch": 0.91,
643
+ "grad_norm": 0.8125,
644
+ "learning_rate": 1.1267593088441886e-07,
645
+ "logits/chosen": -3.633087158203125,
646
+ "logits/rejected": -3.5318355560302734,
647
+ "logps/chosen": -301.98785400390625,
648
+ "logps/rejected": -271.0285949707031,
649
+ "loss": 0.6263,
650
+ "rewards/accuracies": 0.6333333253860474,
651
+ "rewards/chosen": -0.04605107754468918,
652
+ "rewards/diff": -1.565039873123169,
653
+ "rewards/diff_abs": 1.6360604763031006,
654
+ "rewards/rejected": -0.3101779520511627,
655
+ "rewards/student_margin": 0.26412689685821533,
656
+ "rewards/teacher_margin": 1.8291666507720947,
657
+ "step": 350
658
+ },
659
+ {
660
+ "epoch": 0.94,
661
+ "grad_norm": 0.83203125,
662
+ "learning_rate": 5.494791156587686e-08,
663
+ "logits/chosen": -3.7542724609375,
664
+ "logits/rejected": -3.735842227935791,
665
+ "logps/chosen": -233.6118621826172,
666
+ "logps/rejected": -237.41378784179688,
667
+ "loss": 0.6249,
668
+ "rewards/accuracies": 0.6666666269302368,
669
+ "rewards/chosen": -0.07444219291210175,
670
+ "rewards/diff": -1.7430684566497803,
671
+ "rewards/diff_abs": 1.7606357336044312,
672
+ "rewards/rejected": -0.1824154257774353,
673
+ "rewards/student_margin": 0.10797325521707535,
674
+ "rewards/teacher_margin": 1.851041555404663,
675
+ "step": 360
676
+ },
677
+ {
678
+ "epoch": 0.97,
679
+ "grad_norm": 0.83984375,
680
+ "learning_rate": 1.7598246540683483e-08,
681
+ "logits/chosen": -3.840771436691284,
682
+ "logits/rejected": -3.835493803024292,
683
+ "logps/chosen": -247.64285278320312,
684
+ "logps/rejected": -202.43807983398438,
685
+ "loss": 0.6301,
686
+ "rewards/accuracies": 0.7666666507720947,
687
+ "rewards/chosen": -0.036110132932662964,
688
+ "rewards/diff": -1.690749168395996,
689
+ "rewards/diff_abs": 1.7279847860336304,
690
+ "rewards/rejected": -0.28442350029945374,
691
+ "rewards/student_margin": 0.24831333756446838,
692
+ "rewards/teacher_margin": 1.9390627145767212,
693
+ "step": 370
694
+ },
695
+ {
696
+ "epoch": 0.99,
697
+ "grad_norm": 0.8125,
698
+ "learning_rate": 9.382276255742729e-10,
699
+ "logits/chosen": -3.547929286956787,
700
+ "logits/rejected": -3.6237056255340576,
701
+ "logps/chosen": -379.63031005859375,
702
+ "logps/rejected": -313.1451721191406,
703
+ "loss": 0.6277,
704
+ "rewards/accuracies": 0.6999999284744263,
705
+ "rewards/chosen": -0.0008453071350231767,
706
+ "rewards/diff": -1.6570953130722046,
707
+ "rewards/diff_abs": 1.7126853466033936,
708
+ "rewards/rejected": -0.24843759834766388,
709
+ "rewards/student_margin": 0.2475922852754593,
710
+ "rewards/teacher_margin": 1.904687523841858,
711
+ "step": 380
712
+ },
713
+ {
714
+ "epoch": 1.0,
715
+ "step": 383,
716
+ "total_flos": 0.0,
717
+ "train_loss": 0.6488963676806219,
718
+ "train_runtime": 2900.3403,
719
+ "train_samples_per_second": 50.684,
720
+ "train_steps_per_second": 0.132
721
+ }
722
+ ],
723
+ "logging_steps": 10,
724
+ "max_steps": 383,
725
+ "num_input_tokens_seen": 0,
726
+ "num_train_epochs": 1,
727
+ "save_steps": 100000000000000000000000000000000,
728
+ "total_flos": 0.0,
729
+ "train_batch_size": 3,
730
+ "trial_name": null,
731
+ "trial_params": null
732
+ }