flozi00 commited on
Commit
e137217
1 Parent(s): 61b785d

Upload folder using huggingface_hub

Browse files
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:035fcee6ade42d8a6e210e0bfb333167cdc4142533f169f6bd7e5c2c8b59ca73
3
  size 4996670464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2913558abbf7c09f823f3d9149b0c3f3f8f8bcccc9b292c0095a5ac58c81928
3
  size 4996670464
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5021db8004c45a9a530ca6803cb7f47e7efcbccaab5aa96fe3152fc5d66ad20a
3
  size 1178224960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d1dd526bd7bbd181272b566e5c7e236590ff2c966e830278b4d1a8b80e28701
3
  size 1178224960
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74c17b077acfeba0f92ffed7849a96f3861ffba1ac7f25e9d00e928a0c556655
3
  size 3094642882
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:635d436cc8d5e7a2a2efe1ed2a6519275a69bf2704b79004c45ffe40c45b243f
3
  size 3094642882
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a742e5814e33065999dacc80c1127cad656555ce6fc832d7c43bde53fdae9c09
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8596e8e763631f6218be973d23c17f87c5de1b18960b384dbd7a76dc1dcbe692
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.08426633428279255,
5
  "eval_steps": 5000,
6
- "global_step": 800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1367,6 +1367,1876 @@
1367
  "rewards/rejected": -0.49420568346977234,
1368
  "sft_loss": 0.557674765586853,
1369
  "step": 800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1370
  }
1371
  ],
1372
  "logging_steps": 10,
@@ -1386,7 +3256,7 @@
1386
  "attributes": {}
1387
  }
1388
  },
1389
- "total_flos": 5.905053933387448e+17,
1390
  "train_batch_size": 2,
1391
  "trial_name": null,
1392
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.20013254392163232,
5
  "eval_steps": 5000,
6
+ "global_step": 1900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1367
  "rewards/rejected": -0.49420568346977234,
1368
  "sft_loss": 0.557674765586853,
1369
  "step": 800
1370
+ },
1371
+ {
1372
+ "epoch": 0.08531966346132745,
1373
+ "grad_norm": 5.111828327178955,
1374
+ "learning_rate": 0.0004263157894736842,
1375
+ "logits/chosen": -4.80694580078125,
1376
+ "logits/rejected": -4.806312561035156,
1377
+ "logps/chosen": -0.7565589547157288,
1378
+ "logps/rejected": -5.1447882652282715,
1379
+ "loss": 0.791,
1380
+ "odds_ratio_loss": 2.5595006942749023,
1381
+ "rewards/accuracies": 0.862500011920929,
1382
+ "rewards/chosen": -0.07565589994192123,
1383
+ "rewards/margins": 0.4388229250907898,
1384
+ "rewards/rejected": -0.514478862285614,
1385
+ "sft_loss": 0.5350964665412903,
1386
+ "step": 810
1387
+ },
1388
+ {
1389
+ "epoch": 0.08637299263986237,
1390
+ "grad_norm": 4.786795616149902,
1391
+ "learning_rate": 0.0004315789473684211,
1392
+ "logits/chosen": -4.826291561126709,
1393
+ "logits/rejected": -4.825577259063721,
1394
+ "logps/chosen": -0.6888704895973206,
1395
+ "logps/rejected": -5.567938804626465,
1396
+ "loss": 0.7202,
1397
+ "odds_ratio_loss": 2.4763171672821045,
1398
+ "rewards/accuracies": 0.8541666865348816,
1399
+ "rewards/chosen": -0.06888704746961594,
1400
+ "rewards/margins": 0.487906813621521,
1401
+ "rewards/rejected": -0.5567939281463623,
1402
+ "sft_loss": 0.472540020942688,
1403
+ "step": 820
1404
+ },
1405
+ {
1406
+ "epoch": 0.08742632181839727,
1407
+ "grad_norm": 7.63191556930542,
1408
+ "learning_rate": 0.00043684210526315795,
1409
+ "logits/chosen": -4.981302738189697,
1410
+ "logits/rejected": -4.980616569519043,
1411
+ "logps/chosen": -0.7095519304275513,
1412
+ "logps/rejected": -5.726536273956299,
1413
+ "loss": 0.7455,
1414
+ "odds_ratio_loss": 2.7154994010925293,
1415
+ "rewards/accuracies": 0.8458333611488342,
1416
+ "rewards/chosen": -0.07095518708229065,
1417
+ "rewards/margins": 0.5016984343528748,
1418
+ "rewards/rejected": -0.5726536512374878,
1419
+ "sft_loss": 0.4739212989807129,
1420
+ "step": 830
1421
+ },
1422
+ {
1423
+ "epoch": 0.08847965099693218,
1424
+ "grad_norm": 3.9395651817321777,
1425
+ "learning_rate": 0.0004421052631578947,
1426
+ "logits/chosen": -5.055291652679443,
1427
+ "logits/rejected": -5.054480075836182,
1428
+ "logps/chosen": -0.6979976892471313,
1429
+ "logps/rejected": -5.470063209533691,
1430
+ "loss": 0.7345,
1431
+ "odds_ratio_loss": 2.5650830268859863,
1432
+ "rewards/accuracies": 0.8354166746139526,
1433
+ "rewards/chosen": -0.0697997659444809,
1434
+ "rewards/margins": 0.4772065579891205,
1435
+ "rewards/rejected": -0.547006368637085,
1436
+ "sft_loss": 0.477975457906723,
1437
+ "step": 840
1438
+ },
1439
+ {
1440
+ "epoch": 0.08953298017546708,
1441
+ "grad_norm": 9.380526542663574,
1442
+ "learning_rate": 0.0004473684210526316,
1443
+ "logits/chosen": -4.9736008644104,
1444
+ "logits/rejected": -4.972882270812988,
1445
+ "logps/chosen": -0.6816462874412537,
1446
+ "logps/rejected": -5.888026714324951,
1447
+ "loss": 0.715,
1448
+ "odds_ratio_loss": 2.5402190685272217,
1449
+ "rewards/accuracies": 0.8583333492279053,
1450
+ "rewards/chosen": -0.0681646317243576,
1451
+ "rewards/margins": 0.520638108253479,
1452
+ "rewards/rejected": -0.588802695274353,
1453
+ "sft_loss": 0.46102267503738403,
1454
+ "step": 850
1455
+ },
1456
+ {
1457
+ "epoch": 0.090586309354002,
1458
+ "grad_norm": 3.032940626144409,
1459
+ "learning_rate": 0.00045263157894736845,
1460
+ "logits/chosen": -4.86979866027832,
1461
+ "logits/rejected": -4.869546413421631,
1462
+ "logps/chosen": -0.8415181636810303,
1463
+ "logps/rejected": -4.487551689147949,
1464
+ "loss": 0.8781,
1465
+ "odds_ratio_loss": 2.631443977355957,
1466
+ "rewards/accuracies": 0.8229166865348816,
1467
+ "rewards/chosen": -0.08415181934833527,
1468
+ "rewards/margins": 0.3646034300327301,
1469
+ "rewards/rejected": -0.44875526428222656,
1470
+ "sft_loss": 0.6149870157241821,
1471
+ "step": 860
1472
+ },
1473
+ {
1474
+ "epoch": 0.0916396385325369,
1475
+ "grad_norm": 5.457218170166016,
1476
+ "learning_rate": 0.00045789473684210527,
1477
+ "logits/chosen": -4.5640482902526855,
1478
+ "logits/rejected": -4.5640788078308105,
1479
+ "logps/chosen": -0.7579687833786011,
1480
+ "logps/rejected": -3.3095312118530273,
1481
+ "loss": 0.7946,
1482
+ "odds_ratio_loss": 2.6955649852752686,
1483
+ "rewards/accuracies": 0.831250011920929,
1484
+ "rewards/chosen": -0.07579687237739563,
1485
+ "rewards/margins": 0.2551562488079071,
1486
+ "rewards/rejected": -0.33095312118530273,
1487
+ "sft_loss": 0.5250447988510132,
1488
+ "step": 870
1489
+ },
1490
+ {
1491
+ "epoch": 0.0926929677110718,
1492
+ "grad_norm": 4.475607872009277,
1493
+ "learning_rate": 0.00046315789473684214,
1494
+ "logits/chosen": -4.721373558044434,
1495
+ "logits/rejected": -4.7214860916137695,
1496
+ "logps/chosen": -0.7569971680641174,
1497
+ "logps/rejected": -3.347615957260132,
1498
+ "loss": 0.7905,
1499
+ "odds_ratio_loss": 2.5438392162323,
1500
+ "rewards/accuracies": 0.831250011920929,
1501
+ "rewards/chosen": -0.07569971680641174,
1502
+ "rewards/margins": 0.25906190276145935,
1503
+ "rewards/rejected": -0.3347616195678711,
1504
+ "sft_loss": 0.536092221736908,
1505
+ "step": 880
1506
+ },
1507
+ {
1508
+ "epoch": 0.09374629688960671,
1509
+ "grad_norm": 31.67135238647461,
1510
+ "learning_rate": 0.00046842105263157895,
1511
+ "logits/chosen": -4.7270989418029785,
1512
+ "logits/rejected": -4.727247714996338,
1513
+ "logps/chosen": -0.7946822047233582,
1514
+ "logps/rejected": -3.15295147895813,
1515
+ "loss": 0.829,
1516
+ "odds_ratio_loss": 2.443174123764038,
1517
+ "rewards/accuracies": 0.8520833253860474,
1518
+ "rewards/chosen": -0.07946821302175522,
1519
+ "rewards/margins": 0.23582692444324493,
1520
+ "rewards/rejected": -0.31529513001441956,
1521
+ "sft_loss": 0.5846543908119202,
1522
+ "step": 890
1523
+ },
1524
+ {
1525
+ "epoch": 0.09479962606814162,
1526
+ "grad_norm": 3.2339320182800293,
1527
+ "learning_rate": 0.00047368421052631577,
1528
+ "logits/chosen": -5.0870866775512695,
1529
+ "logits/rejected": -5.087241172790527,
1530
+ "logps/chosen": -0.6878632307052612,
1531
+ "logps/rejected": -2.8736085891723633,
1532
+ "loss": 0.7248,
1533
+ "odds_ratio_loss": 2.5279555320739746,
1534
+ "rewards/accuracies": 0.862500011920929,
1535
+ "rewards/chosen": -0.06878631561994553,
1536
+ "rewards/margins": 0.21857453882694244,
1537
+ "rewards/rejected": -0.2873608469963074,
1538
+ "sft_loss": 0.47198787331581116,
1539
+ "step": 900
1540
+ },
1541
+ {
1542
+ "epoch": 0.09585295524667653,
1543
+ "grad_norm": 2.4642884731292725,
1544
+ "learning_rate": 0.00047894736842105264,
1545
+ "logits/chosen": -5.1205644607543945,
1546
+ "logits/rejected": -5.120718479156494,
1547
+ "logps/chosen": -0.6843523383140564,
1548
+ "logps/rejected": -3.210320472717285,
1549
+ "loss": 0.7126,
1550
+ "odds_ratio_loss": 2.3976242542266846,
1551
+ "rewards/accuracies": 0.8687499761581421,
1552
+ "rewards/chosen": -0.06843523681163788,
1553
+ "rewards/margins": 0.25259679555892944,
1554
+ "rewards/rejected": -0.3210320770740509,
1555
+ "sft_loss": 0.47283756732940674,
1556
+ "step": 910
1557
+ },
1558
+ {
1559
+ "epoch": 0.09690628442521143,
1560
+ "grad_norm": 5.920956611633301,
1561
+ "learning_rate": 0.0004842105263157895,
1562
+ "logits/chosen": -5.090200424194336,
1563
+ "logits/rejected": -5.090282917022705,
1564
+ "logps/chosen": -0.6722908616065979,
1565
+ "logps/rejected": -3.273742198944092,
1566
+ "loss": 0.7063,
1567
+ "odds_ratio_loss": 2.530402421951294,
1568
+ "rewards/accuracies": 0.84375,
1569
+ "rewards/chosen": -0.06722908467054367,
1570
+ "rewards/margins": 0.2601451575756073,
1571
+ "rewards/rejected": -0.3273741900920868,
1572
+ "sft_loss": 0.45327988266944885,
1573
+ "step": 920
1574
+ },
1575
+ {
1576
+ "epoch": 0.09795961360374635,
1577
+ "grad_norm": 4.960987567901611,
1578
+ "learning_rate": 0.0004894736842105264,
1579
+ "logits/chosen": -5.087591648101807,
1580
+ "logits/rejected": -5.087601661682129,
1581
+ "logps/chosen": -0.7244377136230469,
1582
+ "logps/rejected": -3.0884244441986084,
1583
+ "loss": 0.7571,
1584
+ "odds_ratio_loss": 2.6207199096679688,
1585
+ "rewards/accuracies": 0.8500000238418579,
1586
+ "rewards/chosen": -0.07244376838207245,
1587
+ "rewards/margins": 0.23639869689941406,
1588
+ "rewards/rejected": -0.3088424801826477,
1589
+ "sft_loss": 0.4949897825717926,
1590
+ "step": 930
1591
+ },
1592
+ {
1593
+ "epoch": 0.09901294278228125,
1594
+ "grad_norm": 5.539714336395264,
1595
+ "learning_rate": 0.0004947368421052632,
1596
+ "logits/chosen": -5.29771614074707,
1597
+ "logits/rejected": -5.297707557678223,
1598
+ "logps/chosen": -0.6988152265548706,
1599
+ "logps/rejected": -3.315657615661621,
1600
+ "loss": 0.7308,
1601
+ "odds_ratio_loss": 2.3126912117004395,
1602
+ "rewards/accuracies": 0.8645833134651184,
1603
+ "rewards/chosen": -0.06988153606653214,
1604
+ "rewards/margins": 0.26168423891067505,
1605
+ "rewards/rejected": -0.3315657675266266,
1606
+ "sft_loss": 0.4995124638080597,
1607
+ "step": 940
1608
+ },
1609
+ {
1610
+ "epoch": 0.10006627196081616,
1611
+ "grad_norm": 6.586909294128418,
1612
+ "learning_rate": 0.0005,
1613
+ "logits/chosen": -5.285855293273926,
1614
+ "logits/rejected": -5.2858428955078125,
1615
+ "logps/chosen": -0.6867055296897888,
1616
+ "logps/rejected": -3.43638277053833,
1617
+ "loss": 0.7161,
1618
+ "odds_ratio_loss": 2.5117852687835693,
1619
+ "rewards/accuracies": 0.8791666626930237,
1620
+ "rewards/chosen": -0.06867055594921112,
1621
+ "rewards/margins": 0.2749677002429962,
1622
+ "rewards/rejected": -0.34363824129104614,
1623
+ "sft_loss": 0.4649271070957184,
1624
+ "step": 950
1625
+ },
1626
+ {
1627
+ "epoch": 0.10111960113935106,
1628
+ "grad_norm": 6.756776809692383,
1629
+ "learning_rate": 0.0004999983096040005,
1630
+ "logits/chosen": -5.550118923187256,
1631
+ "logits/rejected": -5.5501179695129395,
1632
+ "logps/chosen": -0.7224279642105103,
1633
+ "logps/rejected": -3.376439332962036,
1634
+ "loss": 0.7555,
1635
+ "odds_ratio_loss": 2.5149753093719482,
1636
+ "rewards/accuracies": 0.8604166507720947,
1637
+ "rewards/chosen": -0.07224280387163162,
1638
+ "rewards/margins": 0.2654011845588684,
1639
+ "rewards/rejected": -0.3376440107822418,
1640
+ "sft_loss": 0.5040432214736938,
1641
+ "step": 960
1642
+ },
1643
+ {
1644
+ "epoch": 0.10217293031788596,
1645
+ "grad_norm": 161.19760131835938,
1646
+ "learning_rate": 0.0004999932384388613,
1647
+ "logits/chosen": -5.225105285644531,
1648
+ "logits/rejected": -5.225213527679443,
1649
+ "logps/chosen": -1.5466647148132324,
1650
+ "logps/rejected": -3.935602903366089,
1651
+ "loss": 1.5839,
1652
+ "odds_ratio_loss": 3.059565782546997,
1653
+ "rewards/accuracies": 0.8604166507720947,
1654
+ "rewards/chosen": -0.154666468501091,
1655
+ "rewards/margins": 0.2388937920331955,
1656
+ "rewards/rejected": -0.3935602605342865,
1657
+ "sft_loss": 1.277909755706787,
1658
+ "step": 970
1659
+ },
1660
+ {
1661
+ "epoch": 0.10322625949642088,
1662
+ "grad_norm": 4.612696170806885,
1663
+ "learning_rate": 0.000499984786573161,
1664
+ "logits/chosen": -5.253983020782471,
1665
+ "logits/rejected": -5.254087448120117,
1666
+ "logps/chosen": -0.6865792870521545,
1667
+ "logps/rejected": -3.219088077545166,
1668
+ "loss": 0.7214,
1669
+ "odds_ratio_loss": 2.43359375,
1670
+ "rewards/accuracies": 0.8166666626930237,
1671
+ "rewards/chosen": -0.06865792721509933,
1672
+ "rewards/margins": 0.2532508671283722,
1673
+ "rewards/rejected": -0.3219088315963745,
1674
+ "sft_loss": 0.4780765473842621,
1675
+ "step": 980
1676
+ },
1677
+ {
1678
+ "epoch": 0.10427958867495578,
1679
+ "grad_norm": 4.7582621574401855,
1680
+ "learning_rate": 0.0004999729541211952,
1681
+ "logits/chosen": -5.1987152099609375,
1682
+ "logits/rejected": -5.198534965515137,
1683
+ "logps/chosen": -0.8549334406852722,
1684
+ "logps/rejected": -4.291516304016113,
1685
+ "loss": 0.8956,
1686
+ "odds_ratio_loss": 2.6601970195770264,
1687
+ "rewards/accuracies": 0.8125,
1688
+ "rewards/chosen": -0.08549333363771439,
1689
+ "rewards/margins": 0.3436582684516907,
1690
+ "rewards/rejected": -0.42915162444114685,
1691
+ "sft_loss": 0.6296234726905823,
1692
+ "step": 990
1693
+ },
1694
+ {
1695
+ "epoch": 0.10533291785349069,
1696
+ "grad_norm": 5.924813747406006,
1697
+ "learning_rate": 0.0004999577412429764,
1698
+ "logits/chosen": -5.115817070007324,
1699
+ "logits/rejected": -5.115783214569092,
1700
+ "logps/chosen": -0.6959132552146912,
1701
+ "logps/rejected": -3.6701784133911133,
1702
+ "loss": 0.7282,
1703
+ "odds_ratio_loss": 2.448162078857422,
1704
+ "rewards/accuracies": 0.8374999761581421,
1705
+ "rewards/chosen": -0.06959132105112076,
1706
+ "rewards/margins": 0.29742658138275146,
1707
+ "rewards/rejected": -0.36701786518096924,
1708
+ "sft_loss": 0.48338958621025085,
1709
+ "step": 1000
1710
+ },
1711
+ {
1712
+ "epoch": 0.1063862470320256,
1713
+ "grad_norm": 3.775995969772339,
1714
+ "learning_rate": 0.0004999391481442307,
1715
+ "logits/chosen": -5.038882732391357,
1716
+ "logits/rejected": -5.039083003997803,
1717
+ "logps/chosen": -0.6505192518234253,
1718
+ "logps/rejected": -2.7321646213531494,
1719
+ "loss": 0.686,
1720
+ "odds_ratio_loss": 2.5259392261505127,
1721
+ "rewards/accuracies": 0.8395833373069763,
1722
+ "rewards/chosen": -0.06505192071199417,
1723
+ "rewards/margins": 0.2081645280122757,
1724
+ "rewards/rejected": -0.2732164263725281,
1725
+ "sft_loss": 0.43344053626060486,
1726
+ "step": 1010
1727
+ },
1728
+ {
1729
+ "epoch": 0.10743957621056051,
1730
+ "grad_norm": 3.577996253967285,
1731
+ "learning_rate": 0.0004999171750763959,
1732
+ "logits/chosen": -4.925287246704102,
1733
+ "logits/rejected": -4.925505638122559,
1734
+ "logps/chosen": -0.6491485238075256,
1735
+ "logps/rejected": -2.673661947250366,
1736
+ "loss": 0.683,
1737
+ "odds_ratio_loss": 2.427180528640747,
1738
+ "rewards/accuracies": 0.862500011920929,
1739
+ "rewards/chosen": -0.06491485238075256,
1740
+ "rewards/margins": 0.20245136320590973,
1741
+ "rewards/rejected": -0.2673662006855011,
1742
+ "sft_loss": 0.44026196002960205,
1743
+ "step": 1020
1744
+ },
1745
+ {
1746
+ "epoch": 0.10849290538909541,
1747
+ "grad_norm": 4.37986946105957,
1748
+ "learning_rate": 0.0004998918223366173,
1749
+ "logits/chosen": -5.010101318359375,
1750
+ "logits/rejected": -5.010295391082764,
1751
+ "logps/chosen": -0.7345671057701111,
1752
+ "logps/rejected": -2.6488146781921387,
1753
+ "loss": 0.7723,
1754
+ "odds_ratio_loss": 2.661365509033203,
1755
+ "rewards/accuracies": 0.8291666507720947,
1756
+ "rewards/chosen": -0.07345671951770782,
1757
+ "rewards/margins": 0.19142475724220276,
1758
+ "rewards/rejected": -0.2648814618587494,
1759
+ "sft_loss": 0.5061719417572021,
1760
+ "step": 1030
1761
+ },
1762
+ {
1763
+ "epoch": 0.10954623456763031,
1764
+ "grad_norm": 8.366073608398438,
1765
+ "learning_rate": 0.0004998630902677444,
1766
+ "logits/chosen": -5.033807277679443,
1767
+ "logits/rejected": -5.034041881561279,
1768
+ "logps/chosen": -0.7631211876869202,
1769
+ "logps/rejected": -2.8217647075653076,
1770
+ "loss": 0.8007,
1771
+ "odds_ratio_loss": 2.807353973388672,
1772
+ "rewards/accuracies": 0.84375,
1773
+ "rewards/chosen": -0.0763121098279953,
1774
+ "rewards/margins": 0.2058643400669098,
1775
+ "rewards/rejected": -0.2821764647960663,
1776
+ "sft_loss": 0.5199962854385376,
1777
+ "step": 1040
1778
+ },
1779
+ {
1780
+ "epoch": 0.11059956374616522,
1781
+ "grad_norm": 3.063091993331909,
1782
+ "learning_rate": 0.0004998309792583257,
1783
+ "logits/chosen": -5.056707859039307,
1784
+ "logits/rejected": -5.056928634643555,
1785
+ "logps/chosen": -0.7930384278297424,
1786
+ "logps/rejected": -3.048496723175049,
1787
+ "loss": 0.8372,
1788
+ "odds_ratio_loss": 2.895695209503174,
1789
+ "rewards/accuracies": 0.7749999761581421,
1790
+ "rewards/chosen": -0.07930383831262589,
1791
+ "rewards/margins": 0.22554583847522736,
1792
+ "rewards/rejected": -0.30484965443611145,
1793
+ "sft_loss": 0.5476340651512146,
1794
+ "step": 1050
1795
+ },
1796
+ {
1797
+ "epoch": 0.11165289292470013,
1798
+ "grad_norm": 4.035704612731934,
1799
+ "learning_rate": 0.0004997954897426039,
1800
+ "logits/chosen": -4.731100559234619,
1801
+ "logits/rejected": -4.731292247772217,
1802
+ "logps/chosen": -0.7024089694023132,
1803
+ "logps/rejected": -3.2537832260131836,
1804
+ "loss": 0.7407,
1805
+ "odds_ratio_loss": 2.768791675567627,
1806
+ "rewards/accuracies": 0.8187500238418579,
1807
+ "rewards/chosen": -0.07024088501930237,
1808
+ "rewards/margins": 0.25513747334480286,
1809
+ "rewards/rejected": -0.3253783583641052,
1810
+ "sft_loss": 0.46386364102363586,
1811
+ "step": 1060
1812
+ },
1813
+ {
1814
+ "epoch": 0.11270622210323504,
1815
+ "grad_norm": 5.625176906585693,
1816
+ "learning_rate": 0.0004997566222005095,
1817
+ "logits/chosen": -5.09245491027832,
1818
+ "logits/rejected": -5.0925984382629395,
1819
+ "logps/chosen": -0.7186325192451477,
1820
+ "logps/rejected": -3.2234888076782227,
1821
+ "loss": 0.7539,
1822
+ "odds_ratio_loss": 2.5938258171081543,
1823
+ "rewards/accuracies": 0.856249988079071,
1824
+ "rewards/chosen": -0.07186325639486313,
1825
+ "rewards/margins": 0.2504856288433075,
1826
+ "rewards/rejected": -0.3223489224910736,
1827
+ "sft_loss": 0.4945569932460785,
1828
+ "step": 1070
1829
+ },
1830
+ {
1831
+ "epoch": 0.11375955128176994,
1832
+ "grad_norm": 8.753211975097656,
1833
+ "learning_rate": 0.0004997143771576551,
1834
+ "logits/chosen": -5.339606761932373,
1835
+ "logits/rejected": -5.339755058288574,
1836
+ "logps/chosen": -0.6703072190284729,
1837
+ "logps/rejected": -3.089592933654785,
1838
+ "loss": 0.706,
1839
+ "odds_ratio_loss": 2.5606048107147217,
1840
+ "rewards/accuracies": 0.8208333253860474,
1841
+ "rewards/chosen": -0.06703073531389236,
1842
+ "rewards/margins": 0.2419285774230957,
1843
+ "rewards/rejected": -0.30895933508872986,
1844
+ "sft_loss": 0.44997820258140564,
1845
+ "step": 1080
1846
+ },
1847
+ {
1848
+ "epoch": 0.11481288046030486,
1849
+ "grad_norm": 7.652149677276611,
1850
+ "learning_rate": 0.0004996687551853271,
1851
+ "logits/chosen": -5.191481113433838,
1852
+ "logits/rejected": -5.191573619842529,
1853
+ "logps/chosen": -0.7030736207962036,
1854
+ "logps/rejected": -3.310757875442505,
1855
+ "loss": 0.7353,
1856
+ "odds_ratio_loss": 2.5654568672180176,
1857
+ "rewards/accuracies": 0.8645833134651184,
1858
+ "rewards/chosen": -0.07030736654996872,
1859
+ "rewards/margins": 0.26076844334602356,
1860
+ "rewards/rejected": -0.3310757875442505,
1861
+ "sft_loss": 0.47873547673225403,
1862
+ "step": 1090
1863
+ },
1864
+ {
1865
+ "epoch": 0.11586620963883976,
1866
+ "grad_norm": 4.607894420623779,
1867
+ "learning_rate": 0.0004996197569004794,
1868
+ "logits/chosen": -4.798411846160889,
1869
+ "logits/rejected": -4.798523426055908,
1870
+ "logps/chosen": -0.6611460447311401,
1871
+ "logps/rejected": -2.983532428741455,
1872
+ "loss": 0.6988,
1873
+ "odds_ratio_loss": 2.426917314529419,
1874
+ "rewards/accuracies": 0.824999988079071,
1875
+ "rewards/chosen": -0.06611461192369461,
1876
+ "rewards/margins": 0.23223866522312164,
1877
+ "rewards/rejected": -0.29835325479507446,
1878
+ "sft_loss": 0.4561263918876648,
1879
+ "step": 1100
1880
+ },
1881
+ {
1882
+ "epoch": 0.11691953881737467,
1883
+ "grad_norm": 6.003838539123535,
1884
+ "learning_rate": 0.000499567382965724,
1885
+ "logits/chosen": -4.689076900482178,
1886
+ "logits/rejected": -4.689194202423096,
1887
+ "logps/chosen": -0.6927405595779419,
1888
+ "logps/rejected": -3.072282552719116,
1889
+ "loss": 0.7266,
1890
+ "odds_ratio_loss": 2.494777202606201,
1891
+ "rewards/accuracies": 0.8520833253860474,
1892
+ "rewards/chosen": -0.06927405297756195,
1893
+ "rewards/margins": 0.23795419931411743,
1894
+ "rewards/rejected": -0.3072282373905182,
1895
+ "sft_loss": 0.477167546749115,
1896
+ "step": 1110
1897
+ },
1898
+ {
1899
+ "epoch": 0.11797286799590957,
1900
+ "grad_norm": 3.377394676208496,
1901
+ "learning_rate": 0.0004995116340893223,
1902
+ "logits/chosen": -4.666645526885986,
1903
+ "logits/rejected": -4.666871547698975,
1904
+ "logps/chosen": -0.6164705157279968,
1905
+ "logps/rejected": -3.2369017601013184,
1906
+ "loss": 0.6451,
1907
+ "odds_ratio_loss": 2.411543369293213,
1908
+ "rewards/accuracies": 0.8645833134651184,
1909
+ "rewards/chosen": -0.061647046357393265,
1910
+ "rewards/margins": 0.26204314827919006,
1911
+ "rewards/rejected": -0.3236902058124542,
1912
+ "sft_loss": 0.40390655398368835,
1913
+ "step": 1120
1914
+ },
1915
+ {
1916
+ "epoch": 0.11902619717444447,
1917
+ "grad_norm": 3.650819778442383,
1918
+ "learning_rate": 0.0004994525110251759,
1919
+ "logits/chosen": -5.011782169342041,
1920
+ "logits/rejected": -5.012125492095947,
1921
+ "logps/chosen": -0.8005008101463318,
1922
+ "logps/rejected": -3.1773478984832764,
1923
+ "loss": 0.8349,
1924
+ "odds_ratio_loss": 2.8024778366088867,
1925
+ "rewards/accuracies": 0.8354166746139526,
1926
+ "rewards/chosen": -0.08005008101463318,
1927
+ "rewards/margins": 0.2376846969127655,
1928
+ "rewards/rejected": -0.3177347779273987,
1929
+ "sft_loss": 0.5546395182609558,
1930
+ "step": 1130
1931
+ },
1932
+ {
1933
+ "epoch": 0.12007952635297939,
1934
+ "grad_norm": 5.192956447601318,
1935
+ "learning_rate": 0.0004993900145728157,
1936
+ "logits/chosen": -5.2029805183410645,
1937
+ "logits/rejected": -5.203344821929932,
1938
+ "logps/chosen": -0.7520886063575745,
1939
+ "logps/rejected": -3.3132715225219727,
1940
+ "loss": 0.7852,
1941
+ "odds_ratio_loss": 2.5903329849243164,
1942
+ "rewards/accuracies": 0.84375,
1943
+ "rewards/chosen": -0.07520885765552521,
1944
+ "rewards/margins": 0.2561182677745819,
1945
+ "rewards/rejected": -0.3313271403312683,
1946
+ "sft_loss": 0.5261538624763489,
1947
+ "step": 1140
1948
+ },
1949
+ {
1950
+ "epoch": 0.12113285553151429,
1951
+ "grad_norm": 7.4040985107421875,
1952
+ "learning_rate": 0.0004993241455773918,
1953
+ "logits/chosen": -5.165520191192627,
1954
+ "logits/rejected": -5.165550708770752,
1955
+ "logps/chosen": -0.7126467227935791,
1956
+ "logps/rejected": -3.5545897483825684,
1957
+ "loss": 0.748,
1958
+ "odds_ratio_loss": 2.594726800918579,
1959
+ "rewards/accuracies": 0.8354166746139526,
1960
+ "rewards/chosen": -0.07126467674970627,
1961
+ "rewards/margins": 0.28419435024261475,
1962
+ "rewards/rejected": -0.3554590046405792,
1963
+ "sft_loss": 0.48852840065956116,
1964
+ "step": 1150
1965
+ },
1966
+ {
1967
+ "epoch": 0.1221861847100492,
1968
+ "grad_norm": 4.905418872833252,
1969
+ "learning_rate": 0.0004992549049296619,
1970
+ "logits/chosen": -5.190316200256348,
1971
+ "logits/rejected": -5.190418720245361,
1972
+ "logps/chosen": -0.6702563762664795,
1973
+ "logps/rejected": -3.4835519790649414,
1974
+ "loss": 0.7052,
1975
+ "odds_ratio_loss": 2.497361183166504,
1976
+ "rewards/accuracies": 0.8500000238418579,
1977
+ "rewards/chosen": -0.06702563911676407,
1978
+ "rewards/margins": 0.28132954239845276,
1979
+ "rewards/rejected": -0.3483552038669586,
1980
+ "sft_loss": 0.4554961621761322,
1981
+ "step": 1160
1982
+ },
1983
+ {
1984
+ "epoch": 0.1232395138885841,
1985
+ "grad_norm": 4.1152238845825195,
1986
+ "learning_rate": 0.0004991822935659786,
1987
+ "logits/chosen": -5.477373123168945,
1988
+ "logits/rejected": -5.477328300476074,
1989
+ "logps/chosen": -0.8472169637680054,
1990
+ "logps/rejected": -3.917269229888916,
1991
+ "loss": 0.8817,
1992
+ "odds_ratio_loss": 3.221430540084839,
1993
+ "rewards/accuracies": 0.8416666388511658,
1994
+ "rewards/chosen": -0.08472169190645218,
1995
+ "rewards/margins": 0.30700525641441345,
1996
+ "rewards/rejected": -0.39172691106796265,
1997
+ "sft_loss": 0.5595788955688477,
1998
+ "step": 1170
1999
+ },
2000
+ {
2001
+ "epoch": 0.12429284306711902,
2002
+ "grad_norm": 5.008730888366699,
2003
+ "learning_rate": 0.0004991063124682778,
2004
+ "logits/chosen": -5.323733329772949,
2005
+ "logits/rejected": -5.323288917541504,
2006
+ "logps/chosen": -0.757644534111023,
2007
+ "logps/rejected": -5.853792190551758,
2008
+ "loss": 0.7925,
2009
+ "odds_ratio_loss": 2.670191526412964,
2010
+ "rewards/accuracies": 0.8500000238418579,
2011
+ "rewards/chosen": -0.07576445490121841,
2012
+ "rewards/margins": 0.5096147656440735,
2013
+ "rewards/rejected": -0.5853793025016785,
2014
+ "sft_loss": 0.5255211591720581,
2015
+ "step": 1180
2016
+ },
2017
+ {
2018
+ "epoch": 0.12534617224565392,
2019
+ "grad_norm": 31.133501052856445,
2020
+ "learning_rate": 0.0004990269626640645,
2021
+ "logits/chosen": -5.64047384262085,
2022
+ "logits/rejected": -5.64005184173584,
2023
+ "logps/chosen": -0.768204391002655,
2024
+ "logps/rejected": -5.164267063140869,
2025
+ "loss": 0.8058,
2026
+ "odds_ratio_loss": 2.815326452255249,
2027
+ "rewards/accuracies": 0.8354166746139526,
2028
+ "rewards/chosen": -0.07682044059038162,
2029
+ "rewards/margins": 0.43960627913475037,
2030
+ "rewards/rejected": -0.516426682472229,
2031
+ "sft_loss": 0.5242764949798584,
2032
+ "step": 1190
2033
+ },
2034
+ {
2035
+ "epoch": 0.12639950142418882,
2036
+ "grad_norm": 6.6835222244262695,
2037
+ "learning_rate": 0.0004989442452263996,
2038
+ "logits/chosen": -5.151331424713135,
2039
+ "logits/rejected": -5.1510162353515625,
2040
+ "logps/chosen": -0.8430954217910767,
2041
+ "logps/rejected": -4.105188369750977,
2042
+ "loss": 0.8846,
2043
+ "odds_ratio_loss": 3.0806498527526855,
2044
+ "rewards/accuracies": 0.7749999761581421,
2045
+ "rewards/chosen": -0.08430954068899155,
2046
+ "rewards/margins": 0.32620927691459656,
2047
+ "rewards/rejected": -0.4105188250541687,
2048
+ "sft_loss": 0.5764933228492737,
2049
+ "step": 1200
2050
+ },
2051
+ {
2052
+ "epoch": 0.12745283060272372,
2053
+ "grad_norm": 6.060980796813965,
2054
+ "learning_rate": 0.0004988581612738847,
2055
+ "logits/chosen": -5.232106685638428,
2056
+ "logits/rejected": -5.2317962646484375,
2057
+ "logps/chosen": -0.7580560445785522,
2058
+ "logps/rejected": -3.9481935501098633,
2059
+ "loss": 0.7955,
2060
+ "odds_ratio_loss": 2.8234329223632812,
2061
+ "rewards/accuracies": 0.8187500238418579,
2062
+ "rewards/chosen": -0.07580561190843582,
2063
+ "rewards/margins": 0.319013774394989,
2064
+ "rewards/rejected": -0.39481934905052185,
2065
+ "sft_loss": 0.5131634473800659,
2066
+ "step": 1210
2067
+ },
2068
+ {
2069
+ "epoch": 0.12850615978125865,
2070
+ "grad_norm": 4.154183864593506,
2071
+ "learning_rate": 0.0004987687119706477,
2072
+ "logits/chosen": -5.385165214538574,
2073
+ "logits/rejected": -5.385090351104736,
2074
+ "logps/chosen": -0.7838355302810669,
2075
+ "logps/rejected": -3.81575608253479,
2076
+ "loss": 0.8198,
2077
+ "odds_ratio_loss": 2.7132718563079834,
2078
+ "rewards/accuracies": 0.8479166626930237,
2079
+ "rewards/chosen": -0.07838355004787445,
2080
+ "rewards/margins": 0.30319201946258545,
2081
+ "rewards/rejected": -0.3815755844116211,
2082
+ "sft_loss": 0.5484524965286255,
2083
+ "step": 1220
2084
+ },
2085
+ {
2086
+ "epoch": 0.12955948895979355,
2087
+ "grad_norm": 4.047881603240967,
2088
+ "learning_rate": 0.0004986758985263265,
2089
+ "logits/chosen": -5.267871379852295,
2090
+ "logits/rejected": -5.267872333526611,
2091
+ "logps/chosen": -0.7567169666290283,
2092
+ "logps/rejected": -2.9766650199890137,
2093
+ "loss": 0.7939,
2094
+ "odds_ratio_loss": 2.620311975479126,
2095
+ "rewards/accuracies": 0.8187500238418579,
2096
+ "rewards/chosen": -0.07567168772220612,
2097
+ "rewards/margins": 0.2219947874546051,
2098
+ "rewards/rejected": -0.2976664900779724,
2099
+ "sft_loss": 0.5318555235862732,
2100
+ "step": 1230
2101
+ },
2102
+ {
2103
+ "epoch": 0.13061281813832845,
2104
+ "grad_norm": 2.7552692890167236,
2105
+ "learning_rate": 0.0004985797221960529,
2106
+ "logits/chosen": -5.264489650726318,
2107
+ "logits/rejected": -5.264598369598389,
2108
+ "logps/chosen": -0.7095692753791809,
2109
+ "logps/rejected": -3.2434911727905273,
2110
+ "loss": 0.7445,
2111
+ "odds_ratio_loss": 2.7399401664733887,
2112
+ "rewards/accuracies": 0.8458333611488342,
2113
+ "rewards/chosen": -0.07095693051815033,
2114
+ "rewards/margins": 0.25339218974113464,
2115
+ "rewards/rejected": -0.3243491053581238,
2116
+ "sft_loss": 0.47051993012428284,
2117
+ "step": 1240
2118
+ },
2119
+ {
2120
+ "epoch": 0.13166614731686335,
2121
+ "grad_norm": 3.3210740089416504,
2122
+ "learning_rate": 0.0004984801842804357,
2123
+ "logits/chosen": -5.190452575683594,
2124
+ "logits/rejected": -5.190454959869385,
2125
+ "logps/chosen": -0.6608388423919678,
2126
+ "logps/rejected": -3.0926926136016846,
2127
+ "loss": 0.6955,
2128
+ "odds_ratio_loss": 2.5877368450164795,
2129
+ "rewards/accuracies": 0.8541666865348816,
2130
+ "rewards/chosen": -0.0660838857293129,
2131
+ "rewards/margins": 0.2431853860616684,
2132
+ "rewards/rejected": -0.3092692792415619,
2133
+ "sft_loss": 0.43669426441192627,
2134
+ "step": 1250
2135
+ },
2136
+ {
2137
+ "epoch": 0.13271947649539828,
2138
+ "grad_norm": 2.113543748855591,
2139
+ "learning_rate": 0.0004983772861255426,
2140
+ "logits/chosen": -5.355245113372803,
2141
+ "logits/rejected": -5.3552327156066895,
2142
+ "logps/chosen": -0.7177728414535522,
2143
+ "logps/rejected": -3.0373685359954834,
2144
+ "loss": 0.7558,
2145
+ "odds_ratio_loss": 2.9420340061187744,
2146
+ "rewards/accuracies": 0.8020833134651184,
2147
+ "rewards/chosen": -0.07177729159593582,
2148
+ "rewards/margins": 0.23195955157279968,
2149
+ "rewards/rejected": -0.3037368357181549,
2150
+ "sft_loss": 0.4615623354911804,
2151
+ "step": 1260
2152
+ },
2153
+ {
2154
+ "epoch": 0.13377280567393318,
2155
+ "grad_norm": 2.6125612258911133,
2156
+ "learning_rate": 0.0004982710291228828,
2157
+ "logits/chosen": -5.5682549476623535,
2158
+ "logits/rejected": -5.568284034729004,
2159
+ "logps/chosen": -0.7032897472381592,
2160
+ "logps/rejected": -3.1011478900909424,
2161
+ "loss": 0.7454,
2162
+ "odds_ratio_loss": 2.6539957523345947,
2163
+ "rewards/accuracies": 0.793749988079071,
2164
+ "rewards/chosen": -0.0703289657831192,
2165
+ "rewards/margins": 0.23978586494922638,
2166
+ "rewards/rejected": -0.3101148307323456,
2167
+ "sft_loss": 0.4799610376358032,
2168
+ "step": 1270
2169
+ },
2170
+ {
2171
+ "epoch": 0.13482613485246808,
2172
+ "grad_norm": 4.396284580230713,
2173
+ "learning_rate": 0.0004981614147093875,
2174
+ "logits/chosen": -5.787298202514648,
2175
+ "logits/rejected": -5.787331581115723,
2176
+ "logps/chosen": -0.7411842942237854,
2177
+ "logps/rejected": -3.3715906143188477,
2178
+ "loss": 0.7797,
2179
+ "odds_ratio_loss": 2.705798625946045,
2180
+ "rewards/accuracies": 0.824999988079071,
2181
+ "rewards/chosen": -0.07411842048168182,
2182
+ "rewards/margins": 0.2630406320095062,
2183
+ "rewards/rejected": -0.33715906739234924,
2184
+ "sft_loss": 0.5091153979301453,
2185
+ "step": 1280
2186
+ },
2187
+ {
2188
+ "epoch": 0.13587946403100298,
2189
+ "grad_norm": 5.317102909088135,
2190
+ "learning_rate": 0.000498048444367391,
2191
+ "logits/chosen": -5.471971035003662,
2192
+ "logits/rejected": -5.472008228302002,
2193
+ "logps/chosen": -0.7457516193389893,
2194
+ "logps/rejected": -3.6155009269714355,
2195
+ "loss": 0.7782,
2196
+ "odds_ratio_loss": 2.801055908203125,
2197
+ "rewards/accuracies": 0.8583333492279053,
2198
+ "rewards/chosen": -0.07457517087459564,
2199
+ "rewards/margins": 0.2869749367237091,
2200
+ "rewards/rejected": -0.36155006289482117,
2201
+ "sft_loss": 0.49809518456459045,
2202
+ "step": 1290
2203
+ },
2204
+ {
2205
+ "epoch": 0.13693279320953788,
2206
+ "grad_norm": 4.3287506103515625,
2207
+ "learning_rate": 0.00049793211962461,
2208
+ "logits/chosen": -5.262281894683838,
2209
+ "logits/rejected": -5.262419700622559,
2210
+ "logps/chosen": -0.7028716802597046,
2211
+ "logps/rejected": -2.8387629985809326,
2212
+ "loss": 0.7409,
2213
+ "odds_ratio_loss": 2.748593807220459,
2214
+ "rewards/accuracies": 0.8208333253860474,
2215
+ "rewards/chosen": -0.07028716057538986,
2216
+ "rewards/margins": 0.2135891616344452,
2217
+ "rewards/rejected": -0.28387632966041565,
2218
+ "sft_loss": 0.4660036265850067,
2219
+ "step": 1300
2220
+ },
2221
+ {
2222
+ "epoch": 0.1379861223880728,
2223
+ "grad_norm": 1.7063987255096436,
2224
+ "learning_rate": 0.0004978124420541238,
2225
+ "logits/chosen": -5.180874347686768,
2226
+ "logits/rejected": -5.181042671203613,
2227
+ "logps/chosen": -0.7432348132133484,
2228
+ "logps/rejected": -2.8490519523620605,
2229
+ "loss": 0.7859,
2230
+ "odds_ratio_loss": 2.679919958114624,
2231
+ "rewards/accuracies": 0.8229166865348816,
2232
+ "rewards/chosen": -0.07432348281145096,
2233
+ "rewards/margins": 0.2105817198753357,
2234
+ "rewards/rejected": -0.28490516543388367,
2235
+ "sft_loss": 0.5179334282875061,
2236
+ "step": 1310
2237
+ },
2238
+ {
2239
+ "epoch": 0.1390394515666077,
2240
+ "grad_norm": 2.908730983734131,
2241
+ "learning_rate": 0.0004976894132743521,
2242
+ "logits/chosen": -5.495538234710693,
2243
+ "logits/rejected": -5.49543571472168,
2244
+ "logps/chosen": -0.696144700050354,
2245
+ "logps/rejected": -3.018319845199585,
2246
+ "loss": 0.7357,
2247
+ "odds_ratio_loss": 3.030001163482666,
2248
+ "rewards/accuracies": 0.8062499761581421,
2249
+ "rewards/chosen": -0.0696144625544548,
2250
+ "rewards/margins": 0.23221756517887115,
2251
+ "rewards/rejected": -0.30183205008506775,
2252
+ "sft_loss": 0.4326884150505066,
2253
+ "step": 1320
2254
+ },
2255
+ {
2256
+ "epoch": 0.14009278074514261,
2257
+ "grad_norm": 3.4632487297058105,
2258
+ "learning_rate": 0.0004975630349490338,
2259
+ "logits/chosen": -5.525754928588867,
2260
+ "logits/rejected": -5.52562141418457,
2261
+ "logps/chosen": -0.7210444808006287,
2262
+ "logps/rejected": -3.5407607555389404,
2263
+ "loss": 0.7551,
2264
+ "odds_ratio_loss": 2.435321092605591,
2265
+ "rewards/accuracies": 0.8229166865348816,
2266
+ "rewards/chosen": -0.07210444658994675,
2267
+ "rewards/margins": 0.28197160363197327,
2268
+ "rewards/rejected": -0.3540760576725006,
2269
+ "sft_loss": 0.5115490555763245,
2270
+ "step": 1330
2271
+ },
2272
+ {
2273
+ "epoch": 0.14114610992367752,
2274
+ "grad_norm": 6.958703517913818,
2275
+ "learning_rate": 0.0004974333087872041,
2276
+ "logits/chosen": -5.362403869628906,
2277
+ "logits/rejected": -5.362163066864014,
2278
+ "logps/chosen": -0.732792854309082,
2279
+ "logps/rejected": -4.104527950286865,
2280
+ "loss": 0.7646,
2281
+ "odds_ratio_loss": 2.719597339630127,
2282
+ "rewards/accuracies": 0.8479166626930237,
2283
+ "rewards/chosen": -0.07327928394079208,
2284
+ "rewards/margins": 0.3371734619140625,
2285
+ "rewards/rejected": -0.4104527533054352,
2286
+ "sft_loss": 0.4926711320877075,
2287
+ "step": 1340
2288
+ },
2289
+ {
2290
+ "epoch": 0.14219943910221244,
2291
+ "grad_norm": 2.859614849090576,
2292
+ "learning_rate": 0.0004973002365431719,
2293
+ "logits/chosen": -5.708899021148682,
2294
+ "logits/rejected": -5.708664894104004,
2295
+ "logps/chosen": -0.6528833508491516,
2296
+ "logps/rejected": -4.244791030883789,
2297
+ "loss": 0.6818,
2298
+ "odds_ratio_loss": 2.3199055194854736,
2299
+ "rewards/accuracies": 0.8770833611488342,
2300
+ "rewards/chosen": -0.06528832763433456,
2301
+ "rewards/margins": 0.35919085144996643,
2302
+ "rewards/rejected": -0.4244791567325592,
2303
+ "sft_loss": 0.44977909326553345,
2304
+ "step": 1350
2305
+ },
2306
+ {
2307
+ "epoch": 0.14325276828074734,
2308
+ "grad_norm": 4.355047702789307,
2309
+ "learning_rate": 0.0004971638200164954,
2310
+ "logits/chosen": -6.141923427581787,
2311
+ "logits/rejected": -6.141890525817871,
2312
+ "logps/chosen": -0.7438533902168274,
2313
+ "logps/rejected": -3.756343126296997,
2314
+ "loss": 0.7845,
2315
+ "odds_ratio_loss": 2.7992961406707764,
2316
+ "rewards/accuracies": 0.824999988079071,
2317
+ "rewards/chosen": -0.07438533753156662,
2318
+ "rewards/margins": 0.3012489676475525,
2319
+ "rewards/rejected": -0.3756342828273773,
2320
+ "sft_loss": 0.5046018362045288,
2321
+ "step": 1360
2322
+ },
2323
+ {
2324
+ "epoch": 0.14430609745928225,
2325
+ "grad_norm": 2.1691360473632812,
2326
+ "learning_rate": 0.0004970240610519582,
2327
+ "logits/chosen": -5.513346195220947,
2328
+ "logits/rejected": -5.513165473937988,
2329
+ "logps/chosen": -0.7122442722320557,
2330
+ "logps/rejected": -4.005341529846191,
2331
+ "loss": 0.7452,
2332
+ "odds_ratio_loss": 2.3780596256256104,
2333
+ "rewards/accuracies": 0.8500000238418579,
2334
+ "rewards/chosen": -0.07122442126274109,
2335
+ "rewards/margins": 0.32930976152420044,
2336
+ "rewards/rejected": -0.40053418278694153,
2337
+ "sft_loss": 0.5073560476303101,
2338
+ "step": 1370
2339
+ },
2340
+ {
2341
+ "epoch": 0.14535942663781715,
2342
+ "grad_norm": 8.108336448669434,
2343
+ "learning_rate": 0.0004968809615395443,
2344
+ "logits/chosen": -5.834110736846924,
2345
+ "logits/rejected": -5.834160327911377,
2346
+ "logps/chosen": -0.7287462949752808,
2347
+ "logps/rejected": -3.322920560836792,
2348
+ "loss": 0.7683,
2349
+ "odds_ratio_loss": 2.7808871269226074,
2350
+ "rewards/accuracies": 0.824999988079071,
2351
+ "rewards/chosen": -0.07287462800741196,
2352
+ "rewards/margins": 0.25941744446754456,
2353
+ "rewards/rejected": -0.3322920799255371,
2354
+ "sft_loss": 0.4901922941207886,
2355
+ "step": 1380
2356
+ },
2357
+ {
2358
+ "epoch": 0.14641275581635205,
2359
+ "grad_norm": 4.8621392250061035,
2360
+ "learning_rate": 0.0004967345234144125,
2361
+ "logits/chosen": -5.492813587188721,
2362
+ "logits/rejected": -5.492822647094727,
2363
+ "logps/chosen": -0.6987211108207703,
2364
+ "logps/rejected": -3.542156934738159,
2365
+ "loss": 0.7353,
2366
+ "odds_ratio_loss": 2.4634013175964355,
2367
+ "rewards/accuracies": 0.8333333134651184,
2368
+ "rewards/chosen": -0.06987211108207703,
2369
+ "rewards/margins": 0.2843436002731323,
2370
+ "rewards/rejected": -0.35421568155288696,
2371
+ "sft_loss": 0.48899564146995544,
2372
+ "step": 1390
2373
+ },
2374
+ {
2375
+ "epoch": 0.14746608499488698,
2376
+ "grad_norm": 4.177161693572998,
2377
+ "learning_rate": 0.00049658474865687,
2378
+ "logits/chosen": -5.457418441772461,
2379
+ "logits/rejected": -5.457381248474121,
2380
+ "logps/chosen": -0.6720997095108032,
2381
+ "logps/rejected": -3.794857978820801,
2382
+ "loss": 0.705,
2383
+ "odds_ratio_loss": 2.515441417694092,
2384
+ "rewards/accuracies": 0.8520833253860474,
2385
+ "rewards/chosen": -0.06720996648073196,
2386
+ "rewards/margins": 0.31227582693099976,
2387
+ "rewards/rejected": -0.3794857859611511,
2388
+ "sft_loss": 0.4534277021884918,
2389
+ "step": 1400
2390
+ },
2391
+ {
2392
+ "epoch": 0.14851941417342188,
2393
+ "grad_norm": 6.939654350280762,
2394
+ "learning_rate": 0.000496431639292346,
2395
+ "logits/chosen": -5.591572284698486,
2396
+ "logits/rejected": -5.591520309448242,
2397
+ "logps/chosen": -0.6898292899131775,
2398
+ "logps/rejected": -3.8443169593811035,
2399
+ "loss": 0.7205,
2400
+ "odds_ratio_loss": 2.573276996612549,
2401
+ "rewards/accuracies": 0.8583333492279053,
2402
+ "rewards/chosen": -0.06898292899131775,
2403
+ "rewards/margins": 0.3154487609863281,
2404
+ "rewards/rejected": -0.3844316899776459,
2405
+ "sft_loss": 0.46317487955093384,
2406
+ "step": 1410
2407
+ },
2408
+ {
2409
+ "epoch": 0.14957274335195678,
2410
+ "grad_norm": 3.3419246673583984,
2411
+ "learning_rate": 0.0004962751973913644,
2412
+ "logits/chosen": -5.660191059112549,
2413
+ "logits/rejected": -5.6601338386535645,
2414
+ "logps/chosen": -0.7214117050170898,
2415
+ "logps/rejected": -3.800171375274658,
2416
+ "loss": 0.7539,
2417
+ "odds_ratio_loss": 2.2002346515655518,
2418
+ "rewards/accuracies": 0.8354166746139526,
2419
+ "rewards/chosen": -0.07214117050170898,
2420
+ "rewards/margins": 0.30787599086761475,
2421
+ "rewards/rejected": -0.38001713156700134,
2422
+ "sft_loss": 0.5338917374610901,
2423
+ "step": 1420
2424
+ },
2425
+ {
2426
+ "epoch": 0.15062607253049168,
2427
+ "grad_norm": 6.627447128295898,
2428
+ "learning_rate": 0.0004961154250695152,
2429
+ "logits/chosen": -5.646604537963867,
2430
+ "logits/rejected": -5.646506309509277,
2431
+ "logps/chosen": -0.6789618730545044,
2432
+ "logps/rejected": -3.7556862831115723,
2433
+ "loss": 0.7131,
2434
+ "odds_ratio_loss": 2.4358396530151367,
2435
+ "rewards/accuracies": 0.8395833373069763,
2436
+ "rewards/chosen": -0.06789619475603104,
2437
+ "rewards/margins": 0.3076724112033844,
2438
+ "rewards/rejected": -0.3755686581134796,
2439
+ "sft_loss": 0.46949687600135803,
2440
+ "step": 1430
2441
+ },
2442
+ {
2443
+ "epoch": 0.15167940170902658,
2444
+ "grad_norm": 7.264041900634766,
2445
+ "learning_rate": 0.0004959523244874262,
2446
+ "logits/chosen": -5.661590576171875,
2447
+ "logits/rejected": -5.661508560180664,
2448
+ "logps/chosen": -0.6989915370941162,
2449
+ "logps/rejected": -3.6030194759368896,
2450
+ "loss": 0.7324,
2451
+ "odds_ratio_loss": 2.595428943634033,
2452
+ "rewards/accuracies": 0.8458333611488342,
2453
+ "rewards/chosen": -0.06989916414022446,
2454
+ "rewards/margins": 0.2904028296470642,
2455
+ "rewards/rejected": -0.3603019714355469,
2456
+ "sft_loss": 0.4728315472602844,
2457
+ "step": 1440
2458
+ },
2459
+ {
2460
+ "epoch": 0.1527327308875615,
2461
+ "grad_norm": 3.4570446014404297,
2462
+ "learning_rate": 0.0004957858978507342,
2463
+ "logits/chosen": -5.628535270690918,
2464
+ "logits/rejected": -5.628504276275635,
2465
+ "logps/chosen": -0.6590794324874878,
2466
+ "logps/rejected": -3.3306422233581543,
2467
+ "loss": 0.6931,
2468
+ "odds_ratio_loss": 2.6064822673797607,
2469
+ "rewards/accuracies": 0.8416666388511658,
2470
+ "rewards/chosen": -0.06590793281793594,
2471
+ "rewards/margins": 0.2671562433242798,
2472
+ "rewards/rejected": -0.3330641984939575,
2473
+ "sft_loss": 0.43240654468536377,
2474
+ "step": 1450
2475
+ },
2476
+ {
2477
+ "epoch": 0.1537860600660964,
2478
+ "grad_norm": 7.473481178283691,
2479
+ "learning_rate": 0.0004956161474100544,
2480
+ "logits/chosen": -5.7138261795043945,
2481
+ "logits/rejected": -5.713827133178711,
2482
+ "logps/chosen": -0.6599766612052917,
2483
+ "logps/rejected": -3.381110668182373,
2484
+ "loss": 0.6939,
2485
+ "odds_ratio_loss": 2.4460840225219727,
2486
+ "rewards/accuracies": 0.84375,
2487
+ "rewards/chosen": -0.06599767506122589,
2488
+ "rewards/margins": 0.2721133828163147,
2489
+ "rewards/rejected": -0.3381110727787018,
2490
+ "sft_loss": 0.44932422041893005,
2491
+ "step": 1460
2492
+ },
2493
+ {
2494
+ "epoch": 0.1548393892446313,
2495
+ "grad_norm": 6.867354869842529,
2496
+ "learning_rate": 0.0004954430754609506,
2497
+ "logits/chosen": -5.79508638381958,
2498
+ "logits/rejected": -5.795089244842529,
2499
+ "logps/chosen": -0.6903258562088013,
2500
+ "logps/rejected": -3.085076332092285,
2501
+ "loss": 0.7338,
2502
+ "odds_ratio_loss": 2.651606798171997,
2503
+ "rewards/accuracies": 0.8041666746139526,
2504
+ "rewards/chosen": -0.06903257966041565,
2505
+ "rewards/margins": 0.2394750565290451,
2506
+ "rewards/rejected": -0.30850762128829956,
2507
+ "sft_loss": 0.4686751961708069,
2508
+ "step": 1470
2509
+ },
2510
+ {
2511
+ "epoch": 0.1558927184231662,
2512
+ "grad_norm": 3.0605275630950928,
2513
+ "learning_rate": 0.0004952666843439038,
2514
+ "logits/chosen": -5.6379008293151855,
2515
+ "logits/rejected": -5.6378703117370605,
2516
+ "logps/chosen": -0.6344018578529358,
2517
+ "logps/rejected": -3.5333213806152344,
2518
+ "loss": 0.6667,
2519
+ "odds_ratio_loss": 2.371415853500366,
2520
+ "rewards/accuracies": 0.8354166746139526,
2521
+ "rewards/chosen": -0.06344018131494522,
2522
+ "rewards/margins": 0.2898919880390167,
2523
+ "rewards/rejected": -0.35333216190338135,
2524
+ "sft_loss": 0.42956602573394775,
2525
+ "step": 1480
2526
+ },
2527
+ {
2528
+ "epoch": 0.15694604760170114,
2529
+ "grad_norm": 3.5998635292053223,
2530
+ "learning_rate": 0.0004950869764442807,
2531
+ "logits/chosen": -5.513609886169434,
2532
+ "logits/rejected": -5.513594627380371,
2533
+ "logps/chosen": -0.6546897888183594,
2534
+ "logps/rejected": -3.4388887882232666,
2535
+ "loss": 0.6841,
2536
+ "odds_ratio_loss": 2.4476258754730225,
2537
+ "rewards/accuracies": 0.8854166865348816,
2538
+ "rewards/chosen": -0.06546898186206818,
2539
+ "rewards/margins": 0.2784199118614197,
2540
+ "rewards/rejected": -0.34388887882232666,
2541
+ "sft_loss": 0.43934836983680725,
2542
+ "step": 1490
2543
+ },
2544
+ {
2545
+ "epoch": 0.15799937678023604,
2546
+ "grad_norm": 3.7529213428497314,
2547
+ "learning_rate": 0.0004949039541923015,
2548
+ "logits/chosen": -5.581011772155762,
2549
+ "logits/rejected": -5.580976486206055,
2550
+ "logps/chosen": -0.6702675223350525,
2551
+ "logps/rejected": -3.7115352153778076,
2552
+ "loss": 0.7055,
2553
+ "odds_ratio_loss": 2.6884007453918457,
2554
+ "rewards/accuracies": 0.8416666388511658,
2555
+ "rewards/chosen": -0.06702675670385361,
2556
+ "rewards/margins": 0.3041267395019531,
2557
+ "rewards/rejected": -0.37115350365638733,
2558
+ "sft_loss": 0.43668031692504883,
2559
+ "step": 1500
2560
+ },
2561
+ {
2562
+ "epoch": 0.15905270595877094,
2563
+ "grad_norm": 6.72556734085083,
2564
+ "learning_rate": 0.0004947176200630068,
2565
+ "logits/chosen": -5.502162456512451,
2566
+ "logits/rejected": -5.502138137817383,
2567
+ "logps/chosen": -0.6218239068984985,
2568
+ "logps/rejected": -3.428339719772339,
2569
+ "loss": 0.6499,
2570
+ "odds_ratio_loss": 2.3972864151000977,
2571
+ "rewards/accuracies": 0.8708333373069763,
2572
+ "rewards/chosen": -0.06218238174915314,
2573
+ "rewards/margins": 0.28065159916877747,
2574
+ "rewards/rejected": -0.3428339660167694,
2575
+ "sft_loss": 0.4101923108100891,
2576
+ "step": 1510
2577
+ },
2578
+ {
2579
+ "epoch": 0.16010603513730584,
2580
+ "grad_norm": 3.806243658065796,
2581
+ "learning_rate": 0.0004945279765762243,
2582
+ "logits/chosen": -5.590113639831543,
2583
+ "logits/rejected": -5.590085029602051,
2584
+ "logps/chosen": -0.6871938109397888,
2585
+ "logps/rejected": -3.6291747093200684,
2586
+ "loss": 0.7198,
2587
+ "odds_ratio_loss": 2.472882032394409,
2588
+ "rewards/accuracies": 0.8458333611488342,
2589
+ "rewards/chosen": -0.06871937960386276,
2590
+ "rewards/margins": 0.2941981554031372,
2591
+ "rewards/rejected": -0.3629175126552582,
2592
+ "sft_loss": 0.4725038409233093,
2593
+ "step": 1520
2594
+ },
2595
+ {
2596
+ "epoch": 0.16115936431584074,
2597
+ "grad_norm": 5.523288249969482,
2598
+ "learning_rate": 0.0004943350262965349,
2599
+ "logits/chosen": -5.691066265106201,
2600
+ "logits/rejected": -5.6911163330078125,
2601
+ "logps/chosen": -0.6510820984840393,
2602
+ "logps/rejected": -3.059138774871826,
2603
+ "loss": 0.685,
2604
+ "odds_ratio_loss": 2.56689715385437,
2605
+ "rewards/accuracies": 0.862500011920929,
2606
+ "rewards/chosen": -0.06510820984840393,
2607
+ "rewards/margins": 0.24080567061901093,
2608
+ "rewards/rejected": -0.30591386556625366,
2609
+ "sft_loss": 0.42830324172973633,
2610
+ "step": 1530
2611
+ },
2612
+ {
2613
+ "epoch": 0.16221269349437567,
2614
+ "grad_norm": 4.537405967712402,
2615
+ "learning_rate": 0.0004941387718332374,
2616
+ "logits/chosen": -5.746434688568115,
2617
+ "logits/rejected": -5.746466159820557,
2618
+ "logps/chosen": -0.6964403390884399,
2619
+ "logps/rejected": -3.4062578678131104,
2620
+ "loss": 0.7309,
2621
+ "odds_ratio_loss": 2.425229787826538,
2622
+ "rewards/accuracies": 0.8416666388511658,
2623
+ "rewards/chosen": -0.0696440264582634,
2624
+ "rewards/margins": 0.2709817886352539,
2625
+ "rewards/rejected": -0.3406257629394531,
2626
+ "sft_loss": 0.4884008467197418,
2627
+ "step": 1540
2628
+ },
2629
+ {
2630
+ "epoch": 0.16326602267291057,
2631
+ "grad_norm": 1.987900972366333,
2632
+ "learning_rate": 0.000493939215840314,
2633
+ "logits/chosen": -5.694365978240967,
2634
+ "logits/rejected": -5.694273471832275,
2635
+ "logps/chosen": -0.6464301347732544,
2636
+ "logps/rejected": -3.67587947845459,
2637
+ "loss": 0.6811,
2638
+ "odds_ratio_loss": 2.441976308822632,
2639
+ "rewards/accuracies": 0.84375,
2640
+ "rewards/chosen": -0.0646430179476738,
2641
+ "rewards/margins": 0.3029448688030243,
2642
+ "rewards/rejected": -0.36758795380592346,
2643
+ "sft_loss": 0.43692201375961304,
2644
+ "step": 1550
2645
+ },
2646
+ {
2647
+ "epoch": 0.16431935185144547,
2648
+ "grad_norm": 4.507101058959961,
2649
+ "learning_rate": 0.000493736361016394,
2650
+ "logits/chosen": -5.841778755187988,
2651
+ "logits/rejected": -5.841654300689697,
2652
+ "logps/chosen": -0.6818705797195435,
2653
+ "logps/rejected": -3.587505578994751,
2654
+ "loss": 0.7136,
2655
+ "odds_ratio_loss": 2.5504865646362305,
2656
+ "rewards/accuracies": 0.8291666507720947,
2657
+ "rewards/chosen": -0.06818706542253494,
2658
+ "rewards/margins": 0.29056352376937866,
2659
+ "rewards/rejected": -0.3587505519390106,
2660
+ "sft_loss": 0.45857658982276917,
2661
+ "step": 1560
2662
+ },
2663
+ {
2664
+ "epoch": 0.16537268102998037,
2665
+ "grad_norm": 3.6050593852996826,
2666
+ "learning_rate": 0.0004935302101047171,
2667
+ "logits/chosen": -5.996950149536133,
2668
+ "logits/rejected": -5.996947288513184,
2669
+ "logps/chosen": -0.6442388296127319,
2670
+ "logps/rejected": -3.464613437652588,
2671
+ "loss": 0.673,
2672
+ "odds_ratio_loss": 2.08451247215271,
2673
+ "rewards/accuracies": 0.8520833253860474,
2674
+ "rewards/chosen": -0.06442389637231827,
2675
+ "rewards/margins": 0.2820374071598053,
2676
+ "rewards/rejected": -0.34646129608154297,
2677
+ "sft_loss": 0.4645636975765228,
2678
+ "step": 1570
2679
+ },
2680
+ {
2681
+ "epoch": 0.1664260102085153,
2682
+ "grad_norm": 6.22938871383667,
2683
+ "learning_rate": 0.0004933207658930968,
2684
+ "logits/chosen": -6.110846996307373,
2685
+ "logits/rejected": -6.1108527183532715,
2686
+ "logps/chosen": -0.5905119776725769,
2687
+ "logps/rejected": -3.7033638954162598,
2688
+ "loss": 0.6196,
2689
+ "odds_ratio_loss": 2.2230594158172607,
2690
+ "rewards/accuracies": 0.8708333373069763,
2691
+ "rewards/chosen": -0.05905119329690933,
2692
+ "rewards/margins": 0.3112851679325104,
2693
+ "rewards/rejected": -0.3703364133834839,
2694
+ "sft_loss": 0.3972512185573578,
2695
+ "step": 1580
2696
+ },
2697
+ {
2698
+ "epoch": 0.1674793393870502,
2699
+ "grad_norm": 5.692537307739258,
2700
+ "learning_rate": 0.0004931080312138824,
2701
+ "logits/chosen": -5.9748077392578125,
2702
+ "logits/rejected": -5.974870681762695,
2703
+ "logps/chosen": -0.6414510607719421,
2704
+ "logps/rejected": -3.1401894092559814,
2705
+ "loss": 0.6755,
2706
+ "odds_ratio_loss": 2.408728837966919,
2707
+ "rewards/accuracies": 0.8416666388511658,
2708
+ "rewards/chosen": -0.06414511054754257,
2709
+ "rewards/margins": 0.24987384676933289,
2710
+ "rewards/rejected": -0.31401893496513367,
2711
+ "sft_loss": 0.4346589744091034,
2712
+ "step": 1590
2713
+ },
2714
+ {
2715
+ "epoch": 0.1685326685655851,
2716
+ "grad_norm": 4.23068380355835,
2717
+ "learning_rate": 0.0004928920089439206,
2718
+ "logits/chosen": -5.843720436096191,
2719
+ "logits/rejected": -5.84379768371582,
2720
+ "logps/chosen": -0.7081334590911865,
2721
+ "logps/rejected": -3.097430467605591,
2722
+ "loss": 0.7425,
2723
+ "odds_ratio_loss": 2.416752338409424,
2724
+ "rewards/accuracies": 0.8354166746139526,
2725
+ "rewards/chosen": -0.07081333547830582,
2726
+ "rewards/margins": 0.23892968893051147,
2727
+ "rewards/rejected": -0.3097430169582367,
2728
+ "sft_loss": 0.5008493661880493,
2729
+ "step": 1600
2730
+ },
2731
+ {
2732
+ "epoch": 0.16958599774412,
2733
+ "grad_norm": 7.679098606109619,
2734
+ "learning_rate": 0.000492672702004517,
2735
+ "logits/chosen": -5.8228349685668945,
2736
+ "logits/rejected": -5.822881698608398,
2737
+ "logps/chosen": -0.6390455365180969,
2738
+ "logps/rejected": -2.928208112716675,
2739
+ "loss": 0.6714,
2740
+ "odds_ratio_loss": 2.5421833992004395,
2741
+ "rewards/accuracies": 0.8479166626930237,
2742
+ "rewards/chosen": -0.0639045462012291,
2743
+ "rewards/margins": 0.2289162576198578,
2744
+ "rewards/rejected": -0.29282084107398987,
2745
+ "sft_loss": 0.4171499013900757,
2746
+ "step": 1610
2747
+ },
2748
+ {
2749
+ "epoch": 0.1706393269226549,
2750
+ "grad_norm": 2.961296558380127,
2751
+ "learning_rate": 0.000492450113361396,
2752
+ "logits/chosen": -5.76025915145874,
2753
+ "logits/rejected": -5.760366916656494,
2754
+ "logps/chosen": -0.707727313041687,
2755
+ "logps/rejected": -2.787848472595215,
2756
+ "loss": 0.7488,
2757
+ "odds_ratio_loss": 2.6859564781188965,
2758
+ "rewards/accuracies": 0.8083333373069763,
2759
+ "rewards/chosen": -0.07077272981405258,
2760
+ "rewards/margins": 0.20801211893558502,
2761
+ "rewards/rejected": -0.278784841299057,
2762
+ "sft_loss": 0.48024895787239075,
2763
+ "step": 1620
2764
+ },
2765
+ {
2766
+ "epoch": 0.17169265610118983,
2767
+ "grad_norm": 5.1038432121276855,
2768
+ "learning_rate": 0.0004922242460246613,
2769
+ "logits/chosen": -5.724485397338867,
2770
+ "logits/rejected": -5.724647521972656,
2771
+ "logps/chosen": -0.6975029110908508,
2772
+ "logps/rejected": -2.6335670948028564,
2773
+ "loss": 0.7302,
2774
+ "odds_ratio_loss": 2.552466869354248,
2775
+ "rewards/accuracies": 0.8729166388511658,
2776
+ "rewards/chosen": -0.06975029408931732,
2777
+ "rewards/margins": 0.1936063915491104,
2778
+ "rewards/rejected": -0.26335668563842773,
2779
+ "sft_loss": 0.4749198257923126,
2780
+ "step": 1630
2781
+ },
2782
+ {
2783
+ "epoch": 0.17274598527972473,
2784
+ "grad_norm": 4.588533401489258,
2785
+ "learning_rate": 0.0004919951030487549,
2786
+ "logits/chosen": -5.752465724945068,
2787
+ "logits/rejected": -5.75269079208374,
2788
+ "logps/chosen": -0.7377220392227173,
2789
+ "logps/rejected": -2.5051889419555664,
2790
+ "loss": 0.7742,
2791
+ "odds_ratio_loss": 2.6720080375671387,
2792
+ "rewards/accuracies": 0.8354166746139526,
2793
+ "rewards/chosen": -0.07377220690250397,
2794
+ "rewards/margins": 0.1767466962337494,
2795
+ "rewards/rejected": -0.25051891803741455,
2796
+ "sft_loss": 0.5070357918739319,
2797
+ "step": 1640
2798
+ },
2799
+ {
2800
+ "epoch": 0.17379931445825963,
2801
+ "grad_norm": 4.702722072601318,
2802
+ "learning_rate": 0.0004917626875324156,
2803
+ "logits/chosen": -5.999307155609131,
2804
+ "logits/rejected": -5.999597072601318,
2805
+ "logps/chosen": -0.6588828563690186,
2806
+ "logps/rejected": -3.105346202850342,
2807
+ "loss": 0.6907,
2808
+ "odds_ratio_loss": 2.3203392028808594,
2809
+ "rewards/accuracies": 0.862500011920929,
2810
+ "rewards/chosen": -0.06588829308748245,
2811
+ "rewards/margins": 0.244646355509758,
2812
+ "rewards/rejected": -0.31053462624549866,
2813
+ "sft_loss": 0.458629310131073,
2814
+ "step": 1650
2815
+ },
2816
+ {
2817
+ "epoch": 0.17485264363679454,
2818
+ "grad_norm": 4.34697961807251,
2819
+ "learning_rate": 0.0004915270026186377,
2820
+ "logits/chosen": -6.0448760986328125,
2821
+ "logits/rejected": -6.045097827911377,
2822
+ "logps/chosen": -0.6159750819206238,
2823
+ "logps/rejected": -3.314389705657959,
2824
+ "loss": 0.6451,
2825
+ "odds_ratio_loss": 2.2701919078826904,
2826
+ "rewards/accuracies": 0.862500011920929,
2827
+ "rewards/chosen": -0.06159750744700432,
2828
+ "rewards/margins": 0.2698414921760559,
2829
+ "rewards/rejected": -0.33143898844718933,
2830
+ "sft_loss": 0.4180779755115509,
2831
+ "step": 1660
2832
+ },
2833
+ {
2834
+ "epoch": 0.17590597281532946,
2835
+ "grad_norm": 6.3104119300842285,
2836
+ "learning_rate": 0.0004912880514946277,
2837
+ "logits/chosen": -6.198366165161133,
2838
+ "logits/rejected": -6.198540210723877,
2839
+ "logps/chosen": -0.6512912511825562,
2840
+ "logps/rejected": -3.2115368843078613,
2841
+ "loss": 0.6825,
2842
+ "odds_ratio_loss": 2.350752353668213,
2843
+ "rewards/accuracies": 0.8500000238418579,
2844
+ "rewards/chosen": -0.0651291236281395,
2845
+ "rewards/margins": 0.256024569272995,
2846
+ "rewards/rejected": -0.3211536705493927,
2847
+ "sft_loss": 0.4474564790725708,
2848
+ "step": 1670
2849
+ },
2850
+ {
2851
+ "epoch": 0.17695930199386437,
2852
+ "grad_norm": 4.557435989379883,
2853
+ "learning_rate": 0.0004910458373917618,
2854
+ "logits/chosen": -5.757941722869873,
2855
+ "logits/rejected": -5.75801944732666,
2856
+ "logps/chosen": -0.7286573648452759,
2857
+ "logps/rejected": -3.038480043411255,
2858
+ "loss": 0.7631,
2859
+ "odds_ratio_loss": 2.563901662826538,
2860
+ "rewards/accuracies": 0.8229166865348816,
2861
+ "rewards/chosen": -0.07286573201417923,
2862
+ "rewards/margins": 0.23098230361938477,
2863
+ "rewards/rejected": -0.303847998380661,
2864
+ "sft_loss": 0.5067596435546875,
2865
+ "step": 1680
2866
+ },
2867
+ {
2868
+ "epoch": 0.17801263117239927,
2869
+ "grad_norm": 4.517858028411865,
2870
+ "learning_rate": 0.0004908003635855421,
2871
+ "logits/chosen": -5.7685866355896,
2872
+ "logits/rejected": -5.768723011016846,
2873
+ "logps/chosen": -0.6709701418876648,
2874
+ "logps/rejected": -3.045161724090576,
2875
+ "loss": 0.7057,
2876
+ "odds_ratio_loss": 2.353038787841797,
2877
+ "rewards/accuracies": 0.8270833492279053,
2878
+ "rewards/chosen": -0.0670970231294632,
2879
+ "rewards/margins": 0.23741915822029114,
2880
+ "rewards/rejected": -0.30451616644859314,
2881
+ "sft_loss": 0.4703839421272278,
2882
+ "step": 1690
2883
+ },
2884
+ {
2885
+ "epoch": 0.17906596035093417,
2886
+ "grad_norm": 3.8505797386169434,
2887
+ "learning_rate": 0.0004905516333955521,
2888
+ "logits/chosen": -5.820653915405273,
2889
+ "logits/rejected": -5.820913791656494,
2890
+ "logps/chosen": -0.6014044880867004,
2891
+ "logps/rejected": -2.9712061882019043,
2892
+ "loss": 0.6348,
2893
+ "odds_ratio_loss": 2.333472728729248,
2894
+ "rewards/accuracies": 0.84375,
2895
+ "rewards/chosen": -0.060140449553728104,
2896
+ "rewards/margins": 0.23698018491268158,
2897
+ "rewards/rejected": -0.2971206307411194,
2898
+ "sft_loss": 0.40144431591033936,
2899
+ "step": 1700
2900
+ },
2901
+ {
2902
+ "epoch": 0.18011928952946907,
2903
+ "grad_norm": 5.725916385650635,
2904
+ "learning_rate": 0.0004902996501854119,
2905
+ "logits/chosen": -6.354620933532715,
2906
+ "logits/rejected": -6.355036735534668,
2907
+ "logps/chosen": -1.4475494623184204,
2908
+ "logps/rejected": -3.9082717895507812,
2909
+ "loss": 1.4845,
2910
+ "odds_ratio_loss": 4.034452438354492,
2911
+ "rewards/accuracies": 0.8291666507720947,
2912
+ "rewards/chosen": -0.14475493133068085,
2913
+ "rewards/margins": 0.24607227742671967,
2914
+ "rewards/rejected": -0.3908271789550781,
2915
+ "sft_loss": 1.0810879468917847,
2916
+ "step": 1710
2917
+ },
2918
+ {
2919
+ "epoch": 0.181172618708004,
2920
+ "grad_norm": 8.74376392364502,
2921
+ "learning_rate": 0.0004900444173627328,
2922
+ "logits/chosen": -6.625903129577637,
2923
+ "logits/rejected": -6.62622594833374,
2924
+ "logps/chosen": -0.7164724469184875,
2925
+ "logps/rejected": -3.1040403842926025,
2926
+ "loss": 0.7538,
2927
+ "odds_ratio_loss": 2.710744619369507,
2928
+ "rewards/accuracies": 0.8208333253860474,
2929
+ "rewards/chosen": -0.07164724916219711,
2930
+ "rewards/margins": 0.23875676095485687,
2931
+ "rewards/rejected": -0.3104040026664734,
2932
+ "sft_loss": 0.4827170670032501,
2933
+ "step": 1720
2934
+ },
2935
+ {
2936
+ "epoch": 0.1822259478865389,
2937
+ "grad_norm": 2.9171664714813232,
2938
+ "learning_rate": 0.0004897859383790711,
2939
+ "logits/chosen": -6.8558220863342285,
2940
+ "logits/rejected": -6.856238842010498,
2941
+ "logps/chosen": -0.6772664785385132,
2942
+ "logps/rejected": -3.0685346126556396,
2943
+ "loss": 0.7145,
2944
+ "odds_ratio_loss": 2.5943028926849365,
2945
+ "rewards/accuracies": 0.8145833611488342,
2946
+ "rewards/chosen": -0.06772664934396744,
2947
+ "rewards/margins": 0.23912683129310608,
2948
+ "rewards/rejected": -0.3068534731864929,
2949
+ "sft_loss": 0.4550252854824066,
2950
+ "step": 1730
2951
+ },
2952
+ {
2953
+ "epoch": 0.1832792770650738,
2954
+ "grad_norm": 4.223478317260742,
2955
+ "learning_rate": 0.0004895242167298816,
2956
+ "logits/chosen": -6.91244649887085,
2957
+ "logits/rejected": -6.912972927093506,
2958
+ "logps/chosen": -0.6928088665008545,
2959
+ "logps/rejected": -3.333386182785034,
2960
+ "loss": 0.724,
2961
+ "odds_ratio_loss": 2.613675117492676,
2962
+ "rewards/accuracies": 0.8291666507720947,
2963
+ "rewards/chosen": -0.06928088515996933,
2964
+ "rewards/margins": 0.2640577256679535,
2965
+ "rewards/rejected": -0.3333386480808258,
2966
+ "sft_loss": 0.4625937044620514,
2967
+ "step": 1740
2968
+ },
2969
+ {
2970
+ "epoch": 0.1843326062436087,
2971
+ "grad_norm": 6.387345790863037,
2972
+ "learning_rate": 0.0004892592559544702,
2973
+ "logits/chosen": -6.475886821746826,
2974
+ "logits/rejected": -6.476265907287598,
2975
+ "logps/chosen": -0.689757764339447,
2976
+ "logps/rejected": -2.9596078395843506,
2977
+ "loss": 0.7284,
2978
+ "odds_ratio_loss": 2.5536398887634277,
2979
+ "rewards/accuracies": 0.8083333373069763,
2980
+ "rewards/chosen": -0.0689757764339447,
2981
+ "rewards/margins": 0.22698503732681274,
2982
+ "rewards/rejected": -0.29596078395843506,
2983
+ "sft_loss": 0.47301238775253296,
2984
+ "step": 1750
2985
+ },
2986
+ {
2987
+ "epoch": 0.1853859354221436,
2988
+ "grad_norm": 4.752090930938721,
2989
+ "learning_rate": 0.0004889910596359457,
2990
+ "logits/chosen": -6.3866801261901855,
2991
+ "logits/rejected": -6.3870625495910645,
2992
+ "logps/chosen": -0.6425169110298157,
2993
+ "logps/rejected": -3.2402262687683105,
2994
+ "loss": 0.6768,
2995
+ "odds_ratio_loss": 2.416498899459839,
2996
+ "rewards/accuracies": 0.8583333492279053,
2997
+ "rewards/chosen": -0.06425168365240097,
2998
+ "rewards/margins": 0.259770929813385,
2999
+ "rewards/rejected": -0.3240226209163666,
3000
+ "sft_loss": 0.43517401814460754,
3001
+ "step": 1760
3002
+ },
3003
+ {
3004
+ "epoch": 0.18643926460067853,
3005
+ "grad_norm": 4.994302272796631,
3006
+ "learning_rate": 0.0004887196314011722,
3007
+ "logits/chosen": -6.208808422088623,
3008
+ "logits/rejected": -6.209136009216309,
3009
+ "logps/chosen": -0.6876904964447021,
3010
+ "logps/rejected": -3.412658929824829,
3011
+ "loss": 0.7195,
3012
+ "odds_ratio_loss": 2.441316843032837,
3013
+ "rewards/accuracies": 0.8479166626930237,
3014
+ "rewards/chosen": -0.06876904517412186,
3015
+ "rewards/margins": 0.2724968492984772,
3016
+ "rewards/rejected": -0.34126585721969604,
3017
+ "sft_loss": 0.4754055142402649,
3018
+ "step": 1770
3019
+ },
3020
+ {
3021
+ "epoch": 0.18749259377921343,
3022
+ "grad_norm": 6.439792156219482,
3023
+ "learning_rate": 0.0004884449749207192,
3024
+ "logits/chosen": -6.457438945770264,
3025
+ "logits/rejected": -6.457731246948242,
3026
+ "logps/chosen": -0.6336179375648499,
3027
+ "logps/rejected": -2.840827703475952,
3028
+ "loss": 0.6657,
3029
+ "odds_ratio_loss": 2.448340892791748,
3030
+ "rewards/accuracies": 0.8708333373069763,
3031
+ "rewards/chosen": -0.06336179375648499,
3032
+ "rewards/margins": 0.22072099149227142,
3033
+ "rewards/rejected": -0.2840828001499176,
3034
+ "sft_loss": 0.420904278755188,
3035
+ "step": 1780
3036
+ },
3037
+ {
3038
+ "epoch": 0.18854592295774833,
3039
+ "grad_norm": 4.342998027801514,
3040
+ "learning_rate": 0.00048816709390881266,
3041
+ "logits/chosen": -6.21989631652832,
3042
+ "logits/rejected": -6.220187664031982,
3043
+ "logps/chosen": -0.6857010722160339,
3044
+ "logps/rejected": -2.874756336212158,
3045
+ "loss": 0.7182,
3046
+ "odds_ratio_loss": 2.453400135040283,
3047
+ "rewards/accuracies": 0.8520833253860474,
3048
+ "rewards/chosen": -0.06857011467218399,
3049
+ "rewards/margins": 0.218905508518219,
3050
+ "rewards/rejected": -0.2874756455421448,
3051
+ "sft_loss": 0.47288984060287476,
3052
+ "step": 1790
3053
+ },
3054
+ {
3055
+ "epoch": 0.18959925213628323,
3056
+ "grad_norm": 33.773277282714844,
3057
+ "learning_rate": 0.0004878859921232839,
3058
+ "logits/chosen": -5.917886257171631,
3059
+ "logits/rejected": -5.9181623458862305,
3060
+ "logps/chosen": -0.7129290103912354,
3061
+ "logps/rejected": -2.9589409828186035,
3062
+ "loss": 0.7486,
3063
+ "odds_ratio_loss": 2.4278337955474854,
3064
+ "rewards/accuracies": 0.8166666626930237,
3065
+ "rewards/chosen": -0.07129290699958801,
3066
+ "rewards/margins": 0.22460119426250458,
3067
+ "rewards/rejected": -0.2958941161632538,
3068
+ "sft_loss": 0.5058320760726929,
3069
+ "step": 1800
3070
+ },
3071
+ {
3072
+ "epoch": 0.19065258131481816,
3073
+ "grad_norm": 4.040477275848389,
3074
+ "learning_rate": 0.00048760167336551964,
3075
+ "logits/chosen": -5.841413974761963,
3076
+ "logits/rejected": -5.8417158126831055,
3077
+ "logps/chosen": -0.6335561275482178,
3078
+ "logps/rejected": -3.0441653728485107,
3079
+ "loss": 0.6684,
3080
+ "odds_ratio_loss": 2.3195104598999023,
3081
+ "rewards/accuracies": 0.8354166746139526,
3082
+ "rewards/chosen": -0.06335561722517014,
3083
+ "rewards/margins": 0.24106094241142273,
3084
+ "rewards/rejected": -0.30441656708717346,
3085
+ "sft_loss": 0.4364630877971649,
3086
+ "step": 1810
3087
+ },
3088
+ {
3089
+ "epoch": 0.19170591049335306,
3090
+ "grad_norm": 6.749710559844971,
3091
+ "learning_rate": 0.0004873141414804103,
3092
+ "logits/chosen": -5.7162394523620605,
3093
+ "logits/rejected": -5.716516017913818,
3094
+ "logps/chosen": -0.6518925428390503,
3095
+ "logps/rejected": -3.0878660678863525,
3096
+ "loss": 0.6886,
3097
+ "odds_ratio_loss": 2.5920615196228027,
3098
+ "rewards/accuracies": 0.8458333611488342,
3099
+ "rewards/chosen": -0.06518926471471786,
3100
+ "rewards/margins": 0.2435973733663559,
3101
+ "rewards/rejected": -0.30878666043281555,
3102
+ "sft_loss": 0.4293573498725891,
3103
+ "step": 1820
3104
+ },
3105
+ {
3106
+ "epoch": 0.19275923967188796,
3107
+ "grad_norm": 2.045081615447998,
3108
+ "learning_rate": 0.00048702340035629787,
3109
+ "logits/chosen": -5.856993198394775,
3110
+ "logits/rejected": -5.857146263122559,
3111
+ "logps/chosen": -0.6080865263938904,
3112
+ "logps/rejected": -2.7589311599731445,
3113
+ "loss": 0.6369,
3114
+ "odds_ratio_loss": 2.007796287536621,
3115
+ "rewards/accuracies": 0.8666666746139526,
3116
+ "rewards/chosen": -0.06080865487456322,
3117
+ "rewards/margins": 0.21508444845676422,
3118
+ "rewards/rejected": -0.27589309215545654,
3119
+ "sft_loss": 0.43609535694122314,
3120
+ "step": 1830
3121
+ },
3122
+ {
3123
+ "epoch": 0.19381256885042286,
3124
+ "grad_norm": 4.158146381378174,
3125
+ "learning_rate": 0.0004867294539249234,
3126
+ "logits/chosen": -6.230529308319092,
3127
+ "logits/rejected": -6.230895519256592,
3128
+ "logps/chosen": -0.6979438066482544,
3129
+ "logps/rejected": -3.5587799549102783,
3130
+ "loss": 0.7295,
3131
+ "odds_ratio_loss": 2.502671718597412,
3132
+ "rewards/accuracies": 0.862500011920929,
3133
+ "rewards/chosen": -0.06979438662528992,
3134
+ "rewards/margins": 0.2860836088657379,
3135
+ "rewards/rejected": -0.35587799549102783,
3136
+ "sft_loss": 0.4792328178882599,
3137
+ "step": 1840
3138
+ },
3139
+ {
3140
+ "epoch": 0.19486589802895776,
3141
+ "grad_norm": 2.7599072456359863,
3142
+ "learning_rate": 0.0004864323061613738,
3143
+ "logits/chosen": -6.244935512542725,
3144
+ "logits/rejected": -6.245189189910889,
3145
+ "logps/chosen": -0.6155544519424438,
3146
+ "logps/rejected": -3.0617711544036865,
3147
+ "loss": 0.6473,
3148
+ "odds_ratio_loss": 2.4084250926971436,
3149
+ "rewards/accuracies": 0.84375,
3150
+ "rewards/chosen": -0.06155544891953468,
3151
+ "rewards/margins": 0.2446216493844986,
3152
+ "rewards/rejected": -0.3061771094799042,
3153
+ "sft_loss": 0.4064619243144989,
3154
+ "step": 1850
3155
+ },
3156
+ {
3157
+ "epoch": 0.1959192272074927,
3158
+ "grad_norm": 4.056497573852539,
3159
+ "learning_rate": 0.0004861319610840282,
3160
+ "logits/chosen": -5.854410648345947,
3161
+ "logits/rejected": -5.8545074462890625,
3162
+ "logps/chosen": -0.7075474262237549,
3163
+ "logps/rejected": -3.4564712047576904,
3164
+ "loss": 0.7458,
3165
+ "odds_ratio_loss": 2.5600531101226807,
3166
+ "rewards/accuracies": 0.8333333134651184,
3167
+ "rewards/chosen": -0.07075474411249161,
3168
+ "rewards/margins": 0.2748924195766449,
3169
+ "rewards/rejected": -0.3456471860408783,
3170
+ "sft_loss": 0.4897785782814026,
3171
+ "step": 1860
3172
+ },
3173
+ {
3174
+ "epoch": 0.1969725563860276,
3175
+ "grad_norm": 7.0494489669799805,
3176
+ "learning_rate": 0.00048582842275450366,
3177
+ "logits/chosen": -5.870307922363281,
3178
+ "logits/rejected": -5.870253086090088,
3179
+ "logps/chosen": -0.6499666571617126,
3180
+ "logps/rejected": -3.4302499294281006,
3181
+ "loss": 0.6847,
3182
+ "odds_ratio_loss": 2.5706870555877686,
3183
+ "rewards/accuracies": 0.8520833253860474,
3184
+ "rewards/chosen": -0.06499668210744858,
3185
+ "rewards/margins": 0.27802836894989014,
3186
+ "rewards/rejected": -0.3430250287055969,
3187
+ "sft_loss": 0.4276408553123474,
3188
+ "step": 1870
3189
+ },
3190
+ {
3191
+ "epoch": 0.1980258855645625,
3192
+ "grad_norm": 9.770491600036621,
3193
+ "learning_rate": 0.0004855216952775999,
3194
+ "logits/chosen": -6.05530309677124,
3195
+ "logits/rejected": -6.05518102645874,
3196
+ "logps/chosen": -0.6710807681083679,
3197
+ "logps/rejected": -3.7501718997955322,
3198
+ "loss": 0.7031,
3199
+ "odds_ratio_loss": 2.5512893199920654,
3200
+ "rewards/accuracies": 0.8583333492279053,
3201
+ "rewards/chosen": -0.06710807234048843,
3202
+ "rewards/margins": 0.3079090714454651,
3203
+ "rewards/rejected": -0.3750171661376953,
3204
+ "sft_loss": 0.4479447305202484,
3205
+ "step": 1880
3206
+ },
3207
+ {
3208
+ "epoch": 0.1990792147430974,
3209
+ "grad_norm": 46.68679428100586,
3210
+ "learning_rate": 0.0004852117828012441,
3211
+ "logits/chosen": -6.125611782073975,
3212
+ "logits/rejected": -6.125678539276123,
3213
+ "logps/chosen": -0.8755971789360046,
3214
+ "logps/rejected": -4.0310163497924805,
3215
+ "loss": 0.9086,
3216
+ "odds_ratio_loss": 3.206876516342163,
3217
+ "rewards/accuracies": 0.8416666388511658,
3218
+ "rewards/chosen": -0.08755972236394882,
3219
+ "rewards/margins": 0.31554192304611206,
3220
+ "rewards/rejected": -0.4031016528606415,
3221
+ "sft_loss": 0.5878926515579224,
3222
+ "step": 1890
3223
+ },
3224
+ {
3225
+ "epoch": 0.20013254392163232,
3226
+ "grad_norm": 7.289094924926758,
3227
+ "learning_rate": 0.00048489868951643477,
3228
+ "logits/chosen": -6.526234149932861,
3229
+ "logits/rejected": -6.526541233062744,
3230
+ "logps/chosen": -1.0017836093902588,
3231
+ "logps/rejected": -3.509293556213379,
3232
+ "loss": 1.0477,
3233
+ "odds_ratio_loss": 3.5900070667266846,
3234
+ "rewards/accuracies": 0.8270833492279053,
3235
+ "rewards/chosen": -0.10017836093902588,
3236
+ "rewards/margins": 0.2507510483264923,
3237
+ "rewards/rejected": -0.3509294092655182,
3238
+ "sft_loss": 0.6887442469596863,
3239
+ "step": 1900
3240
  }
3241
  ],
3242
  "logging_steps": 10,
 
3256
  "attributes": {}
3257
  }
3258
  },
3259
+ "total_flos": 1.4024826693511741e+18,
3260
  "train_batch_size": 2,
3261
  "trial_name": null,
3262
  "trial_params": null