flozi00 commited on
Commit
61b785d
1 Parent(s): 559435c

Upload folder using huggingface_hub

Browse files
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f66bfb42982b8c06c9bc25b8b99d14435f7a44a4827e6f1c4f4f62b47436fd6f
3
  size 4996670464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:035fcee6ade42d8a6e210e0bfb333167cdc4142533f169f6bd7e5c2c8b59ca73
3
  size 4996670464
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c9c22855dcd0d98bb0e09d292039b2831909be91813061f03aa2692016df446
3
  size 1178224960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5021db8004c45a9a530ca6803cb7f47e7efcbccaab5aa96fe3152fc5d66ad20a
3
  size 1178224960
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a2af9d07fb61306ec347f800e5ef1d06cd08a760bc2a70df53fd800b38069f3
3
- size 3094642562
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74c17b077acfeba0f92ffed7849a96f3861ffba1ac7f25e9d00e928a0c556655
3
+ size 3094642882
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aca84d21747b494f2647f19153afa27bad629c2a874992b2bbe3f010aee01c0c
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a742e5814e33065999dacc80c1127cad656555ce6fc832d7c43bde53fdae9c09
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.021066583570698138,
5
  "eval_steps": 5000,
6
- "global_step": 200,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -347,6 +347,1026 @@
347
  "rewards/rejected": -0.4050002992153168,
348
  "sft_loss": 0.4557226896286011,
349
  "step": 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  }
351
  ],
352
  "logging_steps": 10,
@@ -366,7 +1386,7 @@
366
  "attributes": {}
367
  }
368
  },
369
- "total_flos": 1.4746225175966515e+17,
370
  "train_batch_size": 2,
371
  "trial_name": null,
372
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.08426633428279255,
5
  "eval_steps": 5000,
6
+ "global_step": 800,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
347
  "rewards/rejected": -0.4050002992153168,
348
  "sft_loss": 0.4557226896286011,
349
  "step": 200
350
+ },
351
+ {
352
+ "epoch": 0.022119912749233046,
353
+ "grad_norm": 5.377903938293457,
354
+ "learning_rate": 0.00011052631578947368,
355
+ "logits/chosen": -2.777975559234619,
356
+ "logits/rejected": -2.778069257736206,
357
+ "logps/chosen": -0.663443922996521,
358
+ "logps/rejected": -3.8639333248138428,
359
+ "loss": 0.69,
360
+ "odds_ratio_loss": 1.9626834392547607,
361
+ "rewards/accuracies": 0.893750011920929,
362
+ "rewards/chosen": -0.06634439527988434,
363
+ "rewards/margins": 0.3200489580631256,
364
+ "rewards/rejected": -0.38639336824417114,
365
+ "sft_loss": 0.4936945140361786,
366
+ "step": 210
367
+ },
368
+ {
369
+ "epoch": 0.02317324192776795,
370
+ "grad_norm": 11.14415454864502,
371
+ "learning_rate": 0.00011578947368421053,
372
+ "logits/chosen": -2.69752836227417,
373
+ "logits/rejected": -2.6975817680358887,
374
+ "logps/chosen": -0.7092010378837585,
375
+ "logps/rejected": -3.6659300327301025,
376
+ "loss": 0.7441,
377
+ "odds_ratio_loss": 2.298079013824463,
378
+ "rewards/accuracies": 0.8354166746139526,
379
+ "rewards/chosen": -0.07092010229825974,
380
+ "rewards/margins": 0.29567286372184753,
381
+ "rewards/rejected": -0.36659297347068787,
382
+ "sft_loss": 0.5143173933029175,
383
+ "step": 220
384
+ },
385
+ {
386
+ "epoch": 0.024226571106302858,
387
+ "grad_norm": 17.70037269592285,
388
+ "learning_rate": 0.00012105263157894738,
389
+ "logits/chosen": -2.8756678104400635,
390
+ "logits/rejected": -2.875657796859741,
391
+ "logps/chosen": -0.639348566532135,
392
+ "logps/rejected": -4.930140018463135,
393
+ "loss": 0.6653,
394
+ "odds_ratio_loss": 2.042109489440918,
395
+ "rewards/accuracies": 0.8958333134651184,
396
+ "rewards/chosen": -0.06393485516309738,
397
+ "rewards/margins": 0.42907920479774475,
398
+ "rewards/rejected": -0.49301406741142273,
399
+ "sft_loss": 0.4610413908958435,
400
+ "step": 230
401
+ },
402
+ {
403
+ "epoch": 0.025279900284837765,
404
+ "grad_norm": 22.880346298217773,
405
+ "learning_rate": 0.0001263157894736842,
406
+ "logits/chosen": -3.3157336711883545,
407
+ "logits/rejected": -3.3157567977905273,
408
+ "logps/chosen": -1.2456268072128296,
409
+ "logps/rejected": -5.491827964782715,
410
+ "loss": 1.3155,
411
+ "odds_ratio_loss": 4.847614765167236,
412
+ "rewards/accuracies": 0.793749988079071,
413
+ "rewards/chosen": -0.12456268817186356,
414
+ "rewards/margins": 0.4246201515197754,
415
+ "rewards/rejected": -0.5491827726364136,
416
+ "sft_loss": 0.8306990265846252,
417
+ "step": 240
418
+ },
419
+ {
420
+ "epoch": 0.026333229463372673,
421
+ "grad_norm": 5.729049205780029,
422
+ "learning_rate": 0.00013157894736842105,
423
+ "logits/chosen": -2.5473344326019287,
424
+ "logits/rejected": -2.5469789505004883,
425
+ "logps/chosen": -0.749646008014679,
426
+ "logps/rejected": -6.155911445617676,
427
+ "loss": 0.7811,
428
+ "odds_ratio_loss": 2.4923181533813477,
429
+ "rewards/accuracies": 0.8645833134651184,
430
+ "rewards/chosen": -0.07496459782123566,
431
+ "rewards/margins": 0.5406264662742615,
432
+ "rewards/rejected": -0.6155910491943359,
433
+ "sft_loss": 0.5319061875343323,
434
+ "step": 250
435
+ },
436
+ {
437
+ "epoch": 0.027386558641907578,
438
+ "grad_norm": 11.868535995483398,
439
+ "learning_rate": 0.00013684210526315792,
440
+ "logits/chosen": -2.9207849502563477,
441
+ "logits/rejected": -2.9205822944641113,
442
+ "logps/chosen": -0.8412100076675415,
443
+ "logps/rejected": -5.425389289855957,
444
+ "loss": 0.8748,
445
+ "odds_ratio_loss": 3.032189130783081,
446
+ "rewards/accuracies": 0.8479166626930237,
447
+ "rewards/chosen": -0.08412099629640579,
448
+ "rewards/margins": 0.4584178924560547,
449
+ "rewards/rejected": -0.5425389409065247,
450
+ "sft_loss": 0.5715639591217041,
451
+ "step": 260
452
+ },
453
+ {
454
+ "epoch": 0.028439887820442485,
455
+ "grad_norm": 9.387285232543945,
456
+ "learning_rate": 0.00014210526315789474,
457
+ "logits/chosen": -3.1293420791625977,
458
+ "logits/rejected": -3.129204273223877,
459
+ "logps/chosen": -0.7858380079269409,
460
+ "logps/rejected": -4.375970363616943,
461
+ "loss": 0.8281,
462
+ "odds_ratio_loss": 2.820624351501465,
463
+ "rewards/accuracies": 0.8062499761581421,
464
+ "rewards/chosen": -0.07858379930257797,
465
+ "rewards/margins": 0.35901322960853577,
466
+ "rewards/rejected": -0.43759700655937195,
467
+ "sft_loss": 0.5460221171379089,
468
+ "step": 270
469
+ },
470
+ {
471
+ "epoch": 0.029493216998977393,
472
+ "grad_norm": 5.87777042388916,
473
+ "learning_rate": 0.00014736842105263158,
474
+ "logits/chosen": -2.9154765605926514,
475
+ "logits/rejected": -2.91546368598938,
476
+ "logps/chosen": -0.595488965511322,
477
+ "logps/rejected": -3.7737627029418945,
478
+ "loss": 0.6283,
479
+ "odds_ratio_loss": 2.1527862548828125,
480
+ "rewards/accuracies": 0.8604166507720947,
481
+ "rewards/chosen": -0.05954889953136444,
482
+ "rewards/margins": 0.31782734394073486,
483
+ "rewards/rejected": -0.3773762583732605,
484
+ "sft_loss": 0.4130483567714691,
485
+ "step": 280
486
+ },
487
+ {
488
+ "epoch": 0.0305465461775123,
489
+ "grad_norm": 10.7889986038208,
490
+ "learning_rate": 0.00015263157894736842,
491
+ "logits/chosen": -3.0789036750793457,
492
+ "logits/rejected": -3.079068899154663,
493
+ "logps/chosen": -0.5851417779922485,
494
+ "logps/rejected": -3.892369031906128,
495
+ "loss": 0.6135,
496
+ "odds_ratio_loss": 2.1990480422973633,
497
+ "rewards/accuracies": 0.8791666626930237,
498
+ "rewards/chosen": -0.05851416662335396,
499
+ "rewards/margins": 0.33072274923324585,
500
+ "rewards/rejected": -0.3892369568347931,
501
+ "sft_loss": 0.3936450183391571,
502
+ "step": 290
503
+ },
504
+ {
505
+ "epoch": 0.031599875356047205,
506
+ "grad_norm": 10.757394790649414,
507
+ "learning_rate": 0.00015789473684210527,
508
+ "logits/chosen": -3.1641581058502197,
509
+ "logits/rejected": -3.1641595363616943,
510
+ "logps/chosen": -0.625824511051178,
511
+ "logps/rejected": -4.4615254402160645,
512
+ "loss": 0.6551,
513
+ "odds_ratio_loss": 2.074664354324341,
514
+ "rewards/accuracies": 0.8708333373069763,
515
+ "rewards/chosen": -0.06258244812488556,
516
+ "rewards/margins": 0.3835701644420624,
517
+ "rewards/rejected": -0.44615259766578674,
518
+ "sft_loss": 0.44765299558639526,
519
+ "step": 300
520
+ },
521
+ {
522
+ "epoch": 0.03265320453458211,
523
+ "grad_norm": 8.260001182556152,
524
+ "learning_rate": 0.0001631578947368421,
525
+ "logits/chosen": -3.1696364879608154,
526
+ "logits/rejected": -3.169647693634033,
527
+ "logps/chosen": -0.6334646940231323,
528
+ "logps/rejected": -4.125209331512451,
529
+ "loss": 0.6653,
530
+ "odds_ratio_loss": 2.147244691848755,
531
+ "rewards/accuracies": 0.8812500238418579,
532
+ "rewards/chosen": -0.06334646791219711,
533
+ "rewards/margins": 0.34917446970939636,
534
+ "rewards/rejected": -0.4125209450721741,
535
+ "sft_loss": 0.45062482357025146,
536
+ "step": 310
537
+ },
538
+ {
539
+ "epoch": 0.03370653371311702,
540
+ "grad_norm": 17.23076057434082,
541
+ "learning_rate": 0.00016842105263157895,
542
+ "logits/chosen": -3.175907611846924,
543
+ "logits/rejected": -3.175673007965088,
544
+ "logps/chosen": -0.8329946994781494,
545
+ "logps/rejected": -4.718179702758789,
546
+ "loss": 0.8649,
547
+ "odds_ratio_loss": 2.6120009422302246,
548
+ "rewards/accuracies": 0.8729166388511658,
549
+ "rewards/chosen": -0.08329946547746658,
550
+ "rewards/margins": 0.3885185122489929,
551
+ "rewards/rejected": -0.4718180298805237,
552
+ "sft_loss": 0.6036695837974548,
553
+ "step": 320
554
+ },
555
+ {
556
+ "epoch": 0.03475986289165193,
557
+ "grad_norm": 8.57013988494873,
558
+ "learning_rate": 0.0001736842105263158,
559
+ "logits/chosen": -2.885397434234619,
560
+ "logits/rejected": -2.8853094577789307,
561
+ "logps/chosen": -0.7634103298187256,
562
+ "logps/rejected": -3.892472982406616,
563
+ "loss": 0.7919,
564
+ "odds_ratio_loss": 2.4835994243621826,
565
+ "rewards/accuracies": 0.8770833611488342,
566
+ "rewards/chosen": -0.07634103298187256,
567
+ "rewards/margins": 0.31290626525878906,
568
+ "rewards/rejected": -0.389247328042984,
569
+ "sft_loss": 0.5435259938240051,
570
+ "step": 330
571
+ },
572
+ {
573
+ "epoch": 0.035813192070186836,
574
+ "grad_norm": 6.161098957061768,
575
+ "learning_rate": 0.00017894736842105264,
576
+ "logits/chosen": -2.7474565505981445,
577
+ "logits/rejected": -2.747159004211426,
578
+ "logps/chosen": -0.6635507345199585,
579
+ "logps/rejected": -4.673018455505371,
580
+ "loss": 0.6934,
581
+ "odds_ratio_loss": 2.4976134300231934,
582
+ "rewards/accuracies": 0.862500011920929,
583
+ "rewards/chosen": -0.06635507941246033,
584
+ "rewards/margins": 0.4009467363357544,
585
+ "rewards/rejected": -0.4673018753528595,
586
+ "sft_loss": 0.44364967942237854,
587
+ "step": 340
588
+ },
589
+ {
590
+ "epoch": 0.036866521248721744,
591
+ "grad_norm": 6.6998372077941895,
592
+ "learning_rate": 0.00018421052631578948,
593
+ "logits/chosen": -3.3279902935028076,
594
+ "logits/rejected": -3.327728509902954,
595
+ "logps/chosen": -0.7464654445648193,
596
+ "logps/rejected": -4.424903869628906,
597
+ "loss": 0.7811,
598
+ "odds_ratio_loss": 2.4744033813476562,
599
+ "rewards/accuracies": 0.8270833492279053,
600
+ "rewards/chosen": -0.07464654743671417,
601
+ "rewards/margins": 0.3678438365459442,
602
+ "rewards/rejected": -0.4424903988838196,
603
+ "sft_loss": 0.5336239337921143,
604
+ "step": 350
605
+ },
606
+ {
607
+ "epoch": 0.037919850427256645,
608
+ "grad_norm": 4.3592400550842285,
609
+ "learning_rate": 0.00018947368421052632,
610
+ "logits/chosen": -3.632689952850342,
611
+ "logits/rejected": -3.6323180198669434,
612
+ "logps/chosen": -0.6197668313980103,
613
+ "logps/rejected": -5.021815299987793,
614
+ "loss": 0.647,
615
+ "odds_ratio_loss": 2.0902163982391357,
616
+ "rewards/accuracies": 0.893750011920929,
617
+ "rewards/chosen": -0.061976686120033264,
618
+ "rewards/margins": 0.44020482897758484,
619
+ "rewards/rejected": -0.5021815299987793,
620
+ "sft_loss": 0.43794170022010803,
621
+ "step": 360
622
+ },
623
+ {
624
+ "epoch": 0.03897317960579155,
625
+ "grad_norm": 9.133977890014648,
626
+ "learning_rate": 0.00019473684210526317,
627
+ "logits/chosen": -3.6677591800689697,
628
+ "logits/rejected": -3.667369842529297,
629
+ "logps/chosen": -0.6474730372428894,
630
+ "logps/rejected": -5.439915180206299,
631
+ "loss": 0.6779,
632
+ "odds_ratio_loss": 2.1945674419403076,
633
+ "rewards/accuracies": 0.8666666746139526,
634
+ "rewards/chosen": -0.06474730372428894,
635
+ "rewards/margins": 0.4792442321777344,
636
+ "rewards/rejected": -0.5439915060997009,
637
+ "sft_loss": 0.45841965079307556,
638
+ "step": 370
639
+ },
640
+ {
641
+ "epoch": 0.04002650878432646,
642
+ "grad_norm": 9.33304214477539,
643
+ "learning_rate": 0.0002,
644
+ "logits/chosen": -3.5276687145233154,
645
+ "logits/rejected": -3.527397632598877,
646
+ "logps/chosen": -0.6519566178321838,
647
+ "logps/rejected": -4.4450178146362305,
648
+ "loss": 0.6857,
649
+ "odds_ratio_loss": 2.411801338195801,
650
+ "rewards/accuracies": 0.84375,
651
+ "rewards/chosen": -0.06519566476345062,
652
+ "rewards/margins": 0.3793061375617981,
653
+ "rewards/rejected": -0.4445018172264099,
654
+ "sft_loss": 0.44453203678131104,
655
+ "step": 380
656
+ },
657
+ {
658
+ "epoch": 0.04107983796286137,
659
+ "grad_norm": 8.269370079040527,
660
+ "learning_rate": 0.00020526315789473685,
661
+ "logits/chosen": -3.345468282699585,
662
+ "logits/rejected": -3.3452601432800293,
663
+ "logps/chosen": -0.7482808232307434,
664
+ "logps/rejected": -4.133052349090576,
665
+ "loss": 0.7849,
666
+ "odds_ratio_loss": 2.473043918609619,
667
+ "rewards/accuracies": 0.8395833373069763,
668
+ "rewards/chosen": -0.07482809573411942,
669
+ "rewards/margins": 0.33847716450691223,
670
+ "rewards/rejected": -0.41330528259277344,
671
+ "sft_loss": 0.5376084446907043,
672
+ "step": 390
673
+ },
674
+ {
675
+ "epoch": 0.042133167141396276,
676
+ "grad_norm": 3.1917130947113037,
677
+ "learning_rate": 0.00021052631578947367,
678
+ "logits/chosen": -3.498554229736328,
679
+ "logits/rejected": -3.4982807636260986,
680
+ "logps/chosen": -0.6910140514373779,
681
+ "logps/rejected": -4.305008888244629,
682
+ "loss": 0.7229,
683
+ "odds_ratio_loss": 2.5834543704986572,
684
+ "rewards/accuracies": 0.8374999761581421,
685
+ "rewards/chosen": -0.06910141557455063,
686
+ "rewards/margins": 0.36139950156211853,
687
+ "rewards/rejected": -0.43050095438957214,
688
+ "sft_loss": 0.46452444791793823,
689
+ "step": 400
690
+ },
691
+ {
692
+ "epoch": 0.043186496319931184,
693
+ "grad_norm": 8.981714248657227,
694
+ "learning_rate": 0.00021578947368421054,
695
+ "logits/chosen": -3.482508420944214,
696
+ "logits/rejected": -3.482311725616455,
697
+ "logps/chosen": -0.666519045829773,
698
+ "logps/rejected": -3.838574171066284,
699
+ "loss": 0.7004,
700
+ "odds_ratio_loss": 2.59932017326355,
701
+ "rewards/accuracies": 0.84375,
702
+ "rewards/chosen": -0.06665190309286118,
703
+ "rewards/margins": 0.3172055184841156,
704
+ "rewards/rejected": -0.3838574290275574,
705
+ "sft_loss": 0.4405144453048706,
706
+ "step": 410
707
+ },
708
+ {
709
+ "epoch": 0.04423982549846609,
710
+ "grad_norm": 5.946087837219238,
711
+ "learning_rate": 0.00022105263157894735,
712
+ "logits/chosen": -3.5680463314056396,
713
+ "logits/rejected": -3.5679588317871094,
714
+ "logps/chosen": -0.6861178874969482,
715
+ "logps/rejected": -3.294382333755493,
716
+ "loss": 0.7209,
717
+ "odds_ratio_loss": 2.4581611156463623,
718
+ "rewards/accuracies": 0.8520833253860474,
719
+ "rewards/chosen": -0.06861178576946259,
720
+ "rewards/margins": 0.26082643866539,
721
+ "rewards/rejected": -0.3294382095336914,
722
+ "sft_loss": 0.4750979244709015,
723
+ "step": 420
724
+ },
725
+ {
726
+ "epoch": 0.045293154677001,
727
+ "grad_norm": 16.975387573242188,
728
+ "learning_rate": 0.00022631578947368422,
729
+ "logits/chosen": -3.8716492652893066,
730
+ "logits/rejected": -3.871539831161499,
731
+ "logps/chosen": -0.7186715602874756,
732
+ "logps/rejected": -3.2046849727630615,
733
+ "loss": 0.7559,
734
+ "odds_ratio_loss": 2.6692261695861816,
735
+ "rewards/accuracies": 0.8458333611488342,
736
+ "rewards/chosen": -0.07186715304851532,
737
+ "rewards/margins": 0.24860134720802307,
738
+ "rewards/rejected": -0.3204684853553772,
739
+ "sft_loss": 0.48894843459129333,
740
+ "step": 430
741
+ },
742
+ {
743
+ "epoch": 0.0463464838555359,
744
+ "grad_norm": 3.843916416168213,
745
+ "learning_rate": 0.00023157894736842107,
746
+ "logits/chosen": -3.794214963912964,
747
+ "logits/rejected": -3.794062852859497,
748
+ "logps/chosen": -0.6966003179550171,
749
+ "logps/rejected": -3.6082844734191895,
750
+ "loss": 0.7316,
751
+ "odds_ratio_loss": 2.6561334133148193,
752
+ "rewards/accuracies": 0.8479166626930237,
753
+ "rewards/chosen": -0.06966003775596619,
754
+ "rewards/margins": 0.2911684215068817,
755
+ "rewards/rejected": -0.3608284592628479,
756
+ "sft_loss": 0.4659655690193176,
757
+ "step": 440
758
+ },
759
+ {
760
+ "epoch": 0.04739981303407081,
761
+ "grad_norm": 14.617210388183594,
762
+ "learning_rate": 0.00023684210526315788,
763
+ "logits/chosen": -3.84993052482605,
764
+ "logits/rejected": -3.8500382900238037,
765
+ "logps/chosen": -0.7132828831672668,
766
+ "logps/rejected": -3.116370916366577,
767
+ "loss": 0.7449,
768
+ "odds_ratio_loss": 2.349879264831543,
769
+ "rewards/accuracies": 0.8583333492279053,
770
+ "rewards/chosen": -0.0713282972574234,
771
+ "rewards/margins": 0.24030880630016327,
772
+ "rewards/rejected": -0.31163710355758667,
773
+ "sft_loss": 0.5099204182624817,
774
+ "step": 450
775
+ },
776
+ {
777
+ "epoch": 0.048453142212605715,
778
+ "grad_norm": 15.630524635314941,
779
+ "learning_rate": 0.00024210526315789475,
780
+ "logits/chosen": -4.3313679695129395,
781
+ "logits/rejected": -4.331648349761963,
782
+ "logps/chosen": -0.7833544611930847,
783
+ "logps/rejected": -2.8526246547698975,
784
+ "loss": 0.8191,
785
+ "odds_ratio_loss": 2.5849623680114746,
786
+ "rewards/accuracies": 0.8354166746139526,
787
+ "rewards/chosen": -0.07833544164896011,
788
+ "rewards/margins": 0.20692706108093262,
789
+ "rewards/rejected": -0.28526249527931213,
790
+ "sft_loss": 0.5606356263160706,
791
+ "step": 460
792
+ },
793
+ {
794
+ "epoch": 0.04950647139114062,
795
+ "grad_norm": 4.825496196746826,
796
+ "learning_rate": 0.0002473684210526316,
797
+ "logits/chosen": -4.020305156707764,
798
+ "logits/rejected": -4.020514965057373,
799
+ "logps/chosen": -0.7084909677505493,
800
+ "logps/rejected": -2.901973009109497,
801
+ "loss": 0.745,
802
+ "odds_ratio_loss": 2.5733158588409424,
803
+ "rewards/accuracies": 0.8479166626930237,
804
+ "rewards/chosen": -0.07084909081459045,
805
+ "rewards/margins": 0.21934820711612701,
806
+ "rewards/rejected": -0.29019731283187866,
807
+ "sft_loss": 0.48766499757766724,
808
+ "step": 470
809
+ },
810
+ {
811
+ "epoch": 0.05055980056967553,
812
+ "grad_norm": 6.267645835876465,
813
+ "learning_rate": 0.0002526315789473684,
814
+ "logits/chosen": -3.936088800430298,
815
+ "logits/rejected": -3.936236619949341,
816
+ "logps/chosen": -0.7358769774436951,
817
+ "logps/rejected": -2.6652441024780273,
818
+ "loss": 0.7689,
819
+ "odds_ratio_loss": 2.5294764041900635,
820
+ "rewards/accuracies": 0.8541666865348816,
821
+ "rewards/chosen": -0.07358769327402115,
822
+ "rewards/margins": 0.1929367184638977,
823
+ "rewards/rejected": -0.26652440428733826,
824
+ "sft_loss": 0.5159851312637329,
825
+ "step": 480
826
+ },
827
+ {
828
+ "epoch": 0.05161312974821044,
829
+ "grad_norm": 7.438229084014893,
830
+ "learning_rate": 0.0002578947368421053,
831
+ "logits/chosen": -4.008545875549316,
832
+ "logits/rejected": -4.008641242980957,
833
+ "logps/chosen": -0.7306921482086182,
834
+ "logps/rejected": -2.7273244857788086,
835
+ "loss": 0.7645,
836
+ "odds_ratio_loss": 2.6694443225860596,
837
+ "rewards/accuracies": 0.8291666507720947,
838
+ "rewards/chosen": -0.07306921482086182,
839
+ "rewards/margins": 0.1996632218360901,
840
+ "rewards/rejected": -0.2727324366569519,
841
+ "sft_loss": 0.49753716588020325,
842
+ "step": 490
843
+ },
844
+ {
845
+ "epoch": 0.052666458926745346,
846
+ "grad_norm": 5.6936469078063965,
847
+ "learning_rate": 0.0002631578947368421,
848
+ "logits/chosen": -3.8969054222106934,
849
+ "logits/rejected": -3.896923303604126,
850
+ "logps/chosen": -0.7155380249023438,
851
+ "logps/rejected": -3.1710591316223145,
852
+ "loss": 0.7467,
853
+ "odds_ratio_loss": 2.616429567337036,
854
+ "rewards/accuracies": 0.8729166388511658,
855
+ "rewards/chosen": -0.07155381143093109,
856
+ "rewards/margins": 0.24555210769176483,
857
+ "rewards/rejected": -0.3171059191226959,
858
+ "sft_loss": 0.48508700728416443,
859
+ "step": 500
860
+ },
861
+ {
862
+ "epoch": 0.053719788105280254,
863
+ "grad_norm": 4.272115230560303,
864
+ "learning_rate": 0.00026842105263157897,
865
+ "logits/chosen": -3.8689732551574707,
866
+ "logits/rejected": -3.868974208831787,
867
+ "logps/chosen": -0.6541014313697815,
868
+ "logps/rejected": -3.1265270709991455,
869
+ "loss": 0.6821,
870
+ "odds_ratio_loss": 2.2681777477264404,
871
+ "rewards/accuracies": 0.8854166865348816,
872
+ "rewards/chosen": -0.06541014462709427,
873
+ "rewards/margins": 0.2472425401210785,
874
+ "rewards/rejected": -0.31265270709991455,
875
+ "sft_loss": 0.4552646279335022,
876
+ "step": 510
877
+ },
878
+ {
879
+ "epoch": 0.054773117283815155,
880
+ "grad_norm": 5.510837078094482,
881
+ "learning_rate": 0.00027368421052631584,
882
+ "logits/chosen": -3.8620245456695557,
883
+ "logits/rejected": -3.862044334411621,
884
+ "logps/chosen": -0.6386537551879883,
885
+ "logps/rejected": -3.322967767715454,
886
+ "loss": 0.6712,
887
+ "odds_ratio_loss": 2.311323881149292,
888
+ "rewards/accuracies": 0.8583333492279053,
889
+ "rewards/chosen": -0.06386537849903107,
890
+ "rewards/margins": 0.2684313654899597,
891
+ "rewards/rejected": -0.33229681849479675,
892
+ "sft_loss": 0.4400910437107086,
893
+ "step": 520
894
+ },
895
+ {
896
+ "epoch": 0.05582644646235006,
897
+ "grad_norm": 12.38877010345459,
898
+ "learning_rate": 0.0002789473684210526,
899
+ "logits/chosen": -4.60584020614624,
900
+ "logits/rejected": -4.6057939529418945,
901
+ "logps/chosen": -0.7113536596298218,
902
+ "logps/rejected": -3.330021381378174,
903
+ "loss": 0.7496,
904
+ "odds_ratio_loss": 2.427239179611206,
905
+ "rewards/accuracies": 0.8354166746139526,
906
+ "rewards/chosen": -0.07113537192344666,
907
+ "rewards/margins": 0.2618667781352997,
908
+ "rewards/rejected": -0.33300215005874634,
909
+ "sft_loss": 0.5068832635879517,
910
+ "step": 530
911
+ },
912
+ {
913
+ "epoch": 0.05687977564088497,
914
+ "grad_norm": 2.2653727531433105,
915
+ "learning_rate": 0.00028421052631578947,
916
+ "logits/chosen": -5.099688529968262,
917
+ "logits/rejected": -5.09957218170166,
918
+ "logps/chosen": -0.6874160170555115,
919
+ "logps/rejected": -3.100078582763672,
920
+ "loss": 0.7231,
921
+ "odds_ratio_loss": 2.652129650115967,
922
+ "rewards/accuracies": 0.8500000238418579,
923
+ "rewards/chosen": -0.06874160468578339,
924
+ "rewards/margins": 0.24126628041267395,
925
+ "rewards/rejected": -0.31000787019729614,
926
+ "sft_loss": 0.45785781741142273,
927
+ "step": 540
928
+ },
929
+ {
930
+ "epoch": 0.05793310481941988,
931
+ "grad_norm": 6.484382152557373,
932
+ "learning_rate": 0.00028947368421052634,
933
+ "logits/chosen": -4.181884288787842,
934
+ "logits/rejected": -4.181893348693848,
935
+ "logps/chosen": -0.7184228897094727,
936
+ "logps/rejected": -3.2252790927886963,
937
+ "loss": 0.7563,
938
+ "odds_ratio_loss": 2.4872865676879883,
939
+ "rewards/accuracies": 0.8458333611488342,
940
+ "rewards/chosen": -0.07184228301048279,
941
+ "rewards/margins": 0.25068560242652893,
942
+ "rewards/rejected": -0.3225278854370117,
943
+ "sft_loss": 0.5075890421867371,
944
+ "step": 550
945
+ },
946
+ {
947
+ "epoch": 0.058986433997954786,
948
+ "grad_norm": 6.237575531005859,
949
+ "learning_rate": 0.00029473684210526316,
950
+ "logits/chosen": -4.042048931121826,
951
+ "logits/rejected": -4.042147159576416,
952
+ "logps/chosen": -0.6820612549781799,
953
+ "logps/rejected": -2.6241307258605957,
954
+ "loss": 0.7178,
955
+ "odds_ratio_loss": 2.6058013439178467,
956
+ "rewards/accuracies": 0.8604166507720947,
957
+ "rewards/chosen": -0.06820613890886307,
958
+ "rewards/margins": 0.19420695304870605,
959
+ "rewards/rejected": -0.26241305470466614,
960
+ "sft_loss": 0.45718762278556824,
961
+ "step": 560
962
+ },
963
+ {
964
+ "epoch": 0.060039763176489694,
965
+ "grad_norm": 5.729897499084473,
966
+ "learning_rate": 0.0003,
967
+ "logits/chosen": -3.9665284156799316,
968
+ "logits/rejected": -3.966668128967285,
969
+ "logps/chosen": -0.7161160111427307,
970
+ "logps/rejected": -2.8060250282287598,
971
+ "loss": 0.7529,
972
+ "odds_ratio_loss": 2.4470136165618896,
973
+ "rewards/accuracies": 0.8520833253860474,
974
+ "rewards/chosen": -0.07161159813404083,
975
+ "rewards/margins": 0.20899087190628052,
976
+ "rewards/rejected": -0.28060245513916016,
977
+ "sft_loss": 0.5082017779350281,
978
+ "step": 570
979
+ },
980
+ {
981
+ "epoch": 0.0610930923550246,
982
+ "grad_norm": 5.065602779388428,
983
+ "learning_rate": 0.00030526315789473684,
984
+ "logits/chosen": -3.9091081619262695,
985
+ "logits/rejected": -3.9092376232147217,
986
+ "logps/chosen": -0.6755971908569336,
987
+ "logps/rejected": -2.8741674423217773,
988
+ "loss": 0.7078,
989
+ "odds_ratio_loss": 2.4316818714141846,
990
+ "rewards/accuracies": 0.8812500238418579,
991
+ "rewards/chosen": -0.06755972653627396,
992
+ "rewards/margins": 0.21985705196857452,
993
+ "rewards/rejected": -0.2874167859554291,
994
+ "sft_loss": 0.4646414816379547,
995
+ "step": 580
996
+ },
997
+ {
998
+ "epoch": 0.06214642153355951,
999
+ "grad_norm": 2.45158314704895,
1000
+ "learning_rate": 0.0003105263157894737,
1001
+ "logits/chosen": -3.9886550903320312,
1002
+ "logits/rejected": -3.9887642860412598,
1003
+ "logps/chosen": -0.649567186832428,
1004
+ "logps/rejected": -3.0265567302703857,
1005
+ "loss": 0.6791,
1006
+ "odds_ratio_loss": 2.3147711753845215,
1007
+ "rewards/accuracies": 0.8729166388511658,
1008
+ "rewards/chosen": -0.06495673209428787,
1009
+ "rewards/margins": 0.2376989722251892,
1010
+ "rewards/rejected": -0.3026556670665741,
1011
+ "sft_loss": 0.4476209580898285,
1012
+ "step": 590
1013
+ },
1014
+ {
1015
+ "epoch": 0.06319975071209441,
1016
+ "grad_norm": 15.312357902526855,
1017
+ "learning_rate": 0.00031578947368421053,
1018
+ "logits/chosen": -3.9752919673919678,
1019
+ "logits/rejected": -3.9754388332366943,
1020
+ "logps/chosen": -0.696826159954071,
1021
+ "logps/rejected": -3.0343518257141113,
1022
+ "loss": 0.732,
1023
+ "odds_ratio_loss": 2.443164587020874,
1024
+ "rewards/accuracies": 0.8479166626930237,
1025
+ "rewards/chosen": -0.06968262046575546,
1026
+ "rewards/margins": 0.23375259339809418,
1027
+ "rewards/rejected": -0.30343523621559143,
1028
+ "sft_loss": 0.48767518997192383,
1029
+ "step": 600
1030
+ },
1031
+ {
1032
+ "epoch": 0.06425307989062932,
1033
+ "grad_norm": 9.758230209350586,
1034
+ "learning_rate": 0.0003210526315789474,
1035
+ "logits/chosen": -3.76411509513855,
1036
+ "logits/rejected": -3.763446092605591,
1037
+ "logps/chosen": -0.7242849469184875,
1038
+ "logps/rejected": -5.800142288208008,
1039
+ "loss": 0.7592,
1040
+ "odds_ratio_loss": 2.657700300216675,
1041
+ "rewards/accuracies": 0.8458333611488342,
1042
+ "rewards/chosen": -0.07242848724126816,
1043
+ "rewards/margins": 0.5075857639312744,
1044
+ "rewards/rejected": -0.5800142884254456,
1045
+ "sft_loss": 0.49342209100723267,
1046
+ "step": 610
1047
+ },
1048
+ {
1049
+ "epoch": 0.06530640906916423,
1050
+ "grad_norm": 7.555414199829102,
1051
+ "learning_rate": 0.0003263157894736842,
1052
+ "logits/chosen": -4.165302753448486,
1053
+ "logits/rejected": -4.1650519371032715,
1054
+ "logps/chosen": -0.7384843230247498,
1055
+ "logps/rejected": -3.710164785385132,
1056
+ "loss": 0.7768,
1057
+ "odds_ratio_loss": 2.5719528198242188,
1058
+ "rewards/accuracies": 0.8520833253860474,
1059
+ "rewards/chosen": -0.0738484337925911,
1060
+ "rewards/margins": 0.297168105840683,
1061
+ "rewards/rejected": -0.3710165023803711,
1062
+ "sft_loss": 0.5196101665496826,
1063
+ "step": 620
1064
+ },
1065
+ {
1066
+ "epoch": 0.06635973824769914,
1067
+ "grad_norm": 4.273881435394287,
1068
+ "learning_rate": 0.00033157894736842103,
1069
+ "logits/chosen": -4.187811374664307,
1070
+ "logits/rejected": -4.186800479888916,
1071
+ "logps/chosen": -0.6481006145477295,
1072
+ "logps/rejected": -5.178854942321777,
1073
+ "loss": 0.6803,
1074
+ "odds_ratio_loss": 2.3059561252593994,
1075
+ "rewards/accuracies": 0.856249988079071,
1076
+ "rewards/chosen": -0.06481005996465683,
1077
+ "rewards/margins": 0.45307546854019165,
1078
+ "rewards/rejected": -0.5178855061531067,
1079
+ "sft_loss": 0.4497505724430084,
1080
+ "step": 630
1081
+ },
1082
+ {
1083
+ "epoch": 0.06741306742623404,
1084
+ "grad_norm": 6.665101528167725,
1085
+ "learning_rate": 0.0003368421052631579,
1086
+ "logits/chosen": -4.168524265289307,
1087
+ "logits/rejected": -4.1669230461120605,
1088
+ "logps/chosen": -0.658748984336853,
1089
+ "logps/rejected": -6.14946174621582,
1090
+ "loss": 0.6936,
1091
+ "odds_ratio_loss": 2.615469455718994,
1092
+ "rewards/accuracies": 0.8520833253860474,
1093
+ "rewards/chosen": -0.06587490439414978,
1094
+ "rewards/margins": 0.5490713715553284,
1095
+ "rewards/rejected": -0.614946186542511,
1096
+ "sft_loss": 0.432014137506485,
1097
+ "step": 640
1098
+ },
1099
+ {
1100
+ "epoch": 0.06846639660476894,
1101
+ "grad_norm": 5.859743118286133,
1102
+ "learning_rate": 0.00034210526315789477,
1103
+ "logits/chosen": -4.12244176864624,
1104
+ "logits/rejected": -4.120962619781494,
1105
+ "logps/chosen": -0.703795850276947,
1106
+ "logps/rejected": -5.927857875823975,
1107
+ "loss": 0.739,
1108
+ "odds_ratio_loss": 2.5700552463531494,
1109
+ "rewards/accuracies": 0.856249988079071,
1110
+ "rewards/chosen": -0.0703795999288559,
1111
+ "rewards/margins": 0.5224061608314514,
1112
+ "rewards/rejected": -0.5927857756614685,
1113
+ "sft_loss": 0.48198458552360535,
1114
+ "step": 650
1115
+ },
1116
+ {
1117
+ "epoch": 0.06951972578330386,
1118
+ "grad_norm": 3.937659502029419,
1119
+ "learning_rate": 0.0003473684210526316,
1120
+ "logits/chosen": -4.144687175750732,
1121
+ "logits/rejected": -4.143020153045654,
1122
+ "logps/chosen": -0.6716140508651733,
1123
+ "logps/rejected": -6.169389247894287,
1124
+ "loss": 0.704,
1125
+ "odds_ratio_loss": 2.5956499576568604,
1126
+ "rewards/accuracies": 0.8583333492279053,
1127
+ "rewards/chosen": -0.06716141104698181,
1128
+ "rewards/margins": 0.549777626991272,
1129
+ "rewards/rejected": -0.6169389486312866,
1130
+ "sft_loss": 0.44441157579421997,
1131
+ "step": 660
1132
+ },
1133
+ {
1134
+ "epoch": 0.07057305496183876,
1135
+ "grad_norm": 4.0990681648254395,
1136
+ "learning_rate": 0.0003526315789473684,
1137
+ "logits/chosen": -4.208241939544678,
1138
+ "logits/rejected": -4.206976413726807,
1139
+ "logps/chosen": -0.6446244120597839,
1140
+ "logps/rejected": -5.427404403686523,
1141
+ "loss": 0.6791,
1142
+ "odds_ratio_loss": 2.412100315093994,
1143
+ "rewards/accuracies": 0.8520833253860474,
1144
+ "rewards/chosen": -0.06446244567632675,
1145
+ "rewards/margins": 0.4782780110836029,
1146
+ "rewards/rejected": -0.5427404642105103,
1147
+ "sft_loss": 0.4378568232059479,
1148
+ "step": 670
1149
+ },
1150
+ {
1151
+ "epoch": 0.07162638414037367,
1152
+ "grad_norm": 4.258831977844238,
1153
+ "learning_rate": 0.0003578947368421053,
1154
+ "logits/chosen": -4.341937065124512,
1155
+ "logits/rejected": -4.341104984283447,
1156
+ "logps/chosen": -0.7450679540634155,
1157
+ "logps/rejected": -4.367857933044434,
1158
+ "loss": 0.7874,
1159
+ "odds_ratio_loss": 2.708036184310913,
1160
+ "rewards/accuracies": 0.8083333373069763,
1161
+ "rewards/chosen": -0.07450678944587708,
1162
+ "rewards/margins": 0.3622789978981018,
1163
+ "rewards/rejected": -0.4367857873439789,
1164
+ "sft_loss": 0.5165507793426514,
1165
+ "step": 680
1166
+ },
1167
+ {
1168
+ "epoch": 0.07267971331890857,
1169
+ "grad_norm": 10.723002433776855,
1170
+ "learning_rate": 0.00036315789473684214,
1171
+ "logits/chosen": -4.344449996948242,
1172
+ "logits/rejected": -4.344136714935303,
1173
+ "logps/chosen": -0.8118324279785156,
1174
+ "logps/rejected": -3.4473140239715576,
1175
+ "loss": 0.852,
1176
+ "odds_ratio_loss": 2.871811628341675,
1177
+ "rewards/accuracies": 0.8104166388511658,
1178
+ "rewards/chosen": -0.08118324726819992,
1179
+ "rewards/margins": 0.26354819536209106,
1180
+ "rewards/rejected": -0.3447313904762268,
1181
+ "sft_loss": 0.5648209452629089,
1182
+ "step": 690
1183
+ },
1184
+ {
1185
+ "epoch": 0.07373304249744349,
1186
+ "grad_norm": 5.821114540100098,
1187
+ "learning_rate": 0.00036842105263157896,
1188
+ "logits/chosen": -4.07045316696167,
1189
+ "logits/rejected": -4.069707870483398,
1190
+ "logps/chosen": -0.8850536942481995,
1191
+ "logps/rejected": -5.450161933898926,
1192
+ "loss": 0.9181,
1193
+ "odds_ratio_loss": 3.165844678878784,
1194
+ "rewards/accuracies": 0.8416666388511658,
1195
+ "rewards/chosen": -0.08850537240505219,
1196
+ "rewards/margins": 0.4565107524394989,
1197
+ "rewards/rejected": -0.5450161695480347,
1198
+ "sft_loss": 0.6015486121177673,
1199
+ "step": 700
1200
+ },
1201
+ {
1202
+ "epoch": 0.07478637167597839,
1203
+ "grad_norm": 2.219165563583374,
1204
+ "learning_rate": 0.0003736842105263158,
1205
+ "logits/chosen": -3.7920498847961426,
1206
+ "logits/rejected": -3.7913458347320557,
1207
+ "logps/chosen": -0.7324831485748291,
1208
+ "logps/rejected": -4.996405601501465,
1209
+ "loss": 0.7707,
1210
+ "odds_ratio_loss": 2.7621920108795166,
1211
+ "rewards/accuracies": 0.8041666746139526,
1212
+ "rewards/chosen": -0.07324830442667007,
1213
+ "rewards/margins": 0.42639225721359253,
1214
+ "rewards/rejected": -0.499640554189682,
1215
+ "sft_loss": 0.49450069665908813,
1216
+ "step": 710
1217
+ },
1218
+ {
1219
+ "epoch": 0.07583970085451329,
1220
+ "grad_norm": 5.435701370239258,
1221
+ "learning_rate": 0.00037894736842105265,
1222
+ "logits/chosen": -4.781533718109131,
1223
+ "logits/rejected": -4.781356334686279,
1224
+ "logps/chosen": -0.6917392611503601,
1225
+ "logps/rejected": -4.243617057800293,
1226
+ "loss": 0.7244,
1227
+ "odds_ratio_loss": 2.5724165439605713,
1228
+ "rewards/accuracies": 0.8645833134651184,
1229
+ "rewards/chosen": -0.06917393207550049,
1230
+ "rewards/margins": 0.3551878333091736,
1231
+ "rewards/rejected": -0.4243617355823517,
1232
+ "sft_loss": 0.4671470522880554,
1233
+ "step": 720
1234
+ },
1235
+ {
1236
+ "epoch": 0.0768930300330482,
1237
+ "grad_norm": 4.722170352935791,
1238
+ "learning_rate": 0.00038421052631578946,
1239
+ "logits/chosen": -4.822556495666504,
1240
+ "logits/rejected": -4.822704792022705,
1241
+ "logps/chosen": -0.6918298006057739,
1242
+ "logps/rejected": -3.5728962421417236,
1243
+ "loss": 0.7267,
1244
+ "odds_ratio_loss": 2.5833892822265625,
1245
+ "rewards/accuracies": 0.8416666388511658,
1246
+ "rewards/chosen": -0.06918298453092575,
1247
+ "rewards/margins": 0.28810662031173706,
1248
+ "rewards/rejected": -0.3572896420955658,
1249
+ "sft_loss": 0.4683450758457184,
1250
+ "step": 730
1251
+ },
1252
+ {
1253
+ "epoch": 0.0779463592115831,
1254
+ "grad_norm": 2.800881862640381,
1255
+ "learning_rate": 0.00038947368421052633,
1256
+ "logits/chosen": -4.779958248138428,
1257
+ "logits/rejected": -4.780096530914307,
1258
+ "logps/chosen": -0.6186120510101318,
1259
+ "logps/rejected": -3.642204761505127,
1260
+ "loss": 0.6549,
1261
+ "odds_ratio_loss": 2.427229881286621,
1262
+ "rewards/accuracies": 0.8583333492279053,
1263
+ "rewards/chosen": -0.06186120584607124,
1264
+ "rewards/margins": 0.3023592531681061,
1265
+ "rewards/rejected": -0.3642204701900482,
1266
+ "sft_loss": 0.41214191913604736,
1267
+ "step": 740
1268
+ },
1269
+ {
1270
+ "epoch": 0.07899968839011802,
1271
+ "grad_norm": 5.068697452545166,
1272
+ "learning_rate": 0.00039473684210526315,
1273
+ "logits/chosen": -4.596142768859863,
1274
+ "logits/rejected": -4.595941066741943,
1275
+ "logps/chosen": -0.7380008697509766,
1276
+ "logps/rejected": -4.162142276763916,
1277
+ "loss": 0.7749,
1278
+ "odds_ratio_loss": 2.4714393615722656,
1279
+ "rewards/accuracies": 0.8333333134651184,
1280
+ "rewards/chosen": -0.07380008697509766,
1281
+ "rewards/margins": 0.34241411089897156,
1282
+ "rewards/rejected": -0.4162141978740692,
1283
+ "sft_loss": 0.527804970741272,
1284
+ "step": 750
1285
+ },
1286
+ {
1287
+ "epoch": 0.08005301756865292,
1288
+ "grad_norm": 3.4697628021240234,
1289
+ "learning_rate": 0.0004,
1290
+ "logits/chosen": -4.708858013153076,
1291
+ "logits/rejected": -4.708543300628662,
1292
+ "logps/chosen": -0.675973653793335,
1293
+ "logps/rejected": -4.2291083335876465,
1294
+ "loss": 0.7074,
1295
+ "odds_ratio_loss": 2.3750479221343994,
1296
+ "rewards/accuracies": 0.8416666388511658,
1297
+ "rewards/chosen": -0.06759736686944962,
1298
+ "rewards/margins": 0.3553134799003601,
1299
+ "rewards/rejected": -0.4229108393192291,
1300
+ "sft_loss": 0.4698618948459625,
1301
+ "step": 760
1302
+ },
1303
+ {
1304
+ "epoch": 0.08110634674718784,
1305
+ "grad_norm": 11.160131454467773,
1306
+ "learning_rate": 0.00040526315789473684,
1307
+ "logits/chosen": -5.051191329956055,
1308
+ "logits/rejected": -5.050747871398926,
1309
+ "logps/chosen": -0.7793533802032471,
1310
+ "logps/rejected": -5.09091854095459,
1311
+ "loss": 0.8153,
1312
+ "odds_ratio_loss": 2.829737901687622,
1313
+ "rewards/accuracies": 0.8291666507720947,
1314
+ "rewards/chosen": -0.07793533802032471,
1315
+ "rewards/margins": 0.43115654587745667,
1316
+ "rewards/rejected": -0.509091854095459,
1317
+ "sft_loss": 0.5323660969734192,
1318
+ "step": 770
1319
+ },
1320
+ {
1321
+ "epoch": 0.08215967592572274,
1322
+ "grad_norm": 3.8492166996002197,
1323
+ "learning_rate": 0.0004105263157894737,
1324
+ "logits/chosen": -4.681753158569336,
1325
+ "logits/rejected": -4.681027889251709,
1326
+ "logps/chosen": -0.67795729637146,
1327
+ "logps/rejected": -5.4289870262146,
1328
+ "loss": 0.7104,
1329
+ "odds_ratio_loss": 2.6001367568969727,
1330
+ "rewards/accuracies": 0.8520833253860474,
1331
+ "rewards/chosen": -0.06779572367668152,
1332
+ "rewards/margins": 0.4751029908657074,
1333
+ "rewards/rejected": -0.5428987145423889,
1334
+ "sft_loss": 0.45040473341941833,
1335
+ "step": 780
1336
+ },
1337
+ {
1338
+ "epoch": 0.08321300510425765,
1339
+ "grad_norm": 4.350924491882324,
1340
+ "learning_rate": 0.0004157894736842106,
1341
+ "logits/chosen": -5.090719699859619,
1342
+ "logits/rejected": -5.0898871421813965,
1343
+ "logps/chosen": -0.6309987902641296,
1344
+ "logps/rejected": -6.083089828491211,
1345
+ "loss": 0.6608,
1346
+ "odds_ratio_loss": 2.363413095474243,
1347
+ "rewards/accuracies": 0.8812500238418579,
1348
+ "rewards/chosen": -0.06309988349676132,
1349
+ "rewards/margins": 0.5452090501785278,
1350
+ "rewards/rejected": -0.6083090305328369,
1351
+ "sft_loss": 0.4244639277458191,
1352
+ "step": 790
1353
+ },
1354
+ {
1355
+ "epoch": 0.08426633428279255,
1356
+ "grad_norm": 4.629517078399658,
1357
+ "learning_rate": 0.00042105263157894734,
1358
+ "logits/chosen": -5.171376705169678,
1359
+ "logits/rejected": -5.170820713043213,
1360
+ "logps/chosen": -0.7821296453475952,
1361
+ "logps/rejected": -4.942056655883789,
1362
+ "loss": 0.8229,
1363
+ "odds_ratio_loss": 2.6525399684906006,
1364
+ "rewards/accuracies": 0.8208333253860474,
1365
+ "rewards/chosen": -0.07821296900510788,
1366
+ "rewards/margins": 0.41599270701408386,
1367
+ "rewards/rejected": -0.49420568346977234,
1368
+ "sft_loss": 0.557674765586853,
1369
+ "step": 800
1370
  }
1371
  ],
1372
  "logging_steps": 10,
 
1386
  "attributes": {}
1387
  }
1388
  },
1389
+ "total_flos": 5.905053933387448e+17,
1390
  "train_batch_size": 2,
1391
  "trial_name": null,
1392
  "trial_params": null