File size: 29,328 Bytes
cf6a0e7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 |
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 158,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"debug/policy_chosen_logits": 1.5589828491210938,
"debug/policy_chosen_logps": -258.5330810546875,
"debug/policy_rejected_logits": 1.9977812767028809,
"debug/policy_rejected_logps": -304.0617980957031,
"debug/reference_chosen_logps": -258.5330810546875,
"debug/reference_rejected_logps": -304.0617980957031,
"epoch": 0.006329113924050633,
"grad_norm": 5.915865288930895,
"learning_rate": 1e-06,
"logits/chosen": 1.5589828491210938,
"logits/rejected": 1.9977812767028809,
"logps/chosen": -258.5330810546875,
"logps/rejected": -304.0617980957031,
"loss": 0.5,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"debug/policy_chosen_logits": 0.8514629602432251,
"debug/policy_chosen_logps": -222.75827026367188,
"debug/policy_rejected_logits": 1.458482027053833,
"debug/policy_rejected_logps": -292.2978210449219,
"debug/reference_chosen_logps": -222.56484985351562,
"debug/reference_rejected_logps": -288.334716796875,
"epoch": 0.03164556962025317,
"grad_norm": 6.379094662882782,
"learning_rate": 1e-06,
"logits/chosen": 0.8514629602432251,
"logits/rejected": 1.458482027053833,
"logps/chosen": -222.75827026367188,
"logps/rejected": -292.2978210449219,
"loss": 0.4816,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.0019342182204127312,
"rewards/margins": 0.03769642859697342,
"rewards/rejected": -0.039630644023418427,
"step": 5
},
{
"debug/policy_chosen_logits": 1.1418471336364746,
"debug/policy_chosen_logps": -261.1085510253906,
"debug/policy_rejected_logits": 1.316489338874817,
"debug/policy_rejected_logps": -285.4795837402344,
"debug/reference_chosen_logps": -260.5736999511719,
"debug/reference_rejected_logps": -280.2572937011719,
"epoch": 0.06329113924050633,
"grad_norm": 12.885197123935471,
"learning_rate": 1e-06,
"logits/chosen": 1.1418471336364746,
"logits/rejected": 1.316489338874817,
"logps/chosen": -261.1085510253906,
"logps/rejected": -285.4795837402344,
"loss": 0.4629,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.005348391830921173,
"rewards/margins": 0.04687455669045448,
"rewards/rejected": -0.05222295597195625,
"step": 10
},
{
"debug/policy_chosen_logits": 1.4202030897140503,
"debug/policy_chosen_logps": -305.30096435546875,
"debug/policy_rejected_logits": 1.608795404434204,
"debug/policy_rejected_logps": -339.3628845214844,
"debug/reference_chosen_logps": -305.89739990234375,
"debug/reference_rejected_logps": -336.0830078125,
"epoch": 0.0949367088607595,
"grad_norm": 6.031873391940916,
"learning_rate": 1e-06,
"logits/chosen": 1.4202030897140503,
"logits/rejected": 1.608795404434204,
"logps/chosen": -305.30096435546875,
"logps/rejected": -339.3628845214844,
"loss": 0.462,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.005964324809610844,
"rewards/margins": 0.038763098418712616,
"rewards/rejected": -0.032798778265714645,
"step": 15
},
{
"debug/policy_chosen_logits": 1.2072508335113525,
"debug/policy_chosen_logps": -259.9560546875,
"debug/policy_rejected_logits": 1.4596980810165405,
"debug/policy_rejected_logps": -266.99896240234375,
"debug/reference_chosen_logps": -262.2249450683594,
"debug/reference_rejected_logps": -262.94488525390625,
"epoch": 0.12658227848101267,
"grad_norm": 5.929430664241562,
"learning_rate": 1e-06,
"logits/chosen": 1.2072508335113525,
"logits/rejected": 1.4596980810165405,
"logps/chosen": -259.9560546875,
"logps/rejected": -266.99896240234375,
"loss": 0.4568,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.02268880605697632,
"rewards/margins": 0.06322960555553436,
"rewards/rejected": -0.04054080322384834,
"step": 20
},
{
"debug/policy_chosen_logits": 0.9236510992050171,
"debug/policy_chosen_logps": -263.56951904296875,
"debug/policy_rejected_logits": 1.2153400182724,
"debug/policy_rejected_logps": -276.596923828125,
"debug/reference_chosen_logps": -264.62982177734375,
"debug/reference_rejected_logps": -272.1346130371094,
"epoch": 0.15822784810126583,
"grad_norm": 6.795022163630081,
"learning_rate": 1e-06,
"logits/chosen": 0.9236510992050171,
"logits/rejected": 1.2153400182724,
"logps/chosen": -263.56951904296875,
"logps/rejected": -276.596923828125,
"loss": 0.4609,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.010603101924061775,
"rewards/margins": 0.05522637441754341,
"rewards/rejected": -0.04462327063083649,
"step": 25
},
{
"debug/policy_chosen_logits": 0.8845943212509155,
"debug/policy_chosen_logps": -232.0923309326172,
"debug/policy_rejected_logits": 1.284155011177063,
"debug/policy_rejected_logps": -287.80389404296875,
"debug/reference_chosen_logps": -233.78652954101562,
"debug/reference_rejected_logps": -284.5167236328125,
"epoch": 0.189873417721519,
"grad_norm": 6.4445556777608255,
"learning_rate": 1e-06,
"logits/chosen": 0.8845943212509155,
"logits/rejected": 1.284155011177063,
"logps/chosen": -232.0923309326172,
"logps/rejected": -287.80389404296875,
"loss": 0.4609,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.016941774636507034,
"rewards/margins": 0.04981378838419914,
"rewards/rejected": -0.03287201002240181,
"step": 30
},
{
"debug/policy_chosen_logits": 1.1807546615600586,
"debug/policy_chosen_logps": -263.7032165527344,
"debug/policy_rejected_logits": 1.3615357875823975,
"debug/policy_rejected_logps": -295.0924377441406,
"debug/reference_chosen_logps": -264.52520751953125,
"debug/reference_rejected_logps": -289.96612548828125,
"epoch": 0.22151898734177214,
"grad_norm": 6.39988158389298,
"learning_rate": 1e-06,
"logits/chosen": 1.1807546615600586,
"logits/rejected": 1.3615357875823975,
"logps/chosen": -263.7032165527344,
"logps/rejected": -295.0924377441406,
"loss": 0.4495,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.008219520561397076,
"rewards/margins": 0.05948234722018242,
"rewards/rejected": -0.05126282572746277,
"step": 35
},
{
"debug/policy_chosen_logits": 0.918303370475769,
"debug/policy_chosen_logps": -224.531982421875,
"debug/policy_rejected_logits": 1.2155705690383911,
"debug/policy_rejected_logps": -266.7242431640625,
"debug/reference_chosen_logps": -227.6628875732422,
"debug/reference_rejected_logps": -259.6141052246094,
"epoch": 0.25316455696202533,
"grad_norm": 8.66786179216246,
"learning_rate": 1e-06,
"logits/chosen": 0.918303370475769,
"logits/rejected": 1.2155705690383911,
"logps/chosen": -224.531982421875,
"logps/rejected": -266.7242431640625,
"loss": 0.4495,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.031309086829423904,
"rewards/margins": 0.10241049528121948,
"rewards/rejected": -0.07110141217708588,
"step": 40
},
{
"debug/policy_chosen_logits": 0.8259471654891968,
"debug/policy_chosen_logps": -230.60250854492188,
"debug/policy_rejected_logits": 1.2626183032989502,
"debug/policy_rejected_logps": -303.4950866699219,
"debug/reference_chosen_logps": -230.0920867919922,
"debug/reference_rejected_logps": -302.10784912109375,
"epoch": 0.2848101265822785,
"grad_norm": 6.143825464676947,
"learning_rate": 1e-06,
"logits/chosen": 0.8259471654891968,
"logits/rejected": 1.2626183032989502,
"logps/chosen": -230.60250854492188,
"logps/rejected": -303.4950866699219,
"loss": 0.4802,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0051041776314377785,
"rewards/margins": 0.008768384344875813,
"rewards/rejected": -0.013872561976313591,
"step": 45
},
{
"debug/policy_chosen_logits": 0.9409104585647583,
"debug/policy_chosen_logps": -241.2617950439453,
"debug/policy_rejected_logits": 1.2857184410095215,
"debug/policy_rejected_logps": -291.4665222167969,
"debug/reference_chosen_logps": -244.69577026367188,
"debug/reference_rejected_logps": -284.1947021484375,
"epoch": 0.31645569620253167,
"grad_norm": 8.46649937885156,
"learning_rate": 1e-06,
"logits/chosen": 0.9409104585647583,
"logits/rejected": 1.2857184410095215,
"logps/chosen": -241.2617950439453,
"logps/rejected": -291.4665222167969,
"loss": 0.4411,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.034339673817157745,
"rewards/margins": 0.10705772787332535,
"rewards/rejected": -0.0727180689573288,
"step": 50
},
{
"debug/policy_chosen_logits": 0.8741863369941711,
"debug/policy_chosen_logps": -250.87057495117188,
"debug/policy_rejected_logits": 1.258837103843689,
"debug/policy_rejected_logps": -289.27069091796875,
"debug/reference_chosen_logps": -255.7415771484375,
"debug/reference_rejected_logps": -283.4430847167969,
"epoch": 0.34810126582278483,
"grad_norm": 9.716442001601763,
"learning_rate": 1e-06,
"logits/chosen": 0.8741863369941711,
"logits/rejected": 1.258837103843689,
"logps/chosen": -250.87057495117188,
"logps/rejected": -289.27069091796875,
"loss": 0.4436,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.048710085451602936,
"rewards/margins": 0.10698604583740234,
"rewards/rejected": -0.05827596038579941,
"step": 55
},
{
"debug/policy_chosen_logits": 0.6640017628669739,
"debug/policy_chosen_logps": -269.62237548828125,
"debug/policy_rejected_logits": 0.8445190191268921,
"debug/policy_rejected_logps": -291.27325439453125,
"debug/reference_chosen_logps": -269.4212951660156,
"debug/reference_rejected_logps": -285.77349853515625,
"epoch": 0.379746835443038,
"grad_norm": 7.925495242886814,
"learning_rate": 1e-06,
"logits/chosen": 0.6640017628669739,
"logits/rejected": 0.8445190191268921,
"logps/chosen": -269.62237548828125,
"logps/rejected": -291.27325439453125,
"loss": 0.438,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.002010857220739126,
"rewards/margins": 0.052986472845077515,
"rewards/rejected": -0.05499732494354248,
"step": 60
},
{
"debug/policy_chosen_logits": 1.0082881450653076,
"debug/policy_chosen_logps": -241.1085662841797,
"debug/policy_rejected_logits": 1.5921090841293335,
"debug/policy_rejected_logps": -303.08465576171875,
"debug/reference_chosen_logps": -245.0981903076172,
"debug/reference_rejected_logps": -300.36328125,
"epoch": 0.41139240506329117,
"grad_norm": 7.096776814684128,
"learning_rate": 1e-06,
"logits/chosen": 1.0082881450653076,
"logits/rejected": 1.5921090841293335,
"logps/chosen": -241.1085662841797,
"logps/rejected": -303.08465576171875,
"loss": 0.4602,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.03989603370428085,
"rewards/margins": 0.06710983067750931,
"rewards/rejected": -0.027213791385293007,
"step": 65
},
{
"debug/policy_chosen_logits": 0.7952272295951843,
"debug/policy_chosen_logps": -252.08798217773438,
"debug/policy_rejected_logits": 1.0696840286254883,
"debug/policy_rejected_logps": -287.27301025390625,
"debug/reference_chosen_logps": -253.79379272460938,
"debug/reference_rejected_logps": -279.5188903808594,
"epoch": 0.4430379746835443,
"grad_norm": 7.584678181203943,
"learning_rate": 1e-06,
"logits/chosen": 0.7952272295951843,
"logits/rejected": 1.0696840286254883,
"logps/chosen": -252.08798217773438,
"logps/rejected": -287.27301025390625,
"loss": 0.4335,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.017058206722140312,
"rewards/margins": 0.09459935128688812,
"rewards/rejected": -0.07754113525152206,
"step": 70
},
{
"debug/policy_chosen_logits": 0.9075101613998413,
"debug/policy_chosen_logps": -218.43185424804688,
"debug/policy_rejected_logits": 1.0321990251541138,
"debug/policy_rejected_logps": -245.87973022460938,
"debug/reference_chosen_logps": -221.93466186523438,
"debug/reference_rejected_logps": -243.0590057373047,
"epoch": 0.47468354430379744,
"grad_norm": 6.725884442562555,
"learning_rate": 1e-06,
"logits/chosen": 0.9075101613998413,
"logits/rejected": 1.0321990251541138,
"logps/chosen": -218.43185424804688,
"logps/rejected": -245.87973022460938,
"loss": 0.4441,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.03502799943089485,
"rewards/margins": 0.06323517113924026,
"rewards/rejected": -0.028207167983055115,
"step": 75
},
{
"debug/policy_chosen_logits": 0.6510931253433228,
"debug/policy_chosen_logps": -218.7671356201172,
"debug/policy_rejected_logits": 0.8215225338935852,
"debug/policy_rejected_logps": -276.33111572265625,
"debug/reference_chosen_logps": -222.28018188476562,
"debug/reference_rejected_logps": -267.1961364746094,
"epoch": 0.5063291139240507,
"grad_norm": 7.155350358859657,
"learning_rate": 1e-06,
"logits/chosen": 0.6510931253433228,
"logits/rejected": 0.8215225338935852,
"logps/chosen": -218.7671356201172,
"logps/rejected": -276.33111572265625,
"loss": 0.4348,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.03513062372803688,
"rewards/margins": 0.1264806091785431,
"rewards/rejected": -0.09134997427463531,
"step": 80
},
{
"debug/policy_chosen_logits": 0.9534305334091187,
"debug/policy_chosen_logps": -250.000244140625,
"debug/policy_rejected_logits": 1.0431879758834839,
"debug/policy_rejected_logps": -275.9551086425781,
"debug/reference_chosen_logps": -250.7502899169922,
"debug/reference_rejected_logps": -268.43548583984375,
"epoch": 0.5379746835443038,
"grad_norm": 26.837408837144096,
"learning_rate": 1e-06,
"logits/chosen": 0.9534305334091187,
"logits/rejected": 1.0431879758834839,
"logps/chosen": -250.000244140625,
"logps/rejected": -275.9551086425781,
"loss": 0.4926,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.007500249892473221,
"rewards/margins": 0.08269646763801575,
"rewards/rejected": -0.07519622147083282,
"step": 85
},
{
"debug/policy_chosen_logits": 1.1253650188446045,
"debug/policy_chosen_logps": -240.8356475830078,
"debug/policy_rejected_logits": 1.2428481578826904,
"debug/policy_rejected_logps": -265.67266845703125,
"debug/reference_chosen_logps": -245.643798828125,
"debug/reference_rejected_logps": -261.6888122558594,
"epoch": 0.569620253164557,
"grad_norm": 8.938690009286978,
"learning_rate": 1e-06,
"logits/chosen": 1.1253650188446045,
"logits/rejected": 1.2428481578826904,
"logps/chosen": -240.8356475830078,
"logps/rejected": -265.67266845703125,
"loss": 0.4314,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.04808169603347778,
"rewards/margins": 0.08791980892419815,
"rewards/rejected": -0.03983811289072037,
"step": 90
},
{
"debug/policy_chosen_logits": 0.9913564920425415,
"debug/policy_chosen_logps": -247.68453979492188,
"debug/policy_rejected_logits": 1.167474389076233,
"debug/policy_rejected_logps": -284.51300048828125,
"debug/reference_chosen_logps": -250.7725067138672,
"debug/reference_rejected_logps": -276.8506774902344,
"epoch": 0.6012658227848101,
"grad_norm": 7.214786092625251,
"learning_rate": 1e-06,
"logits/chosen": 0.9913564920425415,
"logits/rejected": 1.167474389076233,
"logps/chosen": -247.68453979492188,
"logps/rejected": -284.51300048828125,
"loss": 0.4481,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.030879342928528786,
"rewards/margins": 0.10750222206115723,
"rewards/rejected": -0.07662288844585419,
"step": 95
},
{
"debug/policy_chosen_logits": 1.548004388809204,
"debug/policy_chosen_logps": -286.9696350097656,
"debug/policy_rejected_logits": 1.2569023370742798,
"debug/policy_rejected_logps": -255.9474639892578,
"debug/reference_chosen_logps": -288.26263427734375,
"debug/reference_rejected_logps": -252.56982421875,
"epoch": 0.6329113924050633,
"grad_norm": 7.098617456221662,
"learning_rate": 1e-06,
"logits/chosen": 1.548004388809204,
"logits/rejected": 1.2569023370742798,
"logps/chosen": -286.9696350097656,
"logps/rejected": -255.9474639892578,
"loss": 0.4429,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.012930279597640038,
"rewards/margins": 0.04670674726366997,
"rewards/rejected": -0.03377646952867508,
"step": 100
},
{
"epoch": 0.6329113924050633,
"eval_debug/policy_chosen_logits": 1.2252188920974731,
"eval_debug/policy_chosen_logps": -250.68939208984375,
"eval_debug/policy_rejected_logits": 1.4343616962432861,
"eval_debug/policy_rejected_logps": -287.45086669921875,
"eval_debug/reference_chosen_logps": -255.34970092773438,
"eval_debug/reference_rejected_logps": -283.57049560546875,
"eval_logits/chosen": 1.2252188920974731,
"eval_logits/rejected": 1.4343616962432861,
"eval_logps/chosen": -250.68939208984375,
"eval_logps/rejected": -287.45086669921875,
"eval_loss": 0.43653252720832825,
"eval_rewards/accuracies": 0.5769230723381042,
"eval_rewards/chosen": 0.04660310223698616,
"eval_rewards/margins": 0.08540700376033783,
"eval_rewards/rejected": -0.03880389407277107,
"eval_runtime": 19.8549,
"eval_samples_per_second": 20.146,
"eval_steps_per_second": 0.655,
"step": 100
},
{
"debug/policy_chosen_logits": 1.011919617652893,
"debug/policy_chosen_logps": -279.73260498046875,
"debug/policy_rejected_logits": 1.211625337600708,
"debug/policy_rejected_logps": -298.412109375,
"debug/reference_chosen_logps": -281.5310974121094,
"debug/reference_rejected_logps": -292.20550537109375,
"epoch": 0.6645569620253164,
"grad_norm": 6.340425768293679,
"learning_rate": 1e-06,
"logits/chosen": 1.011919617652893,
"logits/rejected": 1.211625337600708,
"logps/chosen": -279.73260498046875,
"logps/rejected": -298.412109375,
"loss": 0.4362,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.017984820529818535,
"rewards/margins": 0.08005066215991974,
"rewards/rejected": -0.06206584721803665,
"step": 105
},
{
"debug/policy_chosen_logits": 1.0565037727355957,
"debug/policy_chosen_logps": -251.0978546142578,
"debug/policy_rejected_logits": 1.3947855234146118,
"debug/policy_rejected_logps": -316.4710998535156,
"debug/reference_chosen_logps": -253.4007110595703,
"debug/reference_rejected_logps": -309.9458923339844,
"epoch": 0.6962025316455697,
"grad_norm": 20.34165260676491,
"learning_rate": 1e-06,
"logits/chosen": 1.0565037727355957,
"logits/rejected": 1.3947855234146118,
"logps/chosen": -251.0978546142578,
"logps/rejected": -316.4710998535156,
"loss": 0.4383,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.023028511554002762,
"rewards/margins": 0.08828048408031464,
"rewards/rejected": -0.06525196880102158,
"step": 110
},
{
"debug/policy_chosen_logits": 0.8845629692077637,
"debug/policy_chosen_logps": -241.9716339111328,
"debug/policy_rejected_logits": 1.229775071144104,
"debug/policy_rejected_logps": -321.60186767578125,
"debug/reference_chosen_logps": -246.28433227539062,
"debug/reference_rejected_logps": -314.5198974609375,
"epoch": 0.7278481012658228,
"grad_norm": 7.789166803514712,
"learning_rate": 1e-06,
"logits/chosen": 0.8845629692077637,
"logits/rejected": 1.229775071144104,
"logps/chosen": -241.9716339111328,
"logps/rejected": -321.60186767578125,
"loss": 0.4426,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.043126728385686874,
"rewards/margins": 0.11394629627466202,
"rewards/rejected": -0.07081956416368484,
"step": 115
},
{
"debug/policy_chosen_logits": 0.6471331119537354,
"debug/policy_chosen_logps": -232.4429168701172,
"debug/policy_rejected_logits": 0.9131924510002136,
"debug/policy_rejected_logps": -279.41290283203125,
"debug/reference_chosen_logps": -237.39102172851562,
"debug/reference_rejected_logps": -273.61090087890625,
"epoch": 0.759493670886076,
"grad_norm": 7.468046301754059,
"learning_rate": 1e-06,
"logits/chosen": 0.6471331119537354,
"logits/rejected": 0.9131924510002136,
"logps/chosen": -232.4429168701172,
"logps/rejected": -279.41290283203125,
"loss": 0.4131,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.04948071017861366,
"rewards/margins": 0.1075005754828453,
"rewards/rejected": -0.05801987648010254,
"step": 120
},
{
"debug/policy_chosen_logits": 0.9338349103927612,
"debug/policy_chosen_logps": -260.35235595703125,
"debug/policy_rejected_logits": 1.0534359216690063,
"debug/policy_rejected_logps": -297.56683349609375,
"debug/reference_chosen_logps": -264.9391174316406,
"debug/reference_rejected_logps": -289.8217468261719,
"epoch": 0.7911392405063291,
"grad_norm": 8.935461685140815,
"learning_rate": 1e-06,
"logits/chosen": 0.9338349103927612,
"logits/rejected": 1.0534359216690063,
"logps/chosen": -260.35235595703125,
"logps/rejected": -297.56683349609375,
"loss": 0.4303,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 0.045867711305618286,
"rewards/margins": 0.12331867218017578,
"rewards/rejected": -0.0774509608745575,
"step": 125
},
{
"debug/policy_chosen_logits": 0.8780291676521301,
"debug/policy_chosen_logps": -284.29205322265625,
"debug/policy_rejected_logits": 0.8824840784072876,
"debug/policy_rejected_logps": -287.76690673828125,
"debug/reference_chosen_logps": -286.41943359375,
"debug/reference_rejected_logps": -283.56903076171875,
"epoch": 0.8227848101265823,
"grad_norm": 6.948216331668783,
"learning_rate": 1e-06,
"logits/chosen": 0.8780291676521301,
"logits/rejected": 0.8824840784072876,
"logps/chosen": -284.29205322265625,
"logps/rejected": -287.76690673828125,
"loss": 0.4375,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.02127380482852459,
"rewards/margins": 0.06325232237577438,
"rewards/rejected": -0.041978511959314346,
"step": 130
},
{
"debug/policy_chosen_logits": 1.165907859802246,
"debug/policy_chosen_logps": -255.9198455810547,
"debug/policy_rejected_logits": 1.4020473957061768,
"debug/policy_rejected_logps": -301.6413879394531,
"debug/reference_chosen_logps": -260.84521484375,
"debug/reference_rejected_logps": -295.99700927734375,
"epoch": 0.8544303797468354,
"grad_norm": 6.0797186914906485,
"learning_rate": 1e-06,
"logits/chosen": 1.165907859802246,
"logits/rejected": 1.4020473957061768,
"logps/chosen": -255.9198455810547,
"logps/rejected": -301.6413879394531,
"loss": 0.4418,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.049253594130277634,
"rewards/margins": 0.10569741576910019,
"rewards/rejected": -0.056443821638822556,
"step": 135
},
{
"debug/policy_chosen_logits": 0.9684173464775085,
"debug/policy_chosen_logps": -240.7368927001953,
"debug/policy_rejected_logits": 1.522164225578308,
"debug/policy_rejected_logps": -300.8490295410156,
"debug/reference_chosen_logps": -244.41757202148438,
"debug/reference_rejected_logps": -289.0794372558594,
"epoch": 0.8860759493670886,
"grad_norm": 6.850074566718433,
"learning_rate": 1e-06,
"logits/chosen": 0.9684173464775085,
"logits/rejected": 1.522164225578308,
"logps/chosen": -240.7368927001953,
"logps/rejected": -300.8490295410156,
"loss": 0.43,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.03680698946118355,
"rewards/margins": 0.1545029729604721,
"rewards/rejected": -0.11769597232341766,
"step": 140
},
{
"debug/policy_chosen_logits": 1.095474123954773,
"debug/policy_chosen_logps": -281.7500305175781,
"debug/policy_rejected_logits": 1.0368950366973877,
"debug/policy_rejected_logps": -281.8016052246094,
"debug/reference_chosen_logps": -285.4373474121094,
"debug/reference_rejected_logps": -278.67181396484375,
"epoch": 0.9177215189873418,
"grad_norm": 6.330596887372699,
"learning_rate": 1e-06,
"logits/chosen": 1.095474123954773,
"logits/rejected": 1.0368950366973877,
"logps/chosen": -281.7500305175781,
"logps/rejected": -281.8016052246094,
"loss": 0.4243,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.036873430013656616,
"rewards/margins": 0.06817178428173065,
"rewards/rejected": -0.03129836544394493,
"step": 145
},
{
"debug/policy_chosen_logits": 0.9509929418563843,
"debug/policy_chosen_logps": -247.018310546875,
"debug/policy_rejected_logits": 1.1111629009246826,
"debug/policy_rejected_logps": -272.07684326171875,
"debug/reference_chosen_logps": -250.40658569335938,
"debug/reference_rejected_logps": -265.6427001953125,
"epoch": 0.9493670886075949,
"grad_norm": 8.073046871358697,
"learning_rate": 1e-06,
"logits/chosen": 0.9509929418563843,
"logits/rejected": 1.1111629009246826,
"logps/chosen": -247.018310546875,
"logps/rejected": -272.07684326171875,
"loss": 0.4234,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.03388286381959915,
"rewards/margins": 0.09822405129671097,
"rewards/rejected": -0.06434118002653122,
"step": 150
},
{
"debug/policy_chosen_logits": 0.6622827053070068,
"debug/policy_chosen_logps": -237.2403106689453,
"debug/policy_rejected_logits": 0.8520939946174622,
"debug/policy_rejected_logps": -286.5059509277344,
"debug/reference_chosen_logps": -241.94467163085938,
"debug/reference_rejected_logps": -278.73272705078125,
"epoch": 0.9810126582278481,
"grad_norm": 7.904037537559287,
"learning_rate": 1e-06,
"logits/chosen": 0.6622827053070068,
"logits/rejected": 0.8520939946174622,
"logps/chosen": -237.2403106689453,
"logps/rejected": -286.5059509277344,
"loss": 0.423,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.04704369604587555,
"rewards/margins": 0.12477605044841766,
"rewards/rejected": -0.07773236930370331,
"step": 155
},
{
"epoch": 1.0,
"step": 158,
"total_flos": 0.0,
"train_loss": 0.44511839181562013,
"train_runtime": 1281.3009,
"train_samples_per_second": 7.867,
"train_steps_per_second": 0.123
}
],
"logging_steps": 5,
"max_steps": 158,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
|