{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.957345971563981, "eval_steps": 100, "global_step": 156, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 132.97353908293687, "learning_rate": 3.125e-08, "logits/chosen": 123.11854553222656, "logits/rejected": 97.00198364257812, "logps/chosen": -425.18585205078125, "logps/rejected": -424.1869201660156, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.19, "grad_norm": 206.0883100010928, "learning_rate": 3.1249999999999997e-07, "logits/chosen": 117.39097595214844, "logits/rejected": 136.3163299560547, "logps/chosen": -442.6399230957031, "logps/rejected": -524.91015625, "loss": 0.7186, "rewards/accuracies": 0.4930555522441864, "rewards/chosen": 0.037425246089696884, "rewards/margins": 0.07718456536531448, "rewards/rejected": -0.0397593155503273, "step": 10 }, { "epoch": 0.38, "grad_norm": 114.8435303205146, "learning_rate": 4.989935734988097e-07, "logits/chosen": 125.3319091796875, "logits/rejected": 132.9754638671875, "logps/chosen": -422.8042907714844, "logps/rejected": -491.63226318359375, "loss": 0.6164, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.17301546037197113, "rewards/margins": 0.379099041223526, "rewards/rejected": -0.20608356595039368, "step": 20 }, { "epoch": 0.57, "grad_norm": 99.27143207986335, "learning_rate": 4.877641290737883e-07, "logits/chosen": 122.47686767578125, "logits/rejected": 125.91865539550781, "logps/chosen": -466.9618225097656, "logps/rejected": -540.3817138671875, "loss": 0.5813, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.5920838117599487, "rewards/margins": 1.0662639141082764, "rewards/rejected": -1.658347725868225, "step": 30 }, { "epoch": 0.76, "grad_norm": 109.14521515462766, "learning_rate": 4.646121984004665e-07, "logits/chosen": 124.97059631347656, "logits/rejected": 119.9173583984375, "logps/chosen": -497.7147521972656, "logps/rejected": -527.3887939453125, "loss": 0.5426, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.2266700267791748, "rewards/margins": 0.9045358896255493, "rewards/rejected": -2.1312055587768555, "step": 40 }, { "epoch": 0.95, "grad_norm": 115.6113002085735, "learning_rate": 4.3069871595684787e-07, "logits/chosen": 132.8910369873047, "logits/rejected": 133.22190856933594, "logps/chosen": -520.63037109375, "logps/rejected": -549.1149291992188, "loss": 0.5202, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.8903814554214478, "rewards/margins": 1.0029468536376953, "rewards/rejected": -2.8933284282684326, "step": 50 }, { "epoch": 1.14, "grad_norm": 46.9650475313439, "learning_rate": 3.877242453630256e-07, "logits/chosen": 131.47854614257812, "logits/rejected": 134.71681213378906, "logps/chosen": -481.8072814941406, "logps/rejected": -534.0516357421875, "loss": 0.2837, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.2520891427993774, "rewards/margins": 2.2355263233184814, "rewards/rejected": -3.4876155853271484, "step": 60 }, { "epoch": 1.33, "grad_norm": 47.26485069523079, "learning_rate": 3.378437060203357e-07, "logits/chosen": 126.1490707397461, "logits/rejected": 126.75111389160156, "logps/chosen": -452.6795349121094, "logps/rejected": -579.5133056640625, "loss": 0.1756, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.188291072845459, "rewards/margins": 2.6805100440979004, "rewards/rejected": -3.868800640106201, "step": 70 }, { "epoch": 1.52, "grad_norm": 46.43874254029814, "learning_rate": 2.8355831645441387e-07, "logits/chosen": 127.46858978271484, "logits/rejected": 128.4056396484375, "logps/chosen": -514.4637451171875, "logps/rejected": -621.2301635742188, "loss": 0.1711, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.268179178237915, "rewards/margins": 3.4565296173095703, "rewards/rejected": -4.724708557128906, "step": 80 }, { "epoch": 1.71, "grad_norm": 49.11808633093636, "learning_rate": 2.2759017277414164e-07, "logits/chosen": 112.5447998046875, "logits/rejected": 114.98893737792969, "logps/chosen": -497.70001220703125, "logps/rejected": -589.730224609375, "loss": 0.1524, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.7766218185424805, "rewards/margins": 3.491291046142578, "rewards/rejected": -5.2679123878479, "step": 90 }, { "epoch": 1.9, "grad_norm": 47.47224806448749, "learning_rate": 1.7274575140626315e-07, "logits/chosen": 124.581787109375, "logits/rejected": 115.68563079833984, "logps/chosen": -516.1900634765625, "logps/rejected": -632.6817626953125, "loss": 0.1623, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.8899990320205688, "rewards/margins": 3.6050896644592285, "rewards/rejected": -5.495089054107666, "step": 100 }, { "epoch": 1.9, "eval_logits/chosen": 93.8502197265625, "eval_logits/rejected": 87.7247543334961, "eval_logps/chosen": -512.8825073242188, "eval_logps/rejected": -541.5043334960938, "eval_loss": 0.48611319065093994, "eval_rewards/accuracies": 0.6770833134651184, "eval_rewards/chosen": -2.9739017486572266, "eval_rewards/margins": 1.5238369703292847, "eval_rewards/rejected": -4.497739315032959, "eval_runtime": 53.4905, "eval_samples_per_second": 14.021, "eval_steps_per_second": 0.449, "step": 100 }, { "epoch": 2.09, "grad_norm": 25.494387206609645, "learning_rate": 1.2177518064852348e-07, "logits/chosen": 102.986083984375, "logits/rejected": 116.60546875, "logps/chosen": -538.074951171875, "logps/rejected": -667.3218383789062, "loss": 0.1318, "rewards/accuracies": 0.96875, "rewards/chosen": -2.1493353843688965, "rewards/margins": 3.4991326332092285, "rewards/rejected": -5.648468017578125, "step": 110 }, { "epoch": 2.27, "grad_norm": 24.60483354043265, "learning_rate": 7.723433775328384e-08, "logits/chosen": 113.220703125, "logits/rejected": 114.29705810546875, "logps/chosen": -522.1823120117188, "logps/rejected": -628.0721435546875, "loss": 0.0837, "rewards/accuracies": 1.0, "rewards/chosen": -2.2452592849731445, "rewards/margins": 3.996518611907959, "rewards/rejected": -6.2417778968811035, "step": 120 }, { "epoch": 2.46, "grad_norm": 32.75955455536007, "learning_rate": 4.1356686569674335e-08, "logits/chosen": 115.95035552978516, "logits/rejected": 120.65645599365234, "logps/chosen": -537.8087158203125, "logps/rejected": -653.7862548828125, "loss": 0.0781, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.2025058269500732, "rewards/margins": 4.200939178466797, "rewards/rejected": -6.403443813323975, "step": 130 }, { "epoch": 2.65, "grad_norm": 23.375967561613557, "learning_rate": 1.5941282340065697e-08, "logits/chosen": 101.51383972167969, "logits/rejected": 102.2659683227539, "logps/chosen": -499.16229248046875, "logps/rejected": -645.9388427734375, "loss": 0.0791, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.6822938919067383, "rewards/margins": 4.259942054748535, "rewards/rejected": -6.942234992980957, "step": 140 }, { "epoch": 2.84, "grad_norm": 27.725216044545164, "learning_rate": 2.2625595580163247e-09, "logits/chosen": 108.08512878417969, "logits/rejected": 121.6434097290039, "logps/chosen": -524.5687866210938, "logps/rejected": -647.0615844726562, "loss": 0.079, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.486959457397461, "rewards/margins": 4.458805084228516, "rewards/rejected": -6.945765018463135, "step": 150 }, { "epoch": 2.96, "step": 156, "total_flos": 0.0, "train_loss": 0.28389122929328525, "train_runtime": 1811.0132, "train_samples_per_second": 11.182, "train_steps_per_second": 0.086 } ], "logging_steps": 10, "max_steps": 156, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }