0.001_idpo_same_scratch_iter_1 / trainer_state.json
ShenaoZ's picture
Model save
7a394ab verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984301412872841,
"eval_steps": 500,
"global_step": 159,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"learning_rate": 3.125e-08,
"logits/chosen": -2.99273681640625,
"logits/rejected": -2.9600584506988525,
"logps/chosen": -246.7786407470703,
"logps/rejected": -318.69512939453125,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.06,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -2.851263999938965,
"logits/rejected": -2.8660426139831543,
"logps/chosen": -268.8946533203125,
"logps/rejected": -260.3899230957031,
"loss": 0.6929,
"rewards/accuracies": 0.5069444179534912,
"rewards/chosen": -8.18173648440279e-05,
"rewards/margins": 0.00074238411616534,
"rewards/rejected": -0.0008242016192525625,
"step": 10
},
{
"epoch": 0.13,
"learning_rate": 4.990353313429303e-07,
"logits/chosen": -2.908505916595459,
"logits/rejected": -2.89821720123291,
"logps/chosen": -258.009033203125,
"logps/rejected": -228.437255859375,
"loss": 0.6879,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.005000133998692036,
"rewards/margins": 0.00945957750082016,
"rewards/rejected": -0.004459443502128124,
"step": 20
},
{
"epoch": 0.19,
"learning_rate": 4.882681251368548e-07,
"logits/chosen": -2.7995059490203857,
"logits/rejected": -2.752720832824707,
"logps/chosen": -290.11431884765625,
"logps/rejected": -256.45672607421875,
"loss": 0.6718,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.03622695803642273,
"rewards/margins": 0.06187431141734123,
"rewards/rejected": -0.0256473608314991,
"step": 30
},
{
"epoch": 0.25,
"learning_rate": 4.6604720940421207e-07,
"logits/chosen": -2.80169677734375,
"logits/rejected": -2.7976982593536377,
"logps/chosen": -295.5770568847656,
"logps/rejected": -270.68853759765625,
"loss": 0.6487,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.013251985423266888,
"rewards/margins": 0.11587367206811905,
"rewards/rejected": -0.1291256546974182,
"step": 40
},
{
"epoch": 0.31,
"learning_rate": 4.3344075855595097e-07,
"logits/chosen": -2.837135076522827,
"logits/rejected": -2.787645101547241,
"logps/chosen": -279.5509338378906,
"logps/rejected": -267.29339599609375,
"loss": 0.6334,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.16008788347244263,
"rewards/margins": 0.17389968037605286,
"rewards/rejected": -0.3339875638484955,
"step": 50
},
{
"epoch": 0.38,
"learning_rate": 3.920161866827889e-07,
"logits/chosen": -2.8221335411071777,
"logits/rejected": -2.773529529571533,
"logps/chosen": -317.88067626953125,
"logps/rejected": -303.8488464355469,
"loss": 0.6139,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.28249022364616394,
"rewards/margins": 0.21050170063972473,
"rewards/rejected": -0.49299192428588867,
"step": 60
},
{
"epoch": 0.44,
"learning_rate": 3.4376480090239047e-07,
"logits/chosen": -2.821988344192505,
"logits/rejected": -2.762542247772217,
"logps/chosen": -319.9324951171875,
"logps/rejected": -310.75811767578125,
"loss": 0.5811,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.38823598623275757,
"rewards/margins": 0.3824814260005951,
"rewards/rejected": -0.7707175016403198,
"step": 70
},
{
"epoch": 0.5,
"learning_rate": 2.910060778827554e-07,
"logits/chosen": -2.84806489944458,
"logits/rejected": -2.8013827800750732,
"logps/chosen": -344.35235595703125,
"logps/rejected": -294.4729309082031,
"loss": 0.6021,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.47417935729026794,
"rewards/margins": 0.2662152647972107,
"rewards/rejected": -0.7403945922851562,
"step": 80
},
{
"epoch": 0.57,
"learning_rate": 2.3627616503391812e-07,
"logits/chosen": -2.80991792678833,
"logits/rejected": -2.7958946228027344,
"logps/chosen": -332.05419921875,
"logps/rejected": -369.29388427734375,
"loss": 0.5745,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.49079465866088867,
"rewards/margins": 0.41097983717918396,
"rewards/rejected": -0.9017744064331055,
"step": 90
},
{
"epoch": 0.63,
"learning_rate": 1.8220596619089573e-07,
"logits/chosen": -2.7608766555786133,
"logits/rejected": -2.704923629760742,
"logps/chosen": -319.58941650390625,
"logps/rejected": -324.7294921875,
"loss": 0.592,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6348754167556763,
"rewards/margins": 0.2899959683418274,
"rewards/rejected": -0.9248712658882141,
"step": 100
},
{
"epoch": 0.69,
"learning_rate": 1.3139467229135998e-07,
"logits/chosen": -2.776845693588257,
"logits/rejected": -2.7345542907714844,
"logps/chosen": -305.9095153808594,
"logps/rejected": -340.0468444824219,
"loss": 0.5635,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.46963971853256226,
"rewards/margins": 0.42097076773643494,
"rewards/rejected": -0.8906105160713196,
"step": 110
},
{
"epoch": 0.75,
"learning_rate": 8.628481651367875e-08,
"logits/chosen": -2.7208383083343506,
"logits/rejected": -2.738800525665283,
"logps/chosen": -323.96221923828125,
"logps/rejected": -345.39813232421875,
"loss": 0.5747,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.4477911591529846,
"rewards/margins": 0.5023320317268372,
"rewards/rejected": -0.950123131275177,
"step": 120
},
{
"epoch": 0.82,
"learning_rate": 4.904486005914027e-08,
"logits/chosen": -2.710137367248535,
"logits/rejected": -2.7277586460113525,
"logps/chosen": -310.197509765625,
"logps/rejected": -322.44744873046875,
"loss": 0.575,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.43628835678100586,
"rewards/margins": 0.4412984848022461,
"rewards/rejected": -0.877586841583252,
"step": 130
},
{
"epoch": 0.88,
"learning_rate": 2.1464952759020856e-08,
"logits/chosen": -2.76245379447937,
"logits/rejected": -2.7269513607025146,
"logps/chosen": -323.8768005371094,
"logps/rejected": -376.01824951171875,
"loss": 0.5434,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.49710363149642944,
"rewards/margins": 0.6278212666511536,
"rewards/rejected": -1.124924898147583,
"step": 140
},
{
"epoch": 0.94,
"learning_rate": 4.8708793644441086e-09,
"logits/chosen": -2.6897733211517334,
"logits/rejected": -2.6578209400177,
"logps/chosen": -299.5566711425781,
"logps/rejected": -328.7222900390625,
"loss": 0.5567,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.4965454041957855,
"rewards/margins": 0.5010154843330383,
"rewards/rejected": -0.997560977935791,
"step": 150
},
{
"epoch": 1.0,
"step": 159,
"total_flos": 0.0,
"train_loss": 0.6054114065830063,
"train_runtime": 3723.997,
"train_samples_per_second": 5.472,
"train_steps_per_second": 0.043
}
],
"logging_steps": 10,
"max_steps": 159,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}