{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.03555620056599666,
  "eval_steps": 500,
  "global_step": 490,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0007256367462448298,
      "grad_norm": 12.0,
      "learning_rate": 5e-05,
      "loss": 3.0993,
      "step": 10
    },
    {
      "epoch": 0.0014512734924896596,
      "grad_norm": 10.4375,
      "learning_rate": 0.0001,
      "loss": 2.208,
      "step": 20
    },
    {
      "epoch": 0.0021769102387344894,
      "grad_norm": 6.5625,
      "learning_rate": 0.00015,
      "loss": 1.3285,
      "step": 30
    },
    {
      "epoch": 0.0029025469849793192,
      "grad_norm": 6.21875,
      "learning_rate": 0.0002,
      "loss": 0.6895,
      "step": 40
    },
    {
      "epoch": 0.003628183731224149,
      "grad_norm": 4.15625,
      "learning_rate": 0.00025,
      "loss": 0.714,
      "step": 50
    },
    {
      "epoch": 0.004353820477468979,
      "grad_norm": 4.71875,
      "learning_rate": 0.0003,
      "loss": 0.4849,
      "step": 60
    },
    {
      "epoch": 0.005079457223713809,
      "grad_norm": 3.828125,
      "learning_rate": 0.00035,
      "loss": 0.3671,
      "step": 70
    },
    {
      "epoch": 0.0058050939699586385,
      "grad_norm": 4.03125,
      "learning_rate": 0.0004,
      "loss": 0.5693,
      "step": 80
    },
    {
      "epoch": 0.006530730716203468,
      "grad_norm": 3.296875,
      "learning_rate": 0.00045000000000000004,
      "loss": 0.4133,
      "step": 90
    },
    {
      "epoch": 0.007256367462448298,
      "grad_norm": 2.640625,
      "learning_rate": 0.0005,
      "loss": 0.2668,
      "step": 100
    },
    {
      "epoch": 0.007982004208693128,
      "grad_norm": 1.5859375,
      "learning_rate": 0.000499999340865746,
      "loss": 0.2922,
      "step": 110
    },
    {
      "epoch": 0.008707640954937958,
      "grad_norm": 3.8125,
      "learning_rate": 0.0004999973634664594,
      "loss": 0.3996,
      "step": 120
    },
    {
      "epoch": 0.009433277701182788,
      "grad_norm": 2.0625,
      "learning_rate": 0.0004999940678125673,
      "loss": 0.2841,
      "step": 130
    },
    {
      "epoch": 0.010158914447427617,
      "grad_norm": 3.390625,
      "learning_rate": 0.000499989453921448,
      "loss": 0.3003,
      "step": 140
    },
    {
      "epoch": 0.010884551193672447,
      "grad_norm": 1.65625,
      "learning_rate": 0.0004999835218174307,
      "loss": 0.2747,
      "step": 150
    },
    {
      "epoch": 0.011610187939917277,
      "grad_norm": 1.671875,
      "learning_rate": 0.000499976271531796,
      "loss": 0.37,
      "step": 160
    },
    {
      "epoch": 0.012335824686162107,
      "grad_norm": 1.0390625,
      "learning_rate": 0.000499967703102775,
      "loss": 0.2163,
      "step": 170
    },
    {
      "epoch": 0.013061461432406937,
      "grad_norm": 2.546875,
      "learning_rate": 0.00049995781657555,
      "loss": 0.3652,
      "step": 180
    },
    {
      "epoch": 0.013787098178651766,
      "grad_norm": 3.71875,
      "learning_rate": 0.000499946612002253,
      "loss": 0.3177,
      "step": 190
    },
    {
      "epoch": 0.014512734924896596,
      "grad_norm": 2.6875,
      "learning_rate": 0.0004999340894419668,
      "loss": 0.2043,
      "step": 200
    },
    {
      "epoch": 0.015238371671141426,
      "grad_norm": 0.5,
      "learning_rate": 0.0004999202489607236,
      "loss": 0.2865,
      "step": 210
    },
    {
      "epoch": 0.015964008417386256,
      "grad_norm": 3.015625,
      "learning_rate": 0.0004999050906315055,
      "loss": 0.2039,
      "step": 220
    },
    {
      "epoch": 0.016689645163631087,
      "grad_norm": 0.99609375,
      "learning_rate": 0.0004998886145342434,
      "loss": 0.3509,
      "step": 230
    },
    {
      "epoch": 0.017415281909875915,
      "grad_norm": 1.34375,
      "learning_rate": 0.0004998708207558168,
      "loss": 0.3208,
      "step": 240
    },
    {
      "epoch": 0.018140918656120747,
      "grad_norm": 2.015625,
      "learning_rate": 0.0004998517093900539,
      "loss": 0.2307,
      "step": 250
    },
    {
      "epoch": 0.018866555402365575,
      "grad_norm": 1.0703125,
      "learning_rate": 0.0004998312805377302,
      "loss": 0.2232,
      "step": 260
    },
    {
      "epoch": 0.019592192148610407,
      "grad_norm": 1.7265625,
      "learning_rate": 0.0004998095343065685,
      "loss": 0.2587,
      "step": 270
    },
    {
      "epoch": 0.020317828894855235,
      "grad_norm": 1.21875,
      "learning_rate": 0.0004997864708112384,
      "loss": 0.2175,
      "step": 280
    },
    {
      "epoch": 0.021043465641100066,
      "grad_norm": 2.234375,
      "learning_rate": 0.0004997620901733554,
      "loss": 0.2185,
      "step": 290
    },
    {
      "epoch": 0.021769102387344894,
      "grad_norm": 1.2734375,
      "learning_rate": 0.0004997363925214804,
      "loss": 0.2409,
      "step": 300
    },
    {
      "epoch": 0.022494739133589726,
      "grad_norm": 1.7109375,
      "learning_rate": 0.000499709377991119,
      "loss": 0.1976,
      "step": 310
    },
    {
      "epoch": 0.023220375879834554,
      "grad_norm": 1.7890625,
      "learning_rate": 0.0004996810467247207,
      "loss": 0.2899,
      "step": 320
    },
    {
      "epoch": 0.023946012626079385,
      "grad_norm": 1.9296875,
      "learning_rate": 0.0004996513988716786,
      "loss": 0.2324,
      "step": 330
    },
    {
      "epoch": 0.024671649372324213,
      "grad_norm": 1.59375,
      "learning_rate": 0.0004996204345883278,
      "loss": 0.2376,
      "step": 340
    },
    {
      "epoch": 0.025397286118569045,
      "grad_norm": 1.734375,
      "learning_rate": 0.0004995881540379454,
      "loss": 0.2927,
      "step": 350
    },
    {
      "epoch": 0.026122922864813873,
      "grad_norm": 1.9375,
      "learning_rate": 0.0004995545573907492,
      "loss": 0.2738,
      "step": 360
    },
    {
      "epoch": 0.026848559611058705,
      "grad_norm": 2.421875,
      "learning_rate": 0.0004995196448238966,
      "loss": 0.2427,
      "step": 370
    },
    {
      "epoch": 0.027574196357303533,
      "grad_norm": 2.671875,
      "learning_rate": 0.0004994834165214843,
      "loss": 0.2032,
      "step": 380
    },
    {
      "epoch": 0.028299833103548364,
      "grad_norm": 3.328125,
      "learning_rate": 0.0004994458726745468,
      "loss": 0.2748,
      "step": 390
    },
    {
      "epoch": 0.029025469849793192,
      "grad_norm": 0.498046875,
      "learning_rate": 0.0004994070134810556,
      "loss": 0.2275,
      "step": 400
    },
    {
      "epoch": 0.029751106596038024,
      "grad_norm": 1.0703125,
      "learning_rate": 0.000499366839145918,
      "loss": 0.1725,
      "step": 410
    },
    {
      "epoch": 0.030476743342282852,
      "grad_norm": 3.46875,
      "learning_rate": 0.0004993253498809762,
      "loss": 0.2298,
      "step": 420
    },
    {
      "epoch": 0.031202380088527683,
      "grad_norm": 2.875,
      "learning_rate": 0.0004992825459050064,
      "loss": 0.2721,
      "step": 430
    },
    {
      "epoch": 0.03192801683477251,
      "grad_norm": 1.6640625,
      "learning_rate": 0.0004992384274437171,
      "loss": 0.248,
      "step": 440
    },
    {
      "epoch": 0.03265365358101734,
      "grad_norm": 1.8984375,
      "learning_rate": 0.000499192994729748,
      "loss": 0.1529,
      "step": 450
    },
    {
      "epoch": 0.033379290327262175,
      "grad_norm": 2.234375,
      "learning_rate": 0.0004991462480026693,
      "loss": 0.2584,
      "step": 460
    },
    {
      "epoch": 0.034104927073507,
      "grad_norm": 0.984375,
      "learning_rate": 0.0004990981875089799,
      "loss": 0.25,
      "step": 470
    },
    {
      "epoch": 0.03483056381975183,
      "grad_norm": 1.75,
      "learning_rate": 0.0004990488135021065,
      "loss": 0.199,
      "step": 480
    },
    {
      "epoch": 0.03555620056599666,
      "grad_norm": 0.7421875,
      "learning_rate": 0.0004989981262424017,
      "loss": 0.2546,
      "step": 490
    }
  ],
  "logging_steps": 10,
  "max_steps": 13781,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 10,
  "total_flos": 0.0,
  "train_batch_size": 5,
  "trial_name": null,
  "trial_params": null
}