{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.97191011235955,
  "eval_steps": 25,
  "global_step": 1775,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.1404494382022472,
      "grad_norm": 1.0087393522262573,
      "learning_rate": 0.0001973018549747049,
      "loss": 1.3531,
      "step": 25
    },
    {
      "epoch": 0.2808988764044944,
      "grad_norm": 0.6913560032844543,
      "learning_rate": 0.0001944912872400225,
      "loss": 0.6347,
      "step": 50
    },
    {
      "epoch": 0.42134831460674155,
      "grad_norm": 0.5822920799255371,
      "learning_rate": 0.00019168071950534008,
      "loss": 0.6301,
      "step": 75
    },
    {
      "epoch": 0.5617977528089888,
      "grad_norm": 0.6517189741134644,
      "learning_rate": 0.00018887015177065767,
      "loss": 0.5746,
      "step": 100
    },
    {
      "epoch": 0.702247191011236,
      "grad_norm": 0.5613220930099487,
      "learning_rate": 0.00018605958403597526,
      "loss": 0.546,
      "step": 125
    },
    {
      "epoch": 0.8426966292134831,
      "grad_norm": 0.6231513023376465,
      "learning_rate": 0.00018324901630129288,
      "loss": 0.6111,
      "step": 150
    },
    {
      "epoch": 0.9831460674157303,
      "grad_norm": 0.5648068189620972,
      "learning_rate": 0.00018043844856661047,
      "loss": 0.4602,
      "step": 175
    },
    {
      "epoch": 1.1235955056179776,
      "grad_norm": 0.7641807794570923,
      "learning_rate": 0.00017762788083192804,
      "loss": 0.5375,
      "step": 200
    },
    {
      "epoch": 1.2640449438202248,
      "grad_norm": 0.8002265691757202,
      "learning_rate": 0.00017481731309724566,
      "loss": 0.384,
      "step": 225
    },
    {
      "epoch": 1.404494382022472,
      "grad_norm": 0.5983501672744751,
      "learning_rate": 0.00017200674536256325,
      "loss": 0.478,
      "step": 250
    },
    {
      "epoch": 1.5449438202247192,
      "grad_norm": 0.6597617268562317,
      "learning_rate": 0.00016919617762788084,
      "loss": 0.4276,
      "step": 275
    },
    {
      "epoch": 1.6853932584269664,
      "grad_norm": 0.6023315787315369,
      "learning_rate": 0.00016638560989319843,
      "loss": 0.3864,
      "step": 300
    },
    {
      "epoch": 1.8258426966292136,
      "grad_norm": 0.6361246705055237,
      "learning_rate": 0.00016357504215851602,
      "loss": 0.4311,
      "step": 325
    },
    {
      "epoch": 1.9662921348314608,
      "grad_norm": 0.5704653859138489,
      "learning_rate": 0.00016076447442383361,
      "loss": 0.3237,
      "step": 350
    },
    {
      "epoch": 2.106741573033708,
      "grad_norm": 0.6480958461761475,
      "learning_rate": 0.00015795390668915123,
      "loss": 0.3515,
      "step": 375
    },
    {
      "epoch": 2.247191011235955,
      "grad_norm": 0.6552001237869263,
      "learning_rate": 0.00015514333895446882,
      "loss": 0.2377,
      "step": 400
    },
    {
      "epoch": 2.3876404494382024,
      "grad_norm": 0.7502114176750183,
      "learning_rate": 0.0001523327712197864,
      "loss": 0.3069,
      "step": 425
    },
    {
      "epoch": 2.5280898876404496,
      "grad_norm": 0.7289499640464783,
      "learning_rate": 0.000149522203485104,
      "loss": 0.2973,
      "step": 450
    },
    {
      "epoch": 2.668539325842697,
      "grad_norm": 0.7269200682640076,
      "learning_rate": 0.0001467116357504216,
      "loss": 0.2612,
      "step": 475
    },
    {
      "epoch": 2.808988764044944,
      "grad_norm": 0.9184384346008301,
      "learning_rate": 0.0001439010680157392,
      "loss": 0.282,
      "step": 500
    },
    {
      "epoch": 2.949438202247191,
      "grad_norm": 0.6768775582313538,
      "learning_rate": 0.00014109050028105678,
      "loss": 0.2647,
      "step": 525
    },
    {
      "epoch": 3.0898876404494384,
      "grad_norm": 0.8616690039634705,
      "learning_rate": 0.00013827993254637437,
      "loss": 0.2385,
      "step": 550
    },
    {
      "epoch": 3.2303370786516856,
      "grad_norm": 0.5999014377593994,
      "learning_rate": 0.00013546936481169196,
      "loss": 0.1662,
      "step": 575
    },
    {
      "epoch": 3.370786516853933,
      "grad_norm": 0.5833967328071594,
      "learning_rate": 0.00013265879707700956,
      "loss": 0.2198,
      "step": 600
    },
    {
      "epoch": 3.51123595505618,
      "grad_norm": 0.694567084312439,
      "learning_rate": 0.00012984822934232717,
      "loss": 0.1762,
      "step": 625
    },
    {
      "epoch": 3.6516853932584272,
      "grad_norm": 0.6547160744667053,
      "learning_rate": 0.00012703766160764474,
      "loss": 0.2045,
      "step": 650
    },
    {
      "epoch": 3.7921348314606744,
      "grad_norm": 0.7818899750709534,
      "learning_rate": 0.00012422709387296233,
      "loss": 0.2029,
      "step": 675
    },
    {
      "epoch": 3.932584269662921,
      "grad_norm": 0.47738873958587646,
      "learning_rate": 0.00012141652613827993,
      "loss": 0.1734,
      "step": 700
    },
    {
      "epoch": 4.073033707865169,
      "grad_norm": 0.6599411368370056,
      "learning_rate": 0.00011860595840359754,
      "loss": 0.1773,
      "step": 725
    },
    {
      "epoch": 4.213483146067416,
      "grad_norm": 0.4201154410839081,
      "learning_rate": 0.00011579539066891512,
      "loss": 0.1276,
      "step": 750
    },
    {
      "epoch": 4.353932584269663,
      "grad_norm": 0.495446115732193,
      "learning_rate": 0.00011298482293423271,
      "loss": 0.146,
      "step": 775
    },
    {
      "epoch": 4.49438202247191,
      "grad_norm": 0.27687525749206543,
      "learning_rate": 0.00011017425519955031,
      "loss": 0.1302,
      "step": 800
    },
    {
      "epoch": 4.634831460674158,
      "grad_norm": 0.44135782122612,
      "learning_rate": 0.00010736368746486792,
      "loss": 0.157,
      "step": 825
    },
    {
      "epoch": 4.775280898876405,
      "grad_norm": 1.0475701093673706,
      "learning_rate": 0.00010455311973018551,
      "loss": 0.1463,
      "step": 850
    },
    {
      "epoch": 4.915730337078652,
      "grad_norm": 0.5669655799865723,
      "learning_rate": 0.00010174255199550309,
      "loss": 0.1427,
      "step": 875
    },
    {
      "epoch": 5.056179775280899,
      "grad_norm": 0.4750053286552429,
      "learning_rate": 9.893198426082069e-05,
      "loss": 0.137,
      "step": 900
    },
    {
      "epoch": 5.196629213483146,
      "grad_norm": 0.5436720252037048,
      "learning_rate": 9.612141652613828e-05,
      "loss": 0.1049,
      "step": 925
    },
    {
      "epoch": 5.337078651685394,
      "grad_norm": 0.9001078009605408,
      "learning_rate": 9.331084879145588e-05,
      "loss": 0.1214,
      "step": 950
    },
    {
      "epoch": 5.477528089887641,
      "grad_norm": 0.4534567892551422,
      "learning_rate": 9.050028105677347e-05,
      "loss": 0.1126,
      "step": 975
    },
    {
      "epoch": 5.617977528089888,
      "grad_norm": 0.5312709212303162,
      "learning_rate": 8.768971332209107e-05,
      "loss": 0.1134,
      "step": 1000
    },
    {
      "epoch": 5.758426966292134,
      "grad_norm": 0.4822239577770233,
      "learning_rate": 8.487914558740866e-05,
      "loss": 0.1171,
      "step": 1025
    },
    {
      "epoch": 5.898876404494382,
      "grad_norm": 1.4501484632492065,
      "learning_rate": 8.206857785272625e-05,
      "loss": 0.1189,
      "step": 1050
    },
    {
      "epoch": 6.03932584269663,
      "grad_norm": 0.40537115931510925,
      "learning_rate": 7.925801011804385e-05,
      "loss": 0.1166,
      "step": 1075
    },
    {
      "epoch": 6.179775280898877,
      "grad_norm": 0.5476611852645874,
      "learning_rate": 7.644744238336145e-05,
      "loss": 0.0919,
      "step": 1100
    },
    {
      "epoch": 6.320224719101123,
      "grad_norm": 0.2506895959377289,
      "learning_rate": 7.363687464867903e-05,
      "loss": 0.0974,
      "step": 1125
    },
    {
      "epoch": 6.460674157303371,
      "grad_norm": 0.3818029463291168,
      "learning_rate": 7.082630691399663e-05,
      "loss": 0.0999,
      "step": 1150
    },
    {
      "epoch": 6.601123595505618,
      "grad_norm": 0.4769856631755829,
      "learning_rate": 6.801573917931423e-05,
      "loss": 0.1042,
      "step": 1175
    },
    {
      "epoch": 6.741573033707866,
      "grad_norm": 0.2570767402648926,
      "learning_rate": 6.520517144463182e-05,
      "loss": 0.1067,
      "step": 1200
    },
    {
      "epoch": 6.882022471910112,
      "grad_norm": 0.39180973172187805,
      "learning_rate": 6.239460370994942e-05,
      "loss": 0.0967,
      "step": 1225
    },
    {
      "epoch": 7.022471910112359,
      "grad_norm": 0.4090649485588074,
      "learning_rate": 5.9584035975267006e-05,
      "loss": 0.1026,
      "step": 1250
    },
    {
      "epoch": 7.162921348314606,
      "grad_norm": 0.29195311665534973,
      "learning_rate": 5.6773468240584605e-05,
      "loss": 0.0831,
      "step": 1275
    },
    {
      "epoch": 7.303370786516854,
      "grad_norm": 0.4446125030517578,
      "learning_rate": 5.396290050590219e-05,
      "loss": 0.091,
      "step": 1300
    },
    {
      "epoch": 7.443820224719101,
      "grad_norm": 0.3004734516143799,
      "learning_rate": 5.115233277121979e-05,
      "loss": 0.0894,
      "step": 1325
    },
    {
      "epoch": 7.584269662921348,
      "grad_norm": 0.34026992321014404,
      "learning_rate": 4.8341765036537386e-05,
      "loss": 0.089,
      "step": 1350
    },
    {
      "epoch": 7.724719101123595,
      "grad_norm": 0.35366305708885193,
      "learning_rate": 4.553119730185498e-05,
      "loss": 0.0973,
      "step": 1375
    },
    {
      "epoch": 7.865168539325842,
      "grad_norm": 0.46121978759765625,
      "learning_rate": 4.272062956717257e-05,
      "loss": 0.0893,
      "step": 1400
    },
    {
      "epoch": 8.00561797752809,
      "grad_norm": 0.30849024653434753,
      "learning_rate": 3.991006183249017e-05,
      "loss": 0.0967,
      "step": 1425
    },
    {
      "epoch": 8.146067415730338,
      "grad_norm": 0.26774147152900696,
      "learning_rate": 3.709949409780776e-05,
      "loss": 0.0749,
      "step": 1450
    },
    {
      "epoch": 8.286516853932584,
      "grad_norm": 0.451695054769516,
      "learning_rate": 3.428892636312535e-05,
      "loss": 0.0822,
      "step": 1475
    },
    {
      "epoch": 8.426966292134832,
      "grad_norm": 0.3220645487308502,
      "learning_rate": 3.147835862844295e-05,
      "loss": 0.08,
      "step": 1500
    },
    {
      "epoch": 8.567415730337078,
      "grad_norm": 0.3273387551307678,
      "learning_rate": 2.8667790893760543e-05,
      "loss": 0.0847,
      "step": 1525
    },
    {
      "epoch": 8.707865168539326,
      "grad_norm": 0.3072643280029297,
      "learning_rate": 2.5857223159078137e-05,
      "loss": 0.0869,
      "step": 1550
    },
    {
      "epoch": 8.848314606741573,
      "grad_norm": 0.38988426327705383,
      "learning_rate": 2.304665542439573e-05,
      "loss": 0.0835,
      "step": 1575
    },
    {
      "epoch": 8.98876404494382,
      "grad_norm": 0.2914678156375885,
      "learning_rate": 2.0236087689713324e-05,
      "loss": 0.0929,
      "step": 1600
    },
    {
      "epoch": 9.129213483146067,
      "grad_norm": 0.2762880027294159,
      "learning_rate": 1.742551995503092e-05,
      "loss": 0.0742,
      "step": 1625
    },
    {
      "epoch": 9.269662921348315,
      "grad_norm": 0.33493509888648987,
      "learning_rate": 1.461495222034851e-05,
      "loss": 0.0791,
      "step": 1650
    },
    {
      "epoch": 9.410112359550562,
      "grad_norm": 0.3052024841308594,
      "learning_rate": 1.1804384485666105e-05,
      "loss": 0.075,
      "step": 1675
    },
    {
      "epoch": 9.55056179775281,
      "grad_norm": 0.38245493173599243,
      "learning_rate": 8.9938167509837e-06,
      "loss": 0.0795,
      "step": 1700
    },
    {
      "epoch": 9.691011235955056,
      "grad_norm": 0.2810506522655487,
      "learning_rate": 6.183249016301293e-06,
      "loss": 0.0814,
      "step": 1725
    },
    {
      "epoch": 9.831460674157304,
      "grad_norm": 0.34336212277412415,
      "learning_rate": 3.372681281618887e-06,
      "loss": 0.0774,
      "step": 1750
    },
    {
      "epoch": 9.97191011235955,
      "grad_norm": 0.3593064546585083,
      "learning_rate": 5.621135469364812e-07,
      "loss": 0.0837,
      "step": 1775
    }
  ],
  "logging_steps": 25,
  "max_steps": 1780,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 25,
  "total_flos": 6273062618664960.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}