|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.46345811051693403, |
|
"eval_steps": 202, |
|
"global_step": 150, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0030897207367795603, |
|
"grad_norm": 122071.1171875, |
|
"learning_rate": 4.0000000000000004e-11, |
|
"loss": 28.8142, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.006179441473559121, |
|
"grad_norm": 119214.984375, |
|
"learning_rate": 8.000000000000001e-11, |
|
"loss": 28.2909, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.009269162210338681, |
|
"grad_norm": 126446.515625, |
|
"learning_rate": 1.2e-10, |
|
"loss": 27.4632, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.012358882947118241, |
|
"grad_norm": 113183.5078125, |
|
"learning_rate": 1.6000000000000002e-10, |
|
"loss": 29.2478, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.015448603683897801, |
|
"grad_norm": 122417.453125, |
|
"learning_rate": 2e-10, |
|
"loss": 27.6777, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.018538324420677363, |
|
"grad_norm": 122875.0234375, |
|
"learning_rate": 1.9998728465660104e-10, |
|
"loss": 26.864, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.02162804515745692, |
|
"grad_norm": 123327.859375, |
|
"learning_rate": 1.9994914186000328e-10, |
|
"loss": 27.4961, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.024717765894236483, |
|
"grad_norm": 125148.296875, |
|
"learning_rate": 1.9988558131018187e-10, |
|
"loss": 27.0758, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.027807486631016044, |
|
"grad_norm": 123805.0078125, |
|
"learning_rate": 1.9979661917102113e-10, |
|
"loss": 26.7955, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.030897207367795602, |
|
"grad_norm": 137252.546875, |
|
"learning_rate": 1.996822780662041e-10, |
|
"loss": 23.848, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03398692810457516, |
|
"grad_norm": 123848.6953125, |
|
"learning_rate": 1.99542587073459e-10, |
|
"loss": 26.2061, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.037076648841354726, |
|
"grad_norm": 103722.9296875, |
|
"learning_rate": 1.9937758171716467e-10, |
|
"loss": 30.3732, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.040166369578134284, |
|
"grad_norm": 132914.09375, |
|
"learning_rate": 1.9918730395931647e-10, |
|
"loss": 25.836, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.04325609031491384, |
|
"grad_norm": 122652.0546875, |
|
"learning_rate": 1.9897180218885506e-10, |
|
"loss": 26.7338, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.04634581105169341, |
|
"grad_norm": 125711.875, |
|
"learning_rate": 1.9873113120936074e-10, |
|
"loss": 27.9079, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.049435531788472965, |
|
"grad_norm": 131257.78125, |
|
"learning_rate": 1.9846535222511647e-10, |
|
"loss": 25.2777, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.052525252525252523, |
|
"grad_norm": 135439.875, |
|
"learning_rate": 1.9817453282554334e-10, |
|
"loss": 25.622, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.05561497326203209, |
|
"grad_norm": 112875.90625, |
|
"learning_rate": 1.97858746968012e-10, |
|
"loss": 28.8657, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.05870469399881165, |
|
"grad_norm": 124934.046875, |
|
"learning_rate": 1.9751807495903485e-10, |
|
"loss": 27.4469, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.061794414735591205, |
|
"grad_norm": 126672.046875, |
|
"learning_rate": 1.9715260343384348e-10, |
|
"loss": 26.0207, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06488413547237076, |
|
"grad_norm": 121388.1953125, |
|
"learning_rate": 1.9676242533435677e-10, |
|
"loss": 28.2452, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.06797385620915032, |
|
"grad_norm": 126791.9921875, |
|
"learning_rate": 1.963476398855452e-10, |
|
"loss": 27.0343, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0710635769459299, |
|
"grad_norm": 120207.7734375, |
|
"learning_rate": 1.9590835257019716e-10, |
|
"loss": 27.06, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.07415329768270945, |
|
"grad_norm": 119256.28125, |
|
"learning_rate": 1.9544467510209388e-10, |
|
"loss": 27.0009, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.07724301841948901, |
|
"grad_norm": 132457.265625, |
|
"learning_rate": 1.9495672539760009e-10, |
|
"loss": 26.5873, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.08033273915626857, |
|
"grad_norm": 130291.3515625, |
|
"learning_rate": 1.9444462754567682e-10, |
|
"loss": 27.6641, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.08342245989304813, |
|
"grad_norm": 122393.9375, |
|
"learning_rate": 1.9390851177632496e-10, |
|
"loss": 26.8649, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.08651218062982768, |
|
"grad_norm": 123576.078125, |
|
"learning_rate": 1.9334851442746664e-10, |
|
"loss": 28.0714, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.08960190136660724, |
|
"grad_norm": 121570.703125, |
|
"learning_rate": 1.9276477791027375e-10, |
|
"loss": 26.8387, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.09269162210338681, |
|
"grad_norm": 117390.9921875, |
|
"learning_rate": 1.9215745067295168e-10, |
|
"loss": 28.8172, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.09578134284016637, |
|
"grad_norm": 127661.9609375, |
|
"learning_rate": 1.9152668716298797e-10, |
|
"loss": 27.3977, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.09887106357694593, |
|
"grad_norm": 129576.4453125, |
|
"learning_rate": 1.9087264778787533e-10, |
|
"loss": 25.7966, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.10196078431372549, |
|
"grad_norm": 117011.7578125, |
|
"learning_rate": 1.9019549887431877e-10, |
|
"loss": 29.9255, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.10505050505050505, |
|
"grad_norm": 120770.2265625, |
|
"learning_rate": 1.894954126259376e-10, |
|
"loss": 27.4423, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.1081402257872846, |
|
"grad_norm": 118635.3671875, |
|
"learning_rate": 1.8877256707947306e-10, |
|
"loss": 29.0251, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.11122994652406418, |
|
"grad_norm": 122824.8203125, |
|
"learning_rate": 1.88027146059512e-10, |
|
"loss": 28.9256, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.11431966726084374, |
|
"grad_norm": 126411.421875, |
|
"learning_rate": 1.872593391317394e-10, |
|
"loss": 26.5279, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.1174093879976233, |
|
"grad_norm": 132717.421875, |
|
"learning_rate": 1.8646934155473023e-10, |
|
"loss": 24.6933, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.12049910873440285, |
|
"grad_norm": 121920.703125, |
|
"learning_rate": 1.8565735423029405e-10, |
|
"loss": 27.9129, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.12358882947118241, |
|
"grad_norm": 118110.3515625, |
|
"learning_rate": 1.8482358365238414e-10, |
|
"loss": 28.6163, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.12667855020796198, |
|
"grad_norm": 120299.3359375, |
|
"learning_rate": 1.839682418545848e-10, |
|
"loss": 28.5066, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.12976827094474153, |
|
"grad_norm": 140847.53125, |
|
"learning_rate": 1.8309154635618964e-10, |
|
"loss": 26.3515, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.1328579916815211, |
|
"grad_norm": 112302.9609375, |
|
"learning_rate": 1.8219372010688515e-10, |
|
"loss": 28.644, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.13594771241830064, |
|
"grad_norm": 123308.875, |
|
"learning_rate": 1.8127499143005265e-10, |
|
"loss": 27.6619, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.13903743315508021, |
|
"grad_norm": 125378.4921875, |
|
"learning_rate": 1.8033559396470454e-10, |
|
"loss": 26.452, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1421271538918598, |
|
"grad_norm": 123326.15625, |
|
"learning_rate": 1.7937576660606797e-10, |
|
"loss": 26.3463, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.14521687462863933, |
|
"grad_norm": 117453.265625, |
|
"learning_rate": 1.7839575344483237e-10, |
|
"loss": 29.4332, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.1483065953654189, |
|
"grad_norm": 118000.7265625, |
|
"learning_rate": 1.773958037050753e-10, |
|
"loss": 28.7868, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.15139631610219845, |
|
"grad_norm": 127199.6953125, |
|
"learning_rate": 1.7637617168088326e-10, |
|
"loss": 25.8592, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.15448603683897802, |
|
"grad_norm": 115966.7265625, |
|
"learning_rate": 1.753371166716828e-10, |
|
"loss": 28.3176, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.15757575757575756, |
|
"grad_norm": 120307.578125, |
|
"learning_rate": 1.7427890291629892e-10, |
|
"loss": 28.7395, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.16066547831253714, |
|
"grad_norm": 110086.4296875, |
|
"learning_rate": 1.732017995257575e-10, |
|
"loss": 29.0903, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.1637551990493167, |
|
"grad_norm": 125848.5, |
|
"learning_rate": 1.721060804148482e-10, |
|
"loss": 27.0135, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.16684491978609625, |
|
"grad_norm": 138127.296875, |
|
"learning_rate": 1.7099202423246632e-10, |
|
"loss": 24.2078, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.16993464052287582, |
|
"grad_norm": 126173.71875, |
|
"learning_rate": 1.6985991429075038e-10, |
|
"loss": 27.1029, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.17302436125965537, |
|
"grad_norm": 113517.828125, |
|
"learning_rate": 1.687100384930338e-10, |
|
"loss": 28.6511, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.17611408199643494, |
|
"grad_norm": 125443.1640625, |
|
"learning_rate": 1.6754268926062938e-10, |
|
"loss": 28.0516, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.17920380273321448, |
|
"grad_norm": 120414.265625, |
|
"learning_rate": 1.6635816345846412e-10, |
|
"loss": 26.6481, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.18229352346999406, |
|
"grad_norm": 142621.21875, |
|
"learning_rate": 1.6515676231958488e-10, |
|
"loss": 24.7811, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.18538324420677363, |
|
"grad_norm": 128307.296875, |
|
"learning_rate": 1.6393879136855248e-10, |
|
"loss": 24.9562, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.18847296494355317, |
|
"grad_norm": 131207.0625, |
|
"learning_rate": 1.6270456034374474e-10, |
|
"loss": 26.3174, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.19156268568033274, |
|
"grad_norm": 126856.7578125, |
|
"learning_rate": 1.6145438311858797e-10, |
|
"loss": 24.581, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.1946524064171123, |
|
"grad_norm": 130663.9921875, |
|
"learning_rate": 1.601885776217367e-10, |
|
"loss": 24.543, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.19774212715389186, |
|
"grad_norm": 117330.453125, |
|
"learning_rate": 1.589074657562223e-10, |
|
"loss": 26.1565, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.20083184789067143, |
|
"grad_norm": 126654.546875, |
|
"learning_rate": 1.5761137331759085e-10, |
|
"loss": 26.9999, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.20392156862745098, |
|
"grad_norm": 122709.8984375, |
|
"learning_rate": 1.5630062991105098e-10, |
|
"loss": 26.9336, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.20701128936423055, |
|
"grad_norm": 111839.421875, |
|
"learning_rate": 1.5497556886765316e-10, |
|
"loss": 29.4629, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.2101010101010101, |
|
"grad_norm": 115566.3515625, |
|
"learning_rate": 1.536365271595212e-10, |
|
"loss": 28.8732, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.21319073083778967, |
|
"grad_norm": 120251.84375, |
|
"learning_rate": 1.5228384531415808e-10, |
|
"loss": 28.2254, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.2162804515745692, |
|
"grad_norm": 109853.484375, |
|
"learning_rate": 1.5091786732784717e-10, |
|
"loss": 28.8846, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.21937017231134878, |
|
"grad_norm": 124965.2890625, |
|
"learning_rate": 1.495389405781719e-10, |
|
"loss": 27.476, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.22245989304812835, |
|
"grad_norm": 123831.765625, |
|
"learning_rate": 1.4814741573567514e-10, |
|
"loss": 28.3641, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.2255496137849079, |
|
"grad_norm": 122635.5625, |
|
"learning_rate": 1.467436466746814e-10, |
|
"loss": 26.8985, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.22863933452168747, |
|
"grad_norm": 114768.140625, |
|
"learning_rate": 1.4532799038330386e-10, |
|
"loss": 27.3799, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.23172905525846701, |
|
"grad_norm": 126590.0, |
|
"learning_rate": 1.4390080687266012e-10, |
|
"loss": 27.1862, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2348187759952466, |
|
"grad_norm": 121103.6640625, |
|
"learning_rate": 1.4246245908531884e-10, |
|
"loss": 28.5261, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.23790849673202613, |
|
"grad_norm": 116113.4765625, |
|
"learning_rate": 1.410133128030009e-10, |
|
"loss": 29.2815, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.2409982174688057, |
|
"grad_norm": 124221.265625, |
|
"learning_rate": 1.3955373655355853e-10, |
|
"loss": 28.5093, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.24408793820558528, |
|
"grad_norm": 122242.0625, |
|
"learning_rate": 1.3808410151725631e-10, |
|
"loss": 27.9406, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.24717765894236482, |
|
"grad_norm": 129292.234375, |
|
"learning_rate": 1.3660478143237748e-10, |
|
"loss": 25.4098, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.25026737967914436, |
|
"grad_norm": 113323.875, |
|
"learning_rate": 1.351161525001795e-10, |
|
"loss": 30.2781, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.25335710041592396, |
|
"grad_norm": 120109.671875, |
|
"learning_rate": 1.3361859328922368e-10, |
|
"loss": 28.7683, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.2564468211527035, |
|
"grad_norm": 131119.078125, |
|
"learning_rate": 1.3211248463910263e-10, |
|
"loss": 26.0915, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.25953654188948305, |
|
"grad_norm": 133106.078125, |
|
"learning_rate": 1.3059820956358996e-10, |
|
"loss": 25.8208, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.26262626262626265, |
|
"grad_norm": 117423.5859375, |
|
"learning_rate": 1.290761531532374e-10, |
|
"loss": 28.491, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.2657159833630422, |
|
"grad_norm": 135454.3125, |
|
"learning_rate": 1.2754670247744354e-10, |
|
"loss": 24.7346, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.26880570409982174, |
|
"grad_norm": 125783.6484375, |
|
"learning_rate": 1.260102464860195e-10, |
|
"loss": 25.684, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.2718954248366013, |
|
"grad_norm": 124170.6484375, |
|
"learning_rate": 1.2446717591027624e-10, |
|
"loss": 27.912, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.2749851455733809, |
|
"grad_norm": 123135.8984375, |
|
"learning_rate": 1.2291788316365887e-10, |
|
"loss": 27.8059, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.27807486631016043, |
|
"grad_norm": 127220.2890625, |
|
"learning_rate": 1.213627622419535e-10, |
|
"loss": 26.8872, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.28116458704694, |
|
"grad_norm": 114622.0625, |
|
"learning_rate": 1.1980220862309098e-10, |
|
"loss": 27.7952, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.2842543077837196, |
|
"grad_norm": 129463.1484375, |
|
"learning_rate": 1.182366191665744e-10, |
|
"loss": 25.9395, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.2873440285204991, |
|
"grad_norm": 138215.8125, |
|
"learning_rate": 1.1666639201255506e-10, |
|
"loss": 25.4167, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.29043374925727866, |
|
"grad_norm": 113110.0, |
|
"learning_rate": 1.1509192648058249e-10, |
|
"loss": 28.0616, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.2935234699940582, |
|
"grad_norm": 120913.9765625, |
|
"learning_rate": 1.1351362296805485e-10, |
|
"loss": 28.1966, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.2966131907308378, |
|
"grad_norm": 117201.953125, |
|
"learning_rate": 1.1193188284839518e-10, |
|
"loss": 27.8393, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.29970291146761735, |
|
"grad_norm": 124783.2890625, |
|
"learning_rate": 1.1034710836897921e-10, |
|
"loss": 26.5054, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.3027926322043969, |
|
"grad_norm": 127615.96875, |
|
"learning_rate": 1.0875970254884129e-10, |
|
"loss": 26.9284, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.3058823529411765, |
|
"grad_norm": 132355.703125, |
|
"learning_rate": 1.0717006907618376e-10, |
|
"loss": 25.1614, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.30897207367795604, |
|
"grad_norm": 132260.8125, |
|
"learning_rate": 1.0557861220571625e-10, |
|
"loss": 24.961, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3120617944147356, |
|
"grad_norm": 117828.6796875, |
|
"learning_rate": 1.0398573665585105e-10, |
|
"loss": 28.4477, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.3151515151515151, |
|
"grad_norm": 130773.75, |
|
"learning_rate": 1.023918475057803e-10, |
|
"loss": 27.1515, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.3182412358882947, |
|
"grad_norm": 121530.078125, |
|
"learning_rate": 1.0079735009246167e-10, |
|
"loss": 27.9482, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.32133095662507427, |
|
"grad_norm": 122278.96875, |
|
"learning_rate": 9.920264990753837e-11, |
|
"loss": 28.2168, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.3244206773618538, |
|
"grad_norm": 112929.9140625, |
|
"learning_rate": 9.760815249421973e-11, |
|
"loss": 29.3778, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.3275103980986334, |
|
"grad_norm": 116076.59375, |
|
"learning_rate": 9.601426334414898e-11, |
|
"loss": 28.591, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.33060011883541296, |
|
"grad_norm": 128873.78125, |
|
"learning_rate": 9.442138779428376e-11, |
|
"loss": 26.1022, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.3336898395721925, |
|
"grad_norm": 127668.328125, |
|
"learning_rate": 9.282993092381625e-11, |
|
"loss": 25.6443, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.33677956030897205, |
|
"grad_norm": 122660.5625, |
|
"learning_rate": 9.12402974511587e-11, |
|
"loss": 26.6564, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.33986928104575165, |
|
"grad_norm": 120201.34375, |
|
"learning_rate": 8.965289163102078e-11, |
|
"loss": 25.9894, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3429590017825312, |
|
"grad_norm": 127018.890625, |
|
"learning_rate": 8.806811715160484e-11, |
|
"loss": 25.5922, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.34604872251931074, |
|
"grad_norm": 126997.375, |
|
"learning_rate": 8.648637703194516e-11, |
|
"loss": 27.1782, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.34913844325609034, |
|
"grad_norm": 111071.9921875, |
|
"learning_rate": 8.490807351941753e-11, |
|
"loss": 29.0618, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.3522281639928699, |
|
"grad_norm": 109156.3125, |
|
"learning_rate": 8.333360798744496e-11, |
|
"loss": 31.8425, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.3553178847296494, |
|
"grad_norm": 126704.4375, |
|
"learning_rate": 8.17633808334256e-11, |
|
"loss": 26.271, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.35840760546642897, |
|
"grad_norm": 129102.53125, |
|
"learning_rate": 8.019779137690906e-11, |
|
"loss": 26.1617, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.36149732620320857, |
|
"grad_norm": 116444.125, |
|
"learning_rate": 7.863723775804651e-11, |
|
"loss": 29.3636, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.3645870469399881, |
|
"grad_norm": 119892.703125, |
|
"learning_rate": 7.708211683634111e-11, |
|
"loss": 28.351, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.36767676767676766, |
|
"grad_norm": 133809.671875, |
|
"learning_rate": 7.553282408972381e-11, |
|
"loss": 25.5753, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.37076648841354726, |
|
"grad_norm": 131220.359375, |
|
"learning_rate": 7.398975351398053e-11, |
|
"loss": 26.8812, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3738562091503268, |
|
"grad_norm": 116611.2734375, |
|
"learning_rate": 7.245329752255648e-11, |
|
"loss": 29.5172, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.37694592988710635, |
|
"grad_norm": 133015.046875, |
|
"learning_rate": 7.092384684676262e-11, |
|
"loss": 25.8481, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.38003565062388595, |
|
"grad_norm": 136814.09375, |
|
"learning_rate": 6.940179043641005e-11, |
|
"loss": 24.7821, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.3831253713606655, |
|
"grad_norm": 111482.6875, |
|
"learning_rate": 6.788751536089739e-11, |
|
"loss": 28.6458, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.38621509209744503, |
|
"grad_norm": 105425.6796875, |
|
"learning_rate": 6.638140671077632e-11, |
|
"loss": 30.565, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.3893048128342246, |
|
"grad_norm": 126840.734375, |
|
"learning_rate": 6.488384749982054e-11, |
|
"loss": 24.9263, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.3923945335710042, |
|
"grad_norm": 126449.0078125, |
|
"learning_rate": 6.339521856762254e-11, |
|
"loss": 27.0906, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.3954842543077837, |
|
"grad_norm": 110226.4375, |
|
"learning_rate": 6.191589848274368e-11, |
|
"loss": 28.7828, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.39857397504456327, |
|
"grad_norm": 117383.6171875, |
|
"learning_rate": 6.04462634464415e-11, |
|
"loss": 27.1669, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.40166369578134287, |
|
"grad_norm": 129581.0390625, |
|
"learning_rate": 5.898668719699914e-11, |
|
"loss": 24.9744, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4047534165181224, |
|
"grad_norm": 126919.765625, |
|
"learning_rate": 5.753754091468115e-11, |
|
"loss": 26.5326, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.40784313725490196, |
|
"grad_norm": 126623.8046875, |
|
"learning_rate": 5.6099193127339865e-11, |
|
"loss": 28.2028, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.4109328579916815, |
|
"grad_norm": 128991.1171875, |
|
"learning_rate": 5.467200961669618e-11, |
|
"loss": 27.0198, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.4140225787284611, |
|
"grad_norm": 127212.984375, |
|
"learning_rate": 5.325635332531864e-11, |
|
"loss": 27.5968, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.41711229946524064, |
|
"grad_norm": 114877.2421875, |
|
"learning_rate": 5.1852584264324866e-11, |
|
"loss": 29.2745, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.4202020202020202, |
|
"grad_norm": 110463.8828125, |
|
"learning_rate": 5.046105942182815e-11, |
|
"loss": 28.0633, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.4232917409387998, |
|
"grad_norm": 124949.09375, |
|
"learning_rate": 4.908213267215287e-11, |
|
"loss": 27.0064, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.42638146167557933, |
|
"grad_norm": 137615.125, |
|
"learning_rate": 4.771615468584194e-11, |
|
"loss": 25.8549, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.4294711824123589, |
|
"grad_norm": 125163.0, |
|
"learning_rate": 4.636347284047877e-11, |
|
"loss": 28.2664, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.4325609031491384, |
|
"grad_norm": 120346.09375, |
|
"learning_rate": 4.502443113234688e-11, |
|
"loss": 27.093, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.435650623885918, |
|
"grad_norm": 130197.5234375, |
|
"learning_rate": 4.3699370088949064e-11, |
|
"loss": 25.7805, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.43874034462269756, |
|
"grad_norm": 141218.203125, |
|
"learning_rate": 4.238862668240919e-11, |
|
"loss": 25.3932, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.4418300653594771, |
|
"grad_norm": 124990.84375, |
|
"learning_rate": 4.1092534243777726e-11, |
|
"loss": 25.7902, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.4449197860962567, |
|
"grad_norm": 131223.625, |
|
"learning_rate": 3.981142237826332e-11, |
|
"loss": 25.7262, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.44800950683303625, |
|
"grad_norm": 119055.7109375, |
|
"learning_rate": 3.854561688141205e-11, |
|
"loss": 27.7554, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.4510992275698158, |
|
"grad_norm": 127996.890625, |
|
"learning_rate": 3.729543965625526e-11, |
|
"loss": 25.1842, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.45418894830659534, |
|
"grad_norm": 130496.4609375, |
|
"learning_rate": 3.606120863144753e-11, |
|
"loss": 26.7469, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.45727866904337494, |
|
"grad_norm": 138091.28125, |
|
"learning_rate": 3.484323768041515e-11, |
|
"loss": 24.2791, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.4603683897801545, |
|
"grad_norm": 114539.9140625, |
|
"learning_rate": 3.364183654153592e-11, |
|
"loss": 27.8677, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.46345811051693403, |
|
"grad_norm": 129665.9140625, |
|
"learning_rate": 3.245731073937068e-11, |
|
"loss": 26.8488, |
|
"step": 150 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 202, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.489068960874496e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|