{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9997864616698697,
  "eval_steps": 500,
  "global_step": 2341,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.002135383301302584,
      "grad_norm": 0.17446549236774445,
      "learning_rate": 4.999943721137594e-05,
      "loss": 1.0426,
      "num_input_tokens_seen": 127072,
      "step": 5
    },
    {
      "epoch": 0.004270766602605168,
      "grad_norm": 0.19204697012901306,
      "learning_rate": 4.999774887084225e-05,
      "loss": 1.0578,
      "num_input_tokens_seen": 281728,
      "step": 10
    },
    {
      "epoch": 0.0064061499039077515,
      "grad_norm": 0.09412319213151932,
      "learning_rate": 4.999493505441324e-05,
      "loss": 0.6602,
      "num_input_tokens_seen": 394304,
      "step": 15
    },
    {
      "epoch": 0.008541533205210335,
      "grad_norm": 0.06181873008608818,
      "learning_rate": 4.9990995888775614e-05,
      "loss": 0.961,
      "num_input_tokens_seen": 561888,
      "step": 20
    },
    {
      "epoch": 0.010676916506512918,
      "grad_norm": 0.06059432402253151,
      "learning_rate": 4.9985931551282785e-05,
      "loss": 0.9722,
      "num_input_tokens_seen": 722496,
      "step": 25
    },
    {
      "epoch": 0.012812299807815503,
      "grad_norm": 0.05892226845026016,
      "learning_rate": 4.997974226994687e-05,
      "loss": 0.7008,
      "num_input_tokens_seen": 850176,
      "step": 30
    },
    {
      "epoch": 0.014947683109118086,
      "grad_norm": 0.0467025563120842,
      "learning_rate": 4.9972428323428444e-05,
      "loss": 0.9035,
      "num_input_tokens_seen": 1017280,
      "step": 35
    },
    {
      "epoch": 0.01708306641042067,
      "grad_norm": 0.04231492057442665,
      "learning_rate": 4.996399004102397e-05,
      "loss": 0.868,
      "num_input_tokens_seen": 1199552,
      "step": 40
    },
    {
      "epoch": 0.019218449711723255,
      "grad_norm": 0.05744878575205803,
      "learning_rate": 4.9954427802651014e-05,
      "loss": 0.7532,
      "num_input_tokens_seen": 1343744,
      "step": 45
    },
    {
      "epoch": 0.021353833013025837,
      "grad_norm": 0.04732651636004448,
      "learning_rate": 4.9943742038831076e-05,
      "loss": 0.7994,
      "num_input_tokens_seen": 1513920,
      "step": 50
    },
    {
      "epoch": 0.02348921631432842,
      "grad_norm": 0.06059388071298599,
      "learning_rate": 4.993193323067027e-05,
      "loss": 0.8082,
      "num_input_tokens_seen": 1680096,
      "step": 55
    },
    {
      "epoch": 0.025624599615631006,
      "grad_norm": 0.05906912684440613,
      "learning_rate": 4.9919001909837625e-05,
      "loss": 0.7103,
      "num_input_tokens_seen": 1853312,
      "step": 60
    },
    {
      "epoch": 0.02775998291693359,
      "grad_norm": 0.05951849743723869,
      "learning_rate": 4.990494865854116e-05,
      "loss": 0.6723,
      "num_input_tokens_seen": 2007040,
      "step": 65
    },
    {
      "epoch": 0.029895366218236172,
      "grad_norm": 0.05470618978142738,
      "learning_rate": 4.9889774109501675e-05,
      "loss": 0.6473,
      "num_input_tokens_seen": 2127456,
      "step": 70
    },
    {
      "epoch": 0.032030749519538756,
      "grad_norm": 0.052827127277851105,
      "learning_rate": 4.987347894592426e-05,
      "loss": 0.8123,
      "num_input_tokens_seen": 2276992,
      "step": 75
    },
    {
      "epoch": 0.03416613282084134,
      "grad_norm": 0.04660721495747566,
      "learning_rate": 4.985606390146752e-05,
      "loss": 0.8803,
      "num_input_tokens_seen": 2463776,
      "step": 80
    },
    {
      "epoch": 0.036301516122143926,
      "grad_norm": 0.047477494925260544,
      "learning_rate": 4.983752976021058e-05,
      "loss": 0.7062,
      "num_input_tokens_seen": 2619296,
      "step": 85
    },
    {
      "epoch": 0.03843689942344651,
      "grad_norm": 0.050083596259355545,
      "learning_rate": 4.981787735661774e-05,
      "loss": 0.7329,
      "num_input_tokens_seen": 2807456,
      "step": 90
    },
    {
      "epoch": 0.040572282724749095,
      "grad_norm": 0.07300040125846863,
      "learning_rate": 4.9797107575500934e-05,
      "loss": 0.708,
      "num_input_tokens_seen": 2982592,
      "step": 95
    },
    {
      "epoch": 0.04270766602605167,
      "grad_norm": 0.0638803094625473,
      "learning_rate": 4.977522135197988e-05,
      "loss": 0.686,
      "num_input_tokens_seen": 3118176,
      "step": 100
    },
    {
      "epoch": 0.04484304932735426,
      "grad_norm": 0.058782994747161865,
      "learning_rate": 4.975221967144e-05,
      "loss": 0.8154,
      "num_input_tokens_seen": 3318528,
      "step": 105
    },
    {
      "epoch": 0.04697843262865684,
      "grad_norm": 0.06658528745174408,
      "learning_rate": 4.972810356948803e-05,
      "loss": 0.7786,
      "num_input_tokens_seen": 3446496,
      "step": 110
    },
    {
      "epoch": 0.04911381592995943,
      "grad_norm": 0.05051959306001663,
      "learning_rate": 4.9702874131905375e-05,
      "loss": 0.848,
      "num_input_tokens_seen": 3633536,
      "step": 115
    },
    {
      "epoch": 0.05124919923126201,
      "grad_norm": 0.055378127843141556,
      "learning_rate": 4.967653249459928e-05,
      "loss": 0.6415,
      "num_input_tokens_seen": 3797920,
      "step": 120
    },
    {
      "epoch": 0.053384582532564596,
      "grad_norm": 0.053418923169374466,
      "learning_rate": 4.9649079843551663e-05,
      "loss": 0.885,
      "num_input_tokens_seen": 3972288,
      "step": 125
    },
    {
      "epoch": 0.05551996583386718,
      "grad_norm": 0.042733389884233475,
      "learning_rate": 4.9620517414765685e-05,
      "loss": 0.7594,
      "num_input_tokens_seen": 4155520,
      "step": 130
    },
    {
      "epoch": 0.057655349135169766,
      "grad_norm": 0.05269218608736992,
      "learning_rate": 4.959084649421016e-05,
      "loss": 0.5479,
      "num_input_tokens_seen": 4319488,
      "step": 135
    },
    {
      "epoch": 0.059790732436472344,
      "grad_norm": 0.06381044536828995,
      "learning_rate": 4.9560068417761595e-05,
      "loss": 0.7997,
      "num_input_tokens_seen": 4473120,
      "step": 140
    },
    {
      "epoch": 0.06192611573777493,
      "grad_norm": 0.09567151963710785,
      "learning_rate": 4.952818457114411e-05,
      "loss": 0.7524,
      "num_input_tokens_seen": 4612768,
      "step": 145
    },
    {
      "epoch": 0.06406149903907751,
      "grad_norm": 0.054249729961156845,
      "learning_rate": 4.9495196389866995e-05,
      "loss": 0.7524,
      "num_input_tokens_seen": 4747680,
      "step": 150
    },
    {
      "epoch": 0.0661968823403801,
      "grad_norm": 0.05188503488898277,
      "learning_rate": 4.946110535916009e-05,
      "loss": 0.7265,
      "num_input_tokens_seen": 4943264,
      "step": 155
    },
    {
      "epoch": 0.06833226564168268,
      "grad_norm": 0.05143864452838898,
      "learning_rate": 4.942591301390695e-05,
      "loss": 0.6025,
      "num_input_tokens_seen": 5085408,
      "step": 160
    },
    {
      "epoch": 0.07046764894298527,
      "grad_norm": 0.05344095826148987,
      "learning_rate": 4.9389620938575695e-05,
      "loss": 0.6499,
      "num_input_tokens_seen": 5256288,
      "step": 165
    },
    {
      "epoch": 0.07260303224428785,
      "grad_norm": 0.06509006023406982,
      "learning_rate": 4.935223076714769e-05,
      "loss": 0.723,
      "num_input_tokens_seen": 5437312,
      "step": 170
    },
    {
      "epoch": 0.07473841554559044,
      "grad_norm": 0.06998533755540848,
      "learning_rate": 4.9313744183044e-05,
      "loss": 0.7537,
      "num_input_tokens_seen": 5568800,
      "step": 175
    },
    {
      "epoch": 0.07687379884689302,
      "grad_norm": 0.05648740753531456,
      "learning_rate": 4.927416291904955e-05,
      "loss": 0.7523,
      "num_input_tokens_seen": 5721568,
      "step": 180
    },
    {
      "epoch": 0.0790091821481956,
      "grad_norm": 0.057639315724372864,
      "learning_rate": 4.9233488757235145e-05,
      "loss": 0.6245,
      "num_input_tokens_seen": 5874336,
      "step": 185
    },
    {
      "epoch": 0.08114456544949819,
      "grad_norm": 0.06750814616680145,
      "learning_rate": 4.919172352887725e-05,
      "loss": 0.7379,
      "num_input_tokens_seen": 6031904,
      "step": 190
    },
    {
      "epoch": 0.08327994875080078,
      "grad_norm": 0.0563182458281517,
      "learning_rate": 4.914886911437547e-05,
      "loss": 0.6706,
      "num_input_tokens_seen": 6201152,
      "step": 195
    },
    {
      "epoch": 0.08541533205210335,
      "grad_norm": 0.05895683914422989,
      "learning_rate": 4.910492744316799e-05,
      "loss": 0.6494,
      "num_input_tokens_seen": 6356416,
      "step": 200
    },
    {
      "epoch": 0.08755071535340593,
      "grad_norm": 0.05769870802760124,
      "learning_rate": 4.905990049364461e-05,
      "loss": 0.7371,
      "num_input_tokens_seen": 6502272,
      "step": 205
    },
    {
      "epoch": 0.08968609865470852,
      "grad_norm": 0.06006137281656265,
      "learning_rate": 4.9013790293057714e-05,
      "loss": 0.7105,
      "num_input_tokens_seen": 6662432,
      "step": 210
    },
    {
      "epoch": 0.0918214819560111,
      "grad_norm": 0.050949618220329285,
      "learning_rate": 4.8966598917431036e-05,
      "loss": 0.6886,
      "num_input_tokens_seen": 6826048,
      "step": 215
    },
    {
      "epoch": 0.09395686525731368,
      "grad_norm": 0.06648615002632141,
      "learning_rate": 4.8918328491466106e-05,
      "loss": 0.6842,
      "num_input_tokens_seen": 6992928,
      "step": 220
    },
    {
      "epoch": 0.09609224855861627,
      "grad_norm": 0.05005276948213577,
      "learning_rate": 4.886898118844666e-05,
      "loss": 0.733,
      "num_input_tokens_seen": 7128704,
      "step": 225
    },
    {
      "epoch": 0.09822763185991885,
      "grad_norm": 0.09183106571435928,
      "learning_rate": 4.881855923014076e-05,
      "loss": 0.5728,
      "num_input_tokens_seen": 7266464,
      "step": 230
    },
    {
      "epoch": 0.10036301516122144,
      "grad_norm": 0.05412563309073448,
      "learning_rate": 4.876706488670077e-05,
      "loss": 0.6706,
      "num_input_tokens_seen": 7430912,
      "step": 235
    },
    {
      "epoch": 0.10249839846252402,
      "grad_norm": 0.07257558405399323,
      "learning_rate": 4.871450047656114e-05,
      "loss": 0.6395,
      "num_input_tokens_seen": 7560288,
      "step": 240
    },
    {
      "epoch": 0.10463378176382661,
      "grad_norm": 0.058103807270526886,
      "learning_rate": 4.866086836633403e-05,
      "loss": 0.6815,
      "num_input_tokens_seen": 7708480,
      "step": 245
    },
    {
      "epoch": 0.10676916506512919,
      "grad_norm": 0.0581187903881073,
      "learning_rate": 4.860617097070278e-05,
      "loss": 0.8152,
      "num_input_tokens_seen": 7871168,
      "step": 250
    },
    {
      "epoch": 0.10890454836643178,
      "grad_norm": 0.1157078966498375,
      "learning_rate": 4.855041075231314e-05,
      "loss": 0.7163,
      "num_input_tokens_seen": 8011264,
      "step": 255
    },
    {
      "epoch": 0.11103993166773436,
      "grad_norm": 0.06340761482715607,
      "learning_rate": 4.8493590221662436e-05,
      "loss": 0.73,
      "num_input_tokens_seen": 8152320,
      "step": 260
    },
    {
      "epoch": 0.11317531496903695,
      "grad_norm": 0.15619327127933502,
      "learning_rate": 4.843571193698653e-05,
      "loss": 0.8089,
      "num_input_tokens_seen": 8293312,
      "step": 265
    },
    {
      "epoch": 0.11531069827033953,
      "grad_norm": 0.07074210792779922,
      "learning_rate": 4.837677850414464e-05,
      "loss": 0.6812,
      "num_input_tokens_seen": 8472896,
      "step": 270
    },
    {
      "epoch": 0.11744608157164212,
      "grad_norm": 0.07977280020713806,
      "learning_rate": 4.8316792576502004e-05,
      "loss": 0.7619,
      "num_input_tokens_seen": 8643552,
      "step": 275
    },
    {
      "epoch": 0.11958146487294469,
      "grad_norm": 0.07114008814096451,
      "learning_rate": 4.825575685481045e-05,
      "loss": 0.7461,
      "num_input_tokens_seen": 8804736,
      "step": 280
    },
    {
      "epoch": 0.12171684817424727,
      "grad_norm": 0.06181742250919342,
      "learning_rate": 4.819367408708676e-05,
      "loss": 0.8225,
      "num_input_tokens_seen": 8951648,
      "step": 285
    },
    {
      "epoch": 0.12385223147554986,
      "grad_norm": 0.07226210832595825,
      "learning_rate": 4.8130547068488954e-05,
      "loss": 0.7792,
      "num_input_tokens_seen": 9097312,
      "step": 290
    },
    {
      "epoch": 0.12598761477685244,
      "grad_norm": 0.0596122108399868,
      "learning_rate": 4.806637864119049e-05,
      "loss": 0.8316,
      "num_input_tokens_seen": 9234688,
      "step": 295
    },
    {
      "epoch": 0.12812299807815503,
      "grad_norm": 0.057102903723716736,
      "learning_rate": 4.800117169425223e-05,
      "loss": 0.7616,
      "num_input_tokens_seen": 9410528,
      "step": 300
    },
    {
      "epoch": 0.1302583813794576,
      "grad_norm": 0.06964308768510818,
      "learning_rate": 4.79349291634924e-05,
      "loss": 0.7982,
      "num_input_tokens_seen": 9593280,
      "step": 305
    },
    {
      "epoch": 0.1323937646807602,
      "grad_norm": 0.06177399307489395,
      "learning_rate": 4.786765403135444e-05,
      "loss": 0.7515,
      "num_input_tokens_seen": 9769824,
      "step": 310
    },
    {
      "epoch": 0.13452914798206278,
      "grad_norm": 0.08495648950338364,
      "learning_rate": 4.779934932677265e-05,
      "loss": 0.6677,
      "num_input_tokens_seen": 9921536,
      "step": 315
    },
    {
      "epoch": 0.13666453128336536,
      "grad_norm": 0.060477472841739655,
      "learning_rate": 4.77300181250359e-05,
      "loss": 0.7559,
      "num_input_tokens_seen": 10089088,
      "step": 320
    },
    {
      "epoch": 0.13879991458466795,
      "grad_norm": 0.05845116078853607,
      "learning_rate": 4.7659663547649124e-05,
      "loss": 0.7337,
      "num_input_tokens_seen": 10282272,
      "step": 325
    },
    {
      "epoch": 0.14093529788597053,
      "grad_norm": 0.06424874067306519,
      "learning_rate": 4.758828876219278e-05,
      "loss": 0.8009,
      "num_input_tokens_seen": 10450848,
      "step": 330
    },
    {
      "epoch": 0.14307068118727312,
      "grad_norm": 0.06901010870933533,
      "learning_rate": 4.751589698218026e-05,
      "loss": 0.7203,
      "num_input_tokens_seen": 10617664,
      "step": 335
    },
    {
      "epoch": 0.1452060644885757,
      "grad_norm": 0.07261721789836884,
      "learning_rate": 4.744249146691317e-05,
      "loss": 0.5286,
      "num_input_tokens_seen": 10794880,
      "step": 340
    },
    {
      "epoch": 0.1473414477898783,
      "grad_norm": 0.07624544203281403,
      "learning_rate": 4.736807552133464e-05,
      "loss": 0.6662,
      "num_input_tokens_seen": 10956960,
      "step": 345
    },
    {
      "epoch": 0.14947683109118087,
      "grad_norm": 0.06444702297449112,
      "learning_rate": 4.729265249588046e-05,
      "loss": 0.6554,
      "num_input_tokens_seen": 11105440,
      "step": 350
    },
    {
      "epoch": 0.15161221439248346,
      "grad_norm": 0.08421933650970459,
      "learning_rate": 4.721622578632832e-05,
      "loss": 0.7981,
      "num_input_tokens_seen": 11248448,
      "step": 355
    },
    {
      "epoch": 0.15374759769378604,
      "grad_norm": 0.060541413724422455,
      "learning_rate": 4.71387988336448e-05,
      "loss": 0.5976,
      "num_input_tokens_seen": 11404928,
      "step": 360
    },
    {
      "epoch": 0.15588298099508863,
      "grad_norm": 0.07518257945775986,
      "learning_rate": 4.706037512383058e-05,
      "loss": 0.7783,
      "num_input_tokens_seen": 11586880,
      "step": 365
    },
    {
      "epoch": 0.1580183642963912,
      "grad_norm": 0.051343463361263275,
      "learning_rate": 4.6980958187763394e-05,
      "loss": 0.6556,
      "num_input_tokens_seen": 11746368,
      "step": 370
    },
    {
      "epoch": 0.1601537475976938,
      "grad_norm": 0.08890614658594131,
      "learning_rate": 4.690055160103908e-05,
      "loss": 0.5951,
      "num_input_tokens_seen": 11876928,
      "step": 375
    },
    {
      "epoch": 0.16228913089899638,
      "grad_norm": 0.049633271992206573,
      "learning_rate": 4.681915898381064e-05,
      "loss": 0.7438,
      "num_input_tokens_seen": 12070656,
      "step": 380
    },
    {
      "epoch": 0.16442451420029897,
      "grad_norm": 0.06845410168170929,
      "learning_rate": 4.67367840006252e-05,
      "loss": 0.861,
      "num_input_tokens_seen": 12215104,
      "step": 385
    },
    {
      "epoch": 0.16655989750160155,
      "grad_norm": 0.09894266724586487,
      "learning_rate": 4.6653430360259015e-05,
      "loss": 0.597,
      "num_input_tokens_seen": 12367616,
      "step": 390
    },
    {
      "epoch": 0.16869528080290414,
      "grad_norm": 0.07006240636110306,
      "learning_rate": 4.656910181555055e-05,
      "loss": 0.6786,
      "num_input_tokens_seen": 12550368,
      "step": 395
    },
    {
      "epoch": 0.1708306641042067,
      "grad_norm": 0.08737102895975113,
      "learning_rate": 4.648380216323145e-05,
      "loss": 0.6539,
      "num_input_tokens_seen": 12693248,
      "step": 400
    },
    {
      "epoch": 0.17296604740550928,
      "grad_norm": 0.08132334798574448,
      "learning_rate": 4.639753524375564e-05,
      "loss": 0.8733,
      "num_input_tokens_seen": 12856832,
      "step": 405
    },
    {
      "epoch": 0.17510143070681186,
      "grad_norm": 0.061612244695425034,
      "learning_rate": 4.631030494112638e-05,
      "loss": 0.636,
      "num_input_tokens_seen": 13028352,
      "step": 410
    },
    {
      "epoch": 0.17723681400811445,
      "grad_norm": 0.07655072212219238,
      "learning_rate": 4.622211518272144e-05,
      "loss": 0.7299,
      "num_input_tokens_seen": 13163616,
      "step": 415
    },
    {
      "epoch": 0.17937219730941703,
      "grad_norm": 0.06312955170869827,
      "learning_rate": 4.613296993911623e-05,
      "loss": 0.5954,
      "num_input_tokens_seen": 13336608,
      "step": 420
    },
    {
      "epoch": 0.18150758061071962,
      "grad_norm": 0.07038469612598419,
      "learning_rate": 4.604287322390509e-05,
      "loss": 0.8243,
      "num_input_tokens_seen": 13505408,
      "step": 425
    },
    {
      "epoch": 0.1836429639120222,
      "grad_norm": 0.0707494243979454,
      "learning_rate": 4.59518290935205e-05,
      "loss": 0.5552,
      "num_input_tokens_seen": 13642592,
      "step": 430
    },
    {
      "epoch": 0.18577834721332478,
      "grad_norm": 0.06867733597755432,
      "learning_rate": 4.5859841647050565e-05,
      "loss": 0.6857,
      "num_input_tokens_seen": 13790976,
      "step": 435
    },
    {
      "epoch": 0.18791373051462737,
      "grad_norm": 0.06942213326692581,
      "learning_rate": 4.576691502605434e-05,
      "loss": 0.6743,
      "num_input_tokens_seen": 13956224,
      "step": 440
    },
    {
      "epoch": 0.19004911381592995,
      "grad_norm": 0.06548978388309479,
      "learning_rate": 4.5673053414375436e-05,
      "loss": 0.579,
      "num_input_tokens_seen": 14115296,
      "step": 445
    },
    {
      "epoch": 0.19218449711723254,
      "grad_norm": 0.07146024703979492,
      "learning_rate": 4.557826103795364e-05,
      "loss": 0.8965,
      "num_input_tokens_seen": 14267168,
      "step": 450
    },
    {
      "epoch": 0.19431988041853512,
      "grad_norm": 0.10247491300106049,
      "learning_rate": 4.548254216463465e-05,
      "loss": 0.8137,
      "num_input_tokens_seen": 14413312,
      "step": 455
    },
    {
      "epoch": 0.1964552637198377,
      "grad_norm": 0.08518624305725098,
      "learning_rate": 4.538590110397789e-05,
      "loss": 0.617,
      "num_input_tokens_seen": 14550880,
      "step": 460
    },
    {
      "epoch": 0.1985906470211403,
      "grad_norm": 0.1248399019241333,
      "learning_rate": 4.528834220706253e-05,
      "loss": 0.9175,
      "num_input_tokens_seen": 14691712,
      "step": 465
    },
    {
      "epoch": 0.20072603032244288,
      "grad_norm": 0.06742729991674423,
      "learning_rate": 4.518986986629157e-05,
      "loss": 0.7633,
      "num_input_tokens_seen": 14861408,
      "step": 470
    },
    {
      "epoch": 0.20286141362374546,
      "grad_norm": 0.09116410464048386,
      "learning_rate": 4.509048851519404e-05,
      "loss": 0.6935,
      "num_input_tokens_seen": 15003328,
      "step": 475
    },
    {
      "epoch": 0.20499679692504805,
      "grad_norm": 0.08975204825401306,
      "learning_rate": 4.499020262822547e-05,
      "loss": 0.6322,
      "num_input_tokens_seen": 15125792,
      "step": 480
    },
    {
      "epoch": 0.20713218022635063,
      "grad_norm": 0.10138271003961563,
      "learning_rate": 4.4889016720566355e-05,
      "loss": 0.9118,
      "num_input_tokens_seen": 15301856,
      "step": 485
    },
    {
      "epoch": 0.20926756352765322,
      "grad_norm": 0.07376892864704132,
      "learning_rate": 4.478693534791893e-05,
      "loss": 0.6331,
      "num_input_tokens_seen": 15487488,
      "step": 490
    },
    {
      "epoch": 0.2114029468289558,
      "grad_norm": 0.07480096817016602,
      "learning_rate": 4.4683963106302e-05,
      "loss": 0.7326,
      "num_input_tokens_seen": 15657312,
      "step": 495
    },
    {
      "epoch": 0.21353833013025839,
      "grad_norm": 0.06383755058050156,
      "learning_rate": 4.458010463184405e-05,
      "loss": 0.6806,
      "num_input_tokens_seen": 15850912,
      "step": 500
    },
    {
      "epoch": 0.21567371343156097,
      "grad_norm": 0.06868927925825119,
      "learning_rate": 4.4475364600574535e-05,
      "loss": 0.7017,
      "num_input_tokens_seen": 15986400,
      "step": 505
    },
    {
      "epoch": 0.21780909673286356,
      "grad_norm": 0.09151501208543777,
      "learning_rate": 4.43697477282133e-05,
      "loss": 0.6438,
      "num_input_tokens_seen": 16144960,
      "step": 510
    },
    {
      "epoch": 0.21994448003416614,
      "grad_norm": 0.09519924968481064,
      "learning_rate": 4.4263258769958274e-05,
      "loss": 0.757,
      "num_input_tokens_seen": 16289856,
      "step": 515
    },
    {
      "epoch": 0.22207986333546872,
      "grad_norm": 0.09690000116825104,
      "learning_rate": 4.415590252027141e-05,
      "loss": 0.6478,
      "num_input_tokens_seen": 16439328,
      "step": 520
    },
    {
      "epoch": 0.2242152466367713,
      "grad_norm": 0.06739991158246994,
      "learning_rate": 4.404768381266279e-05,
      "loss": 0.7572,
      "num_input_tokens_seen": 16575552,
      "step": 525
    },
    {
      "epoch": 0.2263506299380739,
      "grad_norm": 0.08569491654634476,
      "learning_rate": 4.393860751947302e-05,
      "loss": 0.7073,
      "num_input_tokens_seen": 16754016,
      "step": 530
    },
    {
      "epoch": 0.22848601323937648,
      "grad_norm": 0.07734335213899612,
      "learning_rate": 4.382867855165386e-05,
      "loss": 0.6275,
      "num_input_tokens_seen": 16897248,
      "step": 535
    },
    {
      "epoch": 0.23062139654067906,
      "grad_norm": 0.10210688412189484,
      "learning_rate": 4.371790185854709e-05,
      "loss": 0.6937,
      "num_input_tokens_seen": 17077792,
      "step": 540
    },
    {
      "epoch": 0.23275677984198165,
      "grad_norm": 0.08407072722911835,
      "learning_rate": 4.360628242766175e-05,
      "loss": 0.7242,
      "num_input_tokens_seen": 17232480,
      "step": 545
    },
    {
      "epoch": 0.23489216314328423,
      "grad_norm": 0.07300622761249542,
      "learning_rate": 4.3493825284449515e-05,
      "loss": 0.6462,
      "num_input_tokens_seen": 17371008,
      "step": 550
    },
    {
      "epoch": 0.2370275464445868,
      "grad_norm": 0.0677730068564415,
      "learning_rate": 4.338053549207844e-05,
      "loss": 0.6891,
      "num_input_tokens_seen": 17502016,
      "step": 555
    },
    {
      "epoch": 0.23916292974588937,
      "grad_norm": 0.07456525415182114,
      "learning_rate": 4.326641815120505e-05,
      "loss": 0.6293,
      "num_input_tokens_seen": 17661632,
      "step": 560
    },
    {
      "epoch": 0.24129831304719196,
      "grad_norm": 0.0730578750371933,
      "learning_rate": 4.315147839974464e-05,
      "loss": 0.7189,
      "num_input_tokens_seen": 17781440,
      "step": 565
    },
    {
      "epoch": 0.24343369634849454,
      "grad_norm": 0.10547315329313278,
      "learning_rate": 4.303572141263997e-05,
      "loss": 0.6933,
      "num_input_tokens_seen": 17983840,
      "step": 570
    },
    {
      "epoch": 0.24556907964979713,
      "grad_norm": 0.08231621235609055,
      "learning_rate": 4.2919152401628284e-05,
      "loss": 0.6973,
      "num_input_tokens_seen": 18166592,
      "step": 575
    },
    {
      "epoch": 0.2477044629510997,
      "grad_norm": 0.0755239874124527,
      "learning_rate": 4.2801776615006644e-05,
      "loss": 0.5742,
      "num_input_tokens_seen": 18302912,
      "step": 580
    },
    {
      "epoch": 0.2498398462524023,
      "grad_norm": 0.0680757462978363,
      "learning_rate": 4.2683599337395655e-05,
      "loss": 0.6087,
      "num_input_tokens_seen": 18469344,
      "step": 585
    },
    {
      "epoch": 0.2519752295537049,
      "grad_norm": 0.07045536488294601,
      "learning_rate": 4.2564625889501496e-05,
      "loss": 0.6595,
      "num_input_tokens_seen": 18599104,
      "step": 590
    },
    {
      "epoch": 0.25411061285500747,
      "grad_norm": 0.09744574129581451,
      "learning_rate": 4.2444861627876444e-05,
      "loss": 0.7353,
      "num_input_tokens_seen": 18785696,
      "step": 595
    },
    {
      "epoch": 0.25624599615631005,
      "grad_norm": 0.07259754836559296,
      "learning_rate": 4.2324311944677585e-05,
      "loss": 0.8322,
      "num_input_tokens_seen": 18972224,
      "step": 600
    },
    {
      "epoch": 0.25838137945761264,
      "grad_norm": 0.08454828709363937,
      "learning_rate": 4.220298226742415e-05,
      "loss": 0.6534,
      "num_input_tokens_seen": 19107968,
      "step": 605
    },
    {
      "epoch": 0.2605167627589152,
      "grad_norm": 0.07386191189289093,
      "learning_rate": 4.208087805875314e-05,
      "loss": 0.7441,
      "num_input_tokens_seen": 19295072,
      "step": 610
    },
    {
      "epoch": 0.2626521460602178,
      "grad_norm": 0.07102880626916885,
      "learning_rate": 4.195800481617328e-05,
      "loss": 0.816,
      "num_input_tokens_seen": 19440384,
      "step": 615
    },
    {
      "epoch": 0.2647875293615204,
      "grad_norm": 0.07378337532281876,
      "learning_rate": 4.183436807181765e-05,
      "loss": 0.7341,
      "num_input_tokens_seen": 19619680,
      "step": 620
    },
    {
      "epoch": 0.266922912662823,
      "grad_norm": 0.09023339301347733,
      "learning_rate": 4.17099733921945e-05,
      "loss": 0.6835,
      "num_input_tokens_seen": 19759200,
      "step": 625
    },
    {
      "epoch": 0.26905829596412556,
      "grad_norm": 0.07276886701583862,
      "learning_rate": 4.158482637793667e-05,
      "loss": 0.7359,
      "num_input_tokens_seen": 19924448,
      "step": 630
    },
    {
      "epoch": 0.27119367926542814,
      "grad_norm": 0.09461617469787598,
      "learning_rate": 4.145893266354944e-05,
      "loss": 0.6531,
      "num_input_tokens_seen": 20077888,
      "step": 635
    },
    {
      "epoch": 0.27332906256673073,
      "grad_norm": 0.10658084601163864,
      "learning_rate": 4.133229791715685e-05,
      "loss": 0.5728,
      "num_input_tokens_seen": 20223296,
      "step": 640
    },
    {
      "epoch": 0.2754644458680333,
      "grad_norm": 0.07908082008361816,
      "learning_rate": 4.1204927840246455e-05,
      "loss": 0.7355,
      "num_input_tokens_seen": 20352928,
      "step": 645
    },
    {
      "epoch": 0.2775998291693359,
      "grad_norm": 0.07296961545944214,
      "learning_rate": 4.1076828167412683e-05,
      "loss": 0.6645,
      "num_input_tokens_seen": 20511232,
      "step": 650
    },
    {
      "epoch": 0.2797352124706385,
      "grad_norm": 0.0914238691329956,
      "learning_rate": 4.0948004666098625e-05,
      "loss": 0.5866,
      "num_input_tokens_seen": 20684032,
      "step": 655
    },
    {
      "epoch": 0.28187059577194107,
      "grad_norm": 0.08596916496753693,
      "learning_rate": 4.081846313633637e-05,
      "loss": 0.6235,
      "num_input_tokens_seen": 20826176,
      "step": 660
    },
    {
      "epoch": 0.28400597907324365,
      "grad_norm": 0.08031884580850601,
      "learning_rate": 4.068820941048587e-05,
      "loss": 0.6974,
      "num_input_tokens_seen": 21007264,
      "step": 665
    },
    {
      "epoch": 0.28614136237454624,
      "grad_norm": 0.09857963025569916,
      "learning_rate": 4.0557249352972316e-05,
      "loss": 0.6629,
      "num_input_tokens_seen": 21145024,
      "step": 670
    },
    {
      "epoch": 0.2882767456758488,
      "grad_norm": 0.07686656713485718,
      "learning_rate": 4.0425588860022166e-05,
      "loss": 0.7321,
      "num_input_tokens_seen": 21295104,
      "step": 675
    },
    {
      "epoch": 0.2904121289771514,
      "grad_norm": 0.0786074548959732,
      "learning_rate": 4.029323385939763e-05,
      "loss": 0.6325,
      "num_input_tokens_seen": 21440256,
      "step": 680
    },
    {
      "epoch": 0.292547512278454,
      "grad_norm": 0.0865710899233818,
      "learning_rate": 4.0160190310129806e-05,
      "loss": 0.6882,
      "num_input_tokens_seen": 21592768,
      "step": 685
    },
    {
      "epoch": 0.2946828955797566,
      "grad_norm": 0.0743151381611824,
      "learning_rate": 4.0026464202250375e-05,
      "loss": 0.659,
      "num_input_tokens_seen": 21763360,
      "step": 690
    },
    {
      "epoch": 0.29681827888105916,
      "grad_norm": 0.060234300792217255,
      "learning_rate": 3.989206155652192e-05,
      "loss": 0.6757,
      "num_input_tokens_seen": 21917792,
      "step": 695
    },
    {
      "epoch": 0.29895366218236175,
      "grad_norm": 0.09439852088689804,
      "learning_rate": 3.975698842416684e-05,
      "loss": 0.6238,
      "num_input_tokens_seen": 22052384,
      "step": 700
    },
    {
      "epoch": 0.30108904548366433,
      "grad_norm": 0.07870359718799591,
      "learning_rate": 3.962125088659492e-05,
      "loss": 0.688,
      "num_input_tokens_seen": 22225568,
      "step": 705
    },
    {
      "epoch": 0.3032244287849669,
      "grad_norm": 0.08472148329019547,
      "learning_rate": 3.948485505512953e-05,
      "loss": 0.7123,
      "num_input_tokens_seen": 22388160,
      "step": 710
    },
    {
      "epoch": 0.3053598120862695,
      "grad_norm": 0.07081770896911621,
      "learning_rate": 3.9347807070732444e-05,
      "loss": 0.6638,
      "num_input_tokens_seen": 22579936,
      "step": 715
    },
    {
      "epoch": 0.3074951953875721,
      "grad_norm": 0.07737255096435547,
      "learning_rate": 3.921011310372739e-05,
      "loss": 0.7064,
      "num_input_tokens_seen": 22730048,
      "step": 720
    },
    {
      "epoch": 0.30963057868887467,
      "grad_norm": 0.0714409351348877,
      "learning_rate": 3.907177935352223e-05,
      "loss": 0.5651,
      "num_input_tokens_seen": 22911168,
      "step": 725
    },
    {
      "epoch": 0.31176596199017725,
      "grad_norm": 0.06933268904685974,
      "learning_rate": 3.893281204832984e-05,
      "loss": 0.6695,
      "num_input_tokens_seen": 23088096,
      "step": 730
    },
    {
      "epoch": 0.31390134529147984,
      "grad_norm": 0.10002848505973816,
      "learning_rate": 3.87932174448877e-05,
      "loss": 0.5989,
      "num_input_tokens_seen": 23243616,
      "step": 735
    },
    {
      "epoch": 0.3160367285927824,
      "grad_norm": 0.07605909556150436,
      "learning_rate": 3.8653001828176185e-05,
      "loss": 0.5707,
      "num_input_tokens_seen": 23402240,
      "step": 740
    },
    {
      "epoch": 0.318172111894085,
      "grad_norm": 0.09124422818422318,
      "learning_rate": 3.8512171511135616e-05,
      "loss": 0.6727,
      "num_input_tokens_seen": 23568096,
      "step": 745
    },
    {
      "epoch": 0.3203074951953876,
      "grad_norm": 0.08254604786634445,
      "learning_rate": 3.8370732834382025e-05,
      "loss": 0.7122,
      "num_input_tokens_seen": 23723968,
      "step": 750
    },
    {
      "epoch": 0.3224428784966902,
      "grad_norm": 0.07256225496530533,
      "learning_rate": 3.822869216592167e-05,
      "loss": 0.6667,
      "num_input_tokens_seen": 23882016,
      "step": 755
    },
    {
      "epoch": 0.32457826179799276,
      "grad_norm": 0.07740245759487152,
      "learning_rate": 3.8086055900864356e-05,
      "loss": 0.7896,
      "num_input_tokens_seen": 24037088,
      "step": 760
    },
    {
      "epoch": 0.32671364509929535,
      "grad_norm": 0.0661853477358818,
      "learning_rate": 3.794283046113546e-05,
      "loss": 0.6208,
      "num_input_tokens_seen": 24180032,
      "step": 765
    },
    {
      "epoch": 0.32884902840059793,
      "grad_norm": 0.1970444917678833,
      "learning_rate": 3.7799022295186823e-05,
      "loss": 0.6193,
      "num_input_tokens_seen": 24363168,
      "step": 770
    },
    {
      "epoch": 0.3309844117019005,
      "grad_norm": 0.0673363208770752,
      "learning_rate": 3.765463787770645e-05,
      "loss": 0.6024,
      "num_input_tokens_seen": 24522112,
      "step": 775
    },
    {
      "epoch": 0.3331197950032031,
      "grad_norm": 0.11159452795982361,
      "learning_rate": 3.750968370932694e-05,
      "loss": 0.7026,
      "num_input_tokens_seen": 24694048,
      "step": 780
    },
    {
      "epoch": 0.3352551783045057,
      "grad_norm": 0.0691002830862999,
      "learning_rate": 3.736416631633286e-05,
      "loss": 0.6094,
      "num_input_tokens_seen": 24847616,
      "step": 785
    },
    {
      "epoch": 0.33739056160580827,
      "grad_norm": 0.10651443898677826,
      "learning_rate": 3.721809225036688e-05,
      "loss": 0.6167,
      "num_input_tokens_seen": 24992096,
      "step": 790
    },
    {
      "epoch": 0.3395259449071108,
      "grad_norm": 0.1445714682340622,
      "learning_rate": 3.7071468088134806e-05,
      "loss": 0.6861,
      "num_input_tokens_seen": 25145792,
      "step": 795
    },
    {
      "epoch": 0.3416613282084134,
      "grad_norm": 0.10462247580289841,
      "learning_rate": 3.692430043110947e-05,
      "loss": 0.8109,
      "num_input_tokens_seen": 25316896,
      "step": 800
    },
    {
      "epoch": 0.34379671150971597,
      "grad_norm": 0.09085245430469513,
      "learning_rate": 3.677659590523354e-05,
      "loss": 0.6796,
      "num_input_tokens_seen": 25452608,
      "step": 805
    },
    {
      "epoch": 0.34593209481101855,
      "grad_norm": 0.07898429781198502,
      "learning_rate": 3.662836116062117e-05,
      "loss": 0.8018,
      "num_input_tokens_seen": 25597056,
      "step": 810
    },
    {
      "epoch": 0.34806747811232114,
      "grad_norm": 0.07476533204317093,
      "learning_rate": 3.647960287125859e-05,
      "loss": 0.7318,
      "num_input_tokens_seen": 25764224,
      "step": 815
    },
    {
      "epoch": 0.3502028614136237,
      "grad_norm": 0.09074775129556656,
      "learning_rate": 3.6330327734703626e-05,
      "loss": 0.6615,
      "num_input_tokens_seen": 25893824,
      "step": 820
    },
    {
      "epoch": 0.3523382447149263,
      "grad_norm": 0.07897800952196121,
      "learning_rate": 3.61805424717842e-05,
      "loss": 0.6466,
      "num_input_tokens_seen": 26034304,
      "step": 825
    },
    {
      "epoch": 0.3544736280162289,
      "grad_norm": 0.07650279998779297,
      "learning_rate": 3.603025382629565e-05,
      "loss": 0.7432,
      "num_input_tokens_seen": 26187712,
      "step": 830
    },
    {
      "epoch": 0.3566090113175315,
      "grad_norm": 0.09015358239412308,
      "learning_rate": 3.58794685646972e-05,
      "loss": 0.6275,
      "num_input_tokens_seen": 26338080,
      "step": 835
    },
    {
      "epoch": 0.35874439461883406,
      "grad_norm": 0.08263330161571503,
      "learning_rate": 3.572819347580722e-05,
      "loss": 0.6545,
      "num_input_tokens_seen": 26501440,
      "step": 840
    },
    {
      "epoch": 0.36087977792013665,
      "grad_norm": 0.06605567783117294,
      "learning_rate": 3.5576435370497655e-05,
      "loss": 0.6806,
      "num_input_tokens_seen": 26663936,
      "step": 845
    },
    {
      "epoch": 0.36301516122143923,
      "grad_norm": 0.08297718316316605,
      "learning_rate": 3.542420108138732e-05,
      "loss": 0.6517,
      "num_input_tokens_seen": 26834176,
      "step": 850
    },
    {
      "epoch": 0.3651505445227418,
      "grad_norm": 0.14128510653972626,
      "learning_rate": 3.527149746253431e-05,
      "loss": 0.7356,
      "num_input_tokens_seen": 26996928,
      "step": 855
    },
    {
      "epoch": 0.3672859278240444,
      "grad_norm": 0.09986595809459686,
      "learning_rate": 3.511833138912738e-05,
      "loss": 0.8021,
      "num_input_tokens_seen": 27162304,
      "step": 860
    },
    {
      "epoch": 0.369421311125347,
      "grad_norm": 0.0766059085726738,
      "learning_rate": 3.496470975717643e-05,
      "loss": 0.7542,
      "num_input_tokens_seen": 27319392,
      "step": 865
    },
    {
      "epoch": 0.37155669442664957,
      "grad_norm": 0.07614283263683319,
      "learning_rate": 3.4810639483202015e-05,
      "loss": 0.6407,
      "num_input_tokens_seen": 27511360,
      "step": 870
    },
    {
      "epoch": 0.37369207772795215,
      "grad_norm": 0.06801874190568924,
      "learning_rate": 3.465612750392393e-05,
      "loss": 0.7553,
      "num_input_tokens_seen": 27703488,
      "step": 875
    },
    {
      "epoch": 0.37582746102925474,
      "grad_norm": 0.09346262365579605,
      "learning_rate": 3.450118077594891e-05,
      "loss": 0.6873,
      "num_input_tokens_seen": 27866880,
      "step": 880
    },
    {
      "epoch": 0.3779628443305573,
      "grad_norm": 0.07162796705961227,
      "learning_rate": 3.434580627545743e-05,
      "loss": 0.6827,
      "num_input_tokens_seen": 28052480,
      "step": 885
    },
    {
      "epoch": 0.3800982276318599,
      "grad_norm": 0.09126775711774826,
      "learning_rate": 3.419001099788959e-05,
      "loss": 0.6143,
      "num_input_tokens_seen": 28229600,
      "step": 890
    },
    {
      "epoch": 0.3822336109331625,
      "grad_norm": 0.07642071694135666,
      "learning_rate": 3.403380195763018e-05,
      "loss": 0.5969,
      "num_input_tokens_seen": 28392992,
      "step": 895
    },
    {
      "epoch": 0.3843689942344651,
      "grad_norm": 0.08265725523233414,
      "learning_rate": 3.387718618769287e-05,
      "loss": 0.4596,
      "num_input_tokens_seen": 28569344,
      "step": 900
    },
    {
      "epoch": 0.38650437753576766,
      "grad_norm": 0.08538588136434555,
      "learning_rate": 3.372017073940355e-05,
      "loss": 0.6412,
      "num_input_tokens_seen": 28732608,
      "step": 905
    },
    {
      "epoch": 0.38863976083707025,
      "grad_norm": 0.08859037607908249,
      "learning_rate": 3.356276268208289e-05,
      "loss": 0.7309,
      "num_input_tokens_seen": 28885792,
      "step": 910
    },
    {
      "epoch": 0.39077514413837283,
      "grad_norm": 0.08217044919729233,
      "learning_rate": 3.340496910272798e-05,
      "loss": 0.5964,
      "num_input_tokens_seen": 29023008,
      "step": 915
    },
    {
      "epoch": 0.3929105274396754,
      "grad_norm": 0.0807810053229332,
      "learning_rate": 3.324679710569334e-05,
      "loss": 0.6368,
      "num_input_tokens_seen": 29167584,
      "step": 920
    },
    {
      "epoch": 0.395045910740978,
      "grad_norm": 0.07328809797763824,
      "learning_rate": 3.308825381237103e-05,
      "loss": 0.626,
      "num_input_tokens_seen": 29322720,
      "step": 925
    },
    {
      "epoch": 0.3971812940422806,
      "grad_norm": 0.09321283549070358,
      "learning_rate": 3.292934636086998e-05,
      "loss": 0.8989,
      "num_input_tokens_seen": 29487200,
      "step": 930
    },
    {
      "epoch": 0.39931667734358317,
      "grad_norm": 0.08408747613430023,
      "learning_rate": 3.2770081905694696e-05,
      "loss": 0.7116,
      "num_input_tokens_seen": 29651232,
      "step": 935
    },
    {
      "epoch": 0.40145206064488576,
      "grad_norm": 0.09342040121555328,
      "learning_rate": 3.261046761742305e-05,
      "loss": 0.7665,
      "num_input_tokens_seen": 29805216,
      "step": 940
    },
    {
      "epoch": 0.40358744394618834,
      "grad_norm": 0.07169543951749802,
      "learning_rate": 3.245051068238348e-05,
      "loss": 0.6187,
      "num_input_tokens_seen": 29967360,
      "step": 945
    },
    {
      "epoch": 0.4057228272474909,
      "grad_norm": 0.09437992423772812,
      "learning_rate": 3.229021830233149e-05,
      "loss": 0.7386,
      "num_input_tokens_seen": 30123104,
      "step": 950
    },
    {
      "epoch": 0.4078582105487935,
      "grad_norm": 0.10910359770059586,
      "learning_rate": 3.2129597694125296e-05,
      "loss": 0.7952,
      "num_input_tokens_seen": 30302240,
      "step": 955
    },
    {
      "epoch": 0.4099935938500961,
      "grad_norm": 0.10328692942857742,
      "learning_rate": 3.1968656089401e-05,
      "loss": 0.6779,
      "num_input_tokens_seen": 30445184,
      "step": 960
    },
    {
      "epoch": 0.4121289771513987,
      "grad_norm": 0.06910215318202972,
      "learning_rate": 3.180740073424693e-05,
      "loss": 0.5771,
      "num_input_tokens_seen": 30596384,
      "step": 965
    },
    {
      "epoch": 0.41426436045270126,
      "grad_norm": 0.0915064588189125,
      "learning_rate": 3.164583888887746e-05,
      "loss": 0.6306,
      "num_input_tokens_seen": 30778592,
      "step": 970
    },
    {
      "epoch": 0.41639974375400385,
      "grad_norm": 0.08599945902824402,
      "learning_rate": 3.1483977827306054e-05,
      "loss": 0.693,
      "num_input_tokens_seen": 30943360,
      "step": 975
    },
    {
      "epoch": 0.41853512705530643,
      "grad_norm": 0.08215602487325668,
      "learning_rate": 3.1321824837017875e-05,
      "loss": 0.5558,
      "num_input_tokens_seen": 31062304,
      "step": 980
    },
    {
      "epoch": 0.420670510356609,
      "grad_norm": 0.11376603692770004,
      "learning_rate": 3.1159387218641575e-05,
      "loss": 0.7323,
      "num_input_tokens_seen": 31233792,
      "step": 985
    },
    {
      "epoch": 0.4228058936579116,
      "grad_norm": 0.08716494590044022,
      "learning_rate": 3.099667228562064e-05,
      "loss": 0.6371,
      "num_input_tokens_seen": 31383616,
      "step": 990
    },
    {
      "epoch": 0.4249412769592142,
      "grad_norm": 0.05800577253103256,
      "learning_rate": 3.083368736388414e-05,
      "loss": 0.6631,
      "num_input_tokens_seen": 31559968,
      "step": 995
    },
    {
      "epoch": 0.42707666026051677,
      "grad_norm": 0.08554735034704208,
      "learning_rate": 3.067043979151687e-05,
      "loss": 0.6021,
      "num_input_tokens_seen": 31716480,
      "step": 1000
    },
    {
      "epoch": 0.42921204356181936,
      "grad_norm": 0.08346949517726898,
      "learning_rate": 3.0506936918428947e-05,
      "loss": 0.5901,
      "num_input_tokens_seen": 31861568,
      "step": 1005
    },
    {
      "epoch": 0.43134742686312194,
      "grad_norm": 0.16743424534797668,
      "learning_rate": 3.0343186106024946e-05,
      "loss": 0.5969,
      "num_input_tokens_seen": 32023008,
      "step": 1010
    },
    {
      "epoch": 0.4334828101644245,
      "grad_norm": 0.08071965724229813,
      "learning_rate": 3.01791947268724e-05,
      "loss": 0.6469,
      "num_input_tokens_seen": 32213024,
      "step": 1015
    },
    {
      "epoch": 0.4356181934657271,
      "grad_norm": 0.11266499757766724,
      "learning_rate": 3.0014970164369936e-05,
      "loss": 0.6257,
      "num_input_tokens_seen": 32382752,
      "step": 1020
    },
    {
      "epoch": 0.4377535767670297,
      "grad_norm": 0.09486319869756699,
      "learning_rate": 2.985051981241479e-05,
      "loss": 0.7496,
      "num_input_tokens_seen": 32520832,
      "step": 1025
    },
    {
      "epoch": 0.4398889600683323,
      "grad_norm": 0.1076025515794754,
      "learning_rate": 2.9685851075069954e-05,
      "loss": 0.8778,
      "num_input_tokens_seen": 32673472,
      "step": 1030
    },
    {
      "epoch": 0.44202434336963486,
      "grad_norm": 0.12652435898780823,
      "learning_rate": 2.9520971366230783e-05,
      "loss": 0.7424,
      "num_input_tokens_seen": 32850272,
      "step": 1035
    },
    {
      "epoch": 0.44415972667093745,
      "grad_norm": 0.11113929003477097,
      "learning_rate": 2.9355888109291247e-05,
      "loss": 0.8948,
      "num_input_tokens_seen": 32994432,
      "step": 1040
    },
    {
      "epoch": 0.44629510997224003,
      "grad_norm": 0.07004854828119278,
      "learning_rate": 2.9190608736809664e-05,
      "loss": 0.6752,
      "num_input_tokens_seen": 33134112,
      "step": 1045
    },
    {
      "epoch": 0.4484304932735426,
      "grad_norm": 0.10912331193685532,
      "learning_rate": 2.902514069017409e-05,
      "loss": 0.8079,
      "num_input_tokens_seen": 33307008,
      "step": 1050
    },
    {
      "epoch": 0.4505658765748452,
      "grad_norm": 0.08094992488622665,
      "learning_rate": 2.8859491419267264e-05,
      "loss": 0.6908,
      "num_input_tokens_seen": 33478752,
      "step": 1055
    },
    {
      "epoch": 0.4527012598761478,
      "grad_norm": 0.09789257496595383,
      "learning_rate": 2.86936683821312e-05,
      "loss": 0.6369,
      "num_input_tokens_seen": 33641728,
      "step": 1060
    },
    {
      "epoch": 0.4548366431774504,
      "grad_norm": 0.07772962003946304,
      "learning_rate": 2.8527679044631417e-05,
      "loss": 0.6272,
      "num_input_tokens_seen": 33819104,
      "step": 1065
    },
    {
      "epoch": 0.45697202647875296,
      "grad_norm": 0.07876738905906677,
      "learning_rate": 2.836153088012078e-05,
      "loss": 0.5017,
      "num_input_tokens_seen": 33946336,
      "step": 1070
    },
    {
      "epoch": 0.45910740978005554,
      "grad_norm": 0.07158119231462479,
      "learning_rate": 2.8195231369103042e-05,
      "loss": 0.5854,
      "num_input_tokens_seen": 34111232,
      "step": 1075
    },
    {
      "epoch": 0.4612427930813581,
      "grad_norm": 0.07409899681806564,
      "learning_rate": 2.802878799889605e-05,
      "loss": 0.5877,
      "num_input_tokens_seen": 34269536,
      "step": 1080
    },
    {
      "epoch": 0.4633781763826607,
      "grad_norm": 0.16344216465950012,
      "learning_rate": 2.786220826329462e-05,
      "loss": 0.7302,
      "num_input_tokens_seen": 34420224,
      "step": 1085
    },
    {
      "epoch": 0.4655135596839633,
      "grad_norm": 0.09065761417150497,
      "learning_rate": 2.7695499662233164e-05,
      "loss": 0.9365,
      "num_input_tokens_seen": 34559872,
      "step": 1090
    },
    {
      "epoch": 0.4676489429852659,
      "grad_norm": 0.07718425989151001,
      "learning_rate": 2.752866970144803e-05,
      "loss": 0.6596,
      "num_input_tokens_seen": 34734400,
      "step": 1095
    },
    {
      "epoch": 0.46978432628656847,
      "grad_norm": 0.08346325904130936,
      "learning_rate": 2.7361725892139533e-05,
      "loss": 0.7114,
      "num_input_tokens_seen": 34888416,
      "step": 1100
    },
    {
      "epoch": 0.47191970958787105,
      "grad_norm": 0.08522050827741623,
      "learning_rate": 2.719467575063382e-05,
      "loss": 0.5746,
      "num_input_tokens_seen": 35020992,
      "step": 1105
    },
    {
      "epoch": 0.4740550928891736,
      "grad_norm": 0.09076400846242905,
      "learning_rate": 2.7027526798044427e-05,
      "loss": 0.7177,
      "num_input_tokens_seen": 35215072,
      "step": 1110
    },
    {
      "epoch": 0.47619047619047616,
      "grad_norm": 0.06955017149448395,
      "learning_rate": 2.6860286559933684e-05,
      "loss": 0.6877,
      "num_input_tokens_seen": 35380928,
      "step": 1115
    },
    {
      "epoch": 0.47832585949177875,
      "grad_norm": 0.08468913286924362,
      "learning_rate": 2.6692962565973866e-05,
      "loss": 0.6099,
      "num_input_tokens_seen": 35540480,
      "step": 1120
    },
    {
      "epoch": 0.48046124279308133,
      "grad_norm": 0.08094287663698196,
      "learning_rate": 2.652556234960821e-05,
      "loss": 0.5757,
      "num_input_tokens_seen": 35704256,
      "step": 1125
    },
    {
      "epoch": 0.4825966260943839,
      "grad_norm": 0.09746932238340378,
      "learning_rate": 2.635809344771169e-05,
      "loss": 0.683,
      "num_input_tokens_seen": 35856608,
      "step": 1130
    },
    {
      "epoch": 0.4847320093956865,
      "grad_norm": 0.08693865686655045,
      "learning_rate": 2.619056340025175e-05,
      "loss": 0.6502,
      "num_input_tokens_seen": 35999840,
      "step": 1135
    },
    {
      "epoch": 0.4868673926969891,
      "grad_norm": 0.09562770277261734,
      "learning_rate": 2.6022979749948783e-05,
      "loss": 0.6337,
      "num_input_tokens_seen": 36129152,
      "step": 1140
    },
    {
      "epoch": 0.48900277599829167,
      "grad_norm": 0.11900558322668076,
      "learning_rate": 2.5855350041936537e-05,
      "loss": 0.7166,
      "num_input_tokens_seen": 36293152,
      "step": 1145
    },
    {
      "epoch": 0.49113815929959426,
      "grad_norm": 0.08834047615528107,
      "learning_rate": 2.5687681823422445e-05,
      "loss": 0.7633,
      "num_input_tokens_seen": 36445696,
      "step": 1150
    },
    {
      "epoch": 0.49327354260089684,
      "grad_norm": 0.07423476129770279,
      "learning_rate": 2.551998264334777e-05,
      "loss": 0.6183,
      "num_input_tokens_seen": 36614528,
      "step": 1155
    },
    {
      "epoch": 0.4954089259021994,
      "grad_norm": 0.08447694778442383,
      "learning_rate": 2.5352260052047788e-05,
      "loss": 0.5267,
      "num_input_tokens_seen": 36754880,
      "step": 1160
    },
    {
      "epoch": 0.497544309203502,
      "grad_norm": 0.09028150141239166,
      "learning_rate": 2.518452160091181e-05,
      "loss": 0.684,
      "num_input_tokens_seen": 36932000,
      "step": 1165
    },
    {
      "epoch": 0.4996796925048046,
      "grad_norm": 0.10303398221731186,
      "learning_rate": 2.5016774842043194e-05,
      "loss": 0.7886,
      "num_input_tokens_seen": 37093504,
      "step": 1170
    },
    {
      "epoch": 0.5018150758061072,
      "grad_norm": 0.08447935432195663,
      "learning_rate": 2.484902732791936e-05,
      "loss": 0.691,
      "num_input_tokens_seen": 37272736,
      "step": 1175
    },
    {
      "epoch": 0.5039504591074098,
      "grad_norm": 0.08549617975950241,
      "learning_rate": 2.4681286611051708e-05,
      "loss": 0.7877,
      "num_input_tokens_seen": 37425024,
      "step": 1180
    },
    {
      "epoch": 0.5060858424087123,
      "grad_norm": 0.08385903388261795,
      "learning_rate": 2.4513560243645635e-05,
      "loss": 0.6496,
      "num_input_tokens_seen": 37600736,
      "step": 1185
    },
    {
      "epoch": 0.5082212257100149,
      "grad_norm": 0.08815981447696686,
      "learning_rate": 2.4345855777260462e-05,
      "loss": 0.6722,
      "num_input_tokens_seen": 37775072,
      "step": 1190
    },
    {
      "epoch": 0.5103566090113175,
      "grad_norm": 0.12655802071094513,
      "learning_rate": 2.4178180762469447e-05,
      "loss": 0.6637,
      "num_input_tokens_seen": 37908864,
      "step": 1195
    },
    {
      "epoch": 0.5124919923126201,
      "grad_norm": 0.09083867073059082,
      "learning_rate": 2.4010542748519863e-05,
      "loss": 0.6507,
      "num_input_tokens_seen": 38099328,
      "step": 1200
    },
    {
      "epoch": 0.5146273756139227,
      "grad_norm": 0.11199730634689331,
      "learning_rate": 2.384294928299309e-05,
      "loss": 0.8343,
      "num_input_tokens_seen": 38247072,
      "step": 1205
    },
    {
      "epoch": 0.5167627589152253,
      "grad_norm": 0.08594491332769394,
      "learning_rate": 2.3675407911464788e-05,
      "loss": 0.598,
      "num_input_tokens_seen": 38391168,
      "step": 1210
    },
    {
      "epoch": 0.5188981422165279,
      "grad_norm": 0.10429448634386063,
      "learning_rate": 2.350792617716521e-05,
      "loss": 0.6245,
      "num_input_tokens_seen": 38573664,
      "step": 1215
    },
    {
      "epoch": 0.5210335255178304,
      "grad_norm": 0.11104902625083923,
      "learning_rate": 2.334051162063953e-05,
      "loss": 0.72,
      "num_input_tokens_seen": 38740672,
      "step": 1220
    },
    {
      "epoch": 0.523168908819133,
      "grad_norm": 0.10164003819227219,
      "learning_rate": 2.3173171779408386e-05,
      "loss": 0.6333,
      "num_input_tokens_seen": 38864224,
      "step": 1225
    },
    {
      "epoch": 0.5253042921204356,
      "grad_norm": 0.10649612545967102,
      "learning_rate": 2.3005914187628492e-05,
      "loss": 0.7262,
      "num_input_tokens_seen": 39000320,
      "step": 1230
    },
    {
      "epoch": 0.5274396754217382,
      "grad_norm": 0.10383658111095428,
      "learning_rate": 2.2838746375753456e-05,
      "loss": 0.5828,
      "num_input_tokens_seen": 39198400,
      "step": 1235
    },
    {
      "epoch": 0.5295750587230408,
      "grad_norm": 0.10013597458600998,
      "learning_rate": 2.2671675870194677e-05,
      "loss": 0.6544,
      "num_input_tokens_seen": 39359232,
      "step": 1240
    },
    {
      "epoch": 0.5317104420243434,
      "grad_norm": 0.13857851922512054,
      "learning_rate": 2.2504710192982575e-05,
      "loss": 0.6669,
      "num_input_tokens_seen": 39502176,
      "step": 1245
    },
    {
      "epoch": 0.533845825325646,
      "grad_norm": 0.08885691314935684,
      "learning_rate": 2.2337856861427843e-05,
      "loss": 0.8427,
      "num_input_tokens_seen": 39717472,
      "step": 1250
    },
    {
      "epoch": 0.5359812086269485,
      "grad_norm": 0.11478804051876068,
      "learning_rate": 2.2171123387783028e-05,
      "loss": 0.5687,
      "num_input_tokens_seen": 39836000,
      "step": 1255
    },
    {
      "epoch": 0.5381165919282511,
      "grad_norm": 0.1051030158996582,
      "learning_rate": 2.2004517278904316e-05,
      "loss": 0.6957,
      "num_input_tokens_seen": 39995200,
      "step": 1260
    },
    {
      "epoch": 0.5402519752295537,
      "grad_norm": 0.07015421241521835,
      "learning_rate": 2.183804603591352e-05,
      "loss": 0.6944,
      "num_input_tokens_seen": 40173280,
      "step": 1265
    },
    {
      "epoch": 0.5423873585308563,
      "grad_norm": 0.10149814933538437,
      "learning_rate": 2.1671717153860385e-05,
      "loss": 0.7211,
      "num_input_tokens_seen": 40315296,
      "step": 1270
    },
    {
      "epoch": 0.5445227418321589,
      "grad_norm": 0.09945672750473022,
      "learning_rate": 2.1505538121385127e-05,
      "loss": 0.6752,
      "num_input_tokens_seen": 40485504,
      "step": 1275
    },
    {
      "epoch": 0.5466581251334615,
      "grad_norm": 0.07678119838237762,
      "learning_rate": 2.133951642038127e-05,
      "loss": 0.7874,
      "num_input_tokens_seen": 40678624,
      "step": 1280
    },
    {
      "epoch": 0.548793508434764,
      "grad_norm": 0.11939999461174011,
      "learning_rate": 2.117365952565879e-05,
      "loss": 0.6918,
      "num_input_tokens_seen": 40829472,
      "step": 1285
    },
    {
      "epoch": 0.5509288917360666,
      "grad_norm": 0.09344258159399033,
      "learning_rate": 2.100797490460756e-05,
      "loss": 0.6707,
      "num_input_tokens_seen": 40954304,
      "step": 1290
    },
    {
      "epoch": 0.5530642750373692,
      "grad_norm": 0.10135383903980255,
      "learning_rate": 2.0842470016861184e-05,
      "loss": 0.6515,
      "num_input_tokens_seen": 41120160,
      "step": 1295
    },
    {
      "epoch": 0.5551996583386718,
      "grad_norm": 0.12063171714544296,
      "learning_rate": 2.06771523139611e-05,
      "loss": 0.7781,
      "num_input_tokens_seen": 41283680,
      "step": 1300
    },
    {
      "epoch": 0.5573350416399744,
      "grad_norm": 0.09838173538446426,
      "learning_rate": 2.051202923902112e-05,
      "loss": 0.6262,
      "num_input_tokens_seen": 41416448,
      "step": 1305
    },
    {
      "epoch": 0.559470424941277,
      "grad_norm": 0.11905540525913239,
      "learning_rate": 2.0347108226392285e-05,
      "loss": 0.5474,
      "num_input_tokens_seen": 41563552,
      "step": 1310
    },
    {
      "epoch": 0.5616058082425796,
      "grad_norm": 0.09312383085489273,
      "learning_rate": 2.0182396701328187e-05,
      "loss": 0.7023,
      "num_input_tokens_seen": 41713152,
      "step": 1315
    },
    {
      "epoch": 0.5637411915438821,
      "grad_norm": 0.09516125172376633,
      "learning_rate": 2.001790207965062e-05,
      "loss": 0.8375,
      "num_input_tokens_seen": 41901728,
      "step": 1320
    },
    {
      "epoch": 0.5658765748451847,
      "grad_norm": 0.10551753640174866,
      "learning_rate": 1.9853631767415737e-05,
      "loss": 0.7857,
      "num_input_tokens_seen": 42031776,
      "step": 1325
    },
    {
      "epoch": 0.5680119581464873,
      "grad_norm": 0.09541548788547516,
      "learning_rate": 1.9689593160580577e-05,
      "loss": 0.7697,
      "num_input_tokens_seen": 42196352,
      "step": 1330
    },
    {
      "epoch": 0.5701473414477899,
      "grad_norm": 0.1404384821653366,
      "learning_rate": 1.9525793644670094e-05,
      "loss": 0.8586,
      "num_input_tokens_seen": 42341088,
      "step": 1335
    },
    {
      "epoch": 0.5722827247490925,
      "grad_norm": 0.1053939163684845,
      "learning_rate": 1.93622405944446e-05,
      "loss": 0.8365,
      "num_input_tokens_seen": 42495424,
      "step": 1340
    },
    {
      "epoch": 0.5744181080503951,
      "grad_norm": 0.1150602251291275,
      "learning_rate": 1.9198941373567797e-05,
      "loss": 0.6521,
      "num_input_tokens_seen": 42622080,
      "step": 1345
    },
    {
      "epoch": 0.5765534913516976,
      "grad_norm": 0.09714847803115845,
      "learning_rate": 1.9035903334275186e-05,
      "loss": 0.8343,
      "num_input_tokens_seen": 42817472,
      "step": 1350
    },
    {
      "epoch": 0.5786888746530002,
      "grad_norm": 0.11403302848339081,
      "learning_rate": 1.887313381704308e-05,
      "loss": 0.6469,
      "num_input_tokens_seen": 42967968,
      "step": 1355
    },
    {
      "epoch": 0.5808242579543028,
      "grad_norm": 0.10145643353462219,
      "learning_rate": 1.871064015025808e-05,
      "loss": 0.6199,
      "num_input_tokens_seen": 43113120,
      "step": 1360
    },
    {
      "epoch": 0.5829596412556054,
      "grad_norm": 0.12413822114467621,
      "learning_rate": 1.8548429649887167e-05,
      "loss": 0.6748,
      "num_input_tokens_seen": 43311584,
      "step": 1365
    },
    {
      "epoch": 0.585095024556908,
      "grad_norm": 0.10621116310358047,
      "learning_rate": 1.8386509619148283e-05,
      "loss": 0.6825,
      "num_input_tokens_seen": 43468704,
      "step": 1370
    },
    {
      "epoch": 0.5872304078582106,
      "grad_norm": 0.08581121265888214,
      "learning_rate": 1.822488734818153e-05,
      "loss": 0.7961,
      "num_input_tokens_seen": 43629152,
      "step": 1375
    },
    {
      "epoch": 0.5893657911595132,
      "grad_norm": 0.10057251155376434,
      "learning_rate": 1.8063570113720955e-05,
      "loss": 0.7024,
      "num_input_tokens_seen": 43796384,
      "step": 1380
    },
    {
      "epoch": 0.5915011744608157,
      "grad_norm": 0.145149365067482,
      "learning_rate": 1.79025651787669e-05,
      "loss": 0.7315,
      "num_input_tokens_seen": 43972640,
      "step": 1385
    },
    {
      "epoch": 0.5936365577621183,
      "grad_norm": 0.09588214010000229,
      "learning_rate": 1.7741879792259033e-05,
      "loss": 0.7955,
      "num_input_tokens_seen": 44110080,
      "step": 1390
    },
    {
      "epoch": 0.5957719410634209,
      "grad_norm": 0.10795921087265015,
      "learning_rate": 1.7581521188749968e-05,
      "loss": 0.8156,
      "num_input_tokens_seen": 44270080,
      "step": 1395
    },
    {
      "epoch": 0.5979073243647235,
      "grad_norm": 0.13513167202472687,
      "learning_rate": 1.742149658807952e-05,
      "loss": 0.688,
      "num_input_tokens_seen": 44437280,
      "step": 1400
    },
    {
      "epoch": 0.6000427076660261,
      "grad_norm": 0.0809662714600563,
      "learning_rate": 1.7261813195049682e-05,
      "loss": 0.7067,
      "num_input_tokens_seen": 44579680,
      "step": 1405
    },
    {
      "epoch": 0.6021780909673287,
      "grad_norm": 0.08051643520593643,
      "learning_rate": 1.7102478199100218e-05,
      "loss": 0.565,
      "num_input_tokens_seen": 44788832,
      "step": 1410
    },
    {
      "epoch": 0.6043134742686312,
      "grad_norm": 0.08201641589403152,
      "learning_rate": 1.6943498773984974e-05,
      "loss": 0.5555,
      "num_input_tokens_seen": 44951488,
      "step": 1415
    },
    {
      "epoch": 0.6064488575699338,
      "grad_norm": 0.07378476113080978,
      "learning_rate": 1.678488207744891e-05,
      "loss": 0.7106,
      "num_input_tokens_seen": 45127232,
      "step": 1420
    },
    {
      "epoch": 0.6085842408712364,
      "grad_norm": 0.08412224799394608,
      "learning_rate": 1.6626635250905813e-05,
      "loss": 0.8088,
      "num_input_tokens_seen": 45290592,
      "step": 1425
    },
    {
      "epoch": 0.610719624172539,
      "grad_norm": 0.09182008355855942,
      "learning_rate": 1.646876541911679e-05,
      "loss": 0.5566,
      "num_input_tokens_seen": 45429920,
      "step": 1430
    },
    {
      "epoch": 0.6128550074738416,
      "grad_norm": 0.11553499102592468,
      "learning_rate": 1.6311279689869464e-05,
      "loss": 0.6124,
      "num_input_tokens_seen": 45612000,
      "step": 1435
    },
    {
      "epoch": 0.6149903907751442,
      "grad_norm": 0.1281968653202057,
      "learning_rate": 1.615418515365799e-05,
      "loss": 0.764,
      "num_input_tokens_seen": 45752192,
      "step": 1440
    },
    {
      "epoch": 0.6171257740764468,
      "grad_norm": 0.11949111521244049,
      "learning_rate": 1.5997488883363804e-05,
      "loss": 0.6346,
      "num_input_tokens_seen": 45927808,
      "step": 1445
    },
    {
      "epoch": 0.6192611573777493,
      "grad_norm": 0.1383758783340454,
      "learning_rate": 1.5841197933937164e-05,
      "loss": 0.5827,
      "num_input_tokens_seen": 46082432,
      "step": 1450
    },
    {
      "epoch": 0.6213965406790519,
      "grad_norm": 0.09209062159061432,
      "learning_rate": 1.568531934207955e-05,
      "loss": 0.6316,
      "num_input_tokens_seen": 46226688,
      "step": 1455
    },
    {
      "epoch": 0.6235319239803545,
      "grad_norm": 0.16895094513893127,
      "learning_rate": 1.552986012592681e-05,
      "loss": 0.7383,
      "num_input_tokens_seen": 46361216,
      "step": 1460
    },
    {
      "epoch": 0.6256673072816571,
      "grad_norm": 0.07766853272914886,
      "learning_rate": 1.5374827284733223e-05,
      "loss": 0.598,
      "num_input_tokens_seen": 46511840,
      "step": 1465
    },
    {
      "epoch": 0.6278026905829597,
      "grad_norm": 0.09342877566814423,
      "learning_rate": 1.5220227798556333e-05,
      "loss": 0.6047,
      "num_input_tokens_seen": 46635328,
      "step": 1470
    },
    {
      "epoch": 0.6299380738842623,
      "grad_norm": 0.07859272509813309,
      "learning_rate": 1.5066068627942714e-05,
      "loss": 0.5981,
      "num_input_tokens_seen": 46791520,
      "step": 1475
    },
    {
      "epoch": 0.6320734571855648,
      "grad_norm": 0.0829625129699707,
"learning_rate": 1.4912356713614573e-05, |
|
"loss": 0.9216, |
|
"num_input_tokens_seen": 46964672, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.6342088404868674, |
|
"grad_norm": 0.08610516041517258, |
|
"learning_rate": 1.4759098976157227e-05, |
|
"loss": 0.7327, |
|
"num_input_tokens_seen": 47116864, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.63634422378817, |
|
"grad_norm": 0.10078553855419159, |
|
"learning_rate": 1.4606302315707587e-05, |
|
"loss": 0.6273, |
|
"num_input_tokens_seen": 47249824, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.6384796070894726, |
|
"grad_norm": 0.10765385627746582, |
|
"learning_rate": 1.4453973611643445e-05, |
|
"loss": 0.6039, |
|
"num_input_tokens_seen": 47405440, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.6406149903907752, |
|
"grad_norm": 0.08604435622692108, |
|
"learning_rate": 1.4302119722273727e-05, |
|
"loss": 0.6372, |
|
"num_input_tokens_seen": 47560960, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.6427503736920778, |
|
"grad_norm": 0.09638124704360962, |
|
"learning_rate": 1.4150747484529758e-05, |
|
"loss": 0.5995, |
|
"num_input_tokens_seen": 47726656, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.6448857569933804, |
|
"grad_norm": 0.08920534700155258, |
|
"learning_rate": 1.3999863713657405e-05, |
|
"loss": 0.7475, |
|
"num_input_tokens_seen": 47882784, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.6470211402946829, |
|
"grad_norm": 0.10143899917602539, |
|
"learning_rate": 1.3849475202910244e-05, |
|
"loss": 0.7008, |
|
"num_input_tokens_seen": 48048608, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.6491565235959855, |
|
"grad_norm": 0.10630396008491516, |
|
"learning_rate": 1.369958872324374e-05, |
|
"loss": 0.5906, |
|
"num_input_tokens_seen": 48167424, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.6512919068972881, |
|
"grad_norm": 0.10320613533258438, |
|
"learning_rate": 1.3550211023010346e-05, |
|
"loss": 0.7876, |
|
"num_input_tokens_seen": 48342048, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.6534272901985907, |
|
"grad_norm": 0.10990385711193085, |
|
"learning_rate": 1.3401348827655665e-05, |
|
"loss": 0.6946, |
|
"num_input_tokens_seen": 48519488, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.6555626734998933, |
|
"grad_norm": 0.08516086637973785, |
|
"learning_rate": 1.3253008839415726e-05, |
|
"loss": 0.661, |
|
"num_input_tokens_seen": 48671424, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.6576980568011959, |
|
"grad_norm": 0.11356549710035324, |
|
"learning_rate": 1.310519773701515e-05, |
|
"loss": 0.6125, |
|
"num_input_tokens_seen": 48796000, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.6598334401024984, |
|
"grad_norm": 0.10029956698417664, |
|
"learning_rate": 1.2957922175366493e-05, |
|
"loss": 0.6231, |
|
"num_input_tokens_seen": 48973024, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.661968823403801, |
|
"grad_norm": 0.09604058414697647, |
|
"learning_rate": 1.2811188785270617e-05, |
|
"loss": 0.836, |
|
"num_input_tokens_seen": 49140192, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.6641042067051036, |
|
"grad_norm": 0.09177996963262558, |
|
"learning_rate": 1.2665004173118136e-05, |
|
"loss": 0.6581, |
|
"num_input_tokens_seen": 49313920, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.6662395900064062, |
|
"grad_norm": 0.10683578252792358, |
|
"learning_rate": 1.2519374920591987e-05, |
|
"loss": 0.6878, |
|
"num_input_tokens_seen": 49480096, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.6683749733077088, |
|
"grad_norm": 0.09613426774740219, |
|
"learning_rate": 1.2374307584371104e-05, |
|
"loss": 0.7337, |
|
"num_input_tokens_seen": 49635936, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.6705103566090114, |
|
"grad_norm": 0.08746462315320969, |
|
"learning_rate": 1.222980869583521e-05, |
|
"loss": 0.6751, |
|
"num_input_tokens_seen": 49749408, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.672645739910314, |
|
"grad_norm": 0.11159204691648483, |
|
"learning_rate": 1.2085884760770755e-05, |
|
"loss": 0.7597, |
|
"num_input_tokens_seen": 49916512, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.6747811232116165, |
|
"grad_norm": 0.08674119412899017, |
|
"learning_rate": 1.1942542259078013e-05, |
|
"loss": 0.7161, |
|
"num_input_tokens_seen": 50054080, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.676916506512919, |
|
"grad_norm": 0.0944414883852005, |
|
"learning_rate": 1.1799787644479329e-05, |
|
"loss": 0.6078, |
|
"num_input_tokens_seen": 50209472, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.6790518898142216, |
|
"grad_norm": 0.10381105542182922, |
|
"learning_rate": 1.165762734422855e-05, |
|
"loss": 0.7661, |
|
"num_input_tokens_seen": 50374560, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.6811872731155242, |
|
"grad_norm": 0.09648651629686356, |
|
"learning_rate": 1.1516067758821658e-05, |
|
"loss": 0.7189, |
|
"num_input_tokens_seen": 50525632, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.6833226564168268, |
|
"grad_norm": 0.10135359317064285, |
|
"learning_rate": 1.13751152617086e-05, |
|
"loss": 0.7739, |
|
"num_input_tokens_seen": 50678080, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.6854580397181294, |
|
"grad_norm": 0.09060854464769363, |
|
"learning_rate": 1.1234776199006324e-05, |
|
"loss": 0.8047, |
|
"num_input_tokens_seen": 50845056, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.6875934230194319, |
|
"grad_norm": 0.06740930676460266, |
|
"learning_rate": 1.1095056889213073e-05, |
|
"loss": 0.599, |
|
"num_input_tokens_seen": 51008896, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.6897288063207345, |
|
"grad_norm": 0.09671995788812637, |
|
"learning_rate": 1.0955963622923896e-05, |
|
"loss": 0.6548, |
|
"num_input_tokens_seen": 51176448, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.6918641896220371, |
|
"grad_norm": 0.0861692875623703, |
|
"learning_rate": 1.0817502662547426e-05, |
|
"loss": 0.6567, |
|
"num_input_tokens_seen": 51347616, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.6939995729233397, |
|
"grad_norm": 0.11806908249855042, |
|
"learning_rate": 1.0679680242023946e-05, |
|
"loss": 0.5926, |
|
"num_input_tokens_seen": 51512000, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.6961349562246423, |
|
"grad_norm": 0.10389918833971024, |
|
"learning_rate": 1.0542502566544668e-05, |
|
"loss": 0.8239, |
|
"num_input_tokens_seen": 51659328, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.6982703395259449, |
|
"grad_norm": 0.07497014105319977, |
|
"learning_rate": 1.040597581227242e-05, |
|
"loss": 0.7617, |
|
"num_input_tokens_seen": 51806176, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.7004057228272474, |
|
"grad_norm": 0.07773059606552124, |
|
"learning_rate": 1.0270106126063539e-05, |
|
"loss": 0.6469, |
|
"num_input_tokens_seen": 51930816, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.70254110612855, |
|
"grad_norm": 0.10639885812997818, |
|
"learning_rate": 1.0134899625191124e-05, |
|
"loss": 0.8937, |
|
"num_input_tokens_seen": 52054944, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.7046764894298526, |
|
"grad_norm": 0.09907250851392746, |
|
"learning_rate": 1.0000362397069612e-05, |
|
"loss": 0.6863, |
|
"num_input_tokens_seen": 52213536, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.7068118727311552, |
|
"grad_norm": 0.11581376940011978, |
|
"learning_rate": 9.866500498980744e-06, |
|
"loss": 0.6294, |
|
"num_input_tokens_seen": 52366624, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.7089472560324578, |
|
"grad_norm": 0.10165643692016602, |
|
"learning_rate": 9.733319957800781e-06, |
|
"loss": 0.644, |
|
"num_input_tokens_seen": 52518688, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.7110826393337604, |
|
"grad_norm": 0.09698858112096786, |
|
"learning_rate": 9.60082676972921e-06, |
|
"loss": 0.658, |
|
"num_input_tokens_seen": 52656384, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.713218022635063, |
|
"grad_norm": 0.1165652796626091, |
|
"learning_rate": 9.469026900018758e-06, |
|
"loss": 0.7008, |
|
"num_input_tokens_seen": 52816832, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.7153534059363655, |
|
"grad_norm": 0.11195079982280731, |
|
"learning_rate": 9.337926282706794e-06, |
|
"loss": 0.6814, |
|
"num_input_tokens_seen": 52979936, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.7174887892376681, |
|
"grad_norm": 0.07277271896600723, |
|
"learning_rate": 9.20753082034821e-06, |
|
"loss": 0.5933, |
|
"num_input_tokens_seen": 53151136, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.7196241725389707, |
|
"grad_norm": 0.11374859511852264, |
|
"learning_rate": 9.077846383749631e-06, |
|
"loss": 0.7048, |
|
"num_input_tokens_seen": 53375680, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.7217595558402733, |
|
"grad_norm": 0.08321022987365723, |
|
"learning_rate": 8.948878811705109e-06, |
|
"loss": 0.7039, |
|
"num_input_tokens_seen": 53558240, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.7238949391415759, |
|
"grad_norm": 0.09429024904966354, |
|
"learning_rate": 8.820633910733237e-06, |
|
"loss": 0.7525, |
|
"num_input_tokens_seen": 53744960, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.7260303224428785, |
|
"grad_norm": 0.09550992399454117, |
|
"learning_rate": 8.693117454815728e-06, |
|
"loss": 0.595, |
|
"num_input_tokens_seen": 53884480, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.728165705744181, |
|
"grad_norm": 0.09551380574703217, |
|
"learning_rate": 8.566335185137437e-06, |
|
"loss": 0.5853, |
|
"num_input_tokens_seen": 54077792, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.7303010890454836, |
|
"grad_norm": 0.10770967602729797, |
|
"learning_rate": 8.440292809827898e-06, |
|
"loss": 0.7973, |
|
"num_input_tokens_seen": 54246368, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.7324364723467862, |
|
"grad_norm": 0.12636590003967285, |
|
"learning_rate": 8.314996003704305e-06, |
|
"loss": 0.8046, |
|
"num_input_tokens_seen": 54422240, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.7345718556480888, |
|
"grad_norm": 0.10689777135848999, |
|
"learning_rate": 8.190450408016032e-06, |
|
"loss": 0.5263, |
|
"num_input_tokens_seen": 54574592, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.7367072389493914, |
|
"grad_norm": 0.09278780221939087, |
|
"learning_rate": 8.06666163019063e-06, |
|
"loss": 0.6577, |
|
"num_input_tokens_seen": 54728160, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.738842622250694, |
|
"grad_norm": 0.10053995251655579, |
|
"learning_rate": 7.943635243581373e-06, |
|
"loss": 0.6628, |
|
"num_input_tokens_seen": 54895072, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.7409780055519966, |
|
"grad_norm": 0.10549025237560272, |
|
"learning_rate": 7.821376787216333e-06, |
|
"loss": 0.6087, |
|
"num_input_tokens_seen": 55072256, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.7431133888532991, |
|
"grad_norm": 0.08755512535572052, |
|
"learning_rate": 7.699891765548983e-06, |
|
"loss": 0.6766, |
|
"num_input_tokens_seen": 55237888, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.7452487721546017, |
|
"grad_norm": 0.10339244455099106, |
|
"learning_rate": 7.5791856482103765e-06, |
|
"loss": 0.6222, |
|
"num_input_tokens_seen": 55398048, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.7473841554559043, |
|
"grad_norm": 0.09155864268541336, |
|
"learning_rate": 7.459263869762892e-06, |
|
"loss": 0.6083, |
|
"num_input_tokens_seen": 55558336, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.7495195387572069, |
|
"grad_norm": 0.11388752609491348, |
|
"learning_rate": 7.340131829455541e-06, |
|
"loss": 0.7643, |
|
"num_input_tokens_seen": 55717888, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.7516549220585095, |
|
"grad_norm": 0.0928613469004631, |
|
"learning_rate": 7.221794890980888e-06, |
|
"loss": 0.6745, |
|
"num_input_tokens_seen": 55894816, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.7537903053598121, |
|
"grad_norm": 0.09511938691139221, |
|
"learning_rate": 7.104258382233556e-06, |
|
"loss": 0.6846, |
|
"num_input_tokens_seen": 56071360, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.7559256886611146, |
|
"grad_norm": 0.07386107742786407, |
|
"learning_rate": 6.987527595070356e-06, |
|
"loss": 0.625, |
|
"num_input_tokens_seen": 56188384, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.7580610719624172, |
|
"grad_norm": 0.09641123563051224, |
|
"learning_rate": 6.871607785071999e-06, |
|
"loss": 0.6852, |
|
"num_input_tokens_seen": 56365312, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.7601964552637198, |
|
"grad_norm": 0.08215915411710739, |
|
"learning_rate": 6.756504171306521e-06, |
|
"loss": 0.7002, |
|
"num_input_tokens_seen": 56509120, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.7623318385650224, |
|
"grad_norm": 0.09883769601583481, |
|
"learning_rate": 6.642221936094281e-06, |
|
"loss": 0.7087, |
|
"num_input_tokens_seen": 56652384, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.764467221866325, |
|
"grad_norm": 0.09604239463806152, |
|
"learning_rate": 6.528766224774619e-06, |
|
"loss": 0.5355, |
|
"num_input_tokens_seen": 56796704, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.7666026051676276, |
|
"grad_norm": 0.0826464369893074, |
|
"learning_rate": 6.416142145474244e-06, |
|
"loss": 0.727, |
|
"num_input_tokens_seen": 56975872, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.7687379884689302, |
|
"grad_norm": 0.08775708824396133, |
|
"learning_rate": 6.304354768877196e-06, |
|
"loss": 0.7101, |
|
"num_input_tokens_seen": 57147296, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.7708733717702327, |
|
"grad_norm": 0.07710240036249161, |
|
"learning_rate": 6.1934091279965915e-06, |
|
"loss": 0.799, |
|
"num_input_tokens_seen": 57302368, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.7730087550715353, |
|
"grad_norm": 0.10319597274065018, |
|
"learning_rate": 6.083310217947991e-06, |
|
"loss": 0.6874, |
|
"num_input_tokens_seen": 57471200, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.7751441383728379, |
|
"grad_norm": 0.12237267196178436, |
|
"learning_rate": 5.974062995724527e-06, |
|
"loss": 0.7995, |
|
"num_input_tokens_seen": 57679840, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.7772795216741405, |
|
"grad_norm": 0.11243870854377747, |
|
"learning_rate": 5.865672379973702e-06, |
|
"loss": 0.6763, |
|
"num_input_tokens_seen": 57849248, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.7794149049754431, |
|
"grad_norm": 0.08665511757135391, |
|
"learning_rate": 5.75814325077596e-06, |
|
"loss": 0.5619, |
|
"num_input_tokens_seen": 57993952, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.7815502882767457, |
|
"grad_norm": 0.09945985674858093, |
|
"learning_rate": 5.651480449424954e-06, |
|
"loss": 0.6884, |
|
"num_input_tokens_seen": 58146592, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.7836856715780482, |
|
"grad_norm": 0.10123780369758606, |
|
"learning_rate": 5.545688778209579e-06, |
|
"loss": 0.7584, |
|
"num_input_tokens_seen": 58307808, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.7858210548793508, |
|
"grad_norm": 0.08710220456123352, |
|
"learning_rate": 5.440773000197763e-06, |
|
"loss": 0.7216, |
|
"num_input_tokens_seen": 58462528, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.7879564381806534, |
|
"grad_norm": 0.10483860224485397, |
|
"learning_rate": 5.3367378390220184e-06, |
|
"loss": 0.5983, |
|
"num_input_tokens_seen": 58626784, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.790091821481956, |
|
"grad_norm": 0.10729069262742996, |
|
"learning_rate": 5.233587978666754e-06, |
|
"loss": 0.5874, |
|
"num_input_tokens_seen": 58805760, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.7922272047832586, |
|
"grad_norm": 0.08131475001573563, |
|
"learning_rate": 5.131328063257415e-06, |
|
"loss": 0.6549, |
|
"num_input_tokens_seen": 58989248, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.7943625880845612, |
|
"grad_norm": 0.12807467579841614, |
|
"learning_rate": 5.029962696851365e-06, |
|
"loss": 0.7086, |
|
"num_input_tokens_seen": 59127904, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.7964979713858638, |
|
"grad_norm": 0.1114497184753418, |
|
"learning_rate": 4.9294964432306105e-06, |
|
"loss": 0.6751, |
|
"num_input_tokens_seen": 59290880, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.7986333546871663, |
|
"grad_norm": 0.0979105532169342, |
|
"learning_rate": 4.829933825696328e-06, |
|
"loss": 0.6631, |
|
"num_input_tokens_seen": 59453504, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.8007687379884689, |
|
"grad_norm": 0.10672794282436371, |
|
"learning_rate": 4.731279326865193e-06, |
|
"loss": 0.6248, |
|
"num_input_tokens_seen": 59628704, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.8029041212897715, |
|
"grad_norm": 0.09161815047264099, |
|
"learning_rate": 4.633537388467582e-06, |
|
"loss": 0.6742, |
|
"num_input_tokens_seen": 59770720, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.8050395045910741, |
|
"grad_norm": 0.10243742913007736, |
|
"learning_rate": 4.536712411147573e-06, |
|
"loss": 0.6084, |
|
"num_input_tokens_seen": 59929280, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.8071748878923767, |
|
"grad_norm": 0.09579010307788849, |
|
"learning_rate": 4.4408087542648334e-06, |
|
"loss": 0.7314, |
|
"num_input_tokens_seen": 60045152, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.8093102711936793, |
|
"grad_norm": 0.10613362491130829, |
|
"learning_rate": 4.345830735698322e-06, |
|
"loss": 0.6492, |
|
"num_input_tokens_seen": 60163840, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.8114456544949818, |
|
"grad_norm": 0.10478969663381577, |
|
"learning_rate": 4.251782631651918e-06, |
|
"loss": 0.7565, |
|
"num_input_tokens_seen": 60329152, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.8135810377962844, |
|
"grad_norm": 0.1022254079580307, |
|
"learning_rate": 4.158668676461866e-06, |
|
"loss": 0.6302, |
|
"num_input_tokens_seen": 60451264, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.815716421097587, |
|
"grad_norm": 0.12434552609920502, |
|
"learning_rate": 4.0664930624061375e-06, |
|
"loss": 0.6156, |
|
"num_input_tokens_seen": 60607008, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.8178518043988896, |
|
"grad_norm": 0.09911098331212997, |
|
"learning_rate": 3.975259939515708e-06, |
|
"loss": 0.6657, |
|
"num_input_tokens_seen": 60764064, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.8199871877001922, |
|
"grad_norm": 0.10193871706724167, |
|
"learning_rate": 3.884973415387652e-06, |
|
"loss": 0.834, |
|
"num_input_tokens_seen": 60919072, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.8221225710014948, |
|
"grad_norm": 0.09091677516698837, |
|
"learning_rate": 3.79563755500027e-06, |
|
"loss": 0.6426, |
|
"num_input_tokens_seen": 61074976, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.8242579543027974, |
|
"grad_norm": 0.09682322293519974, |
|
"learning_rate": 3.7072563805300497e-06, |
|
"loss": 0.7106, |
|
"num_input_tokens_seen": 61209088, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.8263933376040999, |
|
"grad_norm": 0.09818655252456665, |
|
"learning_rate": 3.61983387117055e-06, |
|
"loss": 0.724, |
|
"num_input_tokens_seen": 61367360, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.8285287209054025, |
|
"grad_norm": 0.0938807874917984, |
|
"learning_rate": 3.533373962953271e-06, |
|
"loss": 0.7054, |
|
"num_input_tokens_seen": 61506976, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.8306641042067051, |
|
"grad_norm": 0.09612589329481125, |
|
"learning_rate": 3.447880548570434e-06, |
|
"loss": 0.5991, |
|
"num_input_tokens_seen": 61661280, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.8327994875080077, |
|
"grad_norm": 0.10615026950836182, |
|
"learning_rate": 3.3633574771997245e-06, |
|
"loss": 0.8037, |
|
"num_input_tokens_seen": 61813056, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.8349348708093103, |
|
"grad_norm": 0.08966366946697235, |
|
"learning_rate": 3.2798085543309847e-06, |
|
"loss": 0.7369, |
|
"num_input_tokens_seen": 61970752, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.8370702541106129, |
|
"grad_norm": 0.10236942023038864, |
|
"learning_rate": 3.1972375415948884e-06, |
|
"loss": 0.5092, |
|
"num_input_tokens_seen": 62085728, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.8392056374119155, |
|
"grad_norm": 0.09586668014526367, |
|
"learning_rate": 3.1156481565935563e-06, |
|
"loss": 0.5488, |
|
"num_input_tokens_seen": 62232288, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.841341020713218, |
|
"grad_norm": 0.09763219207525253, |
|
"learning_rate": 3.035044072733209e-06, |
|
"loss": 0.8189, |
|
"num_input_tokens_seen": 62418272, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.8434764040145206, |
|
"grad_norm": 0.09863479435443878, |
|
"learning_rate": 2.955428919058767e-06, |
|
"loss": 0.7843, |
|
"num_input_tokens_seen": 62560416, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.8456117873158232, |
|
"grad_norm": 0.10871785879135132, |
|
"learning_rate": 2.876806280090449e-06, |
|
"loss": 0.6783, |
|
"num_input_tokens_seen": 62713120, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.8477471706171258, |
|
"grad_norm": 0.08632975071668625, |
|
"learning_rate": 2.7991796956624017e-06, |
|
"loss": 0.6642, |
|
"num_input_tokens_seen": 62906304, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.8498825539184284, |
|
"grad_norm": 0.11040724813938141, |
|
"learning_rate": 2.7225526607633167e-06, |
|
"loss": 0.697, |
|
"num_input_tokens_seen": 63043552, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.852017937219731, |
|
"grad_norm": 0.08328652381896973, |
|
"learning_rate": 2.6469286253790777e-06, |
|
"loss": 0.549, |
|
"num_input_tokens_seen": 63192608, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.8541533205210335, |
|
"grad_norm": 0.11789990216493607, |
|
"learning_rate": 2.5723109943374264e-06, |
|
"loss": 0.8259, |
|
"num_input_tokens_seen": 63379296, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.8562887038223361, |
|
"grad_norm": 0.08858389407396317, |
|
"learning_rate": 2.4987031271546753e-06, |
|
"loss": 0.6236, |
|
"num_input_tokens_seen": 63540576, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.8584240871236387, |
|
"grad_norm": 0.08800710737705231, |
|
"learning_rate": 2.4261083378844557e-06, |
|
"loss": 0.6153, |
|
"num_input_tokens_seen": 63710688, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.8605594704249413, |
|
"grad_norm": 0.11924576759338379, |
|
"learning_rate": 2.354529894968485e-06, |
|
"loss": 0.6785, |
|
"num_input_tokens_seen": 63879584, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.8626948537262439, |
|
"grad_norm": 0.08962240815162659, |
|
"learning_rate": 2.2839710210894372e-06, |
|
"loss": 0.6377, |
|
"num_input_tokens_seen": 64015744, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.8648302370275465, |
|
"grad_norm": 0.115207739174366, |
|
"learning_rate": 2.214434893025838e-06, |
|
"loss": 0.4801, |
|
"num_input_tokens_seen": 64187232, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.866965620328849, |
|
"grad_norm": 0.1438085287809372, |
|
"learning_rate": 2.1459246415090312e-06, |
|
"loss": 0.7073, |
|
"num_input_tokens_seen": 64331968, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.8691010036301516, |
|
"grad_norm": 0.12350678443908691, |
|
"learning_rate": 2.078443351082232e-06, |
|
"loss": 0.7264, |
|
"num_input_tokens_seen": 64482816, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.8712363869314542, |
|
"grad_norm": 0.1743326038122177, |
|
"learning_rate": 2.011994059961647e-06, |
|
"loss": 0.7054, |
|
"num_input_tokens_seen": 64634368, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.8733717702327568, |
|
"grad_norm": 0.10089342296123505, |
|
"learning_rate": 1.9465797598996914e-06, |
|
"loss": 0.7034, |
|
"num_input_tokens_seen": 64787424, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.8755071535340594, |
|
"grad_norm": 0.10029490292072296, |
|
"learning_rate": 1.8822033960502722e-06, |
|
"loss": 0.593, |
|
"num_input_tokens_seen": 64935616, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.877642536835362, |
|
"grad_norm": 0.13283027708530426, |
|
"learning_rate": 1.8188678668362102e-06, |
|
"loss": 0.7639, |
|
"num_input_tokens_seen": 65103392, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.8797779201366646, |
|
"grad_norm": 0.10776066035032272, |
|
"learning_rate": 1.7565760238187401e-06, |
|
"loss": 0.6378, |
|
"num_input_tokens_seen": 65236032, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.8819133034379671, |
|
"grad_norm": 0.11559037119150162, |
|
"learning_rate": 1.6953306715690925e-06, |
|
"loss": 0.572, |
|
"num_input_tokens_seen": 65374432, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.8840486867392697, |
|
"grad_norm": 0.10408779978752136, |
|
"learning_rate": 1.6351345675422874e-06, |
|
"loss": 0.6153, |
|
"num_input_tokens_seen": 65554048, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.8861840700405723, |
|
"grad_norm": 0.1286764293909073, |
|
"learning_rate": 1.5759904219529249e-06, |
|
"loss": 0.7024, |
|
"num_input_tokens_seen": 65719584, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.8883194533418749, |
|
"grad_norm": 0.10344738513231277, |
|
"learning_rate": 1.5179008976531878e-06, |
|
"loss": 0.6698, |
|
"num_input_tokens_seen": 65911616, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.8904548366431775, |
|
"grad_norm": 0.10264074802398682, |
|
"learning_rate": 1.4608686100129553e-06, |
|
"loss": 0.7602, |
|
"num_input_tokens_seen": 66080480, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.8925902199444801, |
|
"grad_norm": 0.08111412823200226, |
|
"learning_rate": 1.4048961268020384e-06, |
|
"loss": 0.5967, |
|
"num_input_tokens_seen": 66237376, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.8947256032457827, |
|
"grad_norm": 0.10298115760087967, |
|
"learning_rate": 1.3499859680745852e-06, |
|
"loss": 0.7729, |
|
"num_input_tokens_seen": 66404128, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.8968609865470852, |
|
"grad_norm": 0.13184696435928345, |
|
"learning_rate": 1.2961406060556097e-06, |
|
"loss": 0.7682, |
|
"num_input_tokens_seen": 66587872, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.8989963698483878, |
|
"grad_norm": 0.1918001025915146, |
|
"learning_rate": 1.2433624650296905e-06, |
|
"loss": 0.8945, |
|
"num_input_tokens_seen": 66708672, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.9011317531496904, |
|
"grad_norm": 0.09620165079832077, |
|
"learning_rate": 1.191653921231811e-06, |
|
"loss": 0.5526, |
|
"num_input_tokens_seen": 66862912, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.903267136450993, |
|
"grad_norm": 0.11877051740884781, |
|
"learning_rate": 1.1410173027403882e-06, |
|
"loss": 0.6192, |
|
"num_input_tokens_seen": 66976480, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.9054025197522956, |
|
"grad_norm": 0.08998879045248032, |
|
"learning_rate": 1.0914548893724563e-06, |
|
"loss": 0.6662, |
|
"num_input_tokens_seen": 67155712, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.9075379030535982, |
|
"grad_norm": 0.10443535447120667, |
|
"learning_rate": 1.042968912581005e-06, |
|
"loss": 0.6332, |
|
"num_input_tokens_seen": 67288000, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.9096732863549007, |
|
"grad_norm": 0.10673485696315765, |
|
"learning_rate": 9.955615553545295e-07, |
|
"loss": 0.8033, |
|
"num_input_tokens_seen": 67437632, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.9118086696562033, |
|
"grad_norm": 0.07716970145702362, |
|
"learning_rate": 9.492349521187355e-07, |
|
"loss": 0.6562, |
|
"num_input_tokens_seen": 67584288, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.9139440529575059, |
|
"grad_norm": 0.13065995275974274, |
|
"learning_rate": 9.039911886404462e-07, |
|
"loss": 0.6629, |
|
"num_input_tokens_seen": 67741024, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.9160794362588085, |
|
"grad_norm": 0.09159885346889496, |
|
"learning_rate": 8.59832301933694e-07, |
|
"loss": 0.7827, |
|
"num_input_tokens_seen": 67904928, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.9182148195601111, |
|
"grad_norm": 0.09007798880338669, |
|
"learning_rate": 8.16760280168008e-07, |
|
"loss": 0.6068, |
|
"num_input_tokens_seen": 68084128, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.9203502028614137, |
|
"grad_norm": 0.0953405424952507, |
|
"learning_rate": 7.747770625788964e-07, |
|
"loss": 0.6923, |
|
"num_input_tokens_seen": 68252704, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.9224855861627163, |
|
"grad_norm": 0.10631420463323593, |
|
"learning_rate": 7.338845393805388e-07, |
|
"loss": 0.6895, |
|
"num_input_tokens_seen": 68375360, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.9246209694640188, |
|
"grad_norm": 0.12354771047830582, |
|
"learning_rate": 6.940845516806849e-07, |
|
"loss": 0.721, |
|
"num_input_tokens_seen": 68542272, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.9267563527653214, |
|
"grad_norm": 0.07840372622013092, |
|
"learning_rate": 6.553788913977593e-07, |
|
"loss": 0.7807, |
|
"num_input_tokens_seen": 68703584, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.928891736066624, |
|
"grad_norm": 0.09137984365224838, |
|
"learning_rate": 6.177693011801877e-07, |
|
"loss": 0.6796, |
|
"num_input_tokens_seen": 68845760, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.9310271193679266, |
|
"grad_norm": 0.0817914679646492, |
|
"learning_rate": 5.812574743279286e-07, |
|
"loss": 0.6509, |
|
"num_input_tokens_seen": 69031072, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.9331625026692292, |
|
"grad_norm": 0.13196462392807007, |
|
"learning_rate": 5.458450547162486e-07, |
|
"loss": 0.7432, |
|
"num_input_tokens_seen": 69207200, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.9352978859705318, |
|
"grad_norm": 0.08276062458753586, |
|
"learning_rate": 5.115336367217005e-07, |
|
"loss": 0.6785, |
|
"num_input_tokens_seen": 69374944, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.9374332692718343, |
|
"grad_norm": 0.1287129819393158, |
|
"learning_rate": 4.783247651503398e-07, |
|
"loss": 0.561, |
|
"num_input_tokens_seen": 69527520, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.9395686525731369, |
|
"grad_norm": 0.1089451014995575, |
|
"learning_rate": 4.4621993516818227e-07, |
|
"loss": 0.6363, |
|
"num_input_tokens_seen": 69688256, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.9417040358744395, |
|
"grad_norm": 0.10153474658727646, |
|
"learning_rate": 4.152205922338698e-07, |
|
"loss": 0.6927, |
|
"num_input_tokens_seen": 69851776, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.9438394191757421, |
|
"grad_norm": 0.1068960577249527, |
|
"learning_rate": 3.8532813203360775e-07, |
|
"loss": 0.6462, |
|
"num_input_tokens_seen": 70017856, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.9459748024770446, |
|
"grad_norm": 0.08118557184934616, |
|
"learning_rate": 3.565439004183241e-07, |
|
"loss": 0.6962, |
|
"num_input_tokens_seen": 70153888, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.9481101857783472, |
|
"grad_norm": 0.10270223766565323, |
|
"learning_rate": 3.288691933430621e-07, |
|
"loss": 0.6935, |
|
"num_input_tokens_seen": 70292832, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.9502455690796497, |
|
"grad_norm": 0.09596558660268784, |
|
"learning_rate": 3.023052568086493e-07, |
|
"loss": 0.6684, |
|
"num_input_tokens_seen": 70448448, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.9523809523809523, |
|
"grad_norm": 0.13479304313659668, |
|
"learning_rate": 2.768532868055923e-07, |
|
"loss": 0.7058, |
|
"num_input_tokens_seen": 70595488, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.9545163356822549, |
|
"grad_norm": 0.10255875438451767, |
|
"learning_rate": 2.5251442926021715e-07, |
|
"loss": 0.6543, |
|
"num_input_tokens_seen": 70756416, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.9566517189835575, |
|
"grad_norm": 0.09057821333408356, |
|
"learning_rate": 2.292897799831051e-07, |
|
"loss": 0.727, |
|
"num_input_tokens_seen": 70944896, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.9587871022848601, |
|
"grad_norm": 0.10975092649459839, |
|
"learning_rate": 2.0718038461972345e-07, |
|
"loss": 0.6602, |
|
"num_input_tokens_seen": 71083616, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.9609224855861627, |
|
"grad_norm": 0.11111032962799072, |
|
"learning_rate": 1.8618723860336916e-07, |
|
"loss": 0.6301, |
|
"num_input_tokens_seen": 71240480, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.9630578688874653, |
|
"grad_norm": 0.11888109892606735, |
|
"learning_rate": 1.663112871103406e-07, |
|
"loss": 0.6893, |
|
"num_input_tokens_seen": 71427648, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.9651932521887678, |
|
"grad_norm": 0.11165483295917511, |
|
"learning_rate": 1.4755342501739377e-07, |
|
"loss": 0.6536, |
|
"num_input_tokens_seen": 71591648, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.9673286354900704, |
|
"grad_norm": 0.07350827753543854, |
|
"learning_rate": 1.2991449686143852e-07, |
|
"loss": 0.7046, |
|
"num_input_tokens_seen": 71735296, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.969464018791373, |
|
"grad_norm": 0.12128807604312897, |
|
"learning_rate": 1.1339529680152173e-07, |
|
"loss": 0.559, |
|
"num_input_tokens_seen": 71861920, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.9715994020926756, |
|
"grad_norm": 0.11065730452537537, |
|
"learning_rate": 9.799656858307527e-08, |
|
"loss": 0.7401, |
|
"num_input_tokens_seen": 72029568, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.9737347853939782, |
|
"grad_norm": 0.10507506877183914, |
|
"learning_rate": 8.37190055044207e-08, |
|
"loss": 0.5554, |
|
"num_input_tokens_seen": 72166464, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.9758701686952808, |
|
"grad_norm": 0.10541801899671555, |
|
"learning_rate": 7.056325038556911e-08, |
|
"loss": 0.6366, |
|
"num_input_tokens_seen": 72326496, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.9780055519965833, |
|
"grad_norm": 0.09389659017324448, |
|
"learning_rate": 5.8529895539266575e-08, |
|
"loss": 0.5862, |
|
"num_input_tokens_seen": 72443616, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.9801409352978859, |
|
"grad_norm": 0.08955533802509308, |
|
"learning_rate": 4.7619482744326595e-08, |
|
"loss": 0.7018, |
|
"num_input_tokens_seen": 72624032, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.9822763185991885, |
|
"grad_norm": 0.1084001362323761, |
|
"learning_rate": 3.7832503221249535e-08, |
|
"loss": 0.642, |
|
"num_input_tokens_seen": 72797184, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.9844117019004911, |
|
"grad_norm": 0.11596699804067612, |
|
"learning_rate": 2.916939761009041e-08, |
|
"loss": 0.6432, |
|
"num_input_tokens_seen": 72941600, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.9865470852017937, |
|
"grad_norm": 0.09869848936796188, |
|
"learning_rate": 2.1630555950635788e-08, |
|
"loss": 0.5893, |
|
"num_input_tokens_seen": 73116000, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.9886824685030963, |
|
"grad_norm": 0.08685341477394104, |
|
"learning_rate": 1.5216317664829004e-08, |
|
"loss": 0.6636, |
|
"num_input_tokens_seen": 73285504, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.9908178518043989, |
|
"grad_norm": 0.10240475088357925, |
|
"learning_rate": 9.926971541496244e-09, |
|
"loss": 0.7166, |
|
"num_input_tokens_seen": 73440544, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.9929532351057014, |
|
"grad_norm": 0.09111962467432022, |
|
"learning_rate": 5.762755723348612e-09, |
|
"loss": 0.4921, |
|
"num_input_tokens_seen": 73549760, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.995088618407004, |
|
"grad_norm": 0.09901441633701324, |
|
"learning_rate": 2.7238576962435034e-09, |
|
"loss": 0.6256, |
|
"num_input_tokens_seen": 73709952, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.9972240017083066, |
|
"grad_norm": 0.11895426362752914, |
|
"learning_rate": 8.104142807663361e-10, |
|
"loss": 0.7422, |
|
"num_input_tokens_seen": 73907360, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.9993593850096092, |
|
"grad_norm": 0.12102843821048737, |
|
"learning_rate": 2.2511626046606283e-11, |
|
"loss": 0.656, |
|
"num_input_tokens_seen": 74053280, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.9997864616698697, |
|
"num_input_tokens_seen": 74083488, |
|
"step": 2341, |
|
"total_flos": 4.6864422704480256e+17, |
|
"train_loss": 0.692321076499046, |
|
"train_runtime": 65496.9949, |
|
"train_samples_per_second": 1.144, |
|
"train_steps_per_second": 0.036 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2341, |
|
"num_input_tokens_seen": 74083488, |
|
"num_train_epochs": 1, |
|
"save_steps": 2000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.6864422704480256e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
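A minimal sketch of how a trace like the one above can be consumed, assuming it is saved locally as trainer_state.json (the path and plotting choices are illustrative, not part of the file): load the JSON, keep only the per-step records, and plot the logged loss and learning-rate curves.

# Minimal sketch: load a trainer_state.json like the one above and plot
# the logged loss and learning-rate curves. The local path is an
# assumption; adjust it to wherever the file lives.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only per-step logging records; the final summary entry carries
# train_loss/train_runtime instead of "loss"/"learning_rate".
records = [r for r in state["log_history"] if "loss" in r]

steps = [r["step"] for r in records]
losses = [r["loss"] for r in records]
lrs = [r["learning_rate"] for r in records]

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("step")
fig.tight_layout()
plt.show()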
|
|
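Two quick consistency checks on the summary fields, as a sketch: the constants are copied from this file, while the 5e-5 peak learning rate is inferred from the logged schedule (the last few logged rates fit cosine decay to zero from that peak) rather than stated anywhere in the file.

# Sanity-check sketch for the summary above. num_tokens and runtime_s
# are copied from this file; the 5e-5 cosine peak is an inference from
# the logged learning rates, not a field in the file.
import math

num_tokens = 74_083_488   # num_input_tokens_seen
runtime_s = 65_496.9949   # train_runtime
max_steps = 2341          # max_steps

# Overall throughput: roughly 1131 tokens per second.
print(f"throughput: {num_tokens / runtime_s:.0f} tokens/s")

# The tail of the log matches cosine decay to zero,
#   lr(step) = 0.5 * peak * (1 + cos(pi * step / max_steps)),
# e.g. the last three logged values:
peak = 5e-5
for step, logged in [(2330, 2.7238576962435034e-09),
                     (2335, 8.104142807663361e-10),
                     (2340, 2.2511626046606283e-11)]:
    predicted = 0.5 * peak * (1 + math.cos(math.pi * step / max_steps))
    print(f"step {step}: logged {logged:.4e}, predicted {predicted:.4e}")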