|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.000839118163094, |
|
"eval_steps": 500, |
|
"global_step": 410, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0024410710199099855, |
|
"grad_norm": 8.533896446228027, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 1.1655, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.004882142039819971, |
|
"grad_norm": 8.988560676574707, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.2516, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0073232130597299565, |
|
"grad_norm": 7.550627708435059, |
|
"learning_rate": 6e-06, |
|
"loss": 1.1895, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.009764284079639942, |
|
"grad_norm": 3.6377415657043457, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.0982, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.012205355099549928, |
|
"grad_norm": 3.964740753173828, |
|
"learning_rate": 1e-05, |
|
"loss": 1.0622, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.014646426119459913, |
|
"grad_norm": 4.8016157150268555, |
|
"learning_rate": 9.999962669988608e-06, |
|
"loss": 1.0653, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.017087497139369898, |
|
"grad_norm": 2.9538488388061523, |
|
"learning_rate": 9.999850680511844e-06, |
|
"loss": 1.026, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.019528568159279884, |
|
"grad_norm": 2.869965076446533, |
|
"learning_rate": 9.999664033241933e-06, |
|
"loss": 1.0349, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.02196963917918987, |
|
"grad_norm": 1.8026058673858643, |
|
"learning_rate": 9.999402730965894e-06, |
|
"loss": 1.0421, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.024410710199099857, |
|
"grad_norm": 1.075210452079773, |
|
"learning_rate": 9.999066777585496e-06, |
|
"loss": 1.0008, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02685178121900984, |
|
"grad_norm": 1.4493818283081055, |
|
"learning_rate": 9.998656178117193e-06, |
|
"loss": 0.9347, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.029292852238919826, |
|
"grad_norm": 1.2218502759933472, |
|
"learning_rate": 9.99817093869206e-06, |
|
"loss": 0.9537, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.03173392325882981, |
|
"grad_norm": 1.0389800071716309, |
|
"learning_rate": 9.997611066555694e-06, |
|
"loss": 0.9458, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.034174994278739795, |
|
"grad_norm": 0.959168016910553, |
|
"learning_rate": 9.99697657006811e-06, |
|
"loss": 0.9622, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.03661606529864978, |
|
"grad_norm": 1.0173426866531372, |
|
"learning_rate": 9.99626745870361e-06, |
|
"loss": 0.9594, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.03905713631855977, |
|
"grad_norm": 0.9893942475318909, |
|
"learning_rate": 9.995483743050649e-06, |
|
"loss": 0.9128, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.041498207338469754, |
|
"grad_norm": 0.9174278974533081, |
|
"learning_rate": 9.99462543481167e-06, |
|
"loss": 0.9108, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.04393927835837974, |
|
"grad_norm": 0.8355342745780945, |
|
"learning_rate": 9.993692546802943e-06, |
|
"loss": 0.9341, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.04638034937828973, |
|
"grad_norm": 0.9482454061508179, |
|
"learning_rate": 9.992685092954347e-06, |
|
"loss": 0.8488, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.048821420398199714, |
|
"grad_norm": 0.8152992129325867, |
|
"learning_rate": 9.991603088309195e-06, |
|
"loss": 0.9388, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05126249141810969, |
|
"grad_norm": 0.7824520468711853, |
|
"learning_rate": 9.990446549023977e-06, |
|
"loss": 0.917, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.05370356243801968, |
|
"grad_norm": 0.8396065831184387, |
|
"learning_rate": 9.989215492368152e-06, |
|
"loss": 0.9043, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.056144633457929666, |
|
"grad_norm": 0.7503563761711121, |
|
"learning_rate": 9.98790993672386e-06, |
|
"loss": 0.9368, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.05858570447783965, |
|
"grad_norm": 0.846466600894928, |
|
"learning_rate": 9.98652990158566e-06, |
|
"loss": 0.8641, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.06102677549774964, |
|
"grad_norm": 0.8216990232467651, |
|
"learning_rate": 9.985075407560247e-06, |
|
"loss": 0.8744, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.06346784651765962, |
|
"grad_norm": 0.7758781313896179, |
|
"learning_rate": 9.983546476366133e-06, |
|
"loss": 0.8722, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.06590891753756961, |
|
"grad_norm": 0.8065202236175537, |
|
"learning_rate": 9.981943130833323e-06, |
|
"loss": 0.8582, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.06834998855747959, |
|
"grad_norm": 0.79361891746521, |
|
"learning_rate": 9.980265394902982e-06, |
|
"loss": 0.8549, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.07079105957738958, |
|
"grad_norm": 0.7769683003425598, |
|
"learning_rate": 9.978513293627068e-06, |
|
"loss": 0.8801, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.07323213059729956, |
|
"grad_norm": 0.7662413120269775, |
|
"learning_rate": 9.976686853167967e-06, |
|
"loss": 0.849, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07567320161720956, |
|
"grad_norm": 0.7053027153015137, |
|
"learning_rate": 9.974786100798098e-06, |
|
"loss": 0.8925, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.07811427263711954, |
|
"grad_norm": 0.7407605051994324, |
|
"learning_rate": 9.9728110648995e-06, |
|
"loss": 0.8623, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.08055534365702952, |
|
"grad_norm": 0.7798149585723877, |
|
"learning_rate": 9.970761774963421e-06, |
|
"loss": 0.8711, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.08299641467693951, |
|
"grad_norm": 0.7310554385185242, |
|
"learning_rate": 9.968638261589866e-06, |
|
"loss": 0.9071, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.08543748569684949, |
|
"grad_norm": 0.8006892204284668, |
|
"learning_rate": 9.966440556487149e-06, |
|
"loss": 0.9026, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.08787855671675948, |
|
"grad_norm": 0.7774298787117004, |
|
"learning_rate": 9.96416869247141e-06, |
|
"loss": 0.8512, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.09031962773666946, |
|
"grad_norm": 0.7737051844596863, |
|
"learning_rate": 9.961822703466131e-06, |
|
"loss": 0.8629, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.09276069875657945, |
|
"grad_norm": 0.8388147950172424, |
|
"learning_rate": 9.959402624501636e-06, |
|
"loss": 0.803, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.09520176977648943, |
|
"grad_norm": 0.7394818067550659, |
|
"learning_rate": 9.956908491714552e-06, |
|
"loss": 0.8768, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.09764284079639943, |
|
"grad_norm": 0.8373251557350159, |
|
"learning_rate": 9.95434034234728e-06, |
|
"loss": 0.8604, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1000839118163094, |
|
"grad_norm": 0.7941448092460632, |
|
"learning_rate": 9.951698214747441e-06, |
|
"loss": 0.8397, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.10252498283621939, |
|
"grad_norm": 0.7676767706871033, |
|
"learning_rate": 9.948982148367294e-06, |
|
"loss": 0.8434, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.10496605385612938, |
|
"grad_norm": 0.7958892583847046, |
|
"learning_rate": 9.946192183763155e-06, |
|
"loss": 0.8503, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.10740712487603936, |
|
"grad_norm": 0.793487012386322, |
|
"learning_rate": 9.943328362594788e-06, |
|
"loss": 0.8566, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.10984819589594935, |
|
"grad_norm": 0.716295599937439, |
|
"learning_rate": 9.940390727624785e-06, |
|
"loss": 0.8128, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.11228926691585933, |
|
"grad_norm": 0.7760279178619385, |
|
"learning_rate": 9.937379322717923e-06, |
|
"loss": 0.8409, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.11473033793576932, |
|
"grad_norm": 0.8229836821556091, |
|
"learning_rate": 9.934294192840518e-06, |
|
"loss": 0.8429, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.1171714089556793, |
|
"grad_norm": 0.6973395347595215, |
|
"learning_rate": 9.931135384059737e-06, |
|
"loss": 0.8542, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.11961247997558928, |
|
"grad_norm": 0.7911590933799744, |
|
"learning_rate": 9.927902943542932e-06, |
|
"loss": 0.8554, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.12205355099549928, |
|
"grad_norm": 0.6992570757865906, |
|
"learning_rate": 9.924596919556917e-06, |
|
"loss": 0.8706, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.12449462201540926, |
|
"grad_norm": 0.7577567100524902, |
|
"learning_rate": 9.921217361467259e-06, |
|
"loss": 0.856, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.12693569303531924, |
|
"grad_norm": 0.8022581934928894, |
|
"learning_rate": 9.917764319737533e-06, |
|
"loss": 0.8276, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.12937676405522924, |
|
"grad_norm": 0.720230758190155, |
|
"learning_rate": 9.914237845928574e-06, |
|
"loss": 0.8613, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.13181783507513922, |
|
"grad_norm": 0.7254828214645386, |
|
"learning_rate": 9.910637992697707e-06, |
|
"loss": 0.8617, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.1342589060950492, |
|
"grad_norm": 0.7254623174667358, |
|
"learning_rate": 9.906964813797955e-06, |
|
"loss": 0.8543, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.13669997711495918, |
|
"grad_norm": 0.7306321859359741, |
|
"learning_rate": 9.903218364077242e-06, |
|
"loss": 0.8332, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.13914104813486916, |
|
"grad_norm": 0.7202122211456299, |
|
"learning_rate": 9.899398699477573e-06, |
|
"loss": 0.8663, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.14158211915477917, |
|
"grad_norm": 0.7067145109176636, |
|
"learning_rate": 9.895505877034198e-06, |
|
"loss": 0.8165, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.14402319017468915, |
|
"grad_norm": 0.7376930713653564, |
|
"learning_rate": 9.891539954874758e-06, |
|
"loss": 0.8267, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.14646426119459913, |
|
"grad_norm": 0.7250686883926392, |
|
"learning_rate": 9.887500992218421e-06, |
|
"loss": 0.8239, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1489053322145091, |
|
"grad_norm": 0.7254573106765747, |
|
"learning_rate": 9.883389049374998e-06, |
|
"loss": 0.8452, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.1513464032344191, |
|
"grad_norm": 0.7461521029472351, |
|
"learning_rate": 9.879204187744036e-06, |
|
"loss": 0.803, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.1537874742543291, |
|
"grad_norm": 0.7778986096382141, |
|
"learning_rate": 9.874946469813907e-06, |
|
"loss": 0.8287, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.15622854527423907, |
|
"grad_norm": 0.7395936846733093, |
|
"learning_rate": 9.870615959160876e-06, |
|
"loss": 0.8781, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.15866961629414905, |
|
"grad_norm": 0.7308329343795776, |
|
"learning_rate": 9.866212720448149e-06, |
|
"loss": 0.807, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.16111068731405903, |
|
"grad_norm": 0.7851212620735168, |
|
"learning_rate": 9.861736819424904e-06, |
|
"loss": 0.821, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.16355175833396904, |
|
"grad_norm": 0.7638505697250366, |
|
"learning_rate": 9.857188322925317e-06, |
|
"loss": 0.8273, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.16599282935387902, |
|
"grad_norm": 0.7750548720359802, |
|
"learning_rate": 9.852567298867557e-06, |
|
"loss": 0.8523, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.168433900373789, |
|
"grad_norm": 0.7466771602630615, |
|
"learning_rate": 9.84787381625278e-06, |
|
"loss": 0.8415, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.17087497139369898, |
|
"grad_norm": 0.6956301331520081, |
|
"learning_rate": 9.843107945164086e-06, |
|
"loss": 0.8206, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.17331604241360898, |
|
"grad_norm": 0.7392652630805969, |
|
"learning_rate": 9.838269756765483e-06, |
|
"loss": 0.8098, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.17575711343351896, |
|
"grad_norm": 0.7311574220657349, |
|
"learning_rate": 9.833359323300827e-06, |
|
"loss": 0.8116, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.17819818445342894, |
|
"grad_norm": 0.6983757615089417, |
|
"learning_rate": 9.82837671809273e-06, |
|
"loss": 0.8436, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.18063925547333892, |
|
"grad_norm": 0.7569893598556519, |
|
"learning_rate": 9.823322015541474e-06, |
|
"loss": 0.8058, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.1830803264932489, |
|
"grad_norm": 0.7439902424812317, |
|
"learning_rate": 9.818195291123903e-06, |
|
"loss": 0.8424, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.1855213975131589, |
|
"grad_norm": 0.7790477275848389, |
|
"learning_rate": 9.81299662139229e-06, |
|
"loss": 0.8483, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.1879624685330689, |
|
"grad_norm": 0.7717331051826477, |
|
"learning_rate": 9.807726083973192e-06, |
|
"loss": 0.8214, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.19040353955297887, |
|
"grad_norm": 0.7872374057769775, |
|
"learning_rate": 9.8023837575663e-06, |
|
"loss": 0.7938, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.19284461057288885, |
|
"grad_norm": 0.8018149137496948, |
|
"learning_rate": 9.796969721943257e-06, |
|
"loss": 0.802, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.19528568159279885, |
|
"grad_norm": 0.709600031375885, |
|
"learning_rate": 9.791484057946465e-06, |
|
"loss": 0.7944, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.19772675261270883, |
|
"grad_norm": 0.8216169476509094, |
|
"learning_rate": 9.785926847487885e-06, |
|
"loss": 0.8181, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.2001678236326188, |
|
"grad_norm": 0.7138919830322266, |
|
"learning_rate": 9.780298173547811e-06, |
|
"loss": 0.8043, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.2026088946525288, |
|
"grad_norm": 0.7637642621994019, |
|
"learning_rate": 9.774598120173625e-06, |
|
"loss": 0.8034, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.20504996567243877, |
|
"grad_norm": 0.7272418141365051, |
|
"learning_rate": 9.76882677247855e-06, |
|
"loss": 0.8271, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.20749103669234878, |
|
"grad_norm": 0.7340764999389648, |
|
"learning_rate": 9.762984216640378e-06, |
|
"loss": 0.8508, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.20993210771225876, |
|
"grad_norm": 0.7231638431549072, |
|
"learning_rate": 9.75707053990018e-06, |
|
"loss": 0.823, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.21237317873216874, |
|
"grad_norm": 0.7670260071754456, |
|
"learning_rate": 9.751085830561e-06, |
|
"loss": 0.8595, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.21481424975207872, |
|
"grad_norm": 0.7142215371131897, |
|
"learning_rate": 9.74503017798655e-06, |
|
"loss": 0.8325, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.2172553207719887, |
|
"grad_norm": 0.7884289026260376, |
|
"learning_rate": 9.738903672599858e-06, |
|
"loss": 0.7751, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.2196963917918987, |
|
"grad_norm": 0.7771654725074768, |
|
"learning_rate": 9.732706405881931e-06, |
|
"loss": 0.7827, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.22213746281180868, |
|
"grad_norm": 0.7293388247489929, |
|
"learning_rate": 9.726438470370385e-06, |
|
"loss": 0.8724, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.22457853383171866, |
|
"grad_norm": 0.7578020095825195, |
|
"learning_rate": 9.720099959658062e-06, |
|
"loss": 0.8277, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.22701960485162864, |
|
"grad_norm": 0.7896732091903687, |
|
"learning_rate": 9.713690968391634e-06, |
|
"loss": 0.7769, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.22946067587153865, |
|
"grad_norm": 0.6877868175506592, |
|
"learning_rate": 9.707211592270183e-06, |
|
"loss": 0.7938, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.23190174689144863, |
|
"grad_norm": 0.8047687411308289, |
|
"learning_rate": 9.700661928043787e-06, |
|
"loss": 0.7735, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.2343428179113586, |
|
"grad_norm": 0.7561459541320801, |
|
"learning_rate": 9.69404207351206e-06, |
|
"loss": 0.8079, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.2367838889312686, |
|
"grad_norm": 0.7163955569267273, |
|
"learning_rate": 9.687352127522703e-06, |
|
"loss": 0.8042, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.23922495995117857, |
|
"grad_norm": 0.7289466857910156, |
|
"learning_rate": 9.680592189970015e-06, |
|
"loss": 0.8449, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.24166603097108857, |
|
"grad_norm": 0.6951574087142944, |
|
"learning_rate": 9.673762361793418e-06, |
|
"loss": 0.7988, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.24410710199099855, |
|
"grad_norm": 0.7552266716957092, |
|
"learning_rate": 9.666862744975938e-06, |
|
"loss": 0.8323, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.24654817301090853, |
|
"grad_norm": 0.7086972594261169, |
|
"learning_rate": 9.659893442542683e-06, |
|
"loss": 0.8567, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.2489892440308185, |
|
"grad_norm": 0.7231544852256775, |
|
"learning_rate": 9.652854558559309e-06, |
|
"loss": 0.8265, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.2514303150507285, |
|
"grad_norm": 0.7094722986221313, |
|
"learning_rate": 9.645746198130462e-06, |
|
"loss": 0.7803, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.25387138607063847, |
|
"grad_norm": 0.6969436407089233, |
|
"learning_rate": 9.638568467398215e-06, |
|
"loss": 0.804, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.25631245709054845, |
|
"grad_norm": 0.7204388380050659, |
|
"learning_rate": 9.631321473540476e-06, |
|
"loss": 0.787, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.2587535281104585, |
|
"grad_norm": 0.6980841159820557, |
|
"learning_rate": 9.62400532476939e-06, |
|
"loss": 0.8294, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.26119459913036847, |
|
"grad_norm": 0.6793758273124695, |
|
"learning_rate": 9.61662013032972e-06, |
|
"loss": 0.7739, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.26363567015027844, |
|
"grad_norm": 0.7096854448318481, |
|
"learning_rate": 9.60916600049723e-06, |
|
"loss": 0.8035, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.2660767411701884, |
|
"grad_norm": 0.6875160932540894, |
|
"learning_rate": 9.601643046577014e-06, |
|
"loss": 0.8567, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.2685178121900984, |
|
"grad_norm": 0.7122709155082703, |
|
"learning_rate": 9.59405138090186e-06, |
|
"loss": 0.8153, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2709588832100084, |
|
"grad_norm": 0.695655882358551, |
|
"learning_rate": 9.586391116830549e-06, |
|
"loss": 0.7813, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.27339995422991836, |
|
"grad_norm": 0.674659788608551, |
|
"learning_rate": 9.578662368746183e-06, |
|
"loss": 0.8802, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.27584102524982834, |
|
"grad_norm": 0.7121911644935608, |
|
"learning_rate": 9.570865252054462e-06, |
|
"loss": 0.8017, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.2782820962697383, |
|
"grad_norm": 0.7068195939064026, |
|
"learning_rate": 9.562999883181968e-06, |
|
"loss": 0.7817, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.28072316728964836, |
|
"grad_norm": 0.6847429275512695, |
|
"learning_rate": 9.555066379574423e-06, |
|
"loss": 0.801, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.28316423830955834, |
|
"grad_norm": 0.743248462677002, |
|
"learning_rate": 9.547064859694943e-06, |
|
"loss": 0.7978, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.2856053093294683, |
|
"grad_norm": 0.7640885710716248, |
|
"learning_rate": 9.538995443022256e-06, |
|
"loss": 0.7913, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.2880463803493783, |
|
"grad_norm": 0.7139798402786255, |
|
"learning_rate": 9.530858250048933e-06, |
|
"loss": 0.7994, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.2904874513692883, |
|
"grad_norm": 0.7640753388404846, |
|
"learning_rate": 9.52265340227957e-06, |
|
"loss": 0.7946, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.29292852238919825, |
|
"grad_norm": 0.7454321980476379, |
|
"learning_rate": 9.514381022228997e-06, |
|
"loss": 0.809, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.29536959340910823, |
|
"grad_norm": 0.6853974461555481, |
|
"learning_rate": 9.506041233420427e-06, |
|
"loss": 0.8013, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.2978106644290182, |
|
"grad_norm": 0.723430335521698, |
|
"learning_rate": 9.497634160383627e-06, |
|
"loss": 0.7923, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.3002517354489282, |
|
"grad_norm": 0.7062557935714722, |
|
"learning_rate": 9.489159928653047e-06, |
|
"loss": 0.7702, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.3026928064688382, |
|
"grad_norm": 0.6789696216583252, |
|
"learning_rate": 9.480618664765956e-06, |
|
"loss": 0.7748, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.3051338774887482, |
|
"grad_norm": 0.7581243515014648, |
|
"learning_rate": 9.472010496260545e-06, |
|
"loss": 0.771, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.3075749485086582, |
|
"grad_norm": 0.7822269201278687, |
|
"learning_rate": 9.463335551674024e-06, |
|
"loss": 0.8, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.31001601952856817, |
|
"grad_norm": 0.7157217264175415, |
|
"learning_rate": 9.454593960540709e-06, |
|
"loss": 0.7883, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.31245709054847814, |
|
"grad_norm": 0.7614567875862122, |
|
"learning_rate": 9.445785853390074e-06, |
|
"loss": 0.7929, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.3148981615683881, |
|
"grad_norm": 0.7470414042472839, |
|
"learning_rate": 9.436911361744817e-06, |
|
"loss": 0.7826, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.3173392325882981, |
|
"grad_norm": 0.7033482193946838, |
|
"learning_rate": 9.427970618118888e-06, |
|
"loss": 0.8359, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3197803036082081, |
|
"grad_norm": 0.7030816674232483, |
|
"learning_rate": 9.418963756015511e-06, |
|
"loss": 0.7966, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.32222137462811806, |
|
"grad_norm": 0.7050835490226746, |
|
"learning_rate": 9.409890909925191e-06, |
|
"loss": 0.7852, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.3246624456480281, |
|
"grad_norm": 0.7047673463821411, |
|
"learning_rate": 9.400752215323712e-06, |
|
"loss": 0.8134, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.3271035166679381, |
|
"grad_norm": 0.6739450693130493, |
|
"learning_rate": 9.391547808670097e-06, |
|
"loss": 0.8186, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.32954458768784806, |
|
"grad_norm": 0.7166461944580078, |
|
"learning_rate": 9.38227782740459e-06, |
|
"loss": 0.8118, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.33198565870775804, |
|
"grad_norm": 0.6905531287193298, |
|
"learning_rate": 9.372942409946597e-06, |
|
"loss": 0.8092, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.334426729727668, |
|
"grad_norm": 0.7552813291549683, |
|
"learning_rate": 9.36354169569261e-06, |
|
"loss": 0.7405, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.336867800747578, |
|
"grad_norm": 0.6745990514755249, |
|
"learning_rate": 9.35407582501414e-06, |
|
"loss": 0.8397, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.339308871767488, |
|
"grad_norm": 0.7749987840652466, |
|
"learning_rate": 9.344544939255608e-06, |
|
"loss": 0.7979, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.34174994278739795, |
|
"grad_norm": 0.7859154939651489, |
|
"learning_rate": 9.334949180732245e-06, |
|
"loss": 0.8217, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.34419101380730793, |
|
"grad_norm": 0.7111227512359619, |
|
"learning_rate": 9.325288692727963e-06, |
|
"loss": 0.7692, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.34663208482721797, |
|
"grad_norm": 0.824995219707489, |
|
"learning_rate": 9.315563619493209e-06, |
|
"loss": 0.7989, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.34907315584712795, |
|
"grad_norm": 0.7707095742225647, |
|
"learning_rate": 9.305774106242825e-06, |
|
"loss": 0.8115, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.3515142268670379, |
|
"grad_norm": 0.7036089301109314, |
|
"learning_rate": 9.295920299153863e-06, |
|
"loss": 0.8119, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.3539552978869479, |
|
"grad_norm": 0.7585278153419495, |
|
"learning_rate": 9.286002345363418e-06, |
|
"loss": 0.7853, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.3563963689068579, |
|
"grad_norm": 0.7351112961769104, |
|
"learning_rate": 9.276020392966423e-06, |
|
"loss": 0.7974, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.35883743992676786, |
|
"grad_norm": 0.7286148071289062, |
|
"learning_rate": 9.265974591013434e-06, |
|
"loss": 0.8044, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.36127851094667784, |
|
"grad_norm": 0.6930050253868103, |
|
"learning_rate": 9.25586508950841e-06, |
|
"loss": 0.8117, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.3637195819665878, |
|
"grad_norm": 0.8765610456466675, |
|
"learning_rate": 9.24569203940648e-06, |
|
"loss": 0.7551, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.3661606529864978, |
|
"grad_norm": 0.7214458584785461, |
|
"learning_rate": 9.235455592611667e-06, |
|
"loss": 0.7984, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.36860172400640784, |
|
"grad_norm": 0.7065439820289612, |
|
"learning_rate": 9.225155901974645e-06, |
|
"loss": 0.8106, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.3710427950263178, |
|
"grad_norm": 0.7775700092315674, |
|
"learning_rate": 9.214793121290442e-06, |
|
"loss": 0.8211, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.3734838660462278, |
|
"grad_norm": 0.7118616700172424, |
|
"learning_rate": 9.204367405296144e-06, |
|
"loss": 0.82, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.3759249370661378, |
|
"grad_norm": 0.7476733326911926, |
|
"learning_rate": 9.193878909668591e-06, |
|
"loss": 0.7584, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.37836600808604776, |
|
"grad_norm": 0.7488994002342224, |
|
"learning_rate": 9.183327791022048e-06, |
|
"loss": 0.7552, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.38080707910595774, |
|
"grad_norm": 0.7086935043334961, |
|
"learning_rate": 9.172714206905866e-06, |
|
"loss": 0.7993, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.3832481501258677, |
|
"grad_norm": 0.7513390183448792, |
|
"learning_rate": 9.162038315802132e-06, |
|
"loss": 0.7684, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.3856892211457777, |
|
"grad_norm": 0.6983102560043335, |
|
"learning_rate": 9.1513002771233e-06, |
|
"loss": 0.7904, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.3881302921656877, |
|
"grad_norm": 0.6591006517410278, |
|
"learning_rate": 9.140500251209813e-06, |
|
"loss": 0.7357, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.3905713631855977, |
|
"grad_norm": 0.7491998672485352, |
|
"learning_rate": 9.129638399327707e-06, |
|
"loss": 0.7964, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3930124342055077, |
|
"grad_norm": 0.7312127947807312, |
|
"learning_rate": 9.118714883666204e-06, |
|
"loss": 0.7706, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.39545350522541767, |
|
"grad_norm": 0.7120770215988159, |
|
"learning_rate": 9.107729867335287e-06, |
|
"loss": 0.8367, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.39789457624532765, |
|
"grad_norm": 0.735023021697998, |
|
"learning_rate": 9.096683514363275e-06, |
|
"loss": 0.7832, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.4003356472652376, |
|
"grad_norm": 0.7334295511245728, |
|
"learning_rate": 9.085575989694358e-06, |
|
"loss": 0.7977, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.4027767182851476, |
|
"grad_norm": 0.7482827305793762, |
|
"learning_rate": 9.074407459186144e-06, |
|
"loss": 0.868, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.4052177893050576, |
|
"grad_norm": 0.7395485043525696, |
|
"learning_rate": 9.063178089607183e-06, |
|
"loss": 0.7676, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.40765886032496756, |
|
"grad_norm": 0.6970906257629395, |
|
"learning_rate": 9.051888048634471e-06, |
|
"loss": 0.762, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.41009993134487754, |
|
"grad_norm": 0.7200821042060852, |
|
"learning_rate": 9.040537504850954e-06, |
|
"loss": 0.8067, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.4125410023647875, |
|
"grad_norm": 0.7742771506309509, |
|
"learning_rate": 9.029126627743003e-06, |
|
"loss": 0.7767, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.41498207338469756, |
|
"grad_norm": 0.7340243458747864, |
|
"learning_rate": 9.017655587697885e-06, |
|
"loss": 0.7816, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.41742314440460754, |
|
"grad_norm": 0.7570080161094666, |
|
"learning_rate": 9.006124556001223e-06, |
|
"loss": 0.8374, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.4198642154245175, |
|
"grad_norm": 0.7807502150535583, |
|
"learning_rate": 8.994533704834435e-06, |
|
"loss": 0.7749, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.4223052864444275, |
|
"grad_norm": 0.7137355208396912, |
|
"learning_rate": 8.982883207272164e-06, |
|
"loss": 0.7397, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.4247463574643375, |
|
"grad_norm": 0.7511448860168457, |
|
"learning_rate": 8.971173237279693e-06, |
|
"loss": 0.8006, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.42718742848424746, |
|
"grad_norm": 0.7791663408279419, |
|
"learning_rate": 8.959403969710346e-06, |
|
"loss": 0.7684, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.42962849950415744, |
|
"grad_norm": 0.7711341381072998, |
|
"learning_rate": 8.947575580302879e-06, |
|
"loss": 0.7905, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.4320695705240674, |
|
"grad_norm": 0.7793801426887512, |
|
"learning_rate": 8.935688245678859e-06, |
|
"loss": 0.8121, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.4345106415439774, |
|
"grad_norm": 0.7082055807113647, |
|
"learning_rate": 8.92374214334002e-06, |
|
"loss": 0.7657, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.43695171256388743, |
|
"grad_norm": 0.735462486743927, |
|
"learning_rate": 8.911737451665616e-06, |
|
"loss": 0.7833, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.4393927835837974, |
|
"grad_norm": 0.7432037591934204, |
|
"learning_rate": 8.899674349909759e-06, |
|
"loss": 0.7645, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4418338546037074, |
|
"grad_norm": 0.7552315592765808, |
|
"learning_rate": 8.887553018198738e-06, |
|
"loss": 0.8018, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.44427492562361737, |
|
"grad_norm": 0.677143931388855, |
|
"learning_rate": 8.875373637528336e-06, |
|
"loss": 0.8029, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.44671599664352735, |
|
"grad_norm": 0.7790682911872864, |
|
"learning_rate": 8.863136389761115e-06, |
|
"loss": 0.792, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.4491570676634373, |
|
"grad_norm": 0.735373854637146, |
|
"learning_rate": 8.85084145762372e-06, |
|
"loss": 0.78, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.4515981386833473, |
|
"grad_norm": 0.7221420407295227, |
|
"learning_rate": 8.838489024704131e-06, |
|
"loss": 0.807, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.4540392097032573, |
|
"grad_norm": 0.7021591067314148, |
|
"learning_rate": 8.826079275448934e-06, |
|
"loss": 0.7828, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.45648028072316726, |
|
"grad_norm": 0.7104141712188721, |
|
"learning_rate": 8.81361239516056e-06, |
|
"loss": 0.8051, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.4589213517430773, |
|
"grad_norm": 0.749536395072937, |
|
"learning_rate": 8.801088569994523e-06, |
|
"loss": 0.7811, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.4613624227629873, |
|
"grad_norm": 0.7570759654045105, |
|
"learning_rate": 8.788507986956639e-06, |
|
"loss": 0.8015, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.46380349378289726, |
|
"grad_norm": 0.6997769474983215, |
|
"learning_rate": 8.775870833900226e-06, |
|
"loss": 0.7816, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.46624456480280724, |
|
"grad_norm": 0.6764109134674072, |
|
"learning_rate": 8.763177299523318e-06, |
|
"loss": 0.7577, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.4686856358227172, |
|
"grad_norm": 0.7811216115951538, |
|
"learning_rate": 8.750427573365825e-06, |
|
"loss": 0.7324, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.4711267068426272, |
|
"grad_norm": 0.7098534107208252, |
|
"learning_rate": 8.737621845806715e-06, |
|
"loss": 0.7321, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.4735677778625372, |
|
"grad_norm": 0.7705920934677124, |
|
"learning_rate": 8.724760308061172e-06, |
|
"loss": 0.7501, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.47600884888244716, |
|
"grad_norm": 0.7170778512954712, |
|
"learning_rate": 8.711843152177735e-06, |
|
"loss": 0.767, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.47844991990235713, |
|
"grad_norm": 0.7175964713096619, |
|
"learning_rate": 8.698870571035436e-06, |
|
"loss": 0.7592, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.48089099092226717, |
|
"grad_norm": 0.7901434898376465, |
|
"learning_rate": 8.685842758340912e-06, |
|
"loss": 0.7921, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.48333206194217715, |
|
"grad_norm": 0.7608402371406555, |
|
"learning_rate": 8.672759908625528e-06, |
|
"loss": 0.8617, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.48577313296208713, |
|
"grad_norm": 0.7593024373054504, |
|
"learning_rate": 8.65962221724245e-06, |
|
"loss": 0.7674, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.4882142039819971, |
|
"grad_norm": 0.7110275626182556, |
|
"learning_rate": 8.646429880363746e-06, |
|
"loss": 0.7521, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4906552750019071, |
|
"grad_norm": 0.7535459399223328, |
|
"learning_rate": 8.633183094977453e-06, |
|
"loss": 0.7296, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.49309634602181707, |
|
"grad_norm": 0.7531000971794128, |
|
"learning_rate": 8.61988205888463e-06, |
|
"loss": 0.7863, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.49553741704172705, |
|
"grad_norm": 0.7889319658279419, |
|
"learning_rate": 8.60652697069641e-06, |
|
"loss": 0.7784, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.497978488061637, |
|
"grad_norm": 0.6903645396232605, |
|
"learning_rate": 8.593118029831025e-06, |
|
"loss": 0.7954, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.500419559081547, |
|
"grad_norm": 0.7375295758247375, |
|
"learning_rate": 8.579655436510847e-06, |
|
"loss": 0.7764, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.502860630101457, |
|
"grad_norm": 0.7218457460403442, |
|
"learning_rate": 8.566139391759378e-06, |
|
"loss": 0.7852, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.505301701121367, |
|
"grad_norm": 0.7074956297874451, |
|
"learning_rate": 8.552570097398262e-06, |
|
"loss": 0.7824, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.5077427721412769, |
|
"grad_norm": 0.6844367384910583, |
|
"learning_rate": 8.53894775604426e-06, |
|
"loss": 0.8005, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5101838431611869, |
|
"grad_norm": 0.7443989515304565, |
|
"learning_rate": 8.525272571106242e-06, |
|
"loss": 0.7761, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.5126249141810969, |
|
"grad_norm": 0.7639645338058472, |
|
"learning_rate": 8.511544746782124e-06, |
|
"loss": 0.8032, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.515065985201007, |
|
"grad_norm": 0.699748158454895, |
|
"learning_rate": 8.497764488055848e-06, |
|
"loss": 0.7801, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.517507056220917, |
|
"grad_norm": 0.7058794498443604, |
|
"learning_rate": 8.483932000694295e-06, |
|
"loss": 0.7693, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.519948127240827, |
|
"grad_norm": 0.7830145359039307, |
|
"learning_rate": 8.470047491244232e-06, |
|
"loss": 0.7684, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.5223891982607369, |
|
"grad_norm": 0.6766949892044067, |
|
"learning_rate": 8.456111167029219e-06, |
|
"loss": 0.8214, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.5248302692806469, |
|
"grad_norm": 0.7066507339477539, |
|
"learning_rate": 8.442123236146509e-06, |
|
"loss": 0.7639, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.5272713403005569, |
|
"grad_norm": 0.7286085486412048, |
|
"learning_rate": 8.42808390746395e-06, |
|
"loss": 0.7723, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.5297124113204669, |
|
"grad_norm": 0.7587203979492188, |
|
"learning_rate": 8.413993390616865e-06, |
|
"loss": 0.8034, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.5321534823403768, |
|
"grad_norm": 0.6527595520019531, |
|
"learning_rate": 8.399851896004914e-06, |
|
"loss": 0.7587, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.5345945533602868, |
|
"grad_norm": 0.8271955251693726, |
|
"learning_rate": 8.385659634788959e-06, |
|
"loss": 0.7846, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.5370356243801968, |
|
"grad_norm": 0.7351842522621155, |
|
"learning_rate": 8.371416818887907e-06, |
|
"loss": 0.8002, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5394766954001068, |
|
"grad_norm": 0.7915340065956116, |
|
"learning_rate": 8.357123660975553e-06, |
|
"loss": 0.7511, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.5419177664200168, |
|
"grad_norm": 0.6955085396766663, |
|
"learning_rate": 8.342780374477396e-06, |
|
"loss": 0.7754, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.5443588374399267, |
|
"grad_norm": 0.7368732690811157, |
|
"learning_rate": 8.328387173567453e-06, |
|
"loss": 0.7775, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.5467999084598367, |
|
"grad_norm": 0.6908881068229675, |
|
"learning_rate": 8.313944273165068e-06, |
|
"loss": 0.7571, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.5492409794797467, |
|
"grad_norm": 0.700554370880127, |
|
"learning_rate": 8.299451888931696e-06, |
|
"loss": 0.7714, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.5516820504996567, |
|
"grad_norm": 0.7163404822349548, |
|
"learning_rate": 8.284910237267681e-06, |
|
"loss": 0.7767, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.5541231215195667, |
|
"grad_norm": 0.7443628311157227, |
|
"learning_rate": 8.270319535309035e-06, |
|
"loss": 0.7709, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.5565641925394766, |
|
"grad_norm": 0.691213071346283, |
|
"learning_rate": 8.255680000924184e-06, |
|
"loss": 0.7997, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.5590052635593867, |
|
"grad_norm": 0.7387362718582153, |
|
"learning_rate": 8.240991852710724e-06, |
|
"loss": 0.7502, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.5614463345792967, |
|
"grad_norm": 0.7051777243614197, |
|
"learning_rate": 8.22625530999215e-06, |
|
"loss": 0.7811, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5638874055992067, |
|
"grad_norm": 0.6610181331634521, |
|
"learning_rate": 8.211470592814586e-06, |
|
"loss": 0.7884, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.5663284766191167, |
|
"grad_norm": 0.6575087904930115, |
|
"learning_rate": 8.196637921943496e-06, |
|
"loss": 0.7797, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.5687695476390267, |
|
"grad_norm": 0.7363637685775757, |
|
"learning_rate": 8.181757518860387e-06, |
|
"loss": 0.7369, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.5712106186589366, |
|
"grad_norm": 0.6955734491348267, |
|
"learning_rate": 8.166829605759507e-06, |
|
"loss": 0.7841, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.5736516896788466, |
|
"grad_norm": 0.7019768357276917, |
|
"learning_rate": 8.151854405544526e-06, |
|
"loss": 0.7702, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.5760927606987566, |
|
"grad_norm": 0.7041372656822205, |
|
"learning_rate": 8.136832141825197e-06, |
|
"loss": 0.7755, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.5785338317186666, |
|
"grad_norm": 0.7138186693191528, |
|
"learning_rate": 8.12176303891403e-06, |
|
"loss": 0.7815, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.5809749027385765, |
|
"grad_norm": 0.6682398319244385, |
|
"learning_rate": 8.106647321822943e-06, |
|
"loss": 0.7573, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.5834159737584865, |
|
"grad_norm": 0.6600127816200256, |
|
"learning_rate": 8.091485216259886e-06, |
|
"loss": 0.7644, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.5858570447783965, |
|
"grad_norm": 0.7157433032989502, |
|
"learning_rate": 8.076276948625495e-06, |
|
"loss": 0.7661, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5882981157983065, |
|
"grad_norm": 0.7180731892585754, |
|
"learning_rate": 8.061022746009687e-06, |
|
"loss": 0.756, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.5907391868182165, |
|
"grad_norm": 0.6941264271736145, |
|
"learning_rate": 8.04572283618829e-06, |
|
"loss": 0.7501, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.5931802578381264, |
|
"grad_norm": 0.7047881484031677, |
|
"learning_rate": 8.030377447619622e-06, |
|
"loss": 0.7564, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.5956213288580364, |
|
"grad_norm": 0.6860742568969727, |
|
"learning_rate": 8.014986809441093e-06, |
|
"loss": 0.8048, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.5980623998779464, |
|
"grad_norm": 0.6961259245872498, |
|
"learning_rate": 7.999551151465793e-06, |
|
"loss": 0.8085, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.6005034708978564, |
|
"grad_norm": 0.6859815716743469, |
|
"learning_rate": 7.984070704179026e-06, |
|
"loss": 0.7911, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.6029445419177665, |
|
"grad_norm": 0.739782452583313, |
|
"learning_rate": 7.968545698734908e-06, |
|
"loss": 0.7981, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.6053856129376765, |
|
"grad_norm": 0.7173625230789185, |
|
"learning_rate": 7.952976366952888e-06, |
|
"loss": 0.7738, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6078266839575864, |
|
"grad_norm": 0.7094762921333313, |
|
"learning_rate": 7.9373629413143e-06, |
|
"loss": 0.7802, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.6102677549774964, |
|
"grad_norm": 0.6974766254425049, |
|
"learning_rate": 7.921705654958886e-06, |
|
"loss": 0.7956, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6127088259974064, |
|
"grad_norm": 0.7235715389251709, |
|
"learning_rate": 7.906004741681321e-06, |
|
"loss": 0.7581, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.6151498970173164, |
|
"grad_norm": 0.68167644739151, |
|
"learning_rate": 7.890260435927709e-06, |
|
"loss": 0.7746, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.6175909680372264, |
|
"grad_norm": 0.6965702176094055, |
|
"learning_rate": 7.874472972792097e-06, |
|
"loss": 0.7638, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.6200320390571363, |
|
"grad_norm": 0.7000617384910583, |
|
"learning_rate": 7.858642588012957e-06, |
|
"loss": 0.7252, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.6224731100770463, |
|
"grad_norm": 0.6918544173240662, |
|
"learning_rate": 7.842769517969665e-06, |
|
"loss": 0.7867, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.6249141810969563, |
|
"grad_norm": 0.7168439626693726, |
|
"learning_rate": 7.826853999678978e-06, |
|
"loss": 0.7349, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.6273552521168663, |
|
"grad_norm": 0.6823673844337463, |
|
"learning_rate": 7.810896270791484e-06, |
|
"loss": 0.7749, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.6297963231367762, |
|
"grad_norm": 0.7399064898490906, |
|
"learning_rate": 7.794896569588066e-06, |
|
"loss": 0.7886, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.6322373941566862, |
|
"grad_norm": 0.6988884806632996, |
|
"learning_rate": 7.778855134976334e-06, |
|
"loss": 0.7329, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.6346784651765962, |
|
"grad_norm": 0.7028211951255798, |
|
"learning_rate": 7.762772206487066e-06, |
|
"loss": 0.8214, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6371195361965062, |
|
"grad_norm": 0.7346593737602234, |
|
"learning_rate": 7.74664802427062e-06, |
|
"loss": 0.7626, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.6395606072164162, |
|
"grad_norm": 0.7249420881271362, |
|
"learning_rate": 7.73048282909336e-06, |
|
"loss": 0.7617, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.6420016782363261, |
|
"grad_norm": 0.7126630544662476, |
|
"learning_rate": 7.714276862334051e-06, |
|
"loss": 0.7599, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.6444427492562361, |
|
"grad_norm": 0.7718750238418579, |
|
"learning_rate": 7.698030365980265e-06, |
|
"loss": 0.8228, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.6468838202761461, |
|
"grad_norm": 0.7375472187995911, |
|
"learning_rate": 7.681743582624761e-06, |
|
"loss": 0.7458, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.6493248912960562, |
|
"grad_norm": 0.7044516205787659, |
|
"learning_rate": 7.66541675546186e-06, |
|
"loss": 0.779, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.6517659623159662, |
|
"grad_norm": 0.7249746322631836, |
|
"learning_rate": 7.64905012828382e-06, |
|
"loss": 0.7983, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.6542070333358762, |
|
"grad_norm": 0.7117093801498413, |
|
"learning_rate": 7.632643945477195e-06, |
|
"loss": 0.7436, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.6566481043557861, |
|
"grad_norm": 0.7090557217597961, |
|
"learning_rate": 7.616198452019176e-06, |
|
"loss": 0.7563, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.6590891753756961, |
|
"grad_norm": 0.720168948173523, |
|
"learning_rate": 7.59971389347395e-06, |
|
"loss": 0.7487, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6615302463956061, |
|
"grad_norm": 0.6775338053703308, |
|
"learning_rate": 7.583190515989022e-06, |
|
"loss": 0.7708, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.6639713174155161, |
|
"grad_norm": 0.711544394493103, |
|
"learning_rate": 7.566628566291537e-06, |
|
"loss": 0.7732, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.666412388435426, |
|
"grad_norm": 0.7197690606117249, |
|
"learning_rate": 7.550028291684603e-06, |
|
"loss": 0.7681, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.668853459455336, |
|
"grad_norm": 0.7002537250518799, |
|
"learning_rate": 7.5333899400435986e-06, |
|
"loss": 0.7414, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.671294530475246, |
|
"grad_norm": 0.7534189820289612, |
|
"learning_rate": 7.516713759812465e-06, |
|
"loss": 0.7756, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.673735601495156, |
|
"grad_norm": 0.6993451714515686, |
|
"learning_rate": 7.500000000000001e-06, |
|
"loss": 0.7494, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.676176672515066, |
|
"grad_norm": 0.7160854339599609, |
|
"learning_rate": 7.483248910176144e-06, |
|
"loss": 0.7727, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.678617743534976, |
|
"grad_norm": 0.7340339422225952, |
|
"learning_rate": 7.466460740468246e-06, |
|
"loss": 0.7641, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.6810588145548859, |
|
"grad_norm": 0.6893506050109863, |
|
"learning_rate": 7.44963574155733e-06, |
|
"loss": 0.7859, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.6834998855747959, |
|
"grad_norm": 0.7197214365005493, |
|
"learning_rate": 7.432774164674359e-06, |
|
"loss": 0.7645, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.6859409565947059, |
|
"grad_norm": 0.722647488117218, |
|
"learning_rate": 7.4158762615964744e-06, |
|
"loss": 0.7614, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.6883820276146159, |
|
"grad_norm": 0.7112955451011658, |
|
"learning_rate": 7.398942284643242e-06, |
|
"loss": 0.7565, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.6908230986345258, |
|
"grad_norm": 0.7392273545265198, |
|
"learning_rate": 7.381972486672886e-06, |
|
"loss": 0.7474, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.6932641696544359, |
|
"grad_norm": 0.691473126411438, |
|
"learning_rate": 7.3649671210785024e-06, |
|
"loss": 0.7392, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.6957052406743459, |
|
"grad_norm": 0.764784038066864, |
|
"learning_rate": 7.34792644178429e-06, |
|
"loss": 0.7341, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.6981463116942559, |
|
"grad_norm": 0.7281628251075745, |
|
"learning_rate": 7.330850703241751e-06, |
|
"loss": 0.7804, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.7005873827141659, |
|
"grad_norm": 0.843350887298584, |
|
"learning_rate": 7.313740160425887e-06, |
|
"loss": 0.7085, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.7030284537340759, |
|
"grad_norm": 0.7312772870063782, |
|
"learning_rate": 7.296595068831406e-06, |
|
"loss": 0.7638, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.7054695247539858, |
|
"grad_norm": 0.7479636073112488, |
|
"learning_rate": 7.279415684468893e-06, |
|
"loss": 0.7208, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.7079105957738958, |
|
"grad_norm": 0.6945940256118774, |
|
"learning_rate": 7.262202263860989e-06, |
|
"loss": 0.7133, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7103516667938058, |
|
"grad_norm": 0.697964608669281, |
|
"learning_rate": 7.244955064038574e-06, |
|
"loss": 0.7478, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.7127927378137158, |
|
"grad_norm": 0.7676611542701721, |
|
"learning_rate": 7.227674342536914e-06, |
|
"loss": 0.7778, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.7152338088336258, |
|
"grad_norm": 0.6927245259284973, |
|
"learning_rate": 7.210360357391818e-06, |
|
"loss": 0.7041, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.7176748798535357, |
|
"grad_norm": 0.7169522643089294, |
|
"learning_rate": 7.1930133671357915e-06, |
|
"loss": 0.7493, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.7201159508734457, |
|
"grad_norm": 0.6862355470657349, |
|
"learning_rate": 7.175633630794176e-06, |
|
"loss": 0.7547, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.7225570218933557, |
|
"grad_norm": 0.7003543376922607, |
|
"learning_rate": 7.1582214078812715e-06, |
|
"loss": 0.7677, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.7249980929132657, |
|
"grad_norm": 0.6878992915153503, |
|
"learning_rate": 7.140776958396468e-06, |
|
"loss": 0.7663, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.7274391639331756, |
|
"grad_norm": 0.6947048306465149, |
|
"learning_rate": 7.123300542820367e-06, |
|
"loss": 0.7514, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.7298802349530856, |
|
"grad_norm": 0.6599522829055786, |
|
"learning_rate": 7.1057924221108856e-06, |
|
"loss": 0.7363, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.7323213059729956, |
|
"grad_norm": 0.6951528787612915, |
|
"learning_rate": 7.08825285769936e-06, |
|
"loss": 0.7247, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7347623769929056, |
|
"grad_norm": 0.7057417035102844, |
|
"learning_rate": 7.0706821114866475e-06, |
|
"loss": 0.7829, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.7372034480128157, |
|
"grad_norm": 0.7661730647087097, |
|
"learning_rate": 7.053080445839211e-06, |
|
"loss": 0.7233, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.7396445190327257, |
|
"grad_norm": 0.6780954003334045, |
|
"learning_rate": 7.035448123585201e-06, |
|
"loss": 0.7549, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.7420855900526356, |
|
"grad_norm": 0.7154073715209961, |
|
"learning_rate": 7.017785408010533e-06, |
|
"loss": 0.7593, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.7445266610725456, |
|
"grad_norm": 0.72113436460495, |
|
"learning_rate": 7.0000925628549595e-06, |
|
"loss": 0.8079, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.7469677320924556, |
|
"grad_norm": 0.6903125643730164, |
|
"learning_rate": 6.982369852308124e-06, |
|
"loss": 0.7777, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.7494088031123656, |
|
"grad_norm": 0.7365685701370239, |
|
"learning_rate": 6.964617541005617e-06, |
|
"loss": 0.7827, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.7518498741322756, |
|
"grad_norm": 0.7428478002548218, |
|
"learning_rate": 6.946835894025037e-06, |
|
"loss": 0.7319, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.7542909451521855, |
|
"grad_norm": 0.7224217653274536, |
|
"learning_rate": 6.929025176882016e-06, |
|
"loss": 0.7758, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.7567320161720955, |
|
"grad_norm": 0.7415998578071594, |
|
"learning_rate": 6.911185655526263e-06, |
|
"loss": 0.7832, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.7591730871920055, |
|
"grad_norm": 0.7322662472724915, |
|
"learning_rate": 6.893317596337592e-06, |
|
"loss": 0.7323, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.7616141582119155, |
|
"grad_norm": 0.6983169913291931, |
|
"learning_rate": 6.875421266121946e-06, |
|
"loss": 0.7576, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.7640552292318255, |
|
"grad_norm": 0.7458372116088867, |
|
"learning_rate": 6.857496932107407e-06, |
|
"loss": 0.7549, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.7664963002517354, |
|
"grad_norm": 0.7724815011024475, |
|
"learning_rate": 6.839544861940214e-06, |
|
"loss": 0.7625, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.7689373712716454, |
|
"grad_norm": 0.7590445280075073, |
|
"learning_rate": 6.821565323680759e-06, |
|
"loss": 0.7422, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.7713784422915554, |
|
"grad_norm": 0.7100796699523926, |
|
"learning_rate": 6.80355858579959e-06, |
|
"loss": 0.748, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.7738195133114654, |
|
"grad_norm": 0.6766054034233093, |
|
"learning_rate": 6.7855249171734e-06, |
|
"loss": 0.7487, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.7762605843313753, |
|
"grad_norm": 0.7497526407241821, |
|
"learning_rate": 6.76746458708101e-06, |
|
"loss": 0.7584, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.7787016553512853, |
|
"grad_norm": 0.6761816740036011, |
|
"learning_rate": 6.74937786519935e-06, |
|
"loss": 0.7332, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.7811427263711954, |
|
"grad_norm": 0.6793827414512634, |
|
"learning_rate": 6.731265021599437e-06, |
|
"loss": 0.7387, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.7835837973911054, |
|
"grad_norm": 0.7181971669197083, |
|
"learning_rate": 6.7131263267423305e-06, |
|
"loss": 0.7588, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.7860248684110154, |
|
"grad_norm": 0.6722172498703003, |
|
"learning_rate": 6.6949620514751075e-06, |
|
"loss": 0.7264, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.7884659394309254, |
|
"grad_norm": 0.6741105914115906, |
|
"learning_rate": 6.676772467026809e-06, |
|
"loss": 0.7806, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.7909070104508353, |
|
"grad_norm": 0.6798011660575867, |
|
"learning_rate": 6.65855784500439e-06, |
|
"loss": 0.7281, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.7933480814707453, |
|
"grad_norm": 0.6723977327346802, |
|
"learning_rate": 6.640318457388672e-06, |
|
"loss": 0.7358, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.7957891524906553, |
|
"grad_norm": 0.6920611262321472, |
|
"learning_rate": 6.622054576530275e-06, |
|
"loss": 0.7754, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.7982302235105653, |
|
"grad_norm": 0.7018612623214722, |
|
"learning_rate": 6.603766475145546e-06, |
|
"loss": 0.7714, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.8006712945304753, |
|
"grad_norm": 0.7847645282745361, |
|
"learning_rate": 6.585454426312506e-06, |
|
"loss": 0.804, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.8031123655503852, |
|
"grad_norm": 0.6560506820678711, |
|
"learning_rate": 6.5671187034667465e-06, |
|
"loss": 0.7768, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.8055534365702952, |
|
"grad_norm": 0.720274031162262, |
|
"learning_rate": 6.548759580397363e-06, |
|
"loss": 0.7619, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8079945075902052, |
|
"grad_norm": 0.6853426694869995, |
|
"learning_rate": 6.53037733124287e-06, |
|
"loss": 0.7147, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.8104355786101152, |
|
"grad_norm": 0.6448130011558533, |
|
"learning_rate": 6.511972230487091e-06, |
|
"loss": 0.7958, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.8128766496300251, |
|
"grad_norm": 0.6405046582221985, |
|
"learning_rate": 6.4935445529550775e-06, |
|
"loss": 0.7659, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.8153177206499351, |
|
"grad_norm": 0.7028204798698425, |
|
"learning_rate": 6.475094573808994e-06, |
|
"loss": 0.7609, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.8177587916698451, |
|
"grad_norm": 0.7086704969406128, |
|
"learning_rate": 6.456622568544012e-06, |
|
"loss": 0.7719, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.8201998626897551, |
|
"grad_norm": 0.6891460418701172, |
|
"learning_rate": 6.438128812984199e-06, |
|
"loss": 0.7667, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.8226409337096651, |
|
"grad_norm": 0.7402656674385071, |
|
"learning_rate": 6.419613583278395e-06, |
|
"loss": 0.7833, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.825082004729575, |
|
"grad_norm": 0.7249888777732849, |
|
"learning_rate": 6.401077155896098e-06, |
|
"loss": 0.7031, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.8275230757494851, |
|
"grad_norm": 0.6789460182189941, |
|
"learning_rate": 6.3825198076233255e-06, |
|
"loss": 0.7739, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.8299641467693951, |
|
"grad_norm": 0.6832931041717529, |
|
"learning_rate": 6.363941815558484e-06, |
|
"loss": 0.7242, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8324052177893051, |
|
"grad_norm": 0.64850252866745, |
|
"learning_rate": 6.345343457108238e-06, |
|
"loss": 0.7378, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.8348462888092151, |
|
"grad_norm": 0.7206950783729553, |
|
"learning_rate": 6.32672500998336e-06, |
|
"loss": 0.7718, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.837287359829125, |
|
"grad_norm": 0.690377950668335, |
|
"learning_rate": 6.308086752194586e-06, |
|
"loss": 0.7784, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.839728430849035, |
|
"grad_norm": 0.6793109774589539, |
|
"learning_rate": 6.289428962048467e-06, |
|
"loss": 0.7936, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.842169501868945, |
|
"grad_norm": 0.6914140582084656, |
|
"learning_rate": 6.270751918143213e-06, |
|
"loss": 0.7652, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.844610572888855, |
|
"grad_norm": 0.6733123064041138, |
|
"learning_rate": 6.252055899364525e-06, |
|
"loss": 0.8477, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.847051643908765, |
|
"grad_norm": 0.6884806156158447, |
|
"learning_rate": 6.2333411848814415e-06, |
|
"loss": 0.7544, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.849492714928675, |
|
"grad_norm": 0.6831750273704529, |
|
"learning_rate": 6.214608054142167e-06, |
|
"loss": 0.7333, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.8519337859485849, |
|
"grad_norm": 0.6560673713684082, |
|
"learning_rate": 6.195856786869893e-06, |
|
"loss": 0.7252, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.8543748569684949, |
|
"grad_norm": 0.6822741627693176, |
|
"learning_rate": 6.177087663058626e-06, |
|
"loss": 0.7181, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.8568159279884049, |
|
"grad_norm": 0.8162235021591187, |
|
"learning_rate": 6.158300962969012e-06, |
|
"loss": 0.7359, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.8592569990083149, |
|
"grad_norm": 0.7049196362495422, |
|
"learning_rate": 6.13949696712414e-06, |
|
"loss": 0.7592, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.8616980700282248, |
|
"grad_norm": 0.6802664399147034, |
|
"learning_rate": 6.120675956305363e-06, |
|
"loss": 0.7476, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.8641391410481348, |
|
"grad_norm": 0.7405847311019897, |
|
"learning_rate": 6.101838211548099e-06, |
|
"loss": 0.7368, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.8665802120680448, |
|
"grad_norm": 0.6568160057067871, |
|
"learning_rate": 6.0829840141376385e-06, |
|
"loss": 0.7519, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.8690212830879548, |
|
"grad_norm": 0.6947789192199707, |
|
"learning_rate": 6.064113645604945e-06, |
|
"loss": 0.7217, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.8714623541078649, |
|
"grad_norm": 0.6590073704719543, |
|
"learning_rate": 6.045227387722445e-06, |
|
"loss": 0.7516, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.8739034251277749, |
|
"grad_norm": 0.7120059132575989, |
|
"learning_rate": 6.026325522499829e-06, |
|
"loss": 0.7481, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.8763444961476848, |
|
"grad_norm": 0.6988116502761841, |
|
"learning_rate": 6.007408332179836e-06, |
|
"loss": 0.7995, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.8787855671675948, |
|
"grad_norm": 0.7005130648612976, |
|
"learning_rate": 5.988476099234033e-06, |
|
"loss": 0.7427, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.8812266381875048, |
|
"grad_norm": 0.675496518611908, |
|
"learning_rate": 5.969529106358612e-06, |
|
"loss": 0.7603, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.8836677092074148, |
|
"grad_norm": 0.7157605290412903, |
|
"learning_rate": 5.95056763647016e-06, |
|
"loss": 0.7788, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.8861087802273248, |
|
"grad_norm": 0.6584900617599487, |
|
"learning_rate": 5.931591972701427e-06, |
|
"loss": 0.7415, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.8885498512472347, |
|
"grad_norm": 0.6833528876304626, |
|
"learning_rate": 5.9126023983971114e-06, |
|
"loss": 0.7339, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.8909909222671447, |
|
"grad_norm": 0.6946988701820374, |
|
"learning_rate": 5.893599197109625e-06, |
|
"loss": 0.8115, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.8934319932870547, |
|
"grad_norm": 0.7004518508911133, |
|
"learning_rate": 5.874582652594855e-06, |
|
"loss": 0.75, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.8958730643069647, |
|
"grad_norm": 0.6791942119598389, |
|
"learning_rate": 5.855553048807932e-06, |
|
"loss": 0.7288, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.8983141353268747, |
|
"grad_norm": 0.6619113683700562, |
|
"learning_rate": 5.836510669898984e-06, |
|
"loss": 0.7408, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.9007552063467846, |
|
"grad_norm": 0.6960625052452087, |
|
"learning_rate": 5.817455800208901e-06, |
|
"loss": 0.7937, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.9031962773666946, |
|
"grad_norm": 0.6567366123199463, |
|
"learning_rate": 5.798388724265085e-06, |
|
"loss": 0.7555, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9056373483866046, |
|
"grad_norm": 0.6739965677261353, |
|
"learning_rate": 5.7793097267772e-06, |
|
"loss": 0.7193, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.9080784194065146, |
|
"grad_norm": 0.6742777228355408, |
|
"learning_rate": 5.760219092632924e-06, |
|
"loss": 0.7308, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.9105194904264245, |
|
"grad_norm": 0.6787696480751038, |
|
"learning_rate": 5.741117106893693e-06, |
|
"loss": 0.7387, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.9129605614463345, |
|
"grad_norm": 0.6927146911621094, |
|
"learning_rate": 5.722004054790442e-06, |
|
"loss": 0.7435, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.9154016324662446, |
|
"grad_norm": 0.6972612142562866, |
|
"learning_rate": 5.7028802217193565e-06, |
|
"loss": 0.7605, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.9178427034861546, |
|
"grad_norm": 0.7299436926841736, |
|
"learning_rate": 5.683745893237598e-06, |
|
"loss": 0.7745, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.9202837745060646, |
|
"grad_norm": 0.6617533564567566, |
|
"learning_rate": 5.664601355059044e-06, |
|
"loss": 0.7718, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.9227248455259746, |
|
"grad_norm": 0.722768247127533, |
|
"learning_rate": 5.645446893050029e-06, |
|
"loss": 0.783, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.9251659165458845, |
|
"grad_norm": 0.686353862285614, |
|
"learning_rate": 5.626282793225066e-06, |
|
"loss": 0.7411, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.9276069875657945, |
|
"grad_norm": 0.7211350798606873, |
|
"learning_rate": 5.607109341742579e-06, |
|
"loss": 0.7729, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9300480585857045, |
|
"grad_norm": 0.6945457458496094, |
|
"learning_rate": 5.587926824900637e-06, |
|
"loss": 0.73, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.9324891296056145, |
|
"grad_norm": 0.641502320766449, |
|
"learning_rate": 5.568735529132665e-06, |
|
"loss": 0.7537, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.9349302006255245, |
|
"grad_norm": 0.7009978294372559, |
|
"learning_rate": 5.5495357410031805e-06, |
|
"loss": 0.7407, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.9373712716454344, |
|
"grad_norm": 0.6966471672058105, |
|
"learning_rate": 5.530327747203507e-06, |
|
"loss": 0.7287, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.9398123426653444, |
|
"grad_norm": 0.7361583709716797, |
|
"learning_rate": 5.511111834547496e-06, |
|
"loss": 0.7132, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.9422534136852544, |
|
"grad_norm": 0.8365876078605652, |
|
"learning_rate": 5.491888289967241e-06, |
|
"loss": 0.7517, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.9446944847051644, |
|
"grad_norm": 0.7376920580863953, |
|
"learning_rate": 5.472657400508801e-06, |
|
"loss": 0.7354, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.9471355557250744, |
|
"grad_norm": 0.6934507489204407, |
|
"learning_rate": 5.4534194533279e-06, |
|
"loss": 0.7418, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.9495766267449843, |
|
"grad_norm": 0.6778532862663269, |
|
"learning_rate": 5.434174735685658e-06, |
|
"loss": 0.7768, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.9520176977648943, |
|
"grad_norm": 0.7122485041618347, |
|
"learning_rate": 5.414923534944283e-06, |
|
"loss": 0.7674, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.9544587687848043, |
|
"grad_norm": 0.6961596608161926, |
|
"learning_rate": 5.395666138562794e-06, |
|
"loss": 0.7709, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.9568998398047143, |
|
"grad_norm": 0.7608603239059448, |
|
"learning_rate": 5.376402834092721e-06, |
|
"loss": 0.7787, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.9593409108246242, |
|
"grad_norm": 0.6927947402000427, |
|
"learning_rate": 5.357133909173815e-06, |
|
"loss": 0.7363, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.9617819818445343, |
|
"grad_norm": 0.7136049270629883, |
|
"learning_rate": 5.337859651529747e-06, |
|
"loss": 0.742, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.9642230528644443, |
|
"grad_norm": 0.7149622440338135, |
|
"learning_rate": 5.318580348963826e-06, |
|
"loss": 0.7501, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.9666641238843543, |
|
"grad_norm": 0.6613947749137878, |
|
"learning_rate": 5.2992962893546804e-06, |
|
"loss": 0.7045, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.9691051949042643, |
|
"grad_norm": 0.7548585534095764, |
|
"learning_rate": 5.280007760651977e-06, |
|
"loss": 0.7447, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.9715462659241743, |
|
"grad_norm": 0.6713358759880066, |
|
"learning_rate": 5.260715050872119e-06, |
|
"loss": 0.7356, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.9739873369440842, |
|
"grad_norm": 0.7130277156829834, |
|
"learning_rate": 5.241418448093931e-06, |
|
"loss": 0.7523, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.9764284079639942, |
|
"grad_norm": 0.7643216252326965, |
|
"learning_rate": 5.222118240454376e-06, |
|
"loss": 0.7581, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9788694789839042, |
|
"grad_norm": 0.6513252258300781, |
|
"learning_rate": 5.202814716144245e-06, |
|
"loss": 0.7635, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.9813105500038142, |
|
"grad_norm": 0.7522091269493103, |
|
"learning_rate": 5.1835081634038455e-06, |
|
"loss": 0.7765, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.9837516210237242, |
|
"grad_norm": 0.7359428405761719, |
|
"learning_rate": 5.164198870518714e-06, |
|
"loss": 0.7626, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.9861926920436341, |
|
"grad_norm": 0.7049588561058044, |
|
"learning_rate": 5.144887125815301e-06, |
|
"loss": 0.6856, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.9886337630635441, |
|
"grad_norm": 0.70652836561203, |
|
"learning_rate": 5.125573217656664e-06, |
|
"loss": 0.7479, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.9910748340834541, |
|
"grad_norm": 0.6710291504859924, |
|
"learning_rate": 5.1062574344381686e-06, |
|
"loss": 0.7419, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.9935159051033641, |
|
"grad_norm": 0.6925918459892273, |
|
"learning_rate": 5.086940064583179e-06, |
|
"loss": 0.7222, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.995956976123274, |
|
"grad_norm": 0.738444447517395, |
|
"learning_rate": 5.067621396538747e-06, |
|
"loss": 0.738, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.998398047143184, |
|
"grad_norm": 0.6995366811752319, |
|
"learning_rate": 5.048301718771317e-06, |
|
"loss": 0.7986, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 1.000839118163094, |
|
"grad_norm": 0.7283437848091125, |
|
"learning_rate": 5.028981319762399e-06, |
|
"loss": 0.7439, |
|
"step": 410 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 818, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 205, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.11279028268381e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|