|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9820824512777984, |
|
"eval_steps": 500, |
|
"global_step": 45000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.288641095161438, |
|
"learning_rate": 9.97817594552716e-06, |
|
"loss": 5.7446, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.8790096044540405, |
|
"learning_rate": 9.956351891054321e-06, |
|
"loss": 5.5963, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.7012779712677002, |
|
"learning_rate": 9.93452783658148e-06, |
|
"loss": 5.4268, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.5707438588142395, |
|
"learning_rate": 9.912703782108642e-06, |
|
"loss": 5.3015, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.4737650454044342, |
|
"learning_rate": 9.890879727635801e-06, |
|
"loss": 5.191, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.383696585893631, |
|
"learning_rate": 9.86905567316296e-06, |
|
"loss": 5.0995, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.4450148940086365, |
|
"learning_rate": 9.847231618690121e-06, |
|
"loss": 5.0545, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.40447548031806946, |
|
"learning_rate": 9.82540756421728e-06, |
|
"loss": 5.0217, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.44727349281311035, |
|
"learning_rate": 9.80358350974444e-06, |
|
"loss": 4.9963, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.3116198778152466, |
|
"learning_rate": 9.781759455271601e-06, |
|
"loss": 4.9777, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.2814035415649414, |
|
"learning_rate": 9.75993540079876e-06, |
|
"loss": 4.9637, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.49883121252059937, |
|
"learning_rate": 9.738111346325922e-06, |
|
"loss": 4.9526, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.22215589880943298, |
|
"learning_rate": 9.716287291853081e-06, |
|
"loss": 4.9413, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.2715405225753784, |
|
"learning_rate": 9.694463237380242e-06, |
|
"loss": 4.934, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.1865217238664627, |
|
"learning_rate": 9.672639182907401e-06, |
|
"loss": 4.9255, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.5209760069847107, |
|
"learning_rate": 9.65081512843456e-06, |
|
"loss": 4.9237, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.20881299674510956, |
|
"learning_rate": 9.62899107396172e-06, |
|
"loss": 4.9139, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.2848042845726013, |
|
"learning_rate": 9.607167019488881e-06, |
|
"loss": 4.9177, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.49689555168151855, |
|
"learning_rate": 9.58534296501604e-06, |
|
"loss": 4.907, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.37812674045562744, |
|
"learning_rate": 9.563518910543202e-06, |
|
"loss": 4.9057, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.2938688099384308, |
|
"learning_rate": 9.541694856070361e-06, |
|
"loss": 4.899, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.8247494101524353, |
|
"learning_rate": 9.519870801597522e-06, |
|
"loss": 4.8931, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.3141603469848633, |
|
"learning_rate": 9.498046747124681e-06, |
|
"loss": 4.8942, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.35256093740463257, |
|
"learning_rate": 9.476222692651842e-06, |
|
"loss": 4.8915, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.2673836946487427, |
|
"learning_rate": 9.454398638179002e-06, |
|
"loss": 4.8863, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.3463526964187622, |
|
"learning_rate": 9.432574583706163e-06, |
|
"loss": 4.8847, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.35548537969589233, |
|
"learning_rate": 9.41075052923332e-06, |
|
"loss": 4.8811, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.3124917447566986, |
|
"learning_rate": 9.388926474760482e-06, |
|
"loss": 4.8783, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.294010192155838, |
|
"learning_rate": 9.367102420287641e-06, |
|
"loss": 4.8777, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.36635321378707886, |
|
"learning_rate": 9.345278365814802e-06, |
|
"loss": 4.8754, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.5457169413566589, |
|
"learning_rate": 9.323454311341961e-06, |
|
"loss": 4.8722, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.2539723217487335, |
|
"learning_rate": 9.301630256869122e-06, |
|
"loss": 4.8691, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.29804033041000366, |
|
"learning_rate": 9.279806202396282e-06, |
|
"loss": 4.867, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.30564549565315247, |
|
"learning_rate": 9.257982147923443e-06, |
|
"loss": 4.8638, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.25882017612457275, |
|
"learning_rate": 9.236158093450602e-06, |
|
"loss": 4.8631, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.49584251642227173, |
|
"learning_rate": 9.214334038977763e-06, |
|
"loss": 4.8632, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.5460361838340759, |
|
"learning_rate": 9.192509984504923e-06, |
|
"loss": 4.8606, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.386802613735199, |
|
"learning_rate": 9.170685930032082e-06, |
|
"loss": 4.8601, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.31271764636039734, |
|
"learning_rate": 9.148861875559241e-06, |
|
"loss": 4.8585, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.2538485825061798, |
|
"learning_rate": 9.127037821086402e-06, |
|
"loss": 4.8567, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.27854135632514954, |
|
"learning_rate": 9.105213766613562e-06, |
|
"loss": 4.8532, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.5394597053527832, |
|
"learning_rate": 9.083389712140723e-06, |
|
"loss": 4.8536, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.2850706875324249, |
|
"learning_rate": 9.061565657667882e-06, |
|
"loss": 4.853, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.7254090309143066, |
|
"learning_rate": 9.039741603195043e-06, |
|
"loss": 4.8502, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.6587111353874207, |
|
"learning_rate": 9.017917548722203e-06, |
|
"loss": 4.847, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.7545880079269409, |
|
"learning_rate": 8.996093494249362e-06, |
|
"loss": 4.8501, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.5480329990386963, |
|
"learning_rate": 8.974269439776523e-06, |
|
"loss": 4.8504, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.26011794805526733, |
|
"learning_rate": 8.952445385303682e-06, |
|
"loss": 4.8478, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.9462392926216125, |
|
"learning_rate": 8.930621330830842e-06, |
|
"loss": 4.8464, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.526184618473053, |
|
"learning_rate": 8.908797276358003e-06, |
|
"loss": 4.8423, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.1413213014602661, |
|
"learning_rate": 8.886973221885162e-06, |
|
"loss": 4.8433, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.3846365511417389, |
|
"learning_rate": 8.865149167412323e-06, |
|
"loss": 4.8422, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.4587865471839905, |
|
"learning_rate": 8.843325112939482e-06, |
|
"loss": 4.8427, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.0439590215682983, |
|
"learning_rate": 8.821501058466644e-06, |
|
"loss": 4.8374, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.9531126022338867, |
|
"learning_rate": 8.799677003993803e-06, |
|
"loss": 4.8381, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.8820663690567017, |
|
"learning_rate": 8.777852949520962e-06, |
|
"loss": 4.8388, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.2842873334884644, |
|
"learning_rate": 8.756028895048123e-06, |
|
"loss": 4.8375, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.4219115972518921, |
|
"learning_rate": 8.734204840575283e-06, |
|
"loss": 4.8368, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.27131006121635437, |
|
"learning_rate": 8.712380786102442e-06, |
|
"loss": 4.8358, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.4501149654388428, |
|
"learning_rate": 8.690556731629603e-06, |
|
"loss": 4.8328, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.1851238012313843, |
|
"learning_rate": 8.668732677156762e-06, |
|
"loss": 4.8353, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.2990129292011261, |
|
"learning_rate": 8.646908622683924e-06, |
|
"loss": 4.8308, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.3613658547401428, |
|
"learning_rate": 8.625084568211083e-06, |
|
"loss": 4.8318, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.5732179880142212, |
|
"learning_rate": 8.603260513738242e-06, |
|
"loss": 4.8318, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.8767447471618652, |
|
"learning_rate": 8.581436459265403e-06, |
|
"loss": 4.8299, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.6536487340927124, |
|
"learning_rate": 8.559612404792563e-06, |
|
"loss": 4.8337, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.5771917104721069, |
|
"learning_rate": 8.537788350319724e-06, |
|
"loss": 4.83, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.4667840003967285, |
|
"learning_rate": 8.515964295846883e-06, |
|
"loss": 4.8282, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.6794707775115967, |
|
"learning_rate": 8.494140241374044e-06, |
|
"loss": 4.8276, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.7510731220245361, |
|
"learning_rate": 8.472316186901203e-06, |
|
"loss": 4.8286, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.8489832282066345, |
|
"learning_rate": 8.450492132428363e-06, |
|
"loss": 4.8293, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.6585586071014404, |
|
"learning_rate": 8.428668077955522e-06, |
|
"loss": 4.8287, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.7326397895812988, |
|
"learning_rate": 8.406844023482683e-06, |
|
"loss": 4.8275, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.032246708869934, |
|
"learning_rate": 8.385019969009843e-06, |
|
"loss": 4.8242, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.7991085648536682, |
|
"learning_rate": 8.363195914537004e-06, |
|
"loss": 4.8244, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.5138424634933472, |
|
"learning_rate": 8.341371860064163e-06, |
|
"loss": 4.8233, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.4894898235797882, |
|
"learning_rate": 8.319547805591324e-06, |
|
"loss": 4.824, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.3397706151008606, |
|
"learning_rate": 8.297723751118483e-06, |
|
"loss": 4.8236, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 1.1616212129592896, |
|
"learning_rate": 8.275899696645645e-06, |
|
"loss": 4.824, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.5268859267234802, |
|
"learning_rate": 8.254075642172802e-06, |
|
"loss": 4.8226, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.6282734274864197, |
|
"learning_rate": 8.232251587699963e-06, |
|
"loss": 4.8224, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.3148588240146637, |
|
"learning_rate": 8.210427533227123e-06, |
|
"loss": 4.8238, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.6932478547096252, |
|
"learning_rate": 8.188603478754284e-06, |
|
"loss": 4.8237, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.8093019723892212, |
|
"learning_rate": 8.166779424281443e-06, |
|
"loss": 4.8189, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.4945412874221802, |
|
"learning_rate": 8.144955369808604e-06, |
|
"loss": 4.8182, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.250691294670105, |
|
"learning_rate": 8.123131315335763e-06, |
|
"loss": 4.8236, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.7631908655166626, |
|
"learning_rate": 8.101307260862924e-06, |
|
"loss": 4.8212, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.1759347915649414, |
|
"learning_rate": 8.079483206390084e-06, |
|
"loss": 4.8222, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.7231265306472778, |
|
"learning_rate": 8.057659151917245e-06, |
|
"loss": 4.8199, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.3531375527381897, |
|
"learning_rate": 8.035835097444404e-06, |
|
"loss": 4.8185, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.43936291337013245, |
|
"learning_rate": 8.014011042971564e-06, |
|
"loss": 4.8204, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.5637614727020264, |
|
"learning_rate": 7.992186988498723e-06, |
|
"loss": 4.8175, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.217545509338379, |
|
"learning_rate": 7.970362934025884e-06, |
|
"loss": 4.8201, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.44672998785972595, |
|
"learning_rate": 7.948538879553043e-06, |
|
"loss": 4.8185, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.0661417245864868, |
|
"learning_rate": 7.926714825080204e-06, |
|
"loss": 4.8185, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.0079288482666016, |
|
"learning_rate": 7.904890770607364e-06, |
|
"loss": 4.8196, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.2175804376602173, |
|
"learning_rate": 7.883066716134525e-06, |
|
"loss": 4.8164, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.90774005651474, |
|
"learning_rate": 7.861242661661684e-06, |
|
"loss": 4.8163, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.6773905754089355, |
|
"learning_rate": 7.839418607188845e-06, |
|
"loss": 4.8173, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.4682576060295105, |
|
"learning_rate": 7.817594552716005e-06, |
|
"loss": 4.8157, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.6193821430206299, |
|
"learning_rate": 7.795770498243164e-06, |
|
"loss": 4.8138, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.4081961214542389, |
|
"learning_rate": 7.773946443770323e-06, |
|
"loss": 4.8123, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.4450671374797821, |
|
"learning_rate": 7.752122389297484e-06, |
|
"loss": 4.8148, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.6656301021575928, |
|
"learning_rate": 7.730298334824644e-06, |
|
"loss": 4.8142, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.447218120098114, |
|
"learning_rate": 7.708474280351805e-06, |
|
"loss": 4.8143, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.8786011338233948, |
|
"learning_rate": 7.686650225878964e-06, |
|
"loss": 4.8165, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.8793469071388245, |
|
"learning_rate": 7.664826171406125e-06, |
|
"loss": 4.8126, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.6492193937301636, |
|
"learning_rate": 7.643002116933285e-06, |
|
"loss": 4.8106, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.1710741519927979, |
|
"learning_rate": 7.621178062460445e-06, |
|
"loss": 4.8131, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.5375717282295227, |
|
"learning_rate": 7.599354007987605e-06, |
|
"loss": 4.8103, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.38140830397605896, |
|
"learning_rate": 7.577529953514765e-06, |
|
"loss": 4.8127, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.7099872827529907, |
|
"learning_rate": 7.555705899041924e-06, |
|
"loss": 4.8104, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 2.029292583465576, |
|
"learning_rate": 7.533881844569084e-06, |
|
"loss": 4.8127, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.3873797357082367, |
|
"learning_rate": 7.512057790096244e-06, |
|
"loss": 4.812, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.0038777589797974, |
|
"learning_rate": 7.490233735623404e-06, |
|
"loss": 4.8082, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.9627403616905212, |
|
"learning_rate": 7.4684096811505646e-06, |
|
"loss": 4.8084, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.518621563911438, |
|
"learning_rate": 7.446585626677725e-06, |
|
"loss": 4.8094, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 1.7235260009765625, |
|
"learning_rate": 7.424761572204885e-06, |
|
"loss": 4.8074, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.3688587248325348, |
|
"learning_rate": 7.402937517732045e-06, |
|
"loss": 4.8097, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.4294319748878479, |
|
"learning_rate": 7.381113463259205e-06, |
|
"loss": 4.8102, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.3772117495536804, |
|
"learning_rate": 7.359289408786366e-06, |
|
"loss": 4.8091, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.3151636123657227, |
|
"learning_rate": 7.337465354313526e-06, |
|
"loss": 4.809, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.2710403203964233, |
|
"learning_rate": 7.315641299840684e-06, |
|
"loss": 4.807, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.9449421763420105, |
|
"learning_rate": 7.2938172453678445e-06, |
|
"loss": 4.8092, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.3880749940872192, |
|
"learning_rate": 7.271993190895005e-06, |
|
"loss": 4.8069, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.5149086117744446, |
|
"learning_rate": 7.250169136422165e-06, |
|
"loss": 4.8114, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 2.83113431930542, |
|
"learning_rate": 7.228345081949325e-06, |
|
"loss": 4.8091, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.9617094397544861, |
|
"learning_rate": 7.206521027476485e-06, |
|
"loss": 4.8045, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.3411821126937866, |
|
"learning_rate": 7.1846969730036456e-06, |
|
"loss": 4.8057, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.40530508756637573, |
|
"learning_rate": 7.162872918530806e-06, |
|
"loss": 4.8063, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.9460883736610413, |
|
"learning_rate": 7.141048864057966e-06, |
|
"loss": 4.8064, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 2.0108609199523926, |
|
"learning_rate": 7.119224809585125e-06, |
|
"loss": 4.8069, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.5277504920959473, |
|
"learning_rate": 7.0974007551122855e-06, |
|
"loss": 4.8068, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.932159960269928, |
|
"learning_rate": 7.075576700639445e-06, |
|
"loss": 4.8018, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 1.497610092163086, |
|
"learning_rate": 7.053752646166605e-06, |
|
"loss": 4.803, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.3723843097686768, |
|
"learning_rate": 7.031928591693765e-06, |
|
"loss": 4.8015, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.7923598885536194, |
|
"learning_rate": 7.0101045372209255e-06, |
|
"loss": 4.8079, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.1815203428268433, |
|
"learning_rate": 6.988280482748086e-06, |
|
"loss": 4.8027, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.163899540901184, |
|
"learning_rate": 6.966456428275246e-06, |
|
"loss": 4.8029, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.4986168146133423, |
|
"learning_rate": 6.944632373802406e-06, |
|
"loss": 4.8076, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.6901636123657227, |
|
"learning_rate": 6.9228083193295655e-06, |
|
"loss": 4.8044, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.4600536525249481, |
|
"learning_rate": 6.900984264856726e-06, |
|
"loss": 4.8058, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 1.3702683448791504, |
|
"learning_rate": 6.879160210383886e-06, |
|
"loss": 4.8022, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.6329184770584106, |
|
"learning_rate": 6.857336155911045e-06, |
|
"loss": 4.8018, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.4246502220630646, |
|
"learning_rate": 6.8355121014382055e-06, |
|
"loss": 4.8045, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6552340388298035, |
|
"learning_rate": 6.813688046965366e-06, |
|
"loss": 4.8023, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6138970255851746, |
|
"learning_rate": 6.791863992492526e-06, |
|
"loss": 4.8051, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.8509910106658936, |
|
"learning_rate": 6.770039938019686e-06, |
|
"loss": 4.8034, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.2667418718338013, |
|
"learning_rate": 6.7482158835468455e-06, |
|
"loss": 4.8035, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.723858118057251, |
|
"learning_rate": 6.726391829074006e-06, |
|
"loss": 4.8048, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5858640670776367, |
|
"learning_rate": 6.704567774601166e-06, |
|
"loss": 4.8043, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.3455156087875366, |
|
"learning_rate": 6.682743720128326e-06, |
|
"loss": 4.8048, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.7291358709335327, |
|
"learning_rate": 6.660919665655486e-06, |
|
"loss": 4.8034, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.9461612105369568, |
|
"learning_rate": 6.6390956111826465e-06, |
|
"loss": 4.8009, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.1789884567260742, |
|
"learning_rate": 6.617271556709806e-06, |
|
"loss": 4.8041, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.9464890360832214, |
|
"learning_rate": 6.595447502236966e-06, |
|
"loss": 4.8045, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.7488701343536377, |
|
"learning_rate": 6.5736234477641254e-06, |
|
"loss": 4.8018, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.8682188391685486, |
|
"learning_rate": 6.551799393291286e-06, |
|
"loss": 4.8011, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.4264489710330963, |
|
"learning_rate": 6.529975338818446e-06, |
|
"loss": 4.8023, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.49021920561790466, |
|
"learning_rate": 6.508151284345606e-06, |
|
"loss": 4.8028, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5845347046852112, |
|
"learning_rate": 6.486327229872766e-06, |
|
"loss": 4.8036, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.5416790246963501, |
|
"learning_rate": 6.4645031753999265e-06, |
|
"loss": 4.7982, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.6061177253723145, |
|
"learning_rate": 6.442679120927087e-06, |
|
"loss": 4.7992, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.0177648067474365, |
|
"learning_rate": 6.420855066454247e-06, |
|
"loss": 4.8009, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.1131385564804077, |
|
"learning_rate": 6.399031011981407e-06, |
|
"loss": 4.8017, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.519646167755127, |
|
"learning_rate": 6.377206957508566e-06, |
|
"loss": 4.7977, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7085474729537964, |
|
"learning_rate": 6.355382903035726e-06, |
|
"loss": 4.8033, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 2.178063154220581, |
|
"learning_rate": 6.333558848562886e-06, |
|
"loss": 4.8005, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.6010052561759949, |
|
"learning_rate": 6.311734794090046e-06, |
|
"loss": 4.7981, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.1030943393707275, |
|
"learning_rate": 6.2899107396172064e-06, |
|
"loss": 4.8012, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 1.7612066268920898, |
|
"learning_rate": 6.268086685144367e-06, |
|
"loss": 4.7985, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.4463589191436768, |
|
"learning_rate": 6.246262630671527e-06, |
|
"loss": 4.7989, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5492839813232422, |
|
"learning_rate": 6.224438576198687e-06, |
|
"loss": 4.7981, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.7029542922973633, |
|
"learning_rate": 6.202614521725847e-06, |
|
"loss": 4.7995, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5425244569778442, |
|
"learning_rate": 6.1807904672530075e-06, |
|
"loss": 4.8013, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 2.1396992206573486, |
|
"learning_rate": 6.158966412780166e-06, |
|
"loss": 4.7976, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.6928815841674805, |
|
"learning_rate": 6.137142358307326e-06, |
|
"loss": 4.8004, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.7560544013977051, |
|
"learning_rate": 6.115318303834486e-06, |
|
"loss": 4.7991, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.5612252950668335, |
|
"learning_rate": 6.093494249361647e-06, |
|
"loss": 4.7969, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.8462795615196228, |
|
"learning_rate": 6.071670194888807e-06, |
|
"loss": 4.7983, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.2593392133712769, |
|
"learning_rate": 6.049846140415967e-06, |
|
"loss": 4.7961, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.180788278579712, |
|
"learning_rate": 6.028022085943127e-06, |
|
"loss": 4.7965, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.8912612795829773, |
|
"learning_rate": 6.0061980314702874e-06, |
|
"loss": 4.796, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.3480154275894165, |
|
"learning_rate": 5.984373976997448e-06, |
|
"loss": 4.7985, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 2.097245931625366, |
|
"learning_rate": 5.962549922524608e-06, |
|
"loss": 4.7963, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.4926351308822632, |
|
"learning_rate": 5.940725868051768e-06, |
|
"loss": 4.7993, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5842808485031128, |
|
"learning_rate": 5.918901813578927e-06, |
|
"loss": 4.7971, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.43305909633636475, |
|
"learning_rate": 5.897077759106087e-06, |
|
"loss": 4.7946, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.8926731944084167, |
|
"learning_rate": 5.875253704633247e-06, |
|
"loss": 4.7965, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.48742425441741943, |
|
"learning_rate": 5.853429650160407e-06, |
|
"loss": 4.7976, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.4740324020385742, |
|
"learning_rate": 5.831605595687567e-06, |
|
"loss": 4.7967, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.622294008731842, |
|
"learning_rate": 5.809781541214728e-06, |
|
"loss": 4.7979, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.5809967517852783, |
|
"learning_rate": 5.787957486741888e-06, |
|
"loss": 4.7955, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.089276909828186, |
|
"learning_rate": 5.766133432269048e-06, |
|
"loss": 4.7952, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.7803537845611572, |
|
"learning_rate": 5.744309377796207e-06, |
|
"loss": 4.7945, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.7891755104064941, |
|
"learning_rate": 5.722485323323368e-06, |
|
"loss": 4.7986, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.4096457958221436, |
|
"learning_rate": 5.700661268850528e-06, |
|
"loss": 4.798, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.7508895397186279, |
|
"learning_rate": 5.678837214377687e-06, |
|
"loss": 4.7939, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.578983187675476, |
|
"learning_rate": 5.657013159904847e-06, |
|
"loss": 4.7985, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.5898668169975281, |
|
"learning_rate": 5.635189105432008e-06, |
|
"loss": 4.7991, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.5105552673339844, |
|
"learning_rate": 5.613365050959168e-06, |
|
"loss": 4.798, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.5170775651931763, |
|
"learning_rate": 5.591540996486328e-06, |
|
"loss": 4.7946, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.588223934173584, |
|
"learning_rate": 5.569716942013487e-06, |
|
"loss": 4.797, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.7536285519599915, |
|
"learning_rate": 5.5478928875406476e-06, |
|
"loss": 4.7934, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.5207072496414185, |
|
"learning_rate": 5.526068833067808e-06, |
|
"loss": 4.7975, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 1.6971161365509033, |
|
"learning_rate": 5.504244778594968e-06, |
|
"loss": 4.797, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.7362131476402283, |
|
"learning_rate": 5.482420724122128e-06, |
|
"loss": 4.7926, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.8522694706916809, |
|
"learning_rate": 5.4605966696492876e-06, |
|
"loss": 4.7967, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.5707879662513733, |
|
"learning_rate": 5.438772615176448e-06, |
|
"loss": 4.7946, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.427687406539917, |
|
"learning_rate": 5.416948560703608e-06, |
|
"loss": 4.7958, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.4829745292663574, |
|
"learning_rate": 5.395124506230768e-06, |
|
"loss": 4.7942, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 2.6114389896392822, |
|
"learning_rate": 5.3733004517579275e-06, |
|
"loss": 4.7956, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.3793102502822876, |
|
"learning_rate": 5.351476397285088e-06, |
|
"loss": 4.7932, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 2.681830406188965, |
|
"learning_rate": 5.329652342812248e-06, |
|
"loss": 4.7945, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.9123592972755432, |
|
"learning_rate": 5.307828288339408e-06, |
|
"loss": 4.7951, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.771221935749054, |
|
"learning_rate": 5.286004233866568e-06, |
|
"loss": 4.7949, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 1.176202654838562, |
|
"learning_rate": 5.2641801793937286e-06, |
|
"loss": 4.7941, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.7281347513198853, |
|
"learning_rate": 5.242356124920889e-06, |
|
"loss": 4.7922, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.7822020053863525, |
|
"learning_rate": 5.220532070448048e-06, |
|
"loss": 4.7945, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.47084948420524597, |
|
"learning_rate": 5.1987080159752075e-06, |
|
"loss": 4.7954, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.6029684543609619, |
|
"learning_rate": 5.176883961502368e-06, |
|
"loss": 4.7949, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.7270153760910034, |
|
"learning_rate": 5.155059907029528e-06, |
|
"loss": 4.7945, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.7985190153121948, |
|
"learning_rate": 5.133235852556688e-06, |
|
"loss": 4.7963, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.5503430962562561, |
|
"learning_rate": 5.111411798083848e-06, |
|
"loss": 4.7929, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.8454219102859497, |
|
"learning_rate": 5.0895877436110085e-06, |
|
"loss": 4.7934, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.7326210737228394, |
|
"learning_rate": 5.067763689138169e-06, |
|
"loss": 4.7923, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.3302743434906006, |
|
"learning_rate": 5.045939634665329e-06, |
|
"loss": 4.7975, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.2851781845092773, |
|
"learning_rate": 5.024115580192489e-06, |
|
"loss": 4.7941, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.6426960825920105, |
|
"learning_rate": 5.002291525719649e-06, |
|
"loss": 4.7956, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.518356204032898, |
|
"learning_rate": 4.980467471246809e-06, |
|
"loss": 4.7952, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.0310689210891724, |
|
"learning_rate": 4.958643416773969e-06, |
|
"loss": 4.7948, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.9940834045410156, |
|
"learning_rate": 4.936819362301128e-06, |
|
"loss": 4.7944, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.5921940803527832, |
|
"learning_rate": 4.9149953078282885e-06, |
|
"loss": 4.7917, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.5630788207054138, |
|
"learning_rate": 4.893171253355449e-06, |
|
"loss": 4.7923, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.5146437287330627, |
|
"learning_rate": 4.871347198882609e-06, |
|
"loss": 4.7913, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.8808703422546387, |
|
"learning_rate": 4.849523144409769e-06, |
|
"loss": 4.7921, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 2.0775110721588135, |
|
"learning_rate": 4.827699089936929e-06, |
|
"loss": 4.7917, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.8530829548835754, |
|
"learning_rate": 4.805875035464089e-06, |
|
"loss": 4.792, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.6423510909080505, |
|
"learning_rate": 4.784050980991249e-06, |
|
"loss": 4.7912, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.6276280879974365, |
|
"learning_rate": 4.762226926518409e-06, |
|
"loss": 4.7916, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.0554680824279785, |
|
"learning_rate": 4.740402872045569e-06, |
|
"loss": 4.7918, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.6865511536598206, |
|
"learning_rate": 4.7185788175727295e-06, |
|
"loss": 4.7945, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.5930079817771912, |
|
"learning_rate": 4.696754763099889e-06, |
|
"loss": 4.7912, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 1.2960429191589355, |
|
"learning_rate": 4.674930708627049e-06, |
|
"loss": 4.79, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.62696772813797, |
|
"learning_rate": 4.653106654154209e-06, |
|
"loss": 4.7904, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.9733163118362427, |
|
"learning_rate": 4.6312825996813695e-06, |
|
"loss": 4.7891, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.0360530614852905, |
|
"learning_rate": 4.60945854520853e-06, |
|
"loss": 4.7878, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 1.112536072731018, |
|
"learning_rate": 4.587634490735689e-06, |
|
"loss": 4.7911, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.8656798601150513, |
|
"learning_rate": 4.565810436262849e-06, |
|
"loss": 4.7914, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.5879213809967041, |
|
"learning_rate": 4.5439863817900095e-06, |
|
"loss": 4.7909, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.8013112545013428, |
|
"learning_rate": 4.52216232731717e-06, |
|
"loss": 4.7876, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.8607842326164246, |
|
"learning_rate": 4.50033827284433e-06, |
|
"loss": 4.7896, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.590959370136261, |
|
"learning_rate": 4.47851421837149e-06, |
|
"loss": 4.7895, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.595662236213684, |
|
"learning_rate": 4.4566901638986495e-06, |
|
"loss": 4.7904, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.5102413892745972, |
|
"learning_rate": 4.43486610942581e-06, |
|
"loss": 4.7907, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.1544299125671387, |
|
"learning_rate": 4.41304205495297e-06, |
|
"loss": 4.791, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.2537280321121216, |
|
"learning_rate": 4.39121800048013e-06, |
|
"loss": 4.7938, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.6835722327232361, |
|
"learning_rate": 4.3693939460072895e-06, |
|
"loss": 4.7924, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.0808320045471191, |
|
"learning_rate": 4.34756989153445e-06, |
|
"loss": 4.7919, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.7416857481002808, |
|
"learning_rate": 4.32574583706161e-06, |
|
"loss": 4.7903, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.857487142086029, |
|
"learning_rate": 4.30392178258877e-06, |
|
"loss": 4.789, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.9709984064102173, |
|
"learning_rate": 4.2820977281159294e-06, |
|
"loss": 4.7862, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.2289056777954102, |
|
"learning_rate": 4.26027367364309e-06, |
|
"loss": 4.7914, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.7405953407287598, |
|
"learning_rate": 4.23844961917025e-06, |
|
"loss": 4.7943, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 1.3948872089385986, |
|
"learning_rate": 4.21662556469741e-06, |
|
"loss": 4.7898, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.6792095303535461, |
|
"learning_rate": 4.1948015102245694e-06, |
|
"loss": 4.7887, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.7049844264984131, |
|
"learning_rate": 4.17297745575173e-06, |
|
"loss": 4.793, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.5475296974182129, |
|
"learning_rate": 4.15115340127889e-06, |
|
"loss": 4.7881, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.6123815774917603, |
|
"learning_rate": 4.12932934680605e-06, |
|
"loss": 4.7901, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.7768149375915527, |
|
"learning_rate": 4.107505292333209e-06, |
|
"loss": 4.7918, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.5179715156555176, |
|
"learning_rate": 4.08568123786037e-06, |
|
"loss": 4.7884, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.607803225517273, |
|
"learning_rate": 4.06385718338753e-06, |
|
"loss": 4.79, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.2372949123382568, |
|
"learning_rate": 4.04203312891469e-06, |
|
"loss": 4.7889, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.3154016733169556, |
|
"learning_rate": 4.02020907444185e-06, |
|
"loss": 4.7906, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.5069667100906372, |
|
"learning_rate": 3.99838501996901e-06, |
|
"loss": 4.7871, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.9286430478096008, |
|
"learning_rate": 3.97656096549617e-06, |
|
"loss": 4.793, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 1.3074681758880615, |
|
"learning_rate": 3.95473691102333e-06, |
|
"loss": 4.7886, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.9673444628715515, |
|
"learning_rate": 3.93291285655049e-06, |
|
"loss": 4.7916, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 2.3527958393096924, |
|
"learning_rate": 3.9110888020776504e-06, |
|
"loss": 4.7917, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.6838983297348022, |
|
"learning_rate": 3.88926474760481e-06, |
|
"loss": 4.792, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.2325012683868408, |
|
"learning_rate": 3.86744069313197e-06, |
|
"loss": 4.79, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.6116911172866821, |
|
"learning_rate": 3.84561663865913e-06, |
|
"loss": 4.7873, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.2169787883758545, |
|
"learning_rate": 3.82379258418629e-06, |
|
"loss": 4.7877, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.7613152265548706, |
|
"learning_rate": 3.8019685297134506e-06, |
|
"loss": 4.7869, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.9709558486938477, |
|
"learning_rate": 3.7801444752406104e-06, |
|
"loss": 4.7882, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.7928886413574219, |
|
"learning_rate": 3.75832042076777e-06, |
|
"loss": 4.7891, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.5544800162315369, |
|
"learning_rate": 3.7364963662949304e-06, |
|
"loss": 4.7922, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.1702040433883667, |
|
"learning_rate": 3.7146723118220906e-06, |
|
"loss": 4.788, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 1.300114631652832, |
|
"learning_rate": 3.692848257349251e-06, |
|
"loss": 4.7907, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.8479206562042236, |
|
"learning_rate": 3.671024202876411e-06, |
|
"loss": 4.7869, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.6059646606445312, |
|
"learning_rate": 3.6492001484035704e-06, |
|
"loss": 4.7891, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.6603209972381592, |
|
"learning_rate": 3.6273760939307306e-06, |
|
"loss": 4.7899, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.5265193581581116, |
|
"learning_rate": 3.605552039457891e-06, |
|
"loss": 4.7873, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.8754724264144897, |
|
"learning_rate": 3.583727984985051e-06, |
|
"loss": 4.786, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.6786229610443115, |
|
"learning_rate": 3.561903930512211e-06, |
|
"loss": 4.7885, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 2.067661762237549, |
|
"learning_rate": 3.5400798760393706e-06, |
|
"loss": 4.7899, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.8492066860198975, |
|
"learning_rate": 3.5182558215665308e-06, |
|
"loss": 4.7896, |
|
"step": 29700 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.989614725112915, |
|
"learning_rate": 3.496431767093691e-06, |
|
"loss": 4.7883, |
|
"step": 29800 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.0036453008651733, |
|
"learning_rate": 3.474607712620851e-06, |
|
"loss": 4.7879, |
|
"step": 29900 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.2347569465637207, |
|
"learning_rate": 3.4527836581480114e-06, |
|
"loss": 4.7883, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.543158233165741, |
|
"learning_rate": 3.4309596036751708e-06, |
|
"loss": 4.7895, |
|
"step": 30100 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.209635615348816, |
|
"learning_rate": 3.409135549202331e-06, |
|
"loss": 4.7888, |
|
"step": 30200 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.4899618923664093, |
|
"learning_rate": 3.387311494729491e-06, |
|
"loss": 4.7848, |
|
"step": 30300 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.5836758017539978, |
|
"learning_rate": 3.3654874402566514e-06, |
|
"loss": 4.7886, |
|
"step": 30400 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.5948976874351501, |
|
"learning_rate": 3.343663385783811e-06, |
|
"loss": 4.7909, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.0364673137664795, |
|
"learning_rate": 3.3218393313109714e-06, |
|
"loss": 4.7873, |
|
"step": 30600 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.48503726720809937, |
|
"learning_rate": 3.300015276838131e-06, |
|
"loss": 4.7859, |
|
"step": 30700 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 1.423990249633789, |
|
"learning_rate": 3.2781912223652914e-06, |
|
"loss": 4.7868, |
|
"step": 30800 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.7693653106689453, |
|
"learning_rate": 3.256367167892451e-06, |
|
"loss": 4.789, |
|
"step": 30900 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.8543452620506287, |
|
"learning_rate": 3.2345431134196114e-06, |
|
"loss": 4.79, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.765480399131775, |
|
"learning_rate": 3.2127190589467716e-06, |
|
"loss": 4.7901, |
|
"step": 31100 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.3010811805725098, |
|
"learning_rate": 3.1908950044739313e-06, |
|
"loss": 4.7885, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.4873560965061188, |
|
"learning_rate": 3.169070950001091e-06, |
|
"loss": 4.7865, |
|
"step": 31300 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 1.61781907081604, |
|
"learning_rate": 3.1472468955282513e-06, |
|
"loss": 4.7873, |
|
"step": 31400 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.9210663437843323, |
|
"learning_rate": 3.1254228410554115e-06, |
|
"loss": 4.7878, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.7144986391067505, |
|
"learning_rate": 3.1035987865825718e-06, |
|
"loss": 4.7865, |
|
"step": 31600 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.6649860143661499, |
|
"learning_rate": 3.0817747321097315e-06, |
|
"loss": 4.7898, |
|
"step": 31700 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.7339694499969482, |
|
"learning_rate": 3.0599506776368913e-06, |
|
"loss": 4.786, |
|
"step": 31800 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.963561773300171, |
|
"learning_rate": 3.0381266231640515e-06, |
|
"loss": 4.7882, |
|
"step": 31900 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.8179004192352295, |
|
"learning_rate": 3.0163025686912117e-06, |
|
"loss": 4.7844, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.7514827251434326, |
|
"learning_rate": 2.994478514218372e-06, |
|
"loss": 4.7871, |
|
"step": 32100 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 1.3072575330734253, |
|
"learning_rate": 2.972654459745532e-06, |
|
"loss": 4.7897, |
|
"step": 32200 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.9841882586479187, |
|
"learning_rate": 2.9508304052726915e-06, |
|
"loss": 4.787, |
|
"step": 32300 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.8150787353515625, |
|
"learning_rate": 2.9290063507998517e-06, |
|
"loss": 4.787, |
|
"step": 32400 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.6560021638870239, |
|
"learning_rate": 2.907182296327012e-06, |
|
"loss": 4.7853, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.4383647441864014, |
|
"learning_rate": 2.885358241854172e-06, |
|
"loss": 4.7858, |
|
"step": 32600 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.6023637056350708, |
|
"learning_rate": 2.8635341873813323e-06, |
|
"loss": 4.786, |
|
"step": 32700 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.7451430559158325, |
|
"learning_rate": 2.8417101329084917e-06, |
|
"loss": 4.7917, |
|
"step": 32800 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.5302537679672241, |
|
"learning_rate": 2.819886078435652e-06, |
|
"loss": 4.7863, |
|
"step": 32900 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.9489607214927673, |
|
"learning_rate": 2.798062023962812e-06, |
|
"loss": 4.7873, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.1517263650894165, |
|
"learning_rate": 2.7762379694899723e-06, |
|
"loss": 4.788, |
|
"step": 33100 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.5979319214820862, |
|
"learning_rate": 2.754413915017132e-06, |
|
"loss": 4.7898, |
|
"step": 33200 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.8156241178512573, |
|
"learning_rate": 2.732589860544292e-06, |
|
"loss": 4.7883, |
|
"step": 33300 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.709427833557129, |
|
"learning_rate": 2.710765806071452e-06, |
|
"loss": 4.7867, |
|
"step": 33400 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.5464685559272766, |
|
"learning_rate": 2.6889417515986123e-06, |
|
"loss": 4.7889, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.050419569015503, |
|
"learning_rate": 2.667117697125772e-06, |
|
"loss": 4.7883, |
|
"step": 33600 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.6171194314956665, |
|
"learning_rate": 2.6452936426529323e-06, |
|
"loss": 4.7893, |
|
"step": 33700 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.7161327600479126, |
|
"learning_rate": 2.6234695881800925e-06, |
|
"loss": 4.787, |
|
"step": 33800 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.5941922664642334, |
|
"learning_rate": 2.6016455337072523e-06, |
|
"loss": 4.7843, |
|
"step": 33900 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 1.1841336488723755, |
|
"learning_rate": 2.579821479234412e-06, |
|
"loss": 4.7856, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.7025249004364014, |
|
"learning_rate": 2.5579974247615723e-06, |
|
"loss": 4.7843, |
|
"step": 34100 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.5568642616271973, |
|
"learning_rate": 2.5361733702887325e-06, |
|
"loss": 4.7851, |
|
"step": 34200 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.3719669580459595, |
|
"learning_rate": 2.5143493158158927e-06, |
|
"loss": 4.7846, |
|
"step": 34300 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.6786982417106628, |
|
"learning_rate": 2.4925252613430525e-06, |
|
"loss": 4.7845, |
|
"step": 34400 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.6673011183738708, |
|
"learning_rate": 2.4707012068702123e-06, |
|
"loss": 4.7876, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.6152336597442627, |
|
"learning_rate": 2.4488771523973725e-06, |
|
"loss": 4.7864, |
|
"step": 34600 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.4687036275863647, |
|
"learning_rate": 2.4270530979245327e-06, |
|
"loss": 4.7851, |
|
"step": 34700 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.5648071765899658, |
|
"learning_rate": 2.4052290434516925e-06, |
|
"loss": 4.7877, |
|
"step": 34800 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.547589898109436, |
|
"learning_rate": 2.3834049889788527e-06, |
|
"loss": 4.7886, |
|
"step": 34900 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.9473028182983398, |
|
"learning_rate": 2.361580934506013e-06, |
|
"loss": 4.7869, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.109892725944519, |
|
"learning_rate": 2.3397568800331727e-06, |
|
"loss": 4.7893, |
|
"step": 35100 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.2780309915542603, |
|
"learning_rate": 2.317932825560333e-06, |
|
"loss": 4.7856, |
|
"step": 35200 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.0184062719345093, |
|
"learning_rate": 2.2961087710874927e-06, |
|
"loss": 4.7851, |
|
"step": 35300 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.2379289865493774, |
|
"learning_rate": 2.274284716614653e-06, |
|
"loss": 4.7895, |
|
"step": 35400 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 1.372209072113037, |
|
"learning_rate": 2.252460662141813e-06, |
|
"loss": 4.7869, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.6445749402046204, |
|
"learning_rate": 2.230636607668973e-06, |
|
"loss": 4.7833, |
|
"step": 35600 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.5948896408081055, |
|
"learning_rate": 2.208812553196133e-06, |
|
"loss": 4.7873, |
|
"step": 35700 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.6000692844390869, |
|
"learning_rate": 2.1869884987232933e-06, |
|
"loss": 4.785, |
|
"step": 35800 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.0963830947875977, |
|
"learning_rate": 2.165164444250453e-06, |
|
"loss": 4.7883, |
|
"step": 35900 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.0998800992965698, |
|
"learning_rate": 2.1433403897776133e-06, |
|
"loss": 4.7888, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 1.1177624464035034, |
|
"learning_rate": 2.121516335304773e-06, |
|
"loss": 4.7847, |
|
"step": 36100 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.6763335466384888, |
|
"learning_rate": 2.0996922808319333e-06, |
|
"loss": 4.785, |
|
"step": 36200 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.9737639427185059, |
|
"learning_rate": 2.077868226359093e-06, |
|
"loss": 4.787, |
|
"step": 36300 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.756729245185852, |
|
"learning_rate": 2.0560441718862532e-06, |
|
"loss": 4.7865, |
|
"step": 36400 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.7666120529174805, |
|
"learning_rate": 2.0342201174134134e-06, |
|
"loss": 4.7866, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.5760579705238342, |
|
"learning_rate": 2.0123960629405732e-06, |
|
"loss": 4.7849, |
|
"step": 36600 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.0564417839050293, |
|
"learning_rate": 1.9905720084677334e-06, |
|
"loss": 4.7863, |
|
"step": 36700 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.7988053560256958, |
|
"learning_rate": 1.9687479539948932e-06, |
|
"loss": 4.7849, |
|
"step": 36800 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 1.6380457878112793, |
|
"learning_rate": 1.9469238995220534e-06, |
|
"loss": 4.786, |
|
"step": 36900 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.6611093282699585, |
|
"learning_rate": 1.9250998450492132e-06, |
|
"loss": 4.7862, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.7461991310119629, |
|
"learning_rate": 1.9032757905763734e-06, |
|
"loss": 4.7859, |
|
"step": 37100 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.7583649158477783, |
|
"learning_rate": 1.8814517361035334e-06, |
|
"loss": 4.7871, |
|
"step": 37200 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 2.0912837982177734, |
|
"learning_rate": 1.8596276816306934e-06, |
|
"loss": 4.7834, |
|
"step": 37300 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.5466910004615784, |
|
"learning_rate": 1.8378036271578536e-06, |
|
"loss": 4.7869, |
|
"step": 37400 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.6969348192214966, |
|
"learning_rate": 1.8159795726850136e-06, |
|
"loss": 4.7834, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.0343713760375977, |
|
"learning_rate": 1.7941555182121736e-06, |
|
"loss": 4.7849, |
|
"step": 37600 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.1310784816741943, |
|
"learning_rate": 1.7723314637393336e-06, |
|
"loss": 4.7817, |
|
"step": 37700 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.7537195682525635, |
|
"learning_rate": 1.7505074092664936e-06, |
|
"loss": 4.7873, |
|
"step": 37800 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.5658436417579651, |
|
"learning_rate": 1.7286833547936538e-06, |
|
"loss": 4.7837, |
|
"step": 37900 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.7453294396400452, |
|
"learning_rate": 1.7068593003208136e-06, |
|
"loss": 4.7842, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.6501612663269043, |
|
"learning_rate": 1.6850352458479738e-06, |
|
"loss": 4.7868, |
|
"step": 38100 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.5469274520874023, |
|
"learning_rate": 1.663211191375134e-06, |
|
"loss": 4.7874, |
|
"step": 38200 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.8729673624038696, |
|
"learning_rate": 1.6413871369022938e-06, |
|
"loss": 4.7844, |
|
"step": 38300 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.5102872252464294, |
|
"learning_rate": 1.619563082429454e-06, |
|
"loss": 4.7833, |
|
"step": 38400 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.5068778395652771, |
|
"learning_rate": 1.5977390279566138e-06, |
|
"loss": 4.7854, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.865955114364624, |
|
"learning_rate": 1.575914973483774e-06, |
|
"loss": 4.7869, |
|
"step": 38600 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.6757729053497314, |
|
"learning_rate": 1.554090919010934e-06, |
|
"loss": 4.7853, |
|
"step": 38700 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.1830861568450928, |
|
"learning_rate": 1.532266864538094e-06, |
|
"loss": 4.7843, |
|
"step": 38800 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.9592554569244385, |
|
"learning_rate": 1.510442810065254e-06, |
|
"loss": 4.783, |
|
"step": 38900 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.7421624064445496, |
|
"learning_rate": 1.4886187555924142e-06, |
|
"loss": 4.7852, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.3924355506896973, |
|
"learning_rate": 1.466794701119574e-06, |
|
"loss": 4.7847, |
|
"step": 39100 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.4272788763046265, |
|
"learning_rate": 1.4449706466467342e-06, |
|
"loss": 4.7829, |
|
"step": 39200 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.593848466873169, |
|
"learning_rate": 1.423146592173894e-06, |
|
"loss": 4.7882, |
|
"step": 39300 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.579724133014679, |
|
"learning_rate": 1.4013225377010542e-06, |
|
"loss": 4.788, |
|
"step": 39400 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.6681183576583862, |
|
"learning_rate": 1.3794984832282144e-06, |
|
"loss": 4.7828, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.9919196963310242, |
|
"learning_rate": 1.3576744287553742e-06, |
|
"loss": 4.7849, |
|
"step": 39600 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.0009275674819946, |
|
"learning_rate": 1.3358503742825344e-06, |
|
"loss": 4.7848, |
|
"step": 39700 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.044150710105896, |
|
"learning_rate": 1.3140263198096944e-06, |
|
"loss": 4.7865, |
|
"step": 39800 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.9887941479682922, |
|
"learning_rate": 1.2922022653368544e-06, |
|
"loss": 4.7861, |
|
"step": 39900 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.3287606239318848, |
|
"learning_rate": 1.2703782108640146e-06, |
|
"loss": 4.7831, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.6255909204483032, |
|
"learning_rate": 1.2485541563911746e-06, |
|
"loss": 4.784, |
|
"step": 40100 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.1618168354034424, |
|
"learning_rate": 1.2267301019183346e-06, |
|
"loss": 4.7854, |
|
"step": 40200 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 1.4494011402130127, |
|
"learning_rate": 1.2049060474454946e-06, |
|
"loss": 4.7858, |
|
"step": 40300 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.5680764317512512, |
|
"learning_rate": 1.1830819929726546e-06, |
|
"loss": 4.7863, |
|
"step": 40400 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.6083593964576721, |
|
"learning_rate": 1.1612579384998145e-06, |
|
"loss": 4.7858, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.0342472791671753, |
|
"learning_rate": 1.1394338840269745e-06, |
|
"loss": 4.7845, |
|
"step": 40600 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.8548548817634583, |
|
"learning_rate": 1.1176098295541345e-06, |
|
"loss": 4.7844, |
|
"step": 40700 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 1.0878795385360718, |
|
"learning_rate": 1.0957857750812947e-06, |
|
"loss": 4.7827, |
|
"step": 40800 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.6437754034996033, |
|
"learning_rate": 1.0739617206084547e-06, |
|
"loss": 4.785, |
|
"step": 40900 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.8510251641273499, |
|
"learning_rate": 1.0521376661356147e-06, |
|
"loss": 4.7863, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.6618091464042664, |
|
"learning_rate": 1.0303136116627747e-06, |
|
"loss": 4.7857, |
|
"step": 41100 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.6428667902946472, |
|
"learning_rate": 1.008489557189935e-06, |
|
"loss": 4.7836, |
|
"step": 41200 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.7751622200012207, |
|
"learning_rate": 9.86665502717095e-07, |
|
"loss": 4.7838, |
|
"step": 41300 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.6780493855476379, |
|
"learning_rate": 9.64841448244255e-07, |
|
"loss": 4.785, |
|
"step": 41400 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.6410045623779297, |
|
"learning_rate": 9.430173937714149e-07, |
|
"loss": 4.7867, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.6422233581542969, |
|
"learning_rate": 9.21193339298575e-07, |
|
"loss": 4.7864, |
|
"step": 41600 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.48585817217826843, |
|
"learning_rate": 8.99369284825735e-07, |
|
"loss": 4.7862, |
|
"step": 41700 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 1.0709576606750488, |
|
"learning_rate": 8.77545230352895e-07, |
|
"loss": 4.7845, |
|
"step": 41800 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.5594016313552856, |
|
"learning_rate": 8.55721175880055e-07, |
|
"loss": 4.7852, |
|
"step": 41900 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.6192421913146973, |
|
"learning_rate": 8.338971214072151e-07, |
|
"loss": 4.7857, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.3113442659378052, |
|
"learning_rate": 8.120730669343751e-07, |
|
"loss": 4.7837, |
|
"step": 42100 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.6697332859039307, |
|
"learning_rate": 7.902490124615351e-07, |
|
"loss": 4.7861, |
|
"step": 42200 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.584178626537323, |
|
"learning_rate": 7.684249579886951e-07, |
|
"loss": 4.7852, |
|
"step": 42300 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.5884829759597778, |
|
"learning_rate": 7.466009035158553e-07, |
|
"loss": 4.7852, |
|
"step": 42400 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.7332740426063538, |
|
"learning_rate": 7.247768490430153e-07, |
|
"loss": 4.7843, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.1577807664871216, |
|
"learning_rate": 7.029527945701753e-07, |
|
"loss": 4.7838, |
|
"step": 42600 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.6798880100250244, |
|
"learning_rate": 6.811287400973353e-07, |
|
"loss": 4.7824, |
|
"step": 42700 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.1418567895889282, |
|
"learning_rate": 6.593046856244954e-07, |
|
"loss": 4.7832, |
|
"step": 42800 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.8666886687278748, |
|
"learning_rate": 6.374806311516554e-07, |
|
"loss": 4.7836, |
|
"step": 42900 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.611757218837738, |
|
"learning_rate": 6.156565766788154e-07, |
|
"loss": 4.7881, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.6125512719154358, |
|
"learning_rate": 5.938325222059755e-07, |
|
"loss": 4.784, |
|
"step": 43100 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.8485561609268188, |
|
"learning_rate": 5.720084677331355e-07, |
|
"loss": 4.7855, |
|
"step": 43200 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.1883187294006348, |
|
"learning_rate": 5.501844132602956e-07, |
|
"loss": 4.7849, |
|
"step": 43300 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.5186755657196045, |
|
"learning_rate": 5.283603587874556e-07, |
|
"loss": 4.7815, |
|
"step": 43400 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.8067595958709717, |
|
"learning_rate": 5.065363043146156e-07, |
|
"loss": 4.7834, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.5256661176681519, |
|
"learning_rate": 4.847122498417756e-07, |
|
"loss": 4.7846, |
|
"step": 43600 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.5952714085578918, |
|
"learning_rate": 4.628881953689357e-07, |
|
"loss": 4.7824, |
|
"step": 43700 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.6391135454177856, |
|
"learning_rate": 4.410641408960957e-07, |
|
"loss": 4.7865, |
|
"step": 43800 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.5354276299476624, |
|
"learning_rate": 4.192400864232558e-07, |
|
"loss": 4.7834, |
|
"step": 43900 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.8023102283477783, |
|
"learning_rate": 3.974160319504158e-07, |
|
"loss": 4.7826, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.9348645210266113, |
|
"learning_rate": 3.755919774775758e-07, |
|
"loss": 4.7842, |
|
"step": 44100 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.9189215302467346, |
|
"learning_rate": 3.537679230047358e-07, |
|
"loss": 4.7827, |
|
"step": 44200 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.6931596994400024, |
|
"learning_rate": 3.319438685318959e-07, |
|
"loss": 4.7866, |
|
"step": 44300 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.1738442182540894, |
|
"learning_rate": 3.101198140590559e-07, |
|
"loss": 4.7836, |
|
"step": 44400 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.6504749059677124, |
|
"learning_rate": 2.882957595862159e-07, |
|
"loss": 4.788, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.6548625230789185, |
|
"learning_rate": 2.6647170511337596e-07, |
|
"loss": 4.7857, |
|
"step": 44600 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.621839165687561, |
|
"learning_rate": 2.44647650640536e-07, |
|
"loss": 4.782, |
|
"step": 44700 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.5665038228034973, |
|
"learning_rate": 2.2282359616769603e-07, |
|
"loss": 4.7858, |
|
"step": 44800 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.6663780808448792, |
|
"learning_rate": 2.0099954169485608e-07, |
|
"loss": 4.784, |
|
"step": 44900 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.4924936294555664, |
|
"learning_rate": 1.791754872220161e-07, |
|
"loss": 4.7842, |
|
"step": 45000 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 45821, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"total_flos": 1.4750802130806374e+18, |
|
"train_batch_size": 256, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|