|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 5088, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00019654088050314466, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 1.9646365422396855e-07, |
|
"loss": 2.5088, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0009827044025157233, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 9.823182711198429e-07, |
|
"loss": 2.5011, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0019654088050314465, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 1.9646365422396858e-06, |
|
"loss": 2.507, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.00294811320754717, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 2.946954813359529e-06, |
|
"loss": 2.495, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.003930817610062893, |
|
"grad_norm": 2.25, |
|
"learning_rate": 3.9292730844793715e-06, |
|
"loss": 2.4677, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.004913522012578616, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 4.911591355599214e-06, |
|
"loss": 2.4155, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.00589622641509434, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 5.893909626719058e-06, |
|
"loss": 2.3711, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.006878930817610063, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 6.876227897838901e-06, |
|
"loss": 2.3188, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.007861635220125786, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 7.858546168958743e-06, |
|
"loss": 2.2165, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.00884433962264151, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 8.840864440078587e-06, |
|
"loss": 2.1073, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.009827044025157232, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 9.823182711198428e-06, |
|
"loss": 2.0433, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.010809748427672955, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 1.0805500982318271e-05, |
|
"loss": 1.9535, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.01179245283018868, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 1.1787819253438115e-05, |
|
"loss": 1.8931, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.012775157232704403, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 1.2770137524557958e-05, |
|
"loss": 1.8378, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.013757861635220126, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 1.3752455795677802e-05, |
|
"loss": 1.7701, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.01474056603773585, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 1.4734774066797644e-05, |
|
"loss": 1.7103, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.015723270440251572, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 1.5717092337917486e-05, |
|
"loss": 1.6636, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.016705974842767295, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 1.669941060903733e-05, |
|
"loss": 1.6086, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.01768867924528302, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 1.7681728880157174e-05, |
|
"loss": 1.5668, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.01867138364779874, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 1.8664047151277013e-05, |
|
"loss": 1.5144, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.019654088050314465, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 1.9646365422396855e-05, |
|
"loss": 1.4948, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.020636792452830188, |
|
"grad_norm": 0.375, |
|
"learning_rate": 2.06286836935167e-05, |
|
"loss": 1.4615, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.02161949685534591, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 2.1611001964636543e-05, |
|
"loss": 1.4433, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.022602201257861634, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 2.2593320235756385e-05, |
|
"loss": 1.4125, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.02358490566037736, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 2.357563850687623e-05, |
|
"loss": 1.408, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.024567610062893083, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 2.4557956777996073e-05, |
|
"loss": 1.4023, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.025550314465408806, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 2.5540275049115915e-05, |
|
"loss": 1.3823, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.02653301886792453, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 2.6522593320235754e-05, |
|
"loss": 1.3891, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.027515723270440252, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 2.7504911591355603e-05, |
|
"loss": 1.356, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.028498427672955975, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 2.8487229862475445e-05, |
|
"loss": 1.3275, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.0294811320754717, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 2.9469548133595288e-05, |
|
"loss": 1.3491, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.03046383647798742, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 3.045186640471513e-05, |
|
"loss": 1.3364, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.031446540880503145, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 3.143418467583497e-05, |
|
"loss": 1.3052, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.03242924528301887, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 3.241650294695481e-05, |
|
"loss": 1.3164, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.03341194968553459, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 3.339882121807466e-05, |
|
"loss": 1.3071, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.034394654088050314, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 3.43811394891945e-05, |
|
"loss": 1.3032, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.03537735849056604, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 3.536345776031435e-05, |
|
"loss": 1.2914, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.03636006289308176, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 3.634577603143419e-05, |
|
"loss": 1.2698, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.03734276729559748, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 3.7328094302554026e-05, |
|
"loss": 1.2984, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.038325471698113206, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 3.831041257367387e-05, |
|
"loss": 1.2854, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.03930817610062893, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 3.929273084479371e-05, |
|
"loss": 1.2781, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.04029088050314465, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 4.027504911591356e-05, |
|
"loss": 1.285, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.041273584905660375, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 4.12573673870334e-05, |
|
"loss": 1.2705, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.0422562893081761, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 4.223968565815325e-05, |
|
"loss": 1.2603, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.04323899371069182, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 4.3222003929273086e-05, |
|
"loss": 1.2613, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.044221698113207544, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 4.4204322200392925e-05, |
|
"loss": 1.2453, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.04520440251572327, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 4.518664047151277e-05, |
|
"loss": 1.2662, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.04618710691823899, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 4.6168958742632616e-05, |
|
"loss": 1.2483, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.04716981132075472, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 4.715127701375246e-05, |
|
"loss": 1.2561, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.04815251572327044, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 4.81335952848723e-05, |
|
"loss": 1.2218, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.049135220125786166, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 4.9115913555992146e-05, |
|
"loss": 1.244, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.05011792452830189, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 5.0098231827111985e-05, |
|
"loss": 1.2447, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.05110062893081761, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 5.108055009823183e-05, |
|
"loss": 1.2392, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.052083333333333336, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 5.206286836935167e-05, |
|
"loss": 1.2235, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.05306603773584906, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 5.304518664047151e-05, |
|
"loss": 1.2133, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.05404874213836478, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 5.4027504911591354e-05, |
|
"loss": 1.2414, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.055031446540880505, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 5.5009823182711206e-05, |
|
"loss": 1.203, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.05601415094339623, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 5.5992141453831045e-05, |
|
"loss": 1.2206, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.05699685534591195, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 5.697445972495089e-05, |
|
"loss": 1.2233, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.057979559748427674, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 5.795677799607073e-05, |
|
"loss": 1.1989, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.0589622641509434, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 5.8939096267190575e-05, |
|
"loss": 1.1879, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.05994496855345912, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 5.9921414538310414e-05, |
|
"loss": 1.1961, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.06092767295597484, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 6.090373280943026e-05, |
|
"loss": 1.2021, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.061910377358490566, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 6.18860510805501e-05, |
|
"loss": 1.2153, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.06289308176100629, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 6.286836935166994e-05, |
|
"loss": 1.1961, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.06387578616352202, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 6.385068762278978e-05, |
|
"loss": 1.1765, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.06485849056603774, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 6.483300589390962e-05, |
|
"loss": 1.1868, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.06584119496855347, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 6.581532416502947e-05, |
|
"loss": 1.1748, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.06682389937106918, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 6.679764243614931e-05, |
|
"loss": 1.1659, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.06780660377358491, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 6.777996070726917e-05, |
|
"loss": 1.1648, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.06878930817610063, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 6.8762278978389e-05, |
|
"loss": 1.1573, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.06977201257861636, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 6.974459724950884e-05, |
|
"loss": 1.1894, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.07075471698113207, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 7.07269155206287e-05, |
|
"loss": 1.1512, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.0717374213836478, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 7.170923379174853e-05, |
|
"loss": 1.1746, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.07272012578616352, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 7.269155206286837e-05, |
|
"loss": 1.1589, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.07370283018867925, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 7.367387033398821e-05, |
|
"loss": 1.1567, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.07468553459119497, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 7.465618860510805e-05, |
|
"loss": 1.1611, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.0756682389937107, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 7.56385068762279e-05, |
|
"loss": 1.1563, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.07665094339622641, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 7.662082514734774e-05, |
|
"loss": 1.1603, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.07763364779874214, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 7.760314341846758e-05, |
|
"loss": 1.1534, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.07861635220125786, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 7.858546168958742e-05, |
|
"loss": 1.1575, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.07959905660377359, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 7.956777996070727e-05, |
|
"loss": 1.154, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.0805817610062893, |
|
"grad_norm": 0.375, |
|
"learning_rate": 8.055009823182712e-05, |
|
"loss": 1.1613, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.08156446540880503, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 8.153241650294696e-05, |
|
"loss": 1.1335, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.08254716981132075, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 8.25147347740668e-05, |
|
"loss": 1.1369, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.08352987421383648, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 8.349705304518664e-05, |
|
"loss": 1.1388, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.0845125786163522, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 8.44793713163065e-05, |
|
"loss": 1.1228, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.08549528301886793, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 8.546168958742633e-05, |
|
"loss": 1.1552, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.08647798742138364, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 8.644400785854617e-05, |
|
"loss": 1.1317, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.08746069182389937, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 8.742632612966601e-05, |
|
"loss": 1.1379, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.08844339622641509, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 8.840864440078585e-05, |
|
"loss": 1.1342, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.08942610062893082, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 8.93909626719057e-05, |
|
"loss": 1.1154, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.09040880503144653, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 9.037328094302554e-05, |
|
"loss": 1.126, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.09139150943396226, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 9.135559921414538e-05, |
|
"loss": 1.1165, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.09237421383647798, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 9.233791748526523e-05, |
|
"loss": 1.1268, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.09335691823899371, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 9.332023575638507e-05, |
|
"loss": 1.113, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.09433962264150944, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 9.430255402750492e-05, |
|
"loss": 1.0768, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.09532232704402516, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 9.528487229862476e-05, |
|
"loss": 1.109, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.09630503144654089, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 9.62671905697446e-05, |
|
"loss": 1.1147, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.0972877358490566, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 9.724950884086444e-05, |
|
"loss": 1.1194, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.09827044025157233, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 9.823182711198429e-05, |
|
"loss": 1.1196, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.09925314465408805, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 9.921414538310413e-05, |
|
"loss": 1.0912, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.10023584905660378, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.00010019646365422397, |
|
"loss": 1.0998, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.1012185534591195, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.00010117878192534382, |
|
"loss": 1.1117, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.10220125786163523, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.00010216110019646366, |
|
"loss": 1.0974, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.10318396226415094, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.00010314341846758351, |
|
"loss": 1.1172, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.10416666666666667, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00010412573673870334, |
|
"loss": 1.1188, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.10514937106918239, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.00010510805500982319, |
|
"loss": 1.102, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.10613207547169812, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.00010609037328094302, |
|
"loss": 1.0926, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.10711477987421383, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.00010707269155206288, |
|
"loss": 1.0811, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.10809748427672956, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.00010805500982318271, |
|
"loss": 1.1066, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.10908018867924528, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.00010903732809430256, |
|
"loss": 1.0843, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.11006289308176101, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00011001964636542241, |
|
"loss": 1.0953, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.11104559748427673, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00011100196463654224, |
|
"loss": 1.0918, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.11202830188679246, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00011198428290766209, |
|
"loss": 1.0728, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.11301100628930817, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00011296660117878193, |
|
"loss": 1.0745, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.1139937106918239, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.00011394891944990178, |
|
"loss": 1.0784, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.11497641509433962, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.00011493123772102161, |
|
"loss": 1.0865, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.11595911949685535, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00011591355599214146, |
|
"loss": 1.0642, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.11694182389937106, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.0001168958742632613, |
|
"loss": 1.0752, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.1179245283018868, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.00011787819253438115, |
|
"loss": 1.0718, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.11890723270440251, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.00011886051080550098, |
|
"loss": 1.082, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.11988993710691824, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.00011984282907662083, |
|
"loss": 1.0514, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.12087264150943396, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.00012082514734774067, |
|
"loss": 1.0718, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.12185534591194969, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00012180746561886052, |
|
"loss": 1.0721, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.1228380503144654, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00012278978388998037, |
|
"loss": 1.087, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.12382075471698113, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.0001237721021611002, |
|
"loss": 1.0613, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.12480345911949685, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.00012475442043222005, |
|
"loss": 1.0848, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.12578616352201258, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.0001257367387033399, |
|
"loss": 1.076, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.1267688679245283, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.00012671905697445973, |
|
"loss": 1.0684, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.12775157232704404, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.00012770137524557957, |
|
"loss": 1.0788, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.12873427672955975, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.00012868369351669943, |
|
"loss": 1.0672, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.12971698113207547, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00012966601178781924, |
|
"loss": 1.0795, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.1306996855345912, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0001306483300589391, |
|
"loss": 1.0481, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.13168238993710693, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.00013163064833005895, |
|
"loss": 1.0537, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.13266509433962265, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.0001326129666011788, |
|
"loss": 1.0627, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.13364779874213836, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00013359528487229863, |
|
"loss": 1.0436, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.13463050314465408, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.00013457760314341847, |
|
"loss": 1.048, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.13561320754716982, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00013555992141453833, |
|
"loss": 1.0611, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.13659591194968554, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.00013654223968565817, |
|
"loss": 1.0509, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.13757861635220126, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.000137524557956778, |
|
"loss": 1.054, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.13856132075471697, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.00013850687622789785, |
|
"loss": 1.0321, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.13954402515723272, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.0001394891944990177, |
|
"loss": 1.0389, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.14052672955974843, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.00014047151277013753, |
|
"loss": 1.0537, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.14150943396226415, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0001414538310412574, |
|
"loss": 1.0514, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.14249213836477986, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0001424361493123772, |
|
"loss": 1.0085, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.1434748427672956, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.00014341846758349707, |
|
"loss": 1.0316, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.14445754716981132, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00014440078585461688, |
|
"loss": 1.0503, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.14544025157232704, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00014538310412573675, |
|
"loss": 1.0632, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.14642295597484276, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.0001463654223968566, |
|
"loss": 1.0317, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.1474056603773585, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.00014734774066797642, |
|
"loss": 1.0498, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.14838836477987422, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.0001483300589390963, |
|
"loss": 1.0312, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.14937106918238993, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.0001493123772102161, |
|
"loss": 1.0421, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.15035377358490565, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00015029469548133597, |
|
"loss": 1.0416, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.1513364779874214, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0001512770137524558, |
|
"loss": 1.0296, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.1523191823899371, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.00015225933202357565, |
|
"loss": 1.0351, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.15330188679245282, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00015324165029469548, |
|
"loss": 1.0265, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.15428459119496854, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00015422396856581532, |
|
"loss": 1.0243, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.15526729559748428, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.00015520628683693516, |
|
"loss": 1.0341, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.15625, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.00015618860510805503, |
|
"loss": 1.0214, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.15723270440251572, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.00015717092337917484, |
|
"loss": 1.0199, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.15821540880503146, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0001581532416502947, |
|
"loss": 1.0078, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.15919811320754718, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00015913555992141455, |
|
"loss": 1.0247, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.1601808176100629, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00016011787819253438, |
|
"loss": 1.0397, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.1611635220125786, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.00016110019646365425, |
|
"loss": 1.0195, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.16214622641509435, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.00016208251473477406, |
|
"loss": 1.0129, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.16312893081761007, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00016306483300589393, |
|
"loss": 1.0171, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.16411163522012578, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00016404715127701377, |
|
"loss": 1.0045, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.1650943396226415, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.0001650294695481336, |
|
"loss": 0.9911, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.16607704402515724, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.00016601178781925344, |
|
"loss": 0.9963, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.16705974842767296, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.00016699410609037328, |
|
"loss": 1.0181, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.16804245283018868, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00016797642436149312, |
|
"loss": 1.0027, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.1690251572327044, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.000168958742632613, |
|
"loss": 1.0185, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.17000786163522014, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.0001699410609037328, |
|
"loss": 1.0021, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.17099056603773585, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.00017092337917485267, |
|
"loss": 1.0071, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.17197327044025157, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.0001719056974459725, |
|
"loss": 1.0065, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.17295597484276728, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00017288801571709234, |
|
"loss": 1.0237, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.17393867924528303, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.0001738703339882122, |
|
"loss": 1.0162, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.17492138364779874, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.00017485265225933202, |
|
"loss": 0.9945, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.17590408805031446, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.0001758349705304519, |
|
"loss": 1.016, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.17688679245283018, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0001768172888015717, |
|
"loss": 1.0337, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.17786949685534592, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00017779960707269156, |
|
"loss": 1.0119, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.17885220125786164, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.0001787819253438114, |
|
"loss": 1.0119, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.17983490566037735, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00017976424361493124, |
|
"loss": 0.9935, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.18081761006289307, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00018074656188605108, |
|
"loss": 1.0026, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.1818003144654088, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.00018172888015717092, |
|
"loss": 0.9855, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.18278301886792453, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00018271119842829076, |
|
"loss": 1.0135, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.18376572327044025, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00018369351669941062, |
|
"loss": 1.0028, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.18474842767295596, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.00018467583497053046, |
|
"loss": 1.0078, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.1857311320754717, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.0001856581532416503, |
|
"loss": 0.9931, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.18671383647798742, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.00018664047151277014, |
|
"loss": 0.9898, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.18769654088050314, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.00018762278978388998, |
|
"loss": 0.9982, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.18867924528301888, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00018860510805500985, |
|
"loss": 0.9974, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.1896619496855346, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.00018958742632612966, |
|
"loss": 0.9886, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.1906446540880503, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00019056974459724952, |
|
"loss": 0.9926, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.19162735849056603, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00019155206286836936, |
|
"loss": 1.0024, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.19261006289308177, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0001925343811394892, |
|
"loss": 0.9963, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.1935927672955975, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.00019351669941060904, |
|
"loss": 1.0055, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.1945754716981132, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.00019449901768172888, |
|
"loss": 0.9818, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.19555817610062892, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.00019548133595284875, |
|
"loss": 1.0134, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.19654088050314467, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.00019646365422396858, |
|
"loss": 0.9932, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.19752358490566038, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00019744597249508842, |
|
"loss": 0.9802, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.1985062893081761, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00019842829076620826, |
|
"loss": 0.99, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.19948899371069181, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.0001994106090373281, |
|
"loss": 0.999, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.20047169811320756, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00019999997646422815, |
|
"loss": 0.9685, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.20145440251572327, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00019999971168692198, |
|
"loss": 0.9788, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.202437106918239, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00019999915271337634, |
|
"loss": 0.9736, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.2034198113207547, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00019999829954523573, |
|
"loss": 0.9811, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.20440251572327045, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00019999715218501016, |
|
"loss": 0.9989, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.20538522012578617, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00019999571063607512, |
|
"loss": 0.976, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.20636792452830188, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00019999397490267162, |
|
"loss": 0.9551, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.2073506289308176, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00019999194498990613, |
|
"loss": 0.9894, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.20833333333333334, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0001999896209037506, |
|
"loss": 0.9754, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.20931603773584906, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.00019998700265104238, |
|
"loss": 0.961, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.21029874213836477, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00019998409023948432, |
|
"loss": 0.9761, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.2112814465408805, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00019998088367764467, |
|
"loss": 0.9644, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.21226415094339623, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.00019997738297495703, |
|
"loss": 0.9669, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.21324685534591195, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.00019997358814172035, |
|
"loss": 0.9543, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.21422955974842767, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.00019996949918909897, |
|
"loss": 0.9814, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.21521226415094338, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.0001999651161291224, |
|
"loss": 0.9599, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.21619496855345913, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.00019996043897468552, |
|
"loss": 0.9461, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.21717767295597484, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 0.00019995546773954835, |
|
"loss": 0.9528, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.21816037735849056, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.00019995020243833615, |
|
"loss": 0.9865, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.2191430817610063, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00019994464308653926, |
|
"loss": 0.9853, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.22012578616352202, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00019993878970051316, |
|
"loss": 0.9863, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.22110849056603774, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.00019993264229747833, |
|
"loss": 0.9778, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.22209119496855345, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0001999262008955202, |
|
"loss": 0.9896, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.2230738993710692, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.00019991946551358925, |
|
"loss": 0.978, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.2240566037735849, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.00019991243617150078, |
|
"loss": 0.9714, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.22503930817610063, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.00019990511288993485, |
|
"loss": 0.9976, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.22602201257861634, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00019989749569043638, |
|
"loss": 0.9473, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.2270047169811321, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.00019988958459541501, |
|
"loss": 0.9745, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.2279874213836478, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00019988137962814482, |
|
"loss": 0.9726, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.22897012578616352, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00019987288081276468, |
|
"loss": 0.9601, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.22995283018867924, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0001998640881742778, |
|
"loss": 0.9626, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.23093553459119498, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00019985500173855196, |
|
"loss": 0.929, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.2319182389937107, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.00019984562153231908, |
|
"loss": 0.9573, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.2329009433962264, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00019983594758317551, |
|
"loss": 0.9549, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.23388364779874213, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00019982597991958172, |
|
"loss": 0.9626, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.23486635220125787, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0001998157185708623, |
|
"loss": 0.9584, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.2358490566037736, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00019980516356720576, |
|
"loss": 0.9428, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.2368317610062893, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.00019979431493966473, |
|
"loss": 0.9302, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.23781446540880502, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0001997831727201555, |
|
"loss": 0.9765, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.23879716981132076, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00019977173694145812, |
|
"loss": 0.9454, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.23977987421383648, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.00019976000763721635, |
|
"loss": 0.9604, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.2407625786163522, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.0001997479848419375, |
|
"loss": 0.9584, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.2417452830188679, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.00019973566859099226, |
|
"loss": 0.9473, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.24272798742138366, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00019972305892061466, |
|
"loss": 0.9486, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.24371069182389937, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.00019971015586790197, |
|
"loss": 0.9477, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.2446933962264151, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00019969695947081464, |
|
"loss": 0.9394, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.2456761006289308, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.00019968346976817608, |
|
"loss": 0.9466, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.24665880503144655, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.00019966968679967256, |
|
"loss": 0.9538, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.24764150943396226, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.0001996556106058532, |
|
"loss": 0.9581, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.24862421383647798, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.00019964124122812975, |
|
"loss": 0.9568, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.2496069182389937, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.0001996265787087765, |
|
"loss": 0.9465, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.2505896226415094, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00019961162309093018, |
|
"loss": 0.9562, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.25157232704402516, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00019959637441858977, |
|
"loss": 0.9467, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.2525550314465409, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.00019958083273661638, |
|
"loss": 0.9524, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.2535377358490566, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.00019956499809073322, |
|
"loss": 0.9494, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.25452044025157233, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00019954887052752536, |
|
"loss": 0.9512, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.2555031446540881, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0001995324500944396, |
|
"loss": 0.9496, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.25648584905660377, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0001995157368397844, |
|
"loss": 0.9479, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.2574685534591195, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00019949873081272966, |
|
"loss": 0.955, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.2584512578616352, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0001994814320633066, |
|
"loss": 0.9335, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.25943396226415094, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.00019946384064240767, |
|
"loss": 0.9458, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.2604166666666667, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00019944595660178628, |
|
"loss": 0.9421, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.2613993710691824, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.0001994277799940568, |
|
"loss": 0.9347, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.2623820754716981, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.00019940931087269423, |
|
"loss": 0.933, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.26336477987421386, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.00019939054929203422, |
|
"loss": 0.9491, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.26434748427672955, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00019937149530727282, |
|
"loss": 0.9458, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.2653301886792453, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00019935214897446622, |
|
"loss": 0.9469, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.266312893081761, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00019933251035053083, |
|
"loss": 0.9427, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.2672955974842767, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00019931257949324288, |
|
"loss": 0.9477, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.26827830188679247, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00019929235646123843, |
|
"loss": 0.9441, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.26926100628930816, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00019927184131401297, |
|
"loss": 0.9471, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.2702437106918239, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0001992510341119215, |
|
"loss": 0.9538, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.27122641509433965, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.0001992299349161782, |
|
"loss": 0.9398, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.27220911949685533, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.00019920854378885632, |
|
"loss": 0.9313, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.2731918238993711, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00019918686079288788, |
|
"loss": 0.9312, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.27417452830188677, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00019916488599206367, |
|
"loss": 0.9434, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.2751572327044025, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0001991426194510329, |
|
"loss": 0.9418, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.27613993710691825, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00019912006123530305, |
|
"loss": 0.9366, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.27712264150943394, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.00019909721141123975, |
|
"loss": 0.9275, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.2781053459119497, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00019907407004606656, |
|
"loss": 0.9284, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.27908805031446543, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.0001990506372078647, |
|
"loss": 0.9398, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.2800707547169811, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00019902691296557284, |
|
"loss": 0.9489, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.28105345911949686, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.00019900289738898703, |
|
"loss": 0.9422, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.2820361635220126, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.0001989785905487604, |
|
"loss": 0.9273, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.2830188679245283, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.000198953992516403, |
|
"loss": 0.9282, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.28400157232704404, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0001989291033642815, |
|
"loss": 0.9269, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.2849842767295597, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.00019890392316561904, |
|
"loss": 0.9296, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.28596698113207547, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00019887845199449504, |
|
"loss": 0.9268, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.2869496855345912, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.00019885268992584496, |
|
"loss": 0.9222, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.2879323899371069, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00019882663703546004, |
|
"loss": 0.9517, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.28891509433962265, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00019880029339998715, |
|
"loss": 0.9219, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.2898977987421384, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0001987736590969285, |
|
"loss": 0.898, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.2908805031446541, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0001987467342046414, |
|
"loss": 0.922, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.2918632075471698, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0001987195188023381, |
|
"loss": 0.9049, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.2928459119496855, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00019869201297008552, |
|
"loss": 0.9283, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.29382861635220126, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.00019866421678880507, |
|
"loss": 0.9222, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.294811320754717, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.00019863613034027224, |
|
"loss": 0.9279, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.2957940251572327, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.0001986077537071166, |
|
"loss": 0.9152, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.29677672955974843, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.00019857908697282133, |
|
"loss": 0.9122, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.2977594339622642, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.00019855013022172316, |
|
"loss": 0.9351, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.29874213836477986, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.000198520883539012, |
|
"loss": 0.9285, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.2997248427672956, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00019849134701073072, |
|
"loss": 0.9257, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.3007075471698113, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00019846152072377495, |
|
"loss": 0.9141, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.30169025157232704, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.00019843140476589276, |
|
"loss": 0.9108, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.3026729559748428, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00019840099922568437, |
|
"loss": 0.9039, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.30365566037735847, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.00019837030419260208, |
|
"loss": 0.9158, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.3046383647798742, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.0001983393197569497, |
|
"loss": 0.9373, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.30562106918238996, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.0001983080460098826, |
|
"loss": 0.9133, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.30660377358490565, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0001982764830434072, |
|
"loss": 0.9152, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.3075864779874214, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.00019824463095038082, |
|
"loss": 0.9218, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.3085691823899371, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.00019821248982451143, |
|
"loss": 0.9041, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.3095518867924528, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 0.00019818005976035723, |
|
"loss": 0.9134, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.31053459119496857, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00019814734085332657, |
|
"loss": 0.9043, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.31151729559748426, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00019811433319967753, |
|
"loss": 0.9292, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.3125, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00019808103689651762, |
|
"loss": 0.9138, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.31348270440251574, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00019804745204180364, |
|
"loss": 0.9029, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.31446540880503143, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00019801357873434121, |
|
"loss": 0.9003, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.3154481132075472, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00019797941707378462, |
|
"loss": 0.9269, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.3164308176100629, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00019794496716063652, |
|
"loss": 0.932, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.3174135220125786, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.00019791022909624751, |
|
"loss": 0.9096, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.31839622641509435, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00019787520298281602, |
|
"loss": 0.8985, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.31937893081761004, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0001978398889233878, |
|
"loss": 0.9244, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.3203616352201258, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0001978042870218558, |
|
"loss": 0.9118, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.32134433962264153, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00019776839738295978, |
|
"loss": 0.8763, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.3223270440251572, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.00019773222011228598, |
|
"loss": 0.887, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.32330974842767296, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00019769575531626695, |
|
"loss": 0.906, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.3242924528301887, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.000197659003102181, |
|
"loss": 0.9168, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.3252751572327044, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00019762196357815207, |
|
"loss": 0.8751, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.32625786163522014, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.0001975846368531494, |
|
"loss": 0.9358, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.3272405660377358, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00019754702303698712, |
|
"loss": 0.9009, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.32822327044025157, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.00019750912224032397, |
|
"loss": 0.9076, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.3292059748427673, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00019747093457466296, |
|
"loss": 0.9076, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.330188679245283, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00019743246015235116, |
|
"loss": 0.8957, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.33117138364779874, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00019739369908657915, |
|
"loss": 0.9259, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.3321540880503145, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00019735465149138084, |
|
"loss": 0.907, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.3331367924528302, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00019731531748163318, |
|
"loss": 0.8991, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.3341194968553459, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0001972756971730556, |
|
"loss": 0.9232, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.3351022012578616, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.00019723579068220998, |
|
"loss": 0.9083, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.33608490566037735, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0001971955981265, |
|
"loss": 0.9031, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.3370676100628931, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.000197155119624171, |
|
"loss": 0.8836, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.3380503144654088, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00019711435529430954, |
|
"loss": 0.8933, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.33903301886792453, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.0001970733052568431, |
|
"loss": 0.9048, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.3400157232704403, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00019703196963253972, |
|
"loss": 0.9183, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.34099842767295596, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00019699034854300763, |
|
"loss": 0.8875, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.3419811320754717, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00019694844211069477, |
|
"loss": 0.9054, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.3429638364779874, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0001969062504588887, |
|
"loss": 0.8919, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.34394654088050314, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00019686377371171604, |
|
"loss": 0.8919, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.3449292452830189, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0001968210119941421, |
|
"loss": 0.9133, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.34591194968553457, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00019677796543197067, |
|
"loss": 0.8901, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.3468946540880503, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.0001967346341518434, |
|
"loss": 0.9216, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.34787735849056606, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 0.00019669101828123975, |
|
"loss": 0.9063, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.34886006289308175, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00019664711794847625, |
|
"loss": 0.9262, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.3498427672955975, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00019660293328270647, |
|
"loss": 0.9045, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.35082547169811323, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00019655846441392035, |
|
"loss": 0.895, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.3518081761006289, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00019651371147294406, |
|
"loss": 0.8893, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.35279088050314467, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.00019646867459143942, |
|
"loss": 0.8772, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.35377358490566035, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.00019642335390190367, |
|
"loss": 0.9043, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.3547562893081761, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.0001963777495376689, |
|
"loss": 0.9123, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.35573899371069184, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.00019633186163290183, |
|
"loss": 0.9055, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.35672169811320753, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00019628569032260334, |
|
"loss": 0.9055, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.3577044025157233, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0001962392357426081, |
|
"loss": 0.8827, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.358687106918239, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00019619249802958413, |
|
"loss": 0.9038, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.3596698113207547, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00019614547732103242, |
|
"loss": 0.9204, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.36065251572327045, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0001960981737552865, |
|
"loss": 0.9204, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.36163522012578614, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00019605058747151208, |
|
"loss": 0.909, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.3626179245283019, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.0001960027186097067, |
|
"loss": 0.8926, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.3636006289308176, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00019595456731069904, |
|
"loss": 0.8953, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.3645833333333333, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00019590613371614892, |
|
"loss": 0.9001, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.36556603773584906, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.00019585741796854654, |
|
"loss": 0.9132, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.3665487421383648, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00019580842021121213, |
|
"loss": 0.8892, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.3675314465408805, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00019575914058829577, |
|
"loss": 0.9132, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.36851415094339623, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00019570957924477665, |
|
"loss": 0.8973, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.3694968553459119, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.00019565973632646277, |
|
"loss": 0.9052, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.37047955974842767, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00019560961197999052, |
|
"loss": 0.8967, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.3714622641509434, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.00019555920635282433, |
|
"loss": 0.8919, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.3724449685534591, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.0001955085195932561, |
|
"loss": 0.904, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.37342767295597484, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.00019545755185040474, |
|
"loss": 0.8911, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.3744103773584906, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.00019540630327421587, |
|
"loss": 0.8749, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.3753930817610063, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00019535477401546133, |
|
"loss": 0.9019, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.376375786163522, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00019530296422573873, |
|
"loss": 0.9033, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.37735849056603776, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00019525087405747088, |
|
"loss": 0.8746, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.37834119496855345, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.0001951985036639056, |
|
"loss": 0.908, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.3793238993710692, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0001951458531991151, |
|
"loss": 0.8843, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.3803066037735849, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.0001950929228179954, |
|
"loss": 0.907, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.3812893081761006, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00019503971267626621, |
|
"loss": 0.8921, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.38227201257861637, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00019498622293047025, |
|
"loss": 0.8808, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.38325471698113206, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.00019493245373797271, |
|
"loss": 0.9007, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.3842374213836478, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.00019487840525696105, |
|
"loss": 0.8806, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.38522012578616355, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0001948240776464443, |
|
"loss": 0.9046, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.38620283018867924, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00019476947106625273, |
|
"loss": 0.8808, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.387185534591195, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0001947145856770373, |
|
"loss": 0.8979, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.38816823899371067, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.0001946594216402692, |
|
"loss": 0.8732, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.3891509433962264, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.00019460397911823945, |
|
"loss": 0.8917, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.39013364779874216, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.00019454825827405834, |
|
"loss": 0.882, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.39111635220125784, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.00019449225927165492, |
|
"loss": 0.8909, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.3920990566037736, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.00019443598227577674, |
|
"loss": 0.8806, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.39308176100628933, |
|
"grad_norm": 0.439453125, |
|
"learning_rate": 0.00019437942745198893, |
|
"loss": 0.9044, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.394064465408805, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00019432259496667424, |
|
"loss": 0.9004, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.39504716981132076, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00019426548498703217, |
|
"loss": 0.9082, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.39602987421383645, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0001942080976810786, |
|
"loss": 0.8808, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.3970125786163522, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.00019415043321764527, |
|
"loss": 0.8773, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.39799528301886794, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00019409249176637945, |
|
"loss": 0.8632, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.39897798742138363, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.00019403427349774314, |
|
"loss": 0.9005, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.3999606918238994, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.0001939757785830128, |
|
"loss": 0.8913, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.4009433962264151, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00019391700719427872, |
|
"loss": 0.8829, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.4019261006289308, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00019385795950444473, |
|
"loss": 0.8666, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.40290880503144655, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00019379863568722732, |
|
"loss": 0.9056, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.40389150943396224, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.00019373903591715544, |
|
"loss": 0.887, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.404874213836478, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 0.0001936791603695699, |
|
"loss": 0.8828, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.4058569182389937, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00019361900922062282, |
|
"loss": 0.9003, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.4068396226415094, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00019355858264727714, |
|
"loss": 0.8674, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.40782232704402516, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00019349788082730603, |
|
"loss": 0.877, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.4088050314465409, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00019343690393929251, |
|
"loss": 0.8493, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.4097877358490566, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.00019337565216262878, |
|
"loss": 0.8754, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.41077044025157233, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00019331412567751585, |
|
"loss": 0.8928, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.4117531446540881, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.0001932523246649628, |
|
"loss": 0.8881, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.41273584905660377, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.0001931902493067864, |
|
"loss": 0.8883, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.4137185534591195, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00019312789978561057, |
|
"loss": 0.8558, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.4147012578616352, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.00019306527628486578, |
|
"loss": 0.8752, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.41568396226415094, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00019300237898878852, |
|
"loss": 0.9016, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00019293920808242083, |
|
"loss": 0.8701, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.4176493710691824, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.00019287576375160968, |
|
"loss": 0.8905, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.4186320754716981, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00019281204618300644, |
|
"loss": 0.8675, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.41961477987421386, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00019274805556406633, |
|
"loss": 0.8766, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.42059748427672955, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.00019268379208304789, |
|
"loss": 0.8839, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.4215801886792453, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0001926192559290124, |
|
"loss": 0.9031, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.422562893081761, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.00019255444729182337, |
|
"loss": 0.8841, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.4235455974842767, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00019248936636214592, |
|
"loss": 0.8584, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.42452830188679247, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00019242401333144623, |
|
"loss": 0.8829, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.42551100628930816, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00019235838839199102, |
|
"loss": 0.8676, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.4264937106918239, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00019229249173684693, |
|
"loss": 0.8983, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.42747641509433965, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00019222632355988007, |
|
"loss": 0.8851, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.42845911949685533, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00019215988405575524, |
|
"loss": 0.8752, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.4294418238993711, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.0001920931734199355, |
|
"loss": 0.87, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.43042452830188677, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.00019202619184868167, |
|
"loss": 0.8567, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.4314072327044025, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00019195893953905153, |
|
"loss": 0.8719, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.43238993710691825, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.00019189141668889942, |
|
"loss": 0.8854, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.43337264150943394, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00019182362349687559, |
|
"loss": 0.8816, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.4343553459119497, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.00019175556016242566, |
|
"loss": 0.8858, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.43533805031446543, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.00019168722688578998, |
|
"loss": 0.8927, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.4363207547169811, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.00019161862386800303, |
|
"loss": 0.8676, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.43730345911949686, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.00019154975131089293, |
|
"loss": 0.861, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.4382861635220126, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.0001914806094170807, |
|
"loss": 0.894, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.4392688679245283, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00019141119838997982, |
|
"loss": 0.8662, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.44025157232704404, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.00019134151843379544, |
|
"loss": 0.8717, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.4412342767295597, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00019127156975352406, |
|
"loss": 0.8654, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.44221698113207547, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00019120135255495257, |
|
"loss": 0.8714, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.4431996855345912, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00019113086704465796, |
|
"loss": 0.8621, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.4441823899371069, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00019106011343000655, |
|
"loss": 0.8671, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.44516509433962265, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00019098909191915344, |
|
"loss": 0.8493, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.4461477987421384, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00019091780272104182, |
|
"loss": 0.8721, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.4471305031446541, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.0001908462460454024, |
|
"loss": 0.9037, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.4481132075471698, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.0001907744221027529, |
|
"loss": 0.8682, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.4490959119496855, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00019070233110439721, |
|
"loss": 0.8732, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.45007861635220126, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.000190629973262425, |
|
"loss": 0.8801, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.451061320754717, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0001905573487897109, |
|
"loss": 0.8906, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.4520440251572327, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.000190484457899914, |
|
"loss": 0.8755, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.45302672955974843, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.00019041130080747718, |
|
"loss": 0.8798, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.4540094339622642, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00019033787772762645, |
|
"loss": 0.8659, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.45499213836477986, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0001902641888763704, |
|
"loss": 0.8803, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.4559748427672956, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00019019023447049951, |
|
"loss": 0.8882, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.4569575471698113, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0001901160147275854, |
|
"loss": 0.8623, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.45794025157232704, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.00019004152986598052, |
|
"loss": 0.8802, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.4589229559748428, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00018996678010481705, |
|
"loss": 0.8626, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.45990566037735847, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00018989176566400667, |
|
"loss": 0.8727, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.4608883647798742, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00018981648676423966, |
|
"loss": 0.8822, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.46187106918238996, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.00018974094362698437, |
|
"loss": 0.849, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.46285377358490565, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.0001896651364744865, |
|
"loss": 0.8585, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.4638364779874214, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00018958906552976842, |
|
"loss": 0.8691, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.4648191823899371, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00018951273101662874, |
|
"loss": 0.8903, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 0.4658018867924528, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.00018943613315964132, |
|
"loss": 0.8721, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.46678459119496857, |
|
"grad_norm": 0.484375, |
|
"learning_rate": 0.00018935927218415483, |
|
"loss": 0.8842, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.46776729559748426, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.00018928214831629204, |
|
"loss": 0.8656, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.46875, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.00018920476178294909, |
|
"loss": 0.8631, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.46973270440251574, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0001891271128117949, |
|
"loss": 0.8863, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.47071540880503143, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00018904920163127054, |
|
"loss": 0.863, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 0.4716981132075472, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.00018897102847058837, |
|
"loss": 0.8759, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.4726808176100629, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00018889259355973163, |
|
"loss": 0.8641, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 0.4736635220125786, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.00018881389712945349, |
|
"loss": 0.8534, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.47464622641509435, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00018873493941127652, |
|
"loss": 0.864, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.47562893081761004, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 0.0001886557206374921, |
|
"loss": 0.8702, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.4766116352201258, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0001885762410411595, |
|
"loss": 0.8608, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.47759433962264153, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0001884965008561054, |
|
"loss": 0.8864, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.4785770440251572, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00018841650031692312, |
|
"loss": 0.875, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 0.47955974842767296, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0001883362396589719, |
|
"loss": 0.8455, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.4805424528301887, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00018825571911837625, |
|
"loss": 0.8611, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 0.4815251572327044, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.00018817493893202527, |
|
"loss": 0.8669, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.48250786163522014, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.0001880938993375719, |
|
"loss": 0.8904, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 0.4834905660377358, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0001880126005734323, |
|
"loss": 0.8513, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.48447327044025157, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00018793104287878504, |
|
"loss": 0.8556, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 0.4854559748427673, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00018784922649357045, |
|
"loss": 0.8426, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.486438679245283, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.00018776715165849003, |
|
"loss": 0.8405, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.48742138364779874, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.00018768481861500548, |
|
"loss": 0.8591, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.4884040880503145, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00018760222760533826, |
|
"loss": 0.8661, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 0.4893867924528302, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0001875193788724687, |
|
"loss": 0.8563, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.4903694968553459, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00018743627266013535, |
|
"loss": 0.8365, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 0.4913522012578616, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.0001873529092128343, |
|
"loss": 0.8474, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.49233490566037735, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.0001872692887758184, |
|
"loss": 0.8671, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 0.4933176100628931, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00018718541159509644, |
|
"loss": 0.8693, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.4943003144654088, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.0001871012779174327, |
|
"loss": 0.8819, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 0.49528301886792453, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.00018701688799034605, |
|
"loss": 0.8527, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.4962657232704403, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.00018693224206210919, |
|
"loss": 0.8662, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.49724842767295596, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0001868473403817479, |
|
"loss": 0.8502, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.4982311320754717, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00018676218319904048, |
|
"loss": 0.8828, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 0.4992138364779874, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00018667677076451695, |
|
"loss": 0.8674, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.5001965408805031, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.00018659110332945814, |
|
"loss": 0.8574, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 0.5011792452830188, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.00018650518114589516, |
|
"loss": 0.8442, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.5021619496855346, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.0001864190044666086, |
|
"loss": 0.8537, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 0.5031446540880503, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0001863325735451277, |
|
"loss": 0.8644, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.504127358490566, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00018624588863572973, |
|
"loss": 0.8486, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 0.5051100628930818, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00018615894999343918, |
|
"loss": 0.8799, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.5060927672955975, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.00018607175787402696, |
|
"loss": 0.8628, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.5070754716981132, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.00018598431253400986, |
|
"loss": 0.8723, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.508058176100629, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00018589661423064937, |
|
"loss": 0.8562, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 0.5090408805031447, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00018580866322195143, |
|
"loss": 0.8732, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.5100235849056604, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00018572045976666534, |
|
"loss": 0.8642, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 0.5110062893081762, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0001856320041242831, |
|
"loss": 0.859, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.5119889937106918, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00018554329655503865, |
|
"loss": 0.8681, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 0.5129716981132075, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.000185454337319907, |
|
"loss": 0.8687, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.5139544025157232, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.0001853651266806037, |
|
"loss": 0.8473, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 0.514937106918239, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 0.00018527566489958384, |
|
"loss": 0.8722, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.5159198113207547, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00018518595224004136, |
|
"loss": 0.8521, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.5169025157232704, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0001850959889659083, |
|
"loss": 0.8802, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.5178852201257862, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00018500577534185397, |
|
"loss": 0.8393, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 0.5188679245283019, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0001849153116332843, |
|
"loss": 0.8637, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.5198506289308176, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00018482459810634076, |
|
"loss": 0.8627, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 0.5208333333333334, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0001847336350279, |
|
"loss": 0.8557, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.5218160377358491, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00018464242266557273, |
|
"loss": 0.863, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 0.5227987421383647, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00018455096128770307, |
|
"loss": 0.8662, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.5237814465408805, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00018445925116336768, |
|
"loss": 0.8416, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 0.5247641509433962, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00018436729256237516, |
|
"loss": 0.8424, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.5257468553459119, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00018427508575526494, |
|
"loss": 0.862, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.5267295597484277, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00018418263101330684, |
|
"loss": 0.8768, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.5277122641509434, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.00018408992860849996, |
|
"loss": 0.8545, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 0.5286949685534591, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00018399697881357212, |
|
"loss": 0.8423, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.5296776729559748, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.0001839037819019789, |
|
"loss": 0.8504, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 0.5306603773584906, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.00018381033814790287, |
|
"loss": 0.896, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.5316430817610063, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.00018371664782625287, |
|
"loss": 0.8659, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 0.532625786163522, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.00018362271121266307, |
|
"loss": 0.8628, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.5336084905660378, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00018352852858349227, |
|
"loss": 0.8289, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 0.5345911949685535, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.000183434100215823, |
|
"loss": 0.8347, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.5355738993710691, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00018333942638746082, |
|
"loss": 0.8593, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.5365566037735849, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0001832445073769333, |
|
"loss": 0.8467, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.5375393081761006, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00018314934346348947, |
|
"loss": 0.8591, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 0.5385220125786163, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.00018305393492709874, |
|
"loss": 0.8566, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.5395047169811321, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 0.0001829582820484503, |
|
"loss": 0.8534, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 0.5404874213836478, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00018286238510895208, |
|
"loss": 0.8593, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.5414701257861635, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00018276624439073012, |
|
"loss": 0.8619, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 0.5424528301886793, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.0001826698601766276, |
|
"loss": 0.8458, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.543435534591195, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00018257323275020407, |
|
"loss": 0.8443, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 0.5444182389937107, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0001824763623957346, |
|
"loss": 0.8522, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.5454009433962265, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00018237924939820896, |
|
"loss": 0.8463, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.5463836477987422, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.00018228189404333075, |
|
"loss": 0.8557, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.5473663522012578, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.0001821842966175166, |
|
"loss": 0.8483, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 0.5483490566037735, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00018208645740789528, |
|
"loss": 0.8223, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.5493317610062893, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00018198837670230694, |
|
"loss": 0.8504, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 0.550314465408805, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0001818900547893021, |
|
"loss": 0.8426, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.5512971698113207, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00018179149195814097, |
|
"loss": 0.8764, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 0.5522798742138365, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0001816926884987926, |
|
"loss": 0.8288, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.5532625786163522, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00018159364470193381, |
|
"loss": 0.841, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 0.5542452830188679, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.0001814943608589486, |
|
"loss": 0.8647, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.5552279874213837, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00018139483726192714, |
|
"loss": 0.8722, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.5562106918238994, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00018129507420366493, |
|
"loss": 0.8462, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.5571933962264151, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.00018119507197766202, |
|
"loss": 0.8596, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 0.5581761006289309, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00018109483087812205, |
|
"loss": 0.8241, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.5591588050314465, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.00018099435119995136, |
|
"loss": 0.8511, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 0.5601415094339622, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0001808936332387583, |
|
"loss": 0.8544, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.561124213836478, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00018079267729085213, |
|
"loss": 0.8468, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 0.5621069182389937, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.00018069148365324237, |
|
"loss": 0.8427, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.5630896226415094, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.0001805900526236377, |
|
"loss": 0.8559, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 0.5640723270440252, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.00018048838450044526, |
|
"loss": 0.88, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.5650550314465409, |
|
"grad_norm": 0.453125, |
|
"learning_rate": 0.0001803864795827697, |
|
"loss": 0.8573, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.5660377358490566, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.00018028433817041236, |
|
"loss": 0.8477, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.5670204402515723, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.0001801819605638703, |
|
"loss": 0.8425, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 0.5680031446540881, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00018007934706433542, |
|
"loss": 0.8495, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.5689858490566038, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00017997649797369365, |
|
"loss": 0.8543, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 0.5699685534591195, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00017987341359452404, |
|
"loss": 0.8473, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.5709512578616353, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0001797700942300978, |
|
"loss": 0.8441, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 0.5719339622641509, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00017966654018437757, |
|
"loss": 0.8342, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.5729166666666666, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.00017956275176201624, |
|
"loss": 0.8525, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 0.5738993710691824, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00017945872926835636, |
|
"loss": 0.8348, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.5748820754716981, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0001793544730094291, |
|
"loss": 0.8544, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.5758647798742138, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.00017924998329195332, |
|
"loss": 0.8167, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.5768474842767296, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00017914526042333475, |
|
"loss": 0.8233, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 0.5778301886792453, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.00017904030471166496, |
|
"loss": 0.8263, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.578812893081761, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.00017893511646572066, |
|
"loss": 0.8455, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 0.5797955974842768, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.00017882969599496254, |
|
"loss": 0.8454, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.5807783018867925, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00017872404360953466, |
|
"loss": 0.8488, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 0.5817610062893082, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00017861815962026315, |
|
"loss": 0.8565, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.5827437106918238, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00017851204433865566, |
|
"loss": 0.8341, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 0.5837264150943396, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00017840569807690032, |
|
"loss": 0.8458, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.5847091194968553, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 0.00017829912114786462, |
|
"loss": 0.8504, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.585691823899371, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00017819231386509486, |
|
"loss": 0.854, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.5866745283018868, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.00017808527654281496, |
|
"loss": 0.8593, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 0.5876572327044025, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00017797800949592558, |
|
"loss": 0.8598, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.5886399371069182, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00017787051304000322, |
|
"loss": 0.8317, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 0.589622641509434, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00017776278749129937, |
|
"loss": 0.8591, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.5906053459119497, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.00017765483316673945, |
|
"loss": 0.8543, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 0.5915880503144654, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0001775466503839219, |
|
"loss": 0.8477, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.5925707547169812, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.00017743823946111736, |
|
"loss": 0.8337, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 0.5935534591194969, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00017732960071726762, |
|
"loss": 0.8434, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.5945361635220126, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00017722073447198466, |
|
"loss": 0.8241, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 0.5955188679245284, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.00017711164104554982, |
|
"loss": 0.8291, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.596501572327044, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00017700232075891278, |
|
"loss": 0.831, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 0.5974842767295597, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00017689277393369063, |
|
"loss": 0.8397, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.5984669811320755, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00017678300089216692, |
|
"loss": 0.8274, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 0.5994496855345912, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00017667300195729082, |
|
"loss": 0.8553, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.6004323899371069, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00017656277745267592, |
|
"loss": 0.8322, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 0.6014150943396226, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.00017645232770259952, |
|
"loss": 0.8481, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.6023977987421384, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00017634165303200157, |
|
"loss": 0.8373, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 0.6033805031446541, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00017623075376648376, |
|
"loss": 0.8587, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.6043632075471698, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00017611963023230845, |
|
"loss": 0.8483, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.6053459119496856, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.0001760082827563979, |
|
"loss": 0.8226, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.6063286163522013, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00017589671166633303, |
|
"loss": 0.8524, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 0.6073113207547169, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00017578491729035287, |
|
"loss": 0.8654, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.6082940251572327, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00017567289995735314, |
|
"loss": 0.8372, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 0.6092767295597484, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00017556065999688557, |
|
"loss": 0.8416, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.6102594339622641, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 0.0001754481977391569, |
|
"loss": 0.8206, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 0.6112421383647799, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00017533551351502782, |
|
"loss": 0.8267, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.6122248427672956, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00017522260765601196, |
|
"loss": 0.8503, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 0.6132075471698113, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00017510948049427513, |
|
"loss": 0.8227, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.6141902515723271, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00017499613236263413, |
|
"loss": 0.8517, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 0.6151729559748428, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00017488256359455586, |
|
"loss": 0.8442, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.6161556603773585, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00017476877452415638, |
|
"loss": 0.8443, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 0.6171383647798742, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00017465476548619974, |
|
"loss": 0.8332, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.61812106918239, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0001745405368160972, |
|
"loss": 0.8067, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 0.6191037735849056, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0001744260888499063, |
|
"loss": 0.8161, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.6200864779874213, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0001743114219243295, |
|
"loss": 0.8256, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 0.6210691823899371, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0001741965363767136, |
|
"loss": 0.8252, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.6220518867924528, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.00017408143254504856, |
|
"loss": 0.8465, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 0.6230345911949685, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.00017396611076796645, |
|
"loss": 0.8581, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.6240172955974843, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00017385057138474063, |
|
"loss": 0.8625, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 0.625, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.0001737348147352846, |
|
"loss": 0.8294, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.6259827044025157, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.000173618841160151, |
|
"loss": 0.851, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 0.6269654088050315, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00017350265100053074, |
|
"loss": 0.8578, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.6279481132075472, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00017338624459825187, |
|
"loss": 0.8397, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 0.6289308176100629, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.00017326962229577867, |
|
"loss": 0.8468, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.6299135220125787, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00017315278443621055, |
|
"loss": 0.8292, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 0.6308962264150944, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0001730357313632811, |
|
"loss": 0.8393, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.63187893081761, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00017291846342135697, |
|
"loss": 0.8371, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 0.6328616352201258, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.00017280098095543716, |
|
"loss": 0.8396, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.6338443396226415, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.00017268328431115155, |
|
"loss": 0.8234, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 0.6348270440251572, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0001725653738347603, |
|
"loss": 0.8238, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 0.6358097484276729, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00017244724987315255, |
|
"loss": 0.8451, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 0.6367924528301887, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00017232891277384562, |
|
"loss": 0.833, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.6377751572327044, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.0001722103628849838, |
|
"loss": 0.8267, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 0.6387578616352201, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.00017209160055533734, |
|
"loss": 0.8342, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.6397405660377359, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00017197262613430158, |
|
"loss": 0.8224, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 0.6407232704402516, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.00017185343997189588, |
|
"loss": 0.831, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.6417059748427673, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.00017173404241876237, |
|
"loss": 0.8337, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 0.6426886792452831, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.0001716144338261652, |
|
"loss": 0.8413, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.6436713836477987, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.0001714946145459894, |
|
"loss": 0.8254, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 0.6446540880503144, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00017137458493073977, |
|
"loss": 0.8219, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.6456367924528302, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.00017125434533353992, |
|
"loss": 0.8461, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 0.6466194968553459, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.00017113389610813132, |
|
"loss": 0.8407, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 0.6476022012578616, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.000171013237608872, |
|
"loss": 0.8413, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 0.6485849056603774, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00017089237019073578, |
|
"loss": 0.8479, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.6495676100628931, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0001707712942093111, |
|
"loss": 0.8321, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 0.6505503144654088, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00017065001002079995, |
|
"loss": 0.8345, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 0.6515330188679245, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0001705285179820169, |
|
"loss": 0.8397, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 0.6525157232704403, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00017040681845038798, |
|
"loss": 0.8148, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.653498427672956, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00017028491178394965, |
|
"loss": 0.8239, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 0.6544811320754716, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0001701627983413478, |
|
"loss": 0.8126, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.6554638364779874, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0001700404784818366, |
|
"loss": 0.8538, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 0.6564465408805031, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00016991795256527756, |
|
"loss": 0.865, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.6574292452830188, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.00016979522095213832, |
|
"loss": 0.8402, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 0.6584119496855346, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.00016967228400349167, |
|
"loss": 0.8473, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.6593946540880503, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.0001695491420810146, |
|
"loss": 0.8289, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 0.660377358490566, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00016942579554698708, |
|
"loss": 0.8312, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.6613600628930818, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.00016930224476429092, |
|
"loss": 0.8244, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 0.6623427672955975, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.00016917849009640904, |
|
"loss": 0.8364, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 0.6633254716981132, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00016905453190742397, |
|
"loss": 0.8244, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 0.664308176100629, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00016893037056201713, |
|
"loss": 0.8595, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.6652908805031447, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00016880600642546763, |
|
"loss": 0.8357, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 0.6662735849056604, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.00016868143986365107, |
|
"loss": 0.8388, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.6672562893081762, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00016855667124303865, |
|
"loss": 0.8117, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 0.6682389937106918, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.00016843170093069605, |
|
"loss": 0.8343, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.6692216981132075, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00016830652929428224, |
|
"loss": 0.848, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 0.6702044025157232, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00016818115670204863, |
|
"loss": 0.8264, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 0.671187106918239, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.00016805558352283768, |
|
"loss": 0.8364, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 0.6721698113207547, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.00016792981012608198, |
|
"loss": 0.8046, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.6731525157232704, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.00016780383688180323, |
|
"loss": 0.8455, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 0.6741352201257862, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00016767766416061108, |
|
"loss": 0.8307, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 0.6751179245283019, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00016755129233370197, |
|
"loss": 0.8364, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 0.6761006289308176, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.00016742472177285812, |
|
"loss": 0.834, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.6770833333333334, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0001672979528504465, |
|
"loss": 0.8306, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 0.6780660377358491, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.00016717098593941752, |
|
"loss": 0.837, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.6790487421383647, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.00016704382141330415, |
|
"loss": 0.8207, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 0.6800314465408805, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00016691645964622074, |
|
"loss": 0.8276, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.6810141509433962, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.00016678890101286186, |
|
"loss": 0.8383, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 0.6819968553459119, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00016666114588850133, |
|
"loss": 0.8393, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 0.6829795597484277, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00016653319464899103, |
|
"loss": 0.824, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 0.6839622641509434, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.0001664050476707597, |
|
"loss": 0.8223, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.6849449685534591, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00016627670533081213, |
|
"loss": 0.8293, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 0.6859276729559748, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00016614816800672764, |
|
"loss": 0.8357, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 0.6869103773584906, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00016601943607665932, |
|
"loss": 0.8422, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 0.6878930817610063, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0001658905099193328, |
|
"loss": 0.8289, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.688875786163522, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.00016576138991404506, |
|
"loss": 0.8014, |
|
"step": 3505 |
|
}, |
|
{ |
|
"epoch": 0.6898584905660378, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.00016563207644066337, |
|
"loss": 0.8128, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.6908411949685535, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.00016550256987962425, |
|
"loss": 0.8319, |
|
"step": 3515 |
|
}, |
|
{ |
|
"epoch": 0.6918238993710691, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00016537287061193218, |
|
"loss": 0.8284, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.6928066037735849, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00016524297901915867, |
|
"loss": 0.8536, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 0.6937893081761006, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.00016511289548344098, |
|
"loss": 0.8315, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 0.6947720125786163, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0001649826203874811, |
|
"loss": 0.8381, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 0.6957547169811321, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00016485215411454453, |
|
"loss": 0.8288, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.6967374213836478, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00016472149704845927, |
|
"loss": 0.8327, |
|
"step": 3545 |
|
}, |
|
{ |
|
"epoch": 0.6977201257861635, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00016459064957361465, |
|
"loss": 0.8182, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.6987028301886793, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00016445961207496004, |
|
"loss": 0.8309, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 0.699685534591195, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00016432838493800401, |
|
"loss": 0.8236, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.7006682389937107, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00016419696854881298, |
|
"loss": 0.824, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 0.7016509433962265, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.00016406536329401008, |
|
"loss": 0.8053, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.7026336477987422, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.00016393356956077417, |
|
"loss": 0.8234, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 0.7036163522012578, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.00016380158773683862, |
|
"loss": 0.8286, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.7045990566037735, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00016366941821049005, |
|
"loss": 0.8309, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 0.7055817610062893, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00016353706137056735, |
|
"loss": 0.8514, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 0.706564465408805, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.00016340451760646054, |
|
"loss": 0.8247, |
|
"step": 3595 |
|
}, |
|
{ |
|
"epoch": 0.7075471698113207, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.00016327178730810948, |
|
"loss": 0.8462, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.7085298742138365, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.00016313887086600286, |
|
"loss": 0.8386, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 0.7095125786163522, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00016300576867117698, |
|
"loss": 0.8303, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 0.7104952830188679, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0001628724811152146, |
|
"loss": 0.8145, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 0.7114779874213837, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00016273900859024382, |
|
"loss": 0.8209, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.7124606918238994, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00016260535148893702, |
|
"loss": 0.8262, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 0.7134433962264151, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00016247151020450933, |
|
"loss": 0.8207, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.7144261006289309, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00016233748513071804, |
|
"loss": 0.8523, |
|
"step": 3635 |
|
}, |
|
{ |
|
"epoch": 0.7154088050314465, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.000162203276661861, |
|
"loss": 0.8252, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.7163915094339622, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.0001620688851927756, |
|
"loss": 0.8101, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 0.717374213836478, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.00016193431111883756, |
|
"loss": 0.8187, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.7183569182389937, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00016179955483596, |
|
"loss": 0.8209, |
|
"step": 3655 |
|
}, |
|
{ |
|
"epoch": 0.7193396226415094, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00016166461674059192, |
|
"loss": 0.8181, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.7203223270440252, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00016152949722971727, |
|
"loss": 0.824, |
|
"step": 3665 |
|
}, |
|
{ |
|
"epoch": 0.7213050314465409, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.00016139419670085372, |
|
"loss": 0.8294, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 0.7222877358490566, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.00016125871555205148, |
|
"loss": 0.8181, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 0.7232704402515723, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.00016112305418189218, |
|
"loss": 0.8378, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.7242531446540881, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00016098721298948756, |
|
"loss": 0.8201, |
|
"step": 3685 |
|
}, |
|
{ |
|
"epoch": 0.7252358490566038, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00016085119237447848, |
|
"loss": 0.8445, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.7262185534591195, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.00016071499273703364, |
|
"loss": 0.8048, |
|
"step": 3695 |
|
}, |
|
{ |
|
"epoch": 0.7272012578616353, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0001605786144778484, |
|
"loss": 0.7989, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.7281839622641509, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.00016044205799814362, |
|
"loss": 0.8117, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 0.7291666666666666, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00016030532369966448, |
|
"loss": 0.8252, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 0.7301493710691824, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.00016016841198467937, |
|
"loss": 0.8218, |
|
"step": 3715 |
|
}, |
|
{ |
|
"epoch": 0.7311320754716981, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.00016003132325597842, |
|
"loss": 0.8254, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.7321147798742138, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.00015989405791687285, |
|
"loss": 0.827, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 0.7330974842767296, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00015975661637119317, |
|
"loss": 0.8323, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 0.7340801886792453, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.00015961899902328845, |
|
"loss": 0.8223, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 0.735062893081761, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0001594812062780249, |
|
"loss": 0.8, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.7360455974842768, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0001593432385407848, |
|
"loss": 0.8112, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 0.7370283018867925, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00015920509621746517, |
|
"loss": 0.8146, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.7380110062893082, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.00015906677971447674, |
|
"loss": 0.8404, |
|
"step": 3755 |
|
}, |
|
{ |
|
"epoch": 0.7389937106918238, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00015892828943874263, |
|
"loss": 0.8136, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.7399764150943396, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00015878962579769716, |
|
"loss": 0.8105, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 0.7409591194968553, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0001586507891992848, |
|
"loss": 0.8034, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 0.741941823899371, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.00015851178005195867, |
|
"loss": 0.8042, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 0.7429245283018868, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.0001583725987646797, |
|
"loss": 0.83, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.7439072327044025, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00015823324574691517, |
|
"loss": 0.8129, |
|
"step": 3785 |
|
}, |
|
{ |
|
"epoch": 0.7448899371069182, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00015809372140863763, |
|
"loss": 0.7918, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 0.745872641509434, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.00015795402616032358, |
|
"loss": 0.8354, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 0.7468553459119497, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0001578141604129524, |
|
"loss": 0.8187, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.7478380503144654, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00015767412457800504, |
|
"loss": 0.8346, |
|
"step": 3805 |
|
}, |
|
{ |
|
"epoch": 0.7488207547169812, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00015753391906746282, |
|
"loss": 0.8073, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.7498034591194969, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.0001573935442938063, |
|
"loss": 0.8113, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 0.7507861635220126, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00015725300067001395, |
|
"loss": 0.8181, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.7517688679245284, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00015711228860956102, |
|
"loss": 0.8136, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 0.752751572327044, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00015697140852641834, |
|
"loss": 0.8214, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 0.7537342767295597, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.000156830360835051, |
|
"loss": 0.8166, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 0.7547169811320755, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.00015668914595041712, |
|
"loss": 0.8477, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.7556996855345912, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.00015654776428796686, |
|
"loss": 0.8105, |
|
"step": 3845 |
|
}, |
|
{ |
|
"epoch": 0.7566823899371069, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00015640621626364094, |
|
"loss": 0.8393, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.7576650943396226, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.00015626450229386948, |
|
"loss": 0.8025, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 0.7586477987421384, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00015612262279557094, |
|
"loss": 0.833, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.7596305031446541, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.0001559805781861506, |
|
"loss": 0.8165, |
|
"step": 3865 |
|
}, |
|
{ |
|
"epoch": 0.7606132075471698, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.0001558383688834996, |
|
"loss": 0.8155, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.7615959119496856, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0001556959953059935, |
|
"loss": 0.8105, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 0.7625786163522013, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.00015555345787249128, |
|
"loss": 0.8203, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.7635613207547169, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.00015541075700233395, |
|
"loss": 0.8226, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 0.7645440251572327, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.0001552678931153432, |
|
"loss": 0.8373, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 0.7655267295597484, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.0001551248666318206, |
|
"loss": 0.8278, |
|
"step": 3895 |
|
}, |
|
{ |
|
"epoch": 0.7665094339622641, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0001549816779725457, |
|
"loss": 0.8169, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.7674921383647799, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.0001548383275587755, |
|
"loss": 0.8137, |
|
"step": 3905 |
|
}, |
|
{ |
|
"epoch": 0.7684748427672956, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.00015469481581224272, |
|
"loss": 0.8154, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 0.7694575471698113, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.0001545511431551547, |
|
"loss": 0.8084, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 0.7704402515723271, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.0001544073100101922, |
|
"loss": 0.8502, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.7714229559748428, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 0.00015426331680050824, |
|
"loss": 0.8345, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 0.7724056603773585, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.00015411916394972655, |
|
"loss": 0.7933, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.7733883647798742, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.00015397485188194064, |
|
"loss": 0.8072, |
|
"step": 3935 |
|
}, |
|
{ |
|
"epoch": 0.77437106918239, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00015383038102171248, |
|
"loss": 0.7954, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.7753537735849056, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00015368575179407104, |
|
"loss": 0.8003, |
|
"step": 3945 |
|
}, |
|
{ |
|
"epoch": 0.7763364779874213, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00015354096462451134, |
|
"loss": 0.8056, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.7773191823899371, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00015339601993899304, |
|
"loss": 0.8217, |
|
"step": 3955 |
|
}, |
|
{ |
|
"epoch": 0.7783018867924528, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00015325091816393912, |
|
"loss": 0.8441, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.7792845911949685, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00015310565972623483, |
|
"loss": 0.8003, |
|
"step": 3965 |
|
}, |
|
{ |
|
"epoch": 0.7802672955974843, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00015296024505322625, |
|
"loss": 0.8123, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 0.78125, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.00015281467457271909, |
|
"loss": 0.8415, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 0.7822327044025157, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0001526689487129775, |
|
"loss": 0.813, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.7832154088050315, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00015252306790272267, |
|
"loss": 0.8125, |
|
"step": 3985 |
|
}, |
|
{ |
|
"epoch": 0.7841981132075472, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.00015237703257113173, |
|
"loss": 0.8013, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.7851808176100629, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.0001522308431478364, |
|
"loss": 0.812, |
|
"step": 3995 |
|
}, |
|
{ |
|
"epoch": 0.7861635220125787, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.00015208450006292163, |
|
"loss": 0.8184, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.7871462264150944, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00015193800374692457, |
|
"loss": 0.8145, |
|
"step": 4005 |
|
}, |
|
{ |
|
"epoch": 0.78812893081761, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.0001517913546308331, |
|
"loss": 0.8209, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 0.7891116352201258, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00015164455314608467, |
|
"loss": 0.8417, |
|
"step": 4015 |
|
}, |
|
{ |
|
"epoch": 0.7900943396226415, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.0001514975997245649, |
|
"loss": 0.8193, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.7910770440251572, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00015135049479860657, |
|
"loss": 0.8271, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 0.7920597484276729, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00015120323880098803, |
|
"loss": 0.8001, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 0.7930424528301887, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00015105583216493216, |
|
"loss": 0.793, |
|
"step": 4035 |
|
}, |
|
{ |
|
"epoch": 0.7940251572327044, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00015090827532410499, |
|
"loss": 0.7943, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.7950078616352201, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00015076056871261444, |
|
"loss": 0.8088, |
|
"step": 4045 |
|
}, |
|
{ |
|
"epoch": 0.7959905660377359, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.00015061271276500904, |
|
"loss": 0.8059, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.7969732704402516, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00015046470791627668, |
|
"loss": 0.8052, |
|
"step": 4055 |
|
}, |
|
{ |
|
"epoch": 0.7979559748427673, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00015031655460184337, |
|
"loss": 0.7997, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.7989386792452831, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.00015016825325757182, |
|
"loss": 0.8233, |
|
"step": 4065 |
|
}, |
|
{ |
|
"epoch": 0.7999213836477987, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.00015001980431976022, |
|
"loss": 0.8133, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 0.8009040880503144, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0001498712082251411, |
|
"loss": 0.8027, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 0.8018867924528302, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.00014972246541087978, |
|
"loss": 0.8174, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.8028694968553459, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00014957357631457333, |
|
"loss": 0.8162, |
|
"step": 4085 |
|
}, |
|
{ |
|
"epoch": 0.8038522012578616, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00014942454137424914, |
|
"loss": 0.8031, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 0.8048349056603774, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.00014927536102836357, |
|
"loss": 0.8032, |
|
"step": 4095 |
|
}, |
|
{ |
|
"epoch": 0.8058176100628931, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.00014912603571580097, |
|
"loss": 0.826, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.8068003144654088, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.00014897656587587198, |
|
"loss": 0.791, |
|
"step": 4105 |
|
}, |
|
{ |
|
"epoch": 0.8077830188679245, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00014882695194831256, |
|
"loss": 0.8137, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.8087657232704403, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.00014867719437328252, |
|
"loss": 0.8015, |
|
"step": 4115 |
|
}, |
|
{ |
|
"epoch": 0.809748427672956, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.00014852729359136432, |
|
"loss": 0.8189, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.8107311320754716, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0001483772500435616, |
|
"loss": 0.8096, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 0.8117138364779874, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0001482270641712982, |
|
"loss": 0.8014, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 0.8126965408805031, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00014807673641641653, |
|
"loss": 0.8062, |
|
"step": 4135 |
|
}, |
|
{ |
|
"epoch": 0.8136792452830188, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0001479262672211765, |
|
"loss": 0.8215, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.8146619496855346, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.00014777565702825407, |
|
"loss": 0.8257, |
|
"step": 4145 |
|
}, |
|
{ |
|
"epoch": 0.8156446540880503, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.00014762490628074005, |
|
"loss": 0.815, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.816627358490566, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.00014747401542213875, |
|
"loss": 0.8213, |
|
"step": 4155 |
|
}, |
|
{ |
|
"epoch": 0.8176100628930818, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0001473229848963667, |
|
"loss": 0.8136, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.8185927672955975, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00014717181514775128, |
|
"loss": 0.796, |
|
"step": 4165 |
|
}, |
|
{ |
|
"epoch": 0.8195754716981132, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00014702050662102948, |
|
"loss": 0.8168, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.820558176100629, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00014686905976134663, |
|
"loss": 0.8266, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 0.8215408805031447, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00014671747501425497, |
|
"loss": 0.7863, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.8225235849056604, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00014656575282571234, |
|
"loss": 0.8031, |
|
"step": 4185 |
|
}, |
|
{ |
|
"epoch": 0.8235062893081762, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00014641389364208107, |
|
"loss": 0.7824, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 0.8244889937106918, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00014626189791012647, |
|
"loss": 0.8233, |
|
"step": 4195 |
|
}, |
|
{ |
|
"epoch": 0.8254716981132075, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0001461097660770155, |
|
"loss": 0.8177, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.8264544025157232, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.00014595749859031557, |
|
"loss": 0.8038, |
|
"step": 4205 |
|
}, |
|
{ |
|
"epoch": 0.827437106918239, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.00014580509589799329, |
|
"loss": 0.8032, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 0.8284198113207547, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00014565255844841286, |
|
"loss": 0.8246, |
|
"step": 4215 |
|
}, |
|
{ |
|
"epoch": 0.8294025157232704, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.000145499886690335, |
|
"loss": 0.7918, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.8303852201257862, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.00014534708107291565, |
|
"loss": 0.8305, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 0.8313679245283019, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00014519414204570446, |
|
"loss": 0.8086, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.8323506289308176, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.00014504107005864353, |
|
"loss": 0.7889, |
|
"step": 4235 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00014488786556206626, |
|
"loss": 0.8268, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.8343160377358491, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.0001447345290066958, |
|
"loss": 0.8001, |
|
"step": 4245 |
|
}, |
|
{ |
|
"epoch": 0.8352987421383647, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00014458106084364383, |
|
"loss": 0.8205, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.8362814465408805, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.0001444274615244092, |
|
"loss": 0.8076, |
|
"step": 4255 |
|
}, |
|
{ |
|
"epoch": 0.8372641509433962, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00014427373150087663, |
|
"loss": 0.8025, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.8382468553459119, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00014411987122531542, |
|
"loss": 0.7983, |
|
"step": 4265 |
|
}, |
|
{ |
|
"epoch": 0.8392295597484277, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 0.000143965881150378, |
|
"loss": 0.8006, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 0.8402122641509434, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.00014381176172909862, |
|
"loss": 0.8044, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 0.8411949685534591, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.00014365751341489222, |
|
"loss": 0.8143, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.8421776729559748, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.0001435031366615528, |
|
"loss": 0.7867, |
|
"step": 4285 |
|
}, |
|
{ |
|
"epoch": 0.8431603773584906, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00014334863192325226, |
|
"loss": 0.8115, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.8441430817610063, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00014319399965453911, |
|
"loss": 0.8234, |
|
"step": 4295 |
|
}, |
|
{ |
|
"epoch": 0.845125786163522, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.00014303924031033692, |
|
"loss": 0.8317, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.8461084905660378, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.00014288435434594315, |
|
"loss": 0.8321, |
|
"step": 4305 |
|
}, |
|
{ |
|
"epoch": 0.8470911949685535, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00014272934221702788, |
|
"loss": 0.8127, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 0.8480738993710691, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00014257420437963222, |
|
"loss": 0.801, |
|
"step": 4315 |
|
}, |
|
{ |
|
"epoch": 0.8490566037735849, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00014241894129016718, |
|
"loss": 0.8139, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.8500393081761006, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00014226355340541224, |
|
"loss": 0.7854, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 0.8510220125786163, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00014210804118251405, |
|
"loss": 0.8163, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 0.8520047169811321, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.00014195240507898504, |
|
"loss": 0.829, |
|
"step": 4335 |
|
}, |
|
{ |
|
"epoch": 0.8529874213836478, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.00014179664555270206, |
|
"loss": 0.7911, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.8539701257861635, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00014164076306190517, |
|
"loss": 0.8278, |
|
"step": 4345 |
|
}, |
|
{ |
|
"epoch": 0.8549528301886793, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00014148475806519603, |
|
"loss": 0.8158, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.855935534591195, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.00014132863102153683, |
|
"loss": 0.8281, |
|
"step": 4355 |
|
}, |
|
{ |
|
"epoch": 0.8569182389937107, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.00014117238239024887, |
|
"loss": 0.8236, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.8579009433962265, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00014101601263101095, |
|
"loss": 0.7968, |
|
"step": 4365 |
|
}, |
|
{ |
|
"epoch": 0.8588836477987422, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00014085952220385838, |
|
"loss": 0.8183, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 0.8598663522012578, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0001407029115691815, |
|
"loss": 0.7929, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 0.8608490566037735, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00014054618118772416, |
|
"loss": 0.8169, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.8618317610062893, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.00014038933152058262, |
|
"loss": 0.7911, |
|
"step": 4385 |
|
}, |
|
{ |
|
"epoch": 0.862814465408805, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.000140232363029204, |
|
"loss": 0.8171, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 0.8637971698113207, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.00014007527617538508, |
|
"loss": 0.8039, |
|
"step": 4395 |
|
}, |
|
{ |
|
"epoch": 0.8647798742138365, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0001399180714212708, |
|
"loss": 0.82, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.8657625786163522, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.0001397607492293529, |
|
"loss": 0.8094, |
|
"step": 4405 |
|
}, |
|
{ |
|
"epoch": 0.8667452830188679, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00013960331006246878, |
|
"loss": 0.8232, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.8677279874213837, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00013944575438379984, |
|
"loss": 0.8301, |
|
"step": 4415 |
|
}, |
|
{ |
|
"epoch": 0.8687106918238994, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00013928808265687028, |
|
"loss": 0.8081, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.8696933962264151, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00013913029534554574, |
|
"loss": 0.7712, |
|
"step": 4425 |
|
}, |
|
{ |
|
"epoch": 0.8706761006289309, |
|
"grad_norm": 0.35546875, |
|
"learning_rate": 0.00013897239291403191, |
|
"loss": 0.8289, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 0.8716588050314465, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.00013881437582687314, |
|
"loss": 0.7992, |
|
"step": 4435 |
|
}, |
|
{ |
|
"epoch": 0.8726415094339622, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.0001386562445489511, |
|
"loss": 0.8034, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.873624213836478, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.0001384979995454834, |
|
"loss": 0.7996, |
|
"step": 4445 |
|
}, |
|
{ |
|
"epoch": 0.8746069182389937, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.00013833964128202224, |
|
"loss": 0.8168, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.8755896226415094, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00013818117022445297, |
|
"loss": 0.7952, |
|
"step": 4455 |
|
}, |
|
{ |
|
"epoch": 0.8765723270440252, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.0001380225868389929, |
|
"loss": 0.8389, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.8775550314465409, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0001378638915921897, |
|
"loss": 0.7979, |
|
"step": 4465 |
|
}, |
|
{ |
|
"epoch": 0.8785377358490566, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.00013770508495092014, |
|
"loss": 0.8099, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.8795204402515723, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00013754616738238877, |
|
"loss": 0.7925, |
|
"step": 4475 |
|
}, |
|
{ |
|
"epoch": 0.8805031446540881, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.00013738713935412643, |
|
"loss": 0.8164, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.8814858490566038, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.00013722800133398897, |
|
"loss": 0.8104, |
|
"step": 4485 |
|
}, |
|
{ |
|
"epoch": 0.8824685534591195, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00013706875379015577, |
|
"loss": 0.8087, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 0.8834512578616353, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.0001369093971911285, |
|
"loss": 0.8037, |
|
"step": 4495 |
|
}, |
|
{ |
|
"epoch": 0.8844339622641509, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.00013674993200572962, |
|
"loss": 0.8024, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.8854166666666666, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 7.893462771773996e-06, |
|
"loss": 0.782, |
|
"step": 4505 |
|
}, |
|
{ |
|
"epoch": 0.8863993710691824, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 7.760421092313152e-06, |
|
"loss": 0.7891, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 0.8873820754716981, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 7.628464876673202e-06, |
|
"loss": 0.8201, |
|
"step": 4515 |
|
}, |
|
{ |
|
"epoch": 0.8883647798742138, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 7.497595677698388e-06, |
|
"loss": 0.8031, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.8893474842767296, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 7.3678150354410615e-06, |
|
"loss": 0.8013, |
|
"step": 4525 |
|
}, |
|
{ |
|
"epoch": 0.8903301886792453, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 7.239124477143578e-06, |
|
"loss": 0.8075, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.891312893081761, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 7.111525517220308e-06, |
|
"loss": 0.7919, |
|
"step": 4535 |
|
}, |
|
{ |
|
"epoch": 0.8922955974842768, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 6.985019657239867e-06, |
|
"loss": 0.8074, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.8932783018867925, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 6.859608385907379e-06, |
|
"loss": 0.8009, |
|
"step": 4545 |
|
}, |
|
{ |
|
"epoch": 0.8942610062893082, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 6.735293179046975e-06, |
|
"loss": 0.8081, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.8952437106918238, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 6.612075499584458e-06, |
|
"loss": 0.8067, |
|
"step": 4555 |
|
}, |
|
{ |
|
"epoch": 0.8962264150943396, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 6.489956797530084e-06, |
|
"loss": 0.811, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.8972091194968553, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 6.368938509961398e-06, |
|
"loss": 0.7966, |
|
"step": 4565 |
|
}, |
|
{ |
|
"epoch": 0.898191823899371, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 6.2490220610065155e-06, |
|
"loss": 0.8123, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 0.8991745283018868, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 6.130208861827202e-06, |
|
"loss": 0.8045, |
|
"step": 4575 |
|
}, |
|
{ |
|
"epoch": 0.9001572327044025, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 6.012500310602254e-06, |
|
"loss": 0.7923, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.9011399371069182, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 5.8958977925112405e-06, |
|
"loss": 0.7986, |
|
"step": 4585 |
|
}, |
|
{ |
|
"epoch": 0.902122641509434, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 5.780402679717989e-06, |
|
"loss": 0.8166, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.9031053459119497, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 5.666016331354485e-06, |
|
"loss": 0.7845, |
|
"step": 4595 |
|
}, |
|
{ |
|
"epoch": 0.9040880503144654, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 5.552740093505015e-06, |
|
"loss": 0.7865, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.9050707547169812, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 5.440575299190165e-06, |
|
"loss": 0.8243, |
|
"step": 4605 |
|
}, |
|
{ |
|
"epoch": 0.9060534591194969, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 5.329523268351155e-06, |
|
"loss": 0.8041, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 0.9070361635220126, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 5.219585307834407e-06, |
|
"loss": 0.8057, |
|
"step": 4615 |
|
}, |
|
{ |
|
"epoch": 0.9080188679245284, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 5.110762711376116e-06, |
|
"loss": 0.7987, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.909001572327044, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 5.003056759586944e-06, |
|
"loss": 0.7983, |
|
"step": 4625 |
|
}, |
|
{ |
|
"epoch": 0.9099842767295597, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 4.89646871993703e-06, |
|
"loss": 0.7872, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 0.9109669811320755, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 4.79099984674114e-06, |
|
"loss": 0.8203, |
|
"step": 4635 |
|
}, |
|
{ |
|
"epoch": 0.9119496855345912, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 4.6866513811437475e-06, |
|
"loss": 0.7816, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.9129323899371069, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 4.58342455110452e-06, |
|
"loss": 0.8151, |
|
"step": 4645 |
|
}, |
|
{ |
|
"epoch": 0.9139150943396226, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 4.481320571383907e-06, |
|
"loss": 0.8052, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.9148977987421384, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 4.380340643528735e-06, |
|
"loss": 0.8069, |
|
"step": 4655 |
|
}, |
|
{ |
|
"epoch": 0.9158805031446541, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 4.280485955858171e-06, |
|
"loss": 0.7986, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.9168632075471698, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 4.181757683449694e-06, |
|
"loss": 0.8219, |
|
"step": 4665 |
|
}, |
|
{ |
|
"epoch": 0.9178459119496856, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 4.084156988125231e-06, |
|
"loss": 0.8162, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 0.9188286163522013, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 3.987685018437581e-06, |
|
"loss": 0.7972, |
|
"step": 4675 |
|
}, |
|
{ |
|
"epoch": 0.9198113207547169, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 3.892342909656776e-06, |
|
"loss": 0.8163, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.9207940251572327, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 3.798131783756853e-06, |
|
"loss": 0.8151, |
|
"step": 4685 |
|
}, |
|
{ |
|
"epoch": 0.9217767295597484, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 3.7050527494025265e-06, |
|
"loss": 0.8023, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 0.9227594339622641, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 3.6131069019362362e-06, |
|
"loss": 0.8229, |
|
"step": 4695 |
|
}, |
|
{ |
|
"epoch": 0.9237421383647799, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 3.52229532336521e-06, |
|
"loss": 0.7951, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.9247248427672956, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 3.4326190823487315e-06, |
|
"loss": 0.8034, |
|
"step": 4705 |
|
}, |
|
{ |
|
"epoch": 0.9257075471698113, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 3.344079234185604e-06, |
|
"loss": 0.807, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.9266902515723271, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 3.2566768208016297e-06, |
|
"loss": 0.8122, |
|
"step": 4715 |
|
}, |
|
{ |
|
"epoch": 0.9276729559748428, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 3.170412870737516e-06, |
|
"loss": 0.8023, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.9286556603773585, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 3.0852883991366322e-06, |
|
"loss": 0.7757, |
|
"step": 4725 |
|
}, |
|
{ |
|
"epoch": 0.9296383647798742, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 3.0013044077330744e-06, |
|
"loss": 0.7709, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 0.93062106918239, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 2.9184618848399627e-06, |
|
"loss": 0.8331, |
|
"step": 4735 |
|
}, |
|
{ |
|
"epoch": 0.9316037735849056, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 2.836761805337762e-06, |
|
"loss": 0.7819, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.9325864779874213, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 2.756205130662737e-06, |
|
"loss": 0.7949, |
|
"step": 4745 |
|
}, |
|
{ |
|
"epoch": 0.9335691823899371, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 2.6767928087957693e-06, |
|
"loss": 0.8147, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.9345518867924528, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 2.598525774251159e-06, |
|
"loss": 0.7786, |
|
"step": 4755 |
|
}, |
|
{ |
|
"epoch": 0.9355345911949685, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 2.52140494806552e-06, |
|
"loss": 0.7954, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.9365172955974843, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 2.44543123778711e-06, |
|
"loss": 0.7851, |
|
"step": 4765 |
|
}, |
|
{ |
|
"epoch": 0.9375, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 2.370605537465065e-06, |
|
"loss": 0.81, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.9384827044025157, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 2.296928727638814e-06, |
|
"loss": 0.8305, |
|
"step": 4775 |
|
}, |
|
{ |
|
"epoch": 0.9394654088050315, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 2.2244016753278586e-06, |
|
"loss": 0.7896, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.9404481132075472, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 2.1530252340214996e-06, |
|
"loss": 0.8101, |
|
"step": 4785 |
|
}, |
|
{ |
|
"epoch": 0.9414308176100629, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 2.0828002436687257e-06, |
|
"loss": 0.805, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 0.9424135220125787, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 2.013727530668452e-06, |
|
"loss": 0.804, |
|
"step": 4795 |
|
}, |
|
{ |
|
"epoch": 0.9433962264150944, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 1.9458079078597203e-06, |
|
"loss": 0.825, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.94437893081761, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 1.8790421745121356e-06, |
|
"loss": 0.821, |
|
"step": 4805 |
|
}, |
|
{ |
|
"epoch": 0.9453616352201258, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 1.813431116316522e-06, |
|
"loss": 0.8101, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 0.9463443396226415, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 1.748975505375583e-06, |
|
"loss": 0.8016, |
|
"step": 4815 |
|
}, |
|
{ |
|
"epoch": 0.9473270440251572, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 1.6856761001948772e-06, |
|
"loss": 0.7847, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.9483097484276729, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 1.6235336456739026e-06, |
|
"loss": 0.8007, |
|
"step": 4825 |
|
}, |
|
{ |
|
"epoch": 0.9492924528301887, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 1.5625488730972693e-06, |
|
"loss": 0.7891, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.9502751572327044, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 1.5027225001261525e-06, |
|
"loss": 0.8244, |
|
"step": 4835 |
|
}, |
|
{ |
|
"epoch": 0.9512578616352201, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 1.4440552307898202e-06, |
|
"loss": 0.7962, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.9522405660377359, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 1.386547755477363e-06, |
|
"loss": 0.7982, |
|
"step": 4845 |
|
}, |
|
{ |
|
"epoch": 0.9532232704402516, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 1.3302007509295445e-06, |
|
"loss": 0.7896, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.9542059748427673, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 1.2750148802308737e-06, |
|
"loss": 0.8158, |
|
"step": 4855 |
|
}, |
|
{ |
|
"epoch": 0.9551886792452831, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 1.2209907928017795e-06, |
|
"loss": 0.8012, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.9561713836477987, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 1.1681291243909153e-06, |
|
"loss": 0.8146, |
|
"step": 4865 |
|
}, |
|
{ |
|
"epoch": 0.9571540880503144, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 1.116430497067833e-06, |
|
"loss": 0.8175, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 0.9581367924528302, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 1.0658955192154763e-06, |
|
"loss": 0.7937, |
|
"step": 4875 |
|
}, |
|
{ |
|
"epoch": 0.9591194968553459, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 1.0165247855231542e-06, |
|
"loss": 0.8, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.9601022012578616, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 9.683188769794792e-07, |
|
"loss": 0.8042, |
|
"step": 4885 |
|
}, |
|
{ |
|
"epoch": 0.9610849056603774, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 9.212783608655518e-07, |
|
"loss": 0.8078, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.9620676100628931, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 8.754037907482748e-07, |
|
"loss": 0.7992, |
|
"step": 4895 |
|
}, |
|
{ |
|
"epoch": 0.9630503144654088, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 8.306957064738385e-07, |
|
"loss": 0.806, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.9640330188679245, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 7.871546341614023e-07, |
|
"loss": 0.7803, |
|
"step": 4905 |
|
}, |
|
{ |
|
"epoch": 0.9650157232704403, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 7.447810861968552e-07, |
|
"loss": 0.7864, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 0.965998427672956, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 7.03575561226788e-07, |
|
"loss": 0.7837, |
|
"step": 4915 |
|
}, |
|
{ |
|
"epoch": 0.9669811320754716, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 6.635385441526754e-07, |
|
"loss": 0.7935, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.9679638364779874, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 6.246705061251245e-07, |
|
"loss": 0.8074, |
|
"step": 4925 |
|
}, |
|
{ |
|
"epoch": 0.9689465408805031, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 5.86971904538347e-07, |
|
"loss": 0.8082, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 0.9699292452830188, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 5.504431830247514e-07, |
|
"loss": 0.7889, |
|
"step": 4935 |
|
}, |
|
{ |
|
"epoch": 0.9709119496855346, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 5.150847714497697e-07, |
|
"loss": 0.7924, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.9718946540880503, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 4.80897085906773e-07, |
|
"loss": 0.81, |
|
"step": 4945 |
|
}, |
|
{ |
|
"epoch": 0.972877358490566, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 4.4788052871215234e-07, |
|
"loss": 0.805, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.9738600628930818, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 4.1603548840062345e-07, |
|
"loss": 0.8101, |
|
"step": 4955 |
|
}, |
|
{ |
|
"epoch": 0.9748427672955975, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 3.853623397206407e-07, |
|
"loss": 0.7909, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.9758254716981132, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 3.5586144362997896e-07, |
|
"loss": 0.7972, |
|
"step": 4965 |
|
}, |
|
{ |
|
"epoch": 0.976808176100629, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 3.275331472914922e-07, |
|
"loss": 0.8101, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 0.9777908805031447, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 3.0037778406902805e-07, |
|
"loss": 0.8184, |
|
"step": 4975 |
|
}, |
|
{ |
|
"epoch": 0.9787735849056604, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 2.743956735234865e-07, |
|
"loss": 0.782, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.9797562893081762, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 2.4958712140911166e-07, |
|
"loss": 0.7905, |
|
"step": 4985 |
|
}, |
|
{ |
|
"epoch": 0.9807389937106918, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 2.2595241966982817e-07, |
|
"loss": 0.8163, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 0.9817216981132075, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 2.0349184643586595e-07, |
|
"loss": 0.8266, |
|
"step": 4995 |
|
}, |
|
{ |
|
"epoch": 0.9827044025157232, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 1.8220566602040745e-07, |
|
"loss": 0.8174, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.983687106918239, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 1.6209412891659003e-07, |
|
"loss": 0.8052, |
|
"step": 5005 |
|
}, |
|
{ |
|
"epoch": 0.9846698113207547, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 1.4315747179446392e-07, |
|
"loss": 0.7871, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.9856525157232704, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 1.2539591749821666e-07, |
|
"loss": 0.7974, |
|
"step": 5015 |
|
}, |
|
{ |
|
"epoch": 0.9866352201257862, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 1.088096750436085e-07, |
|
"loss": 0.7972, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.9876179245283019, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 9.339893961548551e-08, |
|
"loss": 0.8153, |
|
"step": 5025 |
|
}, |
|
{ |
|
"epoch": 0.9886006289308176, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 7.916389256541479e-08, |
|
"loss": 0.8146, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 0.9895833333333334, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 6.610470140967495e-08, |
|
"loss": 0.81, |
|
"step": 5035 |
|
}, |
|
{ |
|
"epoch": 0.9905660377358491, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 5.422151982719115e-08, |
|
"loss": 0.8167, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.9915487421383647, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 4.351448765775867e-08, |
|
"loss": 0.8175, |
|
"step": 5045 |
|
}, |
|
{ |
|
"epoch": 0.9925314465408805, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 3.3983730900377655e-08, |
|
"loss": 0.8008, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.9935141509433962, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 2.5629361711809742e-08, |
|
"loss": 0.8024, |
|
"step": 5055 |
|
}, |
|
{ |
|
"epoch": 0.9944968553459119, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 1.8451478405223653e-08, |
|
"loss": 0.7952, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.9954795597484277, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 1.2450165449062744e-08, |
|
"loss": 0.7893, |
|
"step": 5065 |
|
}, |
|
{ |
|
"epoch": 0.9964622641509434, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 7.62549346601249e-09, |
|
"loss": 0.8112, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.9974449685534591, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 3.977519232223337e-09, |
|
"loss": 0.8175, |
|
"step": 5075 |
|
}, |
|
{ |
|
"epoch": 0.9984276729559748, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 1.5062856765779565e-09, |
|
"loss": 0.8088, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.9994103773584906, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 2.118218802582561e-10, |
|
"loss": 0.8288, |
|
"step": 5085 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.7983007431030273, |
|
"eval_runtime": 9224.1097, |
|
"eval_samples_per_second": 8.823, |
|
"eval_steps_per_second": 0.138, |
|
"step": 5088 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 5088, |
|
"total_flos": 1.5751056572484157e+19, |
|
"train_loss": 0.013955340304839536, |
|
"train_runtime": 11108.4839, |
|
"train_samples_per_second": 29.312, |
|
"train_steps_per_second": 0.458 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 5088, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.5751056572484157e+19, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|