|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 6.0, |
|
"eval_steps": 500, |
|
"global_step": 7500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008, |
|
"grad_norm": 1.6428182125091553, |
|
"learning_rate": 0.00029983999999999995, |
|
"loss": 4.0499, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 1.1695531606674194, |
|
"learning_rate": 0.00029968, |
|
"loss": 2.6823, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.024, |
|
"grad_norm": 0.982557475566864, |
|
"learning_rate": 0.00029951999999999995, |
|
"loss": 2.3495, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 1.119385004043579, |
|
"learning_rate": 0.00029936, |
|
"loss": 2.187, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.1943817138671875, |
|
"learning_rate": 0.00029919999999999995, |
|
"loss": 2.1278, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 1.0324301719665527, |
|
"learning_rate": 0.00029904, |
|
"loss": 2.1567, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.056, |
|
"grad_norm": 1.0339545011520386, |
|
"learning_rate": 0.00029887999999999996, |
|
"loss": 2.1682, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 1.1292812824249268, |
|
"learning_rate": 0.00029872, |
|
"loss": 2.0833, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.072, |
|
"grad_norm": 1.112321376800537, |
|
"learning_rate": 0.00029855999999999996, |
|
"loss": 2.0453, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.2117633819580078, |
|
"learning_rate": 0.0002984, |
|
"loss": 2.1188, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.088, |
|
"grad_norm": 1.0593370199203491, |
|
"learning_rate": 0.00029823999999999996, |
|
"loss": 2.1201, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 1.1461642980575562, |
|
"learning_rate": 0.00029808, |
|
"loss": 2.035, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.104, |
|
"grad_norm": 1.2336146831512451, |
|
"learning_rate": 0.00029791999999999997, |
|
"loss": 2.0329, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 1.0999081134796143, |
|
"learning_rate": 0.00029776, |
|
"loss": 2.029, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.109130620956421, |
|
"learning_rate": 0.00029759999999999997, |
|
"loss": 2.0032, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 1.150937557220459, |
|
"learning_rate": 0.00029744, |
|
"loss": 2.039, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.136, |
|
"grad_norm": 1.1265838146209717, |
|
"learning_rate": 0.00029727999999999997, |
|
"loss": 2.0358, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 1.1429523229599, |
|
"learning_rate": 0.00029711999999999995, |
|
"loss": 2.0357, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.152, |
|
"grad_norm": 1.0551432371139526, |
|
"learning_rate": 0.00029696, |
|
"loss": 2.0233, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.1221256256103516, |
|
"learning_rate": 0.00029679999999999995, |
|
"loss": 2.0512, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.168, |
|
"grad_norm": 1.0235646963119507, |
|
"learning_rate": 0.00029664, |
|
"loss": 2.0874, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 1.0271421670913696, |
|
"learning_rate": 0.00029647999999999995, |
|
"loss": 2.007, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.184, |
|
"grad_norm": 1.1792947053909302, |
|
"learning_rate": 0.00029632, |
|
"loss": 1.9954, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 1.1998450756072998, |
|
"learning_rate": 0.00029615999999999996, |
|
"loss": 1.9629, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.0941493511199951, |
|
"learning_rate": 0.000296, |
|
"loss": 1.9895, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 1.1195231676101685, |
|
"learning_rate": 0.00029583999999999996, |
|
"loss": 1.9704, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.216, |
|
"grad_norm": 1.0294626951217651, |
|
"learning_rate": 0.00029568, |
|
"loss": 1.9912, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 1.0843749046325684, |
|
"learning_rate": 0.00029551999999999996, |
|
"loss": 1.9365, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.232, |
|
"grad_norm": 0.8985214233398438, |
|
"learning_rate": 0.00029536, |
|
"loss": 2.0002, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.0384533405303955, |
|
"learning_rate": 0.00029519999999999997, |
|
"loss": 1.94, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.248, |
|
"grad_norm": 1.1195266246795654, |
|
"learning_rate": 0.00029504, |
|
"loss": 2.0072, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 1.0751473903656006, |
|
"learning_rate": 0.00029487999999999997, |
|
"loss": 1.9446, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.264, |
|
"grad_norm": 1.0846151113510132, |
|
"learning_rate": 0.00029472, |
|
"loss": 1.9619, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 1.0839966535568237, |
|
"learning_rate": 0.00029455999999999997, |
|
"loss": 1.9454, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.0731072425842285, |
|
"learning_rate": 0.00029439999999999995, |
|
"loss": 1.9626, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 1.0523524284362793, |
|
"learning_rate": 0.00029424, |
|
"loss": 1.913, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.296, |
|
"grad_norm": 1.0012118816375732, |
|
"learning_rate": 0.00029407999999999995, |
|
"loss": 1.9395, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 0.9734252691268921, |
|
"learning_rate": 0.00029392, |
|
"loss": 2.0065, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.312, |
|
"grad_norm": 1.127196192741394, |
|
"learning_rate": 0.00029375999999999995, |
|
"loss": 1.8512, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 1.2507715225219727, |
|
"learning_rate": 0.0002936, |
|
"loss": 1.9136, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.328, |
|
"grad_norm": 1.0916541814804077, |
|
"learning_rate": 0.00029343999999999996, |
|
"loss": 1.8957, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 1.1081781387329102, |
|
"learning_rate": 0.00029328, |
|
"loss": 1.9262, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.344, |
|
"grad_norm": 1.1098934412002563, |
|
"learning_rate": 0.00029311999999999996, |
|
"loss": 1.9213, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 1.0184811353683472, |
|
"learning_rate": 0.00029296, |
|
"loss": 1.9374, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.1124446392059326, |
|
"learning_rate": 0.00029279999999999996, |
|
"loss": 1.9237, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 1.1229047775268555, |
|
"learning_rate": 0.00029264, |
|
"loss": 1.8897, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.376, |
|
"grad_norm": 1.0087217092514038, |
|
"learning_rate": 0.00029247999999999996, |
|
"loss": 1.9317, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 1.0527478456497192, |
|
"learning_rate": 0.00029232, |
|
"loss": 1.9571, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.392, |
|
"grad_norm": 0.9762263894081116, |
|
"learning_rate": 0.00029215999999999997, |
|
"loss": 1.911, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 1.0288947820663452, |
|
"learning_rate": 0.000292, |
|
"loss": 1.8763, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.408, |
|
"grad_norm": 1.0375839471817017, |
|
"learning_rate": 0.00029183999999999997, |
|
"loss": 1.9924, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 1.005863904953003, |
|
"learning_rate": 0.00029167999999999994, |
|
"loss": 1.8497, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.424, |
|
"grad_norm": 0.9753358960151672, |
|
"learning_rate": 0.00029152, |
|
"loss": 1.9155, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 1.0157995223999023, |
|
"learning_rate": 0.00029135999999999995, |
|
"loss": 1.9108, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.1655962467193604, |
|
"learning_rate": 0.0002912, |
|
"loss": 1.8594, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 1.0194449424743652, |
|
"learning_rate": 0.00029103999999999995, |
|
"loss": 1.8832, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.456, |
|
"grad_norm": 1.0156056880950928, |
|
"learning_rate": 0.00029088, |
|
"loss": 1.9253, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 1.031867265701294, |
|
"learning_rate": 0.00029071999999999995, |
|
"loss": 1.8896, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.472, |
|
"grad_norm": 0.9771973490715027, |
|
"learning_rate": 0.00029056, |
|
"loss": 1.8817, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.0212839841842651, |
|
"learning_rate": 0.00029039999999999996, |
|
"loss": 1.9077, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.488, |
|
"grad_norm": 1.09153413772583, |
|
"learning_rate": 0.00029024, |
|
"loss": 1.8725, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 1.043017029762268, |
|
"learning_rate": 0.00029007999999999996, |
|
"loss": 1.8432, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.504, |
|
"grad_norm": 0.9705913066864014, |
|
"learning_rate": 0.00028992, |
|
"loss": 1.8996, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 0.9535217881202698, |
|
"learning_rate": 0.00028975999999999996, |
|
"loss": 1.9339, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.1274858713150024, |
|
"learning_rate": 0.0002896, |
|
"loss": 1.8275, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 1.1044244766235352, |
|
"learning_rate": 0.00028943999999999997, |
|
"loss": 1.894, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.536, |
|
"grad_norm": 1.0410267114639282, |
|
"learning_rate": 0.00028928, |
|
"loss": 1.9064, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 1.118211269378662, |
|
"learning_rate": 0.00028911999999999997, |
|
"loss": 1.8881, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.552, |
|
"grad_norm": 1.0527877807617188, |
|
"learning_rate": 0.00028895999999999994, |
|
"loss": 1.8371, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 1.0014268159866333, |
|
"learning_rate": 0.00028879999999999997, |
|
"loss": 1.9004, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.568, |
|
"grad_norm": 1.0764245986938477, |
|
"learning_rate": 0.00028863999999999995, |
|
"loss": 1.9347, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 1.0075087547302246, |
|
"learning_rate": 0.00028848, |
|
"loss": 1.8226, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.584, |
|
"grad_norm": 1.0563082695007324, |
|
"learning_rate": 0.00028831999999999995, |
|
"loss": 1.8147, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 1.1010650396347046, |
|
"learning_rate": 0.00028816, |
|
"loss": 1.9306, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.9899283647537231, |
|
"learning_rate": 0.00028799999999999995, |
|
"loss": 1.885, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 1.0245839357376099, |
|
"learning_rate": 0.00028784, |
|
"loss": 1.8532, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.616, |
|
"grad_norm": 1.056541085243225, |
|
"learning_rate": 0.00028767999999999996, |
|
"loss": 1.8861, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 0.9766470193862915, |
|
"learning_rate": 0.00028752, |
|
"loss": 1.8241, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.632, |
|
"grad_norm": 1.10284423828125, |
|
"learning_rate": 0.00028735999999999996, |
|
"loss": 1.7675, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.080234408378601, |
|
"learning_rate": 0.0002872, |
|
"loss": 1.8204, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.648, |
|
"grad_norm": 1.0814071893692017, |
|
"learning_rate": 0.00028703999999999996, |
|
"loss": 1.8619, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 0.9824687838554382, |
|
"learning_rate": 0.00028688, |
|
"loss": 1.9017, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.664, |
|
"grad_norm": 1.0177820920944214, |
|
"learning_rate": 0.00028671999999999997, |
|
"loss": 1.8842, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 0.9703278541564941, |
|
"learning_rate": 0.00028656, |
|
"loss": 1.916, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.0800108909606934, |
|
"learning_rate": 0.00028639999999999997, |
|
"loss": 1.8274, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 1.0110689401626587, |
|
"learning_rate": 0.00028624, |
|
"loss": 1.8077, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.696, |
|
"grad_norm": 1.091354250907898, |
|
"learning_rate": 0.00028607999999999997, |
|
"loss": 1.8971, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 1.0147050619125366, |
|
"learning_rate": 0.00028591999999999995, |
|
"loss": 1.8365, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.712, |
|
"grad_norm": 1.0930813550949097, |
|
"learning_rate": 0.00028576, |
|
"loss": 1.7962, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.0309563875198364, |
|
"learning_rate": 0.00028559999999999995, |
|
"loss": 1.808, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.728, |
|
"grad_norm": 1.0878843069076538, |
|
"learning_rate": 0.00028544, |
|
"loss": 1.8481, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 1.039565086364746, |
|
"learning_rate": 0.00028527999999999995, |
|
"loss": 1.8475, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.744, |
|
"grad_norm": 0.9955683350563049, |
|
"learning_rate": 0.00028512, |
|
"loss": 1.8577, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 0.9792163372039795, |
|
"learning_rate": 0.00028495999999999996, |
|
"loss": 1.8577, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 1.0933603048324585, |
|
"learning_rate": 0.0002848, |
|
"loss": 1.8941, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 1.0719082355499268, |
|
"learning_rate": 0.00028463999999999996, |
|
"loss": 1.8739, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.776, |
|
"grad_norm": 1.039011836051941, |
|
"learning_rate": 0.00028448, |
|
"loss": 1.8526, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 1.1158881187438965, |
|
"learning_rate": 0.00028431999999999996, |
|
"loss": 1.8001, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.792, |
|
"grad_norm": 0.9756163954734802, |
|
"learning_rate": 0.00028416, |
|
"loss": 1.8211, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 1.0662978887557983, |
|
"learning_rate": 0.00028399999999999996, |
|
"loss": 1.8549, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.808, |
|
"grad_norm": 1.060304880142212, |
|
"learning_rate": 0.00028384, |
|
"loss": 1.8516, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 1.0433423519134521, |
|
"learning_rate": 0.00028367999999999997, |
|
"loss": 1.8146, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.824, |
|
"grad_norm": 1.0191080570220947, |
|
"learning_rate": 0.00028352, |
|
"loss": 1.8172, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 1.0157259702682495, |
|
"learning_rate": 0.00028335999999999997, |
|
"loss": 1.8096, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.0125967264175415, |
|
"learning_rate": 0.00028319999999999994, |
|
"loss": 1.7954, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 1.0847101211547852, |
|
"learning_rate": 0.00028304, |
|
"loss": 1.7961, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.856, |
|
"grad_norm": 0.9798891544342041, |
|
"learning_rate": 0.00028287999999999995, |
|
"loss": 1.8246, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 0.9857827425003052, |
|
"learning_rate": 0.00028272, |
|
"loss": 1.8989, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.872, |
|
"grad_norm": 0.9614414572715759, |
|
"learning_rate": 0.00028255999999999995, |
|
"loss": 1.9081, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.9770805835723877, |
|
"learning_rate": 0.0002824, |
|
"loss": 1.8396, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.888, |
|
"grad_norm": 1.0100719928741455, |
|
"learning_rate": 0.00028223999999999995, |
|
"loss": 1.8233, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 0.9945518970489502, |
|
"learning_rate": 0.00028208, |
|
"loss": 1.8163, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.904, |
|
"grad_norm": 1.0281423330307007, |
|
"learning_rate": 0.00028191999999999996, |
|
"loss": 1.8317, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 1.0575731992721558, |
|
"learning_rate": 0.00028176, |
|
"loss": 1.8673, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.1658177375793457, |
|
"learning_rate": 0.00028159999999999996, |
|
"loss": 1.827, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 1.0432631969451904, |
|
"learning_rate": 0.00028144, |
|
"loss": 1.8164, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.936, |
|
"grad_norm": 1.0257468223571777, |
|
"learning_rate": 0.00028127999999999996, |
|
"loss": 1.8025, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 1.1194055080413818, |
|
"learning_rate": 0.00028112, |
|
"loss": 1.8514, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.952, |
|
"grad_norm": 1.0339341163635254, |
|
"learning_rate": 0.00028095999999999997, |
|
"loss": 1.7735, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.9593726396560669, |
|
"learning_rate": 0.0002808, |
|
"loss": 1.8003, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.968, |
|
"grad_norm": 0.9705820083618164, |
|
"learning_rate": 0.00028063999999999997, |
|
"loss": 1.7788, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"grad_norm": 1.0924532413482666, |
|
"learning_rate": 0.00028047999999999994, |
|
"loss": 1.7911, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.984, |
|
"grad_norm": 1.0870336294174194, |
|
"learning_rate": 0.00028031999999999997, |
|
"loss": 1.777, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 1.0212570428848267, |
|
"learning_rate": 0.00028015999999999995, |
|
"loss": 1.8329, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.9898034334182739, |
|
"learning_rate": 0.00028, |
|
"loss": 1.803, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.008, |
|
"grad_norm": 1.1636098623275757, |
|
"learning_rate": 0.00027983999999999995, |
|
"loss": 1.7512, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.016, |
|
"grad_norm": 1.122517704963684, |
|
"learning_rate": 0.00027968, |
|
"loss": 1.7579, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.024, |
|
"grad_norm": 1.1521267890930176, |
|
"learning_rate": 0.00027951999999999995, |
|
"loss": 1.6773, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.032, |
|
"grad_norm": 1.164711833000183, |
|
"learning_rate": 0.00027936, |
|
"loss": 1.6539, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 1.0965043306350708, |
|
"learning_rate": 0.00027919999999999996, |
|
"loss": 1.7126, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.048, |
|
"grad_norm": 1.2235987186431885, |
|
"learning_rate": 0.00027904, |
|
"loss": 1.6967, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.056, |
|
"grad_norm": 1.1083018779754639, |
|
"learning_rate": 0.00027887999999999996, |
|
"loss": 1.7282, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.064, |
|
"grad_norm": 1.1210997104644775, |
|
"learning_rate": 0.00027872, |
|
"loss": 1.7371, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.072, |
|
"grad_norm": 1.1816761493682861, |
|
"learning_rate": 0.00027855999999999996, |
|
"loss": 1.7099, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 1.1083471775054932, |
|
"learning_rate": 0.0002784, |
|
"loss": 1.7029, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.088, |
|
"grad_norm": 1.1974619626998901, |
|
"learning_rate": 0.00027823999999999997, |
|
"loss": 1.676, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.096, |
|
"grad_norm": 1.1856564283370972, |
|
"learning_rate": 0.00027808, |
|
"loss": 1.7675, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.104, |
|
"grad_norm": 1.1293108463287354, |
|
"learning_rate": 0.00027791999999999997, |
|
"loss": 1.6936, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.112, |
|
"grad_norm": 1.1792447566986084, |
|
"learning_rate": 0.00027775999999999994, |
|
"loss": 1.7194, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 1.1036359071731567, |
|
"learning_rate": 0.00027759999999999997, |
|
"loss": 1.6564, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.1280000000000001, |
|
"grad_norm": 1.2215582132339478, |
|
"learning_rate": 0.00027743999999999995, |
|
"loss": 1.7064, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.1360000000000001, |
|
"grad_norm": 1.1735379695892334, |
|
"learning_rate": 0.00027728, |
|
"loss": 1.7184, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.144, |
|
"grad_norm": 1.1964507102966309, |
|
"learning_rate": 0.00027711999999999995, |
|
"loss": 1.7329, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.152, |
|
"grad_norm": 1.138510823249817, |
|
"learning_rate": 0.00027696, |
|
"loss": 1.7096, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 1.1308197975158691, |
|
"learning_rate": 0.00027679999999999995, |
|
"loss": 1.7457, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.168, |
|
"grad_norm": 1.1567286252975464, |
|
"learning_rate": 0.00027664, |
|
"loss": 1.6813, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.176, |
|
"grad_norm": 1.1560039520263672, |
|
"learning_rate": 0.00027647999999999995, |
|
"loss": 1.699, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.184, |
|
"grad_norm": 1.230444073677063, |
|
"learning_rate": 0.00027632, |
|
"loss": 1.7782, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.192, |
|
"grad_norm": 1.2430510520935059, |
|
"learning_rate": 0.00027615999999999996, |
|
"loss": 1.716, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 1.1405155658721924, |
|
"learning_rate": 0.000276, |
|
"loss": 1.7072, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.208, |
|
"grad_norm": 1.1308519840240479, |
|
"learning_rate": 0.00027583999999999996, |
|
"loss": 1.6952, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.216, |
|
"grad_norm": 1.2301914691925049, |
|
"learning_rate": 0.00027568, |
|
"loss": 1.721, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.224, |
|
"grad_norm": 1.2387229204177856, |
|
"learning_rate": 0.00027551999999999996, |
|
"loss": 1.6866, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.232, |
|
"grad_norm": 1.070438027381897, |
|
"learning_rate": 0.00027536, |
|
"loss": 1.6882, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 1.1818335056304932, |
|
"learning_rate": 0.00027519999999999997, |
|
"loss": 1.7479, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.248, |
|
"grad_norm": 1.1129992008209229, |
|
"learning_rate": 0.00027503999999999994, |
|
"loss": 1.7563, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.256, |
|
"grad_norm": 1.2298282384872437, |
|
"learning_rate": 0.00027487999999999997, |
|
"loss": 1.684, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.264, |
|
"grad_norm": 1.2662142515182495, |
|
"learning_rate": 0.00027471999999999994, |
|
"loss": 1.7052, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.272, |
|
"grad_norm": 1.2866853475570679, |
|
"learning_rate": 0.00027456, |
|
"loss": 1.7403, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 1.225150227546692, |
|
"learning_rate": 0.00027439999999999995, |
|
"loss": 1.7141, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.288, |
|
"grad_norm": 1.2254585027694702, |
|
"learning_rate": 0.00027424, |
|
"loss": 1.7558, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.296, |
|
"grad_norm": 1.228905439376831, |
|
"learning_rate": 0.00027407999999999995, |
|
"loss": 1.7081, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.304, |
|
"grad_norm": 1.190305471420288, |
|
"learning_rate": 0.00027392, |
|
"loss": 1.7173, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.312, |
|
"grad_norm": 1.1080456972122192, |
|
"learning_rate": 0.00027375999999999995, |
|
"loss": 1.6771, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 1.1341513395309448, |
|
"learning_rate": 0.0002736, |
|
"loss": 1.6865, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.328, |
|
"grad_norm": 1.1582372188568115, |
|
"learning_rate": 0.00027343999999999996, |
|
"loss": 1.7083, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.336, |
|
"grad_norm": 1.2780879735946655, |
|
"learning_rate": 0.00027328, |
|
"loss": 1.7348, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.3439999999999999, |
|
"grad_norm": 1.1118934154510498, |
|
"learning_rate": 0.00027311999999999996, |
|
"loss": 1.763, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.3519999999999999, |
|
"grad_norm": 1.2540453672409058, |
|
"learning_rate": 0.00027296, |
|
"loss": 1.7273, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.3599999999999999, |
|
"grad_norm": 1.1426582336425781, |
|
"learning_rate": 0.00027279999999999996, |
|
"loss": 1.7098, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.3679999999999999, |
|
"grad_norm": 1.1964046955108643, |
|
"learning_rate": 0.00027264, |
|
"loss": 1.709, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.376, |
|
"grad_norm": 1.14896559715271, |
|
"learning_rate": 0.00027247999999999997, |
|
"loss": 1.7313, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.384, |
|
"grad_norm": 1.202528953552246, |
|
"learning_rate": 0.00027231999999999994, |
|
"loss": 1.744, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.392, |
|
"grad_norm": 1.1821858882904053, |
|
"learning_rate": 0.00027215999999999997, |
|
"loss": 1.6926, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 1.2063863277435303, |
|
"learning_rate": 0.00027199999999999994, |
|
"loss": 1.6386, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.408, |
|
"grad_norm": 1.2269303798675537, |
|
"learning_rate": 0.00027183999999999997, |
|
"loss": 1.7395, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.416, |
|
"grad_norm": 1.1873764991760254, |
|
"learning_rate": 0.00027167999999999995, |
|
"loss": 1.7408, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.424, |
|
"grad_norm": 1.1701534986495972, |
|
"learning_rate": 0.00027152, |
|
"loss": 1.6902, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.432, |
|
"grad_norm": 1.2059394121170044, |
|
"learning_rate": 0.00027135999999999995, |
|
"loss": 1.7189, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 1.2177969217300415, |
|
"learning_rate": 0.0002712, |
|
"loss": 1.6688, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.448, |
|
"grad_norm": 1.1420925855636597, |
|
"learning_rate": 0.00027103999999999995, |
|
"loss": 1.7027, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.456, |
|
"grad_norm": 1.1630126237869263, |
|
"learning_rate": 0.00027088, |
|
"loss": 1.7268, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.464, |
|
"grad_norm": 1.1708976030349731, |
|
"learning_rate": 0.00027071999999999996, |
|
"loss": 1.6667, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.472, |
|
"grad_norm": 1.1763298511505127, |
|
"learning_rate": 0.00027056, |
|
"loss": 1.7667, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 1.1959589719772339, |
|
"learning_rate": 0.00027039999999999996, |
|
"loss": 1.682, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.488, |
|
"grad_norm": 1.187795639038086, |
|
"learning_rate": 0.00027024, |
|
"loss": 1.7078, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.496, |
|
"grad_norm": 1.1146178245544434, |
|
"learning_rate": 0.00027007999999999996, |
|
"loss": 1.6864, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.504, |
|
"grad_norm": 1.1661298274993896, |
|
"learning_rate": 0.00026992, |
|
"loss": 1.7206, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.512, |
|
"grad_norm": 1.1348265409469604, |
|
"learning_rate": 0.00026975999999999997, |
|
"loss": 1.6971, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 1.2029168605804443, |
|
"learning_rate": 0.00026959999999999994, |
|
"loss": 1.7435, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.528, |
|
"grad_norm": 1.2038522958755493, |
|
"learning_rate": 0.00026943999999999997, |
|
"loss": 1.6938, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.536, |
|
"grad_norm": 1.1772645711898804, |
|
"learning_rate": 0.00026927999999999994, |
|
"loss": 1.6974, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.544, |
|
"grad_norm": 1.2052574157714844, |
|
"learning_rate": 0.00026911999999999997, |
|
"loss": 1.6798, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.552, |
|
"grad_norm": 1.22791588306427, |
|
"learning_rate": 0.00026895999999999995, |
|
"loss": 1.7074, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 1.0809330940246582, |
|
"learning_rate": 0.0002688, |
|
"loss": 1.7332, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.568, |
|
"grad_norm": 1.2375030517578125, |
|
"learning_rate": 0.00026863999999999995, |
|
"loss": 1.7188, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.576, |
|
"grad_norm": 1.1218806505203247, |
|
"learning_rate": 0.00026848, |
|
"loss": 1.7222, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.584, |
|
"grad_norm": 1.1987130641937256, |
|
"learning_rate": 0.00026831999999999995, |
|
"loss": 1.6705, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.592, |
|
"grad_norm": 1.1293755769729614, |
|
"learning_rate": 0.00026816, |
|
"loss": 1.7911, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.1469671726226807, |
|
"learning_rate": 0.00026799999999999995, |
|
"loss": 1.7355, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.608, |
|
"grad_norm": 1.2343659400939941, |
|
"learning_rate": 0.00026784, |
|
"loss": 1.6655, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.616, |
|
"grad_norm": 1.1669197082519531, |
|
"learning_rate": 0.00026767999999999996, |
|
"loss": 1.7655, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 1.624, |
|
"grad_norm": 1.1948648691177368, |
|
"learning_rate": 0.00026752, |
|
"loss": 1.5903, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 1.6320000000000001, |
|
"grad_norm": 1.210276484489441, |
|
"learning_rate": 0.00026735999999999996, |
|
"loss": 1.6919, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.6400000000000001, |
|
"grad_norm": 1.1474298238754272, |
|
"learning_rate": 0.0002672, |
|
"loss": 1.7315, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.6480000000000001, |
|
"grad_norm": 1.1558197736740112, |
|
"learning_rate": 0.00026703999999999996, |
|
"loss": 1.7004, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.6560000000000001, |
|
"grad_norm": 1.2014431953430176, |
|
"learning_rate": 0.00026687999999999994, |
|
"loss": 1.7262, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.6640000000000001, |
|
"grad_norm": 1.1946237087249756, |
|
"learning_rate": 0.00026671999999999997, |
|
"loss": 1.7575, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 1.6720000000000002, |
|
"grad_norm": 1.096993088722229, |
|
"learning_rate": 0.00026655999999999994, |
|
"loss": 1.7049, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.6800000000000002, |
|
"grad_norm": 1.136132001876831, |
|
"learning_rate": 0.00026639999999999997, |
|
"loss": 1.7116, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.688, |
|
"grad_norm": 1.1487154960632324, |
|
"learning_rate": 0.00026623999999999994, |
|
"loss": 1.6711, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 1.696, |
|
"grad_norm": 1.2251691818237305, |
|
"learning_rate": 0.00026608, |
|
"loss": 1.7647, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 1.704, |
|
"grad_norm": 1.1736303567886353, |
|
"learning_rate": 0.00026591999999999995, |
|
"loss": 1.7074, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.712, |
|
"grad_norm": 1.1187472343444824, |
|
"learning_rate": 0.00026576, |
|
"loss": 1.6904, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 1.2309964895248413, |
|
"learning_rate": 0.00026559999999999995, |
|
"loss": 1.6816, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.728, |
|
"grad_norm": 1.182122826576233, |
|
"learning_rate": 0.00026544, |
|
"loss": 1.7505, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.736, |
|
"grad_norm": 1.1426887512207031, |
|
"learning_rate": 0.00026527999999999995, |
|
"loss": 1.7415, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.744, |
|
"grad_norm": 1.1243617534637451, |
|
"learning_rate": 0.00026512, |
|
"loss": 1.7427, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.752, |
|
"grad_norm": 1.1814117431640625, |
|
"learning_rate": 0.00026495999999999996, |
|
"loss": 1.6717, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 1.1558399200439453, |
|
"learning_rate": 0.0002648, |
|
"loss": 1.7521, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.768, |
|
"grad_norm": 1.1759192943572998, |
|
"learning_rate": 0.00026463999999999996, |
|
"loss": 1.6922, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.776, |
|
"grad_norm": 1.213027834892273, |
|
"learning_rate": 0.00026448, |
|
"loss": 1.6918, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.784, |
|
"grad_norm": 1.1476492881774902, |
|
"learning_rate": 0.00026431999999999996, |
|
"loss": 1.7319, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.792, |
|
"grad_norm": 1.171706199645996, |
|
"learning_rate": 0.00026415999999999994, |
|
"loss": 1.7126, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 1.2222481966018677, |
|
"learning_rate": 0.00026399999999999997, |
|
"loss": 1.666, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.808, |
|
"grad_norm": 1.1074283123016357, |
|
"learning_rate": 0.00026383999999999994, |
|
"loss": 1.7136, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.8159999999999998, |
|
"grad_norm": 1.0644099712371826, |
|
"learning_rate": 0.00026367999999999997, |
|
"loss": 1.7343, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.8239999999999998, |
|
"grad_norm": 1.1833316087722778, |
|
"learning_rate": 0.00026351999999999994, |
|
"loss": 1.6886, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.8319999999999999, |
|
"grad_norm": 1.2397806644439697, |
|
"learning_rate": 0.00026335999999999997, |
|
"loss": 1.7136, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.8399999999999999, |
|
"grad_norm": 0.6187876462936401, |
|
"learning_rate": 0.00026319999999999995, |
|
"loss": 1.5873, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.8479999999999999, |
|
"grad_norm": 1.141440749168396, |
|
"learning_rate": 0.00026304, |
|
"loss": 1.6937, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.8559999999999999, |
|
"grad_norm": 1.1803792715072632, |
|
"learning_rate": 0.00026287999999999995, |
|
"loss": 1.8136, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 1.8639999999999999, |
|
"grad_norm": 1.1979511976242065, |
|
"learning_rate": 0.00026272, |
|
"loss": 1.7426, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 1.8719999999999999, |
|
"grad_norm": 1.1708755493164062, |
|
"learning_rate": 0.00026255999999999995, |
|
"loss": 1.7532, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 1.0788543224334717, |
|
"learning_rate": 0.0002624, |
|
"loss": 1.7435, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.888, |
|
"grad_norm": 1.1670109033584595, |
|
"learning_rate": 0.00026223999999999996, |
|
"loss": 1.7338, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 1.896, |
|
"grad_norm": 1.1337978839874268, |
|
"learning_rate": 0.00026208, |
|
"loss": 1.7508, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 1.904, |
|
"grad_norm": 1.131404995918274, |
|
"learning_rate": 0.00026191999999999996, |
|
"loss": 1.7321, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.912, |
|
"grad_norm": 1.1655117273330688, |
|
"learning_rate": 0.00026176, |
|
"loss": 1.7629, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 1.1582902669906616, |
|
"learning_rate": 0.00026159999999999996, |
|
"loss": 1.7083, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.928, |
|
"grad_norm": 1.181884765625, |
|
"learning_rate": 0.00026143999999999994, |
|
"loss": 1.6752, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 1.936, |
|
"grad_norm": 1.1487571001052856, |
|
"learning_rate": 0.00026127999999999996, |
|
"loss": 1.6785, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 1.944, |
|
"grad_norm": 1.2264763116836548, |
|
"learning_rate": 0.00026111999999999994, |
|
"loss": 1.7237, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.952, |
|
"grad_norm": 1.285232424736023, |
|
"learning_rate": 0.00026095999999999997, |
|
"loss": 1.7148, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 1.2243884801864624, |
|
"learning_rate": 0.00026079999999999994, |
|
"loss": 1.7305, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.968, |
|
"grad_norm": 1.2969841957092285, |
|
"learning_rate": 0.00026063999999999997, |
|
"loss": 1.7271, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.976, |
|
"grad_norm": 1.1378511190414429, |
|
"learning_rate": 0.00026047999999999995, |
|
"loss": 1.7425, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 1.984, |
|
"grad_norm": 1.1501156091690063, |
|
"learning_rate": 0.00026032, |
|
"loss": 1.712, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 1.992, |
|
"grad_norm": 1.2089420557022095, |
|
"learning_rate": 0.00026015999999999995, |
|
"loss": 1.7164, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.1784271001815796, |
|
"learning_rate": 0.00026, |
|
"loss": 1.736, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.008, |
|
"grad_norm": 1.1402006149291992, |
|
"learning_rate": 0.00025983999999999995, |
|
"loss": 1.5834, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 2.016, |
|
"grad_norm": 1.1883610486984253, |
|
"learning_rate": 0.00025968, |
|
"loss": 1.5888, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 2.024, |
|
"grad_norm": 1.3511167764663696, |
|
"learning_rate": 0.00025951999999999995, |
|
"loss": 1.5551, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 2.032, |
|
"grad_norm": 1.2824231386184692, |
|
"learning_rate": 0.00025936, |
|
"loss": 1.5119, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 1.3076261281967163, |
|
"learning_rate": 0.00025919999999999996, |
|
"loss": 1.5044, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.048, |
|
"grad_norm": 1.3731348514556885, |
|
"learning_rate": 0.00025904, |
|
"loss": 1.5585, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 2.056, |
|
"grad_norm": 1.3174951076507568, |
|
"learning_rate": 0.00025887999999999996, |
|
"loss": 1.5591, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 2.064, |
|
"grad_norm": 1.3365912437438965, |
|
"learning_rate": 0.00025872, |
|
"loss": 1.5357, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 2.072, |
|
"grad_norm": 1.3091166019439697, |
|
"learning_rate": 0.00025855999999999996, |
|
"loss": 1.5232, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 1.372147560119629, |
|
"learning_rate": 0.00025839999999999994, |
|
"loss": 1.5427, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.088, |
|
"grad_norm": 1.412030577659607, |
|
"learning_rate": 0.00025823999999999997, |
|
"loss": 1.5851, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 2.096, |
|
"grad_norm": 1.3613168001174927, |
|
"learning_rate": 0.00025807999999999994, |
|
"loss": 1.5283, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 2.104, |
|
"grad_norm": 1.3381197452545166, |
|
"learning_rate": 0.00025791999999999997, |
|
"loss": 1.5812, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 2.112, |
|
"grad_norm": 1.3254282474517822, |
|
"learning_rate": 0.00025775999999999994, |
|
"loss": 1.5016, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 1.410579800605774, |
|
"learning_rate": 0.0002576, |
|
"loss": 1.5876, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.128, |
|
"grad_norm": 1.3652007579803467, |
|
"learning_rate": 0.00025743999999999995, |
|
"loss": 1.6116, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 2.136, |
|
"grad_norm": 1.4137206077575684, |
|
"learning_rate": 0.00025728, |
|
"loss": 1.5736, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 2.144, |
|
"grad_norm": 1.3824632167816162, |
|
"learning_rate": 0.00025711999999999995, |
|
"loss": 1.5523, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 2.152, |
|
"grad_norm": 1.4239449501037598, |
|
"learning_rate": 0.00025696, |
|
"loss": 1.5221, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 1.4395557641983032, |
|
"learning_rate": 0.00025679999999999995, |
|
"loss": 1.5161, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.168, |
|
"grad_norm": 1.5318158864974976, |
|
"learning_rate": 0.00025664, |
|
"loss": 1.5784, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 2.176, |
|
"grad_norm": 1.3760408163070679, |
|
"learning_rate": 0.00025647999999999996, |
|
"loss": 1.567, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 2.184, |
|
"grad_norm": 1.4429463148117065, |
|
"learning_rate": 0.00025632, |
|
"loss": 1.5653, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 2.192, |
|
"grad_norm": 1.424181342124939, |
|
"learning_rate": 0.00025615999999999996, |
|
"loss": 1.5378, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 1.4517723321914673, |
|
"learning_rate": 0.000256, |
|
"loss": 1.5738, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.208, |
|
"grad_norm": 1.455818772315979, |
|
"learning_rate": 0.00025583999999999996, |
|
"loss": 1.5117, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 2.216, |
|
"grad_norm": 1.3915988206863403, |
|
"learning_rate": 0.00025567999999999994, |
|
"loss": 1.5682, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 2.224, |
|
"grad_norm": 1.3807563781738281, |
|
"learning_rate": 0.00025551999999999997, |
|
"loss": 1.5618, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 2.232, |
|
"grad_norm": 1.3413660526275635, |
|
"learning_rate": 0.00025535999999999994, |
|
"loss": 1.5724, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 1.3377097845077515, |
|
"learning_rate": 0.00025519999999999997, |
|
"loss": 1.5662, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.248, |
|
"grad_norm": 1.3897491693496704, |
|
"learning_rate": 0.00025503999999999994, |
|
"loss": 1.5431, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 2.2560000000000002, |
|
"grad_norm": 1.4780755043029785, |
|
"learning_rate": 0.00025487999999999997, |
|
"loss": 1.5657, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 2.2640000000000002, |
|
"grad_norm": 1.3818844556808472, |
|
"learning_rate": 0.00025471999999999995, |
|
"loss": 1.5399, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 2.2720000000000002, |
|
"grad_norm": 1.4447002410888672, |
|
"learning_rate": 0.00025456, |
|
"loss": 1.5729, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 2.2800000000000002, |
|
"grad_norm": 1.381330132484436, |
|
"learning_rate": 0.00025439999999999995, |
|
"loss": 1.52, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.288, |
|
"grad_norm": 1.418094277381897, |
|
"learning_rate": 0.00025424, |
|
"loss": 1.5954, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 2.296, |
|
"grad_norm": 1.329988718032837, |
|
"learning_rate": 0.00025407999999999995, |
|
"loss": 1.5852, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 2.304, |
|
"grad_norm": 1.3431826829910278, |
|
"learning_rate": 0.00025392, |
|
"loss": 1.5482, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 2.312, |
|
"grad_norm": 1.4532684087753296, |
|
"learning_rate": 0.00025375999999999996, |
|
"loss": 1.4967, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 1.3491160869598389, |
|
"learning_rate": 0.0002536, |
|
"loss": 1.5145, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.328, |
|
"grad_norm": 1.3651959896087646, |
|
"learning_rate": 0.00025343999999999996, |
|
"loss": 1.5726, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 2.336, |
|
"grad_norm": 1.4137355089187622, |
|
"learning_rate": 0.00025328, |
|
"loss": 1.5646, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 2.344, |
|
"grad_norm": 1.4950937032699585, |
|
"learning_rate": 0.00025311999999999996, |
|
"loss": 1.5653, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 2.352, |
|
"grad_norm": 1.3360849618911743, |
|
"learning_rate": 0.00025295999999999994, |
|
"loss": 1.5669, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 1.4283881187438965, |
|
"learning_rate": 0.00025279999999999996, |
|
"loss": 1.5729, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.368, |
|
"grad_norm": 1.40790855884552, |
|
"learning_rate": 0.00025263999999999994, |
|
"loss": 1.54, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 2.376, |
|
"grad_norm": 1.5222750902175903, |
|
"learning_rate": 0.00025247999999999997, |
|
"loss": 1.6079, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 2.384, |
|
"grad_norm": 1.424391746520996, |
|
"learning_rate": 0.00025231999999999994, |
|
"loss": 1.5835, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 2.392, |
|
"grad_norm": 1.4419969320297241, |
|
"learning_rate": 0.00025215999999999997, |
|
"loss": 1.6546, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 1.3506951332092285, |
|
"learning_rate": 0.00025199999999999995, |
|
"loss": 1.5856, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.408, |
|
"grad_norm": 1.3341702222824097, |
|
"learning_rate": 0.00025184, |
|
"loss": 1.5933, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 2.416, |
|
"grad_norm": 1.3723673820495605, |
|
"learning_rate": 0.00025167999999999995, |
|
"loss": 1.602, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 2.424, |
|
"grad_norm": 1.4064242839813232, |
|
"learning_rate": 0.00025152, |
|
"loss": 1.5438, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 2.432, |
|
"grad_norm": 1.3700838088989258, |
|
"learning_rate": 0.00025135999999999995, |
|
"loss": 1.6322, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 1.4045076370239258, |
|
"learning_rate": 0.0002512, |
|
"loss": 1.5728, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 2.448, |
|
"grad_norm": 1.4885849952697754, |
|
"learning_rate": 0.00025103999999999995, |
|
"loss": 1.6177, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 2.456, |
|
"grad_norm": 1.4054323434829712, |
|
"learning_rate": 0.00025088, |
|
"loss": 1.5579, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 2.464, |
|
"grad_norm": 1.4171288013458252, |
|
"learning_rate": 0.00025071999999999996, |
|
"loss": 1.6058, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 2.472, |
|
"grad_norm": 1.3950269222259521, |
|
"learning_rate": 0.00025056, |
|
"loss": 1.5906, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 1.3375904560089111, |
|
"learning_rate": 0.00025039999999999996, |
|
"loss": 1.5984, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.488, |
|
"grad_norm": 1.3980008363723755, |
|
"learning_rate": 0.00025024, |
|
"loss": 1.5724, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 2.496, |
|
"grad_norm": 1.3917794227600098, |
|
"learning_rate": 0.00025007999999999996, |
|
"loss": 1.6085, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 2.504, |
|
"grad_norm": 1.3524688482284546, |
|
"learning_rate": 0.00024991999999999994, |
|
"loss": 1.5734, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 2.512, |
|
"grad_norm": 1.4597851037979126, |
|
"learning_rate": 0.00024975999999999997, |
|
"loss": 1.6001, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 1.4018633365631104, |
|
"learning_rate": 0.00024959999999999994, |
|
"loss": 1.5761, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 2.528, |
|
"grad_norm": 1.414162278175354, |
|
"learning_rate": 0.00024943999999999997, |
|
"loss": 1.5799, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 2.536, |
|
"grad_norm": 1.3470393419265747, |
|
"learning_rate": 0.00024927999999999994, |
|
"loss": 1.5905, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 2.544, |
|
"grad_norm": 1.350521445274353, |
|
"learning_rate": 0.00024912, |
|
"loss": 1.605, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 2.552, |
|
"grad_norm": 1.4013463258743286, |
|
"learning_rate": 0.00024895999999999995, |
|
"loss": 1.5653, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 1.3292449712753296, |
|
"learning_rate": 0.0002488, |
|
"loss": 1.5761, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.568, |
|
"grad_norm": 1.2734830379486084, |
|
"learning_rate": 0.00024863999999999995, |
|
"loss": 1.6476, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 2.576, |
|
"grad_norm": 1.4279605150222778, |
|
"learning_rate": 0.00024848, |
|
"loss": 1.5671, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 2.584, |
|
"grad_norm": 1.4233906269073486, |
|
"learning_rate": 0.00024831999999999995, |
|
"loss": 1.5653, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 2.592, |
|
"grad_norm": 1.393425703048706, |
|
"learning_rate": 0.00024816, |
|
"loss": 1.5605, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 1.4003586769104004, |
|
"learning_rate": 0.00024799999999999996, |
|
"loss": 1.5775, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 2.608, |
|
"grad_norm": 1.3909311294555664, |
|
"learning_rate": 0.00024784, |
|
"loss": 1.599, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 2.616, |
|
"grad_norm": 1.3618372678756714, |
|
"learning_rate": 0.00024767999999999996, |
|
"loss": 1.5981, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 2.624, |
|
"grad_norm": 1.3769896030426025, |
|
"learning_rate": 0.00024752, |
|
"loss": 1.6391, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 2.632, |
|
"grad_norm": 1.2977598905563354, |
|
"learning_rate": 0.00024735999999999996, |
|
"loss": 1.6043, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 1.3508882522583008, |
|
"learning_rate": 0.0002472, |
|
"loss": 1.5432, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.648, |
|
"grad_norm": 1.411916732788086, |
|
"learning_rate": 0.00024703999999999997, |
|
"loss": 1.5889, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 2.656, |
|
"grad_norm": 1.3107311725616455, |
|
"learning_rate": 0.00024687999999999994, |
|
"loss": 1.5618, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 2.664, |
|
"grad_norm": 1.383974552154541, |
|
"learning_rate": 0.00024671999999999997, |
|
"loss": 1.5893, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 2.672, |
|
"grad_norm": 1.3793646097183228, |
|
"learning_rate": 0.00024655999999999994, |
|
"loss": 1.5901, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 1.405423879623413, |
|
"learning_rate": 0.00024639999999999997, |
|
"loss": 1.638, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 2.6879999999999997, |
|
"grad_norm": 1.3815405368804932, |
|
"learning_rate": 0.00024623999999999995, |
|
"loss": 1.6037, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 2.6959999999999997, |
|
"grad_norm": 1.297813057899475, |
|
"learning_rate": 0.00024608, |
|
"loss": 1.5632, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 2.7039999999999997, |
|
"grad_norm": 1.3591309785842896, |
|
"learning_rate": 0.00024591999999999995, |
|
"loss": 1.5892, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 2.7119999999999997, |
|
"grad_norm": 1.4379678964614868, |
|
"learning_rate": 0.00024576, |
|
"loss": 1.6029, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 2.7199999999999998, |
|
"grad_norm": 1.4956458806991577, |
|
"learning_rate": 0.00024559999999999995, |
|
"loss": 1.5974, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.7279999999999998, |
|
"grad_norm": 1.4072085618972778, |
|
"learning_rate": 0.00024544, |
|
"loss": 1.6431, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 2.7359999999999998, |
|
"grad_norm": 1.28607177734375, |
|
"learning_rate": 0.00024527999999999996, |
|
"loss": 1.5868, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 2.7439999999999998, |
|
"grad_norm": 1.5061297416687012, |
|
"learning_rate": 0.00024512, |
|
"loss": 1.613, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 2.752, |
|
"grad_norm": 1.4274139404296875, |
|
"learning_rate": 0.00024495999999999996, |
|
"loss": 1.5507, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 1.4335947036743164, |
|
"learning_rate": 0.0002448, |
|
"loss": 1.5557, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 2.768, |
|
"grad_norm": 1.3052548170089722, |
|
"learning_rate": 0.00024463999999999996, |
|
"loss": 1.5779, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 2.776, |
|
"grad_norm": 1.2695350646972656, |
|
"learning_rate": 0.00024448, |
|
"loss": 1.5587, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 2.784, |
|
"grad_norm": 1.4060364961624146, |
|
"learning_rate": 0.00024431999999999996, |
|
"loss": 1.6129, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 2.792, |
|
"grad_norm": 1.4803110361099243, |
|
"learning_rate": 0.00024416, |
|
"loss": 1.6236, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 1.353215217590332, |
|
"learning_rate": 0.000244, |
|
"loss": 1.6323, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.808, |
|
"grad_norm": 1.3456429243087769, |
|
"learning_rate": 0.00024383999999999997, |
|
"loss": 1.6635, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 2.816, |
|
"grad_norm": 1.4098529815673828, |
|
"learning_rate": 0.00024368, |
|
"loss": 1.5523, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 2.824, |
|
"grad_norm": 1.5074928998947144, |
|
"learning_rate": 0.00024351999999999997, |
|
"loss": 1.5736, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 2.832, |
|
"grad_norm": 1.4895234107971191, |
|
"learning_rate": 0.00024336, |
|
"loss": 1.6004, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 1.370694875717163, |
|
"learning_rate": 0.00024319999999999998, |
|
"loss": 1.5812, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 2.848, |
|
"grad_norm": 1.3541662693023682, |
|
"learning_rate": 0.00024303999999999998, |
|
"loss": 1.6059, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 2.856, |
|
"grad_norm": 1.3392258882522583, |
|
"learning_rate": 0.00024287999999999998, |
|
"loss": 1.5564, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 2.864, |
|
"grad_norm": 1.4230504035949707, |
|
"learning_rate": 0.00024271999999999998, |
|
"loss": 1.6174, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 2.872, |
|
"grad_norm": 1.3360211849212646, |
|
"learning_rate": 0.00024255999999999998, |
|
"loss": 1.5281, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 1.4017730951309204, |
|
"learning_rate": 0.00024239999999999998, |
|
"loss": 1.5762, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.888, |
|
"grad_norm": 1.4613922834396362, |
|
"learning_rate": 0.00024223999999999998, |
|
"loss": 1.6045, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 2.896, |
|
"grad_norm": 1.458549976348877, |
|
"learning_rate": 0.00024207999999999996, |
|
"loss": 1.5711, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 2.904, |
|
"grad_norm": 1.4020884037017822, |
|
"learning_rate": 0.00024192, |
|
"loss": 1.5384, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 2.912, |
|
"grad_norm": 1.306881308555603, |
|
"learning_rate": 0.00024175999999999996, |
|
"loss": 1.6462, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 1.417031168937683, |
|
"learning_rate": 0.0002416, |
|
"loss": 1.6108, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 2.928, |
|
"grad_norm": 1.3262524604797363, |
|
"learning_rate": 0.00024143999999999997, |
|
"loss": 1.6286, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 2.936, |
|
"grad_norm": 1.4688752889633179, |
|
"learning_rate": 0.00024128, |
|
"loss": 1.6187, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 2.944, |
|
"grad_norm": 1.413116216659546, |
|
"learning_rate": 0.00024111999999999997, |
|
"loss": 1.5804, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 2.952, |
|
"grad_norm": 1.3572710752487183, |
|
"learning_rate": 0.00024096, |
|
"loss": 1.5947, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 1.4315195083618164, |
|
"learning_rate": 0.00024079999999999997, |
|
"loss": 1.5673, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.968, |
|
"grad_norm": 1.455693244934082, |
|
"learning_rate": 0.00024064, |
|
"loss": 1.6311, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 2.976, |
|
"grad_norm": 1.4192560911178589, |
|
"learning_rate": 0.00024047999999999997, |
|
"loss": 1.604, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 2.984, |
|
"grad_norm": 1.4426895380020142, |
|
"learning_rate": 0.00024032, |
|
"loss": 1.6163, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 2.992, |
|
"grad_norm": 1.3536198139190674, |
|
"learning_rate": 0.00024015999999999998, |
|
"loss": 1.6009, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.4080064296722412, |
|
"learning_rate": 0.00023999999999999998, |
|
"loss": 1.5628, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 3.008, |
|
"grad_norm": 1.457725167274475, |
|
"learning_rate": 0.00023983999999999998, |
|
"loss": 1.3987, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 3.016, |
|
"grad_norm": 1.5731124877929688, |
|
"learning_rate": 0.00023967999999999998, |
|
"loss": 1.3801, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 3.024, |
|
"grad_norm": 1.6412163972854614, |
|
"learning_rate": 0.00023951999999999998, |
|
"loss": 1.3839, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 3.032, |
|
"grad_norm": 1.596416711807251, |
|
"learning_rate": 0.00023935999999999996, |
|
"loss": 1.3565, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 1.645661473274231, |
|
"learning_rate": 0.0002392, |
|
"loss": 1.3738, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3.048, |
|
"grad_norm": 1.6392892599105835, |
|
"learning_rate": 0.00023903999999999996, |
|
"loss": 1.4528, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 3.056, |
|
"grad_norm": 1.6278529167175293, |
|
"learning_rate": 0.00023888, |
|
"loss": 1.3921, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 3.064, |
|
"grad_norm": 1.5314973592758179, |
|
"learning_rate": 0.00023871999999999996, |
|
"loss": 1.3978, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 3.072, |
|
"grad_norm": 1.648292899131775, |
|
"learning_rate": 0.00023856, |
|
"loss": 1.3825, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 1.5989805459976196, |
|
"learning_rate": 0.00023839999999999997, |
|
"loss": 1.4229, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 3.088, |
|
"grad_norm": 1.581239104270935, |
|
"learning_rate": 0.00023824, |
|
"loss": 1.4038, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 3.096, |
|
"grad_norm": 1.6047204732894897, |
|
"learning_rate": 0.00023807999999999997, |
|
"loss": 1.3652, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 3.104, |
|
"grad_norm": 1.6853421926498413, |
|
"learning_rate": 0.00023792, |
|
"loss": 1.4256, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 3.112, |
|
"grad_norm": 1.531514048576355, |
|
"learning_rate": 0.00023775999999999997, |
|
"loss": 1.4061, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 1.6761980056762695, |
|
"learning_rate": 0.0002376, |
|
"loss": 1.4205, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 3.128, |
|
"grad_norm": 1.6190063953399658, |
|
"learning_rate": 0.00023743999999999998, |
|
"loss": 1.434, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 3.136, |
|
"grad_norm": 1.6470814943313599, |
|
"learning_rate": 0.00023728, |
|
"loss": 1.42, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 3.144, |
|
"grad_norm": 1.5572431087493896, |
|
"learning_rate": 0.00023711999999999998, |
|
"loss": 1.418, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 3.152, |
|
"grad_norm": 1.6244536638259888, |
|
"learning_rate": 0.00023695999999999998, |
|
"loss": 1.4415, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 1.7610841989517212, |
|
"learning_rate": 0.00023679999999999998, |
|
"loss": 1.4249, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 3.168, |
|
"grad_norm": 1.6935431957244873, |
|
"learning_rate": 0.00023663999999999996, |
|
"loss": 1.4373, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 3.176, |
|
"grad_norm": 1.6628581285476685, |
|
"learning_rate": 0.00023647999999999999, |
|
"loss": 1.4168, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 3.184, |
|
"grad_norm": 1.6654530763626099, |
|
"learning_rate": 0.00023631999999999996, |
|
"loss": 1.4028, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 3.192, |
|
"grad_norm": 1.6582281589508057, |
|
"learning_rate": 0.00023616, |
|
"loss": 1.4062, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 1.7206676006317139, |
|
"learning_rate": 0.00023599999999999996, |
|
"loss": 1.3972, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.208, |
|
"grad_norm": 1.6492377519607544, |
|
"learning_rate": 0.000235856, |
|
"loss": 1.4392, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 3.216, |
|
"grad_norm": 1.8048959970474243, |
|
"learning_rate": 0.00023569599999999997, |
|
"loss": 1.4523, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 3.224, |
|
"grad_norm": 1.644860029220581, |
|
"learning_rate": 0.000235536, |
|
"loss": 1.4168, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 3.232, |
|
"grad_norm": 1.666577696800232, |
|
"learning_rate": 0.00023537599999999998, |
|
"loss": 1.5143, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 1.6702250242233276, |
|
"learning_rate": 0.000235216, |
|
"loss": 1.4573, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 3.248, |
|
"grad_norm": 1.6784839630126953, |
|
"learning_rate": 0.00023505599999999998, |
|
"loss": 1.4388, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 3.2560000000000002, |
|
"grad_norm": 1.6922202110290527, |
|
"learning_rate": 0.000234896, |
|
"loss": 1.4481, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 3.2640000000000002, |
|
"grad_norm": 1.5728070735931396, |
|
"learning_rate": 0.00023473599999999998, |
|
"loss": 1.4358, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 3.2720000000000002, |
|
"grad_norm": 1.6510705947875977, |
|
"learning_rate": 0.00023457599999999996, |
|
"loss": 1.4615, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 3.2800000000000002, |
|
"grad_norm": 1.655216097831726, |
|
"learning_rate": 0.00023441599999999999, |
|
"loss": 1.4087, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 3.288, |
|
"grad_norm": 1.6214429140090942, |
|
"learning_rate": 0.00023425599999999996, |
|
"loss": 1.3582, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 3.296, |
|
"grad_norm": 1.5415211915969849, |
|
"learning_rate": 0.000234096, |
|
"loss": 1.438, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 3.304, |
|
"grad_norm": 1.6406790018081665, |
|
"learning_rate": 0.00023393599999999996, |
|
"loss": 1.4781, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 3.312, |
|
"grad_norm": 1.7424193620681763, |
|
"learning_rate": 0.000233776, |
|
"loss": 1.447, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 1.5324851274490356, |
|
"learning_rate": 0.00023361599999999997, |
|
"loss": 1.4415, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 3.328, |
|
"grad_norm": 1.6945812702178955, |
|
"learning_rate": 0.000233456, |
|
"loss": 1.4761, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 3.336, |
|
"grad_norm": 1.7372822761535645, |
|
"learning_rate": 0.00023329599999999997, |
|
"loss": 1.4504, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 3.344, |
|
"grad_norm": 1.6869308948516846, |
|
"learning_rate": 0.000233136, |
|
"loss": 1.4623, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 3.352, |
|
"grad_norm": 1.7480005025863647, |
|
"learning_rate": 0.00023299199999999998, |
|
"loss": 1.4184, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 1.570494532585144, |
|
"learning_rate": 0.00023283199999999996, |
|
"loss": 1.3832, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 3.368, |
|
"grad_norm": 1.8143585920333862, |
|
"learning_rate": 0.00023267199999999998, |
|
"loss": 1.4252, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 3.376, |
|
"grad_norm": 1.571781039237976, |
|
"learning_rate": 0.00023251199999999996, |
|
"loss": 1.4336, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 3.384, |
|
"grad_norm": 1.6962851285934448, |
|
"learning_rate": 0.000232352, |
|
"loss": 1.4297, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 3.392, |
|
"grad_norm": 1.6035798788070679, |
|
"learning_rate": 0.00023219199999999996, |
|
"loss": 1.451, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 1.6665290594100952, |
|
"learning_rate": 0.000232032, |
|
"loss": 1.4281, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 3.408, |
|
"grad_norm": 1.8126115798950195, |
|
"learning_rate": 0.00023187199999999996, |
|
"loss": 1.4711, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 3.416, |
|
"grad_norm": 1.6531234979629517, |
|
"learning_rate": 0.000231712, |
|
"loss": 1.4257, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 3.424, |
|
"grad_norm": 1.7809317111968994, |
|
"learning_rate": 0.00023155199999999997, |
|
"loss": 1.4205, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 3.432, |
|
"grad_norm": 1.8113545179367065, |
|
"learning_rate": 0.000231392, |
|
"loss": 1.4269, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 1.6733276844024658, |
|
"learning_rate": 0.00023123199999999997, |
|
"loss": 1.4178, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 3.448, |
|
"grad_norm": 1.6381465196609497, |
|
"learning_rate": 0.000231072, |
|
"loss": 1.4926, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 3.456, |
|
"grad_norm": 1.6401630640029907, |
|
"learning_rate": 0.00023091199999999997, |
|
"loss": 1.4817, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 3.464, |
|
"grad_norm": 1.628445029258728, |
|
"learning_rate": 0.000230752, |
|
"loss": 1.4935, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 3.472, |
|
"grad_norm": 1.632957935333252, |
|
"learning_rate": 0.00023059199999999998, |
|
"loss": 1.4787, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 1.7037655115127563, |
|
"learning_rate": 0.000230432, |
|
"loss": 1.4547, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 3.488, |
|
"grad_norm": 1.7122141122817993, |
|
"learning_rate": 0.00023027199999999998, |
|
"loss": 1.4314, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 3.496, |
|
"grad_norm": 1.620144248008728, |
|
"learning_rate": 0.000230112, |
|
"loss": 1.4198, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 3.504, |
|
"grad_norm": 1.604781150817871, |
|
"learning_rate": 0.00022995199999999998, |
|
"loss": 1.4648, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 3.512, |
|
"grad_norm": 1.6798850297927856, |
|
"learning_rate": 0.00022979199999999996, |
|
"loss": 1.4556, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 1.7711225748062134, |
|
"learning_rate": 0.00022963199999999999, |
|
"loss": 1.4741, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 3.528, |
|
"grad_norm": 1.7546377182006836, |
|
"learning_rate": 0.00022947199999999996, |
|
"loss": 1.4186, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 3.536, |
|
"grad_norm": 1.5374271869659424, |
|
"learning_rate": 0.000229312, |
|
"loss": 1.4726, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 3.544, |
|
"grad_norm": 1.691049575805664, |
|
"learning_rate": 0.00022915199999999996, |
|
"loss": 1.4302, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 3.552, |
|
"grad_norm": 1.8386030197143555, |
|
"learning_rate": 0.000228992, |
|
"loss": 1.4855, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 1.659847617149353, |
|
"learning_rate": 0.00022883199999999997, |
|
"loss": 1.4818, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 3.568, |
|
"grad_norm": 1.693474292755127, |
|
"learning_rate": 0.000228672, |
|
"loss": 1.4226, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 3.576, |
|
"grad_norm": 1.6777136325836182, |
|
"learning_rate": 0.00022851199999999997, |
|
"loss": 1.4819, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 3.584, |
|
"grad_norm": 1.6764864921569824, |
|
"learning_rate": 0.000228352, |
|
"loss": 1.4802, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 3.592, |
|
"grad_norm": 1.790226697921753, |
|
"learning_rate": 0.00022819199999999997, |
|
"loss": 1.5031, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 1.5997536182403564, |
|
"learning_rate": 0.000228032, |
|
"loss": 1.4584, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 3.608, |
|
"grad_norm": 1.6292929649353027, |
|
"learning_rate": 0.00022787199999999998, |
|
"loss": 1.4389, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 3.616, |
|
"grad_norm": 1.6309911012649536, |
|
"learning_rate": 0.000227712, |
|
"loss": 1.4545, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 3.624, |
|
"grad_norm": 1.6208481788635254, |
|
"learning_rate": 0.00022755199999999998, |
|
"loss": 1.4653, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 3.632, |
|
"grad_norm": 1.75088369846344, |
|
"learning_rate": 0.000227392, |
|
"loss": 1.4934, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 1.8166102170944214, |
|
"learning_rate": 0.00022723199999999998, |
|
"loss": 1.4889, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 3.648, |
|
"grad_norm": 1.5575486421585083, |
|
"learning_rate": 0.00022707199999999996, |
|
"loss": 1.4948, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 3.656, |
|
"grad_norm": 1.6632091999053955, |
|
"learning_rate": 0.00022691199999999999, |
|
"loss": 1.4832, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 3.664, |
|
"grad_norm": 1.583553433418274, |
|
"learning_rate": 0.00022675199999999996, |
|
"loss": 1.492, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 3.672, |
|
"grad_norm": 1.668994426727295, |
|
"learning_rate": 0.000226592, |
|
"loss": 1.4978, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 1.515479326248169, |
|
"learning_rate": 0.00022643199999999996, |
|
"loss": 1.4916, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 3.6879999999999997, |
|
"grad_norm": 1.652949333190918, |
|
"learning_rate": 0.000226272, |
|
"loss": 1.4584, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 3.6959999999999997, |
|
"grad_norm": 1.6760021448135376, |
|
"learning_rate": 0.00022611199999999997, |
|
"loss": 1.5198, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 3.7039999999999997, |
|
"grad_norm": 1.5702306032180786, |
|
"learning_rate": 0.000225952, |
|
"loss": 1.4703, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 3.7119999999999997, |
|
"grad_norm": 1.4395867586135864, |
|
"learning_rate": 0.00022579199999999997, |
|
"loss": 1.5072, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 3.7199999999999998, |
|
"grad_norm": 1.5167125463485718, |
|
"learning_rate": 0.000225632, |
|
"loss": 1.473, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 3.7279999999999998, |
|
"grad_norm": 1.6731159687042236, |
|
"learning_rate": 0.00022547199999999997, |
|
"loss": 1.4886, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 3.7359999999999998, |
|
"grad_norm": 1.6189254522323608, |
|
"learning_rate": 0.000225312, |
|
"loss": 1.4699, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 3.7439999999999998, |
|
"grad_norm": 1.5746572017669678, |
|
"learning_rate": 0.00022515199999999997, |
|
"loss": 1.5175, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 3.752, |
|
"grad_norm": 1.7097058296203613, |
|
"learning_rate": 0.000224992, |
|
"loss": 1.4978, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 1.6119595766067505, |
|
"learning_rate": 0.00022483199999999998, |
|
"loss": 1.4492, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 3.768, |
|
"grad_norm": 1.736672043800354, |
|
"learning_rate": 0.000224672, |
|
"loss": 1.4344, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 3.776, |
|
"grad_norm": 1.7588441371917725, |
|
"learning_rate": 0.00022451199999999998, |
|
"loss": 1.4563, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 3.784, |
|
"grad_norm": 1.6483169794082642, |
|
"learning_rate": 0.00022435199999999996, |
|
"loss": 1.525, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 3.792, |
|
"grad_norm": 1.5439528226852417, |
|
"learning_rate": 0.00022419199999999998, |
|
"loss": 1.4263, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 1.5422582626342773, |
|
"learning_rate": 0.00022403199999999996, |
|
"loss": 1.4792, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 3.808, |
|
"grad_norm": 1.580538272857666, |
|
"learning_rate": 0.000223872, |
|
"loss": 1.5005, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 3.816, |
|
"grad_norm": 1.5790603160858154, |
|
"learning_rate": 0.00022371199999999996, |
|
"loss": 1.4742, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 3.824, |
|
"grad_norm": 1.597711443901062, |
|
"learning_rate": 0.000223552, |
|
"loss": 1.4576, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 3.832, |
|
"grad_norm": 1.7034629583358765, |
|
"learning_rate": 0.00022339199999999996, |
|
"loss": 1.4753, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 1.6988534927368164, |
|
"learning_rate": 0.000223232, |
|
"loss": 1.4657, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 3.848, |
|
"grad_norm": 1.5512536764144897, |
|
"learning_rate": 0.00022307199999999997, |
|
"loss": 1.5351, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 3.856, |
|
"grad_norm": 1.6742076873779297, |
|
"learning_rate": 0.000222912, |
|
"loss": 1.5108, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 3.864, |
|
"grad_norm": 1.6922365427017212, |
|
"learning_rate": 0.00022275199999999997, |
|
"loss": 1.4979, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 3.872, |
|
"grad_norm": 1.6407732963562012, |
|
"learning_rate": 0.000222592, |
|
"loss": 1.535, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 1.7077395915985107, |
|
"learning_rate": 0.00022243199999999997, |
|
"loss": 1.4981, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 3.888, |
|
"grad_norm": 1.7150638103485107, |
|
"learning_rate": 0.000222272, |
|
"loss": 1.5042, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 3.896, |
|
"grad_norm": 1.5963282585144043, |
|
"learning_rate": 0.00022211199999999998, |
|
"loss": 1.4823, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 3.904, |
|
"grad_norm": 1.5717283487319946, |
|
"learning_rate": 0.000221952, |
|
"loss": 1.4598, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 3.912, |
|
"grad_norm": 1.6642472743988037, |
|
"learning_rate": 0.00022179199999999998, |
|
"loss": 1.5055, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 1.6903674602508545, |
|
"learning_rate": 0.00022163199999999995, |
|
"loss": 1.4801, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 3.928, |
|
"grad_norm": 1.5324851274490356, |
|
"learning_rate": 0.00022147199999999998, |
|
"loss": 1.4673, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 3.936, |
|
"grad_norm": 1.6329302787780762, |
|
"learning_rate": 0.00022131199999999996, |
|
"loss": 1.5142, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 3.944, |
|
"grad_norm": 1.74014413356781, |
|
"learning_rate": 0.00022115199999999999, |
|
"loss": 1.5035, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 3.952, |
|
"grad_norm": 1.7019540071487427, |
|
"learning_rate": 0.00022099199999999996, |
|
"loss": 1.4721, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 1.8085479736328125, |
|
"learning_rate": 0.000220832, |
|
"loss": 1.5223, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 3.968, |
|
"grad_norm": 1.5533138513565063, |
|
"learning_rate": 0.00022067199999999996, |
|
"loss": 1.5005, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 3.976, |
|
"grad_norm": 1.5848819017410278, |
|
"learning_rate": 0.000220512, |
|
"loss": 1.4882, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 3.984, |
|
"grad_norm": 1.7058250904083252, |
|
"learning_rate": 0.00022035199999999997, |
|
"loss": 1.5222, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 3.992, |
|
"grad_norm": 1.6337260007858276, |
|
"learning_rate": 0.000220192, |
|
"loss": 1.5153, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 1.6282905340194702, |
|
"learning_rate": 0.00022003199999999997, |
|
"loss": 1.5281, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 4.008, |
|
"grad_norm": 1.752145528793335, |
|
"learning_rate": 0.000219872, |
|
"loss": 1.2938, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 4.016, |
|
"grad_norm": 1.7552149295806885, |
|
"learning_rate": 0.00021971199999999997, |
|
"loss": 1.2279, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 4.024, |
|
"grad_norm": 1.7727724313735962, |
|
"learning_rate": 0.000219552, |
|
"loss": 1.2764, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 4.032, |
|
"grad_norm": 1.9101881980895996, |
|
"learning_rate": 0.00021939199999999998, |
|
"loss": 1.2379, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 1.8460677862167358, |
|
"learning_rate": 0.000219232, |
|
"loss": 1.2328, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 4.048, |
|
"grad_norm": 1.869718313217163, |
|
"learning_rate": 0.00021907199999999998, |
|
"loss": 1.3119, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 4.056, |
|
"grad_norm": 1.789228916168213, |
|
"learning_rate": 0.00021891199999999995, |
|
"loss": 1.3397, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 4.064, |
|
"grad_norm": 1.9012354612350464, |
|
"learning_rate": 0.00021875199999999998, |
|
"loss": 1.2709, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 4.072, |
|
"grad_norm": 1.9376635551452637, |
|
"learning_rate": 0.00021859199999999996, |
|
"loss": 1.3102, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 2.017179489135742, |
|
"learning_rate": 0.00021843199999999998, |
|
"loss": 1.2612, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 4.088, |
|
"grad_norm": 1.9067007303237915, |
|
"learning_rate": 0.00021827199999999996, |
|
"loss": 1.2686, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 4.096, |
|
"grad_norm": 1.8910828828811646, |
|
"learning_rate": 0.000218112, |
|
"loss": 1.3517, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 4.104, |
|
"grad_norm": 2.0437941551208496, |
|
"learning_rate": 0.00021795199999999996, |
|
"loss": 1.3021, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 4.112, |
|
"grad_norm": 1.896239995956421, |
|
"learning_rate": 0.000217792, |
|
"loss": 1.2559, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 1.965014100074768, |
|
"learning_rate": 0.00021763199999999997, |
|
"loss": 1.2925, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 4.128, |
|
"grad_norm": 1.9773045778274536, |
|
"learning_rate": 0.000217472, |
|
"loss": 1.2994, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 4.136, |
|
"grad_norm": 1.7610217332839966, |
|
"learning_rate": 0.00021731199999999997, |
|
"loss": 1.2904, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 4.144, |
|
"grad_norm": 2.0166215896606445, |
|
"learning_rate": 0.000217152, |
|
"loss": 1.3171, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 4.152, |
|
"grad_norm": 1.8862032890319824, |
|
"learning_rate": 0.00021699199999999997, |
|
"loss": 1.3267, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 1.7716232538223267, |
|
"learning_rate": 0.000216832, |
|
"loss": 1.3342, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 4.168, |
|
"grad_norm": 1.8332161903381348, |
|
"learning_rate": 0.00021667199999999997, |
|
"loss": 1.2916, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 4.176, |
|
"grad_norm": 1.9238322973251343, |
|
"learning_rate": 0.000216512, |
|
"loss": 1.3271, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 4.184, |
|
"grad_norm": 1.7780416011810303, |
|
"learning_rate": 0.00021635199999999998, |
|
"loss": 1.321, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 4.192, |
|
"grad_norm": 1.985548973083496, |
|
"learning_rate": 0.00021619199999999995, |
|
"loss": 1.3342, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 1.9339617490768433, |
|
"learning_rate": 0.00021603199999999998, |
|
"loss": 1.3496, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 4.208, |
|
"grad_norm": 1.7527296543121338, |
|
"learning_rate": 0.00021587199999999996, |
|
"loss": 1.2671, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 4.216, |
|
"grad_norm": 1.8272658586502075, |
|
"learning_rate": 0.00021571199999999998, |
|
"loss": 1.2471, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 4.224, |
|
"grad_norm": 1.87795090675354, |
|
"learning_rate": 0.00021555199999999996, |
|
"loss": 1.2631, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 4.232, |
|
"grad_norm": 1.9426238536834717, |
|
"learning_rate": 0.000215392, |
|
"loss": 1.2649, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 1.819056510925293, |
|
"learning_rate": 0.00021523199999999996, |
|
"loss": 1.3185, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 4.248, |
|
"grad_norm": 1.9573816061019897, |
|
"learning_rate": 0.000215072, |
|
"loss": 1.3122, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 4.256, |
|
"grad_norm": 1.919756531715393, |
|
"learning_rate": 0.00021491199999999996, |
|
"loss": 1.329, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 4.264, |
|
"grad_norm": 1.9141929149627686, |
|
"learning_rate": 0.000214752, |
|
"loss": 1.2475, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 4.272, |
|
"grad_norm": 2.0552964210510254, |
|
"learning_rate": 0.00021459199999999997, |
|
"loss": 1.3471, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 1.90670645236969, |
|
"learning_rate": 0.000214432, |
|
"loss": 1.3266, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 4.288, |
|
"grad_norm": 2.149916410446167, |
|
"learning_rate": 0.00021427199999999997, |
|
"loss": 1.3484, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 4.296, |
|
"grad_norm": 1.805012822151184, |
|
"learning_rate": 0.000214112, |
|
"loss": 1.3079, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 4.304, |
|
"grad_norm": 1.9589773416519165, |
|
"learning_rate": 0.00021395199999999997, |
|
"loss": 1.36, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 4.312, |
|
"grad_norm": 2.037567138671875, |
|
"learning_rate": 0.000213792, |
|
"loss": 1.2982, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 4.32, |
|
"grad_norm": 1.8047535419464111, |
|
"learning_rate": 0.00021363199999999998, |
|
"loss": 1.3386, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 4.328, |
|
"grad_norm": 1.9072496891021729, |
|
"learning_rate": 0.00021347199999999998, |
|
"loss": 1.2602, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 4.336, |
|
"grad_norm": 1.9491392374038696, |
|
"learning_rate": 0.00021331199999999998, |
|
"loss": 1.3797, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 4.344, |
|
"grad_norm": 2.073835611343384, |
|
"learning_rate": 0.00021315199999999995, |
|
"loss": 1.3107, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 4.352, |
|
"grad_norm": 1.936270833015442, |
|
"learning_rate": 0.00021299199999999998, |
|
"loss": 1.3318, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 1.9866790771484375, |
|
"learning_rate": 0.00021283199999999996, |
|
"loss": 1.273, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 4.368, |
|
"grad_norm": 2.0993947982788086, |
|
"learning_rate": 0.00021267199999999999, |
|
"loss": 1.3157, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 4.376, |
|
"grad_norm": 1.937992811203003, |
|
"learning_rate": 0.00021251199999999996, |
|
"loss": 1.2858, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 4.384, |
|
"grad_norm": 1.7649872303009033, |
|
"learning_rate": 0.000212352, |
|
"loss": 1.3118, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 4.392, |
|
"grad_norm": 1.8896372318267822, |
|
"learning_rate": 0.00021219199999999996, |
|
"loss": 1.3051, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 1.9377533197402954, |
|
"learning_rate": 0.000212032, |
|
"loss": 1.3561, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 4.408, |
|
"grad_norm": 2.079291820526123, |
|
"learning_rate": 0.00021187199999999997, |
|
"loss": 1.3083, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 4.416, |
|
"grad_norm": 2.0022287368774414, |
|
"learning_rate": 0.000211712, |
|
"loss": 1.3287, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 4.424, |
|
"grad_norm": 1.80183744430542, |
|
"learning_rate": 0.00021155199999999997, |
|
"loss": 1.3599, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 4.432, |
|
"grad_norm": 1.9421368837356567, |
|
"learning_rate": 0.000211392, |
|
"loss": 1.3661, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 1.9392564296722412, |
|
"learning_rate": 0.00021123199999999997, |
|
"loss": 1.3463, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 4.448, |
|
"grad_norm": 2.102717638015747, |
|
"learning_rate": 0.000211072, |
|
"loss": 1.3544, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 4.456, |
|
"grad_norm": 1.9294030666351318, |
|
"learning_rate": 0.00021091199999999998, |
|
"loss": 1.3765, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 4.464, |
|
"grad_norm": 1.8542896509170532, |
|
"learning_rate": 0.00021075199999999998, |
|
"loss": 1.3624, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 4.4719999999999995, |
|
"grad_norm": 2.159574031829834, |
|
"learning_rate": 0.00021059199999999998, |
|
"loss": 1.3452, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"grad_norm": 2.136308193206787, |
|
"learning_rate": 0.00021043199999999998, |
|
"loss": 1.3514, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 4.4879999999999995, |
|
"grad_norm": 1.959116816520691, |
|
"learning_rate": 0.00021027199999999998, |
|
"loss": 1.3332, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 4.496, |
|
"grad_norm": 1.9541338682174683, |
|
"learning_rate": 0.00021011199999999996, |
|
"loss": 1.3396, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 4.504, |
|
"grad_norm": 1.9139293432235718, |
|
"learning_rate": 0.00020995199999999998, |
|
"loss": 1.3306, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 4.5120000000000005, |
|
"grad_norm": 2.0729434490203857, |
|
"learning_rate": 0.00020979199999999996, |
|
"loss": 1.3385, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"grad_norm": 1.9547297954559326, |
|
"learning_rate": 0.000209632, |
|
"loss": 1.3378, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 4.5280000000000005, |
|
"grad_norm": 2.0007593631744385, |
|
"learning_rate": 0.00020947199999999996, |
|
"loss": 1.3744, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 4.536, |
|
"grad_norm": 1.841583251953125, |
|
"learning_rate": 0.000209312, |
|
"loss": 1.3461, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 4.5440000000000005, |
|
"grad_norm": 1.950011968612671, |
|
"learning_rate": 0.00020915199999999997, |
|
"loss": 1.3898, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 4.552, |
|
"grad_norm": 1.9242889881134033, |
|
"learning_rate": 0.000208992, |
|
"loss": 1.375, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 4.5600000000000005, |
|
"grad_norm": 2.023679733276367, |
|
"learning_rate": 0.00020883199999999997, |
|
"loss": 1.3547, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 4.568, |
|
"grad_norm": 1.96961510181427, |
|
"learning_rate": 0.000208672, |
|
"loss": 1.33, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 4.576, |
|
"grad_norm": 1.9337737560272217, |
|
"learning_rate": 0.00020851199999999997, |
|
"loss": 1.338, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 4.584, |
|
"grad_norm": 1.9906611442565918, |
|
"learning_rate": 0.000208352, |
|
"loss": 1.3979, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 4.592, |
|
"grad_norm": 1.819471001625061, |
|
"learning_rate": 0.00020819199999999997, |
|
"loss": 1.375, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 1.9368617534637451, |
|
"learning_rate": 0.00020803199999999998, |
|
"loss": 1.3637, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 4.608, |
|
"grad_norm": 1.9653687477111816, |
|
"learning_rate": 0.00020787199999999998, |
|
"loss": 1.3554, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 4.616, |
|
"grad_norm": 1.9763808250427246, |
|
"learning_rate": 0.00020771199999999998, |
|
"loss": 1.3437, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 4.624, |
|
"grad_norm": 1.8649840354919434, |
|
"learning_rate": 0.00020755199999999998, |
|
"loss": 1.3624, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 4.632, |
|
"grad_norm": 1.828291893005371, |
|
"learning_rate": 0.00020739199999999998, |
|
"loss": 1.378, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 1.8722482919692993, |
|
"learning_rate": 0.00020723199999999998, |
|
"loss": 1.3548, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 4.648, |
|
"grad_norm": 2.2012381553649902, |
|
"learning_rate": 0.00020707199999999996, |
|
"loss": 1.3698, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 4.656, |
|
"grad_norm": 1.9233702421188354, |
|
"learning_rate": 0.000206912, |
|
"loss": 1.3798, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 4.664, |
|
"grad_norm": 1.9627357721328735, |
|
"learning_rate": 0.00020675199999999996, |
|
"loss": 1.424, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 4.672, |
|
"grad_norm": 1.8615745306015015, |
|
"learning_rate": 0.000206592, |
|
"loss": 1.3909, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 1.9583446979522705, |
|
"learning_rate": 0.00020643199999999996, |
|
"loss": 1.3946, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 4.688, |
|
"grad_norm": 1.9457666873931885, |
|
"learning_rate": 0.000206272, |
|
"loss": 1.372, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 4.696, |
|
"grad_norm": 1.8619425296783447, |
|
"learning_rate": 0.00020611199999999997, |
|
"loss": 1.3787, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 4.704, |
|
"grad_norm": 1.9508628845214844, |
|
"learning_rate": 0.000205952, |
|
"loss": 1.371, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 4.712, |
|
"grad_norm": 1.7727349996566772, |
|
"learning_rate": 0.00020579199999999997, |
|
"loss": 1.3631, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 1.7935019731521606, |
|
"learning_rate": 0.000205632, |
|
"loss": 1.3514, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 4.728, |
|
"grad_norm": 1.9109313488006592, |
|
"learning_rate": 0.00020547199999999997, |
|
"loss": 1.327, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 4.736, |
|
"grad_norm": 1.9290359020233154, |
|
"learning_rate": 0.00020531199999999997, |
|
"loss": 1.3574, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 4.744, |
|
"grad_norm": 2.15079665184021, |
|
"learning_rate": 0.00020515199999999998, |
|
"loss": 1.3939, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 4.752, |
|
"grad_norm": 2.0457019805908203, |
|
"learning_rate": 0.00020499199999999998, |
|
"loss": 1.3558, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 1.9548970460891724, |
|
"learning_rate": 0.00020483199999999998, |
|
"loss": 1.362, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 4.768, |
|
"grad_norm": 1.791396141052246, |
|
"learning_rate": 0.00020467199999999998, |
|
"loss": 1.33, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 4.776, |
|
"grad_norm": 1.8451635837554932, |
|
"learning_rate": 0.00020451199999999998, |
|
"loss": 1.3507, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 4.784, |
|
"grad_norm": 1.7999178171157837, |
|
"learning_rate": 0.00020435199999999998, |
|
"loss": 1.4174, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 4.792, |
|
"grad_norm": 1.8192893266677856, |
|
"learning_rate": 0.00020419199999999999, |
|
"loss": 1.3481, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 1.9753166437149048, |
|
"learning_rate": 0.00020403199999999996, |
|
"loss": 1.4223, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 4.808, |
|
"grad_norm": 1.8800415992736816, |
|
"learning_rate": 0.000203872, |
|
"loss": 1.3661, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 4.816, |
|
"grad_norm": 1.8040504455566406, |
|
"learning_rate": 0.00020371199999999996, |
|
"loss": 1.3519, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 4.824, |
|
"grad_norm": 1.9058725833892822, |
|
"learning_rate": 0.000203552, |
|
"loss": 1.3973, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 4.832, |
|
"grad_norm": 1.7217756509780884, |
|
"learning_rate": 0.00020339199999999997, |
|
"loss": 1.403, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 1.8864495754241943, |
|
"learning_rate": 0.000203232, |
|
"loss": 1.3468, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 4.848, |
|
"grad_norm": 2.006610870361328, |
|
"learning_rate": 0.00020307199999999997, |
|
"loss": 1.3972, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 4.856, |
|
"grad_norm": 1.9524073600769043, |
|
"learning_rate": 0.000202912, |
|
"loss": 1.4012, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 4.864, |
|
"grad_norm": 1.9322147369384766, |
|
"learning_rate": 0.00020275199999999997, |
|
"loss": 1.3928, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 4.872, |
|
"grad_norm": 1.929335594177246, |
|
"learning_rate": 0.000202592, |
|
"loss": 1.3799, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 1.8158811330795288, |
|
"learning_rate": 0.00020243199999999998, |
|
"loss": 1.3967, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 4.888, |
|
"grad_norm": 1.9702143669128418, |
|
"learning_rate": 0.00020227199999999998, |
|
"loss": 1.3714, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 4.896, |
|
"grad_norm": 1.6967090368270874, |
|
"learning_rate": 0.00020211199999999998, |
|
"loss": 1.3553, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 4.904, |
|
"grad_norm": 1.7388558387756348, |
|
"learning_rate": 0.00020195199999999998, |
|
"loss": 1.4053, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 4.912, |
|
"grad_norm": 1.9453833103179932, |
|
"learning_rate": 0.00020179199999999998, |
|
"loss": 1.3512, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 1.8605188131332397, |
|
"learning_rate": 0.00020163199999999998, |
|
"loss": 1.3376, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 4.928, |
|
"grad_norm": 1.9881434440612793, |
|
"learning_rate": 0.00020147199999999998, |
|
"loss": 1.3877, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 4.936, |
|
"grad_norm": 1.8932327032089233, |
|
"learning_rate": 0.00020131199999999999, |
|
"loss": 1.331, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 4.944, |
|
"grad_norm": 1.8074854612350464, |
|
"learning_rate": 0.000201152, |
|
"loss": 1.4211, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 4.952, |
|
"grad_norm": 1.9307423830032349, |
|
"learning_rate": 0.00020099199999999996, |
|
"loss": 1.3498, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 1.949623942375183, |
|
"learning_rate": 0.000200832, |
|
"loss": 1.3897, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 4.968, |
|
"grad_norm": 1.7373038530349731, |
|
"learning_rate": 0.00020067199999999997, |
|
"loss": 1.3696, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 4.976, |
|
"grad_norm": 1.9628345966339111, |
|
"learning_rate": 0.000200512, |
|
"loss": 1.3667, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 4.984, |
|
"grad_norm": 1.9516173601150513, |
|
"learning_rate": 0.00020035199999999997, |
|
"loss": 1.4143, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 4.992, |
|
"grad_norm": 1.7527846097946167, |
|
"learning_rate": 0.000200192, |
|
"loss": 1.3599, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 1.9414066076278687, |
|
"learning_rate": 0.00020003199999999997, |
|
"loss": 1.3942, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 5.008, |
|
"grad_norm": 1.802935242652893, |
|
"learning_rate": 0.000199872, |
|
"loss": 1.2225, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 5.016, |
|
"grad_norm": 2.1949446201324463, |
|
"learning_rate": 0.00019971199999999997, |
|
"loss": 1.168, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 5.024, |
|
"grad_norm": 2.1667227745056152, |
|
"learning_rate": 0.00019955199999999998, |
|
"loss": 1.2283, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 5.032, |
|
"grad_norm": 2.0180180072784424, |
|
"learning_rate": 0.00019939199999999998, |
|
"loss": 1.1925, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 5.04, |
|
"grad_norm": 2.257992744445801, |
|
"learning_rate": 0.00019923199999999998, |
|
"loss": 1.1695, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 5.048, |
|
"grad_norm": 2.023444890975952, |
|
"learning_rate": 0.00019907199999999998, |
|
"loss": 1.1454, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 5.056, |
|
"grad_norm": 2.1307425498962402, |
|
"learning_rate": 0.00019891199999999998, |
|
"loss": 1.1645, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 5.064, |
|
"grad_norm": 2.018718957901001, |
|
"learning_rate": 0.00019875199999999998, |
|
"loss": 1.1451, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 5.072, |
|
"grad_norm": 2.158968448638916, |
|
"learning_rate": 0.00019859199999999999, |
|
"loss": 1.1819, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 5.08, |
|
"grad_norm": 2.125598907470703, |
|
"learning_rate": 0.000198432, |
|
"loss": 1.1819, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 5.088, |
|
"grad_norm": 2.2982337474823, |
|
"learning_rate": 0.000198272, |
|
"loss": 1.1838, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 5.096, |
|
"grad_norm": 2.3263471126556396, |
|
"learning_rate": 0.000198112, |
|
"loss": 1.176, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 5.104, |
|
"grad_norm": 2.0729761123657227, |
|
"learning_rate": 0.00019795199999999996, |
|
"loss": 1.1237, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 5.112, |
|
"grad_norm": 2.302323579788208, |
|
"learning_rate": 0.000197792, |
|
"loss": 1.1658, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 5.12, |
|
"grad_norm": 2.1555356979370117, |
|
"learning_rate": 0.00019763199999999997, |
|
"loss": 1.1943, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 5.128, |
|
"grad_norm": 2.104564666748047, |
|
"learning_rate": 0.000197472, |
|
"loss": 1.1776, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 5.136, |
|
"grad_norm": 2.101271390914917, |
|
"learning_rate": 0.00019731199999999997, |
|
"loss": 1.2107, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 5.144, |
|
"grad_norm": 2.1387553215026855, |
|
"learning_rate": 0.000197152, |
|
"loss": 1.1662, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 5.152, |
|
"grad_norm": 1.9566245079040527, |
|
"learning_rate": 0.00019699199999999997, |
|
"loss": 1.1862, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 5.16, |
|
"grad_norm": 2.1503751277923584, |
|
"learning_rate": 0.00019683199999999997, |
|
"loss": 1.1739, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 5.168, |
|
"grad_norm": 2.0225651264190674, |
|
"learning_rate": 0.00019668799999999998, |
|
"loss": 1.1878, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 5.176, |
|
"grad_norm": 2.179147481918335, |
|
"learning_rate": 0.00019652799999999999, |
|
"loss": 1.1973, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 5.184, |
|
"grad_norm": 2.376354932785034, |
|
"learning_rate": 0.000196368, |
|
"loss": 1.1756, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 5.192, |
|
"grad_norm": 2.143554449081421, |
|
"learning_rate": 0.000196208, |
|
"loss": 1.2341, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"grad_norm": 2.128620147705078, |
|
"learning_rate": 0.000196048, |
|
"loss": 1.1808, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 5.208, |
|
"grad_norm": 2.025129556655884, |
|
"learning_rate": 0.00019588799999999996, |
|
"loss": 1.1889, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 5.216, |
|
"grad_norm": 2.1475353240966797, |
|
"learning_rate": 0.000195728, |
|
"loss": 1.2154, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 5.224, |
|
"grad_norm": 2.032588005065918, |
|
"learning_rate": 0.00019556799999999997, |
|
"loss": 1.2046, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 5.232, |
|
"grad_norm": 2.2672226428985596, |
|
"learning_rate": 0.000195408, |
|
"loss": 1.1553, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 5.24, |
|
"grad_norm": 2.2911875247955322, |
|
"learning_rate": 0.00019524799999999997, |
|
"loss": 1.2179, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 5.248, |
|
"grad_norm": 2.0162782669067383, |
|
"learning_rate": 0.00019508799999999997, |
|
"loss": 1.2399, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 5.256, |
|
"grad_norm": 2.193554639816284, |
|
"learning_rate": 0.00019492799999999997, |
|
"loss": 1.1745, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 5.264, |
|
"grad_norm": 2.104660749435425, |
|
"learning_rate": 0.00019476799999999998, |
|
"loss": 1.2107, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 5.272, |
|
"grad_norm": 2.141188621520996, |
|
"learning_rate": 0.00019460799999999998, |
|
"loss": 1.2222, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 5.28, |
|
"grad_norm": 2.184913158416748, |
|
"learning_rate": 0.00019444799999999998, |
|
"loss": 1.2103, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 5.288, |
|
"grad_norm": 2.3275797367095947, |
|
"learning_rate": 0.00019428799999999998, |
|
"loss": 1.2263, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 5.296, |
|
"grad_norm": 2.2514960765838623, |
|
"learning_rate": 0.00019412799999999998, |
|
"loss": 1.2263, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 5.304, |
|
"grad_norm": 2.335054874420166, |
|
"learning_rate": 0.00019396799999999998, |
|
"loss": 1.2105, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 5.312, |
|
"grad_norm": 2.0840258598327637, |
|
"learning_rate": 0.00019380799999999998, |
|
"loss": 1.2322, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"grad_norm": 2.2909815311431885, |
|
"learning_rate": 0.00019364799999999999, |
|
"loss": 1.2027, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 5.328, |
|
"grad_norm": 2.076932668685913, |
|
"learning_rate": 0.000193488, |
|
"loss": 1.2039, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 5.336, |
|
"grad_norm": 2.017833948135376, |
|
"learning_rate": 0.000193328, |
|
"loss": 1.1752, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 5.344, |
|
"grad_norm": 2.242431879043579, |
|
"learning_rate": 0.000193168, |
|
"loss": 1.2351, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 5.352, |
|
"grad_norm": 2.0976057052612305, |
|
"learning_rate": 0.000193008, |
|
"loss": 1.2151, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 5.36, |
|
"grad_norm": 2.2112200260162354, |
|
"learning_rate": 0.00019284799999999997, |
|
"loss": 1.2196, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 5.368, |
|
"grad_norm": 2.1883575916290283, |
|
"learning_rate": 0.000192688, |
|
"loss": 1.2368, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 5.376, |
|
"grad_norm": 2.3068554401397705, |
|
"learning_rate": 0.00019252799999999997, |
|
"loss": 1.2621, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 5.384, |
|
"grad_norm": 2.039863109588623, |
|
"learning_rate": 0.00019236799999999997, |
|
"loss": 1.2687, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 5.392, |
|
"grad_norm": 2.26802396774292, |
|
"learning_rate": 0.00019220799999999997, |
|
"loss": 1.1788, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 5.4, |
|
"grad_norm": 2.010828733444214, |
|
"learning_rate": 0.00019204799999999997, |
|
"loss": 1.2232, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 5.408, |
|
"grad_norm": 2.1727616786956787, |
|
"learning_rate": 0.00019188799999999998, |
|
"loss": 1.2625, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 5.416, |
|
"grad_norm": 2.030134439468384, |
|
"learning_rate": 0.00019172799999999998, |
|
"loss": 1.2391, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 5.424, |
|
"grad_norm": 2.2361104488372803, |
|
"learning_rate": 0.00019156799999999998, |
|
"loss": 1.2329, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 5.432, |
|
"grad_norm": 2.1066739559173584, |
|
"learning_rate": 0.00019140799999999998, |
|
"loss": 1.2456, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 5.44, |
|
"grad_norm": 2.1428840160369873, |
|
"learning_rate": 0.00019124799999999998, |
|
"loss": 1.1813, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 5.448, |
|
"grad_norm": 2.3433635234832764, |
|
"learning_rate": 0.00019108799999999998, |
|
"loss": 1.2672, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 5.456, |
|
"grad_norm": 2.185671091079712, |
|
"learning_rate": 0.00019092799999999999, |
|
"loss": 1.2162, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 5.464, |
|
"grad_norm": 2.205509662628174, |
|
"learning_rate": 0.000190768, |
|
"loss": 1.2383, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 5.4719999999999995, |
|
"grad_norm": 2.428114891052246, |
|
"learning_rate": 0.000190608, |
|
"loss": 1.3059, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 5.48, |
|
"grad_norm": 2.135251998901367, |
|
"learning_rate": 0.000190448, |
|
"loss": 1.2053, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 5.4879999999999995, |
|
"grad_norm": 2.074209213256836, |
|
"learning_rate": 0.000190288, |
|
"loss": 1.194, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 5.496, |
|
"grad_norm": 2.0454697608947754, |
|
"learning_rate": 0.000190128, |
|
"loss": 1.232, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 5.504, |
|
"grad_norm": 1.9665228128433228, |
|
"learning_rate": 0.000189968, |
|
"loss": 1.2077, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 5.5120000000000005, |
|
"grad_norm": 2.0836398601531982, |
|
"learning_rate": 0.00018980799999999997, |
|
"loss": 1.186, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 5.52, |
|
"grad_norm": 2.0634419918060303, |
|
"learning_rate": 0.000189648, |
|
"loss": 1.2539, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 5.5280000000000005, |
|
"grad_norm": 2.2017769813537598, |
|
"learning_rate": 0.00018948799999999997, |
|
"loss": 1.2484, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 5.536, |
|
"grad_norm": 2.193028450012207, |
|
"learning_rate": 0.00018932799999999997, |
|
"loss": 1.2916, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 5.5440000000000005, |
|
"grad_norm": 2.163944721221924, |
|
"learning_rate": 0.00018916799999999998, |
|
"loss": 1.2706, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 5.552, |
|
"grad_norm": 2.214864730834961, |
|
"learning_rate": 0.00018900799999999998, |
|
"loss": 1.2626, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 5.5600000000000005, |
|
"grad_norm": 2.167754888534546, |
|
"learning_rate": 0.00018884799999999998, |
|
"loss": 1.2654, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 5.568, |
|
"grad_norm": 2.114359140396118, |
|
"learning_rate": 0.00018868799999999998, |
|
"loss": 1.2345, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 5.576, |
|
"grad_norm": 2.2773566246032715, |
|
"learning_rate": 0.00018852799999999998, |
|
"loss": 1.2244, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 5.584, |
|
"grad_norm": 2.1949045658111572, |
|
"learning_rate": 0.00018836799999999998, |
|
"loss": 1.2508, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 5.592, |
|
"grad_norm": 2.0954575538635254, |
|
"learning_rate": 0.00018820799999999998, |
|
"loss": 1.2387, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"grad_norm": 2.1742050647735596, |
|
"learning_rate": 0.00018804799999999999, |
|
"loss": 1.2407, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 5.608, |
|
"grad_norm": 2.1627070903778076, |
|
"learning_rate": 0.000187888, |
|
"loss": 1.1706, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 5.616, |
|
"grad_norm": 2.1110544204711914, |
|
"learning_rate": 0.000187728, |
|
"loss": 1.2538, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 5.624, |
|
"grad_norm": 2.255958318710327, |
|
"learning_rate": 0.000187568, |
|
"loss": 1.209, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 5.632, |
|
"grad_norm": 2.2075769901275635, |
|
"learning_rate": 0.000187408, |
|
"loss": 1.2942, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"grad_norm": 1.964128851890564, |
|
"learning_rate": 0.000187248, |
|
"loss": 1.2519, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 5.648, |
|
"grad_norm": 2.2681636810302734, |
|
"learning_rate": 0.000187088, |
|
"loss": 1.2993, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 5.656, |
|
"grad_norm": 2.313188076019287, |
|
"learning_rate": 0.000186928, |
|
"loss": 1.2438, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 5.664, |
|
"grad_norm": 2.369359254837036, |
|
"learning_rate": 0.00018676799999999997, |
|
"loss": 1.2157, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 5.672, |
|
"grad_norm": 2.2245047092437744, |
|
"learning_rate": 0.00018660799999999997, |
|
"loss": 1.2857, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 5.68, |
|
"grad_norm": 2.058401107788086, |
|
"learning_rate": 0.00018644799999999997, |
|
"loss": 1.2045, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 5.688, |
|
"grad_norm": 2.2531964778900146, |
|
"learning_rate": 0.00018628799999999998, |
|
"loss": 1.2493, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 5.696, |
|
"grad_norm": 2.2315497398376465, |
|
"learning_rate": 0.00018612799999999998, |
|
"loss": 1.235, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 5.704, |
|
"grad_norm": 2.018808603286743, |
|
"learning_rate": 0.00018596799999999998, |
|
"loss": 1.2016, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 5.712, |
|
"grad_norm": 2.0911753177642822, |
|
"learning_rate": 0.00018580799999999998, |
|
"loss": 1.2624, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 5.72, |
|
"grad_norm": 2.147120475769043, |
|
"learning_rate": 0.00018564799999999998, |
|
"loss": 1.2376, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 5.728, |
|
"grad_norm": 2.1546943187713623, |
|
"learning_rate": 0.00018548799999999998, |
|
"loss": 1.2286, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 5.736, |
|
"grad_norm": 2.0924603939056396, |
|
"learning_rate": 0.00018532799999999998, |
|
"loss": 1.2964, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 5.744, |
|
"grad_norm": 2.337070941925049, |
|
"learning_rate": 0.00018516799999999999, |
|
"loss": 1.2699, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 5.752, |
|
"grad_norm": 2.4989166259765625, |
|
"learning_rate": 0.000185008, |
|
"loss": 1.2631, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 5.76, |
|
"grad_norm": 2.3049070835113525, |
|
"learning_rate": 0.000184848, |
|
"loss": 1.2879, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 5.768, |
|
"grad_norm": 2.328397274017334, |
|
"learning_rate": 0.000184688, |
|
"loss": 1.2624, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 5.776, |
|
"grad_norm": 2.147589921951294, |
|
"learning_rate": 0.000184528, |
|
"loss": 1.2825, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 5.784, |
|
"grad_norm": 2.348174571990967, |
|
"learning_rate": 0.000184368, |
|
"loss": 1.2751, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 5.792, |
|
"grad_norm": 2.270873785018921, |
|
"learning_rate": 0.000184208, |
|
"loss": 1.2714, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 5.8, |
|
"grad_norm": 2.289658308029175, |
|
"learning_rate": 0.000184048, |
|
"loss": 1.2412, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 5.808, |
|
"grad_norm": 2.3569588661193848, |
|
"learning_rate": 0.00018388799999999997, |
|
"loss": 1.2714, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 5.816, |
|
"grad_norm": 2.372729539871216, |
|
"learning_rate": 0.00018372799999999997, |
|
"loss": 1.2824, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 5.824, |
|
"grad_norm": 2.3369643688201904, |
|
"learning_rate": 0.00018356799999999997, |
|
"loss": 1.2897, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 5.832, |
|
"grad_norm": 2.149664878845215, |
|
"learning_rate": 0.00018340799999999998, |
|
"loss": 1.2411, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 5.84, |
|
"grad_norm": 2.1661763191223145, |
|
"learning_rate": 0.00018324799999999998, |
|
"loss": 1.2484, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 5.848, |
|
"grad_norm": 2.2296934127807617, |
|
"learning_rate": 0.00018308799999999998, |
|
"loss": 1.2713, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 5.856, |
|
"grad_norm": 2.0819859504699707, |
|
"learning_rate": 0.00018292799999999998, |
|
"loss": 1.2636, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 5.864, |
|
"grad_norm": 2.158386468887329, |
|
"learning_rate": 0.00018276799999999998, |
|
"loss": 1.2886, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 5.872, |
|
"grad_norm": 2.1622161865234375, |
|
"learning_rate": 0.00018260799999999998, |
|
"loss": 1.2663, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 5.88, |
|
"grad_norm": 2.1625213623046875, |
|
"learning_rate": 0.00018244799999999999, |
|
"loss": 1.3031, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 5.888, |
|
"grad_norm": 2.1951282024383545, |
|
"learning_rate": 0.000182288, |
|
"loss": 1.2569, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 5.896, |
|
"grad_norm": 2.2481329441070557, |
|
"learning_rate": 0.000182128, |
|
"loss": 1.2376, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 5.904, |
|
"grad_norm": 2.11740779876709, |
|
"learning_rate": 0.000181968, |
|
"loss": 1.2642, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 5.912, |
|
"grad_norm": 2.3954527378082275, |
|
"learning_rate": 0.000181808, |
|
"loss": 1.3029, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 5.92, |
|
"grad_norm": 2.222752571105957, |
|
"learning_rate": 0.000181648, |
|
"loss": 1.3028, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 5.928, |
|
"grad_norm": 2.15301513671875, |
|
"learning_rate": 0.000181488, |
|
"loss": 1.3011, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 5.936, |
|
"grad_norm": 2.27708101272583, |
|
"learning_rate": 0.000181328, |
|
"loss": 1.2727, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 5.944, |
|
"grad_norm": 2.1490461826324463, |
|
"learning_rate": 0.00018116799999999997, |
|
"loss": 1.2968, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 5.952, |
|
"grad_norm": 2.247800588607788, |
|
"learning_rate": 0.000181008, |
|
"loss": 1.3003, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 5.96, |
|
"grad_norm": 2.2584476470947266, |
|
"learning_rate": 0.00018084799999999997, |
|
"loss": 1.2659, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 5.968, |
|
"grad_norm": 2.1247005462646484, |
|
"learning_rate": 0.00018068799999999997, |
|
"loss": 1.26, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 5.976, |
|
"grad_norm": 2.2989518642425537, |
|
"learning_rate": 0.00018052799999999998, |
|
"loss": 1.28, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 5.984, |
|
"grad_norm": 2.3190391063690186, |
|
"learning_rate": 0.00018036799999999998, |
|
"loss": 1.2763, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 5.992, |
|
"grad_norm": 2.170459032058716, |
|
"learning_rate": 0.00018020799999999998, |
|
"loss": 1.2263, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 2.1551315784454346, |
|
"learning_rate": 0.00018004799999999998, |
|
"loss": 1.3013, |
|
"step": 7500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 18750, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 15, |
|
"save_steps": 2500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.2603608444698624e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|