{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.7850629613547546,
  "eval_steps": 500,
  "global_step": 1130,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0006947459834997829,
      "grad_norm": 2.5,
      "learning_rate": 5.780346820809248e-07,
      "loss": 4.1915,
      "step": 1
    },
    {
      "epoch": 0.0013894919669995658,
      "grad_norm": 4.21875,
      "learning_rate": 1.1560693641618497e-06,
      "loss": 3.7445,
      "step": 2
    },
    {
      "epoch": 0.0020842379504993486,
      "grad_norm": 1.4140625,
      "learning_rate": 1.7341040462427746e-06,
      "loss": 3.9999,
      "step": 3
    },
    {
      "epoch": 0.0027789839339991316,
      "grad_norm": 1.1484375,
      "learning_rate": 2.3121387283236993e-06,
      "loss": 4.0444,
      "step": 4
    },
    {
      "epoch": 0.0034737299174989146,
      "grad_norm": 1.3359375,
      "learning_rate": 2.8901734104046244e-06,
      "loss": 4.1148,
      "step": 5
    },
    {
      "epoch": 0.004168475900998697,
      "grad_norm": 1.7578125,
      "learning_rate": 3.468208092485549e-06,
      "loss": 4.5023,
      "step": 6
    },
    {
      "epoch": 0.004863221884498481,
      "grad_norm": 6.90625,
      "learning_rate": 4.046242774566474e-06,
      "loss": 4.0132,
      "step": 7
    },
    {
      "epoch": 0.005557967867998263,
      "grad_norm": 1.3046875,
      "learning_rate": 4.624277456647399e-06,
      "loss": 3.7643,
      "step": 8
    },
    {
      "epoch": 0.006252713851498046,
      "grad_norm": 1.5234375,
      "learning_rate": 5.202312138728324e-06,
      "loss": 3.3901,
      "step": 9
    },
    {
      "epoch": 0.006947459834997829,
      "grad_norm": 0.8984375,
      "learning_rate": 5.780346820809249e-06,
      "loss": 3.249,
      "step": 10
    },
    {
      "epoch": 0.007642205818497612,
      "grad_norm": 2.28125,
      "learning_rate": 6.358381502890173e-06,
      "loss": 3.8669,
      "step": 11
    },
    {
      "epoch": 0.008336951801997394,
      "grad_norm": 1.40625,
      "learning_rate": 6.936416184971098e-06,
      "loss": 3.9794,
      "step": 12
    },
    {
      "epoch": 0.009031697785497178,
      "grad_norm": 0.8515625,
      "learning_rate": 7.514450867052024e-06,
      "loss": 3.3814,
      "step": 13
    },
    {
      "epoch": 0.009726443768996961,
      "grad_norm": 1.1953125,
      "learning_rate": 8.092485549132949e-06,
      "loss": 3.7746,
      "step": 14
    },
    {
      "epoch": 0.010421189752496743,
      "grad_norm": 1.7265625,
      "learning_rate": 8.670520231213873e-06,
      "loss": 3.0555,
      "step": 15
    },
    {
      "epoch": 0.011115935735996526,
      "grad_norm": 1.1796875,
      "learning_rate": 9.248554913294797e-06,
      "loss": 3.6958,
      "step": 16
    },
    {
      "epoch": 0.01181068171949631,
      "grad_norm": 2.25,
      "learning_rate": 9.826589595375723e-06,
      "loss": 5.0544,
      "step": 17
    },
    {
      "epoch": 0.012505427702996091,
      "grad_norm": 4.875,
      "learning_rate": 1.0404624277456647e-05,
      "loss": 5.395,
      "step": 18
    },
    {
      "epoch": 0.013200173686495875,
      "grad_norm": 0.9375,
      "learning_rate": 1.0982658959537573e-05,
      "loss": 3.3306,
      "step": 19
    },
    {
      "epoch": 0.013894919669995658,
      "grad_norm": 1.625,
      "learning_rate": 1.1560693641618498e-05,
      "loss": 4.0474,
      "step": 20
    },
    {
      "epoch": 0.01458966565349544,
      "grad_norm": 1.65625,
      "learning_rate": 1.2138728323699422e-05,
      "loss": 3.6259,
      "step": 21
    },
    {
      "epoch": 0.015284411636995223,
      "grad_norm": 3.0,
      "learning_rate": 1.2716763005780346e-05,
      "loss": 4.0321,
      "step": 22
    },
    {
      "epoch": 0.015979157620495007,
      "grad_norm": 1.9140625,
      "learning_rate": 1.329479768786127e-05,
      "loss": 4.1047,
      "step": 23
    },
    {
      "epoch": 0.01667390360399479,
      "grad_norm": 2.375,
      "learning_rate": 1.3872832369942197e-05,
      "loss": 3.4635,
      "step": 24
    },
    {
      "epoch": 0.017368649587494574,
      "grad_norm": 3.03125,
      "learning_rate": 1.4450867052023123e-05,
      "loss": 5.1757,
      "step": 25
    },
    {
      "epoch": 0.018063395570994355,
      "grad_norm": 7.0,
      "learning_rate": 1.5028901734104049e-05,
      "loss": 4.4566,
      "step": 26
    },
    {
      "epoch": 0.018758141554494137,
      "grad_norm": 2.015625,
      "learning_rate": 1.5606936416184973e-05,
      "loss": 3.9157,
      "step": 27
    },
    {
      "epoch": 0.019452887537993922,
      "grad_norm": 2.40625,
      "learning_rate": 1.6184971098265897e-05,
      "loss": 3.5971,
      "step": 28
    },
    {
      "epoch": 0.020147633521493704,
      "grad_norm": 2.84375,
      "learning_rate": 1.676300578034682e-05,
      "loss": 3.1815,
      "step": 29
    },
    {
      "epoch": 0.020842379504993486,
      "grad_norm": 2.515625,
      "learning_rate": 1.7341040462427746e-05,
      "loss": 3.5147,
      "step": 30
    },
    {
      "epoch": 0.02153712548849327,
      "grad_norm": 2.625,
      "learning_rate": 1.791907514450867e-05,
      "loss": 3.4658,
      "step": 31
    },
    {
      "epoch": 0.022231871471993053,
      "grad_norm": 4.34375,
      "learning_rate": 1.8497109826589594e-05,
      "loss": 3.2493,
      "step": 32
    },
    {
      "epoch": 0.022926617455492834,
      "grad_norm": 1.75,
      "learning_rate": 1.907514450867052e-05,
      "loss": 2.5355,
      "step": 33
    },
    {
      "epoch": 0.02362136343899262,
      "grad_norm": 2.234375,
      "learning_rate": 1.9653179190751446e-05,
      "loss": 3.1388,
      "step": 34
    },
    {
      "epoch": 0.0243161094224924,
      "grad_norm": 5.40625,
      "learning_rate": 2.023121387283237e-05,
      "loss": 2.7128,
      "step": 35
    },
    {
      "epoch": 0.025010855405992183,
      "grad_norm": 2.78125,
      "learning_rate": 2.0809248554913295e-05,
      "loss": 2.9951,
      "step": 36
    },
    {
      "epoch": 0.025705601389491968,
      "grad_norm": 1.9453125,
      "learning_rate": 2.1387283236994223e-05,
      "loss": 2.938,
      "step": 37
    },
    {
      "epoch": 0.02640034737299175,
      "grad_norm": 3.21875,
      "learning_rate": 2.1965317919075147e-05,
      "loss": 2.8222,
      "step": 38
    },
    {
      "epoch": 0.02709509335649153,
      "grad_norm": 0.8359375,
      "learning_rate": 2.254335260115607e-05,
      "loss": 2.4152,
      "step": 39
    },
    {
      "epoch": 0.027789839339991317,
      "grad_norm": 1.7890625,
      "learning_rate": 2.3121387283236996e-05,
      "loss": 2.5405,
      "step": 40
    },
    {
      "epoch": 0.028484585323491098,
      "grad_norm": 1.2421875,
      "learning_rate": 2.369942196531792e-05,
      "loss": 2.6256,
      "step": 41
    },
    {
      "epoch": 0.02917933130699088,
      "grad_norm": 1.0078125,
      "learning_rate": 2.4277456647398844e-05,
      "loss": 2.646,
      "step": 42
    },
    {
      "epoch": 0.029874077290490665,
      "grad_norm": 0.84375,
      "learning_rate": 2.485549132947977e-05,
      "loss": 2.3207,
      "step": 43
    },
    {
      "epoch": 0.030568823273990447,
      "grad_norm": 0.91015625,
      "learning_rate": 2.5433526011560693e-05,
      "loss": 2.3893,
      "step": 44
    },
    {
      "epoch": 0.03126356925749023,
      "grad_norm": 1.546875,
      "learning_rate": 2.6011560693641617e-05,
      "loss": 1.8784,
      "step": 45
    },
    {
      "epoch": 0.031958315240990014,
      "grad_norm": 1.3515625,
      "learning_rate": 2.658959537572254e-05,
      "loss": 2.0503,
      "step": 46
    },
    {
      "epoch": 0.0326530612244898,
      "grad_norm": 0.8515625,
      "learning_rate": 2.7167630057803466e-05,
      "loss": 2.1115,
      "step": 47
    },
    {
      "epoch": 0.03334780720798958,
      "grad_norm": 1.1171875,
      "learning_rate": 2.7745664739884393e-05,
      "loss": 1.92,
      "step": 48
    },
    {
      "epoch": 0.03404255319148936,
      "grad_norm": 0.83984375,
      "learning_rate": 2.832369942196532e-05,
      "loss": 2.1723,
      "step": 49
    },
    {
      "epoch": 0.03473729917498915,
      "grad_norm": 0.78515625,
      "learning_rate": 2.8901734104046245e-05,
      "loss": 2.1771,
      "step": 50
    },
    {
      "epoch": 0.035432045158488926,
      "grad_norm": 1.96875,
      "learning_rate": 2.947976878612717e-05,
      "loss": 2.5537,
      "step": 51
    },
    {
      "epoch": 0.03612679114198871,
      "grad_norm": 0.9140625,
      "learning_rate": 3.0057803468208097e-05,
      "loss": 2.0183,
      "step": 52
    },
    {
      "epoch": 0.036821537125488496,
      "grad_norm": 0.6171875,
      "learning_rate": 3.063583815028902e-05,
      "loss": 2.2641,
      "step": 53
    },
    {
      "epoch": 0.037516283108988274,
      "grad_norm": 1.2890625,
      "learning_rate": 3.1213872832369946e-05,
      "loss": 2.2391,
      "step": 54
    },
    {
      "epoch": 0.03821102909248806,
      "grad_norm": 0.92578125,
      "learning_rate": 3.179190751445087e-05,
      "loss": 2.0064,
      "step": 55
    },
    {
      "epoch": 0.038905775075987845,
      "grad_norm": 1.171875,
      "learning_rate": 3.2369942196531794e-05,
      "loss": 1.6102,
      "step": 56
    },
    {
      "epoch": 0.03960052105948762,
      "grad_norm": 0.4765625,
      "learning_rate": 3.294797687861272e-05,
      "loss": 2.1865,
      "step": 57
    },
    {
      "epoch": 0.04029526704298741,
      "grad_norm": 1.1484375,
      "learning_rate": 3.352601156069364e-05,
      "loss": 1.9878,
      "step": 58
    },
    {
      "epoch": 0.04099001302648719,
      "grad_norm": 1.0234375,
      "learning_rate": 3.410404624277457e-05,
      "loss": 2.1405,
      "step": 59
    },
    {
      "epoch": 0.04168475900998697,
      "grad_norm": 1.625,
      "learning_rate": 3.468208092485549e-05,
      "loss": 2.3983,
      "step": 60
    },
    {
      "epoch": 0.04237950499348676,
      "grad_norm": 0.96875,
      "learning_rate": 3.5260115606936416e-05,
      "loss": 2.3009,
      "step": 61
    },
    {
      "epoch": 0.04307425097698654,
      "grad_norm": 1.0,
      "learning_rate": 3.583815028901734e-05,
      "loss": 1.8959,
      "step": 62
    },
    {
      "epoch": 0.04376899696048632,
      "grad_norm": 1.0390625,
      "learning_rate": 3.6416184971098265e-05,
      "loss": 1.9368,
      "step": 63
    },
    {
      "epoch": 0.044463742943986105,
      "grad_norm": 1.046875,
      "learning_rate": 3.699421965317919e-05,
      "loss": 2.1604,
      "step": 64
    },
    {
      "epoch": 0.04515848892748589,
      "grad_norm": 0.703125,
      "learning_rate": 3.757225433526011e-05,
      "loss": 2.034,
      "step": 65
    },
    {
      "epoch": 0.04585323491098567,
      "grad_norm": 0.8515625,
      "learning_rate": 3.815028901734104e-05,
      "loss": 2.3586,
      "step": 66
    },
    {
      "epoch": 0.046547980894485454,
      "grad_norm": 0.62109375,
      "learning_rate": 3.872832369942196e-05,
      "loss": 1.8835,
      "step": 67
    },
    {
      "epoch": 0.04724272687798524,
      "grad_norm": 0.6328125,
      "learning_rate": 3.930635838150289e-05,
      "loss": 2.1474,
      "step": 68
    },
    {
      "epoch": 0.04793747286148502,
      "grad_norm": 0.7578125,
      "learning_rate": 3.988439306358382e-05,
      "loss": 1.988,
      "step": 69
    },
    {
      "epoch": 0.0486322188449848,
      "grad_norm": 0.6953125,
      "learning_rate": 4.046242774566474e-05,
      "loss": 2.2501,
      "step": 70
    },
    {
      "epoch": 0.04932696482848459,
      "grad_norm": 1.125,
      "learning_rate": 4.1040462427745666e-05,
      "loss": 1.6597,
      "step": 71
    },
    {
      "epoch": 0.050021710811984366,
      "grad_norm": 0.90234375,
      "learning_rate": 4.161849710982659e-05,
      "loss": 2.2616,
      "step": 72
    },
    {
      "epoch": 0.05071645679548415,
      "grad_norm": 1.0390625,
      "learning_rate": 4.2196531791907514e-05,
      "loss": 1.8914,
      "step": 73
    },
    {
      "epoch": 0.051411202778983936,
      "grad_norm": 1.7421875,
      "learning_rate": 4.2774566473988445e-05,
      "loss": 2.0235,
      "step": 74
    },
    {
      "epoch": 0.052105948762483714,
      "grad_norm": 0.66015625,
      "learning_rate": 4.335260115606937e-05,
      "loss": 2.1633,
      "step": 75
    },
    {
      "epoch": 0.0528006947459835,
      "grad_norm": 0.68359375,
      "learning_rate": 4.3930635838150294e-05,
      "loss": 2.1997,
      "step": 76
    },
    {
      "epoch": 0.053495440729483285,
      "grad_norm": 0.98828125,
      "learning_rate": 4.450867052023122e-05,
      "loss": 2.2325,
      "step": 77
    },
    {
      "epoch": 0.05419018671298306,
      "grad_norm": 0.95703125,
      "learning_rate": 4.508670520231214e-05,
      "loss": 1.6797,
      "step": 78
    },
    {
      "epoch": 0.05488493269648285,
      "grad_norm": 0.68359375,
      "learning_rate": 4.566473988439307e-05,
      "loss": 2.0388,
      "step": 79
    },
    {
      "epoch": 0.05557967867998263,
      "grad_norm": 1.34375,
      "learning_rate": 4.624277456647399e-05,
      "loss": 1.8112,
      "step": 80
    },
    {
      "epoch": 0.05627442466348241,
      "grad_norm": 1.2578125,
      "learning_rate": 4.6820809248554915e-05,
      "loss": 1.925,
      "step": 81
    },
    {
      "epoch": 0.056969170646982197,
      "grad_norm": 0.80859375,
      "learning_rate": 4.739884393063584e-05,
      "loss": 1.8969,
      "step": 82
    },
    {
      "epoch": 0.05766391663048198,
      "grad_norm": 1.1171875,
      "learning_rate": 4.7976878612716764e-05,
      "loss": 2.1033,
      "step": 83
    },
    {
      "epoch": 0.05835866261398176,
      "grad_norm": 0.90234375,
      "learning_rate": 4.855491329479769e-05,
      "loss": 2.0978,
      "step": 84
    },
    {
      "epoch": 0.059053408597481545,
      "grad_norm": 0.51953125,
      "learning_rate": 4.913294797687861e-05,
      "loss": 2.0516,
      "step": 85
    },
    {
      "epoch": 0.05974815458098133,
      "grad_norm": 0.474609375,
      "learning_rate": 4.971098265895954e-05,
      "loss": 2.0648,
      "step": 86
    },
    {
      "epoch": 0.06044290056448111,
      "grad_norm": 5.1875,
      "learning_rate": 5.028901734104047e-05,
      "loss": 2.098,
      "step": 87
    },
    {
      "epoch": 0.061137646547980894,
      "grad_norm": 1.25,
      "learning_rate": 5.0867052023121385e-05,
      "loss": 2.1498,
      "step": 88
    },
    {
      "epoch": 0.06183239253148068,
      "grad_norm": 1.375,
      "learning_rate": 5.1445086705202317e-05,
      "loss": 1.8586,
      "step": 89
    },
    {
      "epoch": 0.06252713851498046,
      "grad_norm": 0.83984375,
      "learning_rate": 5.2023121387283234e-05,
      "loss": 1.6702,
      "step": 90
    },
    {
      "epoch": 0.06322188449848025,
      "grad_norm": 0.734375,
      "learning_rate": 5.2601156069364165e-05,
      "loss": 2.1599,
      "step": 91
    },
    {
      "epoch": 0.06391663048198003,
      "grad_norm": 0.85546875,
      "learning_rate": 5.317919075144508e-05,
      "loss": 2.0213,
      "step": 92
    },
    {
      "epoch": 0.0646113764654798,
      "grad_norm": 0.71875,
      "learning_rate": 5.3757225433526014e-05,
      "loss": 2.3254,
      "step": 93
    },
    {
      "epoch": 0.0653061224489796,
      "grad_norm": 0.89453125,
      "learning_rate": 5.433526011560693e-05,
      "loss": 2.0617,
      "step": 94
    },
    {
      "epoch": 0.06600086843247938,
      "grad_norm": 0.62890625,
      "learning_rate": 5.491329479768786e-05,
      "loss": 1.8925,
      "step": 95
    },
    {
      "epoch": 0.06669561441597915,
      "grad_norm": 0.58203125,
      "learning_rate": 5.5491329479768787e-05,
      "loss": 1.886,
      "step": 96
    },
    {
      "epoch": 0.06739036039947895,
      "grad_norm": 0.984375,
      "learning_rate": 5.606936416184971e-05,
      "loss": 2.2635,
      "step": 97
    },
    {
      "epoch": 0.06808510638297872,
      "grad_norm": 1.1796875,
      "learning_rate": 5.664739884393064e-05,
      "loss": 1.8094,
      "step": 98
    },
    {
      "epoch": 0.0687798523664785,
      "grad_norm": 0.8203125,
      "learning_rate": 5.722543352601156e-05,
      "loss": 1.7222,
      "step": 99
    },
    {
      "epoch": 0.0694745983499783,
      "grad_norm": 0.5390625,
      "learning_rate": 5.780346820809249e-05,
      "loss": 2.1751,
      "step": 100
    },
    {
      "epoch": 0.07016934433347807,
      "grad_norm": 0.64453125,
      "learning_rate": 5.8381502890173415e-05,
      "loss": 2.0186,
      "step": 101
    },
    {
      "epoch": 0.07086409031697785,
      "grad_norm": 0.92578125,
      "learning_rate": 5.895953757225434e-05,
      "loss": 1.7453,
      "step": 102
    },
    {
      "epoch": 0.07155883630047764,
      "grad_norm": 0.55078125,
      "learning_rate": 5.9537572254335263e-05,
      "loss": 2.1655,
      "step": 103
    },
    {
      "epoch": 0.07225358228397742,
      "grad_norm": 0.578125,
      "learning_rate": 6.0115606936416195e-05,
      "loss": 2.0565,
      "step": 104
    },
    {
      "epoch": 0.0729483282674772,
      "grad_norm": 0.84765625,
      "learning_rate": 6.069364161849711e-05,
      "loss": 1.9825,
      "step": 105
    },
    {
      "epoch": 0.07364307425097699,
      "grad_norm": 1.515625,
      "learning_rate": 6.127167630057804e-05,
      "loss": 1.4917,
      "step": 106
    },
    {
      "epoch": 0.07433782023447677,
      "grad_norm": 1.546875,
      "learning_rate": 6.184971098265896e-05,
      "loss": 1.7809,
      "step": 107
    },
    {
      "epoch": 0.07503256621797655,
      "grad_norm": 1.640625,
      "learning_rate": 6.242774566473989e-05,
      "loss": 2.2905,
      "step": 108
    },
    {
      "epoch": 0.07572731220147634,
      "grad_norm": 0.71875,
      "learning_rate": 6.300578034682081e-05,
      "loss": 1.8123,
      "step": 109
    },
    {
      "epoch": 0.07642205818497612,
      "grad_norm": 0.81640625,
      "learning_rate": 6.358381502890174e-05,
      "loss": 2.1268,
      "step": 110
    },
    {
      "epoch": 0.0771168041684759,
      "grad_norm": 0.9921875,
      "learning_rate": 6.416184971098266e-05,
      "loss": 1.9522,
      "step": 111
    },
    {
      "epoch": 0.07781155015197569,
      "grad_norm": 0.66796875,
      "learning_rate": 6.473988439306359e-05,
      "loss": 2.0203,
      "step": 112
    },
    {
      "epoch": 0.07850629613547547,
      "grad_norm": 1.7734375,
      "learning_rate": 6.53179190751445e-05,
      "loss": 2.4639,
      "step": 113
    },
    {
      "epoch": 0.07920104211897525,
      "grad_norm": 0.828125,
      "learning_rate": 6.589595375722544e-05,
      "loss": 2.1491,
      "step": 114
    },
    {
      "epoch": 0.07989578810247504,
      "grad_norm": 0.96484375,
      "learning_rate": 6.647398843930635e-05,
      "loss": 2.0459,
      "step": 115
    },
    {
      "epoch": 0.08059053408597482,
      "grad_norm": 1.6640625,
      "learning_rate": 6.705202312138729e-05,
      "loss": 2.0957,
      "step": 116
    },
    {
      "epoch": 0.0812852800694746,
      "grad_norm": 0.69921875,
      "learning_rate": 6.763005780346822e-05,
      "loss": 2.1087,
      "step": 117
    },
    {
      "epoch": 0.08198002605297439,
      "grad_norm": 0.96875,
      "learning_rate": 6.820809248554913e-05,
      "loss": 1.6713,
      "step": 118
    },
    {
      "epoch": 0.08267477203647416,
      "grad_norm": 2.21875,
      "learning_rate": 6.878612716763007e-05,
      "loss": 2.0883,
      "step": 119
    },
    {
      "epoch": 0.08336951801997394,
      "grad_norm": 1.015625,
      "learning_rate": 6.936416184971098e-05,
      "loss": 1.8738,
      "step": 120
    },
    {
      "epoch": 0.08406426400347373,
      "grad_norm": 0.73046875,
      "learning_rate": 6.994219653179191e-05,
      "loss": 2.0907,
      "step": 121
    },
    {
      "epoch": 0.08475900998697351,
      "grad_norm": 0.80078125,
      "learning_rate": 7.052023121387283e-05,
      "loss": 1.7412,
      "step": 122
    },
    {
      "epoch": 0.08545375597047329,
      "grad_norm": 0.87890625,
      "learning_rate": 7.109826589595376e-05,
      "loss": 1.9133,
      "step": 123
    },
    {
      "epoch": 0.08614850195397308,
      "grad_norm": 0.60546875,
      "learning_rate": 7.167630057803468e-05,
      "loss": 1.9658,
      "step": 124
    },
    {
      "epoch": 0.08684324793747286,
      "grad_norm": 1.3203125,
      "learning_rate": 7.225433526011561e-05,
      "loss": 1.947,
      "step": 125
    },
    {
      "epoch": 0.08753799392097264,
      "grad_norm": 1.1484375,
      "learning_rate": 7.283236994219653e-05,
      "loss": 1.912,
      "step": 126
    },
    {
      "epoch": 0.08823273990447243,
      "grad_norm": 0.431640625,
      "learning_rate": 7.341040462427746e-05,
      "loss": 2.0971,
      "step": 127
    },
    {
      "epoch": 0.08892748588797221,
      "grad_norm": 0.6953125,
      "learning_rate": 7.398843930635838e-05,
      "loss": 2.044,
      "step": 128
    },
    {
      "epoch": 0.08962223187147199,
      "grad_norm": 0.8046875,
      "learning_rate": 7.456647398843931e-05,
      "loss": 2.0081,
      "step": 129
    },
    {
      "epoch": 0.09031697785497178,
      "grad_norm": 1.109375,
      "learning_rate": 7.514450867052023e-05,
      "loss": 1.8501,
      "step": 130
    },
    {
      "epoch": 0.09101172383847156,
      "grad_norm": 0.74609375,
      "learning_rate": 7.572254335260116e-05,
      "loss": 1.7543,
      "step": 131
    },
    {
      "epoch": 0.09170646982197134,
      "grad_norm": 0.5703125,
      "learning_rate": 7.630057803468207e-05,
      "loss": 1.9667,
      "step": 132
    },
    {
      "epoch": 0.09240121580547113,
      "grad_norm": 0.82421875,
      "learning_rate": 7.6878612716763e-05,
      "loss": 2.1222,
      "step": 133
    },
    {
      "epoch": 0.09309596178897091,
      "grad_norm": 0.8828125,
      "learning_rate": 7.745664739884392e-05,
      "loss": 1.5583,
      "step": 134
    },
    {
      "epoch": 0.09379070777247069,
      "grad_norm": 1.625,
      "learning_rate": 7.803468208092485e-05,
      "loss": 2.2327,
      "step": 135
    },
    {
      "epoch": 0.09448545375597048,
      "grad_norm": 1.0,
      "learning_rate": 7.861271676300579e-05,
      "loss": 2.3996,
      "step": 136
    },
    {
      "epoch": 0.09518019973947026,
      "grad_norm": 1.0859375,
      "learning_rate": 7.91907514450867e-05,
      "loss": 2.1297,
      "step": 137
    },
    {
      "epoch": 0.09587494572297003,
      "grad_norm": 1.5390625,
      "learning_rate": 7.976878612716763e-05,
      "loss": 1.5923,
      "step": 138
    },
    {
      "epoch": 0.09656969170646983,
      "grad_norm": 1.171875,
      "learning_rate": 8.034682080924855e-05,
      "loss": 1.671,
      "step": 139
    },
    {
      "epoch": 0.0972644376899696,
      "grad_norm": 0.82421875,
      "learning_rate": 8.092485549132948e-05,
      "loss": 1.9664,
      "step": 140
    },
    {
      "epoch": 0.09795918367346938,
      "grad_norm": 1.484375,
      "learning_rate": 8.15028901734104e-05,
      "loss": 1.8026,
      "step": 141
    },
    {
      "epoch": 0.09865392965696917,
      "grad_norm": 2.171875,
      "learning_rate": 8.208092485549133e-05,
      "loss": 2.1695,
      "step": 142
    },
    {
      "epoch": 0.09934867564046895,
      "grad_norm": 0.75390625,
      "learning_rate": 8.265895953757226e-05,
      "loss": 1.9029,
      "step": 143
    },
    {
      "epoch": 0.10004342162396873,
      "grad_norm": 1.0078125,
      "learning_rate": 8.323699421965318e-05,
      "loss": 1.6349,
      "step": 144
    },
    {
      "epoch": 0.10073816760746852,
      "grad_norm": 0.73828125,
      "learning_rate": 8.381502890173411e-05,
      "loss": 2.1295,
      "step": 145
    },
    {
      "epoch": 0.1014329135909683,
      "grad_norm": 2.703125,
      "learning_rate": 8.439306358381503e-05,
      "loss": 1.9088,
      "step": 146
    },
    {
      "epoch": 0.10212765957446808,
      "grad_norm": 1.34375,
      "learning_rate": 8.497109826589596e-05,
      "loss": 2.0262,
      "step": 147
    },
    {
      "epoch": 0.10282240555796787,
      "grad_norm": 4.15625,
      "learning_rate": 8.554913294797689e-05,
      "loss": 1.7243,
      "step": 148
    },
    {
      "epoch": 0.10351715154146765,
      "grad_norm": 0.9921875,
      "learning_rate": 8.612716763005781e-05,
      "loss": 2.2122,
      "step": 149
    },
    {
      "epoch": 0.10421189752496743,
      "grad_norm": 0.7109375,
      "learning_rate": 8.670520231213874e-05,
      "loss": 2.3888,
      "step": 150
    },
    {
      "epoch": 0.10490664350846722,
      "grad_norm": 0.734375,
      "learning_rate": 8.728323699421966e-05,
      "loss": 1.8788,
      "step": 151
    },
    {
      "epoch": 0.105601389491967,
      "grad_norm": 1.3203125,
      "learning_rate": 8.786127167630059e-05,
      "loss": 1.9623,
      "step": 152
    },
    {
      "epoch": 0.10629613547546678,
      "grad_norm": 1.25,
      "learning_rate": 8.84393063583815e-05,
      "loss": 1.7922,
      "step": 153
    },
    {
      "epoch": 0.10699088145896657,
      "grad_norm": 1.3359375,
      "learning_rate": 8.901734104046244e-05,
      "loss": 2.0573,
      "step": 154
    },
    {
      "epoch": 0.10768562744246635,
      "grad_norm": 1.4609375,
      "learning_rate": 8.959537572254337e-05,
      "loss": 1.978,
      "step": 155
    },
    {
      "epoch": 0.10838037342596613,
      "grad_norm": 0.81640625,
      "learning_rate": 9.017341040462428e-05,
      "loss": 2.4477,
      "step": 156
    },
    {
      "epoch": 0.10907511940946592,
      "grad_norm": 0.8125,
      "learning_rate": 9.075144508670522e-05,
      "loss": 2.1113,
      "step": 157
    },
    {
      "epoch": 0.1097698653929657,
      "grad_norm": 1.0078125,
      "learning_rate": 9.132947976878613e-05,
      "loss": 1.7494,
      "step": 158
    },
    {
      "epoch": 0.11046461137646547,
      "grad_norm": 0.81640625,
      "learning_rate": 9.190751445086706e-05,
      "loss": 2.1112,
      "step": 159
    },
    {
      "epoch": 0.11115935735996527,
      "grad_norm": 0.9765625,
      "learning_rate": 9.248554913294798e-05,
      "loss": 1.9607,
      "step": 160
    },
    {
      "epoch": 0.11185410334346504,
      "grad_norm": 0.84765625,
      "learning_rate": 9.306358381502891e-05,
      "loss": 1.8264,
      "step": 161
    },
    {
      "epoch": 0.11254884932696482,
      "grad_norm": 0.8984375,
      "learning_rate": 9.364161849710983e-05,
      "loss": 1.9532,
      "step": 162
    },
    {
      "epoch": 0.11324359531046461,
      "grad_norm": 1.1796875,
      "learning_rate": 9.421965317919076e-05,
      "loss": 1.9819,
      "step": 163
    },
    {
      "epoch": 0.11393834129396439,
      "grad_norm": 0.70703125,
      "learning_rate": 9.479768786127168e-05,
      "loss": 2.0391,
      "step": 164
    },
    {
      "epoch": 0.11463308727746417,
      "grad_norm": 0.92578125,
      "learning_rate": 9.537572254335261e-05,
      "loss": 2.027,
      "step": 165
    },
    {
      "epoch": 0.11532783326096396,
      "grad_norm": 0.76953125,
      "learning_rate": 9.595375722543353e-05,
      "loss": 1.5242,
      "step": 166
    },
    {
      "epoch": 0.11602257924446374,
      "grad_norm": 0.77734375,
      "learning_rate": 9.653179190751446e-05,
      "loss": 1.8081,
      "step": 167
    },
    {
      "epoch": 0.11671732522796352,
      "grad_norm": 1.2734375,
      "learning_rate": 9.710982658959538e-05,
      "loss": 1.8001,
      "step": 168
    },
    {
      "epoch": 0.11741207121146331,
      "grad_norm": 0.82421875,
      "learning_rate": 9.768786127167631e-05,
      "loss": 1.8917,
      "step": 169
    },
    {
      "epoch": 0.11810681719496309,
      "grad_norm": 0.57421875,
      "learning_rate": 9.826589595375723e-05,
      "loss": 2.0557,
      "step": 170
    },
    {
      "epoch": 0.11880156317846287,
      "grad_norm": 0.8046875,
      "learning_rate": 9.884393063583816e-05,
      "loss": 2.0773,
      "step": 171
    },
    {
      "epoch": 0.11949630916196266,
      "grad_norm": 1.015625,
      "learning_rate": 9.942196531791907e-05,
      "loss": 1.566,
      "step": 172
    },
    {
      "epoch": 0.12019105514546244,
      "grad_norm": 0.78125,
      "learning_rate": 0.0001,
      "loss": 2.0485,
      "step": 173
    },
    {
      "epoch": 0.12088580112896222,
      "grad_norm": 0.87109375,
      "learning_rate": 0.00010057803468208094,
      "loss": 1.7415,
      "step": 174
    },
    {
      "epoch": 0.12158054711246201,
      "grad_norm": 0.75390625,
      "learning_rate": 0.00010115606936416187,
      "loss": 2.2481,
      "step": 175
    },
    {
      "epoch": 0.12227529309596179,
      "grad_norm": 1.0,
      "learning_rate": 0.00010173410404624277,
      "loss": 1.878,
      "step": 176
    },
    {
      "epoch": 0.12297003907946157,
      "grad_norm": 1.1328125,
      "learning_rate": 0.0001023121387283237,
      "loss": 1.5244,
      "step": 177
    },
    {
      "epoch": 0.12366478506296136,
      "grad_norm": 0.57421875,
      "learning_rate": 0.00010289017341040463,
      "loss": 2.1072,
      "step": 178
    },
    {
      "epoch": 0.12435953104646114,
      "grad_norm": 0.81640625,
      "learning_rate": 0.00010346820809248556,
      "loss": 1.9914,
      "step": 179
    },
    {
      "epoch": 0.12505427702996091,
      "grad_norm": 0.828125,
      "learning_rate": 0.00010404624277456647,
      "loss": 1.3949,
      "step": 180
    },
    {
      "epoch": 0.1257490230134607,
      "grad_norm": 0.71484375,
      "learning_rate": 0.0001046242774566474,
      "loss": 1.7482,
      "step": 181
    },
    {
      "epoch": 0.1264437689969605,
      "grad_norm": 1.1015625,
      "learning_rate": 0.00010520231213872833,
      "loss": 1.6729,
      "step": 182
    },
    {
      "epoch": 0.12713851498046028,
      "grad_norm": 0.58984375,
      "learning_rate": 0.00010578034682080926,
      "loss": 1.7445,
      "step": 183
    },
    {
      "epoch": 0.12783326096396005,
      "grad_norm": 0.462890625,
      "learning_rate": 0.00010635838150289017,
      "loss": 2.1268,
      "step": 184
    },
    {
      "epoch": 0.12852800694745983,
      "grad_norm": 0.9453125,
      "learning_rate": 0.0001069364161849711,
      "loss": 1.6518,
      "step": 185
    },
    {
      "epoch": 0.1292227529309596,
      "grad_norm": 0.8515625,
      "learning_rate": 0.00010751445086705203,
      "loss": 1.9006,
      "step": 186
    },
    {
      "epoch": 0.1299174989144594,
      "grad_norm": 17.25,
      "learning_rate": 0.00010809248554913296,
      "loss": 2.0701,
      "step": 187
    },
    {
      "epoch": 0.1306122448979592,
      "grad_norm": 0.828125,
      "learning_rate": 0.00010867052023121386,
      "loss": 2.0978,
      "step": 188
    },
    {
      "epoch": 0.13130699088145897,
      "grad_norm": 1.1015625,
      "learning_rate": 0.0001092485549132948,
      "loss": 1.8128,
      "step": 189
    },
    {
      "epoch": 0.13200173686495875,
      "grad_norm": 0.90625,
      "learning_rate": 0.00010982658959537572,
      "loss": 1.9241,
      "step": 190
    },
    {
      "epoch": 0.13269648284845853,
      "grad_norm": 0.5859375,
      "learning_rate": 0.00011040462427745666,
      "loss": 1.7011,
      "step": 191
    },
    {
      "epoch": 0.1333912288319583,
      "grad_norm": 0.6796875,
      "learning_rate": 0.00011098265895953757,
      "loss": 2.2784,
      "step": 192
    },
    {
      "epoch": 0.1340859748154581,
      "grad_norm": 3.078125,
      "learning_rate": 0.00011156069364161849,
      "loss": 2.16,
      "step": 193
    },
    {
      "epoch": 0.1347807207989579,
      "grad_norm": 0.7421875,
      "learning_rate": 0.00011213872832369942,
      "loss": 1.734,
      "step": 194
    },
    {
      "epoch": 0.13547546678245767,
      "grad_norm": 0.734375,
      "learning_rate": 0.00011271676300578035,
      "loss": 1.6558,
      "step": 195
    },
    {
      "epoch": 0.13617021276595745,
      "grad_norm": 0.70703125,
      "learning_rate": 0.00011329479768786128,
      "loss": 1.7766,
      "step": 196
    },
    {
      "epoch": 0.13686495874945723,
      "grad_norm": 0.72265625,
      "learning_rate": 0.0001138728323699422,
      "loss": 1.6735,
      "step": 197
    },
    {
      "epoch": 0.137559704732957,
      "grad_norm": 1.1640625,
      "learning_rate": 0.00011445086705202312,
      "loss": 2.2215,
      "step": 198
    },
    {
      "epoch": 0.13825445071645678,
      "grad_norm": 0.6484375,
      "learning_rate": 0.00011502890173410405,
      "loss": 1.8671,
      "step": 199
    },
    {
      "epoch": 0.1389491966999566,
      "grad_norm": 1.40625,
      "learning_rate": 0.00011560693641618498,
      "loss": 1.9517,
      "step": 200
    },
    {
      "epoch": 0.13964394268345637,
      "grad_norm": 0.67578125,
      "learning_rate": 0.0001161849710982659,
      "loss": 2.0064,
      "step": 201
    },
    {
      "epoch": 0.14033868866695615,
      "grad_norm": 1.0234375,
      "learning_rate": 0.00011676300578034683,
      "loss": 1.7618,
      "step": 202
    },
    {
      "epoch": 0.14103343465045592,
      "grad_norm": 1.3828125,
      "learning_rate": 0.00011734104046242775,
      "loss": 1.746,
      "step": 203
    },
    {
      "epoch": 0.1417281806339557,
      "grad_norm": 1.453125,
      "learning_rate": 0.00011791907514450868,
      "loss": 2.4571,
      "step": 204
    },
    {
      "epoch": 0.14242292661745548,
      "grad_norm": 1.171875,
      "learning_rate": 0.0001184971098265896,
      "loss": 2.0719,
      "step": 205
    },
    {
      "epoch": 0.1431176726009553,
      "grad_norm": 0.93359375,
      "learning_rate": 0.00011907514450867053,
      "loss": 1.892,
      "step": 206
    },
    {
      "epoch": 0.14381241858445507,
      "grad_norm": 0.9375,
      "learning_rate": 0.00011965317919075146,
      "loss": 1.9472,
      "step": 207
    },
    {
      "epoch": 0.14450716456795484,
      "grad_norm": 0.78515625,
      "learning_rate": 0.00012023121387283239,
      "loss": 1.9143,
      "step": 208
    },
    {
      "epoch": 0.14520191055145462,
      "grad_norm": 0.4296875,
      "learning_rate": 0.00012080924855491329,
      "loss": 1.9386,
      "step": 209
    },
    {
      "epoch": 0.1458966565349544,
      "grad_norm": 4.40625,
      "learning_rate": 0.00012138728323699422,
      "loss": 1.6159,
      "step": 210
    },
    {
      "epoch": 0.14659140251845418,
      "grad_norm": 0.59375,
      "learning_rate": 0.00012196531791907516,
      "loss": 1.9722,
      "step": 211
    },
    {
      "epoch": 0.14728614850195398,
      "grad_norm": 1.0703125,
      "learning_rate": 0.00012254335260115609,
      "loss": 2.2088,
      "step": 212
    },
    {
      "epoch": 0.14798089448545376,
      "grad_norm": 0.416015625,
      "learning_rate": 0.00012312138728323702,
      "loss": 2.3268,
      "step": 213
    },
    {
      "epoch": 0.14867564046895354,
      "grad_norm": 0.87109375,
      "learning_rate": 0.00012369942196531792,
      "loss": 1.9869,
      "step": 214
    },
    {
      "epoch": 0.14937038645245332,
      "grad_norm": 0.82421875,
      "learning_rate": 0.00012427745664739885,
      "loss": 1.6529,
      "step": 215
    },
    {
      "epoch": 0.1500651324359531,
      "grad_norm": 1.1796875,
      "learning_rate": 0.00012485549132947978,
      "loss": 1.788,
      "step": 216
    },
    {
      "epoch": 0.15075987841945288,
      "grad_norm": 0.80078125,
      "learning_rate": 0.00012543352601156071,
      "loss": 2.1698,
      "step": 217
    },
    {
      "epoch": 0.15145462440295268,
      "grad_norm": 0.9296875,
      "learning_rate": 0.00012601156069364162,
      "loss": 1.8094,
      "step": 218
    },
    {
      "epoch": 0.15214937038645246,
      "grad_norm": 0.859375,
      "learning_rate": 0.00012658959537572255,
      "loss": 1.5737,
      "step": 219
    },
    {
      "epoch": 0.15284411636995224,
      "grad_norm": 0.75,
      "learning_rate": 0.00012716763005780348,
      "loss": 2.0224,
      "step": 220
    },
    {
      "epoch": 0.15353886235345202,
      "grad_norm": 0.9140625,
      "learning_rate": 0.0001277456647398844,
      "loss": 1.9444,
      "step": 221
    },
    {
      "epoch": 0.1542336083369518,
      "grad_norm": 1.5078125,
      "learning_rate": 0.00012832369942196532,
      "loss": 1.8394,
      "step": 222
    },
    {
      "epoch": 0.15492835432045157,
      "grad_norm": 0.9609375,
      "learning_rate": 0.00012890173410404625,
      "loss": 1.9949,
      "step": 223
    },
    {
      "epoch": 0.15562310030395138,
      "grad_norm": 0.515625,
      "learning_rate": 0.00012947976878612718,
      "loss": 1.8063,
      "step": 224
    },
    {
      "epoch": 0.15631784628745116,
      "grad_norm": 0.76171875,
      "learning_rate": 0.0001300578034682081,
      "loss": 2.108,
      "step": 225
    },
    {
      "epoch": 0.15701259227095093,
      "grad_norm": 0.8203125,
      "learning_rate": 0.000130635838150289,
      "loss": 1.865,
      "step": 226
    },
    {
      "epoch": 0.1577073382544507,
      "grad_norm": 1.171875,
      "learning_rate": 0.00013121387283236994,
      "loss": 1.9677,
      "step": 227
    },
    {
      "epoch": 0.1584020842379505,
      "grad_norm": 0.875,
      "learning_rate": 0.00013179190751445087,
      "loss": 1.5685,
      "step": 228
    },
    {
      "epoch": 0.15909683022145027,
      "grad_norm": 0.609375,
      "learning_rate": 0.0001323699421965318,
      "loss": 1.7492,
      "step": 229
    },
    {
      "epoch": 0.15979157620495008,
      "grad_norm": 1.609375,
      "learning_rate": 0.0001329479768786127,
      "loss": 2.3405,
      "step": 230
    },
    {
      "epoch": 0.16048632218844985,
      "grad_norm": 1.3125,
      "learning_rate": 0.00013352601156069364,
      "loss": 1.8523,
      "step": 231
    },
    {
      "epoch": 0.16118106817194963,
      "grad_norm": 0.7890625,
      "learning_rate": 0.00013410404624277457,
      "loss": 1.6507,
      "step": 232
    },
    {
      "epoch": 0.1618758141554494,
      "grad_norm": 0.70703125,
      "learning_rate": 0.0001346820809248555,
      "loss": 1.9661,
      "step": 233
    },
    {
      "epoch": 0.1625705601389492,
      "grad_norm": 0.65234375,
      "learning_rate": 0.00013526011560693643,
      "loss": 1.8777,
      "step": 234
    },
    {
      "epoch": 0.16326530612244897,
      "grad_norm": 0.78125,
      "learning_rate": 0.00013583815028901734,
      "loss": 1.7511,
      "step": 235
    },
    {
      "epoch": 0.16396005210594877,
      "grad_norm": 0.69140625,
      "learning_rate": 0.00013641618497109827,
      "loss": 2.1641,
      "step": 236
    },
    {
      "epoch": 0.16465479808944855,
      "grad_norm": 0.77734375,
      "learning_rate": 0.0001369942196531792,
      "loss": 2.0807,
      "step": 237
    },
    {
      "epoch": 0.16534954407294833,
      "grad_norm": 0.93359375,
      "learning_rate": 0.00013757225433526013,
      "loss": 1.5565,
      "step": 238
    },
    {
      "epoch": 0.1660442900564481,
      "grad_norm": 0.58984375,
      "learning_rate": 0.00013815028901734104,
      "loss": 2.0807,
      "step": 239
    },
    {
      "epoch": 0.16673903603994789,
      "grad_norm": 0.5078125,
      "learning_rate": 0.00013872832369942197,
      "loss": 1.7981,
      "step": 240
    },
    {
      "epoch": 0.16743378202344766,
      "grad_norm": 0.578125,
      "learning_rate": 0.0001393063583815029,
      "loss": 1.6482,
      "step": 241
    },
    {
      "epoch": 0.16812852800694747,
      "grad_norm": 0.8046875,
      "learning_rate": 0.00013988439306358383,
      "loss": 1.8768,
      "step": 242
    },
    {
      "epoch": 0.16882327399044725,
      "grad_norm": 1.328125,
      "learning_rate": 0.00014046242774566473,
      "loss": 1.6573,
      "step": 243
    },
    {
      "epoch": 0.16951801997394703,
      "grad_norm": 1.0546875,
      "learning_rate": 0.00014104046242774566,
      "loss": 1.6465,
      "step": 244
    },
    {
      "epoch": 0.1702127659574468,
      "grad_norm": 0.859375,
      "learning_rate": 0.0001416184971098266,
      "loss": 2.1823,
      "step": 245
    },
    {
      "epoch": 0.17090751194094658,
      "grad_norm": 1.078125,
      "learning_rate": 0.00014219653179190753,
      "loss": 2.1286,
      "step": 246
    },
    {
      "epoch": 0.17160225792444636,
      "grad_norm": 0.64453125,
      "learning_rate": 0.00014277456647398843,
      "loss": 2.0725,
      "step": 247
    },
    {
      "epoch": 0.17229700390794617,
      "grad_norm": 1.0,
      "learning_rate": 0.00014335260115606936,
      "loss": 1.9063,
      "step": 248
    },
    {
      "epoch": 0.17299174989144595,
      "grad_norm": 0.71484375,
      "learning_rate": 0.0001439306358381503,
      "loss": 2.1774,
      "step": 249
    },
    {
      "epoch": 0.17368649587494572,
      "grad_norm": 0.640625,
      "learning_rate": 0.00014450867052023122,
      "loss": 1.8539,
      "step": 250
    },
    {
      "epoch": 0.1743812418584455,
      "grad_norm": 0.89453125,
      "learning_rate": 0.00014508670520231215,
      "loss": 1.8404,
      "step": 251
    },
    {
      "epoch": 0.17507598784194528,
      "grad_norm": 0.84375,
      "learning_rate": 0.00014566473988439306,
      "loss": 1.8873,
      "step": 252
    },
    {
      "epoch": 0.17577073382544506,
      "grad_norm": 0.6953125,
      "learning_rate": 0.000146242774566474,
      "loss": 1.9706,
      "step": 253
    },
    {
      "epoch": 0.17646547980894486,
      "grad_norm": 0.94921875,
      "learning_rate": 0.00014682080924855492,
      "loss": 1.9709,
      "step": 254
    },
    {
      "epoch": 0.17716022579244464,
      "grad_norm": 0.765625,
      "learning_rate": 0.00014739884393063585,
      "loss": 1.8306,
      "step": 255
    },
    {
      "epoch": 0.17785497177594442,
      "grad_norm": 1.0,
      "learning_rate": 0.00014797687861271676,
      "loss": 2.1021,
      "step": 256
    },
    {
      "epoch": 0.1785497177594442,
      "grad_norm": 0.62890625,
      "learning_rate": 0.00014855491329479769,
      "loss": 2.1095,
      "step": 257
    },
    {
      "epoch": 0.17924446374294398,
      "grad_norm": 0.6328125,
      "learning_rate": 0.00014913294797687862,
      "loss": 2.0039,
      "step": 258
    },
    {
      "epoch": 0.17993920972644378,
      "grad_norm": 1.4765625,
      "learning_rate": 0.00014971098265895955,
      "loss": 1.7203,
      "step": 259
    },
    {
      "epoch": 0.18063395570994356,
      "grad_norm": 0.72265625,
      "learning_rate": 0.00015028901734104045,
      "loss": 1.6399,
      "step": 260
    },
    {
      "epoch": 0.18132870169344334,
      "grad_norm": 0.90625,
      "learning_rate": 0.00015086705202312138,
      "loss": 1.6332,
      "step": 261
    },
    {
      "epoch": 0.18202344767694312,
      "grad_norm": 1.3359375,
      "learning_rate": 0.00015144508670520231,
      "loss": 1.9613,
      "step": 262
    },
    {
      "epoch": 0.1827181936604429,
      "grad_norm": 1.2265625,
      "learning_rate": 0.00015202312138728325,
      "loss": 1.7818,
      "step": 263
    },
    {
      "epoch": 0.18341293964394267,
      "grad_norm": 1.5625,
      "learning_rate": 0.00015260115606936415,
      "loss": 2.2765,
      "step": 264
    },
    {
      "epoch": 0.18410768562744248,
      "grad_norm": 1.625,
      "learning_rate": 0.00015317919075144508,
      "loss": 1.7086,
      "step": 265
    },
    {
      "epoch": 0.18480243161094226,
      "grad_norm": 1.203125,
      "learning_rate": 0.000153757225433526,
      "loss": 1.7275,
      "step": 266
    },
    {
      "epoch": 0.18549717759444204,
      "grad_norm": 0.7890625,
      "learning_rate": 0.00015433526011560694,
      "loss": 2.0655,
      "step": 267
    },
    {
      "epoch": 0.18619192357794181,
      "grad_norm": 1.1796875,
      "learning_rate": 0.00015491329479768785,
      "loss": 1.5989,
      "step": 268
    },
    {
      "epoch": 0.1868866695614416,
      "grad_norm": 1.546875,
      "learning_rate": 0.00015549132947976878,
      "loss": 1.1796,
      "step": 269
    },
    {
      "epoch": 0.18758141554494137,
      "grad_norm": 0.859375,
      "learning_rate": 0.0001560693641618497,
      "loss": 1.8115,
      "step": 270
    },
    {
      "epoch": 0.18827616152844118,
      "grad_norm": 1.359375,
      "learning_rate": 0.00015664739884393064,
      "loss": 1.9066,
      "step": 271
    },
    {
      "epoch": 0.18897090751194096,
      "grad_norm": 1.1875,
      "learning_rate": 0.00015722543352601157,
      "loss": 2.0165,
      "step": 272
    },
    {
      "epoch": 0.18966565349544073,
      "grad_norm": 1.4765625,
      "learning_rate": 0.00015780346820809248,
      "loss": 2.3133,
      "step": 273
    },
    {
      "epoch": 0.1903603994789405,
      "grad_norm": 1.1953125,
      "learning_rate": 0.0001583815028901734,
      "loss": 2.0332,
      "step": 274
    },
    {
      "epoch": 0.1910551454624403,
      "grad_norm": 3.734375,
      "learning_rate": 0.00015895953757225434,
      "loss": 1.9067,
      "step": 275
    },
    {
      "epoch": 0.19174989144594007,
      "grad_norm": 0.83203125,
      "learning_rate": 0.00015953757225433527,
      "loss": 1.9406,
      "step": 276
    },
    {
      "epoch": 0.19244463742943987,
      "grad_norm": 0.66796875,
      "learning_rate": 0.00016011560693641617,
      "loss": 1.8024,
      "step": 277
    },
    {
      "epoch": 0.19313938341293965,
      "grad_norm": 0.76953125,
      "learning_rate": 0.0001606936416184971,
      "loss": 1.803,
      "step": 278
    },
    {
      "epoch": 0.19383412939643943,
      "grad_norm": 0.84375,
      "learning_rate": 0.00016127167630057803,
      "loss": 2.1169,
      "step": 279
    },
    {
      "epoch": 0.1945288753799392,
      "grad_norm": 0.75390625,
      "learning_rate": 0.00016184971098265897,
      "loss": 1.8095,
      "step": 280
    },
    {
      "epoch": 0.195223621363439,
      "grad_norm": 0.671875,
      "learning_rate": 0.0001624277456647399,
      "loss": 2.1282,
      "step": 281
    },
    {
      "epoch": 0.19591836734693877,
      "grad_norm": 0.92578125,
      "learning_rate": 0.0001630057803468208,
      "loss": 1.7845,
      "step": 282
    },
    {
      "epoch": 0.19661311333043857,
      "grad_norm": 1.15625,
      "learning_rate": 0.00016358381502890173,
      "loss": 1.8735,
      "step": 283
    },
    {
      "epoch": 0.19730785931393835,
      "grad_norm": 1.2265625,
      "learning_rate": 0.00016416184971098266,
      "loss": 1.9958,
      "step": 284
    },
    {
      "epoch": 0.19800260529743813,
      "grad_norm": 0.796875,
      "learning_rate": 0.0001647398843930636,
      "loss": 2.1224,
      "step": 285
    },
    {
      "epoch": 0.1986973512809379,
      "grad_norm": 1.0390625,
      "learning_rate": 0.00016531791907514452,
      "loss": 2.1101,
      "step": 286
    },
    {
      "epoch": 0.19939209726443768,
      "grad_norm": 0.69140625,
      "learning_rate": 0.00016589595375722543,
      "loss": 2.1356,
      "step": 287
    },
    {
      "epoch": 0.20008684324793746,
      "grad_norm": 1.2109375,
      "learning_rate": 0.00016647398843930636,
      "loss": 1.6759,
      "step": 288
    },
    {
      "epoch": 0.20078158923143727,
      "grad_norm": 0.94140625,
      "learning_rate": 0.0001670520231213873,
      "loss": 2.1158,
      "step": 289
    },
    {
      "epoch": 0.20147633521493705,
      "grad_norm": 0.66796875,
      "learning_rate": 0.00016763005780346822,
      "loss": 2.1826,
      "step": 290
    },
    {
      "epoch": 0.20217108119843683,
      "grad_norm": 0.91015625,
      "learning_rate": 0.00016820809248554915,
      "loss": 1.8656,
      "step": 291
    },
    {
      "epoch": 0.2028658271819366,
      "grad_norm": 1.03125,
      "learning_rate": 0.00016878612716763006,
      "loss": 2.2818,
      "step": 292
    },
    {
      "epoch": 0.20356057316543638,
      "grad_norm": 0.7109375,
      "learning_rate": 0.000169364161849711,
      "loss": 1.6402,
      "step": 293
    },
    {
      "epoch": 0.20425531914893616,
      "grad_norm": 5.40625,
      "learning_rate": 0.00016994219653179192,
      "loss": 1.8538,
      "step": 294
    },
    {
      "epoch": 0.20495006513243597,
      "grad_norm": 2.859375,
      "learning_rate": 0.00017052023121387285,
      "loss": 1.6352,
      "step": 295
    },
    {
      "epoch": 0.20564481111593574,
      "grad_norm": 0.9453125,
      "learning_rate": 0.00017109826589595378,
      "loss": 1.7506,
      "step": 296
    },
    {
      "epoch": 0.20633955709943552,
      "grad_norm": 0.78125,
      "learning_rate": 0.0001716763005780347,
      "loss": 1.889,
      "step": 297
    },
    {
      "epoch": 0.2070343030829353,
      "grad_norm": 0.703125,
      "learning_rate": 0.00017225433526011562,
      "loss": 2.0453,
      "step": 298
    },
    {
      "epoch": 0.20772904906643508,
      "grad_norm": 1.125,
      "learning_rate": 0.00017283236994219655,
      "loss": 2.182,
      "step": 299
    },
    {
      "epoch": 0.20842379504993486,
      "grad_norm": 0.73828125,
      "learning_rate": 0.00017341040462427748,
      "loss": 1.943,
      "step": 300
    },
    {
      "epoch": 0.20911854103343466,
      "grad_norm": 0.93359375,
      "learning_rate": 0.0001739884393063584,
      "loss": 1.9609,
      "step": 301
    },
    {
      "epoch": 0.20981328701693444,
      "grad_norm": 0.93359375,
      "learning_rate": 0.0001745664739884393,
      "loss": 1.8991,
      "step": 302
    },
    {
      "epoch": 0.21050803300043422,
      "grad_norm": 1.328125,
      "learning_rate": 0.00017514450867052024,
      "loss": 1.8573,
      "step": 303
    },
    {
      "epoch": 0.211202778983934,
      "grad_norm": 0.84375,
      "learning_rate": 0.00017572254335260118,
      "loss": 2.1771,
      "step": 304
    },
    {
      "epoch": 0.21189752496743378,
      "grad_norm": 0.9375,
      "learning_rate": 0.0001763005780346821,
      "loss": 1.9985,
      "step": 305
    },
    {
      "epoch": 0.21259227095093355,
      "grad_norm": 0.62890625,
      "learning_rate": 0.000176878612716763,
      "loss": 1.5911,
      "step": 306
    },
    {
      "epoch": 0.21328701693443336,
      "grad_norm": 0.91796875,
      "learning_rate": 0.00017745664739884394,
      "loss": 2.0792,
      "step": 307
    },
    {
      "epoch": 0.21398176291793314,
      "grad_norm": 0.8046875,
      "learning_rate": 0.00017803468208092487,
      "loss": 1.5303,
      "step": 308
    },
    {
      "epoch": 0.21467650890143292,
      "grad_norm": 0.55078125,
      "learning_rate": 0.0001786127167630058,
      "loss": 2.1808,
      "step": 309
    },
    {
      "epoch": 0.2153712548849327,
      "grad_norm": 0.84765625,
      "learning_rate": 0.00017919075144508673,
      "loss": 1.5205,
      "step": 310
    },
    {
      "epoch": 0.21606600086843247,
      "grad_norm": 0.96875,
      "learning_rate": 0.00017976878612716764,
      "loss": 1.8961,
      "step": 311
    },
    {
      "epoch": 0.21676074685193225,
      "grad_norm": 0.9921875,
      "learning_rate": 0.00018034682080924857,
      "loss": 1.634,
      "step": 312
    },
    {
      "epoch": 0.21745549283543206,
      "grad_norm": 1.1953125,
      "learning_rate": 0.0001809248554913295,
      "loss": 1.9757,
      "step": 313
    },
    {
      "epoch": 0.21815023881893184,
      "grad_norm": 1.2109375,
      "learning_rate": 0.00018150289017341043,
      "loss": 1.6177,
      "step": 314
    },
    {
      "epoch": 0.2188449848024316,
      "grad_norm": 0.76953125,
      "learning_rate": 0.00018208092485549134,
      "loss": 1.5741,
      "step": 315
    },
    {
      "epoch": 0.2195397307859314,
      "grad_norm": 0.61328125,
      "learning_rate": 0.00018265895953757227,
      "loss": 1.8792,
      "step": 316
    },
    {
      "epoch": 0.22023447676943117,
      "grad_norm": 0.984375,
      "learning_rate": 0.0001832369942196532,
      "loss": 1.973,
      "step": 317
    },
    {
      "epoch": 0.22092922275293095,
      "grad_norm": 1.0703125,
      "learning_rate": 0.00018381502890173413,
      "loss": 2.0379,
      "step": 318
    },
    {
      "epoch": 0.22162396873643075,
      "grad_norm": 0.84375,
      "learning_rate": 0.00018439306358381503,
      "loss": 1.9269,
      "step": 319
    },
    {
      "epoch": 0.22231871471993053,
      "grad_norm": 0.8359375,
      "learning_rate": 0.00018497109826589596,
      "loss": 2.2882,
      "step": 320
    },
    {
      "epoch": 0.2230134607034303,
      "grad_norm": 1.09375,
      "learning_rate": 0.0001855491329479769,
      "loss": 1.7531,
      "step": 321
    },
    {
      "epoch": 0.2237082066869301,
      "grad_norm": 0.84765625,
      "learning_rate": 0.00018612716763005783,
      "loss": 2.011,
      "step": 322
    },
    {
      "epoch": 0.22440295267042987,
      "grad_norm": 0.61328125,
      "learning_rate": 0.00018670520231213873,
      "loss": 1.7293,
      "step": 323
    },
    {
      "epoch": 0.22509769865392965,
      "grad_norm": 1.328125,
      "learning_rate": 0.00018728323699421966,
      "loss": 1.8176,
      "step": 324
    },
    {
      "epoch": 0.22579244463742945,
      "grad_norm": 0.75,
      "learning_rate": 0.0001878612716763006,
      "loss": 2.1176,
      "step": 325
    },
    {
      "epoch": 0.22648719062092923,
      "grad_norm": 0.859375,
      "learning_rate": 0.00018843930635838152,
      "loss": 1.9007,
      "step": 326
    },
    {
      "epoch": 0.227181936604429,
      "grad_norm": 0.7421875,
      "learning_rate": 0.00018901734104046245,
      "loss": 1.9553,
      "step": 327
    },
    {
      "epoch": 0.22787668258792879,
      "grad_norm": 2.78125,
      "learning_rate": 0.00018959537572254336,
      "loss": 1.7696,
      "step": 328
    },
    {
      "epoch": 0.22857142857142856,
      "grad_norm": 1.6484375,
      "learning_rate": 0.0001901734104046243,
      "loss": 2.1033,
      "step": 329
    },
    {
      "epoch": 0.22926617455492834,
      "grad_norm": 0.703125,
      "learning_rate": 0.00019075144508670522,
      "loss": 2.0961,
      "step": 330
    },
    {
      "epoch": 0.22996092053842815,
      "grad_norm": 1.0078125,
      "learning_rate": 0.00019132947976878615,
      "loss": 1.8456,
      "step": 331
    },
    {
      "epoch": 0.23065566652192793,
      "grad_norm": 0.81640625,
      "learning_rate": 0.00019190751445086706,
      "loss": 1.6038,
      "step": 332
    },
    {
      "epoch": 0.2313504125054277,
      "grad_norm": 0.625,
      "learning_rate": 0.000192485549132948,
      "loss": 1.9724,
      "step": 333
    },
    {
      "epoch": 0.23204515848892748,
      "grad_norm": 0.71484375,
      "learning_rate": 0.00019306358381502892,
      "loss": 1.9463,
      "step": 334
    },
    {
      "epoch": 0.23273990447242726,
      "grad_norm": 0.65234375,
      "learning_rate": 0.00019364161849710985,
      "loss": 1.8556,
      "step": 335
    },
    {
      "epoch": 0.23343465045592704,
      "grad_norm": 1.375,
      "learning_rate": 0.00019421965317919075,
      "loss": 2.1624,
      "step": 336
    },
    {
      "epoch": 0.23412939643942685,
      "grad_norm": 0.88671875,
      "learning_rate": 0.00019479768786127168,
      "loss": 1.8647,
      "step": 337
    },
    {
      "epoch": 0.23482414242292662,
      "grad_norm": 0.91796875,
      "learning_rate": 0.00019537572254335262,
      "loss": 1.5004,
      "step": 338
    },
    {
      "epoch": 0.2355188884064264,
      "grad_norm": 0.79296875,
      "learning_rate": 0.00019595375722543355,
      "loss": 2.1977,
      "step": 339
    },
    {
      "epoch": 0.23621363438992618,
      "grad_norm": 0.68359375,
      "learning_rate": 0.00019653179190751445,
      "loss": 1.8312,
      "step": 340
    },
    {
      "epoch": 0.23690838037342596,
      "grad_norm": 0.609375,
      "learning_rate": 0.00019710982658959538,
      "loss": 1.9086,
      "step": 341
    },
    {
      "epoch": 0.23760312635692574,
      "grad_norm": 1.453125,
      "learning_rate": 0.0001976878612716763,
      "loss": 1.7357,
      "step": 342
    },
    {
      "epoch": 0.23829787234042554,
      "grad_norm": 0.828125,
      "learning_rate": 0.00019826589595375724,
      "loss": 1.7354,
      "step": 343
    },
    {
      "epoch": 0.23899261832392532,
      "grad_norm": 0.8203125,
      "learning_rate": 0.00019884393063583815,
      "loss": 1.867,
      "step": 344
    },
    {
      "epoch": 0.2396873643074251,
      "grad_norm": 0.94921875,
      "learning_rate": 0.00019942196531791908,
      "loss": 1.9156,
      "step": 345
    },
    {
      "epoch": 0.24038211029092488,
      "grad_norm": 0.953125,
      "learning_rate": 0.0002,
      "loss": 1.9306,
      "step": 346
    },
    {
      "epoch": 0.24107685627442466,
      "grad_norm": 0.69140625,
      "learning_rate": 0.00019994963485268195,
      "loss": 2.1757,
      "step": 347
    },
    {
      "epoch": 0.24177160225792443,
      "grad_norm": 4.09375,
      "learning_rate": 0.0001998992697053639,
      "loss": 1.7667,
      "step": 348
    },
    {
      "epoch": 0.24246634824142424,
      "grad_norm": 0.875,
      "learning_rate": 0.00019984890455804585,
      "loss": 1.9543,
      "step": 349
    },
    {
      "epoch": 0.24316109422492402,
      "grad_norm": 1.234375,
      "learning_rate": 0.00019979853941072778,
      "loss": 1.5744,
      "step": 350
    },
    {
      "epoch": 0.2438558402084238,
      "grad_norm": 0.83203125,
      "learning_rate": 0.00019974817426340972,
      "loss": 1.9601,
      "step": 351
    },
    {
      "epoch": 0.24455058619192357,
      "grad_norm": 1.1015625,
      "learning_rate": 0.00019969780911609168,
      "loss": 2.0014,
      "step": 352
    },
    {
      "epoch": 0.24524533217542335,
      "grad_norm": 0.6484375,
      "learning_rate": 0.00019964744396877362,
      "loss": 1.8605,
      "step": 353
    },
    {
      "epoch": 0.24594007815892313,
      "grad_norm": 1.0859375,
      "learning_rate": 0.00019959707882145555,
      "loss": 1.9851,
      "step": 354
    },
    {
      "epoch": 0.24663482414242294,
      "grad_norm": 0.69921875,
      "learning_rate": 0.00019954671367413752,
      "loss": 2.1791,
      "step": 355
    },
    {
      "epoch": 0.24732957012592272,
      "grad_norm": 0.640625,
      "learning_rate": 0.00019949634852681945,
      "loss": 1.8084,
      "step": 356
    },
    {
      "epoch": 0.2480243161094225,
      "grad_norm": 0.78125,
      "learning_rate": 0.0001994459833795014,
      "loss": 1.9735,
      "step": 357
    },
    {
      "epoch": 0.24871906209292227,
      "grad_norm": 0.9609375,
      "learning_rate": 0.00019939561823218333,
      "loss": 2.0874,
      "step": 358
    },
    {
      "epoch": 0.24941380807642205,
      "grad_norm": 0.91015625,
      "learning_rate": 0.0001993452530848653,
      "loss": 2.0039,
      "step": 359
    },
    {
      "epoch": 0.25010855405992183,
      "grad_norm": 0.734375,
      "learning_rate": 0.00019929488793754723,
      "loss": 1.8483,
      "step": 360
    },
    {
      "epoch": 0.2508033000434216,
      "grad_norm": 0.97265625,
      "learning_rate": 0.00019924452279022916,
      "loss": 1.7729,
      "step": 361
    },
    {
      "epoch": 0.2514980460269214,
      "grad_norm": 0.90234375,
      "learning_rate": 0.00019919415764291113,
      "loss": 1.8132,
      "step": 362
    },
    {
      "epoch": 0.25219279201042116,
      "grad_norm": 1.015625,
      "learning_rate": 0.00019914379249559306,
      "loss": 1.5844,
      "step": 363
    },
    {
      "epoch": 0.252887537993921,
      "grad_norm": 1.4453125,
      "learning_rate": 0.000199093427348275,
      "loss": 2.1404,
      "step": 364
    },
    {
      "epoch": 0.2535822839774208,
      "grad_norm": 2.796875,
      "learning_rate": 0.00019904306220095693,
      "loss": 2.0058,
      "step": 365
    },
    {
      "epoch": 0.25427702996092055,
      "grad_norm": 0.8359375,
      "learning_rate": 0.0001989926970536389,
      "loss": 1.795,
      "step": 366
    },
    {
      "epoch": 0.25497177594442033,
      "grad_norm": 0.83203125,
      "learning_rate": 0.00019894233190632083,
      "loss": 2.0315,
      "step": 367
    },
    {
      "epoch": 0.2556665219279201,
      "grad_norm": 0.66796875,
      "learning_rate": 0.0001988919667590028,
      "loss": 1.8603,
      "step": 368
    },
    {
      "epoch": 0.2563612679114199,
      "grad_norm": 0.6171875,
      "learning_rate": 0.00019884160161168473,
      "loss": 2.0518,
      "step": 369
    },
    {
      "epoch": 0.25705601389491967,
      "grad_norm": 0.62890625,
      "learning_rate": 0.00019879123646436667,
      "loss": 1.4851,
      "step": 370
    },
    {
      "epoch": 0.25775075987841944,
      "grad_norm": 1.2265625,
      "learning_rate": 0.0001987408713170486,
      "loss": 1.5712,
      "step": 371
    },
    {
      "epoch": 0.2584455058619192,
      "grad_norm": 1.0390625,
      "learning_rate": 0.00019869050616973054,
      "loss": 2.2505,
      "step": 372
    },
    {
      "epoch": 0.259140251845419,
      "grad_norm": 0.93359375,
      "learning_rate": 0.0001986401410224125,
      "loss": 2.3709,
      "step": 373
    },
    {
      "epoch": 0.2598349978289188,
      "grad_norm": 0.671875,
      "learning_rate": 0.00019858977587509444,
      "loss": 2.1194,
      "step": 374
    },
    {
      "epoch": 0.26052974381241856,
      "grad_norm": 1.0546875,
      "learning_rate": 0.0001985394107277764,
      "loss": 1.5963,
      "step": 375
    },
    {
      "epoch": 0.2612244897959184,
      "grad_norm": 0.56640625,
      "learning_rate": 0.00019848904558045834,
      "loss": 2.1682,
      "step": 376
    },
    {
      "epoch": 0.26191923577941817,
      "grad_norm": 0.66015625,
      "learning_rate": 0.00019843868043314028,
      "loss": 1.848,
      "step": 377
    },
    {
      "epoch": 0.26261398176291795,
      "grad_norm": 0.55078125,
      "learning_rate": 0.00019838831528582222,
      "loss": 1.5301,
      "step": 378
    },
    {
      "epoch": 0.2633087277464177,
      "grad_norm": 0.83984375,
      "learning_rate": 0.00019833795013850415,
      "loss": 1.6342,
      "step": 379
    },
    {
      "epoch": 0.2640034737299175,
      "grad_norm": 0.8203125,
      "learning_rate": 0.00019828758499118611,
      "loss": 1.7481,
      "step": 380
    },
    {
      "epoch": 0.2646982197134173,
      "grad_norm": 0.875,
      "learning_rate": 0.00019823721984386805,
      "loss": 2.0733,
      "step": 381
    },
    {
      "epoch": 0.26539296569691706,
      "grad_norm": 0.85546875,
      "learning_rate": 0.00019818685469655001,
      "loss": 2.2002,
      "step": 382
    },
    {
      "epoch": 0.26608771168041684,
      "grad_norm": 4.0,
      "learning_rate": 0.00019813648954923195,
      "loss": 1.8585,
      "step": 383
    },
    {
      "epoch": 0.2667824576639166,
      "grad_norm": 0.83203125,
      "learning_rate": 0.0001980861244019139,
      "loss": 1.8138,
      "step": 384
    },
    {
      "epoch": 0.2674772036474164,
      "grad_norm": 1.234375,
      "learning_rate": 0.00019803575925459582,
      "loss": 2.0099,
      "step": 385
    },
    {
      "epoch": 0.2681719496309162,
      "grad_norm": 0.91015625,
      "learning_rate": 0.00019798539410727776,
      "loss": 1.9675,
      "step": 386
    },
    {
      "epoch": 0.26886669561441595,
      "grad_norm": 0.87109375,
      "learning_rate": 0.00019793502895995972,
      "loss": 1.6869,
      "step": 387
    },
    {
      "epoch": 0.2695614415979158,
      "grad_norm": 0.64453125,
      "learning_rate": 0.00019788466381264166,
      "loss": 1.9112,
      "step": 388
    },
    {
      "epoch": 0.27025618758141556,
      "grad_norm": 1.0859375,
      "learning_rate": 0.00019783429866532362,
      "loss": 1.7991,
      "step": 389
    },
    {
      "epoch": 0.27095093356491534,
      "grad_norm": 0.8828125,
      "learning_rate": 0.00019778393351800556,
|
"loss": 1.7384, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.2716456795484151, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.0001977335683706875, |
|
"loss": 1.763, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.2723404255319149, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.00019768320322336943, |
|
"loss": 1.837, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.2730351715154147, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 0.00019763283807605137, |
|
"loss": 1.9485, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.27372991749891445, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.00019758247292873333, |
|
"loss": 2.2162, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.27442466348241423, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.00019753210778141527, |
|
"loss": 2.1763, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.275119409465914, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 0.00019748174263409723, |
|
"loss": 1.8636, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.2758141554494138, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.00019743137748677917, |
|
"loss": 2.0849, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.27650890143291357, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.0001973810123394611, |
|
"loss": 2.1198, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.2772036474164134, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.00019733064719214304, |
|
"loss": 1.8171, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.2778983933999132, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.00019728028204482498, |
|
"loss": 2.1271, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.27859313938341296, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.00019722991689750694, |
|
"loss": 1.8004, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.27928788536691274, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.00019717955175018888, |
|
"loss": 2.0326, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.2799826313504125, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.00019712918660287084, |
|
"loss": 1.6586, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.2806773773339123, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00019707882145555278, |
|
"loss": 1.7802, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.28137212331741207, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.0001970284563082347, |
|
"loss": 1.9849, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.28206686930091185, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.00019697809116091665, |
|
"loss": 2.3161, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.2827616152844116, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.00019692772601359858, |
|
"loss": 1.7711, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.2834563612679114, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.00019687736086628055, |
|
"loss": 1.8916, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.2841511072514112, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.00019682699571896248, |
|
"loss": 2.0297, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.28484585323491096, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.00019677663057164445, |
|
"loss": 1.8274, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.2855405992184108, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00019672626542432638, |
|
"loss": 1.5261, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.2862353452019106, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.00019667590027700832, |
|
"loss": 1.9996, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.28693009118541035, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.00019662553512969026, |
|
"loss": 2.1384, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.28762483716891013, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 0.0001965751699823722, |
|
"loss": 1.6071, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.2883195831524099, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.00019652480483505416, |
|
"loss": 2.268, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.2890143291359097, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.0001964744396877361, |
|
"loss": 1.6901, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.28970907511940946, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 0.00019642407454041806, |
|
"loss": 2.0621, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.29040382110290924, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.0001963737093931, |
|
"loss": 1.7457, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.291098567086409, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.00019632334424578193, |
|
"loss": 1.8165, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.2917933130699088, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00019627297909846387, |
|
"loss": 1.8413, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.2924880590534086, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.0001962226139511458, |
|
"loss": 2.0921, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.29318280503690836, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.00019617224880382777, |
|
"loss": 1.9443, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.2938775510204082, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 0.0001961218836565097, |
|
"loss": 1.8594, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.29457229700390797, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 0.00019607151850919166, |
|
"loss": 1.8851, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.29526704298740775, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.0001960211533618736, |
|
"loss": 2.0707, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.2959617889709075, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.00019597078821455554, |
|
"loss": 2.2316, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.2966565349544073, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.00019592042306723747, |
|
"loss": 1.8408, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.2973512809379071, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.0001958700579199194, |
|
"loss": 1.893, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.29804602692140686, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.00019581969277260137, |
|
"loss": 1.929, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.29874077290490664, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0001957693276252833, |
|
"loss": 1.9385, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.2994355188884064, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.00019571896247796527, |
|
"loss": 1.9914, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.3001302648719062, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.0001956685973306472, |
|
"loss": 1.7315, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.30082501085540597, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.00019561823218332915, |
|
"loss": 1.5899, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.30151975683890575, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00019556786703601108, |
|
"loss": 2.0665, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.3022145028224056, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.00019551750188869305, |
|
"loss": 1.7969, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.30290924880590536, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.00019546713674137498, |
|
"loss": 1.8387, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.30360399478940514, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.00019541677159405692, |
|
"loss": 1.8154, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.3042987407729049, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.00019536640644673888, |
|
"loss": 1.9395, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.3049934867564047, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00019531604129942082, |
|
"loss": 1.8653, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.3056882327399045, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.00019526567615210275, |
|
"loss": 1.9524, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.30638297872340425, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.0001952153110047847, |
|
"loss": 1.5947, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.30707772470690403, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 0.00019516494585746665, |
|
"loss": 2.1353, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.3077724706904038, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.0001951145807101486, |
|
"loss": 1.6529, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.3084672166739036, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.00019506421556283053, |
|
"loss": 1.9062, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.30916196265740337, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.0001950138504155125, |
|
"loss": 2.128, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.30985670864090314, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.0001949634852681944, |
|
"loss": 1.9101, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.310551454624403, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00019491312012087636, |
|
"loss": 1.6201, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.31124620060790276, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.0001948627549735583, |
|
"loss": 2.1286, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.31194094659140253, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.00019481238982624026, |
|
"loss": 1.7674, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.3126356925749023, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.0001947620246789222, |
|
"loss": 2.2924, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.3133304385584021, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00019471165953160413, |
|
"loss": 2.1334, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.31402518454190187, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 0.0001946612943842861, |
|
"loss": 2.1055, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.31471993052540165, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.000194610929236968, |
|
"loss": 2.0685, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.3154146765089014, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.00019456056408964997, |
|
"loss": 2.2465, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.3161094224924012, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 0.0001945101989423319, |
|
"loss": 1.4865, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.316804168475901, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 0.00019445983379501387, |
|
"loss": 1.9726, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.31749891445940076, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 0.0001944094686476958, |
|
"loss": 1.8845, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.31819366044290054, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.00019435910350037774, |
|
"loss": 1.9736, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.3188884064264004, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.0001943087383530597, |
|
"loss": 2.0074, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.31958315240990015, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.00019425837320574162, |
|
"loss": 2.0931, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.32027789839339993, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.00019420800805842358, |
|
"loss": 1.9506, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.3209726443768997, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 0.00019415764291110552, |
|
"loss": 2.0672, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.3216673903603995, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.00019410727776378748, |
|
"loss": 2.1114, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.32236213634389926, |
|
"grad_norm": 1.4296875, |
|
"learning_rate": 0.00019405691261646942, |
|
"loss": 1.8596, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.32305688232739904, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.00019400654746915138, |
|
"loss": 2.2623, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.3237516283108988, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00019395618232183331, |
|
"loss": 1.9356, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.3244463742943986, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.00019390581717451522, |
|
"loss": 2.1368, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.3251411202778984, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.0001938554520271972, |
|
"loss": 1.9551, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.32583586626139815, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.00019380508687987912, |
|
"loss": 1.7151, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.32653061224489793, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0001937547217325611, |
|
"loss": 1.6639, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.32722535822839777, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.00019370435658524302, |
|
"loss": 1.9154, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.32792010421189755, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 0.000193653991437925, |
|
"loss": 1.6919, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.3286148501953973, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.00019360362629060692, |
|
"loss": 2.0988, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.3293095961788971, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.00019355326114328883, |
|
"loss": 2.12, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.3300043421623969, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.0001935028959959708, |
|
"loss": 2.1701, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.33069908814589666, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.00019345253084865273, |
|
"loss": 2.0642, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.33139383412939644, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 0.0001934021657013347, |
|
"loss": 1.9536, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.3320885801128962, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.00019335180055401663, |
|
"loss": 2.023, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.332783326096396, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0001933014354066986, |
|
"loss": 1.9748, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.33347807207989577, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 0.0001932510702593805, |
|
"loss": 1.7722, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.33417281806339555, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.00019320070511206244, |
|
"loss": 2.1057, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.3348675640468953, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0001931503399647444, |
|
"loss": 1.6782, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.33556231003039516, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.00019309997481742634, |
|
"loss": 2.1179, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.33625705601389494, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.0001930496096701083, |
|
"loss": 2.192, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.3369518019973947, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.00019299924452279024, |
|
"loss": 1.8594, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.3376465479808945, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.0001929488793754722, |
|
"loss": 1.9312, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.3383412939643943, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.0001928985142281541, |
|
"loss": 1.9013, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.33903603994789405, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 0.00019284814908083605, |
|
"loss": 1.9257, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.33973078593139383, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.000192797783933518, |
|
"loss": 2.0159, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.3404255319148936, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.00019274741878619995, |
|
"loss": 1.5344, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.3411202778983934, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0001926970536388819, |
|
"loss": 1.5615, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.34181502388189317, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.00019264668849156385, |
|
"loss": 1.8554, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.34250976986539294, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.0001925963233442458, |
|
"loss": 1.8949, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.3432045158488927, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.00019254595819692772, |
|
"loss": 1.8137, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.34389926183239256, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.00019249559304960968, |
|
"loss": 2.0595, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.34459400781589233, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.00019244522790229162, |
|
"loss": 2.2088, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.3452887537993921, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.00019239486275497356, |
|
"loss": 1.789, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.3459834997828919, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 0.00019234449760765552, |
|
"loss": 1.7053, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.34667824576639167, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.00019229413246033746, |
|
"loss": 1.985, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.34737299174989145, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.00019224376731301942, |
|
"loss": 2.0731, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3480677377333912, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.00019219340216570133, |
|
"loss": 2.3603, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 0.348762483716891, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.0001921430370183833, |
|
"loss": 2.2065, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.3494572297003908, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.00019209267187106523, |
|
"loss": 2.1569, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 0.35015197568389056, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.00019204230672374717, |
|
"loss": 1.8459, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.35084672166739034, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.00019199194157642913, |
|
"loss": 1.8065, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.3515414676508901, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.00019194157642911107, |
|
"loss": 2.2484, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.35223621363438995, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.00019189121128179303, |
|
"loss": 1.9146, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 0.35293095961788973, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.00019184084613447494, |
|
"loss": 1.7471, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.3536257056013895, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.0001917904809871569, |
|
"loss": 1.7203, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 0.3543204515848893, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.00019174011583983884, |
|
"loss": 2.0287, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.35501519756838906, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.00019168975069252077, |
|
"loss": 1.7405, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 0.35570994355188884, |
|
"grad_norm": 1.71875, |
|
"learning_rate": 0.00019163938554520274, |
|
"loss": 1.7935, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.3564046895353886, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.00019158902039788467, |
|
"loss": 1.7058, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 0.3570994355188884, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.0001915386552505666, |
|
"loss": 1.6631, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.3577941815023882, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.00019148829010324855, |
|
"loss": 2.0297, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.35848892748588795, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0001914379249559305, |
|
"loss": 1.3673, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.35918367346938773, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.00019138755980861245, |
|
"loss": 1.7896, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 0.35987841945288757, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.00019133719466129438, |
|
"loss": 2.2388, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.36057316543638734, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00019128682951397635, |
|
"loss": 1.8827, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 0.3612679114198871, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 0.00019123646436665828, |
|
"loss": 1.8957, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.3619626574033869, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 0.00019118609921934022, |
|
"loss": 1.666, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 0.3626574033868867, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.00019113573407202215, |
|
"loss": 1.6648, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.36335214937038646, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.00019108536892470412, |
|
"loss": 1.9235, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 0.36404689535388624, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.00019103500377738605, |
|
"loss": 2.1473, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.364741641337386, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.000190984638630068, |
|
"loss": 2.0953, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.3654363873208858, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.00019093427348274995, |
|
"loss": 1.8025, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.36613113330438557, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.0001908839083354319, |
|
"loss": 2.0172, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 0.36682587928788535, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.00019083354318811383, |
|
"loss": 1.7116, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.3675206252713851, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.00019078317804079576, |
|
"loss": 2.0673, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 0.36821537125488496, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.00019073281289347773, |
|
"loss": 2.1515, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.36891011723838474, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.00019068244774615966, |
|
"loss": 2.1945, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 0.3696048632218845, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.00019063208259884163, |
|
"loss": 1.9585, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.3702996092053843, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.00019058171745152356, |
|
"loss": 2.1528, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 0.3709943551888841, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.0001905313523042055, |
|
"loss": 1.8423, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.37168910117238385, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.00019048098715688743, |
|
"loss": 1.9329, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.37238384715588363, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.00019043062200956937, |
|
"loss": 1.7946, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.3730785931393834, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.00019038025686225133, |
|
"loss": 2.1579, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 0.3737733391228832, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.00019032989171493327, |
|
"loss": 1.9882, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.37446808510638296, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.00019027952656761523, |
|
"loss": 1.6992, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 0.37516283108988274, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.00019022916142029717, |
|
"loss": 2.003, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.3758575770733825, |
|
"grad_norm": 1.5, |
|
"learning_rate": 0.0001901787962729791, |
|
"loss": 2.1709, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 0.37655232305688235, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.00019012843112566104, |
|
"loss": 1.9809, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 0.37724706904038213, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.00019007806597834298, |
|
"loss": 1.956, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 0.3779418150238819, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 0.00019002770083102494, |
|
"loss": 1.619, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 0.3786365610073817, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.00018997733568370688, |
|
"loss": 1.7824, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.37933130699088147, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 0.00018992697053638884, |
|
"loss": 1.885, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 0.38002605297438125, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.00018987660538907078, |
|
"loss": 2.0227, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 0.380720798957881, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.00018982624024175272, |
|
"loss": 1.7396, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 0.3814155449413808, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.00018977587509443465, |
|
"loss": 1.8219, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 0.3821102909248806, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 0.0001897255099471166, |
|
"loss": 2.2175, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.38280503690838036, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.00018967514479979855, |
|
"loss": 1.7872, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 0.38349978289188014, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 0.0001896247796524805, |
|
"loss": 1.6591, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 0.3841945288753799, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.00018957441450516245, |
|
"loss": 2.0484, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 0.38488927485887975, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.0001895240493578444, |
|
"loss": 1.8777, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 0.3855840208423795, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 0.00018947368421052632, |
|
"loss": 2.1238, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.3862787668258793, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00018942331906320826, |
|
"loss": 1.9607, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 0.3869735128093791, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.0001893729539158902, |
|
"loss": 1.6038, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 0.38766825879287886, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.00018932258876857216, |
|
"loss": 1.7207, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 0.38836300477637864, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.0001892722236212541, |
|
"loss": 1.94, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 0.3890577507598784, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 0.00018922185847393606, |
|
"loss": 1.6762, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.3897524967433782, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.000189171493326618, |
|
"loss": 1.8562, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 0.390447242726878, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.00018912112817929993, |
|
"loss": 2.252, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 0.39114198871037775, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.00018907076303198187, |
|
"loss": 1.7797, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 0.39183673469387753, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.0001890203978846638, |
|
"loss": 2.028, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 0.3925314806773773, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.00018897003273734577, |
|
"loss": 1.9406, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.39322622666087714, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.0001889196675900277, |
|
"loss": 1.4522, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 0.3939209726443769, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 0.00018886930244270967, |
|
"loss": 2.1899, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 0.3946157186278767, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.0001888189372953916, |
|
"loss": 2.0125, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 0.3953104646113765, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00018876857214807354, |
|
"loss": 1.8498, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 0.39600521059487626, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 0.00018871820700075548, |
|
"loss": 2.1336, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.39669995657837603, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 0.0001886678418534374, |
|
"loss": 1.9395, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 0.3973947025618758, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.00018861747670611938, |
|
"loss": 1.9798, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 0.3980894485453756, |
|
"grad_norm": 2.375, |
|
"learning_rate": 0.0001885671115588013, |
|
"loss": 1.9257, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 0.39878419452887537, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.00018851674641148328, |
|
"loss": 1.7834, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 0.39947894051237515, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.0001884663812641652, |
|
"loss": 2.0443, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.4001736864958749, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.00018841601611684715, |
|
"loss": 1.6684, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 0.4008684324793747, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.00018836565096952908, |
|
"loss": 1.6101, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 0.40156317846287454, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00018831528582221102, |
|
"loss": 1.5633, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 0.4022579244463743, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.00018826492067489298, |
|
"loss": 1.9582, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 0.4029526704298741, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.00018821455552757492, |
|
"loss": 2.1624, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.40364741641337387, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.00018816419038025688, |
|
"loss": 1.8846, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 0.40434216239687365, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.00018811382523293882, |
|
"loss": 1.6138, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 0.40503690838037343, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.00018806346008562076, |
|
"loss": 1.6372, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 0.4057316543638732, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0001880130949383027, |
|
"loss": 2.1048, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 0.406426400347373, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.00018796272979098463, |
|
"loss": 2.3389, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.40712114633087276, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.0001879123646436666, |
|
"loss": 2.0405, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 0.40781589231437254, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.00018786199949634853, |
|
"loss": 1.8406, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 0.4085106382978723, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 0.0001878116343490305, |
|
"loss": 2.2409, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 0.4092053842813721, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 0.00018776126920171243, |
|
"loss": 1.8545, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 0.40990013026487193, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00018771090405439437, |
|
"loss": 1.5796, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.4105948762483717, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.0001876605389070763, |
|
"loss": 1.3451, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 0.4112896222318715, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 0.00018761017375975824, |
|
"loss": 2.2266, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 0.41198436821537127, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.0001875598086124402, |
|
"loss": 1.9017, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 0.41267911419887104, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 0.00018750944346512214, |
|
"loss": 1.6085, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 0.4133738601823708, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0001874590783178041, |
|
"loss": 2.0503, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.4140686061658706, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.00018740871317048604, |
|
"loss": 1.9612, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 0.4147633521493704, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00018735834802316797, |
|
"loss": 1.6432, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 0.41545809813287016, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.0001873079828758499, |
|
"loss": 1.855, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 0.41615284411636994, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.00018725761772853187, |
|
"loss": 1.7901, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 0.4168475900998697, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.0001872072525812138, |
|
"loss": 1.6167, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.4175423360833695, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.00018715688743389575, |
|
"loss": 1.6097, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 0.4182370820668693, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.0001871065222865777, |
|
"loss": 1.8758, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 0.4189318280503691, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.00018705615713925965, |
|
"loss": 1.8387, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 0.4196265740338689, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.00018700579199194158, |
|
"loss": 1.5821, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 0.42032132001736866, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.00018695542684462352, |
|
"loss": 1.8596, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.42101606600086844, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.00018690506169730548, |
|
"loss": 1.6423, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 0.4217108119843682, |
|
"grad_norm": 0.640625, |
|
"learning_rate": 0.00018685469654998742, |
|
"loss": 1.9369, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 0.422405557967868, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.00018680433140266935, |
|
"loss": 1.7383, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 0.4231003039513678, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.00018675396625535132, |
|
"loss": 1.5791, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 0.42379504993486755, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.00018670360110803325, |
|
"loss": 1.8138, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.42448979591836733, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.0001866532359607152, |
|
"loss": 1.8374, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 0.4251845419018671, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.00018660287081339713, |
|
"loss": 2.1108, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 0.4258792878853669, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.0001865525056660791, |
|
"loss": 1.7101, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 0.4265740338688667, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.00018650214051876103, |
|
"loss": 1.8065, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 0.4272687798523665, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00018645177537144296, |
|
"loss": 1.7606, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.4279635258358663, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 0.00018640141022412493, |
|
"loss": 1.9831, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.42865827181936605, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.00018635104507680686, |
|
"loss": 2.1788, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 0.42935301780286583, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.0001863006799294888, |
|
"loss": 1.7741, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 0.4300477637863656, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00018625031478217073, |
|
"loss": 1.6123, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 0.4307425097698654, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.0001861999496348527, |
|
"loss": 2.0256, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.43143725575336517, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.00018614958448753463, |
|
"loss": 2.1769, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 0.43213200173686495, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.00018609921934021657, |
|
"loss": 1.765, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 0.4328267477203647, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00018604885419289853, |
|
"loss": 2.0414, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 0.4335214937038645, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.00018599848904558047, |
|
"loss": 1.823, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 0.4342162396873643, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.0001859481238982624, |
|
"loss": 1.9846, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.4349109856708641, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00018589775875094434, |
|
"loss": 2.1152, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 0.4356057316543639, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.0001858473936036263, |
|
"loss": 1.8232, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 0.43630047763786367, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 0.00018579702845630824, |
|
"loss": 1.4993, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 0.43699522362136345, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.0001857466633089902, |
|
"loss": 1.7799, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 0.4376899696048632, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.00018569629816167214, |
|
"loss": 1.5612, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.438384715588363, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.00018564593301435408, |
|
"loss": 2.114, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 0.4390794615718628, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.00018559556786703602, |
|
"loss": 1.8577, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 0.43977420755536256, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.00018554520271971795, |
|
"loss": 1.9846, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 0.44046895353886234, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 0.00018549483757239991, |
|
"loss": 1.9394, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 0.4411636995223621, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.00018544447242508185, |
|
"loss": 1.6845, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.4418584455058619, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.00018539410727776381, |
|
"loss": 1.9656, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 0.4425531914893617, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.00018534374213044575, |
|
"loss": 2.2499, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 0.4432479374728615, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.0001852933769831277, |
|
"loss": 1.9494, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 0.4439426834563613, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.00018524301183580962, |
|
"loss": 1.2849, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 0.44463742943986106, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.00018519264668849156, |
|
"loss": 1.8238, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.44533217542336084, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.00018514228154117352, |
|
"loss": 1.8664, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 0.4460269214068606, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.00018509191639385546, |
|
"loss": 1.8457, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 0.4467216673903604, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.00018504155124653742, |
|
"loss": 1.9668, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 0.4474164133738602, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.00018499118609921936, |
|
"loss": 1.5451, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 0.44811115935735996, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.0001849408209519013, |
|
"loss": 2.2038, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.44880590534085973, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.00018489045580458323, |
|
"loss": 1.7649, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 0.4495006513243595, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 0.00018484009065726517, |
|
"loss": 1.9661, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 0.4501953973078593, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00018478972550994713, |
|
"loss": 1.7701, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 0.4508901432913591, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.00018473936036262907, |
|
"loss": 1.7554, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 0.4515848892748589, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 0.00018468899521531103, |
|
"loss": 1.9016, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.4522796352583587, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00018463863006799297, |
|
"loss": 1.9223, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 0.45297438124185846, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.0001845882649206749, |
|
"loss": 1.7449, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 0.45366912722535824, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.00018453789977335684, |
|
"loss": 1.9779, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 0.454363873208858, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.00018448753462603878, |
|
"loss": 1.7003, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 0.4550586191923578, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.00018443716947872074, |
|
"loss": 1.6883, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.45575336517585757, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.00018438680433140268, |
|
"loss": 1.9557, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 0.45644811115935735, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 0.00018433643918408464, |
|
"loss": 2.0112, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 0.45714285714285713, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.00018428607403676658, |
|
"loss": 2.0644, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 0.4578376031263569, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.0001842357088894485, |
|
"loss": 2.2675, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 0.4585323491098567, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.00018418534374213045, |
|
"loss": 2.0472, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.4592270950933565, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.00018413497859481238, |
|
"loss": 1.7624, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 0.4599218410768563, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.00018408461344749435, |
|
"loss": 1.9001, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 0.4606165870603561, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 0.00018403424830017628, |
|
"loss": 2.1622, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 0.46131133304385585, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.00018398388315285825, |
|
"loss": 2.0592, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 0.46200607902735563, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00018393351800554018, |
|
"loss": 1.97, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.4627008250108554, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.00018388315285822212, |
|
"loss": 1.7164, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 0.4633955709943552, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.00018383278771090406, |
|
"loss": 2.0458, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 0.46409031697785497, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.000183782422563586, |
|
"loss": 2.0149, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 0.46478506296135474, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 0.00018373205741626796, |
|
"loss": 2.3638, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 0.4654798089448545, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.0001836816922689499, |
|
"loss": 2.0574, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.4661745549283543, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.00018363132712163186, |
|
"loss": 1.7857, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 0.4668693009118541, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0001835809619743138, |
|
"loss": 1.8794, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 0.4675640468953539, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.00018353059682699573, |
|
"loss": 1.7081, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 0.4682587928788537, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00018348023167967767, |
|
"loss": 1.5881, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 0.46895353886235347, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.0001834298665323596, |
|
"loss": 2.0648, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.46964828484585325, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.00018337950138504156, |
|
"loss": 1.9419, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 0.470343030829353, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.0001833291362377235, |
|
"loss": 2.1462, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 0.4710377768128528, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 0.00018327877109040546, |
|
"loss": 2.0603, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 0.4717325227963526, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.0001832284059430874, |
|
"loss": 1.9721, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 0.47242726877985236, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 0.00018317804079576934, |
|
"loss": 1.5208, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.47312201476335214, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.00018312767564845127, |
|
"loss": 2.3462, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 0.4738167607468519, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.0001830773105011332, |
|
"loss": 1.8435, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 0.4745115067303517, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.00018302694535381517, |
|
"loss": 2.2692, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 0.4752062527138515, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.0001829765802064971, |
|
"loss": 2.1077, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 0.4759009986973513, |
|
"grad_norm": 0.796875, |
|
"learning_rate": 0.00018292621505917907, |
|
"loss": 1.9386, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.4765957446808511, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.000182875849911861, |
|
"loss": 1.7375, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 0.47729049066435086, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.00018282548476454295, |
|
"loss": 1.8864, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 0.47798523664785064, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.00018277511961722488, |
|
"loss": 2.2137, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 0.4786799826313504, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.00018272475446990682, |
|
"loss": 2.0135, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 0.4793747286148502, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.00018267438932258878, |
|
"loss": 1.9841, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.48006947459835, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 0.00018262402417527072, |
|
"loss": 1.8652, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 0.48076422058184975, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.00018257365902795268, |
|
"loss": 1.533, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 0.48145896656534953, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.00018252329388063462, |
|
"loss": 1.7846, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 0.4821537125488493, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.00018247292873331655, |
|
"loss": 1.8461, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 0.4828484585323491, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 0.0001824225635859985, |
|
"loss": 1.8622, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.48354320451584887, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.00018237219843868045, |
|
"loss": 2.1919, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 0.4842379504993487, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.0001823218332913624, |
|
"loss": 1.9689, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 0.4849326964828485, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 0.00018227146814404433, |
|
"loss": 1.5199, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 0.48562744246634826, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 0.0001822211029967263, |
|
"loss": 2.2635, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 0.48632218844984804, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.00018217073784940823, |
|
"loss": 2.2775, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.4870169344333478, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.00018212037270209016, |
|
"loss": 1.9399, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 0.4877116804168476, |
|
"grad_norm": 2.9375, |
|
"learning_rate": 0.0001820700075547721, |
|
"loss": 2.0305, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 0.48840642640034737, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.00018201964240745406, |
|
"loss": 2.098, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 0.48910117238384715, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.000181969277260136, |
|
"loss": 2.0633, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 0.4897959183673469, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 0.00018191891211281793, |
|
"loss": 1.8335, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.4904906643508467, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.0001818685469654999, |
|
"loss": 1.4018, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 0.4911854103343465, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.00018181818181818183, |
|
"loss": 1.5096, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 0.49188015631784626, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.00018176781667086377, |
|
"loss": 2.0383, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 0.4925749023013461, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.0001817174515235457, |
|
"loss": 2.0676, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 0.4932696482848459, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 0.00018166708637622767, |
|
"loss": 2.1933, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.49396439426834565, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 0.0001816167212289096, |
|
"loss": 1.7498, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 0.49465914025184543, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.00018156635608159154, |
|
"loss": 1.9815, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 0.4953538862353452, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.0001815159909342735, |
|
"loss": 2.2162, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 0.496048632218845, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 0.00018146562578695542, |
|
"loss": 1.8575, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 0.49674337820234477, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.00018141526063963738, |
|
"loss": 1.8807, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.49743812418584454, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00018136489549231932, |
|
"loss": 1.1918, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 0.4981328701693443, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.00018131453034500128, |
|
"loss": 2.1739, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 0.4988276161528441, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00018126416519768321, |
|
"loss": 1.9931, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 0.4995223621363439, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 0.00018121380005036515, |
|
"loss": 1.8006, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 0.5002171081198437, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.00018116343490304711, |
|
"loss": 1.9731, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.5009118541033435, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.00018111306975572902, |
|
"loss": 1.9277, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 0.5016066000868432, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 0.000181062704608411, |
|
"loss": 1.8397, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 0.502301346070343, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.00018101233946109292, |
|
"loss": 1.6404, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 0.5029960920538428, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0001809619743137749, |
|
"loss": 1.6856, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 0.5036908380373426, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.00018091160916645682, |
|
"loss": 1.8246, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.5043855840208423, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.00018086124401913876, |
|
"loss": 1.9523, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 0.5050803300043422, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.00018081087887182072, |
|
"loss": 1.8332, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 0.505775075987842, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.00018076051372450263, |
|
"loss": 2.008, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 0.5064698219713417, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.0001807101485771846, |
|
"loss": 2.0791, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 0.5071645679548415, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.00018065978342986653, |
|
"loss": 2.2381, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.5078593139383413, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0001806094182825485, |
|
"loss": 1.6686, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 0.5085540599218411, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.00018055905313523043, |
|
"loss": 1.9747, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 0.5092488059053408, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.0001805086879879124, |
|
"loss": 1.8613, |
|
"step": 733 |
|
}, |
|
{ |
|
"epoch": 0.5099435518888407, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.00018045832284059433, |
|
"loss": 1.6721, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 0.5106382978723404, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00018040795769327624, |
|
"loss": 1.9698, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.5113330438558402, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 0.0001803575925459582, |
|
"loss": 1.9346, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 0.5120277898393399, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 0.00018030722739864014, |
|
"loss": 1.6338, |
|
"step": 737 |
|
}, |
|
{ |
|
"epoch": 0.5127225358228398, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.0001802568622513221, |
|
"loss": 1.7765, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 0.5134172818063396, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 0.00018020649710400404, |
|
"loss": 1.6058, |
|
"step": 739 |
|
}, |
|
{ |
|
"epoch": 0.5141120277898393, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.000180156131956686, |
|
"loss": 1.7684, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.5148067737733392, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 0.00018010576680936794, |
|
"loss": 1.7943, |
|
"step": 741 |
|
}, |
|
{ |
|
"epoch": 0.5155015197568389, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.00018005540166204985, |
|
"loss": 2.029, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 0.5161962657403387, |
|
"grad_norm": 13.1875, |
|
"learning_rate": 0.0001800050365147318, |
|
"loss": 2.0415, |
|
"step": 743 |
|
}, |
|
{ |
|
"epoch": 0.5168910117238384, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.00017995467136741375, |
|
"loss": 2.0677, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 0.5175857577073383, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.0001799043062200957, |
|
"loss": 1.7026, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.518280503690838, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.00017985394107277765, |
|
"loss": 1.7546, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 0.5189752496743378, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.0001798035759254596, |
|
"loss": 1.7259, |
|
"step": 747 |
|
}, |
|
{ |
|
"epoch": 0.5196699956578376, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.00017975321077814152, |
|
"loss": 1.8416, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 0.5203647416413374, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.00017970284563082346, |
|
"loss": 2.3549, |
|
"step": 749 |
|
}, |
|
{ |
|
"epoch": 0.5210594876248371, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 0.00017965248048350542, |
|
"loss": 1.573, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.521754233608337, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.00017960211533618736, |
|
"loss": 1.5468, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 0.5224489795918368, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.00017955175018886932, |
|
"loss": 1.8732, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 0.5231437255753365, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00017950138504155126, |
|
"loss": 1.8024, |
|
"step": 753 |
|
}, |
|
{ |
|
"epoch": 0.5238384715588363, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 0.00017945101989423322, |
|
"loss": 1.8361, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 0.5245332175423361, |
|
"grad_norm": 36.75, |
|
"learning_rate": 0.00017940065474691513, |
|
"loss": 2.3295, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.5252279635258359, |
|
"grad_norm": 2.25, |
|
"learning_rate": 0.00017935028959959707, |
|
"loss": 2.1014, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 0.5259227095093356, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.00017929992445227903, |
|
"loss": 1.6904, |
|
"step": 757 |
|
}, |
|
{ |
|
"epoch": 0.5266174554928355, |
|
"grad_norm": 3.25, |
|
"learning_rate": 0.00017924955930496097, |
|
"loss": 1.8936, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 0.5273122014763352, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.00017919919415764293, |
|
"loss": 2.0644, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 0.528006947459835, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.00017914882901032486, |
|
"loss": 1.9939, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.5287016934433347, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.00017909846386300683, |
|
"loss": 1.992, |
|
"step": 761 |
|
}, |
|
{ |
|
"epoch": 0.5293964394268346, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 0.00017904809871568874, |
|
"loss": 2.1399, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 0.5300911854103344, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.0001789977335683707, |
|
"loss": 1.4181, |
|
"step": 763 |
|
}, |
|
{ |
|
"epoch": 0.5307859313938341, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.00017894736842105264, |
|
"loss": 1.8635, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 0.531480677377334, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.00017889700327373457, |
|
"loss": 1.6296, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.5321754233608337, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.00017884663812641654, |
|
"loss": 2.0959, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 0.5328701693443335, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.00017879627297909847, |
|
"loss": 1.6697, |
|
"step": 767 |
|
}, |
|
{ |
|
"epoch": 0.5335649153278332, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.00017874590783178044, |
|
"loss": 2.4245, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 0.5342596613113331, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.00017869554268446235, |
|
"loss": 1.5763, |
|
"step": 769 |
|
}, |
|
{ |
|
"epoch": 0.5349544072948328, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.0001786451775371443, |
|
"loss": 1.9295, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.5356491532783326, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.00017859481238982625, |
|
"loss": 1.3931, |
|
"step": 771 |
|
}, |
|
{ |
|
"epoch": 0.5363438992618323, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 0.00017854444724250818, |
|
"loss": 2.1037, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 0.5370386452453322, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.00017849408209519015, |
|
"loss": 1.7615, |
|
"step": 773 |
|
}, |
|
{ |
|
"epoch": 0.5377333912288319, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.00017844371694787208, |
|
"loss": 1.7058, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 0.5384281372123317, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.00017839335180055405, |
|
"loss": 1.9699, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.5391228831958316, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.00017834298665323595, |
|
"loss": 1.9709, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 0.5398176291793313, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 0.00017829262150591792, |
|
"loss": 1.9188, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 0.5405123751628311, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00017824225635859985, |
|
"loss": 1.658, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 0.5412071211463308, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.0001781918912112818, |
|
"loss": 1.9932, |
|
"step": 779 |
|
}, |
|
{ |
|
"epoch": 0.5419018671298307, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.00017814152606396375, |
|
"loss": 1.9009, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.5425966131133304, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.0001780911609166457, |
|
"loss": 1.734, |
|
"step": 781 |
|
}, |
|
{ |
|
"epoch": 0.5432913590968302, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.00017804079576932763, |
|
"loss": 2.1049, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 0.54398610508033, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.00017799043062200956, |
|
"loss": 1.6874, |
|
"step": 783 |
|
}, |
|
{ |
|
"epoch": 0.5446808510638298, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00017794006547469153, |
|
"loss": 1.9051, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 0.5453755970473295, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.00017788970032737346, |
|
"loss": 1.8446, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.5460703430308294, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.0001778393351800554, |
|
"loss": 1.9275, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 0.5467650890143292, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.00017778897003273736, |
|
"loss": 2.1717, |
|
"step": 787 |
|
}, |
|
{ |
|
"epoch": 0.5474598349978289, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.0001777386048854193, |
|
"loss": 2.5151, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 0.5481545809813287, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.00017768823973810123, |
|
"loss": 1.7945, |
|
"step": 789 |
|
}, |
|
{ |
|
"epoch": 0.5488493269648285, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.00017763787459078317, |
|
"loss": 1.9144, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.5495440729483283, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.00017758750944346513, |
|
"loss": 1.7331, |
|
"step": 791 |
|
}, |
|
{ |
|
"epoch": 0.550238818931828, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.00017753714429614707, |
|
"loss": 1.8174, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 0.5509335649153279, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.00017748677914882903, |
|
"loss": 2.1623, |
|
"step": 793 |
|
}, |
|
{ |
|
"epoch": 0.5516283108988276, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.00017743641400151097, |
|
"loss": 2.0982, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 0.5523230568823274, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.0001773860488541929, |
|
"loss": 1.9725, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.5530178028658271, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.00017733568370687484, |
|
"loss": 1.9904, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 0.553712548849327, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.00017728531855955678, |
|
"loss": 2.066, |
|
"step": 797 |
|
}, |
|
{ |
|
"epoch": 0.5544072948328268, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00017723495341223874, |
|
"loss": 1.9711, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 0.5551020408163265, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.00017718458826492068, |
|
"loss": 1.64, |
|
"step": 799 |
|
}, |
|
{ |
|
"epoch": 0.5557967867998264, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 0.00017713422311760264, |
|
"loss": 1.3968, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5564915327833261, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.00017708385797028458, |
|
"loss": 2.1326, |
|
"step": 801 |
|
}, |
|
{ |
|
"epoch": 0.5571862787668259, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.00017703349282296652, |
|
"loss": 1.8806, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 0.5578810247503256, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.00017698312767564845, |
|
"loss": 2.022, |
|
"step": 803 |
|
}, |
|
{ |
|
"epoch": 0.5585757707338255, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.0001769327625283304, |
|
"loss": 1.79, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 0.5592705167173252, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 0.00017688239738101235, |
|
"loss": 2.2328, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.559965262700825, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.0001768320322336943, |
|
"loss": 2.0766, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 0.5606600086843248, |
|
"grad_norm": 1.5234375, |
|
"learning_rate": 0.00017678166708637625, |
|
"loss": 2.4812, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 0.5613547546678246, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.0001767313019390582, |
|
"loss": 1.9864, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 0.5620495006513243, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00017668093679174012, |
|
"loss": 2.1901, |
|
"step": 809 |
|
}, |
|
{ |
|
"epoch": 0.5627442466348241, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.00017663057164442206, |
|
"loss": 1.9752, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.563438992618324, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.000176580206497104, |
|
"loss": 1.5796, |
|
"step": 811 |
|
}, |
|
{ |
|
"epoch": 0.5641337386018237, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.00017652984134978596, |
|
"loss": 2.244, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 0.5648284845853235, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.0001764794762024679, |
|
"loss": 1.7504, |
|
"step": 813 |
|
}, |
|
{ |
|
"epoch": 0.5655232305688233, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.00017642911105514986, |
|
"loss": 2.24, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 0.5662179765523231, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.0001763787459078318, |
|
"loss": 2.1324, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.5669127225358228, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.00017632838076051373, |
|
"loss": 1.9129, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 0.5676074685193226, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.00017627801561319567, |
|
"loss": 1.8645, |
|
"step": 817 |
|
}, |
|
{ |
|
"epoch": 0.5683022145028224, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.0001762276504658776, |
|
"loss": 1.8864, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 0.5689969604863222, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00017617728531855957, |
|
"loss": 1.9661, |
|
"step": 819 |
|
}, |
|
{ |
|
"epoch": 0.5696917064698219, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 0.0001761269201712415, |
|
"loss": 2.0306, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.5703864524533218, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.00017607655502392347, |
|
"loss": 1.8562, |
|
"step": 821 |
|
}, |
|
{ |
|
"epoch": 0.5710811984368216, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.0001760261898766054, |
|
"loss": 2.1271, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 0.5717759444203213, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.00017597582472928734, |
|
"loss": 2.0361, |
|
"step": 823 |
|
}, |
|
{ |
|
"epoch": 0.5724706904038211, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.00017592545958196928, |
|
"loss": 1.5519, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 0.5731654363873209, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.0001758750944346512, |
|
"loss": 2.0971, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.5738601823708207, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.00017582472928733318, |
|
"loss": 1.9318, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 0.5745549283543204, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.0001757743641400151, |
|
"loss": 2.0558, |
|
"step": 827 |
|
}, |
|
{ |
|
"epoch": 0.5752496743378203, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.00017572399899269708, |
|
"loss": 1.7626, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 0.57594442032132, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.000175673633845379, |
|
"loss": 2.0998, |
|
"step": 829 |
|
}, |
|
{ |
|
"epoch": 0.5766391663048198, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.00017562326869806095, |
|
"loss": 1.6269, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.5773339122883195, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.00017557290355074288, |
|
"loss": 2.3553, |
|
"step": 831 |
|
}, |
|
{ |
|
"epoch": 0.5780286582718194, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.00017552253840342482, |
|
"loss": 1.9309, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 0.5787234042553191, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.00017547217325610678, |
|
"loss": 1.7086, |
|
"step": 833 |
|
}, |
|
{ |
|
"epoch": 0.5794181502388189, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.00017542180810878872, |
|
"loss": 1.8288, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 0.5801128962223188, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00017537144296147068, |
|
"loss": 1.6589, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.5808076422058185, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 0.00017532107781415262, |
|
"loss": 1.7197, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 0.5815023881893183, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.00017527071266683456, |
|
"loss": 2.2181, |
|
"step": 837 |
|
}, |
|
{ |
|
"epoch": 0.582197134172818, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 0.0001752203475195165, |
|
"loss": 2.1823, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 0.5828918801563179, |
|
"grad_norm": 0.6328125, |
|
"learning_rate": 0.00017516998237219843, |
|
"loss": 1.3199, |
|
"step": 839 |
|
}, |
|
{ |
|
"epoch": 0.5835866261398176, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.0001751196172248804, |
|
"loss": 1.8895, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.5842813721233174, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.00017506925207756233, |
|
"loss": 2.1302, |
|
"step": 841 |
|
}, |
|
{ |
|
"epoch": 0.5849761181068172, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.0001750188869302443, |
|
"loss": 1.9623, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 0.585670864090317, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.00017496852178292623, |
|
"loss": 1.61, |
|
"step": 843 |
|
}, |
|
{ |
|
"epoch": 0.5863656100738167, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 0.00017491815663560817, |
|
"loss": 1.8907, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 0.5870603560573165, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.0001748677914882901, |
|
"loss": 1.7086, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.5877551020408164, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.00017481742634097204, |
|
"loss": 1.718, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 0.5884498480243161, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 0.000174767061193654, |
|
"loss": 2.0364, |
|
"step": 847 |
|
}, |
|
{ |
|
"epoch": 0.5891445940078159, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.00017471669604633594, |
|
"loss": 2.1759, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 0.5898393399913157, |
|
"grad_norm": 1.5, |
|
"learning_rate": 0.0001746663308990179, |
|
"loss": 2.1323, |
|
"step": 849 |
|
}, |
|
{ |
|
"epoch": 0.5905340859748155, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.00017461596575169984, |
|
"loss": 1.9511, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.5912288319583152, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.00017456560060438177, |
|
"loss": 2.0314, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 0.591923577941815, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 0.0001745152354570637, |
|
"loss": 1.5775, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 0.5926183239253148, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.00017446487030974565, |
|
"loss": 2.047, |
|
"step": 853 |
|
}, |
|
{ |
|
"epoch": 0.5933130699088146, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.0001744145051624276, |
|
"loss": 2.1235, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 0.5940078158923143, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.00017436414001510955, |
|
"loss": 1.7731, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.5947025618758142, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.0001743137748677915, |
|
"loss": 1.9088, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 0.5953973078593139, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.00017426340972047345, |
|
"loss": 1.6199, |
|
"step": 857 |
|
}, |
|
{ |
|
"epoch": 0.5960920538428137, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 0.00017421304457315538, |
|
"loss": 1.6654, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 0.5967867998263136, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.00017416267942583732, |
|
"loss": 2.0971, |
|
"step": 859 |
|
}, |
|
{ |
|
"epoch": 0.5974815458098133, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.00017411231427851928, |
|
"loss": 1.8046, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.5981762917933131, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.00017406194913120122, |
|
"loss": 2.2793, |
|
"step": 861 |
|
}, |
|
{ |
|
"epoch": 0.5988710377768128, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.00017401158398388315, |
|
"loss": 1.8008, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 0.5995657837603127, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.00017396121883656512, |
|
"loss": 2.0149, |
|
"step": 863 |
|
}, |
|
{ |
|
"epoch": 0.6002605297438124, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00017391085368924705, |
|
"loss": 2.0395, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 0.6009552757273122, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.000173860488541929, |
|
"loss": 2.0035, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.6016500217108119, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.00017381012339461093, |
|
"loss": 1.856, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 0.6023447676943118, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 0.0001737597582472929, |
|
"loss": 1.8616, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 0.6030395136778115, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.00017370939309997483, |
|
"loss": 1.829, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 0.6037342596613113, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.00017365902795265676, |
|
"loss": 2.1977, |
|
"step": 869 |
|
}, |
|
{ |
|
"epoch": 0.6044290056448112, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 0.00017360866280533873, |
|
"loss": 2.1831, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.6051237516283109, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.00017355829765802066, |
|
"loss": 1.5552, |
|
"step": 871 |
|
}, |
|
{ |
|
"epoch": 0.6058184976118107, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 0.0001735079325107026, |
|
"loss": 1.977, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 0.6065132435953104, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.00017345756736338453, |
|
"loss": 1.9865, |
|
"step": 873 |
|
}, |
|
{ |
|
"epoch": 0.6072079895788103, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.0001734072022160665, |
|
"loss": 1.9972, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 0.60790273556231, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.00017335683706874843, |
|
"loss": 1.7239, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.6085974815458098, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 0.00017330647192143037, |
|
"loss": 1.948, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 0.6092922275293096, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.00017325610677411233, |
|
"loss": 1.8617, |
|
"step": 877 |
|
}, |
|
{ |
|
"epoch": 0.6099869735128094, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 0.00017320574162679427, |
|
"loss": 2.1794, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 0.6106817194963091, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.0001731553764794762, |
|
"loss": 1.9517, |
|
"step": 879 |
|
}, |
|
{ |
|
"epoch": 0.611376465479809, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.00017310501133215814, |
|
"loss": 1.8475, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.6120712114633087, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 0.0001730546461848401, |
|
"loss": 2.4872, |
|
"step": 881 |
|
}, |
|
{ |
|
"epoch": 0.6127659574468085, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.00017300428103752204, |
|
"loss": 1.7795, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 0.6134607034303083, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.00017295391589020398, |
|
"loss": 1.8401, |
|
"step": 883 |
|
}, |
|
{ |
|
"epoch": 0.6141554494138081, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.00017290355074288594, |
|
"loss": 1.6861, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 0.6148501953973079, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.00017285318559556788, |
|
"loss": 2.1298, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.6155449413808076, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.00017280282044824982, |
|
"loss": 1.9718, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 0.6162396873643075, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.00017275245530093175, |
|
"loss": 2.0941, |
|
"step": 887 |
|
}, |
|
{ |
|
"epoch": 0.6169344333478072, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.00017270209015361371, |
|
"loss": 1.972, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 0.617629179331307, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.00017265172500629565, |
|
"loss": 1.6474, |
|
"step": 889 |
|
}, |
|
{ |
|
"epoch": 0.6183239253148067, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0001726013598589776, |
|
"loss": 1.9091, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.6190186712983066, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.00017255099471165955, |
|
"loss": 1.6085, |
|
"step": 891 |
|
}, |
|
{ |
|
"epoch": 0.6197134172818063, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0001725006295643415, |
|
"loss": 1.8403, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 0.6204081632653061, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.00017245026441702342, |
|
"loss": 1.6317, |
|
"step": 893 |
|
}, |
|
{ |
|
"epoch": 0.621102909248806, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.00017239989926970536, |
|
"loss": 1.6257, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 0.6217976552323057, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 0.00017234953412238732, |
|
"loss": 2.384, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.6224924012158055, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00017229916897506926, |
|
"loss": 1.8285, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 0.6231871471993052, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.00017224880382775122, |
|
"loss": 1.9576, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 0.6238818931828051, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.00017219843868043316, |
|
"loss": 1.8259, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 0.6245766391663048, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.0001721480735331151, |
|
"loss": 2.2432, |
|
"step": 899 |
|
}, |
|
{ |
|
"epoch": 0.6252713851498046, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.00017209770838579703, |
|
"loss": 1.6558, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6259661311333043, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.00017204734323847897, |
|
"loss": 2.0904, |
|
"step": 901 |
|
}, |
|
{ |
|
"epoch": 0.6266608771168042, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 0.00017199697809116093, |
|
"loss": 1.8032, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 0.6273556231003039, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00017194661294384287, |
|
"loss": 1.7511, |
|
"step": 903 |
|
}, |
|
{ |
|
"epoch": 0.6280503690838037, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.00017189624779652483, |
|
"loss": 1.8939, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 0.6287451150673035, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00017184588264920677, |
|
"loss": 1.9086, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.6294398610508033, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.0001717955175018887, |
|
"loss": 1.6633, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 0.6301346070343031, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.00017174515235457064, |
|
"loss": 1.9224, |
|
"step": 907 |
|
}, |
|
{ |
|
"epoch": 0.6308293530178029, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.00017169478720725258, |
|
"loss": 2.0203, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 0.6315240990013027, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00017164442205993454, |
|
"loss": 1.9564, |
|
"step": 909 |
|
}, |
|
{ |
|
"epoch": 0.6322188449848024, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.00017159405691261648, |
|
"loss": 1.8416, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.6329135909683022, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.00017154369176529844, |
|
"loss": 1.9112, |
|
"step": 911 |
|
}, |
|
{ |
|
"epoch": 0.633608336951802, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00017149332661798038, |
|
"loss": 1.9129, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 0.6343030829353018, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 0.0001714429614706623, |
|
"loss": 1.7555, |
|
"step": 913 |
|
}, |
|
{ |
|
"epoch": 0.6349978289188015, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 0.00017139259632334425, |
|
"loss": 1.7537, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 0.6356925749023014, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.00017134223117602618, |
|
"loss": 2.1289, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.6363873208858011, |
|
"grad_norm": 1.2734375, |
|
"learning_rate": 0.00017129186602870815, |
|
"loss": 2.0833, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 0.6370820668693009, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 0.00017124150088139008, |
|
"loss": 2.0708, |
|
"step": 917 |
|
}, |
|
{ |
|
"epoch": 0.6377768128528007, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00017119113573407205, |
|
"loss": 1.9677, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 0.6384715588363005, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.00017114077058675398, |
|
"loss": 1.7507, |
|
"step": 919 |
|
}, |
|
{ |
|
"epoch": 0.6391663048198003, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 0.00017109040543943592, |
|
"loss": 2.2454, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.6398610508033, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.00017104004029211786, |
|
"loss": 2.0876, |
|
"step": 921 |
|
}, |
|
{ |
|
"epoch": 0.6405557967867999, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 0.0001709896751447998, |
|
"loss": 1.7415, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 0.6412505427702996, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 0.00017093930999748176, |
|
"loss": 2.4147, |
|
"step": 923 |
|
}, |
|
{ |
|
"epoch": 0.6419452887537994, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.0001708889448501637, |
|
"loss": 1.6442, |
|
"step": 924 |
|
}, |
|
{ |
|
"epoch": 0.6426400347372991, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.00017083857970284566, |
|
"loss": 1.9067, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.643334780720799, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.0001707882145555276, |
|
"loss": 2.1339, |
|
"step": 926 |
|
}, |
|
{ |
|
"epoch": 0.6440295267042987, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.00017073784940820953, |
|
"loss": 1.7962, |
|
"step": 927 |
|
}, |
|
{ |
|
"epoch": 0.6447242726877985, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 0.00017068748426089147, |
|
"loss": 2.1669, |
|
"step": 928 |
|
}, |
|
{ |
|
"epoch": 0.6454190186712984, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.0001706371191135734, |
|
"loss": 1.9113, |
|
"step": 929 |
|
}, |
|
{ |
|
"epoch": 0.6461137646547981, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.00017058675396625536, |
|
"loss": 1.9345, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.6468085106382979, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 0.0001705363888189373, |
|
"loss": 2.1483, |
|
"step": 931 |
|
}, |
|
{ |
|
"epoch": 0.6475032566217976, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.00017048602367161926, |
|
"loss": 2.1292, |
|
"step": 932 |
|
}, |
|
{ |
|
"epoch": 0.6481980026052975, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.0001704356585243012, |
|
"loss": 1.855, |
|
"step": 933 |
|
}, |
|
{ |
|
"epoch": 0.6488927485887972, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.00017038529337698314, |
|
"loss": 1.9374, |
|
"step": 934 |
|
}, |
|
{ |
|
"epoch": 0.649587494572297, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 0.00017033492822966507, |
|
"loss": 1.9404, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.6502822405557968, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.000170284563082347, |
|
"loss": 1.6083, |
|
"step": 936 |
|
}, |
|
{ |
|
"epoch": 0.6509769865392966, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.00017023419793502897, |
|
"loss": 1.8623, |
|
"step": 937 |
|
}, |
|
{ |
|
"epoch": 0.6516717325227963, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.0001701838327877109, |
|
"loss": 2.0822, |
|
"step": 938 |
|
}, |
|
{ |
|
"epoch": 0.6523664785062961, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00017013346764039287, |
|
"loss": 1.6943, |
|
"step": 939 |
|
}, |
|
{ |
|
"epoch": 0.6530612244897959, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.0001700831024930748, |
|
"loss": 1.745, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.6537559704732957, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 0.00017003273734575675, |
|
"loss": 1.7084, |
|
"step": 941 |
|
}, |
|
{ |
|
"epoch": 0.6544507164567955, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.00016998237219843868, |
|
"loss": 1.6061, |
|
"step": 942 |
|
}, |
|
{ |
|
"epoch": 0.6551454624402953, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.00016993200705112062, |
|
"loss": 2.2639, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 0.6558402084237951, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.00016988164190380258, |
|
"loss": 1.9709, |
|
"step": 944 |
|
}, |
|
{ |
|
"epoch": 0.6565349544072948, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.00016983127675648452, |
|
"loss": 1.9258, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.6572297003907946, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.00016978091160916648, |
|
"loss": 2.153, |
|
"step": 946 |
|
}, |
|
{ |
|
"epoch": 0.6579244463742944, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.00016973054646184842, |
|
"loss": 1.7945, |
|
"step": 947 |
|
}, |
|
{ |
|
"epoch": 0.6586191923577942, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 0.00016968018131453035, |
|
"loss": 1.9769, |
|
"step": 948 |
|
}, |
|
{ |
|
"epoch": 0.6593139383412939, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.0001696298161672123, |
|
"loss": 1.9792, |
|
"step": 949 |
|
}, |
|
{ |
|
"epoch": 0.6600086843247938, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.00016957945101989423, |
|
"loss": 1.9845, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.6607034303082935, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.0001695290858725762, |
|
"loss": 1.9359, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 0.6613981762917933, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 0.00016947872072525813, |
|
"loss": 1.9572, |
|
"step": 952 |
|
}, |
|
{ |
|
"epoch": 0.6620929222752931, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.0001694283555779401, |
|
"loss": 1.3144, |
|
"step": 953 |
|
}, |
|
{ |
|
"epoch": 0.6627876682587929, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.00016937799043062203, |
|
"loss": 1.4918, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 0.6634824142422927, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.00016932762528330396, |
|
"loss": 2.0, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.6641771602257924, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.0001692772601359859, |
|
"loss": 1.7969, |
|
"step": 956 |
|
}, |
|
{ |
|
"epoch": 0.6648719062092923, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.00016922689498866783, |
|
"loss": 1.7975, |
|
"step": 957 |
|
}, |
|
{ |
|
"epoch": 0.665566652192792, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0001691765298413498, |
|
"loss": 1.5022, |
|
"step": 958 |
|
}, |
|
{ |
|
"epoch": 0.6662613981762918, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 0.00016912616469403173, |
|
"loss": 1.7859, |
|
"step": 959 |
|
}, |
|
{ |
|
"epoch": 0.6669561441597915, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 0.0001690757995467137, |
|
"loss": 2.1235, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.6676508901432914, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.00016902543439939563, |
|
"loss": 1.8601, |
|
"step": 961 |
|
}, |
|
{ |
|
"epoch": 0.6683456361267911, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.00016897506925207757, |
|
"loss": 2.0707, |
|
"step": 962 |
|
}, |
|
{ |
|
"epoch": 0.6690403821102909, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.0001689247041047595, |
|
"loss": 1.9595, |
|
"step": 963 |
|
}, |
|
{ |
|
"epoch": 0.6697351280937907, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 0.00016887433895744147, |
|
"loss": 2.1069, |
|
"step": 964 |
|
}, |
|
{ |
|
"epoch": 0.6704298740772905, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 0.0001688239738101234, |
|
"loss": 1.6447, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.6711246200607903, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.00016877360866280534, |
|
"loss": 1.8459, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 0.67181936604429, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 0.0001687232435154873, |
|
"loss": 1.9345, |
|
"step": 967 |
|
}, |
|
{ |
|
"epoch": 0.6725141120277899, |
|
"grad_norm": 1.625, |
|
"learning_rate": 0.00016867287836816924, |
|
"loss": 1.8392, |
|
"step": 968 |
|
}, |
|
{ |
|
"epoch": 0.6732088580112896, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.00016862251322085118, |
|
"loss": 1.8335, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 0.6739036039947894, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.00016857214807353312, |
|
"loss": 1.8878, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.6745983499782892, |
|
"grad_norm": 3.390625, |
|
"learning_rate": 0.00016852178292621508, |
|
"loss": 2.0614, |
|
"step": 971 |
|
}, |
|
{ |
|
"epoch": 0.675293095961789, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00016847141777889701, |
|
"loss": 1.7292, |
|
"step": 972 |
|
}, |
|
{ |
|
"epoch": 0.6759878419452887, |
|
"grad_norm": 1.34375, |
|
"learning_rate": 0.00016842105263157895, |
|
"loss": 2.096, |
|
"step": 973 |
|
}, |
|
{ |
|
"epoch": 0.6766825879287885, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.00016837068748426091, |
|
"loss": 1.7483, |
|
"step": 974 |
|
}, |
|
{ |
|
"epoch": 0.6773773339122883, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.00016832032233694282, |
|
"loss": 1.8725, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.6780720798957881, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.0001682699571896248, |
|
"loss": 2.1252, |
|
"step": 976 |
|
}, |
|
{ |
|
"epoch": 0.6787668258792879, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.00016821959204230672, |
|
"loss": 1.6795, |
|
"step": 977 |
|
}, |
|
{ |
|
"epoch": 0.6794615718627877, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.0001681692268949887, |
|
"loss": 1.7347, |
|
"step": 978 |
|
}, |
|
{ |
|
"epoch": 0.6801563178462875, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.00016811886174767062, |
|
"loss": 2.0712, |
|
"step": 979 |
|
}, |
|
{ |
|
"epoch": 0.6808510638297872, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.00016806849660035256, |
|
"loss": 1.7589, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.681545809813287, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.00016801813145303452, |
|
"loss": 2.2129, |
|
"step": 981 |
|
}, |
|
{ |
|
"epoch": 0.6822405557967868, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.00016796776630571643, |
|
"loss": 2.141, |
|
"step": 982 |
|
}, |
|
{ |
|
"epoch": 0.6829353017802866, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.0001679174011583984, |
|
"loss": 1.8133, |
|
"step": 983 |
|
}, |
|
{ |
|
"epoch": 0.6836300477637863, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.00016786703601108033, |
|
"loss": 1.7274, |
|
"step": 984 |
|
}, |
|
{ |
|
"epoch": 0.6843247937472862, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0001678166708637623, |
|
"loss": 1.7442, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.6850195397307859, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.00016776630571644423, |
|
"loss": 1.7292, |
|
"step": 986 |
|
}, |
|
{ |
|
"epoch": 0.6857142857142857, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.00016771594056912617, |
|
"loss": 1.8515, |
|
"step": 987 |
|
}, |
|
{ |
|
"epoch": 0.6864090316977854, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00016766557542180813, |
|
"loss": 1.3847, |
|
"step": 988 |
|
}, |
|
{ |
|
"epoch": 0.6871037776812853, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.00016761521027449004, |
|
"loss": 1.9493, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 0.6877985236647851, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.000167564845127172, |
|
"loss": 1.8116, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.6884932696482848, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.00016751447997985394, |
|
"loss": 1.5132, |
|
"step": 991 |
|
}, |
|
{ |
|
"epoch": 0.6891880156317847, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.0001674641148325359, |
|
"loss": 2.2143, |
|
"step": 992 |
|
}, |
|
{ |
|
"epoch": 0.6898827616152844, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 0.00016741374968521784, |
|
"loss": 1.8619, |
|
"step": 993 |
|
}, |
|
{ |
|
"epoch": 0.6905775075987842, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 0.0001673633845378998, |
|
"loss": 1.8431, |
|
"step": 994 |
|
}, |
|
{ |
|
"epoch": 0.691272253582284, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.00016731301939058174, |
|
"loss": 1.942, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.6919669995657838, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 0.00016726265424326365, |
|
"loss": 2.0124, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 0.6926617455492835, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 0.0001672122890959456, |
|
"loss": 1.9465, |
|
"step": 997 |
|
}, |
|
{ |
|
"epoch": 0.6933564915327833, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 0.00016716192394862755, |
|
"loss": 1.6335, |
|
"step": 998 |
|
}, |
|
{ |
|
"epoch": 0.6940512375162831, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.0001671115588013095, |
|
"loss": 1.8172, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 0.6947459834997829, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.00016706119365399145, |
|
"loss": 1.8174, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6954407294832827, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.0001670108285066734, |
|
"loss": 1.8297, |
|
"step": 1001 |
|
}, |
|
{ |
|
"epoch": 0.6961354754667824, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.00016696046335935535, |
|
"loss": 1.9633, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 0.6968302214502823, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.00016691009821203726, |
|
"loss": 2.1313, |
|
"step": 1003 |
|
}, |
|
{ |
|
"epoch": 0.697524967433782, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.00016685973306471922, |
|
"loss": 1.6867, |
|
"step": 1004 |
|
}, |
|
{ |
|
"epoch": 0.6982197134172818, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.00016680936791740116, |
|
"loss": 1.4534, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.6989144594007816, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.00016675900277008312, |
|
"loss": 2.0626, |
|
"step": 1006 |
|
}, |
|
{ |
|
"epoch": 0.6996092053842814, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 0.00016670863762276506, |
|
"loss": 2.0868, |
|
"step": 1007 |
|
}, |
|
{ |
|
"epoch": 0.7003039513677811, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 0.00016665827247544702, |
|
"loss": 1.6758, |
|
"step": 1008 |
|
}, |
|
{ |
|
"epoch": 0.700998697351281, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 0.00016660790732812893, |
|
"loss": 2.0535, |
|
"step": 1009 |
|
}, |
|
{ |
|
"epoch": 0.7016934433347807, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.00016655754218081087, |
|
"loss": 1.7197, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.7023881893182805, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 0.00016650717703349283, |
|
"loss": 2.23, |
|
"step": 1011 |
|
}, |
|
{ |
|
"epoch": 0.7030829353017802, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.00016645681188617477, |
|
"loss": 1.4958, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 0.7037776812852801, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.00016640644673885673, |
|
"loss": 1.569, |
|
"step": 1013 |
|
}, |
|
{ |
|
"epoch": 0.7044724272687799, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.00016635608159153866, |
|
"loss": 1.9083, |
|
"step": 1014 |
|
}, |
|
{ |
|
"epoch": 0.7051671732522796, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.00016630571644422063, |
|
"loss": 2.0236, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.7058619192357795, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.00016625535129690254, |
|
"loss": 1.8555, |
|
"step": 1016 |
|
}, |
|
{ |
|
"epoch": 0.7065566652192792, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.00016620498614958447, |
|
"loss": 2.0917, |
|
"step": 1017 |
|
}, |
|
{ |
|
"epoch": 0.707251411202779, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00016615462100226644, |
|
"loss": 1.9695, |
|
"step": 1018 |
|
}, |
|
{ |
|
"epoch": 0.7079461571862787, |
|
"grad_norm": 0.486328125, |
|
"learning_rate": 0.00016610425585494837, |
|
"loss": 1.951, |
|
"step": 1019 |
|
}, |
|
{ |
|
"epoch": 0.7086409031697786, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00016605389070763034, |
|
"loss": 2.0722, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.7093356491532783, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.00016600352556031227, |
|
"loss": 1.7612, |
|
"step": 1021 |
|
}, |
|
{ |
|
"epoch": 0.7100303951367781, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.00016595316041299424, |
|
"loss": 1.5708, |
|
"step": 1022 |
|
}, |
|
{ |
|
"epoch": 0.7107251411202778, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 0.00016590279526567615, |
|
"loss": 2.0463, |
|
"step": 1023 |
|
}, |
|
{ |
|
"epoch": 0.7114198871037777, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.00016585243011835808, |
|
"loss": 1.9702, |
|
"step": 1024 |
|
}, |
|
{ |
|
"epoch": 0.7121146330872775, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.00016580206497104005, |
|
"loss": 1.6529, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.7128093790707772, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.00016575169982372198, |
|
"loss": 1.8015, |
|
"step": 1026 |
|
}, |
|
{ |
|
"epoch": 0.7135041250542771, |
|
"grad_norm": 0.78125, |
|
"learning_rate": 0.00016570133467640395, |
|
"loss": 2.2328, |
|
"step": 1027 |
|
}, |
|
{ |
|
"epoch": 0.7141988710377768, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 0.00016565096952908588, |
|
"loss": 1.9973, |
|
"step": 1028 |
|
}, |
|
{ |
|
"epoch": 0.7148936170212766, |
|
"grad_norm": 1.5703125, |
|
"learning_rate": 0.00016560060438176784, |
|
"loss": 2.0212, |
|
"step": 1029 |
|
}, |
|
{ |
|
"epoch": 0.7155883630047764, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.00016555023923444975, |
|
"loss": 1.986, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.7162831089882762, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.00016549987408713172, |
|
"loss": 2.0747, |
|
"step": 1031 |
|
}, |
|
{ |
|
"epoch": 0.7169778549717759, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.00016544950893981365, |
|
"loss": 1.8655, |
|
"step": 1032 |
|
}, |
|
{ |
|
"epoch": 0.7176726009552757, |
|
"grad_norm": 1.7109375, |
|
"learning_rate": 0.0001653991437924956, |
|
"loss": 2.2013, |
|
"step": 1033 |
|
}, |
|
{ |
|
"epoch": 0.7183673469387755, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.00016534877864517755, |
|
"loss": 2.1917, |
|
"step": 1034 |
|
}, |
|
{ |
|
"epoch": 0.7190620929222753, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.0001652984134978595, |
|
"loss": 1.8524, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.7197568389057751, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.00016524804835054145, |
|
"loss": 2.0421, |
|
"step": 1036 |
|
}, |
|
{ |
|
"epoch": 0.7204515848892749, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.00016519768320322336, |
|
"loss": 1.6317, |
|
"step": 1037 |
|
}, |
|
{ |
|
"epoch": 0.7211463308727747, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 0.00016514731805590533, |
|
"loss": 1.6442, |
|
"step": 1038 |
|
}, |
|
{ |
|
"epoch": 0.7218410768562744, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 0.00016509695290858726, |
|
"loss": 1.8236, |
|
"step": 1039 |
|
}, |
|
{ |
|
"epoch": 0.7225358228397742, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.0001650465877612692, |
|
"loss": 1.6599, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.723230568823274, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.00016499622261395116, |
|
"loss": 2.0284, |
|
"step": 1041 |
|
}, |
|
{ |
|
"epoch": 0.7239253148067738, |
|
"grad_norm": 3.515625, |
|
"learning_rate": 0.0001649458574666331, |
|
"loss": 1.6601, |
|
"step": 1042 |
|
}, |
|
{ |
|
"epoch": 0.7246200607902735, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00016489549231931503, |
|
"loss": 1.3995, |
|
"step": 1043 |
|
}, |
|
{ |
|
"epoch": 0.7253148067737734, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 0.00016484512717199697, |
|
"loss": 1.9502, |
|
"step": 1044 |
|
}, |
|
{ |
|
"epoch": 0.7260095527572731, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.00016479476202467893, |
|
"loss": 1.7405, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.7267042987407729, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.00016474439687736087, |
|
"loss": 1.6638, |
|
"step": 1046 |
|
}, |
|
{ |
|
"epoch": 0.7273990447242726, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 0.0001646940317300428, |
|
"loss": 1.9068, |
|
"step": 1047 |
|
}, |
|
{ |
|
"epoch": 0.7280937907077725, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.00016464366658272477, |
|
"loss": 1.9891, |
|
"step": 1048 |
|
}, |
|
{ |
|
"epoch": 0.7287885366912723, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.0001645933014354067, |
|
"loss": 1.6797, |
|
"step": 1049 |
|
}, |
|
{ |
|
"epoch": 0.729483282674772, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 0.00016454293628808864, |
|
"loss": 2.1491, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.7301780286582719, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.00016449257114077058, |
|
"loss": 1.6768, |
|
"step": 1051 |
|
}, |
|
{ |
|
"epoch": 0.7308727746417716, |
|
"grad_norm": 1.21875, |
|
"learning_rate": 0.00016444220599345254, |
|
"loss": 1.6134, |
|
"step": 1052 |
|
}, |
|
{ |
|
"epoch": 0.7315675206252714, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 0.00016439184084613448, |
|
"loss": 1.4917, |
|
"step": 1053 |
|
}, |
|
{ |
|
"epoch": 0.7322622666087711, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.00016434147569881642, |
|
"loss": 1.9061, |
|
"step": 1054 |
|
}, |
|
{ |
|
"epoch": 0.732957012592271, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00016429111055149838, |
|
"loss": 2.0705, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.7336517585757707, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 0.00016424074540418031, |
|
"loss": 1.7986, |
|
"step": 1056 |
|
}, |
|
{ |
|
"epoch": 0.7343465045592705, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.00016419038025686225, |
|
"loss": 1.5941, |
|
"step": 1057 |
|
}, |
|
{ |
|
"epoch": 0.7350412505427703, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.0001641400151095442, |
|
"loss": 2.0374, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 0.7357359965262701, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 0.00016408964996222615, |
|
"loss": 1.8155, |
|
"step": 1059 |
|
}, |
|
{ |
|
"epoch": 0.7364307425097699, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 0.0001640392848149081, |
|
"loss": 1.7869, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.7371254884932696, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.00016398891966759005, |
|
"loss": 2.0092, |
|
"step": 1061 |
|
}, |
|
{ |
|
"epoch": 0.7378202344767695, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.000163938554520272, |
|
"loss": 1.8563, |
|
"step": 1062 |
|
}, |
|
{ |
|
"epoch": 0.7385149804602692, |
|
"grad_norm": 3.453125, |
|
"learning_rate": 0.00016388818937295392, |
|
"loss": 1.93, |
|
"step": 1063 |
|
}, |
|
{ |
|
"epoch": 0.739209726443769, |
|
"grad_norm": 1.234375, |
|
"learning_rate": 0.00016383782422563586, |
|
"loss": 1.957, |
|
"step": 1064 |
|
}, |
|
{ |
|
"epoch": 0.7399044724272688, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.0001637874590783178, |
|
"loss": 1.6903, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.7405992184107686, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.00016373709393099976, |
|
"loss": 1.7496, |
|
"step": 1066 |
|
}, |
|
{ |
|
"epoch": 0.7412939643942683, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.0001636867287836817, |
|
"loss": 2.2642, |
|
"step": 1067 |
|
}, |
|
{ |
|
"epoch": 0.7419887103777681, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.00016363636363636366, |
|
"loss": 1.8255, |
|
"step": 1068 |
|
}, |
|
{ |
|
"epoch": 0.7426834563612679, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.0001635859984890456, |
|
"loss": 2.0617, |
|
"step": 1069 |
|
}, |
|
{ |
|
"epoch": 0.7433782023447677, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00016353563334172753, |
|
"loss": 2.2496, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.7440729483282674, |
|
"grad_norm": 1.5390625, |
|
"learning_rate": 0.00016348526819440947, |
|
"loss": 2.2341, |
|
"step": 1071 |
|
}, |
|
{ |
|
"epoch": 0.7447676943117673, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.0001634349030470914, |
|
"loss": 2.0895, |
|
"step": 1072 |
|
}, |
|
{ |
|
"epoch": 0.7454624402952671, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.00016338453789977337, |
|
"loss": 1.3802, |
|
"step": 1073 |
|
}, |
|
{ |
|
"epoch": 0.7461571862787668, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.0001633341727524553, |
|
"loss": 1.874, |
|
"step": 1074 |
|
}, |
|
{ |
|
"epoch": 0.7468519322622666, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.00016328380760513727, |
|
"loss": 2.2254, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.7475466782457664, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.0001632334424578192, |
|
"loss": 1.6083, |
|
"step": 1076 |
|
}, |
|
{ |
|
"epoch": 0.7482414242292662, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.00016318307731050114, |
|
"loss": 2.1731, |
|
"step": 1077 |
|
}, |
|
{ |
|
"epoch": 0.7489361702127659, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 0.00016313271216318308, |
|
"loss": 1.7249, |
|
"step": 1078 |
|
}, |
|
{ |
|
"epoch": 0.7496309161962658, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.000163082347015865, |
|
"loss": 1.9163, |
|
"step": 1079 |
|
}, |
|
{ |
|
"epoch": 0.7503256621797655, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.00016303198186854698, |
|
"loss": 1.8562, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.7510204081632653, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.0001629816167212289, |
|
"loss": 1.7651, |
|
"step": 1081 |
|
}, |
|
{ |
|
"epoch": 0.751715154146765, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.00016293125157391088, |
|
"loss": 2.4086, |
|
"step": 1082 |
|
}, |
|
{ |
|
"epoch": 0.7524099001302649, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.0001628808864265928, |
|
"loss": 1.6701, |
|
"step": 1083 |
|
}, |
|
{ |
|
"epoch": 0.7531046461137647, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.00016283052127927475, |
|
"loss": 1.7093, |
|
"step": 1084 |
|
}, |
|
{ |
|
"epoch": 0.7537993920972644, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 0.00016278015613195668, |
|
"loss": 1.8675, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.7544941380807643, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.00016272979098463862, |
|
"loss": 1.8314, |
|
"step": 1086 |
|
}, |
|
{ |
|
"epoch": 0.755188884064264, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.00016267942583732058, |
|
"loss": 1.9986, |
|
"step": 1087 |
|
}, |
|
{ |
|
"epoch": 0.7558836300477638, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 0.00016262906069000252, |
|
"loss": 2.0108, |
|
"step": 1088 |
|
}, |
|
{ |
|
"epoch": 0.7565783760312635, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.00016257869554268448, |
|
"loss": 1.6761, |
|
"step": 1089 |
|
}, |
|
{ |
|
"epoch": 0.7572731220147634, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.00016252833039536642, |
|
"loss": 1.9731, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.7579678679982631, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.00016247796524804836, |
|
"loss": 1.6051, |
|
"step": 1091 |
|
}, |
|
{ |
|
"epoch": 0.7586626139817629, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.0001624276001007303, |
|
"loss": 2.0622, |
|
"step": 1092 |
|
}, |
|
{ |
|
"epoch": 0.7593573599652627, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.00016237723495341223, |
|
"loss": 1.8382, |
|
"step": 1093 |
|
}, |
|
{ |
|
"epoch": 0.7600521059487625, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.0001623268698060942, |
|
"loss": 2.0037, |
|
"step": 1094 |
|
}, |
|
{ |
|
"epoch": 0.7607468519322622, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.00016227650465877613, |
|
"loss": 2.0415, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.761441597915762, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.0001622261395114581, |
|
"loss": 2.0115, |
|
"step": 1096 |
|
}, |
|
{ |
|
"epoch": 0.7621363438992619, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.00016217577436414003, |
|
"loss": 2.1002, |
|
"step": 1097 |
|
}, |
|
{ |
|
"epoch": 0.7628310898827616, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 0.00016212540921682196, |
|
"loss": 1.3513, |
|
"step": 1098 |
|
}, |
|
{ |
|
"epoch": 0.7635258358662614, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0001620750440695039, |
|
"loss": 1.8398, |
|
"step": 1099 |
|
}, |
|
{ |
|
"epoch": 0.7642205818497612, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 0.00016202467892218584, |
|
"loss": 2.1121, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.764915327833261, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 0.0001619743137748678, |
|
"loss": 1.3797, |
|
"step": 1101 |
|
}, |
|
{ |
|
"epoch": 0.7656100738167607, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.00016192394862754974, |
|
"loss": 2.0456, |
|
"step": 1102 |
|
}, |
|
{ |
|
"epoch": 0.7663048198002606, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.0001618735834802317, |
|
"loss": 1.7388, |
|
"step": 1103 |
|
}, |
|
{ |
|
"epoch": 0.7669995657837603, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.00016182321833291364, |
|
"loss": 2.0159, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 0.7676943117672601, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 0.00016177285318559557, |
|
"loss": 1.6245, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.7683890577507598, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.0001617224880382775, |
|
"loss": 1.9893, |
|
"step": 1106 |
|
}, |
|
{ |
|
"epoch": 0.7690838037342597, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.00016167212289095945, |
|
"loss": 1.8131, |
|
"step": 1107 |
|
}, |
|
{ |
|
"epoch": 0.7697785497177595, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.0001616217577436414, |
|
"loss": 2.1454, |
|
"step": 1108 |
|
}, |
|
{ |
|
"epoch": 0.7704732957012592, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.00016157139259632335, |
|
"loss": 1.9464, |
|
"step": 1109 |
|
}, |
|
{ |
|
"epoch": 0.771168041684759, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0001615210274490053, |
|
"loss": 1.5576, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.7718627876682588, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.00016147066230168725, |
|
"loss": 1.524, |
|
"step": 1111 |
|
}, |
|
{ |
|
"epoch": 0.7725575336517586, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 0.00016142029715436918, |
|
"loss": 1.597, |
|
"step": 1112 |
|
}, |
|
{ |
|
"epoch": 0.7732522796352583, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.00016136993200705112, |
|
"loss": 1.8816, |
|
"step": 1113 |
|
}, |
|
{ |
|
"epoch": 0.7739470256187582, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.00016131956685973305, |
|
"loss": 2.019, |
|
"step": 1114 |
|
}, |
|
{ |
|
"epoch": 0.7746417716022579, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.00016126920171241502, |
|
"loss": 1.7259, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.7753365175857577, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.00016121883656509695, |
|
"loss": 1.7952, |
|
"step": 1116 |
|
}, |
|
{ |
|
"epoch": 0.7760312635692574, |
|
"grad_norm": 2.0, |
|
"learning_rate": 0.00016116847141777892, |
|
"loss": 1.8621, |
|
"step": 1117 |
|
}, |
|
{ |
|
"epoch": 0.7767260095527573, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.00016111810627046085, |
|
"loss": 2.1855, |
|
"step": 1118 |
|
}, |
|
{ |
|
"epoch": 0.777420755536257, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.0001610677411231428, |
|
"loss": 1.9703, |
|
"step": 1119 |
|
}, |
|
{ |
|
"epoch": 0.7781155015197568, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.00016101737597582473, |
|
"loss": 2.0817, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.7788102475032567, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.00016096701082850666, |
|
"loss": 2.18, |
|
"step": 1121 |
|
}, |
|
{ |
|
"epoch": 0.7795049934867564, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.00016091664568118863, |
|
"loss": 1.8113, |
|
"step": 1122 |
|
}, |
|
{ |
|
"epoch": 0.7801997394702562, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.00016086628053387056, |
|
"loss": 1.9625, |
|
"step": 1123 |
|
}, |
|
{ |
|
"epoch": 0.780894485453756, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 0.00016081591538655253, |
|
"loss": 1.6948, |
|
"step": 1124 |
|
}, |
|
{ |
|
"epoch": 0.7815892314372558, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 0.00016076555023923446, |
|
"loss": 2.1205, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.7822839774207555, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.0001607151850919164, |
|
"loss": 1.8704, |
|
"step": 1126 |
|
}, |
|
{ |
|
"epoch": 0.7829787234042553, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.00016066481994459833, |
|
"loss": 1.6526, |
|
"step": 1127 |
|
}, |
|
{ |
|
"epoch": 0.7836734693877551, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.0001606144547972803, |
|
"loss": 2.0315, |
|
"step": 1128 |
|
}, |
|
{ |
|
"epoch": 0.7843682153712549, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.00016056408964996223, |
|
"loss": 2.0635, |
|
"step": 1129 |
|
}, |
|
{ |
|
"epoch": 0.7850629613547546, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 0.00016051372450264417, |
|
"loss": 1.9912, |
|
"step": 1130 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 4317, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 10, |
|
"total_flos": 7.924900854625124e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|