{ "best_metric": 5.182580947875977, "best_model_checkpoint": "./results/models/mistral-prot/checkpoint-937410", "epoch": 7.0, "eval_steps": 500, "global_step": 1093645, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032003072294940313, "grad_norm": 0.291015625, "learning_rate": 0.00039997439754216405, "loss": 5.83, "step": 500 }, { "epoch": 0.006400614458988063, "grad_norm": 0.30859375, "learning_rate": 0.0003999487950843281, "loss": 5.6885, "step": 1000 }, { "epoch": 0.009600921688482094, "grad_norm": 0.271484375, "learning_rate": 0.0003999231926264922, "loss": 5.6553, "step": 1500 }, { "epoch": 0.012801228917976125, "grad_norm": 0.28125, "learning_rate": 0.0003998975901686562, "loss": 5.6407, "step": 2000 }, { "epoch": 0.016001536147470158, "grad_norm": 0.2578125, "learning_rate": 0.00039987198771082024, "loss": 5.6315, "step": 2500 }, { "epoch": 0.01920184337696419, "grad_norm": 0.2470703125, "learning_rate": 0.0003998463852529843, "loss": 5.6252, "step": 3000 }, { "epoch": 0.02240215060645822, "grad_norm": 0.26953125, "learning_rate": 0.00039982078279514837, "loss": 5.6176, "step": 3500 }, { "epoch": 0.02560245783595225, "grad_norm": 0.30078125, "learning_rate": 0.0003997951803373124, "loss": 5.6118, "step": 4000 }, { "epoch": 0.02880276506544628, "grad_norm": 0.259765625, "learning_rate": 0.0003997695778794765, "loss": 5.6065, "step": 4500 }, { "epoch": 0.032003072294940316, "grad_norm": 0.26171875, "learning_rate": 0.0003997439754216405, "loss": 5.6034, "step": 5000 }, { "epoch": 0.03520337952443434, "grad_norm": 0.271484375, "learning_rate": 0.00039971837296380456, "loss": 5.5962, "step": 5500 }, { "epoch": 0.03840368675392838, "grad_norm": 0.26953125, "learning_rate": 0.0003996927705059686, "loss": 5.5968, "step": 6000 }, { "epoch": 0.04160399398342241, "grad_norm": 0.265625, "learning_rate": 0.0003996671680481326, "loss": 5.5895, "step": 6500 }, { "epoch": 0.04480430121291644, "grad_norm": 0.2578125, "learning_rate": 0.0003996415655902967, "loss": 5.5867, "step": 7000 }, { "epoch": 0.048004608442410474, "grad_norm": 0.328125, "learning_rate": 0.00039961596313246075, "loss": 5.5837, "step": 7500 }, { "epoch": 0.0512049156719045, "grad_norm": 0.328125, "learning_rate": 0.0003995903606746248, "loss": 5.5813, "step": 8000 }, { "epoch": 0.054405222901398535, "grad_norm": 0.267578125, "learning_rate": 0.0003995647582167888, "loss": 5.577, "step": 8500 }, { "epoch": 0.05760553013089256, "grad_norm": 0.251953125, "learning_rate": 0.00039953915575895285, "loss": 5.571, "step": 9000 }, { "epoch": 0.0608058373603866, "grad_norm": 0.25390625, "learning_rate": 0.0003995135533011169, "loss": 5.5684, "step": 9500 }, { "epoch": 0.06400614458988063, "grad_norm": 0.25390625, "learning_rate": 0.000399487950843281, "loss": 5.566, "step": 10000 }, { "epoch": 0.06720645181937467, "grad_norm": 0.255859375, "learning_rate": 0.000399462348385445, "loss": 5.5603, "step": 10500 }, { "epoch": 0.07040675904886869, "grad_norm": 0.2734375, "learning_rate": 0.00039943674592760904, "loss": 5.5588, "step": 11000 }, { "epoch": 0.07360706627836272, "grad_norm": 0.2392578125, "learning_rate": 0.00039941114346977313, "loss": 5.5549, "step": 11500 }, { "epoch": 0.07680737350785675, "grad_norm": 0.265625, "learning_rate": 0.00039938554101193717, "loss": 5.5491, "step": 12000 }, { "epoch": 0.08000768073735079, "grad_norm": 0.244140625, "learning_rate": 0.00039935993855410125, "loss": 5.5479, "step": 12500 }, { "epoch": 0.08320798796684482, "grad_norm": 0.27734375, "learning_rate": 0.0003993343360962653, "loss": 5.5413, "step": 13000 }, { "epoch": 0.08640829519633884, "grad_norm": 0.26171875, "learning_rate": 0.0003993087336384293, "loss": 5.5361, "step": 13500 }, { "epoch": 0.08960860242583288, "grad_norm": 0.263671875, "learning_rate": 0.00039928313118059336, "loss": 5.5309, "step": 14000 }, { "epoch": 0.09280890965532691, "grad_norm": 0.314453125, "learning_rate": 0.0003992575287227574, "loss": 5.5225, "step": 14500 }, { "epoch": 0.09600921688482095, "grad_norm": 0.265625, "learning_rate": 0.0003992319262649214, "loss": 5.5158, "step": 15000 }, { "epoch": 0.09920952411431497, "grad_norm": 0.28125, "learning_rate": 0.0003992063238070855, "loss": 5.5118, "step": 15500 }, { "epoch": 0.102409831343809, "grad_norm": 0.2421875, "learning_rate": 0.00039918072134924955, "loss": 5.5003, "step": 16000 }, { "epoch": 0.10561013857330304, "grad_norm": 0.255859375, "learning_rate": 0.0003991551188914136, "loss": 5.4983, "step": 16500 }, { "epoch": 0.10881044580279707, "grad_norm": 0.25390625, "learning_rate": 0.0003991295164335776, "loss": 5.4975, "step": 17000 }, { "epoch": 0.1120107530322911, "grad_norm": 0.26953125, "learning_rate": 0.00039910391397574165, "loss": 5.4921, "step": 17500 }, { "epoch": 0.11521106026178513, "grad_norm": 0.26171875, "learning_rate": 0.00039907831151790574, "loss": 5.4822, "step": 18000 }, { "epoch": 0.11841136749127916, "grad_norm": 0.25390625, "learning_rate": 0.0003990527090600698, "loss": 5.476, "step": 18500 }, { "epoch": 0.1216116747207732, "grad_norm": 0.251953125, "learning_rate": 0.00039902710660223386, "loss": 5.4742, "step": 19000 }, { "epoch": 0.12481198195026723, "grad_norm": 0.248046875, "learning_rate": 0.0003990015041443979, "loss": 5.4732, "step": 19500 }, { "epoch": 0.12801228917976126, "grad_norm": 0.248046875, "learning_rate": 0.00039897590168656193, "loss": 5.4715, "step": 20000 }, { "epoch": 0.13121259640925528, "grad_norm": 0.2578125, "learning_rate": 0.000398950299228726, "loss": 5.4682, "step": 20500 }, { "epoch": 0.13441290363874933, "grad_norm": 0.2734375, "learning_rate": 0.00039892469677089005, "loss": 5.4649, "step": 21000 }, { "epoch": 0.13761321086824335, "grad_norm": 0.26171875, "learning_rate": 0.0003988990943130541, "loss": 5.4609, "step": 21500 }, { "epoch": 0.14081351809773737, "grad_norm": 0.271484375, "learning_rate": 0.0003988734918552181, "loss": 5.4567, "step": 22000 }, { "epoch": 0.14401382532723142, "grad_norm": 0.255859375, "learning_rate": 0.00039884788939738216, "loss": 5.4546, "step": 22500 }, { "epoch": 0.14721413255672544, "grad_norm": 0.2578125, "learning_rate": 0.0003988222869395462, "loss": 5.4503, "step": 23000 }, { "epoch": 0.1504144397862195, "grad_norm": 0.267578125, "learning_rate": 0.0003987966844817103, "loss": 5.4432, "step": 23500 }, { "epoch": 0.1536147470157135, "grad_norm": 0.25390625, "learning_rate": 0.0003987710820238743, "loss": 5.4463, "step": 24000 }, { "epoch": 0.15681505424520753, "grad_norm": 0.267578125, "learning_rate": 0.00039874547956603835, "loss": 5.4436, "step": 24500 }, { "epoch": 0.16001536147470158, "grad_norm": 0.279296875, "learning_rate": 0.0003987198771082024, "loss": 5.4418, "step": 25000 }, { "epoch": 0.1632156687041956, "grad_norm": 0.259765625, "learning_rate": 0.0003986942746503664, "loss": 5.4418, "step": 25500 }, { "epoch": 0.16641597593368965, "grad_norm": 0.26953125, "learning_rate": 0.0003986686721925305, "loss": 5.4369, "step": 26000 }, { "epoch": 0.16961628316318367, "grad_norm": 0.2490234375, "learning_rate": 0.00039864306973469454, "loss": 5.4352, "step": 26500 }, { "epoch": 0.1728165903926777, "grad_norm": 0.2578125, "learning_rate": 0.00039861746727685863, "loss": 5.4318, "step": 27000 }, { "epoch": 0.17601689762217174, "grad_norm": 0.25390625, "learning_rate": 0.00039859186481902266, "loss": 5.4271, "step": 27500 }, { "epoch": 0.17921720485166576, "grad_norm": 0.265625, "learning_rate": 0.0003985662623611867, "loss": 5.4238, "step": 28000 }, { "epoch": 0.1824175120811598, "grad_norm": 0.251953125, "learning_rate": 0.00039854065990335073, "loss": 5.4235, "step": 28500 }, { "epoch": 0.18561781931065383, "grad_norm": 0.271484375, "learning_rate": 0.0003985150574455148, "loss": 5.4201, "step": 29000 }, { "epoch": 0.18881812654014785, "grad_norm": 0.263671875, "learning_rate": 0.00039848945498767885, "loss": 5.4201, "step": 29500 }, { "epoch": 0.1920184337696419, "grad_norm": 0.259765625, "learning_rate": 0.0003984638525298429, "loss": 5.4195, "step": 30000 }, { "epoch": 0.19521874099913591, "grad_norm": 0.255859375, "learning_rate": 0.0003984382500720069, "loss": 5.4198, "step": 30500 }, { "epoch": 0.19841904822862994, "grad_norm": 0.263671875, "learning_rate": 0.00039841264761417096, "loss": 5.4172, "step": 31000 }, { "epoch": 0.20161935545812398, "grad_norm": 0.255859375, "learning_rate": 0.00039838704515633504, "loss": 5.4132, "step": 31500 }, { "epoch": 0.204819662687618, "grad_norm": 0.271484375, "learning_rate": 0.0003983614426984991, "loss": 5.4128, "step": 32000 }, { "epoch": 0.20801996991711205, "grad_norm": 0.25, "learning_rate": 0.0003983358402406631, "loss": 5.4085, "step": 32500 }, { "epoch": 0.21122027714660607, "grad_norm": 0.2578125, "learning_rate": 0.00039831023778282715, "loss": 5.4076, "step": 33000 }, { "epoch": 0.2144205843761001, "grad_norm": 0.26953125, "learning_rate": 0.0003982846353249912, "loss": 5.4045, "step": 33500 }, { "epoch": 0.21762089160559414, "grad_norm": 0.2578125, "learning_rate": 0.00039825903286715527, "loss": 5.4033, "step": 34000 }, { "epoch": 0.22082119883508816, "grad_norm": 0.2490234375, "learning_rate": 0.0003982334304093193, "loss": 5.4007, "step": 34500 }, { "epoch": 0.2240215060645822, "grad_norm": 0.26953125, "learning_rate": 0.0003982078279514834, "loss": 5.3985, "step": 35000 }, { "epoch": 0.22722181329407623, "grad_norm": 0.265625, "learning_rate": 0.0003981822254936474, "loss": 5.3999, "step": 35500 }, { "epoch": 0.23042212052357025, "grad_norm": 0.2490234375, "learning_rate": 0.00039815662303581146, "loss": 5.3982, "step": 36000 }, { "epoch": 0.2336224277530643, "grad_norm": 0.28515625, "learning_rate": 0.0003981310205779755, "loss": 5.3966, "step": 36500 }, { "epoch": 0.23682273498255832, "grad_norm": 0.265625, "learning_rate": 0.0003981054181201396, "loss": 5.3965, "step": 37000 }, { "epoch": 0.24002304221205237, "grad_norm": 0.26953125, "learning_rate": 0.0003980798156623036, "loss": 5.3941, "step": 37500 }, { "epoch": 0.2432233494415464, "grad_norm": 0.267578125, "learning_rate": 0.00039805421320446765, "loss": 5.3914, "step": 38000 }, { "epoch": 0.2464236566710404, "grad_norm": 0.265625, "learning_rate": 0.0003980286107466317, "loss": 5.387, "step": 38500 }, { "epoch": 0.24962396390053446, "grad_norm": 0.265625, "learning_rate": 0.0003980030082887957, "loss": 5.3889, "step": 39000 }, { "epoch": 0.2528242711300285, "grad_norm": 0.2734375, "learning_rate": 0.00039797740583095975, "loss": 5.3829, "step": 39500 }, { "epoch": 0.2560245783595225, "grad_norm": 0.2734375, "learning_rate": 0.00039795180337312384, "loss": 5.3887, "step": 40000 }, { "epoch": 0.2592248855890165, "grad_norm": 0.26953125, "learning_rate": 0.0003979262009152879, "loss": 5.3815, "step": 40500 }, { "epoch": 0.26242519281851057, "grad_norm": 0.28515625, "learning_rate": 0.0003979005984574519, "loss": 5.3823, "step": 41000 }, { "epoch": 0.2656255000480046, "grad_norm": 0.2734375, "learning_rate": 0.000397874995999616, "loss": 5.3804, "step": 41500 }, { "epoch": 0.26882580727749866, "grad_norm": 0.2734375, "learning_rate": 0.00039784939354178003, "loss": 5.3816, "step": 42000 }, { "epoch": 0.27202611450699266, "grad_norm": 0.26171875, "learning_rate": 0.0003978237910839441, "loss": 5.3798, "step": 42500 }, { "epoch": 0.2752264217364867, "grad_norm": 0.25390625, "learning_rate": 0.00039779818862610816, "loss": 5.3706, "step": 43000 }, { "epoch": 0.27842672896598075, "grad_norm": 0.26953125, "learning_rate": 0.0003977725861682722, "loss": 5.3747, "step": 43500 }, { "epoch": 0.28162703619547474, "grad_norm": 0.279296875, "learning_rate": 0.0003977469837104362, "loss": 5.3684, "step": 44000 }, { "epoch": 0.2848273434249688, "grad_norm": 0.26953125, "learning_rate": 0.00039772138125260026, "loss": 5.3669, "step": 44500 }, { "epoch": 0.28802765065446284, "grad_norm": 0.259765625, "learning_rate": 0.0003976957787947643, "loss": 5.3725, "step": 45000 }, { "epoch": 0.29122795788395683, "grad_norm": 0.2734375, "learning_rate": 0.0003976701763369284, "loss": 5.3737, "step": 45500 }, { "epoch": 0.2944282651134509, "grad_norm": 0.2734375, "learning_rate": 0.0003976445738790924, "loss": 5.3694, "step": 46000 }, { "epoch": 0.29762857234294493, "grad_norm": 0.265625, "learning_rate": 0.00039761897142125645, "loss": 5.3707, "step": 46500 }, { "epoch": 0.300828879572439, "grad_norm": 0.283203125, "learning_rate": 0.0003975933689634205, "loss": 5.3625, "step": 47000 }, { "epoch": 0.30402918680193297, "grad_norm": 0.267578125, "learning_rate": 0.0003975677665055845, "loss": 5.3647, "step": 47500 }, { "epoch": 0.307229494031427, "grad_norm": 0.2578125, "learning_rate": 0.0003975421640477486, "loss": 5.3624, "step": 48000 }, { "epoch": 0.31042980126092107, "grad_norm": 0.267578125, "learning_rate": 0.00039751656158991264, "loss": 5.3623, "step": 48500 }, { "epoch": 0.31363010849041506, "grad_norm": 0.2734375, "learning_rate": 0.0003974909591320767, "loss": 5.3642, "step": 49000 }, { "epoch": 0.3168304157199091, "grad_norm": 0.28125, "learning_rate": 0.00039746535667424076, "loss": 5.3574, "step": 49500 }, { "epoch": 0.32003072294940316, "grad_norm": 0.267578125, "learning_rate": 0.0003974397542164048, "loss": 5.361, "step": 50000 }, { "epoch": 0.32323103017889715, "grad_norm": 0.279296875, "learning_rate": 0.00039741415175856883, "loss": 5.3607, "step": 50500 }, { "epoch": 0.3264313374083912, "grad_norm": 0.265625, "learning_rate": 0.0003973885493007329, "loss": 5.3574, "step": 51000 }, { "epoch": 0.32963164463788525, "grad_norm": 0.271484375, "learning_rate": 0.00039736294684289696, "loss": 5.353, "step": 51500 }, { "epoch": 0.3328319518673793, "grad_norm": 0.263671875, "learning_rate": 0.000397337344385061, "loss": 5.3515, "step": 52000 }, { "epoch": 0.3360322590968733, "grad_norm": 0.283203125, "learning_rate": 0.000397311741927225, "loss": 5.3555, "step": 52500 }, { "epoch": 0.33923256632636734, "grad_norm": 0.279296875, "learning_rate": 0.00039728613946938906, "loss": 5.3451, "step": 53000 }, { "epoch": 0.3424328735558614, "grad_norm": 0.267578125, "learning_rate": 0.00039726053701155315, "loss": 5.3454, "step": 53500 }, { "epoch": 0.3456331807853554, "grad_norm": 0.27734375, "learning_rate": 0.0003972349345537172, "loss": 5.3493, "step": 54000 }, { "epoch": 0.3488334880148494, "grad_norm": 0.279296875, "learning_rate": 0.0003972093320958812, "loss": 5.3508, "step": 54500 }, { "epoch": 0.3520337952443435, "grad_norm": 0.27734375, "learning_rate": 0.00039718372963804525, "loss": 5.3413, "step": 55000 }, { "epoch": 0.35523410247383747, "grad_norm": 0.275390625, "learning_rate": 0.0003971581271802093, "loss": 5.3471, "step": 55500 }, { "epoch": 0.3584344097033315, "grad_norm": 0.26953125, "learning_rate": 0.0003971325247223733, "loss": 5.3464, "step": 56000 }, { "epoch": 0.36163471693282556, "grad_norm": 0.2890625, "learning_rate": 0.0003971069222645374, "loss": 5.3434, "step": 56500 }, { "epoch": 0.3648350241623196, "grad_norm": 0.263671875, "learning_rate": 0.0003970813198067015, "loss": 5.3393, "step": 57000 }, { "epoch": 0.3680353313918136, "grad_norm": 0.28515625, "learning_rate": 0.00039705571734886553, "loss": 5.3449, "step": 57500 }, { "epoch": 0.37123563862130765, "grad_norm": 0.291015625, "learning_rate": 0.00039703011489102956, "loss": 5.3393, "step": 58000 }, { "epoch": 0.3744359458508017, "grad_norm": 0.28515625, "learning_rate": 0.0003970045124331936, "loss": 5.3411, "step": 58500 }, { "epoch": 0.3776362530802957, "grad_norm": 0.271484375, "learning_rate": 0.0003969789099753577, "loss": 5.3293, "step": 59000 }, { "epoch": 0.38083656030978974, "grad_norm": 0.28515625, "learning_rate": 0.0003969533075175217, "loss": 5.3352, "step": 59500 }, { "epoch": 0.3840368675392838, "grad_norm": 0.2734375, "learning_rate": 0.00039692770505968575, "loss": 5.334, "step": 60000 }, { "epoch": 0.3872371747687778, "grad_norm": 0.271484375, "learning_rate": 0.0003969021026018498, "loss": 5.3365, "step": 60500 }, { "epoch": 0.39043748199827183, "grad_norm": 0.2734375, "learning_rate": 0.0003968765001440138, "loss": 5.3376, "step": 61000 }, { "epoch": 0.3936377892277659, "grad_norm": 0.275390625, "learning_rate": 0.0003968508976861779, "loss": 5.332, "step": 61500 }, { "epoch": 0.39683809645725987, "grad_norm": 0.275390625, "learning_rate": 0.00039682529522834195, "loss": 5.3327, "step": 62000 }, { "epoch": 0.4000384036867539, "grad_norm": 0.28515625, "learning_rate": 0.000396799692770506, "loss": 5.3324, "step": 62500 }, { "epoch": 0.40323871091624797, "grad_norm": 0.29296875, "learning_rate": 0.00039677409031267, "loss": 5.333, "step": 63000 }, { "epoch": 0.406439018145742, "grad_norm": 0.283203125, "learning_rate": 0.00039674848785483405, "loss": 5.3365, "step": 63500 }, { "epoch": 0.409639325375236, "grad_norm": 0.271484375, "learning_rate": 0.00039672288539699814, "loss": 5.3299, "step": 64000 }, { "epoch": 0.41283963260473006, "grad_norm": 0.291015625, "learning_rate": 0.00039669728293916217, "loss": 5.3274, "step": 64500 }, { "epoch": 0.4160399398342241, "grad_norm": 0.279296875, "learning_rate": 0.00039667168048132626, "loss": 5.3297, "step": 65000 }, { "epoch": 0.4192402470637181, "grad_norm": 0.27734375, "learning_rate": 0.0003966460780234903, "loss": 5.3297, "step": 65500 }, { "epoch": 0.42244055429321214, "grad_norm": 0.310546875, "learning_rate": 0.00039662047556565433, "loss": 5.3258, "step": 66000 }, { "epoch": 0.4256408615227062, "grad_norm": 0.279296875, "learning_rate": 0.00039659487310781836, "loss": 5.3202, "step": 66500 }, { "epoch": 0.4288411687522002, "grad_norm": 0.279296875, "learning_rate": 0.00039656927064998245, "loss": 5.3167, "step": 67000 }, { "epoch": 0.43204147598169423, "grad_norm": 0.2890625, "learning_rate": 0.0003965436681921465, "loss": 5.3217, "step": 67500 }, { "epoch": 0.4352417832111883, "grad_norm": 0.291015625, "learning_rate": 0.0003965180657343105, "loss": 5.3199, "step": 68000 }, { "epoch": 0.43844209044068233, "grad_norm": 0.310546875, "learning_rate": 0.00039649246327647455, "loss": 5.3201, "step": 68500 }, { "epoch": 0.4416423976701763, "grad_norm": 0.2734375, "learning_rate": 0.0003964668608186386, "loss": 5.3223, "step": 69000 }, { "epoch": 0.44484270489967037, "grad_norm": 0.279296875, "learning_rate": 0.0003964412583608026, "loss": 5.3276, "step": 69500 }, { "epoch": 0.4480430121291644, "grad_norm": 0.271484375, "learning_rate": 0.0003964156559029667, "loss": 5.3137, "step": 70000 }, { "epoch": 0.4512433193586584, "grad_norm": 0.2890625, "learning_rate": 0.00039639005344513074, "loss": 5.3179, "step": 70500 }, { "epoch": 0.45444362658815246, "grad_norm": 0.283203125, "learning_rate": 0.0003963644509872948, "loss": 5.3192, "step": 71000 }, { "epoch": 0.4576439338176465, "grad_norm": 0.291015625, "learning_rate": 0.0003963388485294588, "loss": 5.317, "step": 71500 }, { "epoch": 0.4608442410471405, "grad_norm": 0.291015625, "learning_rate": 0.0003963132460716229, "loss": 5.3145, "step": 72000 }, { "epoch": 0.46404454827663455, "grad_norm": 0.283203125, "learning_rate": 0.00039628764361378694, "loss": 5.3123, "step": 72500 }, { "epoch": 0.4672448555061286, "grad_norm": 0.287109375, "learning_rate": 0.000396262041155951, "loss": 5.3127, "step": 73000 }, { "epoch": 0.47044516273562265, "grad_norm": 0.287109375, "learning_rate": 0.00039623643869811506, "loss": 5.3195, "step": 73500 }, { "epoch": 0.47364546996511664, "grad_norm": 0.28515625, "learning_rate": 0.0003962108362402791, "loss": 5.3111, "step": 74000 }, { "epoch": 0.4768457771946107, "grad_norm": 0.2890625, "learning_rate": 0.00039618523378244313, "loss": 5.3141, "step": 74500 }, { "epoch": 0.48004608442410474, "grad_norm": 0.287109375, "learning_rate": 0.00039615963132460716, "loss": 5.3062, "step": 75000 }, { "epoch": 0.48324639165359873, "grad_norm": 0.283203125, "learning_rate": 0.00039613402886677125, "loss": 5.3127, "step": 75500 }, { "epoch": 0.4864466988830928, "grad_norm": 0.2890625, "learning_rate": 0.0003961084264089353, "loss": 5.31, "step": 76000 }, { "epoch": 0.4896470061125868, "grad_norm": 0.287109375, "learning_rate": 0.0003960828239510993, "loss": 5.3139, "step": 76500 }, { "epoch": 0.4928473133420808, "grad_norm": 0.27734375, "learning_rate": 0.00039605722149326335, "loss": 5.3109, "step": 77000 }, { "epoch": 0.49604762057157487, "grad_norm": 0.279296875, "learning_rate": 0.0003960316190354274, "loss": 5.3037, "step": 77500 }, { "epoch": 0.4992479278010689, "grad_norm": 0.275390625, "learning_rate": 0.0003960060165775915, "loss": 5.3057, "step": 78000 }, { "epoch": 0.5024482350305629, "grad_norm": 0.29296875, "learning_rate": 0.0003959804141197555, "loss": 5.3097, "step": 78500 }, { "epoch": 0.505648542260057, "grad_norm": 0.3046875, "learning_rate": 0.00039595481166191954, "loss": 5.3118, "step": 79000 }, { "epoch": 0.508848849489551, "grad_norm": 0.291015625, "learning_rate": 0.00039592920920408363, "loss": 5.3031, "step": 79500 }, { "epoch": 0.512049156719045, "grad_norm": 0.287109375, "learning_rate": 0.00039590360674624767, "loss": 5.3047, "step": 80000 }, { "epoch": 0.5152494639485391, "grad_norm": 0.283203125, "learning_rate": 0.0003958780042884117, "loss": 5.307, "step": 80500 }, { "epoch": 0.518449771178033, "grad_norm": 0.291015625, "learning_rate": 0.0003958524018305758, "loss": 5.3015, "step": 81000 }, { "epoch": 0.5216500784075271, "grad_norm": 0.3046875, "learning_rate": 0.0003958267993727398, "loss": 5.2987, "step": 81500 }, { "epoch": 0.5248503856370211, "grad_norm": 0.28515625, "learning_rate": 0.00039580119691490386, "loss": 5.2999, "step": 82000 }, { "epoch": 0.5280506928665152, "grad_norm": 0.279296875, "learning_rate": 0.0003957755944570679, "loss": 5.2987, "step": 82500 }, { "epoch": 0.5312510000960092, "grad_norm": 0.294921875, "learning_rate": 0.0003957499919992319, "loss": 5.301, "step": 83000 }, { "epoch": 0.5344513073255033, "grad_norm": 0.296875, "learning_rate": 0.000395724389541396, "loss": 5.3026, "step": 83500 }, { "epoch": 0.5376516145549973, "grad_norm": 0.2890625, "learning_rate": 0.00039569878708356005, "loss": 5.2979, "step": 84000 }, { "epoch": 0.5408519217844913, "grad_norm": 0.298828125, "learning_rate": 0.0003956731846257241, "loss": 5.3044, "step": 84500 }, { "epoch": 0.5440522290139853, "grad_norm": 0.30078125, "learning_rate": 0.0003956475821678881, "loss": 5.3011, "step": 85000 }, { "epoch": 0.5472525362434794, "grad_norm": 0.27734375, "learning_rate": 0.00039562197971005215, "loss": 5.3003, "step": 85500 }, { "epoch": 0.5504528434729734, "grad_norm": 0.287109375, "learning_rate": 0.0003955963772522162, "loss": 5.3009, "step": 86000 }, { "epoch": 0.5536531507024675, "grad_norm": 0.2890625, "learning_rate": 0.0003955707747943803, "loss": 5.299, "step": 86500 }, { "epoch": 0.5568534579319615, "grad_norm": 0.27734375, "learning_rate": 0.0003955451723365443, "loss": 5.295, "step": 87000 }, { "epoch": 0.5600537651614556, "grad_norm": 0.28515625, "learning_rate": 0.0003955195698787084, "loss": 5.2971, "step": 87500 }, { "epoch": 0.5632540723909495, "grad_norm": 0.29296875, "learning_rate": 0.00039549396742087243, "loss": 5.2982, "step": 88000 }, { "epoch": 0.5664543796204435, "grad_norm": 0.30078125, "learning_rate": 0.00039546836496303647, "loss": 5.2923, "step": 88500 }, { "epoch": 0.5696546868499376, "grad_norm": 0.283203125, "learning_rate": 0.00039544276250520055, "loss": 5.2898, "step": 89000 }, { "epoch": 0.5728549940794316, "grad_norm": 0.296875, "learning_rate": 0.0003954171600473646, "loss": 5.298, "step": 89500 }, { "epoch": 0.5760553013089257, "grad_norm": 0.291015625, "learning_rate": 0.0003953915575895286, "loss": 5.2947, "step": 90000 }, { "epoch": 0.5792556085384197, "grad_norm": 0.28125, "learning_rate": 0.00039536595513169266, "loss": 5.2937, "step": 90500 }, { "epoch": 0.5824559157679137, "grad_norm": 0.283203125, "learning_rate": 0.0003953403526738567, "loss": 5.287, "step": 91000 }, { "epoch": 0.5856562229974077, "grad_norm": 0.28515625, "learning_rate": 0.0003953147502160207, "loss": 5.2907, "step": 91500 }, { "epoch": 0.5888565302269018, "grad_norm": 0.294921875, "learning_rate": 0.0003952891477581848, "loss": 5.2879, "step": 92000 }, { "epoch": 0.5920568374563958, "grad_norm": 0.29296875, "learning_rate": 0.00039526354530034885, "loss": 5.2897, "step": 92500 }, { "epoch": 0.5952571446858899, "grad_norm": 0.30078125, "learning_rate": 0.0003952379428425129, "loss": 5.2899, "step": 93000 }, { "epoch": 0.5984574519153839, "grad_norm": 0.287109375, "learning_rate": 0.0003952123403846769, "loss": 5.2895, "step": 93500 }, { "epoch": 0.601657759144878, "grad_norm": 0.279296875, "learning_rate": 0.00039518673792684095, "loss": 5.293, "step": 94000 }, { "epoch": 0.6048580663743719, "grad_norm": 0.296875, "learning_rate": 0.00039516113546900504, "loss": 5.2785, "step": 94500 }, { "epoch": 0.6080583736038659, "grad_norm": 0.291015625, "learning_rate": 0.00039513553301116913, "loss": 5.2833, "step": 95000 }, { "epoch": 0.61125868083336, "grad_norm": 0.287109375, "learning_rate": 0.00039510993055333316, "loss": 5.2893, "step": 95500 }, { "epoch": 0.614458988062854, "grad_norm": 0.283203125, "learning_rate": 0.0003950843280954972, "loss": 5.2882, "step": 96000 }, { "epoch": 0.6176592952923481, "grad_norm": 0.302734375, "learning_rate": 0.00039505872563766123, "loss": 5.2862, "step": 96500 }, { "epoch": 0.6208596025218421, "grad_norm": 0.298828125, "learning_rate": 0.0003950331231798253, "loss": 5.2826, "step": 97000 }, { "epoch": 0.6240599097513362, "grad_norm": 0.306640625, "learning_rate": 0.00039500752072198935, "loss": 5.2812, "step": 97500 }, { "epoch": 0.6272602169808301, "grad_norm": 0.291015625, "learning_rate": 0.0003949819182641534, "loss": 5.2875, "step": 98000 }, { "epoch": 0.6304605242103242, "grad_norm": 0.29296875, "learning_rate": 0.0003949563158063174, "loss": 5.2858, "step": 98500 }, { "epoch": 0.6336608314398182, "grad_norm": 0.30078125, "learning_rate": 0.00039493071334848146, "loss": 5.2816, "step": 99000 }, { "epoch": 0.6368611386693123, "grad_norm": 0.294921875, "learning_rate": 0.0003949051108906455, "loss": 5.2786, "step": 99500 }, { "epoch": 0.6400614458988063, "grad_norm": 0.28125, "learning_rate": 0.0003948795084328096, "loss": 5.2865, "step": 100000 }, { "epoch": 0.6432617531283004, "grad_norm": 0.294921875, "learning_rate": 0.0003948539059749736, "loss": 5.2846, "step": 100500 }, { "epoch": 0.6464620603577943, "grad_norm": 0.310546875, "learning_rate": 0.00039482830351713765, "loss": 5.2812, "step": 101000 }, { "epoch": 0.6496623675872883, "grad_norm": 0.3125, "learning_rate": 0.0003948027010593017, "loss": 5.2776, "step": 101500 }, { "epoch": 0.6528626748167824, "grad_norm": 0.30859375, "learning_rate": 0.00039477709860146577, "loss": 5.2828, "step": 102000 }, { "epoch": 0.6560629820462764, "grad_norm": 0.30859375, "learning_rate": 0.0003947514961436298, "loss": 5.28, "step": 102500 }, { "epoch": 0.6592632892757705, "grad_norm": 0.29296875, "learning_rate": 0.0003947258936857939, "loss": 5.2795, "step": 103000 }, { "epoch": 0.6624635965052645, "grad_norm": 0.287109375, "learning_rate": 0.0003947002912279579, "loss": 5.2777, "step": 103500 }, { "epoch": 0.6656639037347586, "grad_norm": 0.30859375, "learning_rate": 0.00039467468877012196, "loss": 5.2839, "step": 104000 }, { "epoch": 0.6688642109642525, "grad_norm": 0.298828125, "learning_rate": 0.000394649086312286, "loss": 5.2749, "step": 104500 }, { "epoch": 0.6720645181937466, "grad_norm": 0.283203125, "learning_rate": 0.00039462348385445003, "loss": 5.2791, "step": 105000 }, { "epoch": 0.6752648254232406, "grad_norm": 0.3125, "learning_rate": 0.0003945978813966141, "loss": 5.2742, "step": 105500 }, { "epoch": 0.6784651326527347, "grad_norm": 0.29296875, "learning_rate": 0.00039457227893877815, "loss": 5.2746, "step": 106000 }, { "epoch": 0.6816654398822287, "grad_norm": 0.294921875, "learning_rate": 0.0003945466764809422, "loss": 5.2777, "step": 106500 }, { "epoch": 0.6848657471117228, "grad_norm": 0.298828125, "learning_rate": 0.0003945210740231062, "loss": 5.2717, "step": 107000 }, { "epoch": 0.6880660543412167, "grad_norm": 0.314453125, "learning_rate": 0.00039449547156527025, "loss": 5.2749, "step": 107500 }, { "epoch": 0.6912663615707108, "grad_norm": 0.2890625, "learning_rate": 0.00039446986910743434, "loss": 5.2783, "step": 108000 }, { "epoch": 0.6944666688002048, "grad_norm": 0.306640625, "learning_rate": 0.0003944442666495984, "loss": 5.2739, "step": 108500 }, { "epoch": 0.6976669760296988, "grad_norm": 0.333984375, "learning_rate": 0.0003944186641917624, "loss": 5.2767, "step": 109000 }, { "epoch": 0.7008672832591929, "grad_norm": 0.3046875, "learning_rate": 0.00039439306173392645, "loss": 5.2801, "step": 109500 }, { "epoch": 0.704067590488687, "grad_norm": 0.302734375, "learning_rate": 0.00039436745927609053, "loss": 5.2789, "step": 110000 }, { "epoch": 0.707267897718181, "grad_norm": 0.302734375, "learning_rate": 0.00039434185681825457, "loss": 5.2754, "step": 110500 }, { "epoch": 0.7104682049476749, "grad_norm": 0.3046875, "learning_rate": 0.00039431625436041866, "loss": 5.2722, "step": 111000 }, { "epoch": 0.713668512177169, "grad_norm": 0.298828125, "learning_rate": 0.0003942906519025827, "loss": 5.2671, "step": 111500 }, { "epoch": 0.716868819406663, "grad_norm": 0.314453125, "learning_rate": 0.0003942650494447467, "loss": 5.27, "step": 112000 }, { "epoch": 0.7200691266361571, "grad_norm": 0.294921875, "learning_rate": 0.00039423944698691076, "loss": 5.2712, "step": 112500 }, { "epoch": 0.7232694338656511, "grad_norm": 0.296875, "learning_rate": 0.0003942138445290748, "loss": 5.272, "step": 113000 }, { "epoch": 0.7264697410951452, "grad_norm": 0.30859375, "learning_rate": 0.0003941882420712389, "loss": 5.2714, "step": 113500 }, { "epoch": 0.7296700483246392, "grad_norm": 0.310546875, "learning_rate": 0.0003941626396134029, "loss": 5.266, "step": 114000 }, { "epoch": 0.7328703555541332, "grad_norm": 0.3125, "learning_rate": 0.00039413703715556695, "loss": 5.2666, "step": 114500 }, { "epoch": 0.7360706627836272, "grad_norm": 0.302734375, "learning_rate": 0.000394111434697731, "loss": 5.2648, "step": 115000 }, { "epoch": 0.7392709700131213, "grad_norm": 0.294921875, "learning_rate": 0.000394085832239895, "loss": 5.266, "step": 115500 }, { "epoch": 0.7424712772426153, "grad_norm": 0.32421875, "learning_rate": 0.00039406022978205905, "loss": 5.2665, "step": 116000 }, { "epoch": 0.7456715844721094, "grad_norm": 0.296875, "learning_rate": 0.00039403462732422314, "loss": 5.269, "step": 116500 }, { "epoch": 0.7488718917016034, "grad_norm": 0.302734375, "learning_rate": 0.0003940090248663872, "loss": 5.2671, "step": 117000 }, { "epoch": 0.7520721989310973, "grad_norm": 0.306640625, "learning_rate": 0.00039398342240855126, "loss": 5.2694, "step": 117500 }, { "epoch": 0.7552725061605914, "grad_norm": 0.3046875, "learning_rate": 0.0003939578199507153, "loss": 5.2679, "step": 118000 }, { "epoch": 0.7584728133900854, "grad_norm": 0.30859375, "learning_rate": 0.00039393221749287933, "loss": 5.2682, "step": 118500 }, { "epoch": 0.7616731206195795, "grad_norm": 0.326171875, "learning_rate": 0.0003939066150350434, "loss": 5.2669, "step": 119000 }, { "epoch": 0.7648734278490735, "grad_norm": 0.30859375, "learning_rate": 0.00039388101257720746, "loss": 5.2708, "step": 119500 }, { "epoch": 0.7680737350785676, "grad_norm": 0.306640625, "learning_rate": 0.0003938554101193715, "loss": 5.2702, "step": 120000 }, { "epoch": 0.7712740423080616, "grad_norm": 0.30078125, "learning_rate": 0.0003938298076615355, "loss": 5.2592, "step": 120500 }, { "epoch": 0.7744743495375556, "grad_norm": 0.306640625, "learning_rate": 0.00039380420520369956, "loss": 5.261, "step": 121000 }, { "epoch": 0.7776746567670496, "grad_norm": 0.3125, "learning_rate": 0.0003937786027458636, "loss": 5.2653, "step": 121500 }, { "epoch": 0.7808749639965437, "grad_norm": 0.310546875, "learning_rate": 0.0003937530002880277, "loss": 5.2716, "step": 122000 }, { "epoch": 0.7840752712260377, "grad_norm": 0.322265625, "learning_rate": 0.0003937273978301917, "loss": 5.2643, "step": 122500 }, { "epoch": 0.7872755784555318, "grad_norm": 0.314453125, "learning_rate": 0.00039370179537235575, "loss": 5.2618, "step": 123000 }, { "epoch": 0.7904758856850258, "grad_norm": 0.306640625, "learning_rate": 0.0003936761929145198, "loss": 5.2592, "step": 123500 }, { "epoch": 0.7936761929145197, "grad_norm": 0.30859375, "learning_rate": 0.0003936505904566838, "loss": 5.262, "step": 124000 }, { "epoch": 0.7968765001440138, "grad_norm": 0.294921875, "learning_rate": 0.0003936249879988479, "loss": 5.2561, "step": 124500 }, { "epoch": 0.8000768073735078, "grad_norm": 0.318359375, "learning_rate": 0.00039359938554101194, "loss": 5.2625, "step": 125000 }, { "epoch": 0.8032771146030019, "grad_norm": 0.318359375, "learning_rate": 0.00039357378308317603, "loss": 5.2566, "step": 125500 }, { "epoch": 0.8064774218324959, "grad_norm": 0.298828125, "learning_rate": 0.00039354818062534006, "loss": 5.2632, "step": 126000 }, { "epoch": 0.80967772906199, "grad_norm": 0.322265625, "learning_rate": 0.0003935225781675041, "loss": 5.2591, "step": 126500 }, { "epoch": 0.812878036291484, "grad_norm": 0.3125, "learning_rate": 0.00039349697570966813, "loss": 5.26, "step": 127000 }, { "epoch": 0.816078343520978, "grad_norm": 0.328125, "learning_rate": 0.0003934713732518322, "loss": 5.261, "step": 127500 }, { "epoch": 0.819278650750472, "grad_norm": 0.30859375, "learning_rate": 0.00039344577079399625, "loss": 5.2639, "step": 128000 }, { "epoch": 0.8224789579799661, "grad_norm": 0.294921875, "learning_rate": 0.0003934201683361603, "loss": 5.258, "step": 128500 }, { "epoch": 0.8256792652094601, "grad_norm": 0.3125, "learning_rate": 0.0003933945658783243, "loss": 5.2634, "step": 129000 }, { "epoch": 0.8288795724389542, "grad_norm": 0.30859375, "learning_rate": 0.00039336896342048836, "loss": 5.2536, "step": 129500 }, { "epoch": 0.8320798796684482, "grad_norm": 0.3046875, "learning_rate": 0.00039334336096265245, "loss": 5.259, "step": 130000 }, { "epoch": 0.8352801868979423, "grad_norm": 0.318359375, "learning_rate": 0.0003933177585048165, "loss": 5.256, "step": 130500 }, { "epoch": 0.8384804941274362, "grad_norm": 0.314453125, "learning_rate": 0.0003932921560469805, "loss": 5.263, "step": 131000 }, { "epoch": 0.8416808013569302, "grad_norm": 0.291015625, "learning_rate": 0.00039326655358914455, "loss": 5.2594, "step": 131500 }, { "epoch": 0.8448811085864243, "grad_norm": 0.32421875, "learning_rate": 0.00039324095113130864, "loss": 5.254, "step": 132000 }, { "epoch": 0.8480814158159183, "grad_norm": 0.32421875, "learning_rate": 0.00039321534867347267, "loss": 5.2611, "step": 132500 }, { "epoch": 0.8512817230454124, "grad_norm": 0.310546875, "learning_rate": 0.00039318974621563676, "loss": 5.2558, "step": 133000 }, { "epoch": 0.8544820302749064, "grad_norm": 0.30859375, "learning_rate": 0.0003931641437578008, "loss": 5.2568, "step": 133500 }, { "epoch": 0.8576823375044004, "grad_norm": 0.3359375, "learning_rate": 0.00039313854129996483, "loss": 5.2554, "step": 134000 }, { "epoch": 0.8608826447338944, "grad_norm": 0.322265625, "learning_rate": 0.00039311293884212886, "loss": 5.2527, "step": 134500 }, { "epoch": 0.8640829519633885, "grad_norm": 0.3125, "learning_rate": 0.0003930873363842929, "loss": 5.2552, "step": 135000 }, { "epoch": 0.8672832591928825, "grad_norm": 0.30078125, "learning_rate": 0.000393061733926457, "loss": 5.2521, "step": 135500 }, { "epoch": 0.8704835664223766, "grad_norm": 0.33203125, "learning_rate": 0.000393036131468621, "loss": 5.2537, "step": 136000 }, { "epoch": 0.8736838736518706, "grad_norm": 0.3203125, "learning_rate": 0.00039301052901078505, "loss": 5.2509, "step": 136500 }, { "epoch": 0.8768841808813647, "grad_norm": 0.318359375, "learning_rate": 0.0003929849265529491, "loss": 5.2577, "step": 137000 }, { "epoch": 0.8800844881108586, "grad_norm": 0.3125, "learning_rate": 0.0003929593240951131, "loss": 5.2591, "step": 137500 }, { "epoch": 0.8832847953403526, "grad_norm": 0.326171875, "learning_rate": 0.0003929337216372772, "loss": 5.2543, "step": 138000 }, { "epoch": 0.8864851025698467, "grad_norm": 0.31640625, "learning_rate": 0.00039290811917944124, "loss": 5.2522, "step": 138500 }, { "epoch": 0.8896854097993407, "grad_norm": 0.337890625, "learning_rate": 0.0003928825167216053, "loss": 5.2445, "step": 139000 }, { "epoch": 0.8928857170288348, "grad_norm": 0.3203125, "learning_rate": 0.0003928569142637693, "loss": 5.2539, "step": 139500 }, { "epoch": 0.8960860242583288, "grad_norm": 0.30859375, "learning_rate": 0.0003928313118059334, "loss": 5.2501, "step": 140000 }, { "epoch": 0.8992863314878228, "grad_norm": 0.32421875, "learning_rate": 0.00039280570934809744, "loss": 5.256, "step": 140500 }, { "epoch": 0.9024866387173168, "grad_norm": 0.326171875, "learning_rate": 0.0003927801068902615, "loss": 5.256, "step": 141000 }, { "epoch": 0.9056869459468109, "grad_norm": 0.3203125, "learning_rate": 0.00039275450443242556, "loss": 5.253, "step": 141500 }, { "epoch": 0.9088872531763049, "grad_norm": 0.30859375, "learning_rate": 0.0003927289019745896, "loss": 5.25, "step": 142000 }, { "epoch": 0.912087560405799, "grad_norm": 0.318359375, "learning_rate": 0.00039270329951675363, "loss": 5.2534, "step": 142500 }, { "epoch": 0.915287867635293, "grad_norm": 0.32421875, "learning_rate": 0.00039267769705891766, "loss": 5.253, "step": 143000 }, { "epoch": 0.9184881748647871, "grad_norm": 0.3203125, "learning_rate": 0.00039265209460108175, "loss": 5.2497, "step": 143500 }, { "epoch": 0.921688482094281, "grad_norm": 0.314453125, "learning_rate": 0.0003926264921432458, "loss": 5.2563, "step": 144000 }, { "epoch": 0.924888789323775, "grad_norm": 0.30078125, "learning_rate": 0.0003926008896854098, "loss": 5.2449, "step": 144500 }, { "epoch": 0.9280890965532691, "grad_norm": 0.326171875, "learning_rate": 0.00039257528722757385, "loss": 5.2514, "step": 145000 }, { "epoch": 0.9312894037827631, "grad_norm": 0.3046875, "learning_rate": 0.0003925496847697379, "loss": 5.2508, "step": 145500 }, { "epoch": 0.9344897110122572, "grad_norm": 0.330078125, "learning_rate": 0.0003925240823119019, "loss": 5.2492, "step": 146000 }, { "epoch": 0.9376900182417512, "grad_norm": 0.32421875, "learning_rate": 0.000392498479854066, "loss": 5.2414, "step": 146500 }, { "epoch": 0.9408903254712453, "grad_norm": 0.330078125, "learning_rate": 0.00039247287739623004, "loss": 5.2449, "step": 147000 }, { "epoch": 0.9440906327007392, "grad_norm": 0.310546875, "learning_rate": 0.0003924472749383941, "loss": 5.2465, "step": 147500 }, { "epoch": 0.9472909399302333, "grad_norm": 0.322265625, "learning_rate": 0.00039242167248055817, "loss": 5.246, "step": 148000 }, { "epoch": 0.9504912471597273, "grad_norm": 0.3046875, "learning_rate": 0.0003923960700227222, "loss": 5.2489, "step": 148500 }, { "epoch": 0.9536915543892214, "grad_norm": 0.341796875, "learning_rate": 0.0003923704675648863, "loss": 5.2438, "step": 149000 }, { "epoch": 0.9568918616187154, "grad_norm": 0.318359375, "learning_rate": 0.0003923448651070503, "loss": 5.2472, "step": 149500 }, { "epoch": 0.9600921688482095, "grad_norm": 0.314453125, "learning_rate": 0.00039231926264921436, "loss": 5.25, "step": 150000 }, { "epoch": 0.9632924760777034, "grad_norm": 0.29296875, "learning_rate": 0.0003922936601913784, "loss": 5.2486, "step": 150500 }, { "epoch": 0.9664927833071975, "grad_norm": 0.330078125, "learning_rate": 0.0003922680577335424, "loss": 5.2468, "step": 151000 }, { "epoch": 0.9696930905366915, "grad_norm": 0.3203125, "learning_rate": 0.00039224245527570646, "loss": 5.2491, "step": 151500 }, { "epoch": 0.9728933977661856, "grad_norm": 0.314453125, "learning_rate": 0.00039221685281787055, "loss": 5.2458, "step": 152000 }, { "epoch": 0.9760937049956796, "grad_norm": 0.318359375, "learning_rate": 0.0003921912503600346, "loss": 5.2511, "step": 152500 }, { "epoch": 0.9792940122251736, "grad_norm": 0.318359375, "learning_rate": 0.0003921656479021986, "loss": 5.2354, "step": 153000 }, { "epoch": 0.9824943194546677, "grad_norm": 0.333984375, "learning_rate": 0.00039214004544436265, "loss": 5.2433, "step": 153500 }, { "epoch": 0.9856946266841616, "grad_norm": 0.326171875, "learning_rate": 0.0003921144429865267, "loss": 5.2477, "step": 154000 }, { "epoch": 0.9888949339136557, "grad_norm": 0.318359375, "learning_rate": 0.0003920888405286908, "loss": 5.2436, "step": 154500 }, { "epoch": 0.9920952411431497, "grad_norm": 0.318359375, "learning_rate": 0.0003920632380708548, "loss": 5.2468, "step": 155000 }, { "epoch": 0.9952955483726438, "grad_norm": 0.322265625, "learning_rate": 0.0003920376356130189, "loss": 5.2409, "step": 155500 }, { "epoch": 0.9984958556021378, "grad_norm": 0.330078125, "learning_rate": 0.00039201203315518293, "loss": 5.2425, "step": 156000 }, { "epoch": 1.0, "eval_loss": 5.233697414398193, "eval_runtime": 1.6051, "eval_samples_per_second": 623.012, "eval_steps_per_second": 9.968, "step": 156235 }, { "epoch": 1.0016961628316319, "grad_norm": 0.314453125, "learning_rate": 0.00039198643069734697, "loss": 5.2388, "step": 156500 }, { "epoch": 1.0048964700611258, "grad_norm": 0.328125, "learning_rate": 0.000391960828239511, "loss": 5.2386, "step": 157000 }, { "epoch": 1.00809677729062, "grad_norm": 0.3046875, "learning_rate": 0.0003919352257816751, "loss": 5.2406, "step": 157500 }, { "epoch": 1.011297084520114, "grad_norm": 0.337890625, "learning_rate": 0.0003919096233238391, "loss": 5.2425, "step": 158000 }, { "epoch": 1.014497391749608, "grad_norm": 0.337890625, "learning_rate": 0.00039188402086600316, "loss": 5.2406, "step": 158500 }, { "epoch": 1.017697698979102, "grad_norm": 0.330078125, "learning_rate": 0.0003918584184081672, "loss": 5.2416, "step": 159000 }, { "epoch": 1.020898006208596, "grad_norm": 0.33984375, "learning_rate": 0.0003918328159503312, "loss": 5.242, "step": 159500 }, { "epoch": 1.02409831343809, "grad_norm": 0.32421875, "learning_rate": 0.0003918072134924953, "loss": 5.2358, "step": 160000 }, { "epoch": 1.027298620667584, "grad_norm": 0.341796875, "learning_rate": 0.00039178161103465935, "loss": 5.2389, "step": 160500 }, { "epoch": 1.0304989278970782, "grad_norm": 0.333984375, "learning_rate": 0.0003917560085768234, "loss": 5.2403, "step": 161000 }, { "epoch": 1.0336992351265721, "grad_norm": 0.318359375, "learning_rate": 0.0003917304061189874, "loss": 5.238, "step": 161500 }, { "epoch": 1.0368995423560663, "grad_norm": 0.3828125, "learning_rate": 0.00039170480366115145, "loss": 5.2392, "step": 162000 }, { "epoch": 1.0400998495855602, "grad_norm": 0.333984375, "learning_rate": 0.00039167920120331554, "loss": 5.2381, "step": 162500 }, { "epoch": 1.0433001568150542, "grad_norm": 0.31640625, "learning_rate": 0.0003916535987454796, "loss": 5.2354, "step": 163000 }, { "epoch": 1.0465004640445483, "grad_norm": 0.3203125, "learning_rate": 0.00039162799628764366, "loss": 5.2342, "step": 163500 }, { "epoch": 1.0497007712740423, "grad_norm": 0.318359375, "learning_rate": 0.0003916023938298077, "loss": 5.2378, "step": 164000 }, { "epoch": 1.0529010785035364, "grad_norm": 0.314453125, "learning_rate": 0.00039157679137197173, "loss": 5.2414, "step": 164500 }, { "epoch": 1.0561013857330304, "grad_norm": 0.337890625, "learning_rate": 0.00039155118891413576, "loss": 5.2382, "step": 165000 }, { "epoch": 1.0593016929625243, "grad_norm": 0.33203125, "learning_rate": 0.00039152558645629985, "loss": 5.2386, "step": 165500 }, { "epoch": 1.0625020001920185, "grad_norm": 0.3203125, "learning_rate": 0.0003914999839984639, "loss": 5.2348, "step": 166000 }, { "epoch": 1.0657023074215124, "grad_norm": 0.32421875, "learning_rate": 0.0003914743815406279, "loss": 5.2314, "step": 166500 }, { "epoch": 1.0689026146510066, "grad_norm": 0.337890625, "learning_rate": 0.00039144877908279196, "loss": 5.23, "step": 167000 }, { "epoch": 1.0721029218805005, "grad_norm": 0.31640625, "learning_rate": 0.000391423176624956, "loss": 5.2312, "step": 167500 }, { "epoch": 1.0753032291099947, "grad_norm": 0.349609375, "learning_rate": 0.00039139757416712, "loss": 5.2448, "step": 168000 }, { "epoch": 1.0785035363394886, "grad_norm": 0.310546875, "learning_rate": 0.0003913719717092841, "loss": 5.2313, "step": 168500 }, { "epoch": 1.0817038435689825, "grad_norm": 0.328125, "learning_rate": 0.00039134636925144815, "loss": 5.2329, "step": 169000 }, { "epoch": 1.0849041507984767, "grad_norm": 0.341796875, "learning_rate": 0.0003913207667936122, "loss": 5.2307, "step": 169500 }, { "epoch": 1.0881044580279706, "grad_norm": 0.32421875, "learning_rate": 0.00039129516433577627, "loss": 5.2377, "step": 170000 }, { "epoch": 1.0913047652574648, "grad_norm": 0.33984375, "learning_rate": 0.0003912695618779403, "loss": 5.2423, "step": 170500 }, { "epoch": 1.0945050724869587, "grad_norm": 0.32421875, "learning_rate": 0.0003912439594201044, "loss": 5.2413, "step": 171000 }, { "epoch": 1.0977053797164529, "grad_norm": 0.318359375, "learning_rate": 0.0003912183569622684, "loss": 5.2345, "step": 171500 }, { "epoch": 1.1009056869459468, "grad_norm": 0.326171875, "learning_rate": 0.00039119275450443246, "loss": 5.2354, "step": 172000 }, { "epoch": 1.1041059941754408, "grad_norm": 0.33203125, "learning_rate": 0.0003911671520465965, "loss": 5.2357, "step": 172500 }, { "epoch": 1.107306301404935, "grad_norm": 0.310546875, "learning_rate": 0.00039114154958876053, "loss": 5.2332, "step": 173000 }, { "epoch": 1.1105066086344288, "grad_norm": 0.333984375, "learning_rate": 0.0003911159471309246, "loss": 5.2334, "step": 173500 }, { "epoch": 1.113706915863923, "grad_norm": 0.3515625, "learning_rate": 0.00039109034467308865, "loss": 5.2318, "step": 174000 }, { "epoch": 1.116907223093417, "grad_norm": 0.328125, "learning_rate": 0.0003910647422152527, "loss": 5.2356, "step": 174500 }, { "epoch": 1.1201075303229109, "grad_norm": 0.326171875, "learning_rate": 0.0003910391397574167, "loss": 5.232, "step": 175000 }, { "epoch": 1.123307837552405, "grad_norm": 0.32421875, "learning_rate": 0.00039101353729958075, "loss": 5.233, "step": 175500 }, { "epoch": 1.126508144781899, "grad_norm": 0.34375, "learning_rate": 0.0003909879348417448, "loss": 5.2262, "step": 176000 }, { "epoch": 1.1297084520113931, "grad_norm": 0.345703125, "learning_rate": 0.0003909623323839089, "loss": 5.2293, "step": 176500 }, { "epoch": 1.132908759240887, "grad_norm": 0.322265625, "learning_rate": 0.0003909367299260729, "loss": 5.2283, "step": 177000 }, { "epoch": 1.1361090664703812, "grad_norm": 0.337890625, "learning_rate": 0.00039091112746823695, "loss": 5.2333, "step": 177500 }, { "epoch": 1.1393093736998752, "grad_norm": 0.337890625, "learning_rate": 0.00039088552501040103, "loss": 5.2351, "step": 178000 }, { "epoch": 1.1425096809293693, "grad_norm": 0.330078125, "learning_rate": 0.00039085992255256507, "loss": 5.2342, "step": 178500 }, { "epoch": 1.1457099881588633, "grad_norm": 0.30859375, "learning_rate": 0.00039083432009472916, "loss": 5.2338, "step": 179000 }, { "epoch": 1.1489102953883572, "grad_norm": 0.34765625, "learning_rate": 0.0003908087176368932, "loss": 5.2308, "step": 179500 }, { "epoch": 1.1521106026178514, "grad_norm": 0.3359375, "learning_rate": 0.0003907831151790572, "loss": 5.2279, "step": 180000 }, { "epoch": 1.1553109098473453, "grad_norm": 0.30859375, "learning_rate": 0.00039075751272122126, "loss": 5.2357, "step": 180500 }, { "epoch": 1.1585112170768395, "grad_norm": 0.369140625, "learning_rate": 0.0003907319102633853, "loss": 5.2233, "step": 181000 }, { "epoch": 1.1617115243063334, "grad_norm": 0.326171875, "learning_rate": 0.00039070630780554933, "loss": 5.2309, "step": 181500 }, { "epoch": 1.1649118315358273, "grad_norm": 0.328125, "learning_rate": 0.0003906807053477134, "loss": 5.2303, "step": 182000 }, { "epoch": 1.1681121387653215, "grad_norm": 0.326171875, "learning_rate": 0.00039065510288987745, "loss": 5.2274, "step": 182500 }, { "epoch": 1.1713124459948154, "grad_norm": 0.341796875, "learning_rate": 0.0003906295004320415, "loss": 5.2295, "step": 183000 }, { "epoch": 1.1745127532243096, "grad_norm": 0.337890625, "learning_rate": 0.0003906038979742055, "loss": 5.2258, "step": 183500 }, { "epoch": 1.1777130604538035, "grad_norm": 0.341796875, "learning_rate": 0.00039057829551636955, "loss": 5.2288, "step": 184000 }, { "epoch": 1.1809133676832977, "grad_norm": 0.35546875, "learning_rate": 0.00039055269305853364, "loss": 5.2293, "step": 184500 }, { "epoch": 1.1841136749127916, "grad_norm": 0.33984375, "learning_rate": 0.0003905270906006977, "loss": 5.2275, "step": 185000 }, { "epoch": 1.1873139821422858, "grad_norm": 0.345703125, "learning_rate": 0.00039050148814286176, "loss": 5.2306, "step": 185500 }, { "epoch": 1.1905142893717797, "grad_norm": 0.333984375, "learning_rate": 0.0003904758856850258, "loss": 5.2261, "step": 186000 }, { "epoch": 1.1937145966012737, "grad_norm": 0.349609375, "learning_rate": 0.00039045028322718983, "loss": 5.2227, "step": 186500 }, { "epoch": 1.1969149038307678, "grad_norm": 0.34375, "learning_rate": 0.00039042468076935387, "loss": 5.2282, "step": 187000 }, { "epoch": 1.2001152110602618, "grad_norm": 0.34765625, "learning_rate": 0.00039039907831151796, "loss": 5.2293, "step": 187500 }, { "epoch": 1.203315518289756, "grad_norm": 0.337890625, "learning_rate": 0.000390373475853682, "loss": 5.2249, "step": 188000 }, { "epoch": 1.2065158255192499, "grad_norm": 0.3203125, "learning_rate": 0.000390347873395846, "loss": 5.2263, "step": 188500 }, { "epoch": 1.2097161327487438, "grad_norm": 0.322265625, "learning_rate": 0.00039032227093801006, "loss": 5.227, "step": 189000 }, { "epoch": 1.212916439978238, "grad_norm": 0.33203125, "learning_rate": 0.0003902966684801741, "loss": 5.2288, "step": 189500 }, { "epoch": 1.2161167472077319, "grad_norm": 0.337890625, "learning_rate": 0.0003902710660223382, "loss": 5.2248, "step": 190000 }, { "epoch": 1.219317054437226, "grad_norm": 0.333984375, "learning_rate": 0.0003902454635645022, "loss": 5.2225, "step": 190500 }, { "epoch": 1.22251736166672, "grad_norm": 0.349609375, "learning_rate": 0.00039021986110666625, "loss": 5.2203, "step": 191000 }, { "epoch": 1.225717668896214, "grad_norm": 0.333984375, "learning_rate": 0.0003901942586488303, "loss": 5.2242, "step": 191500 }, { "epoch": 1.228917976125708, "grad_norm": 0.337890625, "learning_rate": 0.0003901686561909943, "loss": 5.2195, "step": 192000 }, { "epoch": 1.232118283355202, "grad_norm": 0.34765625, "learning_rate": 0.0003901430537331584, "loss": 5.2254, "step": 192500 }, { "epoch": 1.2353185905846962, "grad_norm": 0.365234375, "learning_rate": 0.00039011745127532244, "loss": 5.2238, "step": 193000 }, { "epoch": 1.2385188978141901, "grad_norm": 0.33984375, "learning_rate": 0.00039009184881748653, "loss": 5.23, "step": 193500 }, { "epoch": 1.2417192050436843, "grad_norm": 0.333984375, "learning_rate": 0.00039006624635965056, "loss": 5.2221, "step": 194000 }, { "epoch": 1.2449195122731782, "grad_norm": 0.328125, "learning_rate": 0.0003900406439018146, "loss": 5.2264, "step": 194500 }, { "epoch": 1.2481198195026724, "grad_norm": 0.373046875, "learning_rate": 0.00039001504144397863, "loss": 5.2214, "step": 195000 }, { "epoch": 1.2513201267321663, "grad_norm": 0.32421875, "learning_rate": 0.0003899894389861427, "loss": 5.2222, "step": 195500 }, { "epoch": 1.2545204339616602, "grad_norm": 0.34375, "learning_rate": 0.00038996383652830676, "loss": 5.2248, "step": 196000 }, { "epoch": 1.2577207411911544, "grad_norm": 0.33984375, "learning_rate": 0.0003899382340704708, "loss": 5.2203, "step": 196500 }, { "epoch": 1.2609210484206483, "grad_norm": 0.3515625, "learning_rate": 0.0003899126316126348, "loss": 5.222, "step": 197000 }, { "epoch": 1.2641213556501425, "grad_norm": 0.333984375, "learning_rate": 0.00038988702915479886, "loss": 5.2234, "step": 197500 }, { "epoch": 1.2673216628796364, "grad_norm": 0.33984375, "learning_rate": 0.0003898614266969629, "loss": 5.2289, "step": 198000 }, { "epoch": 1.2705219701091304, "grad_norm": 0.341796875, "learning_rate": 0.000389835824239127, "loss": 5.2214, "step": 198500 }, { "epoch": 1.2737222773386245, "grad_norm": 0.345703125, "learning_rate": 0.000389810221781291, "loss": 5.2235, "step": 199000 }, { "epoch": 1.2769225845681185, "grad_norm": 0.333984375, "learning_rate": 0.00038978461932345505, "loss": 5.2256, "step": 199500 }, { "epoch": 1.2801228917976126, "grad_norm": 0.3515625, "learning_rate": 0.0003897590168656191, "loss": 5.2266, "step": 200000 }, { "epoch": 1.2833231990271066, "grad_norm": 0.349609375, "learning_rate": 0.00038973341440778317, "loss": 5.2288, "step": 200500 }, { "epoch": 1.2865235062566005, "grad_norm": 0.3671875, "learning_rate": 0.0003897078119499472, "loss": 5.2253, "step": 201000 }, { "epoch": 1.2897238134860947, "grad_norm": 0.3359375, "learning_rate": 0.0003896822094921113, "loss": 5.2216, "step": 201500 }, { "epoch": 1.2929241207155888, "grad_norm": 0.345703125, "learning_rate": 0.00038965660703427533, "loss": 5.2229, "step": 202000 }, { "epoch": 1.2961244279450828, "grad_norm": 0.357421875, "learning_rate": 0.00038963100457643936, "loss": 5.2239, "step": 202500 }, { "epoch": 1.2993247351745767, "grad_norm": 0.33984375, "learning_rate": 0.0003896054021186034, "loss": 5.217, "step": 203000 }, { "epoch": 1.3025250424040709, "grad_norm": 0.36328125, "learning_rate": 0.00038957979966076743, "loss": 5.2295, "step": 203500 }, { "epoch": 1.3057253496335648, "grad_norm": 0.32421875, "learning_rate": 0.0003895541972029315, "loss": 5.2226, "step": 204000 }, { "epoch": 1.308925656863059, "grad_norm": 0.3203125, "learning_rate": 0.00038952859474509555, "loss": 5.224, "step": 204500 }, { "epoch": 1.3121259640925529, "grad_norm": 0.333984375, "learning_rate": 0.0003895029922872596, "loss": 5.2216, "step": 205000 }, { "epoch": 1.3153262713220468, "grad_norm": 0.349609375, "learning_rate": 0.0003894773898294236, "loss": 5.2219, "step": 205500 }, { "epoch": 1.318526578551541, "grad_norm": 0.3359375, "learning_rate": 0.00038945178737158766, "loss": 5.2175, "step": 206000 }, { "epoch": 1.321726885781035, "grad_norm": 0.359375, "learning_rate": 0.00038942618491375175, "loss": 5.2167, "step": 206500 }, { "epoch": 1.324927193010529, "grad_norm": 0.33203125, "learning_rate": 0.0003894005824559158, "loss": 5.2227, "step": 207000 }, { "epoch": 1.328127500240023, "grad_norm": 0.359375, "learning_rate": 0.0003893749799980798, "loss": 5.2199, "step": 207500 }, { "epoch": 1.331327807469517, "grad_norm": 0.33984375, "learning_rate": 0.0003893493775402439, "loss": 5.2198, "step": 208000 }, { "epoch": 1.3345281146990111, "grad_norm": 0.345703125, "learning_rate": 0.00038932377508240794, "loss": 5.2206, "step": 208500 }, { "epoch": 1.3377284219285053, "grad_norm": 0.337890625, "learning_rate": 0.000389298172624572, "loss": 5.2217, "step": 209000 }, { "epoch": 1.3409287291579992, "grad_norm": 0.345703125, "learning_rate": 0.00038927257016673606, "loss": 5.2231, "step": 209500 }, { "epoch": 1.3441290363874931, "grad_norm": 0.39453125, "learning_rate": 0.0003892469677089001, "loss": 5.2211, "step": 210000 }, { "epoch": 1.3473293436169873, "grad_norm": 0.33984375, "learning_rate": 0.00038922136525106413, "loss": 5.2185, "step": 210500 }, { "epoch": 1.3505296508464812, "grad_norm": 0.33984375, "learning_rate": 0.00038919576279322816, "loss": 5.2207, "step": 211000 }, { "epoch": 1.3537299580759754, "grad_norm": 0.322265625, "learning_rate": 0.0003891701603353922, "loss": 5.23, "step": 211500 }, { "epoch": 1.3569302653054693, "grad_norm": 0.34765625, "learning_rate": 0.0003891445578775563, "loss": 5.2224, "step": 212000 }, { "epoch": 1.3601305725349633, "grad_norm": 0.3203125, "learning_rate": 0.0003891189554197203, "loss": 5.2179, "step": 212500 }, { "epoch": 1.3633308797644574, "grad_norm": 0.34375, "learning_rate": 0.00038909335296188435, "loss": 5.2214, "step": 213000 }, { "epoch": 1.3665311869939514, "grad_norm": 0.345703125, "learning_rate": 0.0003890677505040484, "loss": 5.2253, "step": 213500 }, { "epoch": 1.3697314942234455, "grad_norm": 0.3515625, "learning_rate": 0.0003890421480462124, "loss": 5.2184, "step": 214000 }, { "epoch": 1.3729318014529395, "grad_norm": 0.3515625, "learning_rate": 0.0003890165455883765, "loss": 5.2235, "step": 214500 }, { "epoch": 1.3761321086824334, "grad_norm": 0.36328125, "learning_rate": 0.00038899094313054054, "loss": 5.2209, "step": 215000 }, { "epoch": 1.3793324159119276, "grad_norm": 0.33984375, "learning_rate": 0.0003889653406727046, "loss": 5.2172, "step": 215500 }, { "epoch": 1.3825327231414215, "grad_norm": 0.3671875, "learning_rate": 0.00038893973821486867, "loss": 5.2177, "step": 216000 }, { "epoch": 1.3857330303709157, "grad_norm": 0.373046875, "learning_rate": 0.0003889141357570327, "loss": 5.2171, "step": 216500 }, { "epoch": 1.3889333376004096, "grad_norm": 0.33203125, "learning_rate": 0.00038888853329919674, "loss": 5.2189, "step": 217000 }, { "epoch": 1.3921336448299035, "grad_norm": 0.333984375, "learning_rate": 0.0003888629308413608, "loss": 5.2184, "step": 217500 }, { "epoch": 1.3953339520593977, "grad_norm": 0.349609375, "learning_rate": 0.00038883732838352486, "loss": 5.2201, "step": 218000 }, { "epoch": 1.3985342592888919, "grad_norm": 0.353515625, "learning_rate": 0.0003888117259256889, "loss": 5.2189, "step": 218500 }, { "epoch": 1.4017345665183858, "grad_norm": 0.341796875, "learning_rate": 0.0003887861234678529, "loss": 5.219, "step": 219000 }, { "epoch": 1.4049348737478797, "grad_norm": 0.34375, "learning_rate": 0.00038876052101001696, "loss": 5.2243, "step": 219500 }, { "epoch": 1.408135180977374, "grad_norm": 0.337890625, "learning_rate": 0.00038873491855218105, "loss": 5.218, "step": 220000 }, { "epoch": 1.4113354882068678, "grad_norm": 0.35546875, "learning_rate": 0.0003887093160943451, "loss": 5.2208, "step": 220500 }, { "epoch": 1.414535795436362, "grad_norm": 0.35546875, "learning_rate": 0.0003886837136365091, "loss": 5.2175, "step": 221000 }, { "epoch": 1.417736102665856, "grad_norm": 0.349609375, "learning_rate": 0.00038865811117867315, "loss": 5.2186, "step": 221500 }, { "epoch": 1.4209364098953499, "grad_norm": 0.369140625, "learning_rate": 0.0003886325087208372, "loss": 5.215, "step": 222000 }, { "epoch": 1.424136717124844, "grad_norm": 0.359375, "learning_rate": 0.0003886069062630012, "loss": 5.2184, "step": 222500 }, { "epoch": 1.427337024354338, "grad_norm": 0.341796875, "learning_rate": 0.0003885813038051653, "loss": 5.2215, "step": 223000 }, { "epoch": 1.4305373315838321, "grad_norm": 0.326171875, "learning_rate": 0.0003885557013473294, "loss": 5.2166, "step": 223500 }, { "epoch": 1.433737638813326, "grad_norm": 0.33984375, "learning_rate": 0.00038853009888949343, "loss": 5.2194, "step": 224000 }, { "epoch": 1.43693794604282, "grad_norm": 0.33203125, "learning_rate": 0.00038850449643165747, "loss": 5.2188, "step": 224500 }, { "epoch": 1.4401382532723142, "grad_norm": 0.330078125, "learning_rate": 0.0003884788939738215, "loss": 5.216, "step": 225000 }, { "epoch": 1.4433385605018083, "grad_norm": 0.357421875, "learning_rate": 0.0003884532915159856, "loss": 5.216, "step": 225500 }, { "epoch": 1.4465388677313022, "grad_norm": 0.349609375, "learning_rate": 0.0003884276890581496, "loss": 5.2194, "step": 226000 }, { "epoch": 1.4497391749607962, "grad_norm": 0.359375, "learning_rate": 0.00038840208660031366, "loss": 5.2202, "step": 226500 }, { "epoch": 1.4529394821902903, "grad_norm": 0.3359375, "learning_rate": 0.0003883764841424777, "loss": 5.2207, "step": 227000 }, { "epoch": 1.4561397894197843, "grad_norm": 0.34375, "learning_rate": 0.0003883508816846417, "loss": 5.2209, "step": 227500 }, { "epoch": 1.4593400966492784, "grad_norm": 0.3359375, "learning_rate": 0.00038832527922680576, "loss": 5.2265, "step": 228000 }, { "epoch": 1.4625404038787724, "grad_norm": 0.35546875, "learning_rate": 0.00038829967676896985, "loss": 5.2122, "step": 228500 }, { "epoch": 1.4657407111082663, "grad_norm": 0.359375, "learning_rate": 0.0003882740743111339, "loss": 5.211, "step": 229000 }, { "epoch": 1.4689410183377605, "grad_norm": 0.36328125, "learning_rate": 0.0003882484718532979, "loss": 5.2213, "step": 229500 }, { "epoch": 1.4721413255672544, "grad_norm": 0.345703125, "learning_rate": 0.00038822286939546195, "loss": 5.2215, "step": 230000 }, { "epoch": 1.4753416327967486, "grad_norm": 0.33203125, "learning_rate": 0.00038819726693762604, "loss": 5.2211, "step": 230500 }, { "epoch": 1.4785419400262425, "grad_norm": 0.33984375, "learning_rate": 0.0003881716644797901, "loss": 5.2163, "step": 231000 }, { "epoch": 1.4817422472557364, "grad_norm": 0.34375, "learning_rate": 0.00038814606202195416, "loss": 5.2206, "step": 231500 }, { "epoch": 1.4849425544852306, "grad_norm": 0.353515625, "learning_rate": 0.0003881204595641182, "loss": 5.2235, "step": 232000 }, { "epoch": 1.4881428617147245, "grad_norm": 0.34765625, "learning_rate": 0.00038809485710628223, "loss": 5.2167, "step": 232500 }, { "epoch": 1.4913431689442187, "grad_norm": 0.330078125, "learning_rate": 0.00038806925464844626, "loss": 5.2211, "step": 233000 }, { "epoch": 1.4945434761737126, "grad_norm": 0.33984375, "learning_rate": 0.0003880436521906103, "loss": 5.2137, "step": 233500 }, { "epoch": 1.4977437834032066, "grad_norm": 0.337890625, "learning_rate": 0.0003880180497327744, "loss": 5.2173, "step": 234000 }, { "epoch": 1.5009440906327007, "grad_norm": 0.361328125, "learning_rate": 0.0003879924472749384, "loss": 5.2144, "step": 234500 }, { "epoch": 1.504144397862195, "grad_norm": 0.345703125, "learning_rate": 0.00038796684481710246, "loss": 5.2201, "step": 235000 }, { "epoch": 1.5073447050916888, "grad_norm": 0.36328125, "learning_rate": 0.0003879412423592665, "loss": 5.2115, "step": 235500 }, { "epoch": 1.5105450123211828, "grad_norm": 0.333984375, "learning_rate": 0.0003879156399014305, "loss": 5.2141, "step": 236000 }, { "epoch": 1.5137453195506767, "grad_norm": 0.330078125, "learning_rate": 0.0003878900374435946, "loss": 5.2155, "step": 236500 }, { "epoch": 1.5169456267801709, "grad_norm": 0.34375, "learning_rate": 0.00038786443498575865, "loss": 5.2129, "step": 237000 }, { "epoch": 1.520145934009665, "grad_norm": 0.36328125, "learning_rate": 0.0003878388325279227, "loss": 5.2213, "step": 237500 }, { "epoch": 1.523346241239159, "grad_norm": 0.365234375, "learning_rate": 0.0003878132300700867, "loss": 5.2209, "step": 238000 }, { "epoch": 1.526546548468653, "grad_norm": 0.35546875, "learning_rate": 0.0003877876276122508, "loss": 5.2169, "step": 238500 }, { "epoch": 1.529746855698147, "grad_norm": 0.37109375, "learning_rate": 0.00038776202515441484, "loss": 5.2133, "step": 239000 }, { "epoch": 1.5329471629276412, "grad_norm": 0.34375, "learning_rate": 0.0003877364226965789, "loss": 5.2157, "step": 239500 }, { "epoch": 1.5361474701571352, "grad_norm": 0.37109375, "learning_rate": 0.00038771082023874296, "loss": 5.2146, "step": 240000 }, { "epoch": 1.539347777386629, "grad_norm": 0.369140625, "learning_rate": 0.000387685217780907, "loss": 5.2147, "step": 240500 }, { "epoch": 1.542548084616123, "grad_norm": 0.359375, "learning_rate": 0.00038765961532307103, "loss": 5.2205, "step": 241000 }, { "epoch": 1.5457483918456172, "grad_norm": 0.353515625, "learning_rate": 0.00038763401286523506, "loss": 5.215, "step": 241500 }, { "epoch": 1.5489486990751113, "grad_norm": 0.357421875, "learning_rate": 0.00038760841040739915, "loss": 5.2172, "step": 242000 }, { "epoch": 1.5521490063046053, "grad_norm": 0.337890625, "learning_rate": 0.0003875828079495632, "loss": 5.2126, "step": 242500 }, { "epoch": 1.5553493135340992, "grad_norm": 0.328125, "learning_rate": 0.0003875572054917272, "loss": 5.2145, "step": 243000 }, { "epoch": 1.5585496207635932, "grad_norm": 0.33984375, "learning_rate": 0.00038753160303389125, "loss": 5.2084, "step": 243500 }, { "epoch": 1.5617499279930873, "grad_norm": 0.357421875, "learning_rate": 0.0003875060005760553, "loss": 5.2203, "step": 244000 }, { "epoch": 1.5649502352225815, "grad_norm": 0.35546875, "learning_rate": 0.0003874803981182193, "loss": 5.2093, "step": 244500 }, { "epoch": 1.5681505424520754, "grad_norm": 0.359375, "learning_rate": 0.0003874547956603834, "loss": 5.2172, "step": 245000 }, { "epoch": 1.5713508496815694, "grad_norm": 0.322265625, "learning_rate": 0.00038742919320254745, "loss": 5.2117, "step": 245500 }, { "epoch": 1.5745511569110635, "grad_norm": 0.357421875, "learning_rate": 0.00038740359074471153, "loss": 5.2167, "step": 246000 }, { "epoch": 1.5777514641405574, "grad_norm": 0.3515625, "learning_rate": 0.00038737798828687557, "loss": 5.2152, "step": 246500 }, { "epoch": 1.5809517713700516, "grad_norm": 0.337890625, "learning_rate": 0.0003873523858290396, "loss": 5.2127, "step": 247000 }, { "epoch": 1.5841520785995455, "grad_norm": 0.357421875, "learning_rate": 0.0003873267833712037, "loss": 5.2125, "step": 247500 }, { "epoch": 1.5873523858290395, "grad_norm": 0.365234375, "learning_rate": 0.0003873011809133677, "loss": 5.215, "step": 248000 }, { "epoch": 1.5905526930585336, "grad_norm": 0.37890625, "learning_rate": 0.00038727557845553176, "loss": 5.2106, "step": 248500 }, { "epoch": 1.5937530002880278, "grad_norm": 0.380859375, "learning_rate": 0.0003872499759976958, "loss": 5.2123, "step": 249000 }, { "epoch": 1.5969533075175217, "grad_norm": 0.361328125, "learning_rate": 0.00038722437353985983, "loss": 5.2162, "step": 249500 }, { "epoch": 1.6001536147470157, "grad_norm": 0.357421875, "learning_rate": 0.0003871987710820239, "loss": 5.2224, "step": 250000 }, { "epoch": 1.6033539219765096, "grad_norm": 0.341796875, "learning_rate": 0.00038717316862418795, "loss": 5.2105, "step": 250500 }, { "epoch": 1.6065542292060038, "grad_norm": 0.33984375, "learning_rate": 0.000387147566166352, "loss": 5.2144, "step": 251000 }, { "epoch": 1.609754536435498, "grad_norm": 0.38671875, "learning_rate": 0.000387121963708516, "loss": 5.2169, "step": 251500 }, { "epoch": 1.6129548436649919, "grad_norm": 0.3515625, "learning_rate": 0.00038709636125068005, "loss": 5.2141, "step": 252000 }, { "epoch": 1.6161551508944858, "grad_norm": 0.353515625, "learning_rate": 0.0003870707587928441, "loss": 5.2145, "step": 252500 }, { "epoch": 1.6193554581239797, "grad_norm": 0.34375, "learning_rate": 0.0003870451563350082, "loss": 5.2192, "step": 253000 }, { "epoch": 1.622555765353474, "grad_norm": 0.33984375, "learning_rate": 0.0003870195538771722, "loss": 5.2116, "step": 253500 }, { "epoch": 1.625756072582968, "grad_norm": 0.330078125, "learning_rate": 0.0003869939514193363, "loss": 5.2121, "step": 254000 }, { "epoch": 1.628956379812462, "grad_norm": 0.35546875, "learning_rate": 0.00038696834896150033, "loss": 5.2086, "step": 254500 }, { "epoch": 1.632156687041956, "grad_norm": 0.359375, "learning_rate": 0.00038694274650366437, "loss": 5.2137, "step": 255000 }, { "epoch": 1.63535699427145, "grad_norm": 0.3359375, "learning_rate": 0.00038691714404582846, "loss": 5.2148, "step": 255500 }, { "epoch": 1.6385573015009443, "grad_norm": 0.375, "learning_rate": 0.0003868915415879925, "loss": 5.2191, "step": 256000 }, { "epoch": 1.6417576087304382, "grad_norm": 0.349609375, "learning_rate": 0.0003868659391301565, "loss": 5.2119, "step": 256500 }, { "epoch": 1.6449579159599321, "grad_norm": 0.39453125, "learning_rate": 0.00038684033667232056, "loss": 5.2129, "step": 257000 }, { "epoch": 1.648158223189426, "grad_norm": 0.359375, "learning_rate": 0.0003868147342144846, "loss": 5.2107, "step": 257500 }, { "epoch": 1.6513585304189202, "grad_norm": 0.36328125, "learning_rate": 0.00038678913175664863, "loss": 5.2092, "step": 258000 }, { "epoch": 1.6545588376484144, "grad_norm": 0.408203125, "learning_rate": 0.0003867635292988127, "loss": 5.2142, "step": 258500 }, { "epoch": 1.6577591448779083, "grad_norm": 0.375, "learning_rate": 0.00038673792684097675, "loss": 5.2097, "step": 259000 }, { "epoch": 1.6609594521074023, "grad_norm": 0.361328125, "learning_rate": 0.0003867123243831408, "loss": 5.2178, "step": 259500 }, { "epoch": 1.6641597593368962, "grad_norm": 0.36328125, "learning_rate": 0.0003866867219253048, "loss": 5.2202, "step": 260000 }, { "epoch": 1.6673600665663904, "grad_norm": 0.375, "learning_rate": 0.00038666111946746885, "loss": 5.2108, "step": 260500 }, { "epoch": 1.6705603737958845, "grad_norm": 0.353515625, "learning_rate": 0.00038663551700963294, "loss": 5.2111, "step": 261000 }, { "epoch": 1.6737606810253784, "grad_norm": 0.36328125, "learning_rate": 0.00038660991455179703, "loss": 5.2115, "step": 261500 }, { "epoch": 1.6769609882548724, "grad_norm": 0.34375, "learning_rate": 0.00038658431209396106, "loss": 5.2137, "step": 262000 }, { "epoch": 1.6801612954843665, "grad_norm": 0.369140625, "learning_rate": 0.0003865587096361251, "loss": 5.2147, "step": 262500 }, { "epoch": 1.6833616027138605, "grad_norm": 0.373046875, "learning_rate": 0.00038653310717828913, "loss": 5.2157, "step": 263000 }, { "epoch": 1.6865619099433546, "grad_norm": 0.369140625, "learning_rate": 0.00038650750472045317, "loss": 5.2153, "step": 263500 }, { "epoch": 1.6897622171728486, "grad_norm": 0.345703125, "learning_rate": 0.00038648190226261726, "loss": 5.2073, "step": 264000 }, { "epoch": 1.6929625244023425, "grad_norm": 0.359375, "learning_rate": 0.0003864562998047813, "loss": 5.2185, "step": 264500 }, { "epoch": 1.6961628316318367, "grad_norm": 0.359375, "learning_rate": 0.0003864306973469453, "loss": 5.2131, "step": 265000 }, { "epoch": 1.6993631388613308, "grad_norm": 0.38671875, "learning_rate": 0.00038640509488910936, "loss": 5.2177, "step": 265500 }, { "epoch": 1.7025634460908248, "grad_norm": 0.3984375, "learning_rate": 0.0003863794924312734, "loss": 5.2098, "step": 266000 }, { "epoch": 1.7057637533203187, "grad_norm": 0.36328125, "learning_rate": 0.0003863538899734375, "loss": 5.2088, "step": 266500 }, { "epoch": 1.7089640605498126, "grad_norm": 0.35546875, "learning_rate": 0.0003863282875156015, "loss": 5.2095, "step": 267000 }, { "epoch": 1.7121643677793068, "grad_norm": 0.36328125, "learning_rate": 0.00038630268505776555, "loss": 5.2109, "step": 267500 }, { "epoch": 1.715364675008801, "grad_norm": 0.359375, "learning_rate": 0.0003862770825999296, "loss": 5.2124, "step": 268000 }, { "epoch": 1.718564982238295, "grad_norm": 0.36328125, "learning_rate": 0.00038625148014209367, "loss": 5.2093, "step": 268500 }, { "epoch": 1.7217652894677888, "grad_norm": 0.365234375, "learning_rate": 0.0003862258776842577, "loss": 5.2065, "step": 269000 }, { "epoch": 1.7249655966972828, "grad_norm": 0.408203125, "learning_rate": 0.0003862002752264218, "loss": 5.2177, "step": 269500 }, { "epoch": 1.728165903926777, "grad_norm": 0.361328125, "learning_rate": 0.00038617467276858583, "loss": 5.2129, "step": 270000 }, { "epoch": 1.731366211156271, "grad_norm": 0.376953125, "learning_rate": 0.00038614907031074986, "loss": 5.2104, "step": 270500 }, { "epoch": 1.734566518385765, "grad_norm": 0.3515625, "learning_rate": 0.0003861234678529139, "loss": 5.2146, "step": 271000 }, { "epoch": 1.737766825615259, "grad_norm": 0.341796875, "learning_rate": 0.00038609786539507793, "loss": 5.21, "step": 271500 }, { "epoch": 1.7409671328447531, "grad_norm": 0.349609375, "learning_rate": 0.000386072262937242, "loss": 5.2105, "step": 272000 }, { "epoch": 1.7441674400742473, "grad_norm": 0.3671875, "learning_rate": 0.00038604666047940605, "loss": 5.2113, "step": 272500 }, { "epoch": 1.7473677473037412, "grad_norm": 0.38671875, "learning_rate": 0.0003860210580215701, "loss": 5.2127, "step": 273000 }, { "epoch": 1.7505680545332352, "grad_norm": 0.3671875, "learning_rate": 0.0003859954555637341, "loss": 5.2113, "step": 273500 }, { "epoch": 1.753768361762729, "grad_norm": 0.345703125, "learning_rate": 0.00038596985310589816, "loss": 5.2137, "step": 274000 }, { "epoch": 1.7569686689922233, "grad_norm": 0.3515625, "learning_rate": 0.0003859442506480622, "loss": 5.2108, "step": 274500 }, { "epoch": 1.7601689762217174, "grad_norm": 0.37890625, "learning_rate": 0.0003859186481902263, "loss": 5.2108, "step": 275000 }, { "epoch": 1.7633692834512114, "grad_norm": 0.345703125, "learning_rate": 0.0003858930457323903, "loss": 5.2107, "step": 275500 }, { "epoch": 1.7665695906807053, "grad_norm": 0.34765625, "learning_rate": 0.00038586744327455435, "loss": 5.2126, "step": 276000 }, { "epoch": 1.7697698979101992, "grad_norm": 0.36328125, "learning_rate": 0.00038584184081671844, "loss": 5.2125, "step": 276500 }, { "epoch": 1.7729702051396934, "grad_norm": 0.384765625, "learning_rate": 0.00038581623835888247, "loss": 5.2136, "step": 277000 }, { "epoch": 1.7761705123691875, "grad_norm": 0.369140625, "learning_rate": 0.00038579063590104656, "loss": 5.2136, "step": 277500 }, { "epoch": 1.7793708195986815, "grad_norm": 0.353515625, "learning_rate": 0.0003857650334432106, "loss": 5.2113, "step": 278000 }, { "epoch": 1.7825711268281754, "grad_norm": 0.36328125, "learning_rate": 0.00038573943098537463, "loss": 5.2126, "step": 278500 }, { "epoch": 1.7857714340576696, "grad_norm": 0.3359375, "learning_rate": 0.00038571382852753866, "loss": 5.2121, "step": 279000 }, { "epoch": 1.7889717412871635, "grad_norm": 0.37890625, "learning_rate": 0.0003856882260697027, "loss": 5.2061, "step": 279500 }, { "epoch": 1.7921720485166577, "grad_norm": 0.380859375, "learning_rate": 0.00038566262361186673, "loss": 5.2132, "step": 280000 }, { "epoch": 1.7953723557461516, "grad_norm": 0.359375, "learning_rate": 0.0003856370211540308, "loss": 5.2158, "step": 280500 }, { "epoch": 1.7985726629756456, "grad_norm": 0.369140625, "learning_rate": 0.00038561141869619485, "loss": 5.2113, "step": 281000 }, { "epoch": 1.8017729702051397, "grad_norm": 0.34375, "learning_rate": 0.0003855858162383589, "loss": 5.2072, "step": 281500 }, { "epoch": 1.8049732774346339, "grad_norm": 0.384765625, "learning_rate": 0.0003855602137805229, "loss": 5.2132, "step": 282000 }, { "epoch": 1.8081735846641278, "grad_norm": 0.369140625, "learning_rate": 0.00038553461132268696, "loss": 5.2134, "step": 282500 }, { "epoch": 1.8113738918936217, "grad_norm": 0.357421875, "learning_rate": 0.00038550900886485104, "loss": 5.2122, "step": 283000 }, { "epoch": 1.8145741991231157, "grad_norm": 0.375, "learning_rate": 0.0003854834064070151, "loss": 5.2105, "step": 283500 }, { "epoch": 1.8177745063526098, "grad_norm": 0.3515625, "learning_rate": 0.00038545780394917917, "loss": 5.2108, "step": 284000 }, { "epoch": 1.820974813582104, "grad_norm": 0.400390625, "learning_rate": 0.0003854322014913432, "loss": 5.2122, "step": 284500 }, { "epoch": 1.824175120811598, "grad_norm": 0.359375, "learning_rate": 0.00038540659903350724, "loss": 5.2076, "step": 285000 }, { "epoch": 1.8273754280410919, "grad_norm": 0.359375, "learning_rate": 0.0003853809965756713, "loss": 5.2107, "step": 285500 }, { "epoch": 1.8305757352705858, "grad_norm": 0.349609375, "learning_rate": 0.00038535539411783536, "loss": 5.2128, "step": 286000 }, { "epoch": 1.83377604250008, "grad_norm": 0.365234375, "learning_rate": 0.0003853297916599994, "loss": 5.2076, "step": 286500 }, { "epoch": 1.8369763497295741, "grad_norm": 0.357421875, "learning_rate": 0.0003853041892021634, "loss": 5.2113, "step": 287000 }, { "epoch": 1.840176656959068, "grad_norm": 0.365234375, "learning_rate": 0.00038527858674432746, "loss": 5.2084, "step": 287500 }, { "epoch": 1.843376964188562, "grad_norm": 0.375, "learning_rate": 0.0003852529842864915, "loss": 5.2152, "step": 288000 }, { "epoch": 1.8465772714180562, "grad_norm": 0.384765625, "learning_rate": 0.0003852273818286556, "loss": 5.2082, "step": 288500 }, { "epoch": 1.8497775786475503, "grad_norm": 0.37109375, "learning_rate": 0.0003852017793708196, "loss": 5.2145, "step": 289000 }, { "epoch": 1.8529778858770443, "grad_norm": 0.359375, "learning_rate": 0.00038517617691298365, "loss": 5.2083, "step": 289500 }, { "epoch": 1.8561781931065382, "grad_norm": 0.36328125, "learning_rate": 0.0003851505744551477, "loss": 5.2103, "step": 290000 }, { "epoch": 1.8593785003360321, "grad_norm": 0.375, "learning_rate": 0.0003851249719973117, "loss": 5.2066, "step": 290500 }, { "epoch": 1.8625788075655263, "grad_norm": 0.36328125, "learning_rate": 0.0003850993695394758, "loss": 5.2152, "step": 291000 }, { "epoch": 1.8657791147950205, "grad_norm": 0.357421875, "learning_rate": 0.00038507376708163984, "loss": 5.2144, "step": 291500 }, { "epoch": 1.8689794220245144, "grad_norm": 0.369140625, "learning_rate": 0.00038504816462380393, "loss": 5.2091, "step": 292000 }, { "epoch": 1.8721797292540083, "grad_norm": 0.375, "learning_rate": 0.00038502256216596797, "loss": 5.2137, "step": 292500 }, { "epoch": 1.8753800364835023, "grad_norm": 0.35546875, "learning_rate": 0.000384996959708132, "loss": 5.2023, "step": 293000 }, { "epoch": 1.8785803437129964, "grad_norm": 0.353515625, "learning_rate": 0.00038497135725029603, "loss": 5.2073, "step": 293500 }, { "epoch": 1.8817806509424906, "grad_norm": 0.380859375, "learning_rate": 0.0003849457547924601, "loss": 5.2089, "step": 294000 }, { "epoch": 1.8849809581719845, "grad_norm": 0.408203125, "learning_rate": 0.00038492015233462416, "loss": 5.2069, "step": 294500 }, { "epoch": 1.8881812654014785, "grad_norm": 0.408203125, "learning_rate": 0.0003848945498767882, "loss": 5.2084, "step": 295000 }, { "epoch": 1.8913815726309726, "grad_norm": 0.380859375, "learning_rate": 0.0003848689474189522, "loss": 5.2065, "step": 295500 }, { "epoch": 1.8945818798604666, "grad_norm": 0.3671875, "learning_rate": 0.00038484334496111626, "loss": 5.2123, "step": 296000 }, { "epoch": 1.8977821870899607, "grad_norm": 0.357421875, "learning_rate": 0.00038481774250328035, "loss": 5.2105, "step": 296500 }, { "epoch": 1.9009824943194547, "grad_norm": 0.375, "learning_rate": 0.0003847921400454444, "loss": 5.2092, "step": 297000 }, { "epoch": 1.9041828015489486, "grad_norm": 0.365234375, "learning_rate": 0.0003847665375876084, "loss": 5.2042, "step": 297500 }, { "epoch": 1.9073831087784427, "grad_norm": 0.3671875, "learning_rate": 0.00038474093512977245, "loss": 5.2083, "step": 298000 }, { "epoch": 1.910583416007937, "grad_norm": 0.3984375, "learning_rate": 0.00038471533267193654, "loss": 5.2129, "step": 298500 }, { "epoch": 1.9137837232374308, "grad_norm": 0.373046875, "learning_rate": 0.0003846897302141006, "loss": 5.2136, "step": 299000 }, { "epoch": 1.9169840304669248, "grad_norm": 0.349609375, "learning_rate": 0.00038466412775626466, "loss": 5.2083, "step": 299500 }, { "epoch": 1.9201843376964187, "grad_norm": 0.34765625, "learning_rate": 0.0003846385252984287, "loss": 5.2071, "step": 300000 }, { "epoch": 1.9233846449259129, "grad_norm": 0.365234375, "learning_rate": 0.00038461292284059273, "loss": 5.2018, "step": 300500 }, { "epoch": 1.926584952155407, "grad_norm": 0.375, "learning_rate": 0.00038458732038275676, "loss": 5.2091, "step": 301000 }, { "epoch": 1.929785259384901, "grad_norm": 0.39453125, "learning_rate": 0.0003845617179249208, "loss": 5.2078, "step": 301500 }, { "epoch": 1.932985566614395, "grad_norm": 0.41796875, "learning_rate": 0.0003845361154670849, "loss": 5.2102, "step": 302000 }, { "epoch": 1.9361858738438888, "grad_norm": 0.369140625, "learning_rate": 0.0003845105130092489, "loss": 5.2069, "step": 302500 }, { "epoch": 1.939386181073383, "grad_norm": 0.369140625, "learning_rate": 0.00038448491055141296, "loss": 5.2084, "step": 303000 }, { "epoch": 1.9425864883028772, "grad_norm": 0.373046875, "learning_rate": 0.000384459308093577, "loss": 5.208, "step": 303500 }, { "epoch": 1.945786795532371, "grad_norm": 0.376953125, "learning_rate": 0.000384433705635741, "loss": 5.2096, "step": 304000 }, { "epoch": 1.948987102761865, "grad_norm": 0.361328125, "learning_rate": 0.00038440810317790506, "loss": 5.208, "step": 304500 }, { "epoch": 1.9521874099913592, "grad_norm": 0.353515625, "learning_rate": 0.00038438250072006915, "loss": 5.2065, "step": 305000 }, { "epoch": 1.9553877172208534, "grad_norm": 0.369140625, "learning_rate": 0.0003843568982622332, "loss": 5.2092, "step": 305500 }, { "epoch": 1.9585880244503473, "grad_norm": 0.3515625, "learning_rate": 0.0003843312958043972, "loss": 5.2149, "step": 306000 }, { "epoch": 1.9617883316798412, "grad_norm": 0.390625, "learning_rate": 0.0003843056933465613, "loss": 5.2069, "step": 306500 }, { "epoch": 1.9649886389093352, "grad_norm": 0.37109375, "learning_rate": 0.00038428009088872534, "loss": 5.2079, "step": 307000 }, { "epoch": 1.9681889461388293, "grad_norm": 0.37109375, "learning_rate": 0.0003842544884308894, "loss": 5.2044, "step": 307500 }, { "epoch": 1.9713892533683235, "grad_norm": 0.37109375, "learning_rate": 0.00038422888597305346, "loss": 5.2109, "step": 308000 }, { "epoch": 1.9745895605978174, "grad_norm": 0.3828125, "learning_rate": 0.0003842032835152175, "loss": 5.2061, "step": 308500 }, { "epoch": 1.9777898678273114, "grad_norm": 0.357421875, "learning_rate": 0.00038417768105738153, "loss": 5.2095, "step": 309000 }, { "epoch": 1.9809901750568053, "grad_norm": 0.357421875, "learning_rate": 0.00038415207859954556, "loss": 5.2049, "step": 309500 }, { "epoch": 1.9841904822862995, "grad_norm": 0.365234375, "learning_rate": 0.0003841264761417096, "loss": 5.2041, "step": 310000 }, { "epoch": 1.9873907895157936, "grad_norm": 0.361328125, "learning_rate": 0.0003841008736838737, "loss": 5.1995, "step": 310500 }, { "epoch": 1.9905910967452876, "grad_norm": 0.365234375, "learning_rate": 0.0003840752712260377, "loss": 5.2057, "step": 311000 }, { "epoch": 1.9937914039747815, "grad_norm": 0.41796875, "learning_rate": 0.00038404966876820176, "loss": 5.212, "step": 311500 }, { "epoch": 1.9969917112042757, "grad_norm": 0.376953125, "learning_rate": 0.0003840240663103658, "loss": 5.2029, "step": 312000 }, { "epoch": 2.0, "eval_loss": 5.198747158050537, "eval_runtime": 1.6976, "eval_samples_per_second": 589.07, "eval_steps_per_second": 9.425, "step": 312470 }, { "epoch": 2.00019201843377, "grad_norm": 0.376953125, "learning_rate": 0.0003839984638525298, "loss": 5.2133, "step": 312500 }, { "epoch": 2.0033923256632638, "grad_norm": 0.375, "learning_rate": 0.0003839728613946939, "loss": 5.2076, "step": 313000 }, { "epoch": 2.0065926328927577, "grad_norm": 0.37109375, "learning_rate": 0.00038394725893685795, "loss": 5.202, "step": 313500 }, { "epoch": 2.0097929401222516, "grad_norm": 0.392578125, "learning_rate": 0.000383921656479022, "loss": 5.203, "step": 314000 }, { "epoch": 2.0129932473517456, "grad_norm": 0.373046875, "learning_rate": 0.00038389605402118607, "loss": 5.2092, "step": 314500 }, { "epoch": 2.01619355458124, "grad_norm": 0.41015625, "learning_rate": 0.0003838704515633501, "loss": 5.1966, "step": 315000 }, { "epoch": 2.019393861810734, "grad_norm": 0.369140625, "learning_rate": 0.00038384484910551414, "loss": 5.2033, "step": 315500 }, { "epoch": 2.022594169040228, "grad_norm": 0.376953125, "learning_rate": 0.0003838192466476782, "loss": 5.2001, "step": 316000 }, { "epoch": 2.0257944762697218, "grad_norm": 0.357421875, "learning_rate": 0.00038379364418984226, "loss": 5.2038, "step": 316500 }, { "epoch": 2.028994783499216, "grad_norm": 0.376953125, "learning_rate": 0.0003837680417320063, "loss": 5.2041, "step": 317000 }, { "epoch": 2.03219509072871, "grad_norm": 0.380859375, "learning_rate": 0.00038374243927417033, "loss": 5.199, "step": 317500 }, { "epoch": 2.035395397958204, "grad_norm": 0.3515625, "learning_rate": 0.00038371683681633436, "loss": 5.2118, "step": 318000 }, { "epoch": 2.038595705187698, "grad_norm": 0.3984375, "learning_rate": 0.00038369123435849845, "loss": 5.1993, "step": 318500 }, { "epoch": 2.041796012417192, "grad_norm": 0.380859375, "learning_rate": 0.0003836656319006625, "loss": 5.2074, "step": 319000 }, { "epoch": 2.0449963196466863, "grad_norm": 0.392578125, "learning_rate": 0.0003836400294428265, "loss": 5.2091, "step": 319500 }, { "epoch": 2.04819662687618, "grad_norm": 0.400390625, "learning_rate": 0.00038361442698499055, "loss": 5.2067, "step": 320000 }, { "epoch": 2.051396934105674, "grad_norm": 0.375, "learning_rate": 0.0003835888245271546, "loss": 5.2012, "step": 320500 }, { "epoch": 2.054597241335168, "grad_norm": 0.349609375, "learning_rate": 0.0003835632220693187, "loss": 5.2013, "step": 321000 }, { "epoch": 2.057797548564662, "grad_norm": 0.369140625, "learning_rate": 0.0003835376196114827, "loss": 5.2057, "step": 321500 }, { "epoch": 2.0609978557941564, "grad_norm": 0.376953125, "learning_rate": 0.0003835120171536468, "loss": 5.2053, "step": 322000 }, { "epoch": 2.0641981630236503, "grad_norm": 0.384765625, "learning_rate": 0.00038348641469581083, "loss": 5.2096, "step": 322500 }, { "epoch": 2.0673984702531443, "grad_norm": 0.361328125, "learning_rate": 0.00038346081223797487, "loss": 5.2038, "step": 323000 }, { "epoch": 2.070598777482638, "grad_norm": 0.416015625, "learning_rate": 0.0003834352097801389, "loss": 5.2057, "step": 323500 }, { "epoch": 2.0737990847121326, "grad_norm": 0.359375, "learning_rate": 0.000383409607322303, "loss": 5.2013, "step": 324000 }, { "epoch": 2.0769993919416265, "grad_norm": 0.380859375, "learning_rate": 0.000383384004864467, "loss": 5.2081, "step": 324500 }, { "epoch": 2.0801996991711205, "grad_norm": 0.40234375, "learning_rate": 0.00038335840240663106, "loss": 5.2038, "step": 325000 }, { "epoch": 2.0834000064006144, "grad_norm": 0.400390625, "learning_rate": 0.0003833327999487951, "loss": 5.2032, "step": 325500 }, { "epoch": 2.0866003136301083, "grad_norm": 0.39453125, "learning_rate": 0.00038330719749095913, "loss": 5.2018, "step": 326000 }, { "epoch": 2.0898006208596027, "grad_norm": 0.369140625, "learning_rate": 0.0003832815950331232, "loss": 5.2015, "step": 326500 }, { "epoch": 2.0930009280890967, "grad_norm": 0.37109375, "learning_rate": 0.00038325599257528725, "loss": 5.1985, "step": 327000 }, { "epoch": 2.0962012353185906, "grad_norm": 0.373046875, "learning_rate": 0.0003832303901174513, "loss": 5.2005, "step": 327500 }, { "epoch": 2.0994015425480845, "grad_norm": 0.400390625, "learning_rate": 0.0003832047876596153, "loss": 5.2005, "step": 328000 }, { "epoch": 2.1026018497775785, "grad_norm": 0.392578125, "learning_rate": 0.00038317918520177935, "loss": 5.2106, "step": 328500 }, { "epoch": 2.105802157007073, "grad_norm": 0.376953125, "learning_rate": 0.00038315358274394344, "loss": 5.2059, "step": 329000 }, { "epoch": 2.109002464236567, "grad_norm": 0.37890625, "learning_rate": 0.0003831279802861075, "loss": 5.2012, "step": 329500 }, { "epoch": 2.1122027714660607, "grad_norm": 0.375, "learning_rate": 0.00038310237782827156, "loss": 5.1964, "step": 330000 }, { "epoch": 2.1154030786955547, "grad_norm": 0.380859375, "learning_rate": 0.0003830767753704356, "loss": 5.2087, "step": 330500 }, { "epoch": 2.1186033859250486, "grad_norm": 0.390625, "learning_rate": 0.00038305117291259963, "loss": 5.203, "step": 331000 }, { "epoch": 2.121803693154543, "grad_norm": 0.369140625, "learning_rate": 0.00038302557045476367, "loss": 5.2044, "step": 331500 }, { "epoch": 2.125004000384037, "grad_norm": 0.365234375, "learning_rate": 0.00038299996799692776, "loss": 5.2065, "step": 332000 }, { "epoch": 2.128204307613531, "grad_norm": 0.365234375, "learning_rate": 0.0003829743655390918, "loss": 5.2018, "step": 332500 }, { "epoch": 2.131404614843025, "grad_norm": 0.353515625, "learning_rate": 0.0003829487630812558, "loss": 5.205, "step": 333000 }, { "epoch": 2.134604922072519, "grad_norm": 0.400390625, "learning_rate": 0.00038292316062341986, "loss": 5.2031, "step": 333500 }, { "epoch": 2.137805229302013, "grad_norm": 0.33984375, "learning_rate": 0.0003828975581655839, "loss": 5.2081, "step": 334000 }, { "epoch": 2.141005536531507, "grad_norm": 0.373046875, "learning_rate": 0.0003828719557077479, "loss": 5.2036, "step": 334500 }, { "epoch": 2.144205843761001, "grad_norm": 0.3828125, "learning_rate": 0.000382846353249912, "loss": 5.2067, "step": 335000 }, { "epoch": 2.147406150990495, "grad_norm": 0.38671875, "learning_rate": 0.00038282075079207605, "loss": 5.2016, "step": 335500 }, { "epoch": 2.1506064582199893, "grad_norm": 0.37109375, "learning_rate": 0.0003827951483342401, "loss": 5.2033, "step": 336000 }, { "epoch": 2.1538067654494832, "grad_norm": 0.3515625, "learning_rate": 0.00038276954587640417, "loss": 5.2013, "step": 336500 }, { "epoch": 2.157007072678977, "grad_norm": 0.361328125, "learning_rate": 0.0003827439434185682, "loss": 5.1996, "step": 337000 }, { "epoch": 2.160207379908471, "grad_norm": 0.369140625, "learning_rate": 0.0003827183409607323, "loss": 5.2038, "step": 337500 }, { "epoch": 2.163407687137965, "grad_norm": 0.400390625, "learning_rate": 0.00038269273850289633, "loss": 5.2004, "step": 338000 }, { "epoch": 2.1666079943674594, "grad_norm": 0.44921875, "learning_rate": 0.00038266713604506036, "loss": 5.202, "step": 338500 }, { "epoch": 2.1698083015969534, "grad_norm": 0.37890625, "learning_rate": 0.0003826415335872244, "loss": 5.1946, "step": 339000 }, { "epoch": 2.1730086088264473, "grad_norm": 0.38671875, "learning_rate": 0.00038261593112938843, "loss": 5.205, "step": 339500 }, { "epoch": 2.1762089160559412, "grad_norm": 0.3671875, "learning_rate": 0.00038259032867155247, "loss": 5.2002, "step": 340000 }, { "epoch": 2.179409223285435, "grad_norm": 0.40234375, "learning_rate": 0.00038256472621371655, "loss": 5.2024, "step": 340500 }, { "epoch": 2.1826095305149296, "grad_norm": 0.390625, "learning_rate": 0.0003825391237558806, "loss": 5.2081, "step": 341000 }, { "epoch": 2.1858098377444235, "grad_norm": 0.4453125, "learning_rate": 0.0003825135212980446, "loss": 5.2062, "step": 341500 }, { "epoch": 2.1890101449739174, "grad_norm": 0.3671875, "learning_rate": 0.00038248791884020866, "loss": 5.202, "step": 342000 }, { "epoch": 2.1922104522034114, "grad_norm": 0.3828125, "learning_rate": 0.0003824623163823727, "loss": 5.2013, "step": 342500 }, { "epoch": 2.1954107594329058, "grad_norm": 0.40625, "learning_rate": 0.0003824367139245368, "loss": 5.199, "step": 343000 }, { "epoch": 2.1986110666623997, "grad_norm": 0.365234375, "learning_rate": 0.0003824111114667008, "loss": 5.2029, "step": 343500 }, { "epoch": 2.2018113738918936, "grad_norm": 0.400390625, "learning_rate": 0.00038238550900886485, "loss": 5.208, "step": 344000 }, { "epoch": 2.2050116811213876, "grad_norm": 0.4375, "learning_rate": 0.00038235990655102894, "loss": 5.2012, "step": 344500 }, { "epoch": 2.2082119883508815, "grad_norm": 0.384765625, "learning_rate": 0.00038233430409319297, "loss": 5.1991, "step": 345000 }, { "epoch": 2.211412295580376, "grad_norm": 0.3828125, "learning_rate": 0.000382308701635357, "loss": 5.2054, "step": 345500 }, { "epoch": 2.21461260280987, "grad_norm": 0.390625, "learning_rate": 0.0003822830991775211, "loss": 5.2062, "step": 346000 }, { "epoch": 2.2178129100393638, "grad_norm": 0.369140625, "learning_rate": 0.00038225749671968513, "loss": 5.202, "step": 346500 }, { "epoch": 2.2210132172688577, "grad_norm": 0.388671875, "learning_rate": 0.00038223189426184916, "loss": 5.1993, "step": 347000 }, { "epoch": 2.2242135244983516, "grad_norm": 0.37109375, "learning_rate": 0.0003822062918040132, "loss": 5.1995, "step": 347500 }, { "epoch": 2.227413831727846, "grad_norm": 0.384765625, "learning_rate": 0.00038218068934617723, "loss": 5.1964, "step": 348000 }, { "epoch": 2.23061413895734, "grad_norm": 0.375, "learning_rate": 0.0003821550868883413, "loss": 5.2061, "step": 348500 }, { "epoch": 2.233814446186834, "grad_norm": 0.37890625, "learning_rate": 0.00038212948443050535, "loss": 5.2081, "step": 349000 }, { "epoch": 2.237014753416328, "grad_norm": 0.435546875, "learning_rate": 0.0003821038819726694, "loss": 5.1967, "step": 349500 }, { "epoch": 2.2402150606458218, "grad_norm": 0.37109375, "learning_rate": 0.0003820782795148334, "loss": 5.2061, "step": 350000 }, { "epoch": 2.243415367875316, "grad_norm": 0.435546875, "learning_rate": 0.00038205267705699746, "loss": 5.2018, "step": 350500 }, { "epoch": 2.24661567510481, "grad_norm": 0.357421875, "learning_rate": 0.0003820270745991615, "loss": 5.2016, "step": 351000 }, { "epoch": 2.249815982334304, "grad_norm": 0.390625, "learning_rate": 0.0003820014721413256, "loss": 5.2083, "step": 351500 }, { "epoch": 2.253016289563798, "grad_norm": 0.421875, "learning_rate": 0.0003819758696834896, "loss": 5.2076, "step": 352000 }, { "epoch": 2.2562165967932923, "grad_norm": 0.38671875, "learning_rate": 0.0003819502672256537, "loss": 5.2068, "step": 352500 }, { "epoch": 2.2594169040227863, "grad_norm": 0.400390625, "learning_rate": 0.00038192466476781774, "loss": 5.2036, "step": 353000 }, { "epoch": 2.26261721125228, "grad_norm": 0.37890625, "learning_rate": 0.00038189906230998177, "loss": 5.198, "step": 353500 }, { "epoch": 2.265817518481774, "grad_norm": 0.40234375, "learning_rate": 0.00038187345985214586, "loss": 5.2034, "step": 354000 }, { "epoch": 2.269017825711268, "grad_norm": 0.392578125, "learning_rate": 0.0003818478573943099, "loss": 5.2037, "step": 354500 }, { "epoch": 2.2722181329407625, "grad_norm": 0.357421875, "learning_rate": 0.0003818222549364739, "loss": 5.2039, "step": 355000 }, { "epoch": 2.2754184401702564, "grad_norm": 0.412109375, "learning_rate": 0.00038179665247863796, "loss": 5.2, "step": 355500 }, { "epoch": 2.2786187473997503, "grad_norm": 0.369140625, "learning_rate": 0.000381771050020802, "loss": 5.2056, "step": 356000 }, { "epoch": 2.2818190546292443, "grad_norm": 0.365234375, "learning_rate": 0.00038174544756296603, "loss": 5.2023, "step": 356500 }, { "epoch": 2.2850193618587387, "grad_norm": 0.3671875, "learning_rate": 0.0003817198451051301, "loss": 5.1957, "step": 357000 }, { "epoch": 2.2882196690882326, "grad_norm": 0.37109375, "learning_rate": 0.00038169424264729415, "loss": 5.2005, "step": 357500 }, { "epoch": 2.2914199763177265, "grad_norm": 0.375, "learning_rate": 0.0003816686401894582, "loss": 5.2037, "step": 358000 }, { "epoch": 2.2946202835472205, "grad_norm": 0.40234375, "learning_rate": 0.0003816430377316222, "loss": 5.2036, "step": 358500 }, { "epoch": 2.2978205907767144, "grad_norm": 0.375, "learning_rate": 0.0003816174352737863, "loss": 5.2034, "step": 359000 }, { "epoch": 2.3010208980062083, "grad_norm": 0.3828125, "learning_rate": 0.00038159183281595034, "loss": 5.1988, "step": 359500 }, { "epoch": 2.3042212052357027, "grad_norm": 0.404296875, "learning_rate": 0.00038156623035811443, "loss": 5.2031, "step": 360000 }, { "epoch": 2.3074215124651967, "grad_norm": 0.404296875, "learning_rate": 0.00038154062790027847, "loss": 5.1989, "step": 360500 }, { "epoch": 2.3106218196946906, "grad_norm": 0.392578125, "learning_rate": 0.0003815150254424425, "loss": 5.2058, "step": 361000 }, { "epoch": 2.3138221269241845, "grad_norm": 0.376953125, "learning_rate": 0.00038148942298460653, "loss": 5.2032, "step": 361500 }, { "epoch": 2.317022434153679, "grad_norm": 0.396484375, "learning_rate": 0.0003814638205267706, "loss": 5.1991, "step": 362000 }, { "epoch": 2.320222741383173, "grad_norm": 0.400390625, "learning_rate": 0.00038143821806893466, "loss": 5.2017, "step": 362500 }, { "epoch": 2.323423048612667, "grad_norm": 0.40234375, "learning_rate": 0.0003814126156110987, "loss": 5.2021, "step": 363000 }, { "epoch": 2.3266233558421607, "grad_norm": 0.34765625, "learning_rate": 0.0003813870131532627, "loss": 5.2023, "step": 363500 }, { "epoch": 2.3298236630716547, "grad_norm": 0.392578125, "learning_rate": 0.00038136141069542676, "loss": 5.2085, "step": 364000 }, { "epoch": 2.333023970301149, "grad_norm": 0.392578125, "learning_rate": 0.0003813358082375908, "loss": 5.2003, "step": 364500 }, { "epoch": 2.336224277530643, "grad_norm": 0.375, "learning_rate": 0.0003813102057797549, "loss": 5.2016, "step": 365000 }, { "epoch": 2.339424584760137, "grad_norm": 0.38671875, "learning_rate": 0.0003812846033219189, "loss": 5.2058, "step": 365500 }, { "epoch": 2.342624891989631, "grad_norm": 0.359375, "learning_rate": 0.00038125900086408295, "loss": 5.2025, "step": 366000 }, { "epoch": 2.3458251992191252, "grad_norm": 0.390625, "learning_rate": 0.000381233398406247, "loss": 5.1953, "step": 366500 }, { "epoch": 2.349025506448619, "grad_norm": 0.38671875, "learning_rate": 0.0003812077959484111, "loss": 5.2013, "step": 367000 }, { "epoch": 2.352225813678113, "grad_norm": 0.3828125, "learning_rate": 0.0003811821934905751, "loss": 5.2023, "step": 367500 }, { "epoch": 2.355426120907607, "grad_norm": 0.42578125, "learning_rate": 0.0003811565910327392, "loss": 5.2069, "step": 368000 }, { "epoch": 2.358626428137101, "grad_norm": 0.396484375, "learning_rate": 0.00038113098857490323, "loss": 5.2004, "step": 368500 }, { "epoch": 2.3618267353665954, "grad_norm": 0.390625, "learning_rate": 0.00038110538611706727, "loss": 5.2035, "step": 369000 }, { "epoch": 2.3650270425960893, "grad_norm": 0.400390625, "learning_rate": 0.0003810797836592313, "loss": 5.2042, "step": 369500 }, { "epoch": 2.3682273498255833, "grad_norm": 0.41796875, "learning_rate": 0.00038105418120139533, "loss": 5.1985, "step": 370000 }, { "epoch": 2.371427657055077, "grad_norm": 0.408203125, "learning_rate": 0.0003810285787435594, "loss": 5.2041, "step": 370500 }, { "epoch": 2.3746279642845716, "grad_norm": 0.35546875, "learning_rate": 0.00038100297628572346, "loss": 5.1956, "step": 371000 }, { "epoch": 2.3778282715140655, "grad_norm": 0.37890625, "learning_rate": 0.0003809773738278875, "loss": 5.2046, "step": 371500 }, { "epoch": 2.3810285787435594, "grad_norm": 0.390625, "learning_rate": 0.0003809517713700515, "loss": 5.1996, "step": 372000 }, { "epoch": 2.3842288859730534, "grad_norm": 0.4140625, "learning_rate": 0.00038092616891221556, "loss": 5.2018, "step": 372500 }, { "epoch": 2.3874291932025473, "grad_norm": 0.373046875, "learning_rate": 0.00038090056645437965, "loss": 5.2014, "step": 373000 }, { "epoch": 2.3906295004320413, "grad_norm": 0.486328125, "learning_rate": 0.0003808749639965437, "loss": 5.1966, "step": 373500 }, { "epoch": 2.3938298076615356, "grad_norm": 0.419921875, "learning_rate": 0.0003808493615387077, "loss": 5.2013, "step": 374000 }, { "epoch": 2.3970301148910296, "grad_norm": 0.404296875, "learning_rate": 0.0003808237590808718, "loss": 5.1956, "step": 374500 }, { "epoch": 2.4002304221205235, "grad_norm": 0.380859375, "learning_rate": 0.00038079815662303584, "loss": 5.1954, "step": 375000 }, { "epoch": 2.4034307293500174, "grad_norm": 0.38671875, "learning_rate": 0.0003807725541651999, "loss": 5.1928, "step": 375500 }, { "epoch": 2.406631036579512, "grad_norm": 0.44140625, "learning_rate": 0.00038074695170736396, "loss": 5.2002, "step": 376000 }, { "epoch": 2.4098313438090058, "grad_norm": 0.486328125, "learning_rate": 0.000380721349249528, "loss": 5.1998, "step": 376500 }, { "epoch": 2.4130316510384997, "grad_norm": 0.384765625, "learning_rate": 0.00038069574679169203, "loss": 5.1989, "step": 377000 }, { "epoch": 2.4162319582679936, "grad_norm": 0.392578125, "learning_rate": 0.00038067014433385606, "loss": 5.205, "step": 377500 }, { "epoch": 2.4194322654974876, "grad_norm": 0.3828125, "learning_rate": 0.0003806445418760201, "loss": 5.2056, "step": 378000 }, { "epoch": 2.422632572726982, "grad_norm": 0.376953125, "learning_rate": 0.0003806189394181842, "loss": 5.2047, "step": 378500 }, { "epoch": 2.425832879956476, "grad_norm": 0.3828125, "learning_rate": 0.0003805933369603482, "loss": 5.1933, "step": 379000 }, { "epoch": 2.42903318718597, "grad_norm": 0.41796875, "learning_rate": 0.00038056773450251226, "loss": 5.1982, "step": 379500 }, { "epoch": 2.4322334944154638, "grad_norm": 0.435546875, "learning_rate": 0.0003805421320446763, "loss": 5.2007, "step": 380000 }, { "epoch": 2.435433801644958, "grad_norm": 0.412109375, "learning_rate": 0.0003805165295868403, "loss": 5.2001, "step": 380500 }, { "epoch": 2.438634108874452, "grad_norm": 0.3828125, "learning_rate": 0.00038049092712900436, "loss": 5.2003, "step": 381000 }, { "epoch": 2.441834416103946, "grad_norm": 0.3515625, "learning_rate": 0.00038046532467116845, "loss": 5.203, "step": 381500 }, { "epoch": 2.44503472333344, "grad_norm": 0.41796875, "learning_rate": 0.0003804397222133325, "loss": 5.1956, "step": 382000 }, { "epoch": 2.448235030562934, "grad_norm": 0.419921875, "learning_rate": 0.00038041411975549657, "loss": 5.1983, "step": 382500 }, { "epoch": 2.451435337792428, "grad_norm": 0.4296875, "learning_rate": 0.0003803885172976606, "loss": 5.199, "step": 383000 }, { "epoch": 2.454635645021922, "grad_norm": 0.396484375, "learning_rate": 0.00038036291483982464, "loss": 5.203, "step": 383500 }, { "epoch": 2.457835952251416, "grad_norm": 0.3828125, "learning_rate": 0.0003803373123819887, "loss": 5.2004, "step": 384000 }, { "epoch": 2.46103625948091, "grad_norm": 0.384765625, "learning_rate": 0.00038031170992415276, "loss": 5.2037, "step": 384500 }, { "epoch": 2.464236566710404, "grad_norm": 0.396484375, "learning_rate": 0.0003802861074663168, "loss": 5.1961, "step": 385000 }, { "epoch": 2.4674368739398984, "grad_norm": 0.392578125, "learning_rate": 0.00038026050500848083, "loss": 5.2003, "step": 385500 }, { "epoch": 2.4706371811693923, "grad_norm": 0.392578125, "learning_rate": 0.00038023490255064486, "loss": 5.2007, "step": 386000 }, { "epoch": 2.4738374883988863, "grad_norm": 0.408203125, "learning_rate": 0.0003802093000928089, "loss": 5.1987, "step": 386500 }, { "epoch": 2.4770377956283802, "grad_norm": 0.40625, "learning_rate": 0.000380183697634973, "loss": 5.1996, "step": 387000 }, { "epoch": 2.480238102857874, "grad_norm": 0.392578125, "learning_rate": 0.000380158095177137, "loss": 5.1935, "step": 387500 }, { "epoch": 2.4834384100873685, "grad_norm": 0.427734375, "learning_rate": 0.00038013249271930105, "loss": 5.1977, "step": 388000 }, { "epoch": 2.4866387173168625, "grad_norm": 0.390625, "learning_rate": 0.0003801068902614651, "loss": 5.2041, "step": 388500 }, { "epoch": 2.4898390245463564, "grad_norm": 0.400390625, "learning_rate": 0.0003800812878036291, "loss": 5.2037, "step": 389000 }, { "epoch": 2.4930393317758504, "grad_norm": 0.41796875, "learning_rate": 0.0003800556853457932, "loss": 5.2075, "step": 389500 }, { "epoch": 2.4962396390053447, "grad_norm": 0.36328125, "learning_rate": 0.0003800300828879573, "loss": 5.2056, "step": 390000 }, { "epoch": 2.4994399462348387, "grad_norm": 0.40234375, "learning_rate": 0.00038000448043012133, "loss": 5.1986, "step": 390500 }, { "epoch": 2.5026402534643326, "grad_norm": 0.443359375, "learning_rate": 0.00037997887797228537, "loss": 5.2022, "step": 391000 }, { "epoch": 2.5058405606938265, "grad_norm": 0.373046875, "learning_rate": 0.0003799532755144494, "loss": 5.2009, "step": 391500 }, { "epoch": 2.5090408679233205, "grad_norm": 0.412109375, "learning_rate": 0.00037992767305661344, "loss": 5.197, "step": 392000 }, { "epoch": 2.5122411751528144, "grad_norm": 0.400390625, "learning_rate": 0.0003799020705987775, "loss": 5.2044, "step": 392500 }, { "epoch": 2.515441482382309, "grad_norm": 0.4375, "learning_rate": 0.00037987646814094156, "loss": 5.2045, "step": 393000 }, { "epoch": 2.5186417896118027, "grad_norm": 0.37109375, "learning_rate": 0.0003798508656831056, "loss": 5.2081, "step": 393500 }, { "epoch": 2.5218420968412967, "grad_norm": 0.453125, "learning_rate": 0.00037982526322526963, "loss": 5.2002, "step": 394000 }, { "epoch": 2.525042404070791, "grad_norm": 0.404296875, "learning_rate": 0.00037979966076743366, "loss": 5.1958, "step": 394500 }, { "epoch": 2.528242711300285, "grad_norm": 0.3828125, "learning_rate": 0.00037977405830959775, "loss": 5.1946, "step": 395000 }, { "epoch": 2.531443018529779, "grad_norm": 0.396484375, "learning_rate": 0.0003797484558517618, "loss": 5.2012, "step": 395500 }, { "epoch": 2.534643325759273, "grad_norm": 0.41015625, "learning_rate": 0.0003797228533939258, "loss": 5.1959, "step": 396000 }, { "epoch": 2.537843632988767, "grad_norm": 0.388671875, "learning_rate": 0.00037969725093608985, "loss": 5.2028, "step": 396500 }, { "epoch": 2.5410439402182607, "grad_norm": 0.38671875, "learning_rate": 0.00037967164847825394, "loss": 5.1976, "step": 397000 }, { "epoch": 2.544244247447755, "grad_norm": 0.396484375, "learning_rate": 0.000379646046020418, "loss": 5.1984, "step": 397500 }, { "epoch": 2.547444554677249, "grad_norm": 0.39453125, "learning_rate": 0.00037962044356258206, "loss": 5.199, "step": 398000 }, { "epoch": 2.550644861906743, "grad_norm": 0.419921875, "learning_rate": 0.0003795948411047461, "loss": 5.1985, "step": 398500 }, { "epoch": 2.553845169136237, "grad_norm": 0.396484375, "learning_rate": 0.00037956923864691013, "loss": 5.2035, "step": 399000 }, { "epoch": 2.5570454763657313, "grad_norm": 0.44921875, "learning_rate": 0.00037954363618907417, "loss": 5.1987, "step": 399500 }, { "epoch": 2.5602457835952253, "grad_norm": 0.38671875, "learning_rate": 0.0003795180337312382, "loss": 5.2027, "step": 400000 }, { "epoch": 2.563446090824719, "grad_norm": 0.423828125, "learning_rate": 0.0003794924312734023, "loss": 5.2047, "step": 400500 }, { "epoch": 2.566646398054213, "grad_norm": 0.400390625, "learning_rate": 0.0003794668288155663, "loss": 5.2031, "step": 401000 }, { "epoch": 2.569846705283707, "grad_norm": 0.388671875, "learning_rate": 0.00037944122635773036, "loss": 5.2054, "step": 401500 }, { "epoch": 2.573047012513201, "grad_norm": 0.40625, "learning_rate": 0.0003794156238998944, "loss": 5.203, "step": 402000 }, { "epoch": 2.5762473197426954, "grad_norm": 0.3828125, "learning_rate": 0.0003793900214420584, "loss": 5.205, "step": 402500 }, { "epoch": 2.5794476269721893, "grad_norm": 0.416015625, "learning_rate": 0.0003793644189842225, "loss": 5.1995, "step": 403000 }, { "epoch": 2.5826479342016833, "grad_norm": 0.37890625, "learning_rate": 0.00037933881652638655, "loss": 5.2005, "step": 403500 }, { "epoch": 2.5858482414311776, "grad_norm": 0.365234375, "learning_rate": 0.0003793132140685506, "loss": 5.2015, "step": 404000 }, { "epoch": 2.5890485486606716, "grad_norm": 0.3828125, "learning_rate": 0.0003792876116107146, "loss": 5.201, "step": 404500 }, { "epoch": 2.5922488558901655, "grad_norm": 0.400390625, "learning_rate": 0.0003792620091528787, "loss": 5.2021, "step": 405000 }, { "epoch": 2.5954491631196595, "grad_norm": 0.40234375, "learning_rate": 0.00037923640669504274, "loss": 5.209, "step": 405500 }, { "epoch": 2.5986494703491534, "grad_norm": 0.41796875, "learning_rate": 0.00037921080423720683, "loss": 5.1946, "step": 406000 }, { "epoch": 2.6018497775786473, "grad_norm": 0.4296875, "learning_rate": 0.00037918520177937086, "loss": 5.1998, "step": 406500 }, { "epoch": 2.6050500848081417, "grad_norm": 0.39453125, "learning_rate": 0.0003791595993215349, "loss": 5.1969, "step": 407000 }, { "epoch": 2.6082503920376356, "grad_norm": 0.404296875, "learning_rate": 0.00037913399686369893, "loss": 5.1967, "step": 407500 }, { "epoch": 2.6114506992671296, "grad_norm": 0.42578125, "learning_rate": 0.00037910839440586297, "loss": 5.2025, "step": 408000 }, { "epoch": 2.6146510064966235, "grad_norm": 0.408203125, "learning_rate": 0.00037908279194802705, "loss": 5.199, "step": 408500 }, { "epoch": 2.617851313726118, "grad_norm": 0.390625, "learning_rate": 0.0003790571894901911, "loss": 5.2005, "step": 409000 }, { "epoch": 2.621051620955612, "grad_norm": 0.40625, "learning_rate": 0.0003790315870323551, "loss": 5.1969, "step": 409500 }, { "epoch": 2.6242519281851058, "grad_norm": 0.396484375, "learning_rate": 0.00037900598457451916, "loss": 5.205, "step": 410000 }, { "epoch": 2.6274522354145997, "grad_norm": 0.37890625, "learning_rate": 0.0003789803821166832, "loss": 5.2016, "step": 410500 }, { "epoch": 2.6306525426440936, "grad_norm": 0.39453125, "learning_rate": 0.0003789547796588472, "loss": 5.2007, "step": 411000 }, { "epoch": 2.6338528498735876, "grad_norm": 0.38671875, "learning_rate": 0.0003789291772010113, "loss": 5.2053, "step": 411500 }, { "epoch": 2.637053157103082, "grad_norm": 0.3671875, "learning_rate": 0.00037890357474317535, "loss": 5.1981, "step": 412000 }, { "epoch": 2.640253464332576, "grad_norm": 0.40234375, "learning_rate": 0.00037887797228533944, "loss": 5.2052, "step": 412500 }, { "epoch": 2.64345377156207, "grad_norm": 0.38671875, "learning_rate": 0.00037885236982750347, "loss": 5.1987, "step": 413000 }, { "epoch": 2.6466540787915642, "grad_norm": 0.3828125, "learning_rate": 0.0003788267673696675, "loss": 5.1951, "step": 413500 }, { "epoch": 2.649854386021058, "grad_norm": 0.392578125, "learning_rate": 0.0003788011649118316, "loss": 5.2001, "step": 414000 }, { "epoch": 2.653054693250552, "grad_norm": 0.375, "learning_rate": 0.00037877556245399563, "loss": 5.2009, "step": 414500 }, { "epoch": 2.656255000480046, "grad_norm": 0.3984375, "learning_rate": 0.00037874995999615966, "loss": 5.199, "step": 415000 }, { "epoch": 2.65945530770954, "grad_norm": 0.396484375, "learning_rate": 0.0003787243575383237, "loss": 5.2029, "step": 415500 }, { "epoch": 2.662655614939034, "grad_norm": 0.427734375, "learning_rate": 0.00037869875508048773, "loss": 5.1956, "step": 416000 }, { "epoch": 2.6658559221685283, "grad_norm": 0.40234375, "learning_rate": 0.00037867315262265177, "loss": 5.2, "step": 416500 }, { "epoch": 2.6690562293980222, "grad_norm": 0.408203125, "learning_rate": 0.00037864755016481585, "loss": 5.1974, "step": 417000 }, { "epoch": 2.672256536627516, "grad_norm": 0.396484375, "learning_rate": 0.0003786219477069799, "loss": 5.1977, "step": 417500 }, { "epoch": 2.6754568438570105, "grad_norm": 0.392578125, "learning_rate": 0.0003785963452491439, "loss": 5.1954, "step": 418000 }, { "epoch": 2.6786571510865045, "grad_norm": 0.4296875, "learning_rate": 0.00037857074279130796, "loss": 5.2027, "step": 418500 }, { "epoch": 2.6818574583159984, "grad_norm": 0.419921875, "learning_rate": 0.000378545140333472, "loss": 5.1943, "step": 419000 }, { "epoch": 2.6850577655454924, "grad_norm": 0.404296875, "learning_rate": 0.0003785195378756361, "loss": 5.1955, "step": 419500 }, { "epoch": 2.6882580727749863, "grad_norm": 0.41015625, "learning_rate": 0.0003784939354178001, "loss": 5.206, "step": 420000 }, { "epoch": 2.6914583800044802, "grad_norm": 0.38671875, "learning_rate": 0.0003784683329599642, "loss": 5.2077, "step": 420500 }, { "epoch": 2.6946586872339746, "grad_norm": 0.392578125, "learning_rate": 0.00037844273050212824, "loss": 5.2, "step": 421000 }, { "epoch": 2.6978589944634686, "grad_norm": 0.384765625, "learning_rate": 0.00037841712804429227, "loss": 5.1981, "step": 421500 }, { "epoch": 2.7010593016929625, "grad_norm": 0.404296875, "learning_rate": 0.0003783915255864563, "loss": 5.2035, "step": 422000 }, { "epoch": 2.7042596089224564, "grad_norm": 0.416015625, "learning_rate": 0.0003783659231286204, "loss": 5.2, "step": 422500 }, { "epoch": 2.707459916151951, "grad_norm": 0.412109375, "learning_rate": 0.0003783403206707844, "loss": 5.2004, "step": 423000 }, { "epoch": 2.7106602233814447, "grad_norm": 0.3984375, "learning_rate": 0.00037831471821294846, "loss": 5.1948, "step": 423500 }, { "epoch": 2.7138605306109387, "grad_norm": 0.373046875, "learning_rate": 0.0003782891157551125, "loss": 5.1975, "step": 424000 }, { "epoch": 2.7170608378404326, "grad_norm": 0.416015625, "learning_rate": 0.00037826351329727653, "loss": 5.2018, "step": 424500 }, { "epoch": 2.7202611450699266, "grad_norm": 0.41796875, "learning_rate": 0.0003782379108394406, "loss": 5.2005, "step": 425000 }, { "epoch": 2.7234614522994205, "grad_norm": 0.435546875, "learning_rate": 0.00037821230838160465, "loss": 5.2011, "step": 425500 }, { "epoch": 2.726661759528915, "grad_norm": 0.419921875, "learning_rate": 0.0003781867059237687, "loss": 5.2041, "step": 426000 }, { "epoch": 2.729862066758409, "grad_norm": 0.41015625, "learning_rate": 0.0003781611034659327, "loss": 5.1967, "step": 426500 }, { "epoch": 2.7330623739879027, "grad_norm": 0.388671875, "learning_rate": 0.00037813550100809676, "loss": 5.2039, "step": 427000 }, { "epoch": 2.736262681217397, "grad_norm": 0.3984375, "learning_rate": 0.00037810989855026084, "loss": 5.1936, "step": 427500 }, { "epoch": 2.739462988446891, "grad_norm": 0.431640625, "learning_rate": 0.00037808429609242493, "loss": 5.1995, "step": 428000 }, { "epoch": 2.742663295676385, "grad_norm": 0.4140625, "learning_rate": 0.00037805869363458897, "loss": 5.2016, "step": 428500 }, { "epoch": 2.745863602905879, "grad_norm": 0.390625, "learning_rate": 0.000378033091176753, "loss": 5.1997, "step": 429000 }, { "epoch": 2.749063910135373, "grad_norm": 0.421875, "learning_rate": 0.00037800748871891703, "loss": 5.2064, "step": 429500 }, { "epoch": 2.752264217364867, "grad_norm": 0.43359375, "learning_rate": 0.00037798188626108107, "loss": 5.1999, "step": 430000 }, { "epoch": 2.755464524594361, "grad_norm": 0.40234375, "learning_rate": 0.00037795628380324516, "loss": 5.1978, "step": 430500 }, { "epoch": 2.758664831823855, "grad_norm": 0.404296875, "learning_rate": 0.0003779306813454092, "loss": 5.1986, "step": 431000 }, { "epoch": 2.761865139053349, "grad_norm": 0.390625, "learning_rate": 0.0003779050788875732, "loss": 5.202, "step": 431500 }, { "epoch": 2.765065446282843, "grad_norm": 0.37890625, "learning_rate": 0.00037787947642973726, "loss": 5.1967, "step": 432000 }, { "epoch": 2.7682657535123374, "grad_norm": 0.392578125, "learning_rate": 0.0003778538739719013, "loss": 5.2028, "step": 432500 }, { "epoch": 2.7714660607418313, "grad_norm": 0.4296875, "learning_rate": 0.0003778282715140654, "loss": 5.1978, "step": 433000 }, { "epoch": 2.7746663679713253, "grad_norm": 0.421875, "learning_rate": 0.0003778026690562294, "loss": 5.197, "step": 433500 }, { "epoch": 2.777866675200819, "grad_norm": 0.40625, "learning_rate": 0.00037777706659839345, "loss": 5.1927, "step": 434000 }, { "epoch": 2.781066982430313, "grad_norm": 0.41796875, "learning_rate": 0.0003777514641405575, "loss": 5.1988, "step": 434500 }, { "epoch": 2.784267289659807, "grad_norm": 0.435546875, "learning_rate": 0.0003777258616827216, "loss": 5.1968, "step": 435000 }, { "epoch": 2.7874675968893015, "grad_norm": 0.41015625, "learning_rate": 0.0003777002592248856, "loss": 5.1956, "step": 435500 }, { "epoch": 2.7906679041187954, "grad_norm": 0.4296875, "learning_rate": 0.0003776746567670497, "loss": 5.1986, "step": 436000 }, { "epoch": 2.7938682113482893, "grad_norm": 0.44140625, "learning_rate": 0.00037764905430921373, "loss": 5.1952, "step": 436500 }, { "epoch": 2.7970685185777837, "grad_norm": 0.37890625, "learning_rate": 0.00037762345185137777, "loss": 5.2054, "step": 437000 }, { "epoch": 2.8002688258072777, "grad_norm": 0.416015625, "learning_rate": 0.0003775978493935418, "loss": 5.1935, "step": 437500 }, { "epoch": 2.8034691330367716, "grad_norm": 0.380859375, "learning_rate": 0.00037757224693570583, "loss": 5.1976, "step": 438000 }, { "epoch": 2.8066694402662655, "grad_norm": 0.404296875, "learning_rate": 0.0003775466444778699, "loss": 5.2001, "step": 438500 }, { "epoch": 2.8098697474957595, "grad_norm": 0.46875, "learning_rate": 0.00037752104202003396, "loss": 5.1961, "step": 439000 }, { "epoch": 2.8130700547252534, "grad_norm": 0.38671875, "learning_rate": 0.000377495439562198, "loss": 5.2053, "step": 439500 }, { "epoch": 2.816270361954748, "grad_norm": 0.38671875, "learning_rate": 0.000377469837104362, "loss": 5.2005, "step": 440000 }, { "epoch": 2.8194706691842417, "grad_norm": 0.390625, "learning_rate": 0.00037744423464652606, "loss": 5.2043, "step": 440500 }, { "epoch": 2.8226709764137357, "grad_norm": 0.421875, "learning_rate": 0.0003774186321886901, "loss": 5.1988, "step": 441000 }, { "epoch": 2.8258712836432296, "grad_norm": 0.421875, "learning_rate": 0.0003773930297308542, "loss": 5.2003, "step": 441500 }, { "epoch": 2.829071590872724, "grad_norm": 0.419921875, "learning_rate": 0.0003773674272730182, "loss": 5.1983, "step": 442000 }, { "epoch": 2.832271898102218, "grad_norm": 0.4296875, "learning_rate": 0.00037734182481518225, "loss": 5.1978, "step": 442500 }, { "epoch": 2.835472205331712, "grad_norm": 0.42578125, "learning_rate": 0.00037731622235734634, "loss": 5.1958, "step": 443000 }, { "epoch": 2.838672512561206, "grad_norm": 0.419921875, "learning_rate": 0.0003772906198995104, "loss": 5.2009, "step": 443500 }, { "epoch": 2.8418728197906997, "grad_norm": 0.39453125, "learning_rate": 0.00037726501744167446, "loss": 5.1955, "step": 444000 }, { "epoch": 2.845073127020194, "grad_norm": 0.388671875, "learning_rate": 0.0003772394149838385, "loss": 5.2014, "step": 444500 }, { "epoch": 2.848273434249688, "grad_norm": 0.421875, "learning_rate": 0.00037721381252600253, "loss": 5.2015, "step": 445000 }, { "epoch": 2.851473741479182, "grad_norm": 0.3984375, "learning_rate": 0.00037718821006816656, "loss": 5.1953, "step": 445500 }, { "epoch": 2.854674048708676, "grad_norm": 0.3828125, "learning_rate": 0.0003771626076103306, "loss": 5.1958, "step": 446000 }, { "epoch": 2.8578743559381703, "grad_norm": 0.443359375, "learning_rate": 0.00037713700515249463, "loss": 5.2021, "step": 446500 }, { "epoch": 2.8610746631676642, "grad_norm": 0.40625, "learning_rate": 0.0003771114026946587, "loss": 5.1982, "step": 447000 }, { "epoch": 2.864274970397158, "grad_norm": 0.466796875, "learning_rate": 0.00037708580023682276, "loss": 5.1954, "step": 447500 }, { "epoch": 2.867475277626652, "grad_norm": 0.416015625, "learning_rate": 0.0003770601977789868, "loss": 5.1985, "step": 448000 }, { "epoch": 2.870675584856146, "grad_norm": 0.439453125, "learning_rate": 0.0003770345953211508, "loss": 5.2033, "step": 448500 }, { "epoch": 2.87387589208564, "grad_norm": 0.419921875, "learning_rate": 0.00037700899286331486, "loss": 5.1984, "step": 449000 }, { "epoch": 2.8770761993151344, "grad_norm": 0.404296875, "learning_rate": 0.00037698339040547895, "loss": 5.201, "step": 449500 }, { "epoch": 2.8802765065446283, "grad_norm": 0.419921875, "learning_rate": 0.000376957787947643, "loss": 5.1991, "step": 450000 }, { "epoch": 2.8834768137741222, "grad_norm": 0.40234375, "learning_rate": 0.00037693218548980707, "loss": 5.1997, "step": 450500 }, { "epoch": 2.8866771210036166, "grad_norm": 0.3984375, "learning_rate": 0.0003769065830319711, "loss": 5.1972, "step": 451000 }, { "epoch": 2.8898774282331106, "grad_norm": 0.3984375, "learning_rate": 0.00037688098057413514, "loss": 5.2039, "step": 451500 }, { "epoch": 2.8930777354626045, "grad_norm": 0.40234375, "learning_rate": 0.00037685537811629917, "loss": 5.1969, "step": 452000 }, { "epoch": 2.8962780426920984, "grad_norm": 0.40625, "learning_rate": 0.00037682977565846326, "loss": 5.1992, "step": 452500 }, { "epoch": 2.8994783499215924, "grad_norm": 0.388671875, "learning_rate": 0.0003768041732006273, "loss": 5.1965, "step": 453000 }, { "epoch": 2.9026786571510863, "grad_norm": 0.412109375, "learning_rate": 0.00037677857074279133, "loss": 5.1972, "step": 453500 }, { "epoch": 2.9058789643805807, "grad_norm": 0.458984375, "learning_rate": 0.00037675296828495536, "loss": 5.1968, "step": 454000 }, { "epoch": 2.9090792716100746, "grad_norm": 0.3984375, "learning_rate": 0.0003767273658271194, "loss": 5.1998, "step": 454500 }, { "epoch": 2.9122795788395686, "grad_norm": 0.431640625, "learning_rate": 0.0003767017633692835, "loss": 5.1934, "step": 455000 }, { "epoch": 2.9154798860690625, "grad_norm": 0.3984375, "learning_rate": 0.0003766761609114475, "loss": 5.1931, "step": 455500 }, { "epoch": 2.918680193298557, "grad_norm": 0.43359375, "learning_rate": 0.00037665055845361155, "loss": 5.2004, "step": 456000 }, { "epoch": 2.921880500528051, "grad_norm": 0.419921875, "learning_rate": 0.0003766249559957756, "loss": 5.2008, "step": 456500 }, { "epoch": 2.9250808077575448, "grad_norm": 0.396484375, "learning_rate": 0.0003765993535379396, "loss": 5.1973, "step": 457000 }, { "epoch": 2.9282811149870387, "grad_norm": 0.40234375, "learning_rate": 0.0003765737510801037, "loss": 5.1917, "step": 457500 }, { "epoch": 2.9314814222165326, "grad_norm": 0.42578125, "learning_rate": 0.00037654814862226775, "loss": 5.1964, "step": 458000 }, { "epoch": 2.9346817294460266, "grad_norm": 0.4140625, "learning_rate": 0.00037652254616443183, "loss": 5.196, "step": 458500 }, { "epoch": 2.937882036675521, "grad_norm": 0.41015625, "learning_rate": 0.00037649694370659587, "loss": 5.1973, "step": 459000 }, { "epoch": 2.941082343905015, "grad_norm": 0.458984375, "learning_rate": 0.0003764713412487599, "loss": 5.203, "step": 459500 }, { "epoch": 2.944282651134509, "grad_norm": 0.439453125, "learning_rate": 0.00037644573879092394, "loss": 5.1963, "step": 460000 }, { "epoch": 2.947482958364003, "grad_norm": 0.44140625, "learning_rate": 0.000376420136333088, "loss": 5.1969, "step": 460500 }, { "epoch": 2.950683265593497, "grad_norm": 0.494140625, "learning_rate": 0.00037639453387525206, "loss": 5.1959, "step": 461000 }, { "epoch": 2.953883572822991, "grad_norm": 0.39453125, "learning_rate": 0.0003763689314174161, "loss": 5.197, "step": 461500 }, { "epoch": 2.957083880052485, "grad_norm": 0.4140625, "learning_rate": 0.00037634332895958013, "loss": 5.1998, "step": 462000 }, { "epoch": 2.960284187281979, "grad_norm": 0.443359375, "learning_rate": 0.00037631772650174416, "loss": 5.2028, "step": 462500 }, { "epoch": 2.963484494511473, "grad_norm": 0.447265625, "learning_rate": 0.0003762921240439082, "loss": 5.1969, "step": 463000 }, { "epoch": 2.9666848017409673, "grad_norm": 0.44921875, "learning_rate": 0.0003762665215860723, "loss": 5.202, "step": 463500 }, { "epoch": 2.969885108970461, "grad_norm": 0.482421875, "learning_rate": 0.0003762409191282363, "loss": 5.1935, "step": 464000 }, { "epoch": 2.973085416199955, "grad_norm": 0.416015625, "learning_rate": 0.00037621531667040035, "loss": 5.1952, "step": 464500 }, { "epoch": 2.976285723429449, "grad_norm": 0.40234375, "learning_rate": 0.0003761897142125644, "loss": 5.1999, "step": 465000 }, { "epoch": 2.9794860306589435, "grad_norm": 0.4140625, "learning_rate": 0.0003761641117547285, "loss": 5.1928, "step": 465500 }, { "epoch": 2.9826863378884374, "grad_norm": 0.3984375, "learning_rate": 0.00037613850929689256, "loss": 5.1958, "step": 466000 }, { "epoch": 2.9858866451179313, "grad_norm": 0.40234375, "learning_rate": 0.0003761129068390566, "loss": 5.1954, "step": 466500 }, { "epoch": 2.9890869523474253, "grad_norm": 0.41015625, "learning_rate": 0.00037608730438122063, "loss": 5.1946, "step": 467000 }, { "epoch": 2.992287259576919, "grad_norm": 0.380859375, "learning_rate": 0.00037606170192338467, "loss": 5.2003, "step": 467500 }, { "epoch": 2.995487566806413, "grad_norm": 0.41015625, "learning_rate": 0.0003760360994655487, "loss": 5.1928, "step": 468000 }, { "epoch": 2.9986878740359075, "grad_norm": 0.52734375, "learning_rate": 0.00037601049700771274, "loss": 5.2001, "step": 468500 }, { "epoch": 3.0, "eval_loss": 5.188437461853027, "eval_runtime": 1.1511, "eval_samples_per_second": 868.755, "eval_steps_per_second": 13.9, "step": 468705 }, { "epoch": 3.0018881812654015, "grad_norm": 0.435546875, "learning_rate": 0.0003759848945498768, "loss": 5.2012, "step": 469000 }, { "epoch": 3.0050884884948954, "grad_norm": 0.40625, "learning_rate": 0.00037595929209204086, "loss": 5.195, "step": 469500 }, { "epoch": 3.0082887957243893, "grad_norm": 0.40234375, "learning_rate": 0.0003759336896342049, "loss": 5.1946, "step": 470000 }, { "epoch": 3.0114891029538837, "grad_norm": 0.419921875, "learning_rate": 0.0003759080871763689, "loss": 5.1945, "step": 470500 }, { "epoch": 3.0146894101833777, "grad_norm": 0.41796875, "learning_rate": 0.00037588248471853296, "loss": 5.1927, "step": 471000 }, { "epoch": 3.0178897174128716, "grad_norm": 0.404296875, "learning_rate": 0.00037585688226069705, "loss": 5.194, "step": 471500 }, { "epoch": 3.0210900246423655, "grad_norm": 0.423828125, "learning_rate": 0.0003758312798028611, "loss": 5.187, "step": 472000 }, { "epoch": 3.02429033187186, "grad_norm": 0.37890625, "learning_rate": 0.0003758056773450251, "loss": 5.195, "step": 472500 }, { "epoch": 3.027490639101354, "grad_norm": 0.41015625, "learning_rate": 0.0003757800748871892, "loss": 5.1955, "step": 473000 }, { "epoch": 3.030690946330848, "grad_norm": 0.408203125, "learning_rate": 0.00037575447242935324, "loss": 5.1973, "step": 473500 }, { "epoch": 3.0338912535603417, "grad_norm": 0.38671875, "learning_rate": 0.00037572886997151733, "loss": 5.1982, "step": 474000 }, { "epoch": 3.0370915607898357, "grad_norm": 0.38671875, "learning_rate": 0.00037570326751368136, "loss": 5.1939, "step": 474500 }, { "epoch": 3.04029186801933, "grad_norm": 0.45703125, "learning_rate": 0.0003756776650558454, "loss": 5.202, "step": 475000 }, { "epoch": 3.043492175248824, "grad_norm": 0.4453125, "learning_rate": 0.00037565206259800943, "loss": 5.1921, "step": 475500 }, { "epoch": 3.046692482478318, "grad_norm": 0.439453125, "learning_rate": 0.00037562646014017347, "loss": 5.1992, "step": 476000 }, { "epoch": 3.049892789707812, "grad_norm": 0.41796875, "learning_rate": 0.0003756008576823375, "loss": 5.1936, "step": 476500 }, { "epoch": 3.053093096937306, "grad_norm": 0.421875, "learning_rate": 0.0003755752552245016, "loss": 5.1884, "step": 477000 }, { "epoch": 3.0562934041668, "grad_norm": 0.404296875, "learning_rate": 0.0003755496527666656, "loss": 5.1873, "step": 477500 }, { "epoch": 3.059493711396294, "grad_norm": 0.40625, "learning_rate": 0.00037552405030882966, "loss": 5.1863, "step": 478000 }, { "epoch": 3.062694018625788, "grad_norm": 0.423828125, "learning_rate": 0.0003754984478509937, "loss": 5.197, "step": 478500 }, { "epoch": 3.065894325855282, "grad_norm": 0.40234375, "learning_rate": 0.0003754728453931577, "loss": 5.1983, "step": 479000 }, { "epoch": 3.069094633084776, "grad_norm": 0.43359375, "learning_rate": 0.0003754472429353218, "loss": 5.1934, "step": 479500 }, { "epoch": 3.0722949403142703, "grad_norm": 0.447265625, "learning_rate": 0.00037542164047748585, "loss": 5.1938, "step": 480000 }, { "epoch": 3.0754952475437642, "grad_norm": 0.4375, "learning_rate": 0.0003753960380196499, "loss": 5.1926, "step": 480500 }, { "epoch": 3.078695554773258, "grad_norm": 0.431640625, "learning_rate": 0.00037537043556181397, "loss": 5.1942, "step": 481000 }, { "epoch": 3.081895862002752, "grad_norm": 0.447265625, "learning_rate": 0.000375344833103978, "loss": 5.1923, "step": 481500 }, { "epoch": 3.0850961692322465, "grad_norm": 0.431640625, "learning_rate": 0.00037531923064614204, "loss": 5.1809, "step": 482000 }, { "epoch": 3.0882964764617404, "grad_norm": 0.419921875, "learning_rate": 0.00037529362818830613, "loss": 5.1921, "step": 482500 }, { "epoch": 3.0914967836912344, "grad_norm": 0.419921875, "learning_rate": 0.00037526802573047016, "loss": 5.1906, "step": 483000 }, { "epoch": 3.0946970909207283, "grad_norm": 0.44140625, "learning_rate": 0.0003752424232726342, "loss": 5.1914, "step": 483500 }, { "epoch": 3.0978973981502222, "grad_norm": 0.42578125, "learning_rate": 0.00037521682081479823, "loss": 5.1966, "step": 484000 }, { "epoch": 3.1010977053797166, "grad_norm": 0.408203125, "learning_rate": 0.00037519121835696227, "loss": 5.1933, "step": 484500 }, { "epoch": 3.1042980126092106, "grad_norm": 0.43359375, "learning_rate": 0.00037516561589912635, "loss": 5.1917, "step": 485000 }, { "epoch": 3.1074983198387045, "grad_norm": 0.408203125, "learning_rate": 0.0003751400134412904, "loss": 5.1991, "step": 485500 }, { "epoch": 3.1106986270681984, "grad_norm": 0.423828125, "learning_rate": 0.0003751144109834544, "loss": 5.1917, "step": 486000 }, { "epoch": 3.1138989342976924, "grad_norm": 0.435546875, "learning_rate": 0.00037508880852561846, "loss": 5.192, "step": 486500 }, { "epoch": 3.1170992415271868, "grad_norm": 0.41796875, "learning_rate": 0.0003750632060677825, "loss": 5.1985, "step": 487000 }, { "epoch": 3.1202995487566807, "grad_norm": 0.408203125, "learning_rate": 0.0003750376036099466, "loss": 5.1917, "step": 487500 }, { "epoch": 3.1234998559861746, "grad_norm": 0.416015625, "learning_rate": 0.0003750120011521106, "loss": 5.1907, "step": 488000 }, { "epoch": 3.1267001632156686, "grad_norm": 0.408203125, "learning_rate": 0.0003749863986942747, "loss": 5.1917, "step": 488500 }, { "epoch": 3.129900470445163, "grad_norm": 0.416015625, "learning_rate": 0.00037496079623643874, "loss": 5.1896, "step": 489000 }, { "epoch": 3.133100777674657, "grad_norm": 0.458984375, "learning_rate": 0.00037493519377860277, "loss": 5.1971, "step": 489500 }, { "epoch": 3.136301084904151, "grad_norm": 0.4296875, "learning_rate": 0.0003749095913207668, "loss": 5.1931, "step": 490000 }, { "epoch": 3.1395013921336448, "grad_norm": 0.4140625, "learning_rate": 0.0003748839888629309, "loss": 5.1998, "step": 490500 }, { "epoch": 3.1427016993631387, "grad_norm": 0.427734375, "learning_rate": 0.00037485838640509493, "loss": 5.1847, "step": 491000 }, { "epoch": 3.145902006592633, "grad_norm": 0.474609375, "learning_rate": 0.00037483278394725896, "loss": 5.1932, "step": 491500 }, { "epoch": 3.149102313822127, "grad_norm": 0.419921875, "learning_rate": 0.000374807181489423, "loss": 5.1958, "step": 492000 }, { "epoch": 3.152302621051621, "grad_norm": 0.4140625, "learning_rate": 0.00037478157903158703, "loss": 5.191, "step": 492500 }, { "epoch": 3.155502928281115, "grad_norm": 0.419921875, "learning_rate": 0.00037475597657375106, "loss": 5.1945, "step": 493000 }, { "epoch": 3.158703235510609, "grad_norm": 0.427734375, "learning_rate": 0.00037473037411591515, "loss": 5.1945, "step": 493500 }, { "epoch": 3.161903542740103, "grad_norm": 0.427734375, "learning_rate": 0.0003747047716580792, "loss": 5.1986, "step": 494000 }, { "epoch": 3.165103849969597, "grad_norm": 0.400390625, "learning_rate": 0.0003746791692002432, "loss": 5.1928, "step": 494500 }, { "epoch": 3.168304157199091, "grad_norm": 0.455078125, "learning_rate": 0.00037465356674240726, "loss": 5.195, "step": 495000 }, { "epoch": 3.171504464428585, "grad_norm": 0.41015625, "learning_rate": 0.00037462796428457134, "loss": 5.1943, "step": 495500 }, { "epoch": 3.174704771658079, "grad_norm": 0.43359375, "learning_rate": 0.0003746023618267354, "loss": 5.1968, "step": 496000 }, { "epoch": 3.1779050788875733, "grad_norm": 0.4453125, "learning_rate": 0.00037457675936889947, "loss": 5.1964, "step": 496500 }, { "epoch": 3.1811053861170673, "grad_norm": 0.443359375, "learning_rate": 0.0003745511569110635, "loss": 5.1988, "step": 497000 }, { "epoch": 3.184305693346561, "grad_norm": 0.447265625, "learning_rate": 0.00037452555445322754, "loss": 5.195, "step": 497500 }, { "epoch": 3.187506000576055, "grad_norm": 0.439453125, "learning_rate": 0.00037449995199539157, "loss": 5.1972, "step": 498000 }, { "epoch": 3.1907063078055495, "grad_norm": 0.421875, "learning_rate": 0.0003744743495375556, "loss": 5.1964, "step": 498500 }, { "epoch": 3.1939066150350435, "grad_norm": 0.45703125, "learning_rate": 0.0003744487470797197, "loss": 5.1935, "step": 499000 }, { "epoch": 3.1971069222645374, "grad_norm": 0.40234375, "learning_rate": 0.0003744231446218837, "loss": 5.1911, "step": 499500 }, { "epoch": 3.2003072294940313, "grad_norm": 0.42578125, "learning_rate": 0.00037439754216404776, "loss": 5.1889, "step": 500000 }, { "epoch": 3.2035075367235253, "grad_norm": 0.43359375, "learning_rate": 0.0003743719397062118, "loss": 5.1899, "step": 500500 }, { "epoch": 3.2067078439530197, "grad_norm": 0.4609375, "learning_rate": 0.00037434633724837583, "loss": 5.1958, "step": 501000 }, { "epoch": 3.2099081511825136, "grad_norm": 0.455078125, "learning_rate": 0.0003743207347905399, "loss": 5.1986, "step": 501500 }, { "epoch": 3.2131084584120075, "grad_norm": 0.4453125, "learning_rate": 0.00037429513233270395, "loss": 5.1947, "step": 502000 }, { "epoch": 3.2163087656415015, "grad_norm": 0.4453125, "learning_rate": 0.000374269529874868, "loss": 5.188, "step": 502500 }, { "epoch": 3.2195090728709954, "grad_norm": 0.3671875, "learning_rate": 0.0003742439274170321, "loss": 5.1988, "step": 503000 }, { "epoch": 3.22270938010049, "grad_norm": 0.42578125, "learning_rate": 0.0003742183249591961, "loss": 5.1933, "step": 503500 }, { "epoch": 3.2259096873299837, "grad_norm": 0.38671875, "learning_rate": 0.00037419272250136014, "loss": 5.1946, "step": 504000 }, { "epoch": 3.2291099945594777, "grad_norm": 0.4609375, "learning_rate": 0.00037416712004352423, "loss": 5.1928, "step": 504500 }, { "epoch": 3.2323103017889716, "grad_norm": 0.43359375, "learning_rate": 0.00037414151758568827, "loss": 5.1958, "step": 505000 }, { "epoch": 3.2355106090184655, "grad_norm": 0.416015625, "learning_rate": 0.0003741159151278523, "loss": 5.1934, "step": 505500 }, { "epoch": 3.23871091624796, "grad_norm": 0.5546875, "learning_rate": 0.00037409031267001633, "loss": 5.197, "step": 506000 }, { "epoch": 3.241911223477454, "grad_norm": 0.4140625, "learning_rate": 0.00037406471021218037, "loss": 5.1905, "step": 506500 }, { "epoch": 3.245111530706948, "grad_norm": 0.447265625, "learning_rate": 0.00037403910775434446, "loss": 5.1953, "step": 507000 }, { "epoch": 3.2483118379364417, "grad_norm": 0.4609375, "learning_rate": 0.0003740135052965085, "loss": 5.1952, "step": 507500 }, { "epoch": 3.251512145165936, "grad_norm": 0.42578125, "learning_rate": 0.0003739879028386725, "loss": 5.1929, "step": 508000 }, { "epoch": 3.25471245239543, "grad_norm": 0.435546875, "learning_rate": 0.00037396230038083656, "loss": 5.1908, "step": 508500 }, { "epoch": 3.257912759624924, "grad_norm": 0.4609375, "learning_rate": 0.0003739366979230006, "loss": 5.1931, "step": 509000 }, { "epoch": 3.261113066854418, "grad_norm": 0.423828125, "learning_rate": 0.0003739110954651647, "loss": 5.1924, "step": 509500 }, { "epoch": 3.264313374083912, "grad_norm": 0.4609375, "learning_rate": 0.0003738854930073287, "loss": 5.1907, "step": 510000 }, { "epoch": 3.2675136813134062, "grad_norm": 0.431640625, "learning_rate": 0.00037385989054949275, "loss": 5.1911, "step": 510500 }, { "epoch": 3.2707139885429, "grad_norm": 0.443359375, "learning_rate": 0.00037383428809165684, "loss": 5.1866, "step": 511000 }, { "epoch": 3.273914295772394, "grad_norm": 0.44921875, "learning_rate": 0.0003738086856338209, "loss": 5.1897, "step": 511500 }, { "epoch": 3.277114603001888, "grad_norm": 0.423828125, "learning_rate": 0.0003737830831759849, "loss": 5.1928, "step": 512000 }, { "epoch": 3.2803149102313824, "grad_norm": 0.41796875, "learning_rate": 0.000373757480718149, "loss": 5.1908, "step": 512500 }, { "epoch": 3.2835152174608764, "grad_norm": 0.453125, "learning_rate": 0.00037373187826031303, "loss": 5.1901, "step": 513000 }, { "epoch": 3.2867155246903703, "grad_norm": 0.453125, "learning_rate": 0.00037370627580247706, "loss": 5.1933, "step": 513500 }, { "epoch": 3.2899158319198643, "grad_norm": 0.447265625, "learning_rate": 0.0003736806733446411, "loss": 5.1916, "step": 514000 }, { "epoch": 3.293116139149358, "grad_norm": 0.451171875, "learning_rate": 0.00037365507088680513, "loss": 5.1879, "step": 514500 }, { "epoch": 3.296316446378852, "grad_norm": 0.462890625, "learning_rate": 0.0003736294684289692, "loss": 5.1881, "step": 515000 }, { "epoch": 3.2995167536083465, "grad_norm": 0.44921875, "learning_rate": 0.00037360386597113326, "loss": 5.1928, "step": 515500 }, { "epoch": 3.3027170608378404, "grad_norm": 0.4609375, "learning_rate": 0.0003735782635132973, "loss": 5.1954, "step": 516000 }, { "epoch": 3.3059173680673344, "grad_norm": 0.42578125, "learning_rate": 0.0003735526610554613, "loss": 5.1907, "step": 516500 }, { "epoch": 3.3091176752968283, "grad_norm": 0.42578125, "learning_rate": 0.00037352705859762536, "loss": 5.1956, "step": 517000 }, { "epoch": 3.3123179825263227, "grad_norm": 0.423828125, "learning_rate": 0.0003735014561397894, "loss": 5.1939, "step": 517500 }, { "epoch": 3.3155182897558166, "grad_norm": 0.408203125, "learning_rate": 0.0003734758536819535, "loss": 5.1891, "step": 518000 }, { "epoch": 3.3187185969853106, "grad_norm": 0.490234375, "learning_rate": 0.0003734502512241175, "loss": 5.1844, "step": 518500 }, { "epoch": 3.3219189042148045, "grad_norm": 0.443359375, "learning_rate": 0.0003734246487662816, "loss": 5.1886, "step": 519000 }, { "epoch": 3.3251192114442985, "grad_norm": 0.458984375, "learning_rate": 0.00037339904630844564, "loss": 5.1915, "step": 519500 }, { "epoch": 3.328319518673793, "grad_norm": 0.443359375, "learning_rate": 0.00037337344385060967, "loss": 5.1936, "step": 520000 }, { "epoch": 3.3315198259032868, "grad_norm": 0.44140625, "learning_rate": 0.00037334784139277376, "loss": 5.1928, "step": 520500 }, { "epoch": 3.3347201331327807, "grad_norm": 0.423828125, "learning_rate": 0.0003733222389349378, "loss": 5.1946, "step": 521000 }, { "epoch": 3.3379204403622746, "grad_norm": 0.443359375, "learning_rate": 0.00037329663647710183, "loss": 5.1969, "step": 521500 }, { "epoch": 3.341120747591769, "grad_norm": 0.4375, "learning_rate": 0.00037327103401926586, "loss": 5.1986, "step": 522000 }, { "epoch": 3.344321054821263, "grad_norm": 0.43359375, "learning_rate": 0.0003732454315614299, "loss": 5.1924, "step": 522500 }, { "epoch": 3.347521362050757, "grad_norm": 0.478515625, "learning_rate": 0.00037321982910359393, "loss": 5.1939, "step": 523000 }, { "epoch": 3.350721669280251, "grad_norm": 0.435546875, "learning_rate": 0.000373194226645758, "loss": 5.1984, "step": 523500 }, { "epoch": 3.3539219765097448, "grad_norm": 0.404296875, "learning_rate": 0.00037316862418792205, "loss": 5.1929, "step": 524000 }, { "epoch": 3.357122283739239, "grad_norm": 0.4375, "learning_rate": 0.0003731430217300861, "loss": 5.1921, "step": 524500 }, { "epoch": 3.360322590968733, "grad_norm": 0.408203125, "learning_rate": 0.0003731174192722501, "loss": 5.1959, "step": 525000 }, { "epoch": 3.363522898198227, "grad_norm": 0.44921875, "learning_rate": 0.0003730918168144142, "loss": 5.1947, "step": 525500 }, { "epoch": 3.366723205427721, "grad_norm": 0.427734375, "learning_rate": 0.00037306621435657825, "loss": 5.1939, "step": 526000 }, { "epoch": 3.369923512657215, "grad_norm": 0.416015625, "learning_rate": 0.00037304061189874233, "loss": 5.2001, "step": 526500 }, { "epoch": 3.3731238198867093, "grad_norm": 0.46875, "learning_rate": 0.00037301500944090637, "loss": 5.1944, "step": 527000 }, { "epoch": 3.376324127116203, "grad_norm": 0.443359375, "learning_rate": 0.0003729894069830704, "loss": 5.1885, "step": 527500 }, { "epoch": 3.379524434345697, "grad_norm": 0.408203125, "learning_rate": 0.00037296380452523444, "loss": 5.1954, "step": 528000 }, { "epoch": 3.382724741575191, "grad_norm": 0.4609375, "learning_rate": 0.00037293820206739847, "loss": 5.1898, "step": 528500 }, { "epoch": 3.385925048804685, "grad_norm": 0.431640625, "learning_rate": 0.00037291259960956256, "loss": 5.1952, "step": 529000 }, { "epoch": 3.3891253560341794, "grad_norm": 0.431640625, "learning_rate": 0.0003728869971517266, "loss": 5.1911, "step": 529500 }, { "epoch": 3.3923256632636734, "grad_norm": 0.439453125, "learning_rate": 0.00037286139469389063, "loss": 5.1901, "step": 530000 }, { "epoch": 3.3955259704931673, "grad_norm": 0.40625, "learning_rate": 0.00037283579223605466, "loss": 5.1941, "step": 530500 }, { "epoch": 3.3987262777226612, "grad_norm": 0.416015625, "learning_rate": 0.0003728101897782187, "loss": 5.1962, "step": 531000 }, { "epoch": 3.4019265849521556, "grad_norm": 0.4296875, "learning_rate": 0.0003727845873203828, "loss": 5.1943, "step": 531500 }, { "epoch": 3.4051268921816495, "grad_norm": 0.427734375, "learning_rate": 0.0003727589848625468, "loss": 5.1894, "step": 532000 }, { "epoch": 3.4083271994111435, "grad_norm": 0.419921875, "learning_rate": 0.00037273338240471085, "loss": 5.1891, "step": 532500 }, { "epoch": 3.4115275066406374, "grad_norm": 0.43359375, "learning_rate": 0.0003727077799468749, "loss": 5.1877, "step": 533000 }, { "epoch": 3.4147278138701314, "grad_norm": 0.423828125, "learning_rate": 0.000372682177489039, "loss": 5.1948, "step": 533500 }, { "epoch": 3.4179281210996257, "grad_norm": 0.44140625, "learning_rate": 0.000372656575031203, "loss": 5.1937, "step": 534000 }, { "epoch": 3.4211284283291197, "grad_norm": 0.4375, "learning_rate": 0.0003726309725733671, "loss": 5.1932, "step": 534500 }, { "epoch": 3.4243287355586136, "grad_norm": 0.404296875, "learning_rate": 0.00037260537011553113, "loss": 5.1979, "step": 535000 }, { "epoch": 3.4275290427881075, "grad_norm": 0.466796875, "learning_rate": 0.00037257976765769517, "loss": 5.1899, "step": 535500 }, { "epoch": 3.430729350017602, "grad_norm": 0.43359375, "learning_rate": 0.0003725541651998592, "loss": 5.1883, "step": 536000 }, { "epoch": 3.433929657247096, "grad_norm": 0.470703125, "learning_rate": 0.00037252856274202324, "loss": 5.1951, "step": 536500 }, { "epoch": 3.43712996447659, "grad_norm": 0.462890625, "learning_rate": 0.0003725029602841873, "loss": 5.1938, "step": 537000 }, { "epoch": 3.4403302717060837, "grad_norm": 0.41015625, "learning_rate": 0.00037247735782635136, "loss": 5.1935, "step": 537500 }, { "epoch": 3.4435305789355777, "grad_norm": 0.447265625, "learning_rate": 0.0003724517553685154, "loss": 5.1865, "step": 538000 }, { "epoch": 3.4467308861650716, "grad_norm": 0.423828125, "learning_rate": 0.00037242615291067943, "loss": 5.1933, "step": 538500 }, { "epoch": 3.449931193394566, "grad_norm": 0.447265625, "learning_rate": 0.00037240055045284346, "loss": 5.1946, "step": 539000 }, { "epoch": 3.45313150062406, "grad_norm": 0.45703125, "learning_rate": 0.0003723749479950075, "loss": 5.1926, "step": 539500 }, { "epoch": 3.456331807853554, "grad_norm": 0.451171875, "learning_rate": 0.0003723493455371716, "loss": 5.1921, "step": 540000 }, { "epoch": 3.459532115083048, "grad_norm": 0.44140625, "learning_rate": 0.0003723237430793356, "loss": 5.1952, "step": 540500 }, { "epoch": 3.462732422312542, "grad_norm": 0.42578125, "learning_rate": 0.0003722981406214997, "loss": 5.1905, "step": 541000 }, { "epoch": 3.465932729542036, "grad_norm": 0.427734375, "learning_rate": 0.00037227253816366374, "loss": 5.1907, "step": 541500 }, { "epoch": 3.46913303677153, "grad_norm": 0.451171875, "learning_rate": 0.0003722469357058278, "loss": 5.1925, "step": 542000 }, { "epoch": 3.472333344001024, "grad_norm": 0.443359375, "learning_rate": 0.00037222133324799186, "loss": 5.1903, "step": 542500 }, { "epoch": 3.475533651230518, "grad_norm": 0.45703125, "learning_rate": 0.0003721957307901559, "loss": 5.1954, "step": 543000 }, { "epoch": 3.4787339584600123, "grad_norm": 0.486328125, "learning_rate": 0.00037217012833231993, "loss": 5.1947, "step": 543500 }, { "epoch": 3.4819342656895063, "grad_norm": 0.484375, "learning_rate": 0.00037214452587448397, "loss": 5.1955, "step": 544000 }, { "epoch": 3.485134572919, "grad_norm": 0.48046875, "learning_rate": 0.000372118923416648, "loss": 5.195, "step": 544500 }, { "epoch": 3.488334880148494, "grad_norm": 0.416015625, "learning_rate": 0.00037209332095881203, "loss": 5.1956, "step": 545000 }, { "epoch": 3.4915351873779885, "grad_norm": 0.3984375, "learning_rate": 0.0003720677185009761, "loss": 5.1975, "step": 545500 }, { "epoch": 3.4947354946074825, "grad_norm": 0.439453125, "learning_rate": 0.00037204211604314016, "loss": 5.1947, "step": 546000 }, { "epoch": 3.4979358018369764, "grad_norm": 0.46484375, "learning_rate": 0.0003720165135853042, "loss": 5.1972, "step": 546500 }, { "epoch": 3.5011361090664703, "grad_norm": 0.478515625, "learning_rate": 0.0003719909111274682, "loss": 5.1907, "step": 547000 }, { "epoch": 3.5043364162959643, "grad_norm": 0.41796875, "learning_rate": 0.00037196530866963226, "loss": 5.1957, "step": 547500 }, { "epoch": 3.507536723525458, "grad_norm": 0.431640625, "learning_rate": 0.00037193970621179635, "loss": 5.1908, "step": 548000 }, { "epoch": 3.5107370307549526, "grad_norm": 0.45703125, "learning_rate": 0.0003719141037539604, "loss": 5.1912, "step": 548500 }, { "epoch": 3.5139373379844465, "grad_norm": 0.4296875, "learning_rate": 0.00037188850129612447, "loss": 5.1996, "step": 549000 }, { "epoch": 3.5171376452139405, "grad_norm": 0.4375, "learning_rate": 0.0003718628988382885, "loss": 5.1944, "step": 549500 }, { "epoch": 3.520337952443435, "grad_norm": 0.44921875, "learning_rate": 0.00037183729638045254, "loss": 5.1949, "step": 550000 }, { "epoch": 3.5235382596729288, "grad_norm": 0.41015625, "learning_rate": 0.00037181169392261663, "loss": 5.1878, "step": 550500 }, { "epoch": 3.5267385669024227, "grad_norm": 0.48046875, "learning_rate": 0.00037178609146478066, "loss": 5.1897, "step": 551000 }, { "epoch": 3.5299388741319166, "grad_norm": 0.423828125, "learning_rate": 0.0003717604890069447, "loss": 5.1953, "step": 551500 }, { "epoch": 3.5331391813614106, "grad_norm": 0.455078125, "learning_rate": 0.00037173488654910873, "loss": 5.1928, "step": 552000 }, { "epoch": 3.5363394885909045, "grad_norm": 0.46875, "learning_rate": 0.00037170928409127277, "loss": 5.1918, "step": 552500 }, { "epoch": 3.539539795820399, "grad_norm": 0.45703125, "learning_rate": 0.0003716836816334368, "loss": 5.1947, "step": 553000 }, { "epoch": 3.542740103049893, "grad_norm": 0.423828125, "learning_rate": 0.0003716580791756009, "loss": 5.1898, "step": 553500 }, { "epoch": 3.545940410279387, "grad_norm": 0.431640625, "learning_rate": 0.0003716324767177649, "loss": 5.194, "step": 554000 }, { "epoch": 3.5491407175088807, "grad_norm": 0.46484375, "learning_rate": 0.00037160687425992896, "loss": 5.1985, "step": 554500 }, { "epoch": 3.552341024738375, "grad_norm": 0.431640625, "learning_rate": 0.000371581271802093, "loss": 5.1973, "step": 555000 }, { "epoch": 3.555541331967869, "grad_norm": 0.41796875, "learning_rate": 0.000371555669344257, "loss": 5.1947, "step": 555500 }, { "epoch": 3.558741639197363, "grad_norm": 0.458984375, "learning_rate": 0.0003715300668864211, "loss": 5.1974, "step": 556000 }, { "epoch": 3.561941946426857, "grad_norm": 0.408203125, "learning_rate": 0.0003715044644285852, "loss": 5.1936, "step": 556500 }, { "epoch": 3.565142253656351, "grad_norm": 0.4375, "learning_rate": 0.00037147886197074924, "loss": 5.197, "step": 557000 }, { "epoch": 3.568342560885845, "grad_norm": 0.435546875, "learning_rate": 0.00037145325951291327, "loss": 5.1962, "step": 557500 }, { "epoch": 3.571542868115339, "grad_norm": 0.439453125, "learning_rate": 0.0003714276570550773, "loss": 5.1893, "step": 558000 }, { "epoch": 3.574743175344833, "grad_norm": 0.474609375, "learning_rate": 0.00037140205459724134, "loss": 5.1963, "step": 558500 }, { "epoch": 3.577943482574327, "grad_norm": 0.4296875, "learning_rate": 0.00037137645213940543, "loss": 5.1885, "step": 559000 }, { "epoch": 3.5811437898038214, "grad_norm": 0.451171875, "learning_rate": 0.00037135084968156946, "loss": 5.192, "step": 559500 }, { "epoch": 3.5843440970333154, "grad_norm": 0.42578125, "learning_rate": 0.0003713252472237335, "loss": 5.1898, "step": 560000 }, { "epoch": 3.5875444042628093, "grad_norm": 0.458984375, "learning_rate": 0.00037129964476589753, "loss": 5.1964, "step": 560500 }, { "epoch": 3.5907447114923032, "grad_norm": 0.43359375, "learning_rate": 0.00037127404230806156, "loss": 5.1908, "step": 561000 }, { "epoch": 3.593945018721797, "grad_norm": 0.44140625, "learning_rate": 0.00037124843985022565, "loss": 5.1914, "step": 561500 }, { "epoch": 3.597145325951291, "grad_norm": 0.4140625, "learning_rate": 0.0003712228373923897, "loss": 5.1917, "step": 562000 }, { "epoch": 3.6003456331807855, "grad_norm": 0.416015625, "learning_rate": 0.0003711972349345537, "loss": 5.1943, "step": 562500 }, { "epoch": 3.6035459404102794, "grad_norm": 0.4453125, "learning_rate": 0.00037117163247671776, "loss": 5.1978, "step": 563000 }, { "epoch": 3.6067462476397734, "grad_norm": 0.439453125, "learning_rate": 0.00037114603001888184, "loss": 5.1876, "step": 563500 }, { "epoch": 3.6099465548692673, "grad_norm": 0.41796875, "learning_rate": 0.0003711204275610459, "loss": 5.191, "step": 564000 }, { "epoch": 3.6131468620987617, "grad_norm": 0.431640625, "learning_rate": 0.00037109482510320997, "loss": 5.1959, "step": 564500 }, { "epoch": 3.6163471693282556, "grad_norm": 0.44140625, "learning_rate": 0.000371069222645374, "loss": 5.1885, "step": 565000 }, { "epoch": 3.6195474765577496, "grad_norm": 0.421875, "learning_rate": 0.00037104362018753804, "loss": 5.1891, "step": 565500 }, { "epoch": 3.6227477837872435, "grad_norm": 0.5, "learning_rate": 0.00037101801772970207, "loss": 5.1944, "step": 566000 }, { "epoch": 3.6259480910167374, "grad_norm": 0.427734375, "learning_rate": 0.0003709924152718661, "loss": 5.1902, "step": 566500 }, { "epoch": 3.6291483982462314, "grad_norm": 0.427734375, "learning_rate": 0.0003709668128140302, "loss": 5.1885, "step": 567000 }, { "epoch": 3.6323487054757257, "grad_norm": 0.51171875, "learning_rate": 0.0003709412103561942, "loss": 5.189, "step": 567500 }, { "epoch": 3.6355490127052197, "grad_norm": 0.435546875, "learning_rate": 0.00037091560789835826, "loss": 5.1871, "step": 568000 }, { "epoch": 3.6387493199347136, "grad_norm": 0.423828125, "learning_rate": 0.0003708900054405223, "loss": 5.1998, "step": 568500 }, { "epoch": 3.641949627164208, "grad_norm": 0.458984375, "learning_rate": 0.00037086440298268633, "loss": 5.1883, "step": 569000 }, { "epoch": 3.645149934393702, "grad_norm": 0.484375, "learning_rate": 0.00037083880052485036, "loss": 5.1916, "step": 569500 }, { "epoch": 3.648350241623196, "grad_norm": 0.46484375, "learning_rate": 0.00037081319806701445, "loss": 5.1919, "step": 570000 }, { "epoch": 3.65155054885269, "grad_norm": 0.48046875, "learning_rate": 0.0003707875956091785, "loss": 5.1965, "step": 570500 }, { "epoch": 3.6547508560821838, "grad_norm": 0.435546875, "learning_rate": 0.0003707619931513425, "loss": 5.1954, "step": 571000 }, { "epoch": 3.6579511633116777, "grad_norm": 0.46484375, "learning_rate": 0.0003707363906935066, "loss": 5.1963, "step": 571500 }, { "epoch": 3.661151470541172, "grad_norm": 0.478515625, "learning_rate": 0.00037071078823567064, "loss": 5.1893, "step": 572000 }, { "epoch": 3.664351777770666, "grad_norm": 0.451171875, "learning_rate": 0.00037068518577783473, "loss": 5.1916, "step": 572500 }, { "epoch": 3.66755208500016, "grad_norm": 0.4296875, "learning_rate": 0.00037065958331999877, "loss": 5.1885, "step": 573000 }, { "epoch": 3.670752392229654, "grad_norm": 0.44921875, "learning_rate": 0.0003706339808621628, "loss": 5.1929, "step": 573500 }, { "epoch": 3.6739526994591483, "grad_norm": 0.482421875, "learning_rate": 0.00037060837840432683, "loss": 5.1891, "step": 574000 }, { "epoch": 3.677153006688642, "grad_norm": 0.48046875, "learning_rate": 0.00037058277594649087, "loss": 5.1971, "step": 574500 }, { "epoch": 3.680353313918136, "grad_norm": 0.46875, "learning_rate": 0.0003705571734886549, "loss": 5.201, "step": 575000 }, { "epoch": 3.68355362114763, "grad_norm": 0.478515625, "learning_rate": 0.000370531571030819, "loss": 5.1911, "step": 575500 }, { "epoch": 3.686753928377124, "grad_norm": 0.408203125, "learning_rate": 0.000370505968572983, "loss": 5.18, "step": 576000 }, { "epoch": 3.6899542356066184, "grad_norm": 0.474609375, "learning_rate": 0.00037048036611514706, "loss": 5.1987, "step": 576500 }, { "epoch": 3.6931545428361123, "grad_norm": 0.43359375, "learning_rate": 0.0003704547636573111, "loss": 5.1972, "step": 577000 }, { "epoch": 3.6963548500656063, "grad_norm": 0.470703125, "learning_rate": 0.00037042916119947513, "loss": 5.1944, "step": 577500 }, { "epoch": 3.6995551572951, "grad_norm": 0.4765625, "learning_rate": 0.0003704035587416392, "loss": 5.1943, "step": 578000 }, { "epoch": 3.7027554645245946, "grad_norm": 0.412109375, "learning_rate": 0.00037037795628380325, "loss": 5.1932, "step": 578500 }, { "epoch": 3.7059557717540885, "grad_norm": 0.419921875, "learning_rate": 0.00037035235382596734, "loss": 5.198, "step": 579000 }, { "epoch": 3.7091560789835825, "grad_norm": 0.484375, "learning_rate": 0.0003703267513681314, "loss": 5.201, "step": 579500 }, { "epoch": 3.7123563862130764, "grad_norm": 0.451171875, "learning_rate": 0.0003703011489102954, "loss": 5.191, "step": 580000 }, { "epoch": 3.7155566934425703, "grad_norm": 0.453125, "learning_rate": 0.00037027554645245944, "loss": 5.1979, "step": 580500 }, { "epoch": 3.7187570006720643, "grad_norm": 0.46875, "learning_rate": 0.00037024994399462353, "loss": 5.1952, "step": 581000 }, { "epoch": 3.7219573079015587, "grad_norm": 0.443359375, "learning_rate": 0.00037022434153678756, "loss": 5.1955, "step": 581500 }, { "epoch": 3.7251576151310526, "grad_norm": 0.470703125, "learning_rate": 0.0003701987390789516, "loss": 5.1907, "step": 582000 }, { "epoch": 3.7283579223605465, "grad_norm": 0.462890625, "learning_rate": 0.00037017313662111563, "loss": 5.189, "step": 582500 }, { "epoch": 3.731558229590041, "grad_norm": 0.4140625, "learning_rate": 0.00037014753416327967, "loss": 5.1898, "step": 583000 }, { "epoch": 3.734758536819535, "grad_norm": 0.47265625, "learning_rate": 0.00037012193170544376, "loss": 5.1901, "step": 583500 }, { "epoch": 3.737958844049029, "grad_norm": 0.439453125, "learning_rate": 0.0003700963292476078, "loss": 5.1962, "step": 584000 }, { "epoch": 3.7411591512785227, "grad_norm": 0.431640625, "learning_rate": 0.0003700707267897718, "loss": 5.1929, "step": 584500 }, { "epoch": 3.7443594585080167, "grad_norm": 0.44921875, "learning_rate": 0.00037004512433193586, "loss": 5.1978, "step": 585000 }, { "epoch": 3.7475597657375106, "grad_norm": 0.44140625, "learning_rate": 0.0003700195218740999, "loss": 5.1906, "step": 585500 }, { "epoch": 3.750760072967005, "grad_norm": 0.4453125, "learning_rate": 0.000369993919416264, "loss": 5.1892, "step": 586000 }, { "epoch": 3.753960380196499, "grad_norm": 0.451171875, "learning_rate": 0.000369968316958428, "loss": 5.191, "step": 586500 }, { "epoch": 3.757160687425993, "grad_norm": 0.470703125, "learning_rate": 0.0003699427145005921, "loss": 5.1967, "step": 587000 }, { "epoch": 3.760360994655487, "grad_norm": 0.486328125, "learning_rate": 0.00036991711204275614, "loss": 5.1956, "step": 587500 }, { "epoch": 3.763561301884981, "grad_norm": 0.46484375, "learning_rate": 0.00036989150958492017, "loss": 5.1934, "step": 588000 }, { "epoch": 3.766761609114475, "grad_norm": 0.5, "learning_rate": 0.0003698659071270842, "loss": 5.1938, "step": 588500 }, { "epoch": 3.769961916343969, "grad_norm": 0.4609375, "learning_rate": 0.0003698403046692483, "loss": 5.1923, "step": 589000 }, { "epoch": 3.773162223573463, "grad_norm": 0.44140625, "learning_rate": 0.00036981470221141233, "loss": 5.1926, "step": 589500 }, { "epoch": 3.776362530802957, "grad_norm": 0.5, "learning_rate": 0.00036978909975357636, "loss": 5.1917, "step": 590000 }, { "epoch": 3.779562838032451, "grad_norm": 0.427734375, "learning_rate": 0.0003697634972957404, "loss": 5.1902, "step": 590500 }, { "epoch": 3.7827631452619452, "grad_norm": 0.46875, "learning_rate": 0.00036973789483790443, "loss": 5.1998, "step": 591000 }, { "epoch": 3.785963452491439, "grad_norm": 0.421875, "learning_rate": 0.0003697122923800685, "loss": 5.1894, "step": 591500 }, { "epoch": 3.789163759720933, "grad_norm": 0.466796875, "learning_rate": 0.00036968668992223255, "loss": 5.1918, "step": 592000 }, { "epoch": 3.7923640669504275, "grad_norm": 0.443359375, "learning_rate": 0.0003696610874643966, "loss": 5.1876, "step": 592500 }, { "epoch": 3.7955643741799214, "grad_norm": 0.51171875, "learning_rate": 0.0003696354850065606, "loss": 5.2001, "step": 593000 }, { "epoch": 3.7987646814094154, "grad_norm": 0.435546875, "learning_rate": 0.00036960988254872466, "loss": 5.1952, "step": 593500 }, { "epoch": 3.8019649886389093, "grad_norm": 0.451171875, "learning_rate": 0.00036958428009088875, "loss": 5.192, "step": 594000 }, { "epoch": 3.8051652958684032, "grad_norm": 0.51171875, "learning_rate": 0.00036955867763305283, "loss": 5.1968, "step": 594500 }, { "epoch": 3.808365603097897, "grad_norm": 0.455078125, "learning_rate": 0.00036953307517521687, "loss": 5.1993, "step": 595000 }, { "epoch": 3.8115659103273916, "grad_norm": 0.44921875, "learning_rate": 0.0003695074727173809, "loss": 5.1949, "step": 595500 }, { "epoch": 3.8147662175568855, "grad_norm": 0.50390625, "learning_rate": 0.00036948187025954494, "loss": 5.1955, "step": 596000 }, { "epoch": 3.8179665247863794, "grad_norm": 0.4375, "learning_rate": 0.00036945626780170897, "loss": 5.1966, "step": 596500 }, { "epoch": 3.8211668320158734, "grad_norm": 0.5078125, "learning_rate": 0.00036943066534387306, "loss": 5.1922, "step": 597000 }, { "epoch": 3.8243671392453678, "grad_norm": 0.453125, "learning_rate": 0.0003694050628860371, "loss": 5.188, "step": 597500 }, { "epoch": 3.8275674464748617, "grad_norm": 0.404296875, "learning_rate": 0.00036937946042820113, "loss": 5.1951, "step": 598000 }, { "epoch": 3.8307677537043556, "grad_norm": 0.404296875, "learning_rate": 0.00036935385797036516, "loss": 5.1962, "step": 598500 }, { "epoch": 3.8339680609338496, "grad_norm": 0.4609375, "learning_rate": 0.0003693282555125292, "loss": 5.1961, "step": 599000 }, { "epoch": 3.8371683681633435, "grad_norm": 0.4296875, "learning_rate": 0.00036930265305469323, "loss": 5.1996, "step": 599500 }, { "epoch": 3.8403686753928374, "grad_norm": 0.427734375, "learning_rate": 0.0003692770505968573, "loss": 5.1948, "step": 600000 }, { "epoch": 3.843568982622332, "grad_norm": 0.46484375, "learning_rate": 0.00036925144813902135, "loss": 5.1891, "step": 600500 }, { "epoch": 3.8467692898518258, "grad_norm": 0.48046875, "learning_rate": 0.0003692258456811854, "loss": 5.1954, "step": 601000 }, { "epoch": 3.8499695970813197, "grad_norm": 0.443359375, "learning_rate": 0.0003692002432233495, "loss": 5.1881, "step": 601500 }, { "epoch": 3.853169904310814, "grad_norm": 0.4296875, "learning_rate": 0.0003691746407655135, "loss": 5.1938, "step": 602000 }, { "epoch": 3.856370211540308, "grad_norm": 0.447265625, "learning_rate": 0.0003691490383076776, "loss": 5.1946, "step": 602500 }, { "epoch": 3.859570518769802, "grad_norm": 0.46484375, "learning_rate": 0.00036912343584984163, "loss": 5.1926, "step": 603000 }, { "epoch": 3.862770825999296, "grad_norm": 0.43359375, "learning_rate": 0.00036909783339200567, "loss": 5.1925, "step": 603500 }, { "epoch": 3.86597113322879, "grad_norm": 0.435546875, "learning_rate": 0.0003690722309341697, "loss": 5.1956, "step": 604000 }, { "epoch": 3.8691714404582838, "grad_norm": 0.46484375, "learning_rate": 0.00036904662847633374, "loss": 5.1917, "step": 604500 }, { "epoch": 3.872371747687778, "grad_norm": 0.51953125, "learning_rate": 0.00036902102601849777, "loss": 5.1941, "step": 605000 }, { "epoch": 3.875572054917272, "grad_norm": 0.478515625, "learning_rate": 0.00036899542356066186, "loss": 5.196, "step": 605500 }, { "epoch": 3.878772362146766, "grad_norm": 0.5, "learning_rate": 0.0003689698211028259, "loss": 5.1926, "step": 606000 }, { "epoch": 3.88197266937626, "grad_norm": 0.490234375, "learning_rate": 0.00036894421864498993, "loss": 5.1966, "step": 606500 }, { "epoch": 3.8851729766057543, "grad_norm": 0.41796875, "learning_rate": 0.00036891861618715396, "loss": 5.1931, "step": 607000 }, { "epoch": 3.8883732838352483, "grad_norm": 0.43359375, "learning_rate": 0.000368893013729318, "loss": 5.1937, "step": 607500 }, { "epoch": 3.891573591064742, "grad_norm": 0.466796875, "learning_rate": 0.0003688674112714821, "loss": 5.195, "step": 608000 }, { "epoch": 3.894773898294236, "grad_norm": 0.48046875, "learning_rate": 0.0003688418088136461, "loss": 5.1918, "step": 608500 }, { "epoch": 3.89797420552373, "grad_norm": 0.462890625, "learning_rate": 0.00036881620635581015, "loss": 5.1942, "step": 609000 }, { "epoch": 3.9011745127532245, "grad_norm": 0.466796875, "learning_rate": 0.00036879060389797424, "loss": 5.1918, "step": 609500 }, { "epoch": 3.9043748199827184, "grad_norm": 0.44140625, "learning_rate": 0.0003687650014401383, "loss": 5.1955, "step": 610000 }, { "epoch": 3.9075751272122123, "grad_norm": 0.41015625, "learning_rate": 0.0003687393989823023, "loss": 5.19, "step": 610500 }, { "epoch": 3.9107754344417063, "grad_norm": 0.4453125, "learning_rate": 0.0003687137965244664, "loss": 5.1947, "step": 611000 }, { "epoch": 3.9139757416712007, "grad_norm": 0.4765625, "learning_rate": 0.00036868819406663043, "loss": 5.1975, "step": 611500 }, { "epoch": 3.9171760489006946, "grad_norm": 0.443359375, "learning_rate": 0.00036866259160879447, "loss": 5.1929, "step": 612000 }, { "epoch": 3.9203763561301885, "grad_norm": 0.42578125, "learning_rate": 0.0003686369891509585, "loss": 5.1986, "step": 612500 }, { "epoch": 3.9235766633596825, "grad_norm": 0.453125, "learning_rate": 0.00036861138669312254, "loss": 5.1918, "step": 613000 }, { "epoch": 3.9267769705891764, "grad_norm": 0.435546875, "learning_rate": 0.0003685857842352866, "loss": 5.1931, "step": 613500 }, { "epoch": 3.9299772778186703, "grad_norm": 0.4453125, "learning_rate": 0.00036856018177745066, "loss": 5.1948, "step": 614000 }, { "epoch": 3.9331775850481647, "grad_norm": 0.439453125, "learning_rate": 0.0003685345793196147, "loss": 5.1881, "step": 614500 }, { "epoch": 3.9363778922776587, "grad_norm": 0.42578125, "learning_rate": 0.0003685089768617787, "loss": 5.1954, "step": 615000 }, { "epoch": 3.9395781995071526, "grad_norm": 0.4453125, "learning_rate": 0.00036848337440394276, "loss": 5.1983, "step": 615500 }, { "epoch": 3.942778506736647, "grad_norm": 0.470703125, "learning_rate": 0.00036845777194610685, "loss": 5.192, "step": 616000 }, { "epoch": 3.945978813966141, "grad_norm": 0.4296875, "learning_rate": 0.0003684321694882709, "loss": 5.1921, "step": 616500 }, { "epoch": 3.949179121195635, "grad_norm": 0.43359375, "learning_rate": 0.00036840656703043497, "loss": 5.193, "step": 617000 }, { "epoch": 3.952379428425129, "grad_norm": 0.51171875, "learning_rate": 0.000368380964572599, "loss": 5.1931, "step": 617500 }, { "epoch": 3.9555797356546227, "grad_norm": 0.44140625, "learning_rate": 0.00036835536211476304, "loss": 5.1898, "step": 618000 }, { "epoch": 3.9587800428841167, "grad_norm": 0.46875, "learning_rate": 0.0003683297596569271, "loss": 5.1993, "step": 618500 }, { "epoch": 3.961980350113611, "grad_norm": 0.45703125, "learning_rate": 0.00036830415719909116, "loss": 5.1947, "step": 619000 }, { "epoch": 3.965180657343105, "grad_norm": 0.462890625, "learning_rate": 0.0003682785547412552, "loss": 5.1963, "step": 619500 }, { "epoch": 3.968380964572599, "grad_norm": 0.49609375, "learning_rate": 0.00036825295228341923, "loss": 5.1975, "step": 620000 }, { "epoch": 3.971581271802093, "grad_norm": 0.478515625, "learning_rate": 0.00036822734982558327, "loss": 5.195, "step": 620500 }, { "epoch": 3.9747815790315872, "grad_norm": 0.47265625, "learning_rate": 0.0003682017473677473, "loss": 5.1945, "step": 621000 }, { "epoch": 3.977981886261081, "grad_norm": 0.45703125, "learning_rate": 0.0003681761449099114, "loss": 5.1949, "step": 621500 }, { "epoch": 3.981182193490575, "grad_norm": 0.43359375, "learning_rate": 0.0003681505424520754, "loss": 5.1965, "step": 622000 }, { "epoch": 3.984382500720069, "grad_norm": 0.453125, "learning_rate": 0.00036812493999423946, "loss": 5.1927, "step": 622500 }, { "epoch": 3.987582807949563, "grad_norm": 0.48828125, "learning_rate": 0.0003680993375364035, "loss": 5.1926, "step": 623000 }, { "epoch": 3.990783115179057, "grad_norm": 0.453125, "learning_rate": 0.0003680737350785675, "loss": 5.1971, "step": 623500 }, { "epoch": 3.9939834224085513, "grad_norm": 0.4375, "learning_rate": 0.0003680481326207316, "loss": 5.1945, "step": 624000 }, { "epoch": 3.9971837296380452, "grad_norm": 0.45703125, "learning_rate": 0.00036802253016289565, "loss": 5.1938, "step": 624500 }, { "epoch": 4.0, "eval_loss": 5.188127040863037, "eval_runtime": 1.1918, "eval_samples_per_second": 839.075, "eval_steps_per_second": 13.425, "step": 624940 }, { "epoch": 4.00038403686754, "grad_norm": 0.431640625, "learning_rate": 0.00036799692770505974, "loss": 5.1936, "step": 625000 }, { "epoch": 4.003584344097034, "grad_norm": 0.44140625, "learning_rate": 0.00036797132524722377, "loss": 5.1842, "step": 625500 }, { "epoch": 4.0067846513265275, "grad_norm": 0.443359375, "learning_rate": 0.0003679457227893878, "loss": 5.1866, "step": 626000 }, { "epoch": 4.009984958556021, "grad_norm": 0.490234375, "learning_rate": 0.00036792012033155184, "loss": 5.1875, "step": 626500 }, { "epoch": 4.013185265785515, "grad_norm": 0.435546875, "learning_rate": 0.00036789451787371593, "loss": 5.1882, "step": 627000 }, { "epoch": 4.016385573015009, "grad_norm": 0.431640625, "learning_rate": 0.00036786891541587996, "loss": 5.1898, "step": 627500 }, { "epoch": 4.019585880244503, "grad_norm": 0.484375, "learning_rate": 0.000367843312958044, "loss": 5.1909, "step": 628000 }, { "epoch": 4.022786187473997, "grad_norm": 0.4765625, "learning_rate": 0.00036781771050020803, "loss": 5.1898, "step": 628500 }, { "epoch": 4.025986494703491, "grad_norm": 0.4453125, "learning_rate": 0.00036779210804237206, "loss": 5.1957, "step": 629000 }, { "epoch": 4.029186801932986, "grad_norm": 0.49609375, "learning_rate": 0.0003677665055845361, "loss": 5.1845, "step": 629500 }, { "epoch": 4.03238710916248, "grad_norm": 0.470703125, "learning_rate": 0.0003677409031267002, "loss": 5.1906, "step": 630000 }, { "epoch": 4.035587416391974, "grad_norm": 0.42578125, "learning_rate": 0.0003677153006688642, "loss": 5.1994, "step": 630500 }, { "epoch": 4.038787723621468, "grad_norm": 0.462890625, "learning_rate": 0.00036768969821102826, "loss": 5.1873, "step": 631000 }, { "epoch": 4.041988030850962, "grad_norm": 0.50390625, "learning_rate": 0.0003676640957531923, "loss": 5.1878, "step": 631500 }, { "epoch": 4.045188338080456, "grad_norm": 0.44140625, "learning_rate": 0.0003676384932953564, "loss": 5.1948, "step": 632000 }, { "epoch": 4.04838864530995, "grad_norm": 0.466796875, "learning_rate": 0.00036761289083752047, "loss": 5.1922, "step": 632500 }, { "epoch": 4.0515889525394435, "grad_norm": 0.455078125, "learning_rate": 0.0003675872883796845, "loss": 5.1802, "step": 633000 }, { "epoch": 4.0547892597689374, "grad_norm": 0.443359375, "learning_rate": 0.00036756168592184854, "loss": 5.1917, "step": 633500 }, { "epoch": 4.057989566998432, "grad_norm": 0.494140625, "learning_rate": 0.00036753608346401257, "loss": 5.1942, "step": 634000 }, { "epoch": 4.061189874227926, "grad_norm": 0.48828125, "learning_rate": 0.0003675104810061766, "loss": 5.1912, "step": 634500 }, { "epoch": 4.06439018145742, "grad_norm": 0.494140625, "learning_rate": 0.00036748487854834064, "loss": 5.186, "step": 635000 }, { "epoch": 4.067590488686914, "grad_norm": 0.47265625, "learning_rate": 0.0003674592760905047, "loss": 5.1892, "step": 635500 }, { "epoch": 4.070790795916408, "grad_norm": 0.4609375, "learning_rate": 0.00036743367363266876, "loss": 5.1912, "step": 636000 }, { "epoch": 4.073991103145902, "grad_norm": 0.4453125, "learning_rate": 0.0003674080711748328, "loss": 5.1944, "step": 636500 }, { "epoch": 4.077191410375396, "grad_norm": 0.447265625, "learning_rate": 0.00036738246871699683, "loss": 5.1918, "step": 637000 }, { "epoch": 4.08039171760489, "grad_norm": 0.46484375, "learning_rate": 0.00036735686625916086, "loss": 5.1935, "step": 637500 }, { "epoch": 4.083592024834384, "grad_norm": 0.455078125, "learning_rate": 0.00036733126380132495, "loss": 5.1961, "step": 638000 }, { "epoch": 4.086792332063878, "grad_norm": 0.439453125, "learning_rate": 0.000367305661343489, "loss": 5.1916, "step": 638500 }, { "epoch": 4.0899926392933725, "grad_norm": 0.490234375, "learning_rate": 0.000367280058885653, "loss": 5.1949, "step": 639000 }, { "epoch": 4.0931929465228665, "grad_norm": 0.451171875, "learning_rate": 0.0003672544564278171, "loss": 5.1934, "step": 639500 }, { "epoch": 4.09639325375236, "grad_norm": 0.447265625, "learning_rate": 0.00036722885396998114, "loss": 5.1895, "step": 640000 }, { "epoch": 4.099593560981854, "grad_norm": 0.490234375, "learning_rate": 0.0003672032515121452, "loss": 5.193, "step": 640500 }, { "epoch": 4.102793868211348, "grad_norm": 0.498046875, "learning_rate": 0.00036717764905430927, "loss": 5.1871, "step": 641000 }, { "epoch": 4.105994175440842, "grad_norm": 0.451171875, "learning_rate": 0.0003671520465964733, "loss": 5.1882, "step": 641500 }, { "epoch": 4.109194482670336, "grad_norm": 0.50390625, "learning_rate": 0.00036712644413863733, "loss": 5.1871, "step": 642000 }, { "epoch": 4.11239478989983, "grad_norm": 0.462890625, "learning_rate": 0.00036710084168080137, "loss": 5.1922, "step": 642500 }, { "epoch": 4.115595097129324, "grad_norm": 0.443359375, "learning_rate": 0.0003670752392229654, "loss": 5.186, "step": 643000 }, { "epoch": 4.118795404358819, "grad_norm": 0.412109375, "learning_rate": 0.0003670496367651295, "loss": 5.1933, "step": 643500 }, { "epoch": 4.121995711588313, "grad_norm": 0.486328125, "learning_rate": 0.0003670240343072935, "loss": 5.1924, "step": 644000 }, { "epoch": 4.125196018817807, "grad_norm": 0.466796875, "learning_rate": 0.00036699843184945756, "loss": 5.1947, "step": 644500 }, { "epoch": 4.128396326047301, "grad_norm": 0.43359375, "learning_rate": 0.0003669728293916216, "loss": 5.2002, "step": 645000 }, { "epoch": 4.131596633276795, "grad_norm": 0.40234375, "learning_rate": 0.00036694722693378563, "loss": 5.1989, "step": 645500 }, { "epoch": 4.1347969405062885, "grad_norm": 0.42578125, "learning_rate": 0.00036692162447594966, "loss": 5.1922, "step": 646000 }, { "epoch": 4.1379972477357825, "grad_norm": 0.55078125, "learning_rate": 0.00036689602201811375, "loss": 5.1884, "step": 646500 }, { "epoch": 4.141197554965276, "grad_norm": 0.474609375, "learning_rate": 0.0003668704195602778, "loss": 5.1934, "step": 647000 }, { "epoch": 4.14439786219477, "grad_norm": 0.47265625, "learning_rate": 0.0003668448171024419, "loss": 5.1914, "step": 647500 }, { "epoch": 4.147598169424265, "grad_norm": 0.4453125, "learning_rate": 0.0003668192146446059, "loss": 5.1949, "step": 648000 }, { "epoch": 4.150798476653759, "grad_norm": 0.45703125, "learning_rate": 0.00036679361218676994, "loss": 5.1976, "step": 648500 }, { "epoch": 4.153998783883253, "grad_norm": 0.43359375, "learning_rate": 0.00036676800972893403, "loss": 5.1909, "step": 649000 }, { "epoch": 4.157199091112747, "grad_norm": 0.458984375, "learning_rate": 0.00036674240727109806, "loss": 5.1887, "step": 649500 }, { "epoch": 4.160399398342241, "grad_norm": 0.4609375, "learning_rate": 0.0003667168048132621, "loss": 5.1877, "step": 650000 }, { "epoch": 4.163599705571735, "grad_norm": 0.447265625, "learning_rate": 0.00036669120235542613, "loss": 5.1908, "step": 650500 }, { "epoch": 4.166800012801229, "grad_norm": 0.52734375, "learning_rate": 0.00036666559989759017, "loss": 5.1921, "step": 651000 }, { "epoch": 4.170000320030723, "grad_norm": 0.52734375, "learning_rate": 0.0003666399974397542, "loss": 5.1892, "step": 651500 }, { "epoch": 4.173200627260217, "grad_norm": 0.48828125, "learning_rate": 0.0003666143949819183, "loss": 5.1924, "step": 652000 }, { "epoch": 4.176400934489711, "grad_norm": 0.52734375, "learning_rate": 0.0003665887925240823, "loss": 5.195, "step": 652500 }, { "epoch": 4.179601241719205, "grad_norm": 0.4609375, "learning_rate": 0.00036656319006624636, "loss": 5.1932, "step": 653000 }, { "epoch": 4.182801548948699, "grad_norm": 0.4609375, "learning_rate": 0.0003665375876084104, "loss": 5.1947, "step": 653500 }, { "epoch": 4.186001856178193, "grad_norm": 0.46875, "learning_rate": 0.0003665119851505745, "loss": 5.1889, "step": 654000 }, { "epoch": 4.189202163407687, "grad_norm": 0.470703125, "learning_rate": 0.0003664863826927385, "loss": 5.1934, "step": 654500 }, { "epoch": 4.192402470637181, "grad_norm": 0.44921875, "learning_rate": 0.0003664607802349026, "loss": 5.1974, "step": 655000 }, { "epoch": 4.195602777866675, "grad_norm": 0.48828125, "learning_rate": 0.00036643517777706664, "loss": 5.1908, "step": 655500 }, { "epoch": 4.198803085096169, "grad_norm": 0.423828125, "learning_rate": 0.00036640957531923067, "loss": 5.1891, "step": 656000 }, { "epoch": 4.202003392325663, "grad_norm": 0.47265625, "learning_rate": 0.0003663839728613947, "loss": 5.1873, "step": 656500 }, { "epoch": 4.205203699555157, "grad_norm": 0.462890625, "learning_rate": 0.00036635837040355874, "loss": 5.1883, "step": 657000 }, { "epoch": 4.208404006784651, "grad_norm": 0.435546875, "learning_rate": 0.00036633276794572283, "loss": 5.1928, "step": 657500 }, { "epoch": 4.211604314014146, "grad_norm": 0.50390625, "learning_rate": 0.00036630716548788686, "loss": 5.1884, "step": 658000 }, { "epoch": 4.21480462124364, "grad_norm": 0.5078125, "learning_rate": 0.0003662815630300509, "loss": 5.1933, "step": 658500 }, { "epoch": 4.218004928473134, "grad_norm": 0.52734375, "learning_rate": 0.00036625596057221493, "loss": 5.1924, "step": 659000 }, { "epoch": 4.2212052357026275, "grad_norm": 0.462890625, "learning_rate": 0.00036623035811437897, "loss": 5.1907, "step": 659500 }, { "epoch": 4.2244055429321214, "grad_norm": 0.474609375, "learning_rate": 0.00036620475565654306, "loss": 5.1954, "step": 660000 }, { "epoch": 4.227605850161615, "grad_norm": 0.451171875, "learning_rate": 0.0003661791531987071, "loss": 5.1918, "step": 660500 }, { "epoch": 4.230806157391109, "grad_norm": 0.4375, "learning_rate": 0.0003661535507408711, "loss": 5.1888, "step": 661000 }, { "epoch": 4.234006464620603, "grad_norm": 0.47265625, "learning_rate": 0.00036612794828303516, "loss": 5.1945, "step": 661500 }, { "epoch": 4.237206771850097, "grad_norm": 0.46875, "learning_rate": 0.00036610234582519925, "loss": 5.1909, "step": 662000 }, { "epoch": 4.240407079079592, "grad_norm": 0.423828125, "learning_rate": 0.0003660767433673633, "loss": 5.1898, "step": 662500 }, { "epoch": 4.243607386309086, "grad_norm": 0.447265625, "learning_rate": 0.00036605114090952737, "loss": 5.1976, "step": 663000 }, { "epoch": 4.24680769353858, "grad_norm": 0.5078125, "learning_rate": 0.0003660255384516914, "loss": 5.1939, "step": 663500 }, { "epoch": 4.250008000768074, "grad_norm": 0.5078125, "learning_rate": 0.00036599993599385544, "loss": 5.1878, "step": 664000 }, { "epoch": 4.253208307997568, "grad_norm": 0.447265625, "learning_rate": 0.00036597433353601947, "loss": 5.1858, "step": 664500 }, { "epoch": 4.256408615227062, "grad_norm": 0.5078125, "learning_rate": 0.0003659487310781835, "loss": 5.1895, "step": 665000 }, { "epoch": 4.259608922456556, "grad_norm": 0.4296875, "learning_rate": 0.0003659231286203476, "loss": 5.1937, "step": 665500 }, { "epoch": 4.26280922968605, "grad_norm": 0.5078125, "learning_rate": 0.00036589752616251163, "loss": 5.1913, "step": 666000 }, { "epoch": 4.2660095369155435, "grad_norm": 0.44140625, "learning_rate": 0.00036587192370467566, "loss": 5.191, "step": 666500 }, { "epoch": 4.269209844145038, "grad_norm": 0.482421875, "learning_rate": 0.0003658463212468397, "loss": 5.1996, "step": 667000 }, { "epoch": 4.272410151374532, "grad_norm": 0.466796875, "learning_rate": 0.00036582071878900373, "loss": 5.1936, "step": 667500 }, { "epoch": 4.275610458604026, "grad_norm": 0.51171875, "learning_rate": 0.0003657951163311678, "loss": 5.1897, "step": 668000 }, { "epoch": 4.27881076583352, "grad_norm": 0.470703125, "learning_rate": 0.00036576951387333185, "loss": 5.1917, "step": 668500 }, { "epoch": 4.282011073063014, "grad_norm": 0.447265625, "learning_rate": 0.0003657439114154959, "loss": 5.1918, "step": 669000 }, { "epoch": 4.285211380292508, "grad_norm": 0.486328125, "learning_rate": 0.00036571830895766, "loss": 5.1907, "step": 669500 }, { "epoch": 4.288411687522002, "grad_norm": 0.466796875, "learning_rate": 0.000365692706499824, "loss": 5.1899, "step": 670000 }, { "epoch": 4.291611994751496, "grad_norm": 0.482421875, "learning_rate": 0.00036566710404198805, "loss": 5.1958, "step": 670500 }, { "epoch": 4.29481230198099, "grad_norm": 0.4609375, "learning_rate": 0.00036564150158415213, "loss": 5.1896, "step": 671000 }, { "epoch": 4.298012609210485, "grad_norm": 0.48828125, "learning_rate": 0.00036561589912631617, "loss": 5.197, "step": 671500 }, { "epoch": 4.301212916439979, "grad_norm": 0.423828125, "learning_rate": 0.0003655902966684802, "loss": 5.1875, "step": 672000 }, { "epoch": 4.3044132236694725, "grad_norm": 0.51171875, "learning_rate": 0.00036556469421064424, "loss": 5.1866, "step": 672500 }, { "epoch": 4.3076135308989665, "grad_norm": 0.41015625, "learning_rate": 0.00036553909175280827, "loss": 5.1938, "step": 673000 }, { "epoch": 4.31081383812846, "grad_norm": 0.443359375, "learning_rate": 0.00036551348929497236, "loss": 5.1941, "step": 673500 }, { "epoch": 4.314014145357954, "grad_norm": 0.46484375, "learning_rate": 0.0003654878868371364, "loss": 5.1871, "step": 674000 }, { "epoch": 4.317214452587448, "grad_norm": 0.458984375, "learning_rate": 0.00036546228437930043, "loss": 5.194, "step": 674500 }, { "epoch": 4.320414759816942, "grad_norm": 0.47265625, "learning_rate": 0.00036543668192146446, "loss": 5.1944, "step": 675000 }, { "epoch": 4.323615067046436, "grad_norm": 0.53515625, "learning_rate": 0.0003654110794636285, "loss": 5.1897, "step": 675500 }, { "epoch": 4.32681537427593, "grad_norm": 0.470703125, "learning_rate": 0.00036538547700579253, "loss": 5.195, "step": 676000 }, { "epoch": 4.330015681505425, "grad_norm": 0.46875, "learning_rate": 0.0003653598745479566, "loss": 5.1907, "step": 676500 }, { "epoch": 4.333215988734919, "grad_norm": 0.462890625, "learning_rate": 0.00036533427209012065, "loss": 5.1874, "step": 677000 }, { "epoch": 4.336416295964413, "grad_norm": 0.45703125, "learning_rate": 0.00036530866963228474, "loss": 5.1906, "step": 677500 }, { "epoch": 4.339616603193907, "grad_norm": 0.458984375, "learning_rate": 0.0003652830671744488, "loss": 5.1959, "step": 678000 }, { "epoch": 4.342816910423401, "grad_norm": 0.486328125, "learning_rate": 0.0003652574647166128, "loss": 5.1941, "step": 678500 }, { "epoch": 4.346017217652895, "grad_norm": 0.474609375, "learning_rate": 0.0003652318622587769, "loss": 5.1884, "step": 679000 }, { "epoch": 4.3492175248823886, "grad_norm": 0.43359375, "learning_rate": 0.00036520625980094093, "loss": 5.19, "step": 679500 }, { "epoch": 4.3524178321118825, "grad_norm": 0.5, "learning_rate": 0.00036518065734310497, "loss": 5.1904, "step": 680000 }, { "epoch": 4.355618139341376, "grad_norm": 0.44140625, "learning_rate": 0.000365155054885269, "loss": 5.1916, "step": 680500 }, { "epoch": 4.35881844657087, "grad_norm": 0.46484375, "learning_rate": 0.00036512945242743304, "loss": 5.1923, "step": 681000 }, { "epoch": 4.362018753800365, "grad_norm": 0.490234375, "learning_rate": 0.00036510384996959707, "loss": 5.1895, "step": 681500 }, { "epoch": 4.365219061029859, "grad_norm": 0.494140625, "learning_rate": 0.00036507824751176116, "loss": 5.1908, "step": 682000 }, { "epoch": 4.368419368259353, "grad_norm": 0.44140625, "learning_rate": 0.0003650526450539252, "loss": 5.1916, "step": 682500 }, { "epoch": 4.371619675488847, "grad_norm": 0.46484375, "learning_rate": 0.0003650270425960892, "loss": 5.1936, "step": 683000 }, { "epoch": 4.374819982718341, "grad_norm": 0.478515625, "learning_rate": 0.00036500144013825326, "loss": 5.1916, "step": 683500 }, { "epoch": 4.378020289947835, "grad_norm": 0.46484375, "learning_rate": 0.0003649758376804173, "loss": 5.1896, "step": 684000 }, { "epoch": 4.381220597177329, "grad_norm": 0.486328125, "learning_rate": 0.0003649502352225814, "loss": 5.1947, "step": 684500 }, { "epoch": 4.384420904406823, "grad_norm": 0.453125, "learning_rate": 0.0003649246327647454, "loss": 5.1908, "step": 685000 }, { "epoch": 4.387621211636317, "grad_norm": 0.455078125, "learning_rate": 0.0003648990303069095, "loss": 5.1974, "step": 685500 }, { "epoch": 4.3908215188658115, "grad_norm": 0.515625, "learning_rate": 0.00036487342784907354, "loss": 5.1908, "step": 686000 }, { "epoch": 4.3940218260953054, "grad_norm": 0.486328125, "learning_rate": 0.0003648478253912376, "loss": 5.1929, "step": 686500 }, { "epoch": 4.397222133324799, "grad_norm": 0.51171875, "learning_rate": 0.0003648222229334016, "loss": 5.1903, "step": 687000 }, { "epoch": 4.400422440554293, "grad_norm": 0.5546875, "learning_rate": 0.0003647966204755657, "loss": 5.1931, "step": 687500 }, { "epoch": 4.403622747783787, "grad_norm": 0.4609375, "learning_rate": 0.00036477101801772973, "loss": 5.192, "step": 688000 }, { "epoch": 4.406823055013281, "grad_norm": 0.54296875, "learning_rate": 0.00036474541555989377, "loss": 5.1902, "step": 688500 }, { "epoch": 4.410023362242775, "grad_norm": 0.455078125, "learning_rate": 0.0003647198131020578, "loss": 5.1876, "step": 689000 }, { "epoch": 4.413223669472269, "grad_norm": 0.486328125, "learning_rate": 0.00036469421064422183, "loss": 5.1888, "step": 689500 }, { "epoch": 4.416423976701763, "grad_norm": 0.453125, "learning_rate": 0.0003646686081863859, "loss": 5.1934, "step": 690000 }, { "epoch": 4.419624283931258, "grad_norm": 0.490234375, "learning_rate": 0.00036464300572854996, "loss": 5.1933, "step": 690500 }, { "epoch": 4.422824591160752, "grad_norm": 0.453125, "learning_rate": 0.000364617403270714, "loss": 5.1894, "step": 691000 }, { "epoch": 4.426024898390246, "grad_norm": 0.53515625, "learning_rate": 0.000364591800812878, "loss": 5.1912, "step": 691500 }, { "epoch": 4.42922520561974, "grad_norm": 0.421875, "learning_rate": 0.0003645661983550421, "loss": 5.1926, "step": 692000 }, { "epoch": 4.432425512849234, "grad_norm": 0.455078125, "learning_rate": 0.00036454059589720615, "loss": 5.1901, "step": 692500 }, { "epoch": 4.4356258200787275, "grad_norm": 0.482421875, "learning_rate": 0.00036451499343937024, "loss": 5.1939, "step": 693000 }, { "epoch": 4.4388261273082215, "grad_norm": 0.5078125, "learning_rate": 0.00036448939098153427, "loss": 5.1953, "step": 693500 }, { "epoch": 4.442026434537715, "grad_norm": 0.5234375, "learning_rate": 0.0003644637885236983, "loss": 5.1922, "step": 694000 }, { "epoch": 4.445226741767209, "grad_norm": 0.515625, "learning_rate": 0.00036443818606586234, "loss": 5.1988, "step": 694500 }, { "epoch": 4.448427048996703, "grad_norm": 0.4921875, "learning_rate": 0.0003644125836080264, "loss": 5.1935, "step": 695000 }, { "epoch": 4.451627356226198, "grad_norm": 0.61328125, "learning_rate": 0.00036438698115019046, "loss": 5.1963, "step": 695500 }, { "epoch": 4.454827663455692, "grad_norm": 0.45703125, "learning_rate": 0.0003643613786923545, "loss": 5.1938, "step": 696000 }, { "epoch": 4.458027970685186, "grad_norm": 0.5078125, "learning_rate": 0.00036433577623451853, "loss": 5.1942, "step": 696500 }, { "epoch": 4.46122827791468, "grad_norm": 0.455078125, "learning_rate": 0.00036431017377668256, "loss": 5.1892, "step": 697000 }, { "epoch": 4.464428585144174, "grad_norm": 0.51171875, "learning_rate": 0.0003642845713188466, "loss": 5.1923, "step": 697500 }, { "epoch": 4.467628892373668, "grad_norm": 0.490234375, "learning_rate": 0.0003642589688610107, "loss": 5.1957, "step": 698000 }, { "epoch": 4.470829199603162, "grad_norm": 0.498046875, "learning_rate": 0.0003642333664031747, "loss": 5.1982, "step": 698500 }, { "epoch": 4.474029506832656, "grad_norm": 0.4765625, "learning_rate": 0.00036420776394533876, "loss": 5.194, "step": 699000 }, { "epoch": 4.47722981406215, "grad_norm": 0.4375, "learning_rate": 0.0003641821614875028, "loss": 5.1964, "step": 699500 }, { "epoch": 4.4804301212916435, "grad_norm": 0.484375, "learning_rate": 0.0003641565590296669, "loss": 5.1911, "step": 700000 }, { "epoch": 4.483630428521138, "grad_norm": 0.4453125, "learning_rate": 0.0003641309565718309, "loss": 5.195, "step": 700500 }, { "epoch": 4.486830735750632, "grad_norm": 0.51953125, "learning_rate": 0.000364105354113995, "loss": 5.1903, "step": 701000 }, { "epoch": 4.490031042980126, "grad_norm": 0.453125, "learning_rate": 0.00036407975165615904, "loss": 5.2005, "step": 701500 }, { "epoch": 4.49323135020962, "grad_norm": 0.478515625, "learning_rate": 0.00036405414919832307, "loss": 5.1917, "step": 702000 }, { "epoch": 4.496431657439114, "grad_norm": 0.453125, "learning_rate": 0.0003640285467404871, "loss": 5.1933, "step": 702500 }, { "epoch": 4.499631964668608, "grad_norm": 0.46484375, "learning_rate": 0.00036400294428265114, "loss": 5.1886, "step": 703000 }, { "epoch": 4.502832271898102, "grad_norm": 0.443359375, "learning_rate": 0.0003639773418248152, "loss": 5.1913, "step": 703500 }, { "epoch": 4.506032579127596, "grad_norm": 0.46875, "learning_rate": 0.00036395173936697926, "loss": 5.2004, "step": 704000 }, { "epoch": 4.50923288635709, "grad_norm": 0.447265625, "learning_rate": 0.0003639261369091433, "loss": 5.1908, "step": 704500 }, { "epoch": 4.512433193586585, "grad_norm": 0.458984375, "learning_rate": 0.00036390053445130733, "loss": 5.1889, "step": 705000 }, { "epoch": 4.515633500816079, "grad_norm": 0.4921875, "learning_rate": 0.00036387493199347136, "loss": 5.1943, "step": 705500 }, { "epoch": 4.5188338080455726, "grad_norm": 0.462890625, "learning_rate": 0.0003638493295356354, "loss": 5.1924, "step": 706000 }, { "epoch": 4.5220341152750665, "grad_norm": 0.453125, "learning_rate": 0.0003638237270777995, "loss": 5.1945, "step": 706500 }, { "epoch": 4.52523442250456, "grad_norm": 0.478515625, "learning_rate": 0.0003637981246199635, "loss": 5.1936, "step": 707000 }, { "epoch": 4.528434729734054, "grad_norm": 0.455078125, "learning_rate": 0.0003637725221621276, "loss": 5.1932, "step": 707500 }, { "epoch": 4.531635036963548, "grad_norm": 0.451171875, "learning_rate": 0.00036374691970429164, "loss": 5.1967, "step": 708000 }, { "epoch": 4.534835344193042, "grad_norm": 0.44921875, "learning_rate": 0.0003637213172464557, "loss": 5.1921, "step": 708500 }, { "epoch": 4.538035651422536, "grad_norm": 0.470703125, "learning_rate": 0.00036369571478861977, "loss": 5.1905, "step": 709000 }, { "epoch": 4.541235958652031, "grad_norm": 0.4765625, "learning_rate": 0.0003636701123307838, "loss": 5.1932, "step": 709500 }, { "epoch": 4.544436265881525, "grad_norm": 0.4921875, "learning_rate": 0.00036364450987294783, "loss": 5.1955, "step": 710000 }, { "epoch": 4.547636573111019, "grad_norm": 0.50390625, "learning_rate": 0.00036361890741511187, "loss": 5.1927, "step": 710500 }, { "epoch": 4.550836880340513, "grad_norm": 0.466796875, "learning_rate": 0.0003635933049572759, "loss": 5.1945, "step": 711000 }, { "epoch": 4.554037187570007, "grad_norm": 0.443359375, "learning_rate": 0.00036356770249943994, "loss": 5.1902, "step": 711500 }, { "epoch": 4.557237494799501, "grad_norm": 0.46484375, "learning_rate": 0.000363542100041604, "loss": 5.1927, "step": 712000 }, { "epoch": 4.560437802028995, "grad_norm": 0.451171875, "learning_rate": 0.00036351649758376806, "loss": 5.1942, "step": 712500 }, { "epoch": 4.563638109258489, "grad_norm": 0.447265625, "learning_rate": 0.0003634908951259321, "loss": 5.1962, "step": 713000 }, { "epoch": 4.5668384164879825, "grad_norm": 0.65625, "learning_rate": 0.00036346529266809613, "loss": 5.1954, "step": 713500 }, { "epoch": 4.570038723717477, "grad_norm": 0.466796875, "learning_rate": 0.00036343969021026016, "loss": 5.1929, "step": 714000 }, { "epoch": 4.573239030946971, "grad_norm": 0.453125, "learning_rate": 0.00036341408775242425, "loss": 5.1943, "step": 714500 }, { "epoch": 4.576439338176465, "grad_norm": 0.46484375, "learning_rate": 0.0003633884852945883, "loss": 5.194, "step": 715000 }, { "epoch": 4.579639645405959, "grad_norm": 0.5, "learning_rate": 0.0003633628828367524, "loss": 5.194, "step": 715500 }, { "epoch": 4.582839952635453, "grad_norm": 0.44921875, "learning_rate": 0.0003633372803789164, "loss": 5.1945, "step": 716000 }, { "epoch": 4.586040259864947, "grad_norm": 0.46484375, "learning_rate": 0.00036331167792108044, "loss": 5.1895, "step": 716500 }, { "epoch": 4.589240567094441, "grad_norm": 0.482421875, "learning_rate": 0.0003632860754632445, "loss": 5.1929, "step": 717000 }, { "epoch": 4.592440874323935, "grad_norm": 0.50390625, "learning_rate": 0.00036326047300540857, "loss": 5.1933, "step": 717500 }, { "epoch": 4.595641181553429, "grad_norm": 0.453125, "learning_rate": 0.0003632348705475726, "loss": 5.1961, "step": 718000 }, { "epoch": 4.598841488782924, "grad_norm": 0.47265625, "learning_rate": 0.00036320926808973663, "loss": 5.1955, "step": 718500 }, { "epoch": 4.602041796012417, "grad_norm": 0.49609375, "learning_rate": 0.00036318366563190067, "loss": 5.1968, "step": 719000 }, { "epoch": 4.6052421032419115, "grad_norm": 0.466796875, "learning_rate": 0.0003631580631740647, "loss": 5.195, "step": 719500 }, { "epoch": 4.6084424104714055, "grad_norm": 0.44921875, "learning_rate": 0.0003631324607162288, "loss": 5.1944, "step": 720000 }, { "epoch": 4.611642717700899, "grad_norm": 0.486328125, "learning_rate": 0.0003631068582583928, "loss": 5.1937, "step": 720500 }, { "epoch": 4.614843024930393, "grad_norm": 0.46875, "learning_rate": 0.00036308125580055686, "loss": 5.196, "step": 721000 }, { "epoch": 4.618043332159887, "grad_norm": 0.482421875, "learning_rate": 0.0003630556533427209, "loss": 5.1996, "step": 721500 }, { "epoch": 4.621243639389381, "grad_norm": 0.53125, "learning_rate": 0.00036303005088488493, "loss": 5.194, "step": 722000 }, { "epoch": 4.624443946618875, "grad_norm": 0.482421875, "learning_rate": 0.000363004448427049, "loss": 5.1924, "step": 722500 }, { "epoch": 4.627644253848369, "grad_norm": 0.5390625, "learning_rate": 0.00036297884596921305, "loss": 5.2027, "step": 723000 }, { "epoch": 4.630844561077863, "grad_norm": 0.48828125, "learning_rate": 0.00036295324351137714, "loss": 5.199, "step": 723500 }, { "epoch": 4.634044868307358, "grad_norm": 0.458984375, "learning_rate": 0.00036292764105354117, "loss": 5.1987, "step": 724000 }, { "epoch": 4.637245175536852, "grad_norm": 0.41796875, "learning_rate": 0.0003629020385957052, "loss": 5.193, "step": 724500 }, { "epoch": 4.640445482766346, "grad_norm": 0.515625, "learning_rate": 0.00036287643613786924, "loss": 5.1978, "step": 725000 }, { "epoch": 4.64364578999584, "grad_norm": 0.470703125, "learning_rate": 0.00036285083368003333, "loss": 5.1938, "step": 725500 }, { "epoch": 4.646846097225334, "grad_norm": 0.453125, "learning_rate": 0.00036282523122219736, "loss": 5.1967, "step": 726000 }, { "epoch": 4.6500464044548275, "grad_norm": 0.466796875, "learning_rate": 0.0003627996287643614, "loss": 5.1909, "step": 726500 }, { "epoch": 4.6532467116843215, "grad_norm": 0.5546875, "learning_rate": 0.00036277402630652543, "loss": 5.1929, "step": 727000 }, { "epoch": 4.656447018913815, "grad_norm": 0.490234375, "learning_rate": 0.00036274842384868947, "loss": 5.1954, "step": 727500 }, { "epoch": 4.659647326143309, "grad_norm": 0.486328125, "learning_rate": 0.0003627228213908535, "loss": 5.1988, "step": 728000 }, { "epoch": 4.662847633372804, "grad_norm": 0.45703125, "learning_rate": 0.0003626972189330176, "loss": 5.1884, "step": 728500 }, { "epoch": 4.666047940602298, "grad_norm": 0.443359375, "learning_rate": 0.0003626716164751816, "loss": 5.1952, "step": 729000 }, { "epoch": 4.669248247831792, "grad_norm": 0.453125, "learning_rate": 0.00036264601401734566, "loss": 5.1953, "step": 729500 }, { "epoch": 4.672448555061286, "grad_norm": 0.494140625, "learning_rate": 0.00036262041155950975, "loss": 5.1932, "step": 730000 }, { "epoch": 4.67564886229078, "grad_norm": 0.49609375, "learning_rate": 0.0003625948091016738, "loss": 5.1955, "step": 730500 }, { "epoch": 4.678849169520274, "grad_norm": 0.5078125, "learning_rate": 0.00036256920664383787, "loss": 5.1975, "step": 731000 }, { "epoch": 4.682049476749768, "grad_norm": 0.4609375, "learning_rate": 0.0003625436041860019, "loss": 5.1946, "step": 731500 }, { "epoch": 4.685249783979262, "grad_norm": 0.4609375, "learning_rate": 0.00036251800172816594, "loss": 5.1981, "step": 732000 }, { "epoch": 4.688450091208756, "grad_norm": 0.474609375, "learning_rate": 0.00036249239927032997, "loss": 5.1932, "step": 732500 }, { "epoch": 4.6916503984382505, "grad_norm": 0.458984375, "learning_rate": 0.000362466796812494, "loss": 5.1963, "step": 733000 }, { "epoch": 4.694850705667744, "grad_norm": 0.5078125, "learning_rate": 0.00036244119435465804, "loss": 5.2029, "step": 733500 }, { "epoch": 4.698051012897238, "grad_norm": 0.49609375, "learning_rate": 0.00036241559189682213, "loss": 5.1952, "step": 734000 }, { "epoch": 4.701251320126732, "grad_norm": 0.482421875, "learning_rate": 0.00036238998943898616, "loss": 5.1905, "step": 734500 }, { "epoch": 4.704451627356226, "grad_norm": 0.494140625, "learning_rate": 0.0003623643869811502, "loss": 5.1941, "step": 735000 }, { "epoch": 4.70765193458572, "grad_norm": 0.421875, "learning_rate": 0.00036233878452331423, "loss": 5.1893, "step": 735500 }, { "epoch": 4.710852241815214, "grad_norm": 0.466796875, "learning_rate": 0.00036231318206547827, "loss": 5.1955, "step": 736000 }, { "epoch": 4.714052549044708, "grad_norm": 0.47265625, "learning_rate": 0.00036228757960764235, "loss": 5.1985, "step": 736500 }, { "epoch": 4.717252856274202, "grad_norm": 0.484375, "learning_rate": 0.0003622619771498064, "loss": 5.1987, "step": 737000 }, { "epoch": 4.720453163503697, "grad_norm": 0.5, "learning_rate": 0.0003622363746919704, "loss": 5.1945, "step": 737500 }, { "epoch": 4.723653470733191, "grad_norm": 0.458984375, "learning_rate": 0.0003622107722341345, "loss": 5.1946, "step": 738000 }, { "epoch": 4.726853777962685, "grad_norm": 0.48046875, "learning_rate": 0.00036218516977629855, "loss": 5.1921, "step": 738500 }, { "epoch": 4.730054085192179, "grad_norm": 0.4609375, "learning_rate": 0.00036215956731846263, "loss": 5.1962, "step": 739000 }, { "epoch": 4.733254392421673, "grad_norm": 0.490234375, "learning_rate": 0.00036213396486062667, "loss": 5.1977, "step": 739500 }, { "epoch": 4.7364546996511665, "grad_norm": 0.5234375, "learning_rate": 0.0003621083624027907, "loss": 5.1945, "step": 740000 }, { "epoch": 4.73965500688066, "grad_norm": 0.49609375, "learning_rate": 0.00036208275994495474, "loss": 5.1903, "step": 740500 }, { "epoch": 4.742855314110154, "grad_norm": 0.458984375, "learning_rate": 0.00036205715748711877, "loss": 5.2021, "step": 741000 }, { "epoch": 4.746055621339648, "grad_norm": 0.5, "learning_rate": 0.0003620315550292828, "loss": 5.1963, "step": 741500 }, { "epoch": 4.749255928569143, "grad_norm": 0.5234375, "learning_rate": 0.0003620059525714469, "loss": 5.1947, "step": 742000 }, { "epoch": 4.752456235798636, "grad_norm": 0.5234375, "learning_rate": 0.00036198035011361093, "loss": 5.1968, "step": 742500 }, { "epoch": 4.755656543028131, "grad_norm": 0.482421875, "learning_rate": 0.00036195474765577496, "loss": 5.1955, "step": 743000 }, { "epoch": 4.758856850257625, "grad_norm": 0.470703125, "learning_rate": 0.000361929145197939, "loss": 5.1982, "step": 743500 }, { "epoch": 4.762057157487119, "grad_norm": 0.5546875, "learning_rate": 0.00036190354274010303, "loss": 5.1986, "step": 744000 }, { "epoch": 4.765257464716613, "grad_norm": 0.482421875, "learning_rate": 0.0003618779402822671, "loss": 5.1994, "step": 744500 }, { "epoch": 4.768457771946107, "grad_norm": 0.498046875, "learning_rate": 0.00036185233782443115, "loss": 5.1965, "step": 745000 }, { "epoch": 4.771658079175601, "grad_norm": 0.5, "learning_rate": 0.00036182673536659524, "loss": 5.195, "step": 745500 }, { "epoch": 4.774858386405095, "grad_norm": 0.423828125, "learning_rate": 0.0003618011329087593, "loss": 5.1981, "step": 746000 }, { "epoch": 4.778058693634589, "grad_norm": 0.4921875, "learning_rate": 0.0003617755304509233, "loss": 5.1937, "step": 746500 }, { "epoch": 4.7812590008640825, "grad_norm": 0.47265625, "learning_rate": 0.00036174992799308734, "loss": 5.1932, "step": 747000 }, { "epoch": 4.784459308093577, "grad_norm": 0.54296875, "learning_rate": 0.00036172432553525143, "loss": 5.1944, "step": 747500 }, { "epoch": 4.787659615323071, "grad_norm": 0.47265625, "learning_rate": 0.00036169872307741547, "loss": 5.1957, "step": 748000 }, { "epoch": 4.790859922552565, "grad_norm": 0.51171875, "learning_rate": 0.0003616731206195795, "loss": 5.1977, "step": 748500 }, { "epoch": 4.794060229782059, "grad_norm": 0.478515625, "learning_rate": 0.00036164751816174354, "loss": 5.2021, "step": 749000 }, { "epoch": 4.797260537011553, "grad_norm": 0.5625, "learning_rate": 0.00036162191570390757, "loss": 5.1928, "step": 749500 }, { "epoch": 4.800460844241047, "grad_norm": 0.51171875, "learning_rate": 0.00036159631324607166, "loss": 5.1915, "step": 750000 }, { "epoch": 4.803661151470541, "grad_norm": 0.5234375, "learning_rate": 0.0003615707107882357, "loss": 5.1967, "step": 750500 }, { "epoch": 4.806861458700035, "grad_norm": 0.515625, "learning_rate": 0.0003615451083303997, "loss": 5.1971, "step": 751000 }, { "epoch": 4.810061765929529, "grad_norm": 0.462890625, "learning_rate": 0.00036151950587256376, "loss": 5.1977, "step": 751500 }, { "epoch": 4.813262073159024, "grad_norm": 0.486328125, "learning_rate": 0.0003614939034147278, "loss": 5.1976, "step": 752000 }, { "epoch": 4.816462380388518, "grad_norm": 0.486328125, "learning_rate": 0.0003614683009568919, "loss": 5.19, "step": 752500 }, { "epoch": 4.8196626876180115, "grad_norm": 0.474609375, "learning_rate": 0.0003614426984990559, "loss": 5.1953, "step": 753000 }, { "epoch": 4.8228629948475055, "grad_norm": 0.458984375, "learning_rate": 0.00036141709604122, "loss": 5.2064, "step": 753500 }, { "epoch": 4.826063302076999, "grad_norm": 0.44140625, "learning_rate": 0.00036139149358338404, "loss": 5.1947, "step": 754000 }, { "epoch": 4.829263609306493, "grad_norm": 0.48828125, "learning_rate": 0.0003613658911255481, "loss": 5.191, "step": 754500 }, { "epoch": 4.832463916535987, "grad_norm": 0.46875, "learning_rate": 0.0003613402886677121, "loss": 5.1949, "step": 755000 }, { "epoch": 4.835664223765481, "grad_norm": 0.498046875, "learning_rate": 0.0003613146862098762, "loss": 5.1965, "step": 755500 }, { "epoch": 4.838864530994975, "grad_norm": 0.462890625, "learning_rate": 0.00036128908375204023, "loss": 5.1932, "step": 756000 }, { "epoch": 4.84206483822447, "grad_norm": 0.45703125, "learning_rate": 0.00036126348129420427, "loss": 5.1951, "step": 756500 }, { "epoch": 4.845265145453964, "grad_norm": 0.48828125, "learning_rate": 0.0003612378788363683, "loss": 5.1925, "step": 757000 }, { "epoch": 4.848465452683458, "grad_norm": 0.4765625, "learning_rate": 0.00036121227637853233, "loss": 5.1934, "step": 757500 }, { "epoch": 4.851665759912952, "grad_norm": 0.478515625, "learning_rate": 0.00036118667392069637, "loss": 5.1932, "step": 758000 }, { "epoch": 4.854866067142446, "grad_norm": 0.53125, "learning_rate": 0.00036116107146286046, "loss": 5.1952, "step": 758500 }, { "epoch": 4.85806637437194, "grad_norm": 0.515625, "learning_rate": 0.0003611354690050245, "loss": 5.1937, "step": 759000 }, { "epoch": 4.861266681601434, "grad_norm": 0.462890625, "learning_rate": 0.0003611098665471885, "loss": 5.1867, "step": 759500 }, { "epoch": 4.8644669888309275, "grad_norm": 0.48046875, "learning_rate": 0.00036108426408935256, "loss": 5.1896, "step": 760000 }, { "epoch": 4.8676672960604215, "grad_norm": 0.4453125, "learning_rate": 0.00036105866163151665, "loss": 5.1969, "step": 760500 }, { "epoch": 4.870867603289916, "grad_norm": 0.466796875, "learning_rate": 0.00036103305917368074, "loss": 5.1907, "step": 761000 }, { "epoch": 4.87406791051941, "grad_norm": 0.546875, "learning_rate": 0.00036100745671584477, "loss": 5.1907, "step": 761500 }, { "epoch": 4.877268217748904, "grad_norm": 0.7734375, "learning_rate": 0.0003609818542580088, "loss": 5.1944, "step": 762000 }, { "epoch": 4.880468524978398, "grad_norm": 0.54296875, "learning_rate": 0.00036095625180017284, "loss": 5.1974, "step": 762500 }, { "epoch": 4.883668832207892, "grad_norm": 0.478515625, "learning_rate": 0.0003609306493423369, "loss": 5.1938, "step": 763000 }, { "epoch": 4.886869139437386, "grad_norm": 0.462890625, "learning_rate": 0.0003609050468845009, "loss": 5.1975, "step": 763500 }, { "epoch": 4.89006944666688, "grad_norm": 0.52734375, "learning_rate": 0.000360879444426665, "loss": 5.1969, "step": 764000 }, { "epoch": 4.893269753896374, "grad_norm": 0.50390625, "learning_rate": 0.00036085384196882903, "loss": 5.1967, "step": 764500 }, { "epoch": 4.896470061125868, "grad_norm": 0.466796875, "learning_rate": 0.00036082823951099306, "loss": 5.1969, "step": 765000 }, { "epoch": 4.899670368355362, "grad_norm": 0.46484375, "learning_rate": 0.0003608026370531571, "loss": 5.1924, "step": 765500 }, { "epoch": 4.902870675584856, "grad_norm": 0.4609375, "learning_rate": 0.00036077703459532113, "loss": 5.1943, "step": 766000 }, { "epoch": 4.9060709828143505, "grad_norm": 0.466796875, "learning_rate": 0.0003607514321374852, "loss": 5.1916, "step": 766500 }, { "epoch": 4.909271290043844, "grad_norm": 0.466796875, "learning_rate": 0.00036072582967964926, "loss": 5.199, "step": 767000 }, { "epoch": 4.912471597273338, "grad_norm": 0.498046875, "learning_rate": 0.0003607002272218133, "loss": 5.1961, "step": 767500 }, { "epoch": 4.915671904502832, "grad_norm": 0.490234375, "learning_rate": 0.0003606746247639774, "loss": 5.196, "step": 768000 }, { "epoch": 4.918872211732326, "grad_norm": 0.5078125, "learning_rate": 0.0003606490223061414, "loss": 5.1949, "step": 768500 }, { "epoch": 4.92207251896182, "grad_norm": 0.5, "learning_rate": 0.00036062341984830545, "loss": 5.19, "step": 769000 }, { "epoch": 4.925272826191314, "grad_norm": 0.4609375, "learning_rate": 0.00036059781739046954, "loss": 5.1954, "step": 769500 }, { "epoch": 4.928473133420808, "grad_norm": 0.48828125, "learning_rate": 0.00036057221493263357, "loss": 5.201, "step": 770000 }, { "epoch": 4.931673440650302, "grad_norm": 0.466796875, "learning_rate": 0.0003605466124747976, "loss": 5.1932, "step": 770500 }, { "epoch": 4.934873747879797, "grad_norm": 0.50390625, "learning_rate": 0.00036052101001696164, "loss": 5.1932, "step": 771000 }, { "epoch": 4.938074055109291, "grad_norm": 0.470703125, "learning_rate": 0.00036049540755912567, "loss": 5.1997, "step": 771500 }, { "epoch": 4.941274362338785, "grad_norm": 0.451171875, "learning_rate": 0.00036046980510128976, "loss": 5.1931, "step": 772000 }, { "epoch": 4.944474669568279, "grad_norm": 0.51171875, "learning_rate": 0.0003604442026434538, "loss": 5.1964, "step": 772500 }, { "epoch": 4.947674976797773, "grad_norm": 0.494140625, "learning_rate": 0.00036041860018561783, "loss": 5.1998, "step": 773000 }, { "epoch": 4.9508752840272665, "grad_norm": 0.451171875, "learning_rate": 0.00036039299772778186, "loss": 5.1968, "step": 773500 }, { "epoch": 4.9540755912567604, "grad_norm": 0.60546875, "learning_rate": 0.0003603673952699459, "loss": 5.1856, "step": 774000 }, { "epoch": 4.957275898486254, "grad_norm": 0.486328125, "learning_rate": 0.00036034179281211, "loss": 5.1989, "step": 774500 }, { "epoch": 4.960476205715748, "grad_norm": 0.49609375, "learning_rate": 0.000360316190354274, "loss": 5.1884, "step": 775000 }, { "epoch": 4.963676512945243, "grad_norm": 0.474609375, "learning_rate": 0.00036029058789643806, "loss": 5.1902, "step": 775500 }, { "epoch": 4.966876820174737, "grad_norm": 0.490234375, "learning_rate": 0.00036026498543860214, "loss": 5.1938, "step": 776000 }, { "epoch": 4.970077127404231, "grad_norm": 0.4609375, "learning_rate": 0.0003602393829807662, "loss": 5.1943, "step": 776500 }, { "epoch": 4.973277434633725, "grad_norm": 0.478515625, "learning_rate": 0.0003602137805229302, "loss": 5.1974, "step": 777000 }, { "epoch": 4.976477741863219, "grad_norm": 0.474609375, "learning_rate": 0.0003601881780650943, "loss": 5.1969, "step": 777500 }, { "epoch": 4.979678049092713, "grad_norm": 0.482421875, "learning_rate": 0.00036016257560725833, "loss": 5.1962, "step": 778000 }, { "epoch": 4.982878356322207, "grad_norm": 0.49609375, "learning_rate": 0.00036013697314942237, "loss": 5.1981, "step": 778500 }, { "epoch": 4.986078663551701, "grad_norm": 0.57421875, "learning_rate": 0.0003601113706915864, "loss": 5.2013, "step": 779000 }, { "epoch": 4.989278970781195, "grad_norm": 0.51171875, "learning_rate": 0.00036008576823375044, "loss": 5.1841, "step": 779500 }, { "epoch": 4.9924792780106895, "grad_norm": 0.4375, "learning_rate": 0.0003600601657759145, "loss": 5.194, "step": 780000 }, { "epoch": 4.995679585240183, "grad_norm": 0.462890625, "learning_rate": 0.00036003456331807856, "loss": 5.1985, "step": 780500 }, { "epoch": 4.998879892469677, "grad_norm": 0.48828125, "learning_rate": 0.0003600089608602426, "loss": 5.1993, "step": 781000 }, { "epoch": 5.0, "eval_loss": 5.186727523803711, "eval_runtime": 1.1486, "eval_samples_per_second": 870.654, "eval_steps_per_second": 13.93, "step": 781175 }, { "epoch": 5.002080199699171, "grad_norm": 0.455078125, "learning_rate": 0.00035998335840240663, "loss": 5.196, "step": 781500 }, { "epoch": 5.005280506928665, "grad_norm": 0.50390625, "learning_rate": 0.00035995775594457066, "loss": 5.1908, "step": 782000 }, { "epoch": 5.008480814158159, "grad_norm": 0.447265625, "learning_rate": 0.00035993215348673475, "loss": 5.1907, "step": 782500 }, { "epoch": 5.011681121387653, "grad_norm": 0.5078125, "learning_rate": 0.0003599065510288988, "loss": 5.1877, "step": 783000 }, { "epoch": 5.014881428617147, "grad_norm": 0.486328125, "learning_rate": 0.0003598809485710629, "loss": 5.1926, "step": 783500 }, { "epoch": 5.018081735846641, "grad_norm": 0.5078125, "learning_rate": 0.0003598553461132269, "loss": 5.1938, "step": 784000 }, { "epoch": 5.021282043076135, "grad_norm": 0.4921875, "learning_rate": 0.00035982974365539094, "loss": 5.1873, "step": 784500 }, { "epoch": 5.02448235030563, "grad_norm": 0.4765625, "learning_rate": 0.000359804141197555, "loss": 5.1941, "step": 785000 }, { "epoch": 5.027682657535124, "grad_norm": 0.48046875, "learning_rate": 0.00035977853873971907, "loss": 5.1923, "step": 785500 }, { "epoch": 5.030882964764618, "grad_norm": 0.4921875, "learning_rate": 0.0003597529362818831, "loss": 5.1884, "step": 786000 }, { "epoch": 5.0340832719941115, "grad_norm": 0.4453125, "learning_rate": 0.00035972733382404713, "loss": 5.1959, "step": 786500 }, { "epoch": 5.0372835792236055, "grad_norm": 0.451171875, "learning_rate": 0.00035970173136621117, "loss": 5.1889, "step": 787000 }, { "epoch": 5.040483886453099, "grad_norm": 0.54296875, "learning_rate": 0.0003596761289083752, "loss": 5.195, "step": 787500 }, { "epoch": 5.043684193682593, "grad_norm": 0.58203125, "learning_rate": 0.00035965052645053924, "loss": 5.1857, "step": 788000 }, { "epoch": 5.046884500912087, "grad_norm": 0.51171875, "learning_rate": 0.0003596249239927033, "loss": 5.1941, "step": 788500 }, { "epoch": 5.050084808141581, "grad_norm": 0.4765625, "learning_rate": 0.00035959932153486736, "loss": 5.1934, "step": 789000 }, { "epoch": 5.053285115371076, "grad_norm": 0.5625, "learning_rate": 0.0003595737190770314, "loss": 5.1901, "step": 789500 }, { "epoch": 5.05648542260057, "grad_norm": 0.4921875, "learning_rate": 0.00035954811661919543, "loss": 5.1927, "step": 790000 }, { "epoch": 5.059685729830064, "grad_norm": 0.474609375, "learning_rate": 0.0003595225141613595, "loss": 5.1878, "step": 790500 }, { "epoch": 5.062886037059558, "grad_norm": 0.443359375, "learning_rate": 0.00035949691170352355, "loss": 5.1916, "step": 791000 }, { "epoch": 5.066086344289052, "grad_norm": 0.478515625, "learning_rate": 0.00035947130924568764, "loss": 5.1882, "step": 791500 }, { "epoch": 5.069286651518546, "grad_norm": 0.5234375, "learning_rate": 0.0003594457067878517, "loss": 5.1905, "step": 792000 }, { "epoch": 5.07248695874804, "grad_norm": 0.484375, "learning_rate": 0.0003594201043300157, "loss": 5.1944, "step": 792500 }, { "epoch": 5.075687265977534, "grad_norm": 0.54296875, "learning_rate": 0.00035939450187217974, "loss": 5.1941, "step": 793000 }, { "epoch": 5.0788875732070276, "grad_norm": 0.54296875, "learning_rate": 0.0003593688994143438, "loss": 5.1883, "step": 793500 }, { "epoch": 5.0820878804365215, "grad_norm": 0.435546875, "learning_rate": 0.00035934329695650786, "loss": 5.1958, "step": 794000 }, { "epoch": 5.085288187666016, "grad_norm": 0.4765625, "learning_rate": 0.0003593176944986719, "loss": 5.1933, "step": 794500 }, { "epoch": 5.08848849489551, "grad_norm": 0.458984375, "learning_rate": 0.00035929209204083593, "loss": 5.1934, "step": 795000 }, { "epoch": 5.091688802125004, "grad_norm": 0.466796875, "learning_rate": 0.00035926648958299997, "loss": 5.187, "step": 795500 }, { "epoch": 5.094889109354498, "grad_norm": 0.486328125, "learning_rate": 0.000359240887125164, "loss": 5.199, "step": 796000 }, { "epoch": 5.098089416583992, "grad_norm": 0.546875, "learning_rate": 0.0003592152846673281, "loss": 5.186, "step": 796500 }, { "epoch": 5.101289723813486, "grad_norm": 0.462890625, "learning_rate": 0.0003591896822094921, "loss": 5.1892, "step": 797000 }, { "epoch": 5.10449003104298, "grad_norm": 0.515625, "learning_rate": 0.00035916407975165616, "loss": 5.1931, "step": 797500 }, { "epoch": 5.107690338272474, "grad_norm": 0.478515625, "learning_rate": 0.0003591384772938202, "loss": 5.1888, "step": 798000 }, { "epoch": 5.110890645501968, "grad_norm": 0.48828125, "learning_rate": 0.0003591128748359843, "loss": 5.1965, "step": 798500 }, { "epoch": 5.114090952731463, "grad_norm": 0.57421875, "learning_rate": 0.0003590872723781483, "loss": 5.1959, "step": 799000 }, { "epoch": 5.117291259960957, "grad_norm": 0.515625, "learning_rate": 0.0003590616699203124, "loss": 5.1928, "step": 799500 }, { "epoch": 5.1204915671904505, "grad_norm": 0.490234375, "learning_rate": 0.00035903606746247644, "loss": 5.1896, "step": 800000 }, { "epoch": 5.1236918744199444, "grad_norm": 0.5234375, "learning_rate": 0.00035901046500464047, "loss": 5.1934, "step": 800500 }, { "epoch": 5.126892181649438, "grad_norm": 0.478515625, "learning_rate": 0.0003589848625468045, "loss": 5.1969, "step": 801000 }, { "epoch": 5.130092488878932, "grad_norm": 0.498046875, "learning_rate": 0.00035895926008896854, "loss": 5.1986, "step": 801500 }, { "epoch": 5.133292796108426, "grad_norm": 0.51171875, "learning_rate": 0.00035893365763113263, "loss": 5.1884, "step": 802000 }, { "epoch": 5.13649310333792, "grad_norm": 0.52734375, "learning_rate": 0.00035890805517329666, "loss": 5.195, "step": 802500 }, { "epoch": 5.139693410567414, "grad_norm": 0.48828125, "learning_rate": 0.0003588824527154607, "loss": 5.1907, "step": 803000 }, { "epoch": 5.142893717796909, "grad_norm": 0.458984375, "learning_rate": 0.00035885685025762473, "loss": 5.1878, "step": 803500 }, { "epoch": 5.146094025026403, "grad_norm": 0.51953125, "learning_rate": 0.00035883124779978877, "loss": 5.1876, "step": 804000 }, { "epoch": 5.149294332255897, "grad_norm": 0.4765625, "learning_rate": 0.0003588056453419528, "loss": 5.19, "step": 804500 }, { "epoch": 5.152494639485391, "grad_norm": 0.4765625, "learning_rate": 0.0003587800428841169, "loss": 5.196, "step": 805000 }, { "epoch": 5.155694946714885, "grad_norm": 0.52734375, "learning_rate": 0.0003587544404262809, "loss": 5.1937, "step": 805500 }, { "epoch": 5.158895253944379, "grad_norm": 0.482421875, "learning_rate": 0.000358728837968445, "loss": 5.191, "step": 806000 }, { "epoch": 5.162095561173873, "grad_norm": 0.455078125, "learning_rate": 0.00035870323551060905, "loss": 5.191, "step": 806500 }, { "epoch": 5.1652958684033665, "grad_norm": 0.5, "learning_rate": 0.0003586776330527731, "loss": 5.1922, "step": 807000 }, { "epoch": 5.1684961756328605, "grad_norm": 0.54296875, "learning_rate": 0.00035865203059493717, "loss": 5.1869, "step": 807500 }, { "epoch": 5.171696482862354, "grad_norm": 0.5078125, "learning_rate": 0.0003586264281371012, "loss": 5.1968, "step": 808000 }, { "epoch": 5.174896790091849, "grad_norm": 0.490234375, "learning_rate": 0.00035860082567926524, "loss": 5.189, "step": 808500 }, { "epoch": 5.178097097321343, "grad_norm": 0.44140625, "learning_rate": 0.00035857522322142927, "loss": 5.1945, "step": 809000 }, { "epoch": 5.181297404550837, "grad_norm": 0.4921875, "learning_rate": 0.0003585496207635933, "loss": 5.1938, "step": 809500 }, { "epoch": 5.184497711780331, "grad_norm": 0.5390625, "learning_rate": 0.0003585240183057574, "loss": 5.1902, "step": 810000 }, { "epoch": 5.187698019009825, "grad_norm": 0.5, "learning_rate": 0.00035849841584792143, "loss": 5.1895, "step": 810500 }, { "epoch": 5.190898326239319, "grad_norm": 0.490234375, "learning_rate": 0.00035847281339008546, "loss": 5.1867, "step": 811000 }, { "epoch": 5.194098633468813, "grad_norm": 0.45703125, "learning_rate": 0.0003584472109322495, "loss": 5.1907, "step": 811500 }, { "epoch": 5.197298940698307, "grad_norm": 0.56640625, "learning_rate": 0.00035842160847441353, "loss": 5.1907, "step": 812000 }, { "epoch": 5.200499247927801, "grad_norm": 0.474609375, "learning_rate": 0.00035839600601657756, "loss": 5.192, "step": 812500 }, { "epoch": 5.203699555157295, "grad_norm": 0.4765625, "learning_rate": 0.00035837040355874165, "loss": 5.2016, "step": 813000 }, { "epoch": 5.2068998623867895, "grad_norm": 0.443359375, "learning_rate": 0.0003583448011009057, "loss": 5.193, "step": 813500 }, { "epoch": 5.210100169616283, "grad_norm": 0.51171875, "learning_rate": 0.0003583191986430698, "loss": 5.1969, "step": 814000 }, { "epoch": 5.213300476845777, "grad_norm": 0.51953125, "learning_rate": 0.0003582935961852338, "loss": 5.193, "step": 814500 }, { "epoch": 5.216500784075271, "grad_norm": 0.51171875, "learning_rate": 0.00035826799372739784, "loss": 5.1904, "step": 815000 }, { "epoch": 5.219701091304765, "grad_norm": 0.498046875, "learning_rate": 0.00035824239126956193, "loss": 5.1898, "step": 815500 }, { "epoch": 5.222901398534259, "grad_norm": 0.48046875, "learning_rate": 0.00035821678881172597, "loss": 5.1947, "step": 816000 }, { "epoch": 5.226101705763753, "grad_norm": 0.52734375, "learning_rate": 0.00035819118635389, "loss": 5.201, "step": 816500 }, { "epoch": 5.229302012993247, "grad_norm": 0.494140625, "learning_rate": 0.00035816558389605404, "loss": 5.195, "step": 817000 }, { "epoch": 5.232502320222741, "grad_norm": 0.484375, "learning_rate": 0.00035813998143821807, "loss": 5.1934, "step": 817500 }, { "epoch": 5.235702627452236, "grad_norm": 0.51171875, "learning_rate": 0.0003581143789803821, "loss": 5.1882, "step": 818000 }, { "epoch": 5.23890293468173, "grad_norm": 0.5, "learning_rate": 0.0003580887765225462, "loss": 5.1984, "step": 818500 }, { "epoch": 5.242103241911224, "grad_norm": 0.4921875, "learning_rate": 0.0003580631740647102, "loss": 5.1947, "step": 819000 }, { "epoch": 5.245303549140718, "grad_norm": 0.44921875, "learning_rate": 0.00035803757160687426, "loss": 5.1932, "step": 819500 }, { "epoch": 5.2485038563702116, "grad_norm": 0.5, "learning_rate": 0.0003580119691490383, "loss": 5.1865, "step": 820000 }, { "epoch": 5.2517041635997055, "grad_norm": 0.53515625, "learning_rate": 0.0003579863666912024, "loss": 5.1919, "step": 820500 }, { "epoch": 5.254904470829199, "grad_norm": 0.515625, "learning_rate": 0.0003579607642333664, "loss": 5.196, "step": 821000 }, { "epoch": 5.258104778058693, "grad_norm": 0.51953125, "learning_rate": 0.0003579351617755305, "loss": 5.1858, "step": 821500 }, { "epoch": 5.261305085288187, "grad_norm": 0.515625, "learning_rate": 0.00035790955931769454, "loss": 5.1927, "step": 822000 }, { "epoch": 5.264505392517682, "grad_norm": 0.51953125, "learning_rate": 0.0003578839568598586, "loss": 5.1908, "step": 822500 }, { "epoch": 5.267705699747176, "grad_norm": 0.5390625, "learning_rate": 0.0003578583544020226, "loss": 5.1927, "step": 823000 }, { "epoch": 5.27090600697667, "grad_norm": 0.52734375, "learning_rate": 0.00035783275194418664, "loss": 5.1974, "step": 823500 }, { "epoch": 5.274106314206164, "grad_norm": 0.578125, "learning_rate": 0.00035780714948635073, "loss": 5.1912, "step": 824000 }, { "epoch": 5.277306621435658, "grad_norm": 0.48828125, "learning_rate": 0.00035778154702851477, "loss": 5.1906, "step": 824500 }, { "epoch": 5.280506928665152, "grad_norm": 0.5078125, "learning_rate": 0.0003577559445706788, "loss": 5.1898, "step": 825000 }, { "epoch": 5.283707235894646, "grad_norm": 0.50390625, "learning_rate": 0.00035773034211284283, "loss": 5.1904, "step": 825500 }, { "epoch": 5.28690754312414, "grad_norm": 0.53515625, "learning_rate": 0.00035770473965500687, "loss": 5.1934, "step": 826000 }, { "epoch": 5.290107850353634, "grad_norm": 0.53125, "learning_rate": 0.00035767913719717096, "loss": 5.1942, "step": 826500 }, { "epoch": 5.2933081575831284, "grad_norm": 0.51171875, "learning_rate": 0.000357653534739335, "loss": 5.1926, "step": 827000 }, { "epoch": 5.296508464812622, "grad_norm": 0.5078125, "learning_rate": 0.000357627932281499, "loss": 5.1936, "step": 827500 }, { "epoch": 5.299708772042116, "grad_norm": 0.47265625, "learning_rate": 0.00035760232982366306, "loss": 5.1895, "step": 828000 }, { "epoch": 5.30290907927161, "grad_norm": 0.55859375, "learning_rate": 0.00035757672736582715, "loss": 5.1925, "step": 828500 }, { "epoch": 5.306109386501104, "grad_norm": 0.5078125, "learning_rate": 0.0003575511249079912, "loss": 5.196, "step": 829000 }, { "epoch": 5.309309693730598, "grad_norm": 0.5546875, "learning_rate": 0.00035752552245015527, "loss": 5.1964, "step": 829500 }, { "epoch": 5.312510000960092, "grad_norm": 0.474609375, "learning_rate": 0.0003574999199923193, "loss": 5.1984, "step": 830000 }, { "epoch": 5.315710308189586, "grad_norm": 0.490234375, "learning_rate": 0.00035747431753448334, "loss": 5.1861, "step": 830500 }, { "epoch": 5.31891061541908, "grad_norm": 0.53515625, "learning_rate": 0.0003574487150766474, "loss": 5.1971, "step": 831000 }, { "epoch": 5.322110922648574, "grad_norm": 0.453125, "learning_rate": 0.0003574231126188114, "loss": 5.2024, "step": 831500 }, { "epoch": 5.325311229878069, "grad_norm": 0.498046875, "learning_rate": 0.0003573975101609755, "loss": 5.194, "step": 832000 }, { "epoch": 5.328511537107563, "grad_norm": 0.48828125, "learning_rate": 0.00035737190770313953, "loss": 5.1953, "step": 832500 }, { "epoch": 5.331711844337057, "grad_norm": 0.53515625, "learning_rate": 0.00035734630524530357, "loss": 5.192, "step": 833000 }, { "epoch": 5.3349121515665505, "grad_norm": 0.5, "learning_rate": 0.0003573207027874676, "loss": 5.1998, "step": 833500 }, { "epoch": 5.3381124587960445, "grad_norm": 0.55859375, "learning_rate": 0.00035729510032963163, "loss": 5.1896, "step": 834000 }, { "epoch": 5.341312766025538, "grad_norm": 0.53515625, "learning_rate": 0.00035726949787179567, "loss": 5.1879, "step": 834500 }, { "epoch": 5.344513073255032, "grad_norm": 0.42578125, "learning_rate": 0.00035724389541395976, "loss": 5.194, "step": 835000 }, { "epoch": 5.347713380484526, "grad_norm": 0.52734375, "learning_rate": 0.0003572182929561238, "loss": 5.193, "step": 835500 }, { "epoch": 5.35091368771402, "grad_norm": 0.482421875, "learning_rate": 0.0003571926904982878, "loss": 5.1903, "step": 836000 }, { "epoch": 5.354113994943514, "grad_norm": 0.51171875, "learning_rate": 0.0003571670880404519, "loss": 5.1892, "step": 836500 }, { "epoch": 5.357314302173009, "grad_norm": 0.48046875, "learning_rate": 0.00035714148558261595, "loss": 5.2005, "step": 837000 }, { "epoch": 5.360514609402503, "grad_norm": 0.515625, "learning_rate": 0.00035711588312478004, "loss": 5.1901, "step": 837500 }, { "epoch": 5.363714916631997, "grad_norm": 0.53125, "learning_rate": 0.00035709028066694407, "loss": 5.1951, "step": 838000 }, { "epoch": 5.366915223861491, "grad_norm": 0.5546875, "learning_rate": 0.0003570646782091081, "loss": 5.189, "step": 838500 }, { "epoch": 5.370115531090985, "grad_norm": 0.466796875, "learning_rate": 0.00035703907575127214, "loss": 5.1957, "step": 839000 }, { "epoch": 5.373315838320479, "grad_norm": 0.51953125, "learning_rate": 0.0003570134732934362, "loss": 5.1892, "step": 839500 }, { "epoch": 5.376516145549973, "grad_norm": 0.4296875, "learning_rate": 0.0003569878708356002, "loss": 5.192, "step": 840000 }, { "epoch": 5.3797164527794665, "grad_norm": 0.48828125, "learning_rate": 0.0003569622683777643, "loss": 5.1936, "step": 840500 }, { "epoch": 5.3829167600089605, "grad_norm": 0.53125, "learning_rate": 0.00035693666591992833, "loss": 5.1927, "step": 841000 }, { "epoch": 5.386117067238455, "grad_norm": 0.51171875, "learning_rate": 0.00035691106346209236, "loss": 5.1942, "step": 841500 }, { "epoch": 5.389317374467949, "grad_norm": 0.439453125, "learning_rate": 0.0003568854610042564, "loss": 5.1916, "step": 842000 }, { "epoch": 5.392517681697443, "grad_norm": 0.5390625, "learning_rate": 0.00035685985854642043, "loss": 5.1909, "step": 842500 }, { "epoch": 5.395717988926937, "grad_norm": 0.478515625, "learning_rate": 0.0003568342560885845, "loss": 5.1895, "step": 843000 }, { "epoch": 5.398918296156431, "grad_norm": 0.498046875, "learning_rate": 0.00035680865363074856, "loss": 5.196, "step": 843500 }, { "epoch": 5.402118603385925, "grad_norm": 0.478515625, "learning_rate": 0.00035678305117291264, "loss": 5.1934, "step": 844000 }, { "epoch": 5.405318910615419, "grad_norm": 0.5, "learning_rate": 0.0003567574487150767, "loss": 5.193, "step": 844500 }, { "epoch": 5.408519217844913, "grad_norm": 0.474609375, "learning_rate": 0.0003567318462572407, "loss": 5.1885, "step": 845000 }, { "epoch": 5.411719525074407, "grad_norm": 0.46875, "learning_rate": 0.00035670624379940475, "loss": 5.191, "step": 845500 }, { "epoch": 5.414919832303902, "grad_norm": 0.5234375, "learning_rate": 0.00035668064134156883, "loss": 5.1924, "step": 846000 }, { "epoch": 5.4181201395333956, "grad_norm": 0.466796875, "learning_rate": 0.00035665503888373287, "loss": 5.1941, "step": 846500 }, { "epoch": 5.4213204467628895, "grad_norm": 0.4609375, "learning_rate": 0.0003566294364258969, "loss": 5.1947, "step": 847000 }, { "epoch": 5.424520753992383, "grad_norm": 0.5, "learning_rate": 0.00035660383396806094, "loss": 5.1892, "step": 847500 }, { "epoch": 5.427721061221877, "grad_norm": 0.54296875, "learning_rate": 0.00035657823151022497, "loss": 5.1924, "step": 848000 }, { "epoch": 5.430921368451371, "grad_norm": 0.55078125, "learning_rate": 0.00035655262905238906, "loss": 5.1932, "step": 848500 }, { "epoch": 5.434121675680865, "grad_norm": 0.498046875, "learning_rate": 0.0003565270265945531, "loss": 5.1959, "step": 849000 }, { "epoch": 5.437321982910359, "grad_norm": 0.54296875, "learning_rate": 0.00035650142413671713, "loss": 5.1937, "step": 849500 }, { "epoch": 5.440522290139853, "grad_norm": 0.54296875, "learning_rate": 0.00035647582167888116, "loss": 5.1952, "step": 850000 }, { "epoch": 5.443722597369347, "grad_norm": 0.48828125, "learning_rate": 0.0003564502192210452, "loss": 5.1862, "step": 850500 }, { "epoch": 5.446922904598842, "grad_norm": 0.5234375, "learning_rate": 0.0003564246167632093, "loss": 5.1865, "step": 851000 }, { "epoch": 5.450123211828336, "grad_norm": 0.52734375, "learning_rate": 0.0003563990143053733, "loss": 5.1919, "step": 851500 }, { "epoch": 5.45332351905783, "grad_norm": 0.57421875, "learning_rate": 0.0003563734118475374, "loss": 5.1846, "step": 852000 }, { "epoch": 5.456523826287324, "grad_norm": 0.51171875, "learning_rate": 0.00035634780938970144, "loss": 5.1946, "step": 852500 }, { "epoch": 5.459724133516818, "grad_norm": 0.57421875, "learning_rate": 0.0003563222069318655, "loss": 5.188, "step": 853000 }, { "epoch": 5.462924440746312, "grad_norm": 0.51953125, "learning_rate": 0.0003562966044740295, "loss": 5.189, "step": 853500 }, { "epoch": 5.4661247479758055, "grad_norm": 0.51953125, "learning_rate": 0.0003562710020161936, "loss": 5.1924, "step": 854000 }, { "epoch": 5.469325055205299, "grad_norm": 0.486328125, "learning_rate": 0.00035624539955835763, "loss": 5.1974, "step": 854500 }, { "epoch": 5.472525362434793, "grad_norm": 0.546875, "learning_rate": 0.00035621979710052167, "loss": 5.19, "step": 855000 }, { "epoch": 5.475725669664287, "grad_norm": 0.474609375, "learning_rate": 0.0003561941946426857, "loss": 5.1927, "step": 855500 }, { "epoch": 5.478925976893782, "grad_norm": 0.51171875, "learning_rate": 0.00035616859218484974, "loss": 5.1937, "step": 856000 }, { "epoch": 5.482126284123276, "grad_norm": 0.51953125, "learning_rate": 0.0003561429897270138, "loss": 5.1928, "step": 856500 }, { "epoch": 5.48532659135277, "grad_norm": 0.51171875, "learning_rate": 0.00035611738726917786, "loss": 5.1935, "step": 857000 }, { "epoch": 5.488526898582264, "grad_norm": 0.4765625, "learning_rate": 0.0003560917848113419, "loss": 5.1932, "step": 857500 }, { "epoch": 5.491727205811758, "grad_norm": 0.51171875, "learning_rate": 0.00035606618235350593, "loss": 5.1861, "step": 858000 }, { "epoch": 5.494927513041252, "grad_norm": 0.52734375, "learning_rate": 0.00035604057989567, "loss": 5.1931, "step": 858500 }, { "epoch": 5.498127820270746, "grad_norm": 0.48046875, "learning_rate": 0.00035601497743783405, "loss": 5.1891, "step": 859000 }, { "epoch": 5.50132812750024, "grad_norm": 0.470703125, "learning_rate": 0.00035598937497999814, "loss": 5.1924, "step": 859500 }, { "epoch": 5.504528434729734, "grad_norm": 0.515625, "learning_rate": 0.0003559637725221622, "loss": 5.1958, "step": 860000 }, { "epoch": 5.5077287419592285, "grad_norm": 0.47265625, "learning_rate": 0.0003559381700643262, "loss": 5.1897, "step": 860500 }, { "epoch": 5.510929049188722, "grad_norm": 0.515625, "learning_rate": 0.00035591256760649024, "loss": 5.1907, "step": 861000 }, { "epoch": 5.514129356418216, "grad_norm": 0.55859375, "learning_rate": 0.0003558869651486543, "loss": 5.1888, "step": 861500 }, { "epoch": 5.51732966364771, "grad_norm": 0.5234375, "learning_rate": 0.00035586136269081836, "loss": 5.1928, "step": 862000 }, { "epoch": 5.520529970877204, "grad_norm": 0.53515625, "learning_rate": 0.0003558357602329824, "loss": 5.1948, "step": 862500 }, { "epoch": 5.523730278106698, "grad_norm": 0.515625, "learning_rate": 0.00035581015777514643, "loss": 5.1949, "step": 863000 }, { "epoch": 5.526930585336192, "grad_norm": 0.5390625, "learning_rate": 0.00035578455531731047, "loss": 5.1971, "step": 863500 }, { "epoch": 5.530130892565686, "grad_norm": 0.5234375, "learning_rate": 0.0003557589528594745, "loss": 5.189, "step": 864000 }, { "epoch": 5.53333119979518, "grad_norm": 0.5078125, "learning_rate": 0.00035573335040163854, "loss": 5.1916, "step": 864500 }, { "epoch": 5.536531507024675, "grad_norm": 0.546875, "learning_rate": 0.0003557077479438026, "loss": 5.1895, "step": 865000 }, { "epoch": 5.539731814254169, "grad_norm": 0.50390625, "learning_rate": 0.00035568214548596666, "loss": 5.1928, "step": 865500 }, { "epoch": 5.542932121483663, "grad_norm": 0.5390625, "learning_rate": 0.0003556565430281307, "loss": 5.1955, "step": 866000 }, { "epoch": 5.546132428713157, "grad_norm": 0.4765625, "learning_rate": 0.0003556309405702948, "loss": 5.1908, "step": 866500 }, { "epoch": 5.5493327359426505, "grad_norm": 0.50390625, "learning_rate": 0.0003556053381124588, "loss": 5.1921, "step": 867000 }, { "epoch": 5.5525330431721445, "grad_norm": 0.48046875, "learning_rate": 0.0003555797356546229, "loss": 5.187, "step": 867500 }, { "epoch": 5.555733350401638, "grad_norm": 0.466796875, "learning_rate": 0.00035555413319678694, "loss": 5.1901, "step": 868000 }, { "epoch": 5.558933657631132, "grad_norm": 0.5078125, "learning_rate": 0.00035552853073895097, "loss": 5.1981, "step": 868500 }, { "epoch": 5.562133964860626, "grad_norm": 0.55078125, "learning_rate": 0.000355502928281115, "loss": 5.1823, "step": 869000 }, { "epoch": 5.565334272090121, "grad_norm": 0.51171875, "learning_rate": 0.00035547732582327904, "loss": 5.1946, "step": 869500 }, { "epoch": 5.568534579319615, "grad_norm": 0.50390625, "learning_rate": 0.0003554517233654431, "loss": 5.1921, "step": 870000 }, { "epoch": 5.571734886549109, "grad_norm": 0.5, "learning_rate": 0.00035542612090760716, "loss": 5.1861, "step": 870500 }, { "epoch": 5.574935193778603, "grad_norm": 0.47265625, "learning_rate": 0.0003554005184497712, "loss": 5.1876, "step": 871000 }, { "epoch": 5.578135501008097, "grad_norm": 0.4765625, "learning_rate": 0.00035537491599193523, "loss": 5.1953, "step": 871500 }, { "epoch": 5.581335808237591, "grad_norm": 0.490234375, "learning_rate": 0.00035534931353409927, "loss": 5.1937, "step": 872000 }, { "epoch": 5.584536115467085, "grad_norm": 0.55859375, "learning_rate": 0.0003553237110762633, "loss": 5.1894, "step": 872500 }, { "epoch": 5.587736422696579, "grad_norm": 0.486328125, "learning_rate": 0.0003552981086184274, "loss": 5.1917, "step": 873000 }, { "epoch": 5.590936729926073, "grad_norm": 0.494140625, "learning_rate": 0.0003552725061605914, "loss": 5.1891, "step": 873500 }, { "epoch": 5.594137037155567, "grad_norm": 0.48046875, "learning_rate": 0.0003552469037027555, "loss": 5.1961, "step": 874000 }, { "epoch": 5.5973373443850605, "grad_norm": 0.52734375, "learning_rate": 0.00035522130124491955, "loss": 5.1954, "step": 874500 }, { "epoch": 5.600537651614555, "grad_norm": 0.48828125, "learning_rate": 0.0003551956987870836, "loss": 5.1918, "step": 875000 }, { "epoch": 5.603737958844049, "grad_norm": 0.515625, "learning_rate": 0.0003551700963292476, "loss": 5.1932, "step": 875500 }, { "epoch": 5.606938266073543, "grad_norm": 0.484375, "learning_rate": 0.0003551444938714117, "loss": 5.2006, "step": 876000 }, { "epoch": 5.610138573303037, "grad_norm": 0.4921875, "learning_rate": 0.00035511889141357574, "loss": 5.1887, "step": 876500 }, { "epoch": 5.613338880532531, "grad_norm": 0.52734375, "learning_rate": 0.00035509328895573977, "loss": 5.1883, "step": 877000 }, { "epoch": 5.616539187762025, "grad_norm": 0.494140625, "learning_rate": 0.0003550676864979038, "loss": 5.1887, "step": 877500 }, { "epoch": 5.619739494991519, "grad_norm": 0.466796875, "learning_rate": 0.00035504208404006784, "loss": 5.1867, "step": 878000 }, { "epoch": 5.622939802221013, "grad_norm": 0.52734375, "learning_rate": 0.00035501648158223193, "loss": 5.196, "step": 878500 }, { "epoch": 5.626140109450507, "grad_norm": 0.5546875, "learning_rate": 0.00035499087912439596, "loss": 5.1891, "step": 879000 }, { "epoch": 5.629340416680002, "grad_norm": 0.48828125, "learning_rate": 0.00035496527666656, "loss": 5.1935, "step": 879500 }, { "epoch": 5.632540723909496, "grad_norm": 0.515625, "learning_rate": 0.00035493967420872403, "loss": 5.1882, "step": 880000 }, { "epoch": 5.6357410311389895, "grad_norm": 0.5234375, "learning_rate": 0.00035491407175088807, "loss": 5.1948, "step": 880500 }, { "epoch": 5.638941338368483, "grad_norm": 0.462890625, "learning_rate": 0.00035488846929305215, "loss": 5.1945, "step": 881000 }, { "epoch": 5.642141645597977, "grad_norm": 0.5234375, "learning_rate": 0.0003548628668352162, "loss": 5.1885, "step": 881500 }, { "epoch": 5.645341952827471, "grad_norm": 0.4765625, "learning_rate": 0.0003548372643773803, "loss": 5.1935, "step": 882000 }, { "epoch": 5.648542260056965, "grad_norm": 0.515625, "learning_rate": 0.0003548116619195443, "loss": 5.1941, "step": 882500 }, { "epoch": 5.651742567286459, "grad_norm": 0.59765625, "learning_rate": 0.00035478605946170834, "loss": 5.1936, "step": 883000 }, { "epoch": 5.654942874515953, "grad_norm": 0.462890625, "learning_rate": 0.0003547604570038724, "loss": 5.1873, "step": 883500 }, { "epoch": 5.658143181745448, "grad_norm": 0.52734375, "learning_rate": 0.00035473485454603647, "loss": 5.1939, "step": 884000 }, { "epoch": 5.661343488974942, "grad_norm": 0.5078125, "learning_rate": 0.0003547092520882005, "loss": 5.1914, "step": 884500 }, { "epoch": 5.664543796204436, "grad_norm": 0.4921875, "learning_rate": 0.00035468364963036454, "loss": 5.1913, "step": 885000 }, { "epoch": 5.66774410343393, "grad_norm": 0.515625, "learning_rate": 0.00035465804717252857, "loss": 5.1865, "step": 885500 }, { "epoch": 5.670944410663424, "grad_norm": 0.5078125, "learning_rate": 0.0003546324447146926, "loss": 5.191, "step": 886000 }, { "epoch": 5.674144717892918, "grad_norm": 0.54296875, "learning_rate": 0.0003546068422568567, "loss": 5.1936, "step": 886500 }, { "epoch": 5.677345025122412, "grad_norm": 0.62109375, "learning_rate": 0.0003545812397990207, "loss": 5.1917, "step": 887000 }, { "epoch": 5.6805453323519055, "grad_norm": 0.53125, "learning_rate": 0.00035455563734118476, "loss": 5.1941, "step": 887500 }, { "epoch": 5.6837456395813994, "grad_norm": 0.48046875, "learning_rate": 0.0003545300348833488, "loss": 5.1957, "step": 888000 }, { "epoch": 5.686945946810894, "grad_norm": 0.51953125, "learning_rate": 0.00035450443242551283, "loss": 5.1946, "step": 888500 }, { "epoch": 5.690146254040388, "grad_norm": 0.50390625, "learning_rate": 0.0003544788299676769, "loss": 5.1872, "step": 889000 }, { "epoch": 5.693346561269882, "grad_norm": 0.48828125, "learning_rate": 0.00035445322750984095, "loss": 5.1903, "step": 889500 }, { "epoch": 5.696546868499376, "grad_norm": 0.54296875, "learning_rate": 0.00035442762505200504, "loss": 5.1979, "step": 890000 }, { "epoch": 5.69974717572887, "grad_norm": 0.515625, "learning_rate": 0.0003544020225941691, "loss": 5.1933, "step": 890500 }, { "epoch": 5.702947482958364, "grad_norm": 0.54296875, "learning_rate": 0.0003543764201363331, "loss": 5.1966, "step": 891000 }, { "epoch": 5.706147790187858, "grad_norm": 0.50390625, "learning_rate": 0.00035435081767849714, "loss": 5.1889, "step": 891500 }, { "epoch": 5.709348097417352, "grad_norm": 0.49609375, "learning_rate": 0.00035432521522066123, "loss": 5.1904, "step": 892000 }, { "epoch": 5.712548404646846, "grad_norm": 0.498046875, "learning_rate": 0.00035429961276282527, "loss": 5.1933, "step": 892500 }, { "epoch": 5.715748711876341, "grad_norm": 0.5078125, "learning_rate": 0.0003542740103049893, "loss": 5.1877, "step": 893000 }, { "epoch": 5.7189490191058345, "grad_norm": 0.58203125, "learning_rate": 0.00035424840784715333, "loss": 5.1965, "step": 893500 }, { "epoch": 5.7221493263353285, "grad_norm": 0.5234375, "learning_rate": 0.00035422280538931737, "loss": 5.1922, "step": 894000 }, { "epoch": 5.725349633564822, "grad_norm": 0.50390625, "learning_rate": 0.0003541972029314814, "loss": 5.1867, "step": 894500 }, { "epoch": 5.728549940794316, "grad_norm": 0.6171875, "learning_rate": 0.0003541716004736455, "loss": 5.1915, "step": 895000 }, { "epoch": 5.73175024802381, "grad_norm": 0.50390625, "learning_rate": 0.0003541459980158095, "loss": 5.1982, "step": 895500 }, { "epoch": 5.734950555253304, "grad_norm": 0.51953125, "learning_rate": 0.00035412039555797356, "loss": 5.1919, "step": 896000 }, { "epoch": 5.738150862482798, "grad_norm": 0.48828125, "learning_rate": 0.00035409479310013765, "loss": 5.1929, "step": 896500 }, { "epoch": 5.741351169712292, "grad_norm": 0.5390625, "learning_rate": 0.0003540691906423017, "loss": 5.1914, "step": 897000 }, { "epoch": 5.744551476941786, "grad_norm": 0.54296875, "learning_rate": 0.00035404358818446577, "loss": 5.1876, "step": 897500 }, { "epoch": 5.74775178417128, "grad_norm": 0.5, "learning_rate": 0.0003540179857266298, "loss": 5.1937, "step": 898000 }, { "epoch": 5.750952091400775, "grad_norm": 0.515625, "learning_rate": 0.00035399238326879384, "loss": 5.194, "step": 898500 }, { "epoch": 5.754152398630269, "grad_norm": 0.50390625, "learning_rate": 0.0003539667808109579, "loss": 5.1912, "step": 899000 }, { "epoch": 5.757352705859763, "grad_norm": 0.51171875, "learning_rate": 0.0003539411783531219, "loss": 5.1917, "step": 899500 }, { "epoch": 5.760553013089257, "grad_norm": 0.5390625, "learning_rate": 0.00035391557589528594, "loss": 5.1971, "step": 900000 }, { "epoch": 5.7637533203187505, "grad_norm": 0.458984375, "learning_rate": 0.00035388997343745003, "loss": 5.1915, "step": 900500 }, { "epoch": 5.7669536275482445, "grad_norm": 0.53125, "learning_rate": 0.00035386437097961407, "loss": 5.1934, "step": 901000 }, { "epoch": 5.770153934777738, "grad_norm": 0.462890625, "learning_rate": 0.0003538387685217781, "loss": 5.1906, "step": 901500 }, { "epoch": 5.773354242007232, "grad_norm": 0.482421875, "learning_rate": 0.00035381316606394213, "loss": 5.195, "step": 902000 }, { "epoch": 5.776554549236726, "grad_norm": 0.4921875, "learning_rate": 0.00035378756360610617, "loss": 5.1931, "step": 902500 }, { "epoch": 5.779754856466221, "grad_norm": 0.51171875, "learning_rate": 0.00035376196114827026, "loss": 5.1899, "step": 903000 }, { "epoch": 5.782955163695715, "grad_norm": 0.609375, "learning_rate": 0.0003537363586904343, "loss": 5.1867, "step": 903500 }, { "epoch": 5.786155470925209, "grad_norm": 0.52734375, "learning_rate": 0.0003537107562325983, "loss": 5.1938, "step": 904000 }, { "epoch": 5.789355778154703, "grad_norm": 0.48828125, "learning_rate": 0.0003536851537747624, "loss": 5.1991, "step": 904500 }, { "epoch": 5.792556085384197, "grad_norm": 0.51953125, "learning_rate": 0.00035365955131692645, "loss": 5.1889, "step": 905000 }, { "epoch": 5.795756392613691, "grad_norm": 0.4609375, "learning_rate": 0.0003536339488590905, "loss": 5.1888, "step": 905500 }, { "epoch": 5.798956699843185, "grad_norm": 0.51171875, "learning_rate": 0.00035360834640125457, "loss": 5.1912, "step": 906000 }, { "epoch": 5.802157007072679, "grad_norm": 0.6015625, "learning_rate": 0.0003535827439434186, "loss": 5.1871, "step": 906500 }, { "epoch": 5.805357314302173, "grad_norm": 0.5625, "learning_rate": 0.00035355714148558264, "loss": 5.198, "step": 907000 }, { "epoch": 5.808557621531667, "grad_norm": 0.58984375, "learning_rate": 0.0003535315390277467, "loss": 5.1929, "step": 907500 }, { "epoch": 5.811757928761161, "grad_norm": 0.5078125, "learning_rate": 0.0003535059365699107, "loss": 5.1934, "step": 908000 }, { "epoch": 5.814958235990655, "grad_norm": 0.498046875, "learning_rate": 0.0003534803341120748, "loss": 5.1958, "step": 908500 }, { "epoch": 5.818158543220149, "grad_norm": 0.55078125, "learning_rate": 0.00035345473165423883, "loss": 5.19, "step": 909000 }, { "epoch": 5.821358850449643, "grad_norm": 0.52734375, "learning_rate": 0.00035342912919640286, "loss": 5.1926, "step": 909500 }, { "epoch": 5.824559157679137, "grad_norm": 0.4765625, "learning_rate": 0.0003534035267385669, "loss": 5.196, "step": 910000 }, { "epoch": 5.827759464908631, "grad_norm": 0.46484375, "learning_rate": 0.00035337792428073093, "loss": 5.1921, "step": 910500 }, { "epoch": 5.830959772138125, "grad_norm": 0.486328125, "learning_rate": 0.00035335232182289497, "loss": 5.1961, "step": 911000 }, { "epoch": 5.834160079367619, "grad_norm": 0.50390625, "learning_rate": 0.00035332671936505906, "loss": 5.1909, "step": 911500 }, { "epoch": 5.837360386597114, "grad_norm": 0.6328125, "learning_rate": 0.00035330111690722314, "loss": 5.1908, "step": 912000 }, { "epoch": 5.840560693826608, "grad_norm": 0.51171875, "learning_rate": 0.0003532755144493872, "loss": 5.199, "step": 912500 }, { "epoch": 5.843761001056102, "grad_norm": 0.5390625, "learning_rate": 0.0003532499119915512, "loss": 5.1974, "step": 913000 }, { "epoch": 5.846961308285596, "grad_norm": 0.51953125, "learning_rate": 0.00035322430953371525, "loss": 5.1941, "step": 913500 }, { "epoch": 5.8501616155150895, "grad_norm": 0.5234375, "learning_rate": 0.00035319870707587934, "loss": 5.1948, "step": 914000 }, { "epoch": 5.8533619227445834, "grad_norm": 0.494140625, "learning_rate": 0.00035317310461804337, "loss": 5.1903, "step": 914500 }, { "epoch": 5.856562229974077, "grad_norm": 0.5078125, "learning_rate": 0.0003531475021602074, "loss": 5.1971, "step": 915000 }, { "epoch": 5.859762537203571, "grad_norm": 0.490234375, "learning_rate": 0.00035312189970237144, "loss": 5.1977, "step": 915500 }, { "epoch": 5.862962844433065, "grad_norm": 0.53515625, "learning_rate": 0.00035309629724453547, "loss": 5.1835, "step": 916000 }, { "epoch": 5.86616315166256, "grad_norm": 0.51171875, "learning_rate": 0.0003530706947866995, "loss": 5.1871, "step": 916500 }, { "epoch": 5.869363458892054, "grad_norm": 0.546875, "learning_rate": 0.0003530450923288636, "loss": 5.1921, "step": 917000 }, { "epoch": 5.872563766121548, "grad_norm": 0.48046875, "learning_rate": 0.00035301948987102763, "loss": 5.1888, "step": 917500 }, { "epoch": 5.875764073351042, "grad_norm": 0.5, "learning_rate": 0.00035299388741319166, "loss": 5.1879, "step": 918000 }, { "epoch": 5.878964380580536, "grad_norm": 0.55078125, "learning_rate": 0.0003529682849553557, "loss": 5.1944, "step": 918500 }, { "epoch": 5.88216468781003, "grad_norm": 0.5, "learning_rate": 0.0003529426824975198, "loss": 5.1927, "step": 919000 }, { "epoch": 5.885364995039524, "grad_norm": 0.54296875, "learning_rate": 0.0003529170800396838, "loss": 5.1947, "step": 919500 }, { "epoch": 5.888565302269018, "grad_norm": 0.51953125, "learning_rate": 0.0003528914775818479, "loss": 5.1938, "step": 920000 }, { "epoch": 5.891765609498512, "grad_norm": 0.5234375, "learning_rate": 0.00035286587512401194, "loss": 5.1885, "step": 920500 }, { "epoch": 5.8949659167280055, "grad_norm": 0.484375, "learning_rate": 0.000352840272666176, "loss": 5.193, "step": 921000 }, { "epoch": 5.8981662239574995, "grad_norm": 0.53125, "learning_rate": 0.00035281467020834, "loss": 5.1907, "step": 921500 }, { "epoch": 5.901366531186994, "grad_norm": 0.53125, "learning_rate": 0.00035278906775050405, "loss": 5.1884, "step": 922000 }, { "epoch": 5.904566838416488, "grad_norm": 0.50390625, "learning_rate": 0.00035276346529266813, "loss": 5.1923, "step": 922500 }, { "epoch": 5.907767145645982, "grad_norm": 0.4921875, "learning_rate": 0.00035273786283483217, "loss": 5.1928, "step": 923000 }, { "epoch": 5.910967452875476, "grad_norm": 0.5, "learning_rate": 0.0003527122603769962, "loss": 5.1888, "step": 923500 }, { "epoch": 5.91416776010497, "grad_norm": 0.48828125, "learning_rate": 0.00035268665791916024, "loss": 5.1952, "step": 924000 }, { "epoch": 5.917368067334464, "grad_norm": 0.5, "learning_rate": 0.00035266105546132427, "loss": 5.1968, "step": 924500 }, { "epoch": 5.920568374563958, "grad_norm": 0.5234375, "learning_rate": 0.00035263545300348836, "loss": 5.1912, "step": 925000 }, { "epoch": 5.923768681793452, "grad_norm": 0.53125, "learning_rate": 0.0003526098505456524, "loss": 5.1918, "step": 925500 }, { "epoch": 5.926968989022946, "grad_norm": 0.53125, "learning_rate": 0.00035258424808781643, "loss": 5.1855, "step": 926000 }, { "epoch": 5.930169296252441, "grad_norm": 0.55859375, "learning_rate": 0.00035255864562998046, "loss": 5.1912, "step": 926500 }, { "epoch": 5.9333696034819345, "grad_norm": 0.482421875, "learning_rate": 0.00035253304317214455, "loss": 5.1881, "step": 927000 }, { "epoch": 5.9365699107114285, "grad_norm": 0.49609375, "learning_rate": 0.00035250744071430864, "loss": 5.1856, "step": 927500 }, { "epoch": 5.939770217940922, "grad_norm": 0.51953125, "learning_rate": 0.0003524818382564727, "loss": 5.1917, "step": 928000 }, { "epoch": 5.942970525170416, "grad_norm": 0.515625, "learning_rate": 0.0003524562357986367, "loss": 5.1948, "step": 928500 }, { "epoch": 5.94617083239991, "grad_norm": 0.484375, "learning_rate": 0.00035243063334080074, "loss": 5.1927, "step": 929000 }, { "epoch": 5.949371139629404, "grad_norm": 0.53515625, "learning_rate": 0.0003524050308829648, "loss": 5.1937, "step": 929500 }, { "epoch": 5.952571446858898, "grad_norm": 0.5234375, "learning_rate": 0.0003523794284251288, "loss": 5.189, "step": 930000 }, { "epoch": 5.955771754088392, "grad_norm": 0.5, "learning_rate": 0.0003523538259672929, "loss": 5.1947, "step": 930500 }, { "epoch": 5.958972061317887, "grad_norm": 0.478515625, "learning_rate": 0.00035232822350945693, "loss": 5.1924, "step": 931000 }, { "epoch": 5.962172368547381, "grad_norm": 0.49609375, "learning_rate": 0.00035230262105162097, "loss": 5.193, "step": 931500 }, { "epoch": 5.965372675776875, "grad_norm": 0.482421875, "learning_rate": 0.000352277018593785, "loss": 5.1917, "step": 932000 }, { "epoch": 5.968572983006369, "grad_norm": 0.54296875, "learning_rate": 0.00035225141613594904, "loss": 5.1942, "step": 932500 }, { "epoch": 5.971773290235863, "grad_norm": 0.50390625, "learning_rate": 0.0003522258136781131, "loss": 5.1945, "step": 933000 }, { "epoch": 5.974973597465357, "grad_norm": 0.5234375, "learning_rate": 0.00035220021122027716, "loss": 5.1909, "step": 933500 }, { "epoch": 5.9781739046948505, "grad_norm": 0.56640625, "learning_rate": 0.0003521746087624412, "loss": 5.1906, "step": 934000 }, { "epoch": 5.9813742119243445, "grad_norm": 0.4921875, "learning_rate": 0.0003521490063046053, "loss": 5.1929, "step": 934500 }, { "epoch": 5.984574519153838, "grad_norm": 0.55078125, "learning_rate": 0.0003521234038467693, "loss": 5.1918, "step": 935000 }, { "epoch": 5.987774826383333, "grad_norm": 0.53125, "learning_rate": 0.00035209780138893335, "loss": 5.1913, "step": 935500 }, { "epoch": 5.990975133612827, "grad_norm": 0.54296875, "learning_rate": 0.00035207219893109744, "loss": 5.1875, "step": 936000 }, { "epoch": 5.994175440842321, "grad_norm": 0.56640625, "learning_rate": 0.00035204659647326147, "loss": 5.1937, "step": 936500 }, { "epoch": 5.997375748071815, "grad_norm": 0.5, "learning_rate": 0.0003520209940154255, "loss": 5.1945, "step": 937000 }, { "epoch": 6.0, "eval_loss": 5.182580947875977, "eval_runtime": 1.1238, "eval_samples_per_second": 889.838, "eval_steps_per_second": 14.237, "step": 937410 }, { "epoch": 6.000576055301309, "grad_norm": 0.484375, "learning_rate": 0.00035199539155758954, "loss": 5.1917, "step": 937500 }, { "epoch": 6.003776362530803, "grad_norm": 0.50390625, "learning_rate": 0.0003519697890997536, "loss": 5.1882, "step": 938000 }, { "epoch": 6.006976669760297, "grad_norm": 0.5078125, "learning_rate": 0.00035194418664191766, "loss": 5.1892, "step": 938500 }, { "epoch": 6.010176976989791, "grad_norm": 0.53515625, "learning_rate": 0.0003519185841840817, "loss": 5.193, "step": 939000 }, { "epoch": 6.013377284219285, "grad_norm": 0.486328125, "learning_rate": 0.00035189298172624573, "loss": 5.1884, "step": 939500 }, { "epoch": 6.016577591448779, "grad_norm": 0.49609375, "learning_rate": 0.00035186737926840977, "loss": 5.1862, "step": 940000 }, { "epoch": 6.0197778986782735, "grad_norm": 0.515625, "learning_rate": 0.0003518417768105738, "loss": 5.1902, "step": 940500 }, { "epoch": 6.0229782059077674, "grad_norm": 0.49609375, "learning_rate": 0.00035181617435273783, "loss": 5.1893, "step": 941000 }, { "epoch": 6.026178513137261, "grad_norm": 0.51953125, "learning_rate": 0.0003517905718949019, "loss": 5.1898, "step": 941500 }, { "epoch": 6.029378820366755, "grad_norm": 0.5234375, "learning_rate": 0.00035176496943706596, "loss": 5.1912, "step": 942000 }, { "epoch": 6.032579127596249, "grad_norm": 0.51953125, "learning_rate": 0.00035173936697923005, "loss": 5.1923, "step": 942500 }, { "epoch": 6.035779434825743, "grad_norm": 0.55859375, "learning_rate": 0.0003517137645213941, "loss": 5.1908, "step": 943000 }, { "epoch": 6.038979742055237, "grad_norm": 0.52734375, "learning_rate": 0.0003516881620635581, "loss": 5.1907, "step": 943500 }, { "epoch": 6.042180049284731, "grad_norm": 0.462890625, "learning_rate": 0.0003516625596057222, "loss": 5.1906, "step": 944000 }, { "epoch": 6.045380356514225, "grad_norm": 0.515625, "learning_rate": 0.00035163695714788624, "loss": 5.1924, "step": 944500 }, { "epoch": 6.04858066374372, "grad_norm": 0.5234375, "learning_rate": 0.00035161135469005027, "loss": 5.1851, "step": 945000 }, { "epoch": 6.051780970973214, "grad_norm": 0.54296875, "learning_rate": 0.0003515857522322143, "loss": 5.1892, "step": 945500 }, { "epoch": 6.054981278202708, "grad_norm": 0.62109375, "learning_rate": 0.00035156014977437834, "loss": 5.1903, "step": 946000 }, { "epoch": 6.058181585432202, "grad_norm": 0.49609375, "learning_rate": 0.0003515345473165424, "loss": 5.1901, "step": 946500 }, { "epoch": 6.061381892661696, "grad_norm": 0.5546875, "learning_rate": 0.00035150894485870646, "loss": 5.1916, "step": 947000 }, { "epoch": 6.0645821998911895, "grad_norm": 0.515625, "learning_rate": 0.0003514833424008705, "loss": 5.1928, "step": 947500 }, { "epoch": 6.0677825071206835, "grad_norm": 0.486328125, "learning_rate": 0.00035145773994303453, "loss": 5.194, "step": 948000 }, { "epoch": 6.070982814350177, "grad_norm": 0.51953125, "learning_rate": 0.00035143213748519857, "loss": 5.1887, "step": 948500 }, { "epoch": 6.074183121579671, "grad_norm": 0.5546875, "learning_rate": 0.00035140653502736265, "loss": 5.1883, "step": 949000 }, { "epoch": 6.077383428809165, "grad_norm": 0.5078125, "learning_rate": 0.0003513809325695267, "loss": 5.1863, "step": 949500 }, { "epoch": 6.08058373603866, "grad_norm": 0.51953125, "learning_rate": 0.0003513553301116908, "loss": 5.1892, "step": 950000 }, { "epoch": 6.083784043268154, "grad_norm": 0.5546875, "learning_rate": 0.0003513297276538548, "loss": 5.1887, "step": 950500 }, { "epoch": 6.086984350497648, "grad_norm": 0.51953125, "learning_rate": 0.00035130412519601884, "loss": 5.1915, "step": 951000 }, { "epoch": 6.090184657727142, "grad_norm": 0.55078125, "learning_rate": 0.0003512785227381829, "loss": 5.1833, "step": 951500 }, { "epoch": 6.093384964956636, "grad_norm": 0.48046875, "learning_rate": 0.0003512529202803469, "loss": 5.1895, "step": 952000 }, { "epoch": 6.09658527218613, "grad_norm": 0.51171875, "learning_rate": 0.000351227317822511, "loss": 5.1865, "step": 952500 }, { "epoch": 6.099785579415624, "grad_norm": 0.51171875, "learning_rate": 0.00035120171536467504, "loss": 5.1873, "step": 953000 }, { "epoch": 6.102985886645118, "grad_norm": 0.59375, "learning_rate": 0.00035117611290683907, "loss": 5.1876, "step": 953500 }, { "epoch": 6.106186193874612, "grad_norm": 0.50390625, "learning_rate": 0.0003511505104490031, "loss": 5.1868, "step": 954000 }, { "epoch": 6.109386501104106, "grad_norm": 0.5703125, "learning_rate": 0.00035112490799116714, "loss": 5.1895, "step": 954500 }, { "epoch": 6.1125868083336, "grad_norm": 0.4921875, "learning_rate": 0.00035109930553333123, "loss": 5.1849, "step": 955000 }, { "epoch": 6.115787115563094, "grad_norm": 0.51171875, "learning_rate": 0.00035107370307549526, "loss": 5.1908, "step": 955500 }, { "epoch": 6.118987422792588, "grad_norm": 0.5, "learning_rate": 0.0003510481006176593, "loss": 5.1902, "step": 956000 }, { "epoch": 6.122187730022082, "grad_norm": 0.57421875, "learning_rate": 0.00035102249815982333, "loss": 5.1848, "step": 956500 }, { "epoch": 6.125388037251576, "grad_norm": 0.498046875, "learning_rate": 0.0003509968957019874, "loss": 5.1872, "step": 957000 }, { "epoch": 6.12858834448107, "grad_norm": 0.5390625, "learning_rate": 0.00035097129324415145, "loss": 5.1885, "step": 957500 }, { "epoch": 6.131788651710564, "grad_norm": 0.51953125, "learning_rate": 0.00035094569078631554, "loss": 5.1939, "step": 958000 }, { "epoch": 6.134988958940058, "grad_norm": 0.56640625, "learning_rate": 0.0003509200883284796, "loss": 5.1867, "step": 958500 }, { "epoch": 6.138189266169552, "grad_norm": 0.51953125, "learning_rate": 0.0003508944858706436, "loss": 5.1911, "step": 959000 }, { "epoch": 6.141389573399047, "grad_norm": 0.578125, "learning_rate": 0.00035086888341280764, "loss": 5.1907, "step": 959500 }, { "epoch": 6.144589880628541, "grad_norm": 0.53515625, "learning_rate": 0.0003508432809549717, "loss": 5.1826, "step": 960000 }, { "epoch": 6.1477901878580345, "grad_norm": 0.546875, "learning_rate": 0.00035081767849713577, "loss": 5.1895, "step": 960500 }, { "epoch": 6.1509904950875285, "grad_norm": 0.515625, "learning_rate": 0.0003507920760392998, "loss": 5.1838, "step": 961000 }, { "epoch": 6.154190802317022, "grad_norm": 0.51171875, "learning_rate": 0.00035076647358146384, "loss": 5.189, "step": 961500 }, { "epoch": 6.157391109546516, "grad_norm": 0.55859375, "learning_rate": 0.00035074087112362787, "loss": 5.197, "step": 962000 }, { "epoch": 6.16059141677601, "grad_norm": 0.4921875, "learning_rate": 0.0003507152686657919, "loss": 5.1883, "step": 962500 }, { "epoch": 6.163791724005504, "grad_norm": 0.54296875, "learning_rate": 0.000350689666207956, "loss": 5.1886, "step": 963000 }, { "epoch": 6.166992031234998, "grad_norm": 0.5, "learning_rate": 0.00035066406375012, "loss": 5.1899, "step": 963500 }, { "epoch": 6.170192338464493, "grad_norm": 0.5234375, "learning_rate": 0.00035063846129228406, "loss": 5.188, "step": 964000 }, { "epoch": 6.173392645693987, "grad_norm": 0.5859375, "learning_rate": 0.0003506128588344481, "loss": 5.1878, "step": 964500 }, { "epoch": 6.176592952923481, "grad_norm": 0.58984375, "learning_rate": 0.0003505872563766122, "loss": 5.1893, "step": 965000 }, { "epoch": 6.179793260152975, "grad_norm": 0.484375, "learning_rate": 0.0003505616539187762, "loss": 5.1954, "step": 965500 }, { "epoch": 6.182993567382469, "grad_norm": 0.5, "learning_rate": 0.0003505360514609403, "loss": 5.1895, "step": 966000 }, { "epoch": 6.186193874611963, "grad_norm": 0.5546875, "learning_rate": 0.00035051044900310434, "loss": 5.1938, "step": 966500 }, { "epoch": 6.189394181841457, "grad_norm": 0.51171875, "learning_rate": 0.0003504848465452684, "loss": 5.1846, "step": 967000 }, { "epoch": 6.192594489070951, "grad_norm": 0.546875, "learning_rate": 0.0003504592440874324, "loss": 5.1904, "step": 967500 }, { "epoch": 6.1957947963004445, "grad_norm": 0.515625, "learning_rate": 0.00035043364162959644, "loss": 5.1925, "step": 968000 }, { "epoch": 6.198995103529938, "grad_norm": 0.5546875, "learning_rate": 0.00035040803917176053, "loss": 5.1875, "step": 968500 }, { "epoch": 6.202195410759433, "grad_norm": 0.5078125, "learning_rate": 0.00035038243671392457, "loss": 5.1934, "step": 969000 }, { "epoch": 6.205395717988927, "grad_norm": 0.51953125, "learning_rate": 0.0003503568342560886, "loss": 5.1878, "step": 969500 }, { "epoch": 6.208596025218421, "grad_norm": 0.51953125, "learning_rate": 0.00035033123179825263, "loss": 5.1826, "step": 970000 }, { "epoch": 6.211796332447915, "grad_norm": 0.55859375, "learning_rate": 0.00035030562934041667, "loss": 5.1873, "step": 970500 }, { "epoch": 6.214996639677409, "grad_norm": 0.54296875, "learning_rate": 0.0003502800268825807, "loss": 5.1948, "step": 971000 }, { "epoch": 6.218196946906903, "grad_norm": 0.4921875, "learning_rate": 0.0003502544244247448, "loss": 5.1897, "step": 971500 }, { "epoch": 6.221397254136397, "grad_norm": 0.50390625, "learning_rate": 0.0003502288219669088, "loss": 5.19, "step": 972000 }, { "epoch": 6.224597561365891, "grad_norm": 0.5703125, "learning_rate": 0.0003502032195090729, "loss": 5.1866, "step": 972500 }, { "epoch": 6.227797868595385, "grad_norm": 0.50390625, "learning_rate": 0.00035017761705123695, "loss": 5.1908, "step": 973000 }, { "epoch": 6.23099817582488, "grad_norm": 0.52734375, "learning_rate": 0.000350152014593401, "loss": 5.1909, "step": 973500 }, { "epoch": 6.2341984830543735, "grad_norm": 0.54296875, "learning_rate": 0.00035012641213556507, "loss": 5.1882, "step": 974000 }, { "epoch": 6.2373987902838675, "grad_norm": 0.58984375, "learning_rate": 0.0003501008096777291, "loss": 5.1924, "step": 974500 }, { "epoch": 6.240599097513361, "grad_norm": 0.5703125, "learning_rate": 0.00035007520721989314, "loss": 5.1889, "step": 975000 }, { "epoch": 6.243799404742855, "grad_norm": 0.51171875, "learning_rate": 0.0003500496047620572, "loss": 5.1881, "step": 975500 }, { "epoch": 6.246999711972349, "grad_norm": 0.5859375, "learning_rate": 0.0003500240023042212, "loss": 5.188, "step": 976000 }, { "epoch": 6.250200019201843, "grad_norm": 0.5078125, "learning_rate": 0.00034999839984638524, "loss": 5.1906, "step": 976500 }, { "epoch": 6.253400326431337, "grad_norm": 0.48828125, "learning_rate": 0.00034997279738854933, "loss": 5.184, "step": 977000 }, { "epoch": 6.256600633660831, "grad_norm": 0.52734375, "learning_rate": 0.00034994719493071336, "loss": 5.1946, "step": 977500 }, { "epoch": 6.259800940890326, "grad_norm": 0.52734375, "learning_rate": 0.0003499215924728774, "loss": 5.1867, "step": 978000 }, { "epoch": 6.26300124811982, "grad_norm": 0.5078125, "learning_rate": 0.00034989599001504143, "loss": 5.1909, "step": 978500 }, { "epoch": 6.266201555349314, "grad_norm": 0.54296875, "learning_rate": 0.00034987038755720547, "loss": 5.192, "step": 979000 }, { "epoch": 6.269401862578808, "grad_norm": 0.515625, "learning_rate": 0.00034984478509936956, "loss": 5.1862, "step": 979500 }, { "epoch": 6.272602169808302, "grad_norm": 0.478515625, "learning_rate": 0.0003498191826415336, "loss": 5.1925, "step": 980000 }, { "epoch": 6.275802477037796, "grad_norm": 0.490234375, "learning_rate": 0.0003497935801836977, "loss": 5.1927, "step": 980500 }, { "epoch": 6.2790027842672895, "grad_norm": 0.50390625, "learning_rate": 0.0003497679777258617, "loss": 5.1869, "step": 981000 }, { "epoch": 6.2822030914967835, "grad_norm": 0.515625, "learning_rate": 0.00034974237526802575, "loss": 5.1929, "step": 981500 }, { "epoch": 6.285403398726277, "grad_norm": 0.52734375, "learning_rate": 0.0003497167728101898, "loss": 5.1874, "step": 982000 }, { "epoch": 6.288603705955771, "grad_norm": 0.57421875, "learning_rate": 0.00034969117035235387, "loss": 5.1839, "step": 982500 }, { "epoch": 6.291804013185266, "grad_norm": 0.57421875, "learning_rate": 0.0003496655678945179, "loss": 5.1902, "step": 983000 }, { "epoch": 6.29500432041476, "grad_norm": 0.54296875, "learning_rate": 0.00034963996543668194, "loss": 5.1861, "step": 983500 }, { "epoch": 6.298204627644254, "grad_norm": 0.5703125, "learning_rate": 0.00034961436297884597, "loss": 5.1926, "step": 984000 }, { "epoch": 6.301404934873748, "grad_norm": 0.490234375, "learning_rate": 0.00034958876052101, "loss": 5.1846, "step": 984500 }, { "epoch": 6.304605242103242, "grad_norm": 0.53125, "learning_rate": 0.0003495631580631741, "loss": 5.1901, "step": 985000 }, { "epoch": 6.307805549332736, "grad_norm": 0.515625, "learning_rate": 0.00034953755560533813, "loss": 5.1898, "step": 985500 }, { "epoch": 6.31100585656223, "grad_norm": 0.53515625, "learning_rate": 0.00034951195314750216, "loss": 5.1932, "step": 986000 }, { "epoch": 6.314206163791724, "grad_norm": 0.66015625, "learning_rate": 0.0003494863506896662, "loss": 5.186, "step": 986500 }, { "epoch": 6.317406471021218, "grad_norm": 0.482421875, "learning_rate": 0.0003494607482318303, "loss": 5.192, "step": 987000 }, { "epoch": 6.320606778250712, "grad_norm": 0.57421875, "learning_rate": 0.0003494351457739943, "loss": 5.193, "step": 987500 }, { "epoch": 6.323807085480206, "grad_norm": 0.5234375, "learning_rate": 0.0003494095433161584, "loss": 5.1917, "step": 988000 }, { "epoch": 6.3270073927097, "grad_norm": 0.5234375, "learning_rate": 0.00034938394085832244, "loss": 5.188, "step": 988500 }, { "epoch": 6.330207699939194, "grad_norm": 0.625, "learning_rate": 0.0003493583384004865, "loss": 5.19, "step": 989000 }, { "epoch": 6.333408007168688, "grad_norm": 0.484375, "learning_rate": 0.0003493327359426505, "loss": 5.1942, "step": 989500 }, { "epoch": 6.336608314398182, "grad_norm": 0.5625, "learning_rate": 0.00034930713348481455, "loss": 5.1932, "step": 990000 }, { "epoch": 6.339808621627676, "grad_norm": 0.5234375, "learning_rate": 0.00034928153102697863, "loss": 5.189, "step": 990500 }, { "epoch": 6.34300892885717, "grad_norm": 0.5625, "learning_rate": 0.00034925592856914267, "loss": 5.1888, "step": 991000 }, { "epoch": 6.346209236086664, "grad_norm": 0.51171875, "learning_rate": 0.0003492303261113067, "loss": 5.1849, "step": 991500 }, { "epoch": 6.349409543316158, "grad_norm": 0.49609375, "learning_rate": 0.00034920472365347074, "loss": 5.1969, "step": 992000 }, { "epoch": 6.352609850545653, "grad_norm": 0.5234375, "learning_rate": 0.00034917912119563477, "loss": 5.1877, "step": 992500 }, { "epoch": 6.355810157775147, "grad_norm": 0.5234375, "learning_rate": 0.0003491535187377988, "loss": 5.1914, "step": 993000 }, { "epoch": 6.359010465004641, "grad_norm": 0.5234375, "learning_rate": 0.0003491279162799629, "loss": 5.1909, "step": 993500 }, { "epoch": 6.362210772234135, "grad_norm": 0.515625, "learning_rate": 0.00034910231382212693, "loss": 5.1861, "step": 994000 }, { "epoch": 6.3654110794636285, "grad_norm": 0.5, "learning_rate": 0.00034907671136429096, "loss": 5.1896, "step": 994500 }, { "epoch": 6.368611386693122, "grad_norm": 0.5859375, "learning_rate": 0.00034905110890645505, "loss": 5.1852, "step": 995000 }, { "epoch": 6.371811693922616, "grad_norm": 0.5078125, "learning_rate": 0.0003490255064486191, "loss": 5.1914, "step": 995500 }, { "epoch": 6.37501200115211, "grad_norm": 0.498046875, "learning_rate": 0.0003489999039907832, "loss": 5.1831, "step": 996000 }, { "epoch": 6.378212308381604, "grad_norm": 0.490234375, "learning_rate": 0.0003489743015329472, "loss": 5.1936, "step": 996500 }, { "epoch": 6.381412615611099, "grad_norm": 0.55859375, "learning_rate": 0.00034894869907511124, "loss": 5.1896, "step": 997000 }, { "epoch": 6.384612922840593, "grad_norm": 0.4609375, "learning_rate": 0.0003489230966172753, "loss": 5.1879, "step": 997500 }, { "epoch": 6.387813230070087, "grad_norm": 0.578125, "learning_rate": 0.0003488974941594393, "loss": 5.1874, "step": 998000 }, { "epoch": 6.391013537299581, "grad_norm": 0.55859375, "learning_rate": 0.0003488718917016034, "loss": 5.1846, "step": 998500 }, { "epoch": 6.394213844529075, "grad_norm": 0.5703125, "learning_rate": 0.00034884628924376743, "loss": 5.1941, "step": 999000 }, { "epoch": 6.397414151758569, "grad_norm": 0.58203125, "learning_rate": 0.00034882068678593147, "loss": 5.1869, "step": 999500 }, { "epoch": 6.400614458988063, "grad_norm": 0.4921875, "learning_rate": 0.0003487950843280955, "loss": 5.1922, "step": 1000000 }, { "epoch": 6.403814766217557, "grad_norm": 0.48828125, "learning_rate": 0.00034876948187025954, "loss": 5.1877, "step": 1000500 }, { "epoch": 6.407015073447051, "grad_norm": 0.59375, "learning_rate": 0.00034874387941242357, "loss": 5.1912, "step": 1001000 }, { "epoch": 6.410215380676545, "grad_norm": 0.5078125, "learning_rate": 0.00034871827695458766, "loss": 5.1865, "step": 1001500 }, { "epoch": 6.413415687906039, "grad_norm": 0.50390625, "learning_rate": 0.0003486926744967517, "loss": 5.1884, "step": 1002000 }, { "epoch": 6.416615995135533, "grad_norm": 0.55859375, "learning_rate": 0.00034866707203891573, "loss": 5.1913, "step": 1002500 }, { "epoch": 6.419816302365027, "grad_norm": 0.5625, "learning_rate": 0.0003486414695810798, "loss": 5.1852, "step": 1003000 }, { "epoch": 6.423016609594521, "grad_norm": 0.5390625, "learning_rate": 0.00034861586712324385, "loss": 5.1951, "step": 1003500 }, { "epoch": 6.426216916824015, "grad_norm": 0.51171875, "learning_rate": 0.00034859026466540794, "loss": 5.1873, "step": 1004000 }, { "epoch": 6.429417224053509, "grad_norm": 0.515625, "learning_rate": 0.00034856466220757197, "loss": 5.1911, "step": 1004500 }, { "epoch": 6.432617531283003, "grad_norm": 0.58984375, "learning_rate": 0.000348539059749736, "loss": 5.1974, "step": 1005000 }, { "epoch": 6.435817838512497, "grad_norm": 0.5078125, "learning_rate": 0.00034851345729190004, "loss": 5.1876, "step": 1005500 }, { "epoch": 6.439018145741991, "grad_norm": 0.5625, "learning_rate": 0.0003484878548340641, "loss": 5.19, "step": 1006000 }, { "epoch": 6.442218452971486, "grad_norm": 0.6015625, "learning_rate": 0.0003484622523762281, "loss": 5.1888, "step": 1006500 }, { "epoch": 6.44541876020098, "grad_norm": 0.5390625, "learning_rate": 0.0003484366499183922, "loss": 5.1857, "step": 1007000 }, { "epoch": 6.4486190674304735, "grad_norm": 0.5234375, "learning_rate": 0.00034841104746055623, "loss": 5.1881, "step": 1007500 }, { "epoch": 6.4518193746599675, "grad_norm": 0.51953125, "learning_rate": 0.00034838544500272027, "loss": 5.1899, "step": 1008000 }, { "epoch": 6.455019681889461, "grad_norm": 0.515625, "learning_rate": 0.0003483598425448843, "loss": 5.1891, "step": 1008500 }, { "epoch": 6.458219989118955, "grad_norm": 0.48828125, "learning_rate": 0.00034833424008704833, "loss": 5.1972, "step": 1009000 }, { "epoch": 6.461420296348449, "grad_norm": 0.59375, "learning_rate": 0.0003483086376292124, "loss": 5.1911, "step": 1009500 }, { "epoch": 6.464620603577943, "grad_norm": 0.53515625, "learning_rate": 0.00034828303517137646, "loss": 5.1908, "step": 1010000 }, { "epoch": 6.467820910807437, "grad_norm": 0.546875, "learning_rate": 0.00034825743271354055, "loss": 5.1851, "step": 1010500 }, { "epoch": 6.471021218036931, "grad_norm": 0.54296875, "learning_rate": 0.0003482318302557046, "loss": 5.1934, "step": 1011000 }, { "epoch": 6.474221525266426, "grad_norm": 0.53125, "learning_rate": 0.0003482062277978686, "loss": 5.1904, "step": 1011500 }, { "epoch": 6.47742183249592, "grad_norm": 0.5390625, "learning_rate": 0.00034818062534003265, "loss": 5.1897, "step": 1012000 }, { "epoch": 6.480622139725414, "grad_norm": 0.5234375, "learning_rate": 0.00034815502288219674, "loss": 5.1882, "step": 1012500 }, { "epoch": 6.483822446954908, "grad_norm": 0.5078125, "learning_rate": 0.00034812942042436077, "loss": 5.1908, "step": 1013000 }, { "epoch": 6.487022754184402, "grad_norm": 0.51171875, "learning_rate": 0.0003481038179665248, "loss": 5.1889, "step": 1013500 }, { "epoch": 6.490223061413896, "grad_norm": 0.53125, "learning_rate": 0.00034807821550868884, "loss": 5.1961, "step": 1014000 }, { "epoch": 6.4934233686433895, "grad_norm": 0.474609375, "learning_rate": 0.0003480526130508529, "loss": 5.1916, "step": 1014500 }, { "epoch": 6.4966236758728835, "grad_norm": 0.4921875, "learning_rate": 0.00034802701059301696, "loss": 5.1997, "step": 1015000 }, { "epoch": 6.499823983102377, "grad_norm": 0.5625, "learning_rate": 0.000348001408135181, "loss": 5.1884, "step": 1015500 }, { "epoch": 6.503024290331872, "grad_norm": 0.52734375, "learning_rate": 0.00034797580567734503, "loss": 5.1854, "step": 1016000 }, { "epoch": 6.506224597561366, "grad_norm": 0.52734375, "learning_rate": 0.00034795020321950907, "loss": 5.1893, "step": 1016500 }, { "epoch": 6.50942490479086, "grad_norm": 0.4921875, "learning_rate": 0.0003479246007616731, "loss": 5.1924, "step": 1017000 }, { "epoch": 6.512625212020354, "grad_norm": 0.51953125, "learning_rate": 0.0003478989983038372, "loss": 5.1975, "step": 1017500 }, { "epoch": 6.515825519249848, "grad_norm": 0.52734375, "learning_rate": 0.0003478733958460012, "loss": 5.1912, "step": 1018000 }, { "epoch": 6.519025826479342, "grad_norm": 0.671875, "learning_rate": 0.0003478477933881653, "loss": 5.1807, "step": 1018500 }, { "epoch": 6.522226133708836, "grad_norm": 0.51953125, "learning_rate": 0.00034782219093032935, "loss": 5.1958, "step": 1019000 }, { "epoch": 6.52542644093833, "grad_norm": 0.54296875, "learning_rate": 0.0003477965884724934, "loss": 5.1963, "step": 1019500 }, { "epoch": 6.528626748167824, "grad_norm": 0.57421875, "learning_rate": 0.0003477709860146574, "loss": 5.1893, "step": 1020000 }, { "epoch": 6.531827055397319, "grad_norm": 0.5, "learning_rate": 0.0003477453835568215, "loss": 5.1881, "step": 1020500 }, { "epoch": 6.5350273626268125, "grad_norm": 0.56640625, "learning_rate": 0.00034771978109898554, "loss": 5.1903, "step": 1021000 }, { "epoch": 6.538227669856306, "grad_norm": 0.515625, "learning_rate": 0.00034769417864114957, "loss": 5.193, "step": 1021500 }, { "epoch": 6.5414279770858, "grad_norm": 0.58984375, "learning_rate": 0.0003476685761833136, "loss": 5.195, "step": 1022000 }, { "epoch": 6.544628284315294, "grad_norm": 0.4765625, "learning_rate": 0.00034764297372547764, "loss": 5.1893, "step": 1022500 }, { "epoch": 6.547828591544788, "grad_norm": 0.53125, "learning_rate": 0.0003476173712676417, "loss": 5.1921, "step": 1023000 }, { "epoch": 6.551028898774282, "grad_norm": 0.52734375, "learning_rate": 0.00034759176880980576, "loss": 5.1918, "step": 1023500 }, { "epoch": 6.554229206003776, "grad_norm": 0.5078125, "learning_rate": 0.0003475661663519698, "loss": 5.1978, "step": 1024000 }, { "epoch": 6.55742951323327, "grad_norm": 0.58203125, "learning_rate": 0.00034754056389413383, "loss": 5.189, "step": 1024500 }, { "epoch": 6.560629820462765, "grad_norm": 0.49609375, "learning_rate": 0.0003475149614362979, "loss": 5.1892, "step": 1025000 }, { "epoch": 6.563830127692259, "grad_norm": 0.55859375, "learning_rate": 0.00034748935897846195, "loss": 5.1897, "step": 1025500 }, { "epoch": 6.567030434921753, "grad_norm": 0.4921875, "learning_rate": 0.00034746375652062604, "loss": 5.1944, "step": 1026000 }, { "epoch": 6.570230742151247, "grad_norm": 0.48828125, "learning_rate": 0.0003474381540627901, "loss": 5.192, "step": 1026500 }, { "epoch": 6.573431049380741, "grad_norm": 0.55078125, "learning_rate": 0.0003474125516049541, "loss": 5.1942, "step": 1027000 }, { "epoch": 6.576631356610235, "grad_norm": 0.53515625, "learning_rate": 0.00034738694914711814, "loss": 5.185, "step": 1027500 }, { "epoch": 6.5798316638397285, "grad_norm": 0.51171875, "learning_rate": 0.0003473613466892822, "loss": 5.1918, "step": 1028000 }, { "epoch": 6.583031971069222, "grad_norm": 0.546875, "learning_rate": 0.0003473357442314462, "loss": 5.1896, "step": 1028500 }, { "epoch": 6.586232278298716, "grad_norm": 0.46875, "learning_rate": 0.0003473101417736103, "loss": 5.1934, "step": 1029000 }, { "epoch": 6.589432585528211, "grad_norm": 0.578125, "learning_rate": 0.00034728453931577434, "loss": 5.1935, "step": 1029500 }, { "epoch": 6.592632892757704, "grad_norm": 0.53515625, "learning_rate": 0.00034725893685793837, "loss": 5.1943, "step": 1030000 }, { "epoch": 6.595833199987199, "grad_norm": 0.51171875, "learning_rate": 0.0003472333344001024, "loss": 5.1917, "step": 1030500 }, { "epoch": 6.599033507216693, "grad_norm": 0.5625, "learning_rate": 0.00034720773194226644, "loss": 5.1918, "step": 1031000 }, { "epoch": 6.602233814446187, "grad_norm": 0.55859375, "learning_rate": 0.0003471821294844305, "loss": 5.1897, "step": 1031500 }, { "epoch": 6.605434121675681, "grad_norm": 0.474609375, "learning_rate": 0.00034715652702659456, "loss": 5.1899, "step": 1032000 }, { "epoch": 6.608634428905175, "grad_norm": 0.52734375, "learning_rate": 0.0003471309245687586, "loss": 5.1846, "step": 1032500 }, { "epoch": 6.611834736134669, "grad_norm": 0.5390625, "learning_rate": 0.0003471053221109227, "loss": 5.1905, "step": 1033000 }, { "epoch": 6.615035043364163, "grad_norm": 0.55859375, "learning_rate": 0.0003470797196530867, "loss": 5.1933, "step": 1033500 }, { "epoch": 6.618235350593657, "grad_norm": 0.52734375, "learning_rate": 0.00034705411719525075, "loss": 5.197, "step": 1034000 }, { "epoch": 6.621435657823151, "grad_norm": 0.52734375, "learning_rate": 0.00034702851473741484, "loss": 5.1931, "step": 1034500 }, { "epoch": 6.624635965052645, "grad_norm": 0.54296875, "learning_rate": 0.0003470029122795789, "loss": 5.1942, "step": 1035000 }, { "epoch": 6.627836272282139, "grad_norm": 0.4765625, "learning_rate": 0.0003469773098217429, "loss": 5.1872, "step": 1035500 }, { "epoch": 6.631036579511633, "grad_norm": 0.56640625, "learning_rate": 0.00034695170736390694, "loss": 5.1943, "step": 1036000 }, { "epoch": 6.634236886741127, "grad_norm": 0.52734375, "learning_rate": 0.000346926104906071, "loss": 5.1812, "step": 1036500 }, { "epoch": 6.637437193970621, "grad_norm": 0.53515625, "learning_rate": 0.00034690050244823507, "loss": 5.1939, "step": 1037000 }, { "epoch": 6.640637501200115, "grad_norm": 0.55078125, "learning_rate": 0.0003468748999903991, "loss": 5.1886, "step": 1037500 }, { "epoch": 6.643837808429609, "grad_norm": 0.578125, "learning_rate": 0.00034684929753256313, "loss": 5.1897, "step": 1038000 }, { "epoch": 6.647038115659103, "grad_norm": 0.52734375, "learning_rate": 0.00034682369507472717, "loss": 5.1931, "step": 1038500 }, { "epoch": 6.650238422888597, "grad_norm": 0.53125, "learning_rate": 0.0003467980926168912, "loss": 5.1965, "step": 1039000 }, { "epoch": 6.653438730118092, "grad_norm": 0.53125, "learning_rate": 0.0003467724901590553, "loss": 5.1879, "step": 1039500 }, { "epoch": 6.656639037347586, "grad_norm": 0.58984375, "learning_rate": 0.0003467468877012193, "loss": 5.1892, "step": 1040000 }, { "epoch": 6.65983934457708, "grad_norm": 0.5703125, "learning_rate": 0.0003467212852433834, "loss": 5.1931, "step": 1040500 }, { "epoch": 6.6630396518065735, "grad_norm": 0.640625, "learning_rate": 0.00034669568278554745, "loss": 5.1952, "step": 1041000 }, { "epoch": 6.6662399590360675, "grad_norm": 0.55078125, "learning_rate": 0.0003466700803277115, "loss": 5.1862, "step": 1041500 }, { "epoch": 6.669440266265561, "grad_norm": 0.5078125, "learning_rate": 0.0003466444778698755, "loss": 5.1895, "step": 1042000 }, { "epoch": 6.672640573495055, "grad_norm": 0.53515625, "learning_rate": 0.0003466188754120396, "loss": 5.1934, "step": 1042500 }, { "epoch": 6.675840880724549, "grad_norm": 0.63671875, "learning_rate": 0.00034659327295420364, "loss": 5.1865, "step": 1043000 }, { "epoch": 6.679041187954043, "grad_norm": 0.5390625, "learning_rate": 0.0003465676704963677, "loss": 5.1901, "step": 1043500 }, { "epoch": 6.682241495183538, "grad_norm": 0.48046875, "learning_rate": 0.0003465420680385317, "loss": 5.1898, "step": 1044000 }, { "epoch": 6.685441802413032, "grad_norm": 0.546875, "learning_rate": 0.00034651646558069574, "loss": 5.1869, "step": 1044500 }, { "epoch": 6.688642109642526, "grad_norm": 0.53515625, "learning_rate": 0.00034649086312285983, "loss": 5.1884, "step": 1045000 }, { "epoch": 6.69184241687202, "grad_norm": 0.53125, "learning_rate": 0.00034646526066502386, "loss": 5.1853, "step": 1045500 }, { "epoch": 6.695042724101514, "grad_norm": 0.5390625, "learning_rate": 0.0003464396582071879, "loss": 5.1859, "step": 1046000 }, { "epoch": 6.698243031331008, "grad_norm": 0.546875, "learning_rate": 0.00034641405574935193, "loss": 5.1969, "step": 1046500 }, { "epoch": 6.701443338560502, "grad_norm": 0.59375, "learning_rate": 0.00034638845329151597, "loss": 5.1975, "step": 1047000 }, { "epoch": 6.704643645789996, "grad_norm": 0.49609375, "learning_rate": 0.00034636285083368006, "loss": 5.1999, "step": 1047500 }, { "epoch": 6.7078439530194895, "grad_norm": 0.5390625, "learning_rate": 0.0003463372483758441, "loss": 5.1876, "step": 1048000 }, { "epoch": 6.711044260248984, "grad_norm": 0.50390625, "learning_rate": 0.0003463116459180082, "loss": 5.1959, "step": 1048500 }, { "epoch": 6.714244567478478, "grad_norm": 0.5234375, "learning_rate": 0.0003462860434601722, "loss": 5.1904, "step": 1049000 }, { "epoch": 6.717444874707972, "grad_norm": 0.55859375, "learning_rate": 0.00034626044100233625, "loss": 5.1928, "step": 1049500 }, { "epoch": 6.720645181937466, "grad_norm": 0.6640625, "learning_rate": 0.0003462348385445003, "loss": 5.1965, "step": 1050000 }, { "epoch": 6.72384548916696, "grad_norm": 0.5390625, "learning_rate": 0.00034620923608666437, "loss": 5.1949, "step": 1050500 }, { "epoch": 6.727045796396454, "grad_norm": 0.61328125, "learning_rate": 0.0003461836336288284, "loss": 5.187, "step": 1051000 }, { "epoch": 6.730246103625948, "grad_norm": 0.5078125, "learning_rate": 0.00034615803117099244, "loss": 5.1885, "step": 1051500 }, { "epoch": 6.733446410855442, "grad_norm": 0.59375, "learning_rate": 0.00034613242871315647, "loss": 5.1862, "step": 1052000 }, { "epoch": 6.736646718084936, "grad_norm": 0.54296875, "learning_rate": 0.0003461068262553205, "loss": 5.1924, "step": 1052500 }, { "epoch": 6.73984702531443, "grad_norm": 0.48828125, "learning_rate": 0.00034608122379748454, "loss": 5.1912, "step": 1053000 }, { "epoch": 6.743047332543924, "grad_norm": 0.59765625, "learning_rate": 0.00034605562133964863, "loss": 5.1894, "step": 1053500 }, { "epoch": 6.746247639773419, "grad_norm": 0.59375, "learning_rate": 0.00034603001888181266, "loss": 5.1912, "step": 1054000 }, { "epoch": 6.7494479470029125, "grad_norm": 0.5390625, "learning_rate": 0.0003460044164239767, "loss": 5.1874, "step": 1054500 }, { "epoch": 6.752648254232406, "grad_norm": 0.52734375, "learning_rate": 0.00034597881396614073, "loss": 5.1947, "step": 1055000 }, { "epoch": 6.7558485614619, "grad_norm": 0.498046875, "learning_rate": 0.0003459532115083048, "loss": 5.1855, "step": 1055500 }, { "epoch": 6.759048868691394, "grad_norm": 0.56640625, "learning_rate": 0.00034592760905046885, "loss": 5.1935, "step": 1056000 }, { "epoch": 6.762249175920888, "grad_norm": 0.55859375, "learning_rate": 0.00034590200659263294, "loss": 5.1905, "step": 1056500 }, { "epoch": 6.765449483150382, "grad_norm": 0.5234375, "learning_rate": 0.000345876404134797, "loss": 5.1933, "step": 1057000 }, { "epoch": 6.768649790379876, "grad_norm": 0.62890625, "learning_rate": 0.000345850801676961, "loss": 5.1927, "step": 1057500 }, { "epoch": 6.77185009760937, "grad_norm": 0.5234375, "learning_rate": 0.00034582519921912505, "loss": 5.1847, "step": 1058000 }, { "epoch": 6.775050404838865, "grad_norm": 0.5078125, "learning_rate": 0.0003457995967612891, "loss": 5.1947, "step": 1058500 }, { "epoch": 6.778250712068359, "grad_norm": 0.57421875, "learning_rate": 0.00034577399430345317, "loss": 5.1914, "step": 1059000 }, { "epoch": 6.781451019297853, "grad_norm": 0.50390625, "learning_rate": 0.0003457483918456172, "loss": 5.1856, "step": 1059500 }, { "epoch": 6.784651326527347, "grad_norm": 0.53515625, "learning_rate": 0.00034572278938778124, "loss": 5.1888, "step": 1060000 }, { "epoch": 6.787851633756841, "grad_norm": 0.5078125, "learning_rate": 0.00034569718692994527, "loss": 5.1896, "step": 1060500 }, { "epoch": 6.791051940986335, "grad_norm": 0.494140625, "learning_rate": 0.0003456715844721093, "loss": 5.1867, "step": 1061000 }, { "epoch": 6.7942522482158285, "grad_norm": 0.58203125, "learning_rate": 0.0003456459820142734, "loss": 5.1903, "step": 1061500 }, { "epoch": 6.7974525554453225, "grad_norm": 0.53125, "learning_rate": 0.00034562037955643743, "loss": 5.1917, "step": 1062000 }, { "epoch": 6.800652862674816, "grad_norm": 0.56640625, "learning_rate": 0.00034559477709860146, "loss": 5.1901, "step": 1062500 }, { "epoch": 6.803853169904311, "grad_norm": 0.53125, "learning_rate": 0.00034556917464076555, "loss": 5.1923, "step": 1063000 }, { "epoch": 6.807053477133805, "grad_norm": 0.62890625, "learning_rate": 0.0003455435721829296, "loss": 5.1872, "step": 1063500 }, { "epoch": 6.810253784363299, "grad_norm": 0.55859375, "learning_rate": 0.0003455179697250936, "loss": 5.1963, "step": 1064000 }, { "epoch": 6.813454091592793, "grad_norm": 0.51953125, "learning_rate": 0.0003454923672672577, "loss": 5.1925, "step": 1064500 }, { "epoch": 6.816654398822287, "grad_norm": 0.5390625, "learning_rate": 0.00034546676480942174, "loss": 5.188, "step": 1065000 }, { "epoch": 6.819854706051781, "grad_norm": 0.53125, "learning_rate": 0.0003454411623515858, "loss": 5.1882, "step": 1065500 }, { "epoch": 6.823055013281275, "grad_norm": 0.58984375, "learning_rate": 0.0003454155598937498, "loss": 5.1874, "step": 1066000 }, { "epoch": 6.826255320510769, "grad_norm": 0.494140625, "learning_rate": 0.00034538995743591384, "loss": 5.1914, "step": 1066500 }, { "epoch": 6.829455627740263, "grad_norm": 0.55078125, "learning_rate": 0.00034536435497807793, "loss": 5.1978, "step": 1067000 }, { "epoch": 6.8326559349697575, "grad_norm": 0.60546875, "learning_rate": 0.00034533875252024197, "loss": 5.1938, "step": 1067500 }, { "epoch": 6.8358562421992515, "grad_norm": 0.53125, "learning_rate": 0.000345313150062406, "loss": 5.1847, "step": 1068000 }, { "epoch": 6.839056549428745, "grad_norm": 0.5546875, "learning_rate": 0.00034528754760457004, "loss": 5.1907, "step": 1068500 }, { "epoch": 6.842256856658239, "grad_norm": 0.578125, "learning_rate": 0.00034526194514673407, "loss": 5.1946, "step": 1069000 }, { "epoch": 6.845457163887733, "grad_norm": 0.4765625, "learning_rate": 0.0003452363426888981, "loss": 5.1881, "step": 1069500 }, { "epoch": 6.848657471117227, "grad_norm": 0.55078125, "learning_rate": 0.0003452107402310622, "loss": 5.1939, "step": 1070000 }, { "epoch": 6.851857778346721, "grad_norm": 0.51953125, "learning_rate": 0.00034518513777322623, "loss": 5.1925, "step": 1070500 }, { "epoch": 6.855058085576215, "grad_norm": 0.5546875, "learning_rate": 0.0003451595353153903, "loss": 5.1939, "step": 1071000 }, { "epoch": 6.858258392805709, "grad_norm": 0.5078125, "learning_rate": 0.00034513393285755435, "loss": 5.1893, "step": 1071500 }, { "epoch": 6.861458700035204, "grad_norm": 0.58984375, "learning_rate": 0.0003451083303997184, "loss": 5.1903, "step": 1072000 }, { "epoch": 6.864659007264698, "grad_norm": 0.5625, "learning_rate": 0.00034508272794188247, "loss": 5.1848, "step": 1072500 }, { "epoch": 6.867859314494192, "grad_norm": 0.515625, "learning_rate": 0.0003450571254840465, "loss": 5.1885, "step": 1073000 }, { "epoch": 6.871059621723686, "grad_norm": 0.5703125, "learning_rate": 0.00034503152302621054, "loss": 5.1888, "step": 1073500 }, { "epoch": 6.87425992895318, "grad_norm": 0.6171875, "learning_rate": 0.0003450059205683746, "loss": 5.1897, "step": 1074000 }, { "epoch": 6.8774602361826735, "grad_norm": 0.5859375, "learning_rate": 0.0003449803181105386, "loss": 5.1896, "step": 1074500 }, { "epoch": 6.8806605434121675, "grad_norm": 0.640625, "learning_rate": 0.0003449547156527027, "loss": 5.1949, "step": 1075000 }, { "epoch": 6.883860850641661, "grad_norm": 0.58984375, "learning_rate": 0.00034492911319486673, "loss": 5.1969, "step": 1075500 }, { "epoch": 6.887061157871155, "grad_norm": 0.54296875, "learning_rate": 0.00034490351073703077, "loss": 5.1871, "step": 1076000 }, { "epoch": 6.890261465100649, "grad_norm": 0.546875, "learning_rate": 0.0003448779082791948, "loss": 5.1961, "step": 1076500 }, { "epoch": 6.893461772330143, "grad_norm": 0.5859375, "learning_rate": 0.00034485230582135884, "loss": 5.1902, "step": 1077000 }, { "epoch": 6.896662079559638, "grad_norm": 0.52734375, "learning_rate": 0.00034482670336352287, "loss": 5.1938, "step": 1077500 }, { "epoch": 6.899862386789132, "grad_norm": 0.54296875, "learning_rate": 0.00034480110090568696, "loss": 5.1915, "step": 1078000 }, { "epoch": 6.903062694018626, "grad_norm": 0.5, "learning_rate": 0.00034477549844785105, "loss": 5.1905, "step": 1078500 }, { "epoch": 6.90626300124812, "grad_norm": 0.546875, "learning_rate": 0.0003447498959900151, "loss": 5.1938, "step": 1079000 }, { "epoch": 6.909463308477614, "grad_norm": 0.53515625, "learning_rate": 0.0003447242935321791, "loss": 5.1896, "step": 1079500 }, { "epoch": 6.912663615707108, "grad_norm": 0.50390625, "learning_rate": 0.00034469869107434315, "loss": 5.1903, "step": 1080000 }, { "epoch": 6.915863922936602, "grad_norm": 0.57421875, "learning_rate": 0.00034467308861650724, "loss": 5.195, "step": 1080500 }, { "epoch": 6.919064230166096, "grad_norm": 0.49609375, "learning_rate": 0.00034464748615867127, "loss": 5.1908, "step": 1081000 }, { "epoch": 6.92226453739559, "grad_norm": 0.5625, "learning_rate": 0.0003446218837008353, "loss": 5.1953, "step": 1081500 }, { "epoch": 6.925464844625084, "grad_norm": 0.5234375, "learning_rate": 0.00034459628124299934, "loss": 5.1934, "step": 1082000 }, { "epoch": 6.928665151854578, "grad_norm": 0.546875, "learning_rate": 0.0003445706787851634, "loss": 5.1893, "step": 1082500 }, { "epoch": 6.931865459084072, "grad_norm": 0.5859375, "learning_rate": 0.0003445450763273274, "loss": 5.1902, "step": 1083000 }, { "epoch": 6.935065766313566, "grad_norm": 0.55859375, "learning_rate": 0.0003445194738694915, "loss": 5.1884, "step": 1083500 }, { "epoch": 6.93826607354306, "grad_norm": 0.51953125, "learning_rate": 0.00034449387141165553, "loss": 5.1871, "step": 1084000 }, { "epoch": 6.941466380772554, "grad_norm": 0.62890625, "learning_rate": 0.00034446826895381957, "loss": 5.1905, "step": 1084500 }, { "epoch": 6.944666688002048, "grad_norm": 0.53125, "learning_rate": 0.0003444426664959836, "loss": 5.1862, "step": 1085000 }, { "epoch": 6.947866995231542, "grad_norm": 0.546875, "learning_rate": 0.0003444170640381477, "loss": 5.1877, "step": 1085500 }, { "epoch": 6.951067302461036, "grad_norm": 0.625, "learning_rate": 0.0003443914615803117, "loss": 5.1917, "step": 1086000 }, { "epoch": 6.954267609690531, "grad_norm": 0.5390625, "learning_rate": 0.0003443658591224758, "loss": 5.1928, "step": 1086500 }, { "epoch": 6.957467916920025, "grad_norm": 0.5390625, "learning_rate": 0.00034434025666463985, "loss": 5.1903, "step": 1087000 }, { "epoch": 6.960668224149519, "grad_norm": 0.52734375, "learning_rate": 0.0003443146542068039, "loss": 5.1885, "step": 1087500 }, { "epoch": 6.9638685313790125, "grad_norm": 0.546875, "learning_rate": 0.0003442890517489679, "loss": 5.1883, "step": 1088000 }, { "epoch": 6.9670688386085065, "grad_norm": 0.5859375, "learning_rate": 0.00034426344929113195, "loss": 5.184, "step": 1088500 }, { "epoch": 6.970269145838, "grad_norm": 0.57421875, "learning_rate": 0.00034423784683329604, "loss": 5.1899, "step": 1089000 }, { "epoch": 6.973469453067494, "grad_norm": 0.59765625, "learning_rate": 0.00034421224437546007, "loss": 5.1946, "step": 1089500 }, { "epoch": 6.976669760296988, "grad_norm": 0.5625, "learning_rate": 0.0003441866419176241, "loss": 5.1946, "step": 1090000 }, { "epoch": 6.979870067526482, "grad_norm": 0.494140625, "learning_rate": 0.00034416103945978814, "loss": 5.1905, "step": 1090500 }, { "epoch": 6.983070374755977, "grad_norm": 0.515625, "learning_rate": 0.0003441354370019522, "loss": 5.1917, "step": 1091000 }, { "epoch": 6.986270681985471, "grad_norm": 0.52734375, "learning_rate": 0.00034410983454411626, "loss": 5.1895, "step": 1091500 }, { "epoch": 6.989470989214965, "grad_norm": 0.52734375, "learning_rate": 0.0003440842320862803, "loss": 5.1894, "step": 1092000 }, { "epoch": 6.992671296444459, "grad_norm": 0.58203125, "learning_rate": 0.00034405862962844433, "loss": 5.1908, "step": 1092500 }, { "epoch": 6.995871603673953, "grad_norm": 0.578125, "learning_rate": 0.00034403302717060836, "loss": 5.1952, "step": 1093000 }, { "epoch": 6.999071910903447, "grad_norm": 0.57421875, "learning_rate": 0.00034400742471277245, "loss": 5.1905, "step": 1093500 }, { "epoch": 7.0, "eval_loss": 5.1858015060424805, "eval_runtime": 1.1382, "eval_samples_per_second": 878.603, "eval_steps_per_second": 14.058, "step": 1093645 } ], "logging_steps": 500, "max_steps": 7811750, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.473295707163853e+19, "train_batch_size": 64, "trial_name": null, "trial_params": null }