diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15407 @@ +{ + "best_metric": 5.182580947875977, + "best_model_checkpoint": "./results/models/mistral-prot/checkpoint-937410", + "epoch": 7.0, + "eval_steps": 500, + "global_step": 1093645, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032003072294940313, + "grad_norm": 0.291015625, + "learning_rate": 0.00039997439754216405, + "loss": 5.83, + "step": 500 + }, + { + "epoch": 0.006400614458988063, + "grad_norm": 0.30859375, + "learning_rate": 0.0003999487950843281, + "loss": 5.6885, + "step": 1000 + }, + { + "epoch": 0.009600921688482094, + "grad_norm": 0.271484375, + "learning_rate": 0.0003999231926264922, + "loss": 5.6553, + "step": 1500 + }, + { + "epoch": 0.012801228917976125, + "grad_norm": 0.28125, + "learning_rate": 0.0003998975901686562, + "loss": 5.6407, + "step": 2000 + }, + { + "epoch": 0.016001536147470158, + "grad_norm": 0.2578125, + "learning_rate": 0.00039987198771082024, + "loss": 5.6315, + "step": 2500 + }, + { + "epoch": 0.01920184337696419, + "grad_norm": 0.2470703125, + "learning_rate": 0.0003998463852529843, + "loss": 5.6252, + "step": 3000 + }, + { + "epoch": 0.02240215060645822, + "grad_norm": 0.26953125, + "learning_rate": 0.00039982078279514837, + "loss": 5.6176, + "step": 3500 + }, + { + "epoch": 0.02560245783595225, + "grad_norm": 0.30078125, + "learning_rate": 0.0003997951803373124, + "loss": 5.6118, + "step": 4000 + }, + { + "epoch": 0.02880276506544628, + "grad_norm": 0.259765625, + "learning_rate": 0.0003997695778794765, + "loss": 5.6065, + "step": 4500 + }, + { + "epoch": 0.032003072294940316, + "grad_norm": 0.26171875, + "learning_rate": 0.0003997439754216405, + "loss": 5.6034, + "step": 5000 + }, + { + "epoch": 0.03520337952443434, + "grad_norm": 0.271484375, + "learning_rate": 0.00039971837296380456, + "loss": 5.5962, + "step": 5500 + }, + { + "epoch": 0.03840368675392838, + "grad_norm": 0.26953125, + "learning_rate": 0.0003996927705059686, + "loss": 5.5968, + "step": 6000 + }, + { + "epoch": 0.04160399398342241, + "grad_norm": 0.265625, + "learning_rate": 0.0003996671680481326, + "loss": 5.5895, + "step": 6500 + }, + { + "epoch": 0.04480430121291644, + "grad_norm": 0.2578125, + "learning_rate": 0.0003996415655902967, + "loss": 5.5867, + "step": 7000 + }, + { + "epoch": 0.048004608442410474, + "grad_norm": 0.328125, + "learning_rate": 0.00039961596313246075, + "loss": 5.5837, + "step": 7500 + }, + { + "epoch": 0.0512049156719045, + "grad_norm": 0.328125, + "learning_rate": 0.0003995903606746248, + "loss": 5.5813, + "step": 8000 + }, + { + "epoch": 0.054405222901398535, + "grad_norm": 0.267578125, + "learning_rate": 0.0003995647582167888, + "loss": 5.577, + "step": 8500 + }, + { + "epoch": 0.05760553013089256, + "grad_norm": 0.251953125, + "learning_rate": 0.00039953915575895285, + "loss": 5.571, + "step": 9000 + }, + { + "epoch": 0.0608058373603866, + "grad_norm": 0.25390625, + "learning_rate": 0.0003995135533011169, + "loss": 5.5684, + "step": 9500 + }, + { + "epoch": 0.06400614458988063, + "grad_norm": 0.25390625, + "learning_rate": 0.000399487950843281, + "loss": 5.566, + "step": 10000 + }, + { + "epoch": 0.06720645181937467, + "grad_norm": 0.255859375, + "learning_rate": 0.000399462348385445, + "loss": 5.5603, + "step": 10500 + }, + { + "epoch": 0.07040675904886869, + "grad_norm": 0.2734375, + "learning_rate": 0.00039943674592760904, + "loss": 5.5588, + "step": 11000 + }, + { + "epoch": 0.07360706627836272, + "grad_norm": 0.2392578125, + "learning_rate": 0.00039941114346977313, + "loss": 5.5549, + "step": 11500 + }, + { + "epoch": 0.07680737350785675, + "grad_norm": 0.265625, + "learning_rate": 0.00039938554101193717, + "loss": 5.5491, + "step": 12000 + }, + { + "epoch": 0.08000768073735079, + "grad_norm": 0.244140625, + "learning_rate": 0.00039935993855410125, + "loss": 5.5479, + "step": 12500 + }, + { + "epoch": 0.08320798796684482, + "grad_norm": 0.27734375, + "learning_rate": 0.0003993343360962653, + "loss": 5.5413, + "step": 13000 + }, + { + "epoch": 0.08640829519633884, + "grad_norm": 0.26171875, + "learning_rate": 0.0003993087336384293, + "loss": 5.5361, + "step": 13500 + }, + { + "epoch": 0.08960860242583288, + "grad_norm": 0.263671875, + "learning_rate": 0.00039928313118059336, + "loss": 5.5309, + "step": 14000 + }, + { + "epoch": 0.09280890965532691, + "grad_norm": 0.314453125, + "learning_rate": 0.0003992575287227574, + "loss": 5.5225, + "step": 14500 + }, + { + "epoch": 0.09600921688482095, + "grad_norm": 0.265625, + "learning_rate": 0.0003992319262649214, + "loss": 5.5158, + "step": 15000 + }, + { + "epoch": 0.09920952411431497, + "grad_norm": 0.28125, + "learning_rate": 0.0003992063238070855, + "loss": 5.5118, + "step": 15500 + }, + { + "epoch": 0.102409831343809, + "grad_norm": 0.2421875, + "learning_rate": 0.00039918072134924955, + "loss": 5.5003, + "step": 16000 + }, + { + "epoch": 0.10561013857330304, + "grad_norm": 0.255859375, + "learning_rate": 0.0003991551188914136, + "loss": 5.4983, + "step": 16500 + }, + { + "epoch": 0.10881044580279707, + "grad_norm": 0.25390625, + "learning_rate": 0.0003991295164335776, + "loss": 5.4975, + "step": 17000 + }, + { + "epoch": 0.1120107530322911, + "grad_norm": 0.26953125, + "learning_rate": 0.00039910391397574165, + "loss": 5.4921, + "step": 17500 + }, + { + "epoch": 0.11521106026178513, + "grad_norm": 0.26171875, + "learning_rate": 0.00039907831151790574, + "loss": 5.4822, + "step": 18000 + }, + { + "epoch": 0.11841136749127916, + "grad_norm": 0.25390625, + "learning_rate": 0.0003990527090600698, + "loss": 5.476, + "step": 18500 + }, + { + "epoch": 0.1216116747207732, + "grad_norm": 0.251953125, + "learning_rate": 0.00039902710660223386, + "loss": 5.4742, + "step": 19000 + }, + { + "epoch": 0.12481198195026723, + "grad_norm": 0.248046875, + "learning_rate": 0.0003990015041443979, + "loss": 5.4732, + "step": 19500 + }, + { + "epoch": 0.12801228917976126, + "grad_norm": 0.248046875, + "learning_rate": 0.00039897590168656193, + "loss": 5.4715, + "step": 20000 + }, + { + "epoch": 0.13121259640925528, + "grad_norm": 0.2578125, + "learning_rate": 0.000398950299228726, + "loss": 5.4682, + "step": 20500 + }, + { + "epoch": 0.13441290363874933, + "grad_norm": 0.2734375, + "learning_rate": 0.00039892469677089005, + "loss": 5.4649, + "step": 21000 + }, + { + "epoch": 0.13761321086824335, + "grad_norm": 0.26171875, + "learning_rate": 0.0003988990943130541, + "loss": 5.4609, + "step": 21500 + }, + { + "epoch": 0.14081351809773737, + "grad_norm": 0.271484375, + "learning_rate": 0.0003988734918552181, + "loss": 5.4567, + "step": 22000 + }, + { + "epoch": 0.14401382532723142, + "grad_norm": 0.255859375, + "learning_rate": 0.00039884788939738216, + "loss": 5.4546, + "step": 22500 + }, + { + "epoch": 0.14721413255672544, + "grad_norm": 0.2578125, + "learning_rate": 0.0003988222869395462, + "loss": 5.4503, + "step": 23000 + }, + { + "epoch": 0.1504144397862195, + "grad_norm": 0.267578125, + "learning_rate": 0.0003987966844817103, + "loss": 5.4432, + "step": 23500 + }, + { + "epoch": 0.1536147470157135, + "grad_norm": 0.25390625, + "learning_rate": 0.0003987710820238743, + "loss": 5.4463, + "step": 24000 + }, + { + "epoch": 0.15681505424520753, + "grad_norm": 0.267578125, + "learning_rate": 0.00039874547956603835, + "loss": 5.4436, + "step": 24500 + }, + { + "epoch": 0.16001536147470158, + "grad_norm": 0.279296875, + "learning_rate": 0.0003987198771082024, + "loss": 5.4418, + "step": 25000 + }, + { + "epoch": 0.1632156687041956, + "grad_norm": 0.259765625, + "learning_rate": 0.0003986942746503664, + "loss": 5.4418, + "step": 25500 + }, + { + "epoch": 0.16641597593368965, + "grad_norm": 0.26953125, + "learning_rate": 0.0003986686721925305, + "loss": 5.4369, + "step": 26000 + }, + { + "epoch": 0.16961628316318367, + "grad_norm": 0.2490234375, + "learning_rate": 0.00039864306973469454, + "loss": 5.4352, + "step": 26500 + }, + { + "epoch": 0.1728165903926777, + "grad_norm": 0.2578125, + "learning_rate": 0.00039861746727685863, + "loss": 5.4318, + "step": 27000 + }, + { + "epoch": 0.17601689762217174, + "grad_norm": 0.25390625, + "learning_rate": 0.00039859186481902266, + "loss": 5.4271, + "step": 27500 + }, + { + "epoch": 0.17921720485166576, + "grad_norm": 0.265625, + "learning_rate": 0.0003985662623611867, + "loss": 5.4238, + "step": 28000 + }, + { + "epoch": 0.1824175120811598, + "grad_norm": 0.251953125, + "learning_rate": 0.00039854065990335073, + "loss": 5.4235, + "step": 28500 + }, + { + "epoch": 0.18561781931065383, + "grad_norm": 0.271484375, + "learning_rate": 0.0003985150574455148, + "loss": 5.4201, + "step": 29000 + }, + { + "epoch": 0.18881812654014785, + "grad_norm": 0.263671875, + "learning_rate": 0.00039848945498767885, + "loss": 5.4201, + "step": 29500 + }, + { + "epoch": 0.1920184337696419, + "grad_norm": 0.259765625, + "learning_rate": 0.0003984638525298429, + "loss": 5.4195, + "step": 30000 + }, + { + "epoch": 0.19521874099913591, + "grad_norm": 0.255859375, + "learning_rate": 0.0003984382500720069, + "loss": 5.4198, + "step": 30500 + }, + { + "epoch": 0.19841904822862994, + "grad_norm": 0.263671875, + "learning_rate": 0.00039841264761417096, + "loss": 5.4172, + "step": 31000 + }, + { + "epoch": 0.20161935545812398, + "grad_norm": 0.255859375, + "learning_rate": 0.00039838704515633504, + "loss": 5.4132, + "step": 31500 + }, + { + "epoch": 0.204819662687618, + "grad_norm": 0.271484375, + "learning_rate": 0.0003983614426984991, + "loss": 5.4128, + "step": 32000 + }, + { + "epoch": 0.20801996991711205, + "grad_norm": 0.25, + "learning_rate": 0.0003983358402406631, + "loss": 5.4085, + "step": 32500 + }, + { + "epoch": 0.21122027714660607, + "grad_norm": 0.2578125, + "learning_rate": 0.00039831023778282715, + "loss": 5.4076, + "step": 33000 + }, + { + "epoch": 0.2144205843761001, + "grad_norm": 0.26953125, + "learning_rate": 0.0003982846353249912, + "loss": 5.4045, + "step": 33500 + }, + { + "epoch": 0.21762089160559414, + "grad_norm": 0.2578125, + "learning_rate": 0.00039825903286715527, + "loss": 5.4033, + "step": 34000 + }, + { + "epoch": 0.22082119883508816, + "grad_norm": 0.2490234375, + "learning_rate": 0.0003982334304093193, + "loss": 5.4007, + "step": 34500 + }, + { + "epoch": 0.2240215060645822, + "grad_norm": 0.26953125, + "learning_rate": 0.0003982078279514834, + "loss": 5.3985, + "step": 35000 + }, + { + "epoch": 0.22722181329407623, + "grad_norm": 0.265625, + "learning_rate": 0.0003981822254936474, + "loss": 5.3999, + "step": 35500 + }, + { + "epoch": 0.23042212052357025, + "grad_norm": 0.2490234375, + "learning_rate": 0.00039815662303581146, + "loss": 5.3982, + "step": 36000 + }, + { + "epoch": 0.2336224277530643, + "grad_norm": 0.28515625, + "learning_rate": 0.0003981310205779755, + "loss": 5.3966, + "step": 36500 + }, + { + "epoch": 0.23682273498255832, + "grad_norm": 0.265625, + "learning_rate": 0.0003981054181201396, + "loss": 5.3965, + "step": 37000 + }, + { + "epoch": 0.24002304221205237, + "grad_norm": 0.26953125, + "learning_rate": 0.0003980798156623036, + "loss": 5.3941, + "step": 37500 + }, + { + "epoch": 0.2432233494415464, + "grad_norm": 0.267578125, + "learning_rate": 0.00039805421320446765, + "loss": 5.3914, + "step": 38000 + }, + { + "epoch": 0.2464236566710404, + "grad_norm": 0.265625, + "learning_rate": 0.0003980286107466317, + "loss": 5.387, + "step": 38500 + }, + { + "epoch": 0.24962396390053446, + "grad_norm": 0.265625, + "learning_rate": 0.0003980030082887957, + "loss": 5.3889, + "step": 39000 + }, + { + "epoch": 0.2528242711300285, + "grad_norm": 0.2734375, + "learning_rate": 0.00039797740583095975, + "loss": 5.3829, + "step": 39500 + }, + { + "epoch": 0.2560245783595225, + "grad_norm": 0.2734375, + "learning_rate": 0.00039795180337312384, + "loss": 5.3887, + "step": 40000 + }, + { + "epoch": 0.2592248855890165, + "grad_norm": 0.26953125, + "learning_rate": 0.0003979262009152879, + "loss": 5.3815, + "step": 40500 + }, + { + "epoch": 0.26242519281851057, + "grad_norm": 0.28515625, + "learning_rate": 0.0003979005984574519, + "loss": 5.3823, + "step": 41000 + }, + { + "epoch": 0.2656255000480046, + "grad_norm": 0.2734375, + "learning_rate": 0.000397874995999616, + "loss": 5.3804, + "step": 41500 + }, + { + "epoch": 0.26882580727749866, + "grad_norm": 0.2734375, + "learning_rate": 0.00039784939354178003, + "loss": 5.3816, + "step": 42000 + }, + { + "epoch": 0.27202611450699266, + "grad_norm": 0.26171875, + "learning_rate": 0.0003978237910839441, + "loss": 5.3798, + "step": 42500 + }, + { + "epoch": 0.2752264217364867, + "grad_norm": 0.25390625, + "learning_rate": 0.00039779818862610816, + "loss": 5.3706, + "step": 43000 + }, + { + "epoch": 0.27842672896598075, + "grad_norm": 0.26953125, + "learning_rate": 0.0003977725861682722, + "loss": 5.3747, + "step": 43500 + }, + { + "epoch": 0.28162703619547474, + "grad_norm": 0.279296875, + "learning_rate": 0.0003977469837104362, + "loss": 5.3684, + "step": 44000 + }, + { + "epoch": 0.2848273434249688, + "grad_norm": 0.26953125, + "learning_rate": 0.00039772138125260026, + "loss": 5.3669, + "step": 44500 + }, + { + "epoch": 0.28802765065446284, + "grad_norm": 0.259765625, + "learning_rate": 0.0003976957787947643, + "loss": 5.3725, + "step": 45000 + }, + { + "epoch": 0.29122795788395683, + "grad_norm": 0.2734375, + "learning_rate": 0.0003976701763369284, + "loss": 5.3737, + "step": 45500 + }, + { + "epoch": 0.2944282651134509, + "grad_norm": 0.2734375, + "learning_rate": 0.0003976445738790924, + "loss": 5.3694, + "step": 46000 + }, + { + "epoch": 0.29762857234294493, + "grad_norm": 0.265625, + "learning_rate": 0.00039761897142125645, + "loss": 5.3707, + "step": 46500 + }, + { + "epoch": 0.300828879572439, + "grad_norm": 0.283203125, + "learning_rate": 0.0003975933689634205, + "loss": 5.3625, + "step": 47000 + }, + { + "epoch": 0.30402918680193297, + "grad_norm": 0.267578125, + "learning_rate": 0.0003975677665055845, + "loss": 5.3647, + "step": 47500 + }, + { + "epoch": 0.307229494031427, + "grad_norm": 0.2578125, + "learning_rate": 0.0003975421640477486, + "loss": 5.3624, + "step": 48000 + }, + { + "epoch": 0.31042980126092107, + "grad_norm": 0.267578125, + "learning_rate": 0.00039751656158991264, + "loss": 5.3623, + "step": 48500 + }, + { + "epoch": 0.31363010849041506, + "grad_norm": 0.2734375, + "learning_rate": 0.0003974909591320767, + "loss": 5.3642, + "step": 49000 + }, + { + "epoch": 0.3168304157199091, + "grad_norm": 0.28125, + "learning_rate": 0.00039746535667424076, + "loss": 5.3574, + "step": 49500 + }, + { + "epoch": 0.32003072294940316, + "grad_norm": 0.267578125, + "learning_rate": 0.0003974397542164048, + "loss": 5.361, + "step": 50000 + }, + { + "epoch": 0.32323103017889715, + "grad_norm": 0.279296875, + "learning_rate": 0.00039741415175856883, + "loss": 5.3607, + "step": 50500 + }, + { + "epoch": 0.3264313374083912, + "grad_norm": 0.265625, + "learning_rate": 0.0003973885493007329, + "loss": 5.3574, + "step": 51000 + }, + { + "epoch": 0.32963164463788525, + "grad_norm": 0.271484375, + "learning_rate": 0.00039736294684289696, + "loss": 5.353, + "step": 51500 + }, + { + "epoch": 0.3328319518673793, + "grad_norm": 0.263671875, + "learning_rate": 0.000397337344385061, + "loss": 5.3515, + "step": 52000 + }, + { + "epoch": 0.3360322590968733, + "grad_norm": 0.283203125, + "learning_rate": 0.000397311741927225, + "loss": 5.3555, + "step": 52500 + }, + { + "epoch": 0.33923256632636734, + "grad_norm": 0.279296875, + "learning_rate": 0.00039728613946938906, + "loss": 5.3451, + "step": 53000 + }, + { + "epoch": 0.3424328735558614, + "grad_norm": 0.267578125, + "learning_rate": 0.00039726053701155315, + "loss": 5.3454, + "step": 53500 + }, + { + "epoch": 0.3456331807853554, + "grad_norm": 0.27734375, + "learning_rate": 0.0003972349345537172, + "loss": 5.3493, + "step": 54000 + }, + { + "epoch": 0.3488334880148494, + "grad_norm": 0.279296875, + "learning_rate": 0.0003972093320958812, + "loss": 5.3508, + "step": 54500 + }, + { + "epoch": 0.3520337952443435, + "grad_norm": 0.27734375, + "learning_rate": 0.00039718372963804525, + "loss": 5.3413, + "step": 55000 + }, + { + "epoch": 0.35523410247383747, + "grad_norm": 0.275390625, + "learning_rate": 0.0003971581271802093, + "loss": 5.3471, + "step": 55500 + }, + { + "epoch": 0.3584344097033315, + "grad_norm": 0.26953125, + "learning_rate": 0.0003971325247223733, + "loss": 5.3464, + "step": 56000 + }, + { + "epoch": 0.36163471693282556, + "grad_norm": 0.2890625, + "learning_rate": 0.0003971069222645374, + "loss": 5.3434, + "step": 56500 + }, + { + "epoch": 0.3648350241623196, + "grad_norm": 0.263671875, + "learning_rate": 0.0003970813198067015, + "loss": 5.3393, + "step": 57000 + }, + { + "epoch": 0.3680353313918136, + "grad_norm": 0.28515625, + "learning_rate": 0.00039705571734886553, + "loss": 5.3449, + "step": 57500 + }, + { + "epoch": 0.37123563862130765, + "grad_norm": 0.291015625, + "learning_rate": 0.00039703011489102956, + "loss": 5.3393, + "step": 58000 + }, + { + "epoch": 0.3744359458508017, + "grad_norm": 0.28515625, + "learning_rate": 0.0003970045124331936, + "loss": 5.3411, + "step": 58500 + }, + { + "epoch": 0.3776362530802957, + "grad_norm": 0.271484375, + "learning_rate": 0.0003969789099753577, + "loss": 5.3293, + "step": 59000 + }, + { + "epoch": 0.38083656030978974, + "grad_norm": 0.28515625, + "learning_rate": 0.0003969533075175217, + "loss": 5.3352, + "step": 59500 + }, + { + "epoch": 0.3840368675392838, + "grad_norm": 0.2734375, + "learning_rate": 0.00039692770505968575, + "loss": 5.334, + "step": 60000 + }, + { + "epoch": 0.3872371747687778, + "grad_norm": 0.271484375, + "learning_rate": 0.0003969021026018498, + "loss": 5.3365, + "step": 60500 + }, + { + "epoch": 0.39043748199827183, + "grad_norm": 0.2734375, + "learning_rate": 0.0003968765001440138, + "loss": 5.3376, + "step": 61000 + }, + { + "epoch": 0.3936377892277659, + "grad_norm": 0.275390625, + "learning_rate": 0.0003968508976861779, + "loss": 5.332, + "step": 61500 + }, + { + "epoch": 0.39683809645725987, + "grad_norm": 0.275390625, + "learning_rate": 0.00039682529522834195, + "loss": 5.3327, + "step": 62000 + }, + { + "epoch": 0.4000384036867539, + "grad_norm": 0.28515625, + "learning_rate": 0.000396799692770506, + "loss": 5.3324, + "step": 62500 + }, + { + "epoch": 0.40323871091624797, + "grad_norm": 0.29296875, + "learning_rate": 0.00039677409031267, + "loss": 5.333, + "step": 63000 + }, + { + "epoch": 0.406439018145742, + "grad_norm": 0.283203125, + "learning_rate": 0.00039674848785483405, + "loss": 5.3365, + "step": 63500 + }, + { + "epoch": 0.409639325375236, + "grad_norm": 0.271484375, + "learning_rate": 0.00039672288539699814, + "loss": 5.3299, + "step": 64000 + }, + { + "epoch": 0.41283963260473006, + "grad_norm": 0.291015625, + "learning_rate": 0.00039669728293916217, + "loss": 5.3274, + "step": 64500 + }, + { + "epoch": 0.4160399398342241, + "grad_norm": 0.279296875, + "learning_rate": 0.00039667168048132626, + "loss": 5.3297, + "step": 65000 + }, + { + "epoch": 0.4192402470637181, + "grad_norm": 0.27734375, + "learning_rate": 0.0003966460780234903, + "loss": 5.3297, + "step": 65500 + }, + { + "epoch": 0.42244055429321214, + "grad_norm": 0.310546875, + "learning_rate": 0.00039662047556565433, + "loss": 5.3258, + "step": 66000 + }, + { + "epoch": 0.4256408615227062, + "grad_norm": 0.279296875, + "learning_rate": 0.00039659487310781836, + "loss": 5.3202, + "step": 66500 + }, + { + "epoch": 0.4288411687522002, + "grad_norm": 0.279296875, + "learning_rate": 0.00039656927064998245, + "loss": 5.3167, + "step": 67000 + }, + { + "epoch": 0.43204147598169423, + "grad_norm": 0.2890625, + "learning_rate": 0.0003965436681921465, + "loss": 5.3217, + "step": 67500 + }, + { + "epoch": 0.4352417832111883, + "grad_norm": 0.291015625, + "learning_rate": 0.0003965180657343105, + "loss": 5.3199, + "step": 68000 + }, + { + "epoch": 0.43844209044068233, + "grad_norm": 0.310546875, + "learning_rate": 0.00039649246327647455, + "loss": 5.3201, + "step": 68500 + }, + { + "epoch": 0.4416423976701763, + "grad_norm": 0.2734375, + "learning_rate": 0.0003964668608186386, + "loss": 5.3223, + "step": 69000 + }, + { + "epoch": 0.44484270489967037, + "grad_norm": 0.279296875, + "learning_rate": 0.0003964412583608026, + "loss": 5.3276, + "step": 69500 + }, + { + "epoch": 0.4480430121291644, + "grad_norm": 0.271484375, + "learning_rate": 0.0003964156559029667, + "loss": 5.3137, + "step": 70000 + }, + { + "epoch": 0.4512433193586584, + "grad_norm": 0.2890625, + "learning_rate": 0.00039639005344513074, + "loss": 5.3179, + "step": 70500 + }, + { + "epoch": 0.45444362658815246, + "grad_norm": 0.283203125, + "learning_rate": 0.0003963644509872948, + "loss": 5.3192, + "step": 71000 + }, + { + "epoch": 0.4576439338176465, + "grad_norm": 0.291015625, + "learning_rate": 0.0003963388485294588, + "loss": 5.317, + "step": 71500 + }, + { + "epoch": 0.4608442410471405, + "grad_norm": 0.291015625, + "learning_rate": 0.0003963132460716229, + "loss": 5.3145, + "step": 72000 + }, + { + "epoch": 0.46404454827663455, + "grad_norm": 0.283203125, + "learning_rate": 0.00039628764361378694, + "loss": 5.3123, + "step": 72500 + }, + { + "epoch": 0.4672448555061286, + "grad_norm": 0.287109375, + "learning_rate": 0.000396262041155951, + "loss": 5.3127, + "step": 73000 + }, + { + "epoch": 0.47044516273562265, + "grad_norm": 0.287109375, + "learning_rate": 0.00039623643869811506, + "loss": 5.3195, + "step": 73500 + }, + { + "epoch": 0.47364546996511664, + "grad_norm": 0.28515625, + "learning_rate": 0.0003962108362402791, + "loss": 5.3111, + "step": 74000 + }, + { + "epoch": 0.4768457771946107, + "grad_norm": 0.2890625, + "learning_rate": 0.00039618523378244313, + "loss": 5.3141, + "step": 74500 + }, + { + "epoch": 0.48004608442410474, + "grad_norm": 0.287109375, + "learning_rate": 0.00039615963132460716, + "loss": 5.3062, + "step": 75000 + }, + { + "epoch": 0.48324639165359873, + "grad_norm": 0.283203125, + "learning_rate": 0.00039613402886677125, + "loss": 5.3127, + "step": 75500 + }, + { + "epoch": 0.4864466988830928, + "grad_norm": 0.2890625, + "learning_rate": 0.0003961084264089353, + "loss": 5.31, + "step": 76000 + }, + { + "epoch": 0.4896470061125868, + "grad_norm": 0.287109375, + "learning_rate": 0.0003960828239510993, + "loss": 5.3139, + "step": 76500 + }, + { + "epoch": 0.4928473133420808, + "grad_norm": 0.27734375, + "learning_rate": 0.00039605722149326335, + "loss": 5.3109, + "step": 77000 + }, + { + "epoch": 0.49604762057157487, + "grad_norm": 0.279296875, + "learning_rate": 0.0003960316190354274, + "loss": 5.3037, + "step": 77500 + }, + { + "epoch": 0.4992479278010689, + "grad_norm": 0.275390625, + "learning_rate": 0.0003960060165775915, + "loss": 5.3057, + "step": 78000 + }, + { + "epoch": 0.5024482350305629, + "grad_norm": 0.29296875, + "learning_rate": 0.0003959804141197555, + "loss": 5.3097, + "step": 78500 + }, + { + "epoch": 0.505648542260057, + "grad_norm": 0.3046875, + "learning_rate": 0.00039595481166191954, + "loss": 5.3118, + "step": 79000 + }, + { + "epoch": 0.508848849489551, + "grad_norm": 0.291015625, + "learning_rate": 0.00039592920920408363, + "loss": 5.3031, + "step": 79500 + }, + { + "epoch": 0.512049156719045, + "grad_norm": 0.287109375, + "learning_rate": 0.00039590360674624767, + "loss": 5.3047, + "step": 80000 + }, + { + "epoch": 0.5152494639485391, + "grad_norm": 0.283203125, + "learning_rate": 0.0003958780042884117, + "loss": 5.307, + "step": 80500 + }, + { + "epoch": 0.518449771178033, + "grad_norm": 0.291015625, + "learning_rate": 0.0003958524018305758, + "loss": 5.3015, + "step": 81000 + }, + { + "epoch": 0.5216500784075271, + "grad_norm": 0.3046875, + "learning_rate": 0.0003958267993727398, + "loss": 5.2987, + "step": 81500 + }, + { + "epoch": 0.5248503856370211, + "grad_norm": 0.28515625, + "learning_rate": 0.00039580119691490386, + "loss": 5.2999, + "step": 82000 + }, + { + "epoch": 0.5280506928665152, + "grad_norm": 0.279296875, + "learning_rate": 0.0003957755944570679, + "loss": 5.2987, + "step": 82500 + }, + { + "epoch": 0.5312510000960092, + "grad_norm": 0.294921875, + "learning_rate": 0.0003957499919992319, + "loss": 5.301, + "step": 83000 + }, + { + "epoch": 0.5344513073255033, + "grad_norm": 0.296875, + "learning_rate": 0.000395724389541396, + "loss": 5.3026, + "step": 83500 + }, + { + "epoch": 0.5376516145549973, + "grad_norm": 0.2890625, + "learning_rate": 0.00039569878708356005, + "loss": 5.2979, + "step": 84000 + }, + { + "epoch": 0.5408519217844913, + "grad_norm": 0.298828125, + "learning_rate": 0.0003956731846257241, + "loss": 5.3044, + "step": 84500 + }, + { + "epoch": 0.5440522290139853, + "grad_norm": 0.30078125, + "learning_rate": 0.0003956475821678881, + "loss": 5.3011, + "step": 85000 + }, + { + "epoch": 0.5472525362434794, + "grad_norm": 0.27734375, + "learning_rate": 0.00039562197971005215, + "loss": 5.3003, + "step": 85500 + }, + { + "epoch": 0.5504528434729734, + "grad_norm": 0.287109375, + "learning_rate": 0.0003955963772522162, + "loss": 5.3009, + "step": 86000 + }, + { + "epoch": 0.5536531507024675, + "grad_norm": 0.2890625, + "learning_rate": 0.0003955707747943803, + "loss": 5.299, + "step": 86500 + }, + { + "epoch": 0.5568534579319615, + "grad_norm": 0.27734375, + "learning_rate": 0.0003955451723365443, + "loss": 5.295, + "step": 87000 + }, + { + "epoch": 0.5600537651614556, + "grad_norm": 0.28515625, + "learning_rate": 0.0003955195698787084, + "loss": 5.2971, + "step": 87500 + }, + { + "epoch": 0.5632540723909495, + "grad_norm": 0.29296875, + "learning_rate": 0.00039549396742087243, + "loss": 5.2982, + "step": 88000 + }, + { + "epoch": 0.5664543796204435, + "grad_norm": 0.30078125, + "learning_rate": 0.00039546836496303647, + "loss": 5.2923, + "step": 88500 + }, + { + "epoch": 0.5696546868499376, + "grad_norm": 0.283203125, + "learning_rate": 0.00039544276250520055, + "loss": 5.2898, + "step": 89000 + }, + { + "epoch": 0.5728549940794316, + "grad_norm": 0.296875, + "learning_rate": 0.0003954171600473646, + "loss": 5.298, + "step": 89500 + }, + { + "epoch": 0.5760553013089257, + "grad_norm": 0.291015625, + "learning_rate": 0.0003953915575895286, + "loss": 5.2947, + "step": 90000 + }, + { + "epoch": 0.5792556085384197, + "grad_norm": 0.28125, + "learning_rate": 0.00039536595513169266, + "loss": 5.2937, + "step": 90500 + }, + { + "epoch": 0.5824559157679137, + "grad_norm": 0.283203125, + "learning_rate": 0.0003953403526738567, + "loss": 5.287, + "step": 91000 + }, + { + "epoch": 0.5856562229974077, + "grad_norm": 0.28515625, + "learning_rate": 0.0003953147502160207, + "loss": 5.2907, + "step": 91500 + }, + { + "epoch": 0.5888565302269018, + "grad_norm": 0.294921875, + "learning_rate": 0.0003952891477581848, + "loss": 5.2879, + "step": 92000 + }, + { + "epoch": 0.5920568374563958, + "grad_norm": 0.29296875, + "learning_rate": 0.00039526354530034885, + "loss": 5.2897, + "step": 92500 + }, + { + "epoch": 0.5952571446858899, + "grad_norm": 0.30078125, + "learning_rate": 0.0003952379428425129, + "loss": 5.2899, + "step": 93000 + }, + { + "epoch": 0.5984574519153839, + "grad_norm": 0.287109375, + "learning_rate": 0.0003952123403846769, + "loss": 5.2895, + "step": 93500 + }, + { + "epoch": 0.601657759144878, + "grad_norm": 0.279296875, + "learning_rate": 0.00039518673792684095, + "loss": 5.293, + "step": 94000 + }, + { + "epoch": 0.6048580663743719, + "grad_norm": 0.296875, + "learning_rate": 0.00039516113546900504, + "loss": 5.2785, + "step": 94500 + }, + { + "epoch": 0.6080583736038659, + "grad_norm": 0.291015625, + "learning_rate": 0.00039513553301116913, + "loss": 5.2833, + "step": 95000 + }, + { + "epoch": 0.61125868083336, + "grad_norm": 0.287109375, + "learning_rate": 0.00039510993055333316, + "loss": 5.2893, + "step": 95500 + }, + { + "epoch": 0.614458988062854, + "grad_norm": 0.283203125, + "learning_rate": 0.0003950843280954972, + "loss": 5.2882, + "step": 96000 + }, + { + "epoch": 0.6176592952923481, + "grad_norm": 0.302734375, + "learning_rate": 0.00039505872563766123, + "loss": 5.2862, + "step": 96500 + }, + { + "epoch": 0.6208596025218421, + "grad_norm": 0.298828125, + "learning_rate": 0.0003950331231798253, + "loss": 5.2826, + "step": 97000 + }, + { + "epoch": 0.6240599097513362, + "grad_norm": 0.306640625, + "learning_rate": 0.00039500752072198935, + "loss": 5.2812, + "step": 97500 + }, + { + "epoch": 0.6272602169808301, + "grad_norm": 0.291015625, + "learning_rate": 0.0003949819182641534, + "loss": 5.2875, + "step": 98000 + }, + { + "epoch": 0.6304605242103242, + "grad_norm": 0.29296875, + "learning_rate": 0.0003949563158063174, + "loss": 5.2858, + "step": 98500 + }, + { + "epoch": 0.6336608314398182, + "grad_norm": 0.30078125, + "learning_rate": 0.00039493071334848146, + "loss": 5.2816, + "step": 99000 + }, + { + "epoch": 0.6368611386693123, + "grad_norm": 0.294921875, + "learning_rate": 0.0003949051108906455, + "loss": 5.2786, + "step": 99500 + }, + { + "epoch": 0.6400614458988063, + "grad_norm": 0.28125, + "learning_rate": 0.0003948795084328096, + "loss": 5.2865, + "step": 100000 + }, + { + "epoch": 0.6432617531283004, + "grad_norm": 0.294921875, + "learning_rate": 0.0003948539059749736, + "loss": 5.2846, + "step": 100500 + }, + { + "epoch": 0.6464620603577943, + "grad_norm": 0.310546875, + "learning_rate": 0.00039482830351713765, + "loss": 5.2812, + "step": 101000 + }, + { + "epoch": 0.6496623675872883, + "grad_norm": 0.3125, + "learning_rate": 0.0003948027010593017, + "loss": 5.2776, + "step": 101500 + }, + { + "epoch": 0.6528626748167824, + "grad_norm": 0.30859375, + "learning_rate": 0.00039477709860146577, + "loss": 5.2828, + "step": 102000 + }, + { + "epoch": 0.6560629820462764, + "grad_norm": 0.30859375, + "learning_rate": 0.0003947514961436298, + "loss": 5.28, + "step": 102500 + }, + { + "epoch": 0.6592632892757705, + "grad_norm": 0.29296875, + "learning_rate": 0.0003947258936857939, + "loss": 5.2795, + "step": 103000 + }, + { + "epoch": 0.6624635965052645, + "grad_norm": 0.287109375, + "learning_rate": 0.0003947002912279579, + "loss": 5.2777, + "step": 103500 + }, + { + "epoch": 0.6656639037347586, + "grad_norm": 0.30859375, + "learning_rate": 0.00039467468877012196, + "loss": 5.2839, + "step": 104000 + }, + { + "epoch": 0.6688642109642525, + "grad_norm": 0.298828125, + "learning_rate": 0.000394649086312286, + "loss": 5.2749, + "step": 104500 + }, + { + "epoch": 0.6720645181937466, + "grad_norm": 0.283203125, + "learning_rate": 0.00039462348385445003, + "loss": 5.2791, + "step": 105000 + }, + { + "epoch": 0.6752648254232406, + "grad_norm": 0.3125, + "learning_rate": 0.0003945978813966141, + "loss": 5.2742, + "step": 105500 + }, + { + "epoch": 0.6784651326527347, + "grad_norm": 0.29296875, + "learning_rate": 0.00039457227893877815, + "loss": 5.2746, + "step": 106000 + }, + { + "epoch": 0.6816654398822287, + "grad_norm": 0.294921875, + "learning_rate": 0.0003945466764809422, + "loss": 5.2777, + "step": 106500 + }, + { + "epoch": 0.6848657471117228, + "grad_norm": 0.298828125, + "learning_rate": 0.0003945210740231062, + "loss": 5.2717, + "step": 107000 + }, + { + "epoch": 0.6880660543412167, + "grad_norm": 0.314453125, + "learning_rate": 0.00039449547156527025, + "loss": 5.2749, + "step": 107500 + }, + { + "epoch": 0.6912663615707108, + "grad_norm": 0.2890625, + "learning_rate": 0.00039446986910743434, + "loss": 5.2783, + "step": 108000 + }, + { + "epoch": 0.6944666688002048, + "grad_norm": 0.306640625, + "learning_rate": 0.0003944442666495984, + "loss": 5.2739, + "step": 108500 + }, + { + "epoch": 0.6976669760296988, + "grad_norm": 0.333984375, + "learning_rate": 0.0003944186641917624, + "loss": 5.2767, + "step": 109000 + }, + { + "epoch": 0.7008672832591929, + "grad_norm": 0.3046875, + "learning_rate": 0.00039439306173392645, + "loss": 5.2801, + "step": 109500 + }, + { + "epoch": 0.704067590488687, + "grad_norm": 0.302734375, + "learning_rate": 0.00039436745927609053, + "loss": 5.2789, + "step": 110000 + }, + { + "epoch": 0.707267897718181, + "grad_norm": 0.302734375, + "learning_rate": 0.00039434185681825457, + "loss": 5.2754, + "step": 110500 + }, + { + "epoch": 0.7104682049476749, + "grad_norm": 0.3046875, + "learning_rate": 0.00039431625436041866, + "loss": 5.2722, + "step": 111000 + }, + { + "epoch": 0.713668512177169, + "grad_norm": 0.298828125, + "learning_rate": 0.0003942906519025827, + "loss": 5.2671, + "step": 111500 + }, + { + "epoch": 0.716868819406663, + "grad_norm": 0.314453125, + "learning_rate": 0.0003942650494447467, + "loss": 5.27, + "step": 112000 + }, + { + "epoch": 0.7200691266361571, + "grad_norm": 0.294921875, + "learning_rate": 0.00039423944698691076, + "loss": 5.2712, + "step": 112500 + }, + { + "epoch": 0.7232694338656511, + "grad_norm": 0.296875, + "learning_rate": 0.0003942138445290748, + "loss": 5.272, + "step": 113000 + }, + { + "epoch": 0.7264697410951452, + "grad_norm": 0.30859375, + "learning_rate": 0.0003941882420712389, + "loss": 5.2714, + "step": 113500 + }, + { + "epoch": 0.7296700483246392, + "grad_norm": 0.310546875, + "learning_rate": 0.0003941626396134029, + "loss": 5.266, + "step": 114000 + }, + { + "epoch": 0.7328703555541332, + "grad_norm": 0.3125, + "learning_rate": 0.00039413703715556695, + "loss": 5.2666, + "step": 114500 + }, + { + "epoch": 0.7360706627836272, + "grad_norm": 0.302734375, + "learning_rate": 0.000394111434697731, + "loss": 5.2648, + "step": 115000 + }, + { + "epoch": 0.7392709700131213, + "grad_norm": 0.294921875, + "learning_rate": 0.000394085832239895, + "loss": 5.266, + "step": 115500 + }, + { + "epoch": 0.7424712772426153, + "grad_norm": 0.32421875, + "learning_rate": 0.00039406022978205905, + "loss": 5.2665, + "step": 116000 + }, + { + "epoch": 0.7456715844721094, + "grad_norm": 0.296875, + "learning_rate": 0.00039403462732422314, + "loss": 5.269, + "step": 116500 + }, + { + "epoch": 0.7488718917016034, + "grad_norm": 0.302734375, + "learning_rate": 0.0003940090248663872, + "loss": 5.2671, + "step": 117000 + }, + { + "epoch": 0.7520721989310973, + "grad_norm": 0.306640625, + "learning_rate": 0.00039398342240855126, + "loss": 5.2694, + "step": 117500 + }, + { + "epoch": 0.7552725061605914, + "grad_norm": 0.3046875, + "learning_rate": 0.0003939578199507153, + "loss": 5.2679, + "step": 118000 + }, + { + "epoch": 0.7584728133900854, + "grad_norm": 0.30859375, + "learning_rate": 0.00039393221749287933, + "loss": 5.2682, + "step": 118500 + }, + { + "epoch": 0.7616731206195795, + "grad_norm": 0.326171875, + "learning_rate": 0.0003939066150350434, + "loss": 5.2669, + "step": 119000 + }, + { + "epoch": 0.7648734278490735, + "grad_norm": 0.30859375, + "learning_rate": 0.00039388101257720746, + "loss": 5.2708, + "step": 119500 + }, + { + "epoch": 0.7680737350785676, + "grad_norm": 0.306640625, + "learning_rate": 0.0003938554101193715, + "loss": 5.2702, + "step": 120000 + }, + { + "epoch": 0.7712740423080616, + "grad_norm": 0.30078125, + "learning_rate": 0.0003938298076615355, + "loss": 5.2592, + "step": 120500 + }, + { + "epoch": 0.7744743495375556, + "grad_norm": 0.306640625, + "learning_rate": 0.00039380420520369956, + "loss": 5.261, + "step": 121000 + }, + { + "epoch": 0.7776746567670496, + "grad_norm": 0.3125, + "learning_rate": 0.0003937786027458636, + "loss": 5.2653, + "step": 121500 + }, + { + "epoch": 0.7808749639965437, + "grad_norm": 0.310546875, + "learning_rate": 0.0003937530002880277, + "loss": 5.2716, + "step": 122000 + }, + { + "epoch": 0.7840752712260377, + "grad_norm": 0.322265625, + "learning_rate": 0.0003937273978301917, + "loss": 5.2643, + "step": 122500 + }, + { + "epoch": 0.7872755784555318, + "grad_norm": 0.314453125, + "learning_rate": 0.00039370179537235575, + "loss": 5.2618, + "step": 123000 + }, + { + "epoch": 0.7904758856850258, + "grad_norm": 0.306640625, + "learning_rate": 0.0003936761929145198, + "loss": 5.2592, + "step": 123500 + }, + { + "epoch": 0.7936761929145197, + "grad_norm": 0.30859375, + "learning_rate": 0.0003936505904566838, + "loss": 5.262, + "step": 124000 + }, + { + "epoch": 0.7968765001440138, + "grad_norm": 0.294921875, + "learning_rate": 0.0003936249879988479, + "loss": 5.2561, + "step": 124500 + }, + { + "epoch": 0.8000768073735078, + "grad_norm": 0.318359375, + "learning_rate": 0.00039359938554101194, + "loss": 5.2625, + "step": 125000 + }, + { + "epoch": 0.8032771146030019, + "grad_norm": 0.318359375, + "learning_rate": 0.00039357378308317603, + "loss": 5.2566, + "step": 125500 + }, + { + "epoch": 0.8064774218324959, + "grad_norm": 0.298828125, + "learning_rate": 0.00039354818062534006, + "loss": 5.2632, + "step": 126000 + }, + { + "epoch": 0.80967772906199, + "grad_norm": 0.322265625, + "learning_rate": 0.0003935225781675041, + "loss": 5.2591, + "step": 126500 + }, + { + "epoch": 0.812878036291484, + "grad_norm": 0.3125, + "learning_rate": 0.00039349697570966813, + "loss": 5.26, + "step": 127000 + }, + { + "epoch": 0.816078343520978, + "grad_norm": 0.328125, + "learning_rate": 0.0003934713732518322, + "loss": 5.261, + "step": 127500 + }, + { + "epoch": 0.819278650750472, + "grad_norm": 0.30859375, + "learning_rate": 0.00039344577079399625, + "loss": 5.2639, + "step": 128000 + }, + { + "epoch": 0.8224789579799661, + "grad_norm": 0.294921875, + "learning_rate": 0.0003934201683361603, + "loss": 5.258, + "step": 128500 + }, + { + "epoch": 0.8256792652094601, + "grad_norm": 0.3125, + "learning_rate": 0.0003933945658783243, + "loss": 5.2634, + "step": 129000 + }, + { + "epoch": 0.8288795724389542, + "grad_norm": 0.30859375, + "learning_rate": 0.00039336896342048836, + "loss": 5.2536, + "step": 129500 + }, + { + "epoch": 0.8320798796684482, + "grad_norm": 0.3046875, + "learning_rate": 0.00039334336096265245, + "loss": 5.259, + "step": 130000 + }, + { + "epoch": 0.8352801868979423, + "grad_norm": 0.318359375, + "learning_rate": 0.0003933177585048165, + "loss": 5.256, + "step": 130500 + }, + { + "epoch": 0.8384804941274362, + "grad_norm": 0.314453125, + "learning_rate": 0.0003932921560469805, + "loss": 5.263, + "step": 131000 + }, + { + "epoch": 0.8416808013569302, + "grad_norm": 0.291015625, + "learning_rate": 0.00039326655358914455, + "loss": 5.2594, + "step": 131500 + }, + { + "epoch": 0.8448811085864243, + "grad_norm": 0.32421875, + "learning_rate": 0.00039324095113130864, + "loss": 5.254, + "step": 132000 + }, + { + "epoch": 0.8480814158159183, + "grad_norm": 0.32421875, + "learning_rate": 0.00039321534867347267, + "loss": 5.2611, + "step": 132500 + }, + { + "epoch": 0.8512817230454124, + "grad_norm": 0.310546875, + "learning_rate": 0.00039318974621563676, + "loss": 5.2558, + "step": 133000 + }, + { + "epoch": 0.8544820302749064, + "grad_norm": 0.30859375, + "learning_rate": 0.0003931641437578008, + "loss": 5.2568, + "step": 133500 + }, + { + "epoch": 0.8576823375044004, + "grad_norm": 0.3359375, + "learning_rate": 0.00039313854129996483, + "loss": 5.2554, + "step": 134000 + }, + { + "epoch": 0.8608826447338944, + "grad_norm": 0.322265625, + "learning_rate": 0.00039311293884212886, + "loss": 5.2527, + "step": 134500 + }, + { + "epoch": 0.8640829519633885, + "grad_norm": 0.3125, + "learning_rate": 0.0003930873363842929, + "loss": 5.2552, + "step": 135000 + }, + { + "epoch": 0.8672832591928825, + "grad_norm": 0.30078125, + "learning_rate": 0.000393061733926457, + "loss": 5.2521, + "step": 135500 + }, + { + "epoch": 0.8704835664223766, + "grad_norm": 0.33203125, + "learning_rate": 0.000393036131468621, + "loss": 5.2537, + "step": 136000 + }, + { + "epoch": 0.8736838736518706, + "grad_norm": 0.3203125, + "learning_rate": 0.00039301052901078505, + "loss": 5.2509, + "step": 136500 + }, + { + "epoch": 0.8768841808813647, + "grad_norm": 0.318359375, + "learning_rate": 0.0003929849265529491, + "loss": 5.2577, + "step": 137000 + }, + { + "epoch": 0.8800844881108586, + "grad_norm": 0.3125, + "learning_rate": 0.0003929593240951131, + "loss": 5.2591, + "step": 137500 + }, + { + "epoch": 0.8832847953403526, + "grad_norm": 0.326171875, + "learning_rate": 0.0003929337216372772, + "loss": 5.2543, + "step": 138000 + }, + { + "epoch": 0.8864851025698467, + "grad_norm": 0.31640625, + "learning_rate": 0.00039290811917944124, + "loss": 5.2522, + "step": 138500 + }, + { + "epoch": 0.8896854097993407, + "grad_norm": 0.337890625, + "learning_rate": 0.0003928825167216053, + "loss": 5.2445, + "step": 139000 + }, + { + "epoch": 0.8928857170288348, + "grad_norm": 0.3203125, + "learning_rate": 0.0003928569142637693, + "loss": 5.2539, + "step": 139500 + }, + { + "epoch": 0.8960860242583288, + "grad_norm": 0.30859375, + "learning_rate": 0.0003928313118059334, + "loss": 5.2501, + "step": 140000 + }, + { + "epoch": 0.8992863314878228, + "grad_norm": 0.32421875, + "learning_rate": 0.00039280570934809744, + "loss": 5.256, + "step": 140500 + }, + { + "epoch": 0.9024866387173168, + "grad_norm": 0.326171875, + "learning_rate": 0.0003927801068902615, + "loss": 5.256, + "step": 141000 + }, + { + "epoch": 0.9056869459468109, + "grad_norm": 0.3203125, + "learning_rate": 0.00039275450443242556, + "loss": 5.253, + "step": 141500 + }, + { + "epoch": 0.9088872531763049, + "grad_norm": 0.30859375, + "learning_rate": 0.0003927289019745896, + "loss": 5.25, + "step": 142000 + }, + { + "epoch": 0.912087560405799, + "grad_norm": 0.318359375, + "learning_rate": 0.00039270329951675363, + "loss": 5.2534, + "step": 142500 + }, + { + "epoch": 0.915287867635293, + "grad_norm": 0.32421875, + "learning_rate": 0.00039267769705891766, + "loss": 5.253, + "step": 143000 + }, + { + "epoch": 0.9184881748647871, + "grad_norm": 0.3203125, + "learning_rate": 0.00039265209460108175, + "loss": 5.2497, + "step": 143500 + }, + { + "epoch": 0.921688482094281, + "grad_norm": 0.314453125, + "learning_rate": 0.0003926264921432458, + "loss": 5.2563, + "step": 144000 + }, + { + "epoch": 0.924888789323775, + "grad_norm": 0.30078125, + "learning_rate": 0.0003926008896854098, + "loss": 5.2449, + "step": 144500 + }, + { + "epoch": 0.9280890965532691, + "grad_norm": 0.326171875, + "learning_rate": 0.00039257528722757385, + "loss": 5.2514, + "step": 145000 + }, + { + "epoch": 0.9312894037827631, + "grad_norm": 0.3046875, + "learning_rate": 0.0003925496847697379, + "loss": 5.2508, + "step": 145500 + }, + { + "epoch": 0.9344897110122572, + "grad_norm": 0.330078125, + "learning_rate": 0.0003925240823119019, + "loss": 5.2492, + "step": 146000 + }, + { + "epoch": 0.9376900182417512, + "grad_norm": 0.32421875, + "learning_rate": 0.000392498479854066, + "loss": 5.2414, + "step": 146500 + }, + { + "epoch": 0.9408903254712453, + "grad_norm": 0.330078125, + "learning_rate": 0.00039247287739623004, + "loss": 5.2449, + "step": 147000 + }, + { + "epoch": 0.9440906327007392, + "grad_norm": 0.310546875, + "learning_rate": 0.0003924472749383941, + "loss": 5.2465, + "step": 147500 + }, + { + "epoch": 0.9472909399302333, + "grad_norm": 0.322265625, + "learning_rate": 0.00039242167248055817, + "loss": 5.246, + "step": 148000 + }, + { + "epoch": 0.9504912471597273, + "grad_norm": 0.3046875, + "learning_rate": 0.0003923960700227222, + "loss": 5.2489, + "step": 148500 + }, + { + "epoch": 0.9536915543892214, + "grad_norm": 0.341796875, + "learning_rate": 0.0003923704675648863, + "loss": 5.2438, + "step": 149000 + }, + { + "epoch": 0.9568918616187154, + "grad_norm": 0.318359375, + "learning_rate": 0.0003923448651070503, + "loss": 5.2472, + "step": 149500 + }, + { + "epoch": 0.9600921688482095, + "grad_norm": 0.314453125, + "learning_rate": 0.00039231926264921436, + "loss": 5.25, + "step": 150000 + }, + { + "epoch": 0.9632924760777034, + "grad_norm": 0.29296875, + "learning_rate": 0.0003922936601913784, + "loss": 5.2486, + "step": 150500 + }, + { + "epoch": 0.9664927833071975, + "grad_norm": 0.330078125, + "learning_rate": 0.0003922680577335424, + "loss": 5.2468, + "step": 151000 + }, + { + "epoch": 0.9696930905366915, + "grad_norm": 0.3203125, + "learning_rate": 0.00039224245527570646, + "loss": 5.2491, + "step": 151500 + }, + { + "epoch": 0.9728933977661856, + "grad_norm": 0.314453125, + "learning_rate": 0.00039221685281787055, + "loss": 5.2458, + "step": 152000 + }, + { + "epoch": 0.9760937049956796, + "grad_norm": 0.318359375, + "learning_rate": 0.0003921912503600346, + "loss": 5.2511, + "step": 152500 + }, + { + "epoch": 0.9792940122251736, + "grad_norm": 0.318359375, + "learning_rate": 0.0003921656479021986, + "loss": 5.2354, + "step": 153000 + }, + { + "epoch": 0.9824943194546677, + "grad_norm": 0.333984375, + "learning_rate": 0.00039214004544436265, + "loss": 5.2433, + "step": 153500 + }, + { + "epoch": 0.9856946266841616, + "grad_norm": 0.326171875, + "learning_rate": 0.0003921144429865267, + "loss": 5.2477, + "step": 154000 + }, + { + "epoch": 0.9888949339136557, + "grad_norm": 0.318359375, + "learning_rate": 0.0003920888405286908, + "loss": 5.2436, + "step": 154500 + }, + { + "epoch": 0.9920952411431497, + "grad_norm": 0.318359375, + "learning_rate": 0.0003920632380708548, + "loss": 5.2468, + "step": 155000 + }, + { + "epoch": 0.9952955483726438, + "grad_norm": 0.322265625, + "learning_rate": 0.0003920376356130189, + "loss": 5.2409, + "step": 155500 + }, + { + "epoch": 0.9984958556021378, + "grad_norm": 0.330078125, + "learning_rate": 0.00039201203315518293, + "loss": 5.2425, + "step": 156000 + }, + { + "epoch": 1.0, + "eval_loss": 5.233697414398193, + "eval_runtime": 1.6051, + "eval_samples_per_second": 623.012, + "eval_steps_per_second": 9.968, + "step": 156235 + }, + { + "epoch": 1.0016961628316319, + "grad_norm": 0.314453125, + "learning_rate": 0.00039198643069734697, + "loss": 5.2388, + "step": 156500 + }, + { + "epoch": 1.0048964700611258, + "grad_norm": 0.328125, + "learning_rate": 0.000391960828239511, + "loss": 5.2386, + "step": 157000 + }, + { + "epoch": 1.00809677729062, + "grad_norm": 0.3046875, + "learning_rate": 0.0003919352257816751, + "loss": 5.2406, + "step": 157500 + }, + { + "epoch": 1.011297084520114, + "grad_norm": 0.337890625, + "learning_rate": 0.0003919096233238391, + "loss": 5.2425, + "step": 158000 + }, + { + "epoch": 1.014497391749608, + "grad_norm": 0.337890625, + "learning_rate": 0.00039188402086600316, + "loss": 5.2406, + "step": 158500 + }, + { + "epoch": 1.017697698979102, + "grad_norm": 0.330078125, + "learning_rate": 0.0003918584184081672, + "loss": 5.2416, + "step": 159000 + }, + { + "epoch": 1.020898006208596, + "grad_norm": 0.33984375, + "learning_rate": 0.0003918328159503312, + "loss": 5.242, + "step": 159500 + }, + { + "epoch": 1.02409831343809, + "grad_norm": 0.32421875, + "learning_rate": 0.0003918072134924953, + "loss": 5.2358, + "step": 160000 + }, + { + "epoch": 1.027298620667584, + "grad_norm": 0.341796875, + "learning_rate": 0.00039178161103465935, + "loss": 5.2389, + "step": 160500 + }, + { + "epoch": 1.0304989278970782, + "grad_norm": 0.333984375, + "learning_rate": 0.0003917560085768234, + "loss": 5.2403, + "step": 161000 + }, + { + "epoch": 1.0336992351265721, + "grad_norm": 0.318359375, + "learning_rate": 0.0003917304061189874, + "loss": 5.238, + "step": 161500 + }, + { + "epoch": 1.0368995423560663, + "grad_norm": 0.3828125, + "learning_rate": 0.00039170480366115145, + "loss": 5.2392, + "step": 162000 + }, + { + "epoch": 1.0400998495855602, + "grad_norm": 0.333984375, + "learning_rate": 0.00039167920120331554, + "loss": 5.2381, + "step": 162500 + }, + { + "epoch": 1.0433001568150542, + "grad_norm": 0.31640625, + "learning_rate": 0.0003916535987454796, + "loss": 5.2354, + "step": 163000 + }, + { + "epoch": 1.0465004640445483, + "grad_norm": 0.3203125, + "learning_rate": 0.00039162799628764366, + "loss": 5.2342, + "step": 163500 + }, + { + "epoch": 1.0497007712740423, + "grad_norm": 0.318359375, + "learning_rate": 0.0003916023938298077, + "loss": 5.2378, + "step": 164000 + }, + { + "epoch": 1.0529010785035364, + "grad_norm": 0.314453125, + "learning_rate": 0.00039157679137197173, + "loss": 5.2414, + "step": 164500 + }, + { + "epoch": 1.0561013857330304, + "grad_norm": 0.337890625, + "learning_rate": 0.00039155118891413576, + "loss": 5.2382, + "step": 165000 + }, + { + "epoch": 1.0593016929625243, + "grad_norm": 0.33203125, + "learning_rate": 0.00039152558645629985, + "loss": 5.2386, + "step": 165500 + }, + { + "epoch": 1.0625020001920185, + "grad_norm": 0.3203125, + "learning_rate": 0.0003914999839984639, + "loss": 5.2348, + "step": 166000 + }, + { + "epoch": 1.0657023074215124, + "grad_norm": 0.32421875, + "learning_rate": 0.0003914743815406279, + "loss": 5.2314, + "step": 166500 + }, + { + "epoch": 1.0689026146510066, + "grad_norm": 0.337890625, + "learning_rate": 0.00039144877908279196, + "loss": 5.23, + "step": 167000 + }, + { + "epoch": 1.0721029218805005, + "grad_norm": 0.31640625, + "learning_rate": 0.000391423176624956, + "loss": 5.2312, + "step": 167500 + }, + { + "epoch": 1.0753032291099947, + "grad_norm": 0.349609375, + "learning_rate": 0.00039139757416712, + "loss": 5.2448, + "step": 168000 + }, + { + "epoch": 1.0785035363394886, + "grad_norm": 0.310546875, + "learning_rate": 0.0003913719717092841, + "loss": 5.2313, + "step": 168500 + }, + { + "epoch": 1.0817038435689825, + "grad_norm": 0.328125, + "learning_rate": 0.00039134636925144815, + "loss": 5.2329, + "step": 169000 + }, + { + "epoch": 1.0849041507984767, + "grad_norm": 0.341796875, + "learning_rate": 0.0003913207667936122, + "loss": 5.2307, + "step": 169500 + }, + { + "epoch": 1.0881044580279706, + "grad_norm": 0.32421875, + "learning_rate": 0.00039129516433577627, + "loss": 5.2377, + "step": 170000 + }, + { + "epoch": 1.0913047652574648, + "grad_norm": 0.33984375, + "learning_rate": 0.0003912695618779403, + "loss": 5.2423, + "step": 170500 + }, + { + "epoch": 1.0945050724869587, + "grad_norm": 0.32421875, + "learning_rate": 0.0003912439594201044, + "loss": 5.2413, + "step": 171000 + }, + { + "epoch": 1.0977053797164529, + "grad_norm": 0.318359375, + "learning_rate": 0.0003912183569622684, + "loss": 5.2345, + "step": 171500 + }, + { + "epoch": 1.1009056869459468, + "grad_norm": 0.326171875, + "learning_rate": 0.00039119275450443246, + "loss": 5.2354, + "step": 172000 + }, + { + "epoch": 1.1041059941754408, + "grad_norm": 0.33203125, + "learning_rate": 0.0003911671520465965, + "loss": 5.2357, + "step": 172500 + }, + { + "epoch": 1.107306301404935, + "grad_norm": 0.310546875, + "learning_rate": 0.00039114154958876053, + "loss": 5.2332, + "step": 173000 + }, + { + "epoch": 1.1105066086344288, + "grad_norm": 0.333984375, + "learning_rate": 0.0003911159471309246, + "loss": 5.2334, + "step": 173500 + }, + { + "epoch": 1.113706915863923, + "grad_norm": 0.3515625, + "learning_rate": 0.00039109034467308865, + "loss": 5.2318, + "step": 174000 + }, + { + "epoch": 1.116907223093417, + "grad_norm": 0.328125, + "learning_rate": 0.0003910647422152527, + "loss": 5.2356, + "step": 174500 + }, + { + "epoch": 1.1201075303229109, + "grad_norm": 0.326171875, + "learning_rate": 0.0003910391397574167, + "loss": 5.232, + "step": 175000 + }, + { + "epoch": 1.123307837552405, + "grad_norm": 0.32421875, + "learning_rate": 0.00039101353729958075, + "loss": 5.233, + "step": 175500 + }, + { + "epoch": 1.126508144781899, + "grad_norm": 0.34375, + "learning_rate": 0.0003909879348417448, + "loss": 5.2262, + "step": 176000 + }, + { + "epoch": 1.1297084520113931, + "grad_norm": 0.345703125, + "learning_rate": 0.0003909623323839089, + "loss": 5.2293, + "step": 176500 + }, + { + "epoch": 1.132908759240887, + "grad_norm": 0.322265625, + "learning_rate": 0.0003909367299260729, + "loss": 5.2283, + "step": 177000 + }, + { + "epoch": 1.1361090664703812, + "grad_norm": 0.337890625, + "learning_rate": 0.00039091112746823695, + "loss": 5.2333, + "step": 177500 + }, + { + "epoch": 1.1393093736998752, + "grad_norm": 0.337890625, + "learning_rate": 0.00039088552501040103, + "loss": 5.2351, + "step": 178000 + }, + { + "epoch": 1.1425096809293693, + "grad_norm": 0.330078125, + "learning_rate": 0.00039085992255256507, + "loss": 5.2342, + "step": 178500 + }, + { + "epoch": 1.1457099881588633, + "grad_norm": 0.30859375, + "learning_rate": 0.00039083432009472916, + "loss": 5.2338, + "step": 179000 + }, + { + "epoch": 1.1489102953883572, + "grad_norm": 0.34765625, + "learning_rate": 0.0003908087176368932, + "loss": 5.2308, + "step": 179500 + }, + { + "epoch": 1.1521106026178514, + "grad_norm": 0.3359375, + "learning_rate": 0.0003907831151790572, + "loss": 5.2279, + "step": 180000 + }, + { + "epoch": 1.1553109098473453, + "grad_norm": 0.30859375, + "learning_rate": 0.00039075751272122126, + "loss": 5.2357, + "step": 180500 + }, + { + "epoch": 1.1585112170768395, + "grad_norm": 0.369140625, + "learning_rate": 0.0003907319102633853, + "loss": 5.2233, + "step": 181000 + }, + { + "epoch": 1.1617115243063334, + "grad_norm": 0.326171875, + "learning_rate": 0.00039070630780554933, + "loss": 5.2309, + "step": 181500 + }, + { + "epoch": 1.1649118315358273, + "grad_norm": 0.328125, + "learning_rate": 0.0003906807053477134, + "loss": 5.2303, + "step": 182000 + }, + { + "epoch": 1.1681121387653215, + "grad_norm": 0.326171875, + "learning_rate": 0.00039065510288987745, + "loss": 5.2274, + "step": 182500 + }, + { + "epoch": 1.1713124459948154, + "grad_norm": 0.341796875, + "learning_rate": 0.0003906295004320415, + "loss": 5.2295, + "step": 183000 + }, + { + "epoch": 1.1745127532243096, + "grad_norm": 0.337890625, + "learning_rate": 0.0003906038979742055, + "loss": 5.2258, + "step": 183500 + }, + { + "epoch": 1.1777130604538035, + "grad_norm": 0.341796875, + "learning_rate": 0.00039057829551636955, + "loss": 5.2288, + "step": 184000 + }, + { + "epoch": 1.1809133676832977, + "grad_norm": 0.35546875, + "learning_rate": 0.00039055269305853364, + "loss": 5.2293, + "step": 184500 + }, + { + "epoch": 1.1841136749127916, + "grad_norm": 0.33984375, + "learning_rate": 0.0003905270906006977, + "loss": 5.2275, + "step": 185000 + }, + { + "epoch": 1.1873139821422858, + "grad_norm": 0.345703125, + "learning_rate": 0.00039050148814286176, + "loss": 5.2306, + "step": 185500 + }, + { + "epoch": 1.1905142893717797, + "grad_norm": 0.333984375, + "learning_rate": 0.0003904758856850258, + "loss": 5.2261, + "step": 186000 + }, + { + "epoch": 1.1937145966012737, + "grad_norm": 0.349609375, + "learning_rate": 0.00039045028322718983, + "loss": 5.2227, + "step": 186500 + }, + { + "epoch": 1.1969149038307678, + "grad_norm": 0.34375, + "learning_rate": 0.00039042468076935387, + "loss": 5.2282, + "step": 187000 + }, + { + "epoch": 1.2001152110602618, + "grad_norm": 0.34765625, + "learning_rate": 0.00039039907831151796, + "loss": 5.2293, + "step": 187500 + }, + { + "epoch": 1.203315518289756, + "grad_norm": 0.337890625, + "learning_rate": 0.000390373475853682, + "loss": 5.2249, + "step": 188000 + }, + { + "epoch": 1.2065158255192499, + "grad_norm": 0.3203125, + "learning_rate": 0.000390347873395846, + "loss": 5.2263, + "step": 188500 + }, + { + "epoch": 1.2097161327487438, + "grad_norm": 0.322265625, + "learning_rate": 0.00039032227093801006, + "loss": 5.227, + "step": 189000 + }, + { + "epoch": 1.212916439978238, + "grad_norm": 0.33203125, + "learning_rate": 0.0003902966684801741, + "loss": 5.2288, + "step": 189500 + }, + { + "epoch": 1.2161167472077319, + "grad_norm": 0.337890625, + "learning_rate": 0.0003902710660223382, + "loss": 5.2248, + "step": 190000 + }, + { + "epoch": 1.219317054437226, + "grad_norm": 0.333984375, + "learning_rate": 0.0003902454635645022, + "loss": 5.2225, + "step": 190500 + }, + { + "epoch": 1.22251736166672, + "grad_norm": 0.349609375, + "learning_rate": 0.00039021986110666625, + "loss": 5.2203, + "step": 191000 + }, + { + "epoch": 1.225717668896214, + "grad_norm": 0.333984375, + "learning_rate": 0.0003901942586488303, + "loss": 5.2242, + "step": 191500 + }, + { + "epoch": 1.228917976125708, + "grad_norm": 0.337890625, + "learning_rate": 0.0003901686561909943, + "loss": 5.2195, + "step": 192000 + }, + { + "epoch": 1.232118283355202, + "grad_norm": 0.34765625, + "learning_rate": 0.0003901430537331584, + "loss": 5.2254, + "step": 192500 + }, + { + "epoch": 1.2353185905846962, + "grad_norm": 0.365234375, + "learning_rate": 0.00039011745127532244, + "loss": 5.2238, + "step": 193000 + }, + { + "epoch": 1.2385188978141901, + "grad_norm": 0.33984375, + "learning_rate": 0.00039009184881748653, + "loss": 5.23, + "step": 193500 + }, + { + "epoch": 1.2417192050436843, + "grad_norm": 0.333984375, + "learning_rate": 0.00039006624635965056, + "loss": 5.2221, + "step": 194000 + }, + { + "epoch": 1.2449195122731782, + "grad_norm": 0.328125, + "learning_rate": 0.0003900406439018146, + "loss": 5.2264, + "step": 194500 + }, + { + "epoch": 1.2481198195026724, + "grad_norm": 0.373046875, + "learning_rate": 0.00039001504144397863, + "loss": 5.2214, + "step": 195000 + }, + { + "epoch": 1.2513201267321663, + "grad_norm": 0.32421875, + "learning_rate": 0.0003899894389861427, + "loss": 5.2222, + "step": 195500 + }, + { + "epoch": 1.2545204339616602, + "grad_norm": 0.34375, + "learning_rate": 0.00038996383652830676, + "loss": 5.2248, + "step": 196000 + }, + { + "epoch": 1.2577207411911544, + "grad_norm": 0.33984375, + "learning_rate": 0.0003899382340704708, + "loss": 5.2203, + "step": 196500 + }, + { + "epoch": 1.2609210484206483, + "grad_norm": 0.3515625, + "learning_rate": 0.0003899126316126348, + "loss": 5.222, + "step": 197000 + }, + { + "epoch": 1.2641213556501425, + "grad_norm": 0.333984375, + "learning_rate": 0.00038988702915479886, + "loss": 5.2234, + "step": 197500 + }, + { + "epoch": 1.2673216628796364, + "grad_norm": 0.33984375, + "learning_rate": 0.0003898614266969629, + "loss": 5.2289, + "step": 198000 + }, + { + "epoch": 1.2705219701091304, + "grad_norm": 0.341796875, + "learning_rate": 0.000389835824239127, + "loss": 5.2214, + "step": 198500 + }, + { + "epoch": 1.2737222773386245, + "grad_norm": 0.345703125, + "learning_rate": 0.000389810221781291, + "loss": 5.2235, + "step": 199000 + }, + { + "epoch": 1.2769225845681185, + "grad_norm": 0.333984375, + "learning_rate": 0.00038978461932345505, + "loss": 5.2256, + "step": 199500 + }, + { + "epoch": 1.2801228917976126, + "grad_norm": 0.3515625, + "learning_rate": 0.0003897590168656191, + "loss": 5.2266, + "step": 200000 + }, + { + "epoch": 1.2833231990271066, + "grad_norm": 0.349609375, + "learning_rate": 0.00038973341440778317, + "loss": 5.2288, + "step": 200500 + }, + { + "epoch": 1.2865235062566005, + "grad_norm": 0.3671875, + "learning_rate": 0.0003897078119499472, + "loss": 5.2253, + "step": 201000 + }, + { + "epoch": 1.2897238134860947, + "grad_norm": 0.3359375, + "learning_rate": 0.0003896822094921113, + "loss": 5.2216, + "step": 201500 + }, + { + "epoch": 1.2929241207155888, + "grad_norm": 0.345703125, + "learning_rate": 0.00038965660703427533, + "loss": 5.2229, + "step": 202000 + }, + { + "epoch": 1.2961244279450828, + "grad_norm": 0.357421875, + "learning_rate": 0.00038963100457643936, + "loss": 5.2239, + "step": 202500 + }, + { + "epoch": 1.2993247351745767, + "grad_norm": 0.33984375, + "learning_rate": 0.0003896054021186034, + "loss": 5.217, + "step": 203000 + }, + { + "epoch": 1.3025250424040709, + "grad_norm": 0.36328125, + "learning_rate": 0.00038957979966076743, + "loss": 5.2295, + "step": 203500 + }, + { + "epoch": 1.3057253496335648, + "grad_norm": 0.32421875, + "learning_rate": 0.0003895541972029315, + "loss": 5.2226, + "step": 204000 + }, + { + "epoch": 1.308925656863059, + "grad_norm": 0.3203125, + "learning_rate": 0.00038952859474509555, + "loss": 5.224, + "step": 204500 + }, + { + "epoch": 1.3121259640925529, + "grad_norm": 0.333984375, + "learning_rate": 0.0003895029922872596, + "loss": 5.2216, + "step": 205000 + }, + { + "epoch": 1.3153262713220468, + "grad_norm": 0.349609375, + "learning_rate": 0.0003894773898294236, + "loss": 5.2219, + "step": 205500 + }, + { + "epoch": 1.318526578551541, + "grad_norm": 0.3359375, + "learning_rate": 0.00038945178737158766, + "loss": 5.2175, + "step": 206000 + }, + { + "epoch": 1.321726885781035, + "grad_norm": 0.359375, + "learning_rate": 0.00038942618491375175, + "loss": 5.2167, + "step": 206500 + }, + { + "epoch": 1.324927193010529, + "grad_norm": 0.33203125, + "learning_rate": 0.0003894005824559158, + "loss": 5.2227, + "step": 207000 + }, + { + "epoch": 1.328127500240023, + "grad_norm": 0.359375, + "learning_rate": 0.0003893749799980798, + "loss": 5.2199, + "step": 207500 + }, + { + "epoch": 1.331327807469517, + "grad_norm": 0.33984375, + "learning_rate": 0.0003893493775402439, + "loss": 5.2198, + "step": 208000 + }, + { + "epoch": 1.3345281146990111, + "grad_norm": 0.345703125, + "learning_rate": 0.00038932377508240794, + "loss": 5.2206, + "step": 208500 + }, + { + "epoch": 1.3377284219285053, + "grad_norm": 0.337890625, + "learning_rate": 0.000389298172624572, + "loss": 5.2217, + "step": 209000 + }, + { + "epoch": 1.3409287291579992, + "grad_norm": 0.345703125, + "learning_rate": 0.00038927257016673606, + "loss": 5.2231, + "step": 209500 + }, + { + "epoch": 1.3441290363874931, + "grad_norm": 0.39453125, + "learning_rate": 0.0003892469677089001, + "loss": 5.2211, + "step": 210000 + }, + { + "epoch": 1.3473293436169873, + "grad_norm": 0.33984375, + "learning_rate": 0.00038922136525106413, + "loss": 5.2185, + "step": 210500 + }, + { + "epoch": 1.3505296508464812, + "grad_norm": 0.33984375, + "learning_rate": 0.00038919576279322816, + "loss": 5.2207, + "step": 211000 + }, + { + "epoch": 1.3537299580759754, + "grad_norm": 0.322265625, + "learning_rate": 0.0003891701603353922, + "loss": 5.23, + "step": 211500 + }, + { + "epoch": 1.3569302653054693, + "grad_norm": 0.34765625, + "learning_rate": 0.0003891445578775563, + "loss": 5.2224, + "step": 212000 + }, + { + "epoch": 1.3601305725349633, + "grad_norm": 0.3203125, + "learning_rate": 0.0003891189554197203, + "loss": 5.2179, + "step": 212500 + }, + { + "epoch": 1.3633308797644574, + "grad_norm": 0.34375, + "learning_rate": 0.00038909335296188435, + "loss": 5.2214, + "step": 213000 + }, + { + "epoch": 1.3665311869939514, + "grad_norm": 0.345703125, + "learning_rate": 0.0003890677505040484, + "loss": 5.2253, + "step": 213500 + }, + { + "epoch": 1.3697314942234455, + "grad_norm": 0.3515625, + "learning_rate": 0.0003890421480462124, + "loss": 5.2184, + "step": 214000 + }, + { + "epoch": 1.3729318014529395, + "grad_norm": 0.3515625, + "learning_rate": 0.0003890165455883765, + "loss": 5.2235, + "step": 214500 + }, + { + "epoch": 1.3761321086824334, + "grad_norm": 0.36328125, + "learning_rate": 0.00038899094313054054, + "loss": 5.2209, + "step": 215000 + }, + { + "epoch": 1.3793324159119276, + "grad_norm": 0.33984375, + "learning_rate": 0.0003889653406727046, + "loss": 5.2172, + "step": 215500 + }, + { + "epoch": 1.3825327231414215, + "grad_norm": 0.3671875, + "learning_rate": 0.00038893973821486867, + "loss": 5.2177, + "step": 216000 + }, + { + "epoch": 1.3857330303709157, + "grad_norm": 0.373046875, + "learning_rate": 0.0003889141357570327, + "loss": 5.2171, + "step": 216500 + }, + { + "epoch": 1.3889333376004096, + "grad_norm": 0.33203125, + "learning_rate": 0.00038888853329919674, + "loss": 5.2189, + "step": 217000 + }, + { + "epoch": 1.3921336448299035, + "grad_norm": 0.333984375, + "learning_rate": 0.0003888629308413608, + "loss": 5.2184, + "step": 217500 + }, + { + "epoch": 1.3953339520593977, + "grad_norm": 0.349609375, + "learning_rate": 0.00038883732838352486, + "loss": 5.2201, + "step": 218000 + }, + { + "epoch": 1.3985342592888919, + "grad_norm": 0.353515625, + "learning_rate": 0.0003888117259256889, + "loss": 5.2189, + "step": 218500 + }, + { + "epoch": 1.4017345665183858, + "grad_norm": 0.341796875, + "learning_rate": 0.0003887861234678529, + "loss": 5.219, + "step": 219000 + }, + { + "epoch": 1.4049348737478797, + "grad_norm": 0.34375, + "learning_rate": 0.00038876052101001696, + "loss": 5.2243, + "step": 219500 + }, + { + "epoch": 1.408135180977374, + "grad_norm": 0.337890625, + "learning_rate": 0.00038873491855218105, + "loss": 5.218, + "step": 220000 + }, + { + "epoch": 1.4113354882068678, + "grad_norm": 0.35546875, + "learning_rate": 0.0003887093160943451, + "loss": 5.2208, + "step": 220500 + }, + { + "epoch": 1.414535795436362, + "grad_norm": 0.35546875, + "learning_rate": 0.0003886837136365091, + "loss": 5.2175, + "step": 221000 + }, + { + "epoch": 1.417736102665856, + "grad_norm": 0.349609375, + "learning_rate": 0.00038865811117867315, + "loss": 5.2186, + "step": 221500 + }, + { + "epoch": 1.4209364098953499, + "grad_norm": 0.369140625, + "learning_rate": 0.0003886325087208372, + "loss": 5.215, + "step": 222000 + }, + { + "epoch": 1.424136717124844, + "grad_norm": 0.359375, + "learning_rate": 0.0003886069062630012, + "loss": 5.2184, + "step": 222500 + }, + { + "epoch": 1.427337024354338, + "grad_norm": 0.341796875, + "learning_rate": 0.0003885813038051653, + "loss": 5.2215, + "step": 223000 + }, + { + "epoch": 1.4305373315838321, + "grad_norm": 0.326171875, + "learning_rate": 0.0003885557013473294, + "loss": 5.2166, + "step": 223500 + }, + { + "epoch": 1.433737638813326, + "grad_norm": 0.33984375, + "learning_rate": 0.00038853009888949343, + "loss": 5.2194, + "step": 224000 + }, + { + "epoch": 1.43693794604282, + "grad_norm": 0.33203125, + "learning_rate": 0.00038850449643165747, + "loss": 5.2188, + "step": 224500 + }, + { + "epoch": 1.4401382532723142, + "grad_norm": 0.330078125, + "learning_rate": 0.0003884788939738215, + "loss": 5.216, + "step": 225000 + }, + { + "epoch": 1.4433385605018083, + "grad_norm": 0.357421875, + "learning_rate": 0.0003884532915159856, + "loss": 5.216, + "step": 225500 + }, + { + "epoch": 1.4465388677313022, + "grad_norm": 0.349609375, + "learning_rate": 0.0003884276890581496, + "loss": 5.2194, + "step": 226000 + }, + { + "epoch": 1.4497391749607962, + "grad_norm": 0.359375, + "learning_rate": 0.00038840208660031366, + "loss": 5.2202, + "step": 226500 + }, + { + "epoch": 1.4529394821902903, + "grad_norm": 0.3359375, + "learning_rate": 0.0003883764841424777, + "loss": 5.2207, + "step": 227000 + }, + { + "epoch": 1.4561397894197843, + "grad_norm": 0.34375, + "learning_rate": 0.0003883508816846417, + "loss": 5.2209, + "step": 227500 + }, + { + "epoch": 1.4593400966492784, + "grad_norm": 0.3359375, + "learning_rate": 0.00038832527922680576, + "loss": 5.2265, + "step": 228000 + }, + { + "epoch": 1.4625404038787724, + "grad_norm": 0.35546875, + "learning_rate": 0.00038829967676896985, + "loss": 5.2122, + "step": 228500 + }, + { + "epoch": 1.4657407111082663, + "grad_norm": 0.359375, + "learning_rate": 0.0003882740743111339, + "loss": 5.211, + "step": 229000 + }, + { + "epoch": 1.4689410183377605, + "grad_norm": 0.36328125, + "learning_rate": 0.0003882484718532979, + "loss": 5.2213, + "step": 229500 + }, + { + "epoch": 1.4721413255672544, + "grad_norm": 0.345703125, + "learning_rate": 0.00038822286939546195, + "loss": 5.2215, + "step": 230000 + }, + { + "epoch": 1.4753416327967486, + "grad_norm": 0.33203125, + "learning_rate": 0.00038819726693762604, + "loss": 5.2211, + "step": 230500 + }, + { + "epoch": 1.4785419400262425, + "grad_norm": 0.33984375, + "learning_rate": 0.0003881716644797901, + "loss": 5.2163, + "step": 231000 + }, + { + "epoch": 1.4817422472557364, + "grad_norm": 0.34375, + "learning_rate": 0.00038814606202195416, + "loss": 5.2206, + "step": 231500 + }, + { + "epoch": 1.4849425544852306, + "grad_norm": 0.353515625, + "learning_rate": 0.0003881204595641182, + "loss": 5.2235, + "step": 232000 + }, + { + "epoch": 1.4881428617147245, + "grad_norm": 0.34765625, + "learning_rate": 0.00038809485710628223, + "loss": 5.2167, + "step": 232500 + }, + { + "epoch": 1.4913431689442187, + "grad_norm": 0.330078125, + "learning_rate": 0.00038806925464844626, + "loss": 5.2211, + "step": 233000 + }, + { + "epoch": 1.4945434761737126, + "grad_norm": 0.33984375, + "learning_rate": 0.0003880436521906103, + "loss": 5.2137, + "step": 233500 + }, + { + "epoch": 1.4977437834032066, + "grad_norm": 0.337890625, + "learning_rate": 0.0003880180497327744, + "loss": 5.2173, + "step": 234000 + }, + { + "epoch": 1.5009440906327007, + "grad_norm": 0.361328125, + "learning_rate": 0.0003879924472749384, + "loss": 5.2144, + "step": 234500 + }, + { + "epoch": 1.504144397862195, + "grad_norm": 0.345703125, + "learning_rate": 0.00038796684481710246, + "loss": 5.2201, + "step": 235000 + }, + { + "epoch": 1.5073447050916888, + "grad_norm": 0.36328125, + "learning_rate": 0.0003879412423592665, + "loss": 5.2115, + "step": 235500 + }, + { + "epoch": 1.5105450123211828, + "grad_norm": 0.333984375, + "learning_rate": 0.0003879156399014305, + "loss": 5.2141, + "step": 236000 + }, + { + "epoch": 1.5137453195506767, + "grad_norm": 0.330078125, + "learning_rate": 0.0003878900374435946, + "loss": 5.2155, + "step": 236500 + }, + { + "epoch": 1.5169456267801709, + "grad_norm": 0.34375, + "learning_rate": 0.00038786443498575865, + "loss": 5.2129, + "step": 237000 + }, + { + "epoch": 1.520145934009665, + "grad_norm": 0.36328125, + "learning_rate": 0.0003878388325279227, + "loss": 5.2213, + "step": 237500 + }, + { + "epoch": 1.523346241239159, + "grad_norm": 0.365234375, + "learning_rate": 0.0003878132300700867, + "loss": 5.2209, + "step": 238000 + }, + { + "epoch": 1.526546548468653, + "grad_norm": 0.35546875, + "learning_rate": 0.0003877876276122508, + "loss": 5.2169, + "step": 238500 + }, + { + "epoch": 1.529746855698147, + "grad_norm": 0.37109375, + "learning_rate": 0.00038776202515441484, + "loss": 5.2133, + "step": 239000 + }, + { + "epoch": 1.5329471629276412, + "grad_norm": 0.34375, + "learning_rate": 0.0003877364226965789, + "loss": 5.2157, + "step": 239500 + }, + { + "epoch": 1.5361474701571352, + "grad_norm": 0.37109375, + "learning_rate": 0.00038771082023874296, + "loss": 5.2146, + "step": 240000 + }, + { + "epoch": 1.539347777386629, + "grad_norm": 0.369140625, + "learning_rate": 0.000387685217780907, + "loss": 5.2147, + "step": 240500 + }, + { + "epoch": 1.542548084616123, + "grad_norm": 0.359375, + "learning_rate": 0.00038765961532307103, + "loss": 5.2205, + "step": 241000 + }, + { + "epoch": 1.5457483918456172, + "grad_norm": 0.353515625, + "learning_rate": 0.00038763401286523506, + "loss": 5.215, + "step": 241500 + }, + { + "epoch": 1.5489486990751113, + "grad_norm": 0.357421875, + "learning_rate": 0.00038760841040739915, + "loss": 5.2172, + "step": 242000 + }, + { + "epoch": 1.5521490063046053, + "grad_norm": 0.337890625, + "learning_rate": 0.0003875828079495632, + "loss": 5.2126, + "step": 242500 + }, + { + "epoch": 1.5553493135340992, + "grad_norm": 0.328125, + "learning_rate": 0.0003875572054917272, + "loss": 5.2145, + "step": 243000 + }, + { + "epoch": 1.5585496207635932, + "grad_norm": 0.33984375, + "learning_rate": 0.00038753160303389125, + "loss": 5.2084, + "step": 243500 + }, + { + "epoch": 1.5617499279930873, + "grad_norm": 0.357421875, + "learning_rate": 0.0003875060005760553, + "loss": 5.2203, + "step": 244000 + }, + { + "epoch": 1.5649502352225815, + "grad_norm": 0.35546875, + "learning_rate": 0.0003874803981182193, + "loss": 5.2093, + "step": 244500 + }, + { + "epoch": 1.5681505424520754, + "grad_norm": 0.359375, + "learning_rate": 0.0003874547956603834, + "loss": 5.2172, + "step": 245000 + }, + { + "epoch": 1.5713508496815694, + "grad_norm": 0.322265625, + "learning_rate": 0.00038742919320254745, + "loss": 5.2117, + "step": 245500 + }, + { + "epoch": 1.5745511569110635, + "grad_norm": 0.357421875, + "learning_rate": 0.00038740359074471153, + "loss": 5.2167, + "step": 246000 + }, + { + "epoch": 1.5777514641405574, + "grad_norm": 0.3515625, + "learning_rate": 0.00038737798828687557, + "loss": 5.2152, + "step": 246500 + }, + { + "epoch": 1.5809517713700516, + "grad_norm": 0.337890625, + "learning_rate": 0.0003873523858290396, + "loss": 5.2127, + "step": 247000 + }, + { + "epoch": 1.5841520785995455, + "grad_norm": 0.357421875, + "learning_rate": 0.0003873267833712037, + "loss": 5.2125, + "step": 247500 + }, + { + "epoch": 1.5873523858290395, + "grad_norm": 0.365234375, + "learning_rate": 0.0003873011809133677, + "loss": 5.215, + "step": 248000 + }, + { + "epoch": 1.5905526930585336, + "grad_norm": 0.37890625, + "learning_rate": 0.00038727557845553176, + "loss": 5.2106, + "step": 248500 + }, + { + "epoch": 1.5937530002880278, + "grad_norm": 0.380859375, + "learning_rate": 0.0003872499759976958, + "loss": 5.2123, + "step": 249000 + }, + { + "epoch": 1.5969533075175217, + "grad_norm": 0.361328125, + "learning_rate": 0.00038722437353985983, + "loss": 5.2162, + "step": 249500 + }, + { + "epoch": 1.6001536147470157, + "grad_norm": 0.357421875, + "learning_rate": 0.0003871987710820239, + "loss": 5.2224, + "step": 250000 + }, + { + "epoch": 1.6033539219765096, + "grad_norm": 0.341796875, + "learning_rate": 0.00038717316862418795, + "loss": 5.2105, + "step": 250500 + }, + { + "epoch": 1.6065542292060038, + "grad_norm": 0.33984375, + "learning_rate": 0.000387147566166352, + "loss": 5.2144, + "step": 251000 + }, + { + "epoch": 1.609754536435498, + "grad_norm": 0.38671875, + "learning_rate": 0.000387121963708516, + "loss": 5.2169, + "step": 251500 + }, + { + "epoch": 1.6129548436649919, + "grad_norm": 0.3515625, + "learning_rate": 0.00038709636125068005, + "loss": 5.2141, + "step": 252000 + }, + { + "epoch": 1.6161551508944858, + "grad_norm": 0.353515625, + "learning_rate": 0.0003870707587928441, + "loss": 5.2145, + "step": 252500 + }, + { + "epoch": 1.6193554581239797, + "grad_norm": 0.34375, + "learning_rate": 0.0003870451563350082, + "loss": 5.2192, + "step": 253000 + }, + { + "epoch": 1.622555765353474, + "grad_norm": 0.33984375, + "learning_rate": 0.0003870195538771722, + "loss": 5.2116, + "step": 253500 + }, + { + "epoch": 1.625756072582968, + "grad_norm": 0.330078125, + "learning_rate": 0.0003869939514193363, + "loss": 5.2121, + "step": 254000 + }, + { + "epoch": 1.628956379812462, + "grad_norm": 0.35546875, + "learning_rate": 0.00038696834896150033, + "loss": 5.2086, + "step": 254500 + }, + { + "epoch": 1.632156687041956, + "grad_norm": 0.359375, + "learning_rate": 0.00038694274650366437, + "loss": 5.2137, + "step": 255000 + }, + { + "epoch": 1.63535699427145, + "grad_norm": 0.3359375, + "learning_rate": 0.00038691714404582846, + "loss": 5.2148, + "step": 255500 + }, + { + "epoch": 1.6385573015009443, + "grad_norm": 0.375, + "learning_rate": 0.0003868915415879925, + "loss": 5.2191, + "step": 256000 + }, + { + "epoch": 1.6417576087304382, + "grad_norm": 0.349609375, + "learning_rate": 0.0003868659391301565, + "loss": 5.2119, + "step": 256500 + }, + { + "epoch": 1.6449579159599321, + "grad_norm": 0.39453125, + "learning_rate": 0.00038684033667232056, + "loss": 5.2129, + "step": 257000 + }, + { + "epoch": 1.648158223189426, + "grad_norm": 0.359375, + "learning_rate": 0.0003868147342144846, + "loss": 5.2107, + "step": 257500 + }, + { + "epoch": 1.6513585304189202, + "grad_norm": 0.36328125, + "learning_rate": 0.00038678913175664863, + "loss": 5.2092, + "step": 258000 + }, + { + "epoch": 1.6545588376484144, + "grad_norm": 0.408203125, + "learning_rate": 0.0003867635292988127, + "loss": 5.2142, + "step": 258500 + }, + { + "epoch": 1.6577591448779083, + "grad_norm": 0.375, + "learning_rate": 0.00038673792684097675, + "loss": 5.2097, + "step": 259000 + }, + { + "epoch": 1.6609594521074023, + "grad_norm": 0.361328125, + "learning_rate": 0.0003867123243831408, + "loss": 5.2178, + "step": 259500 + }, + { + "epoch": 1.6641597593368962, + "grad_norm": 0.36328125, + "learning_rate": 0.0003866867219253048, + "loss": 5.2202, + "step": 260000 + }, + { + "epoch": 1.6673600665663904, + "grad_norm": 0.375, + "learning_rate": 0.00038666111946746885, + "loss": 5.2108, + "step": 260500 + }, + { + "epoch": 1.6705603737958845, + "grad_norm": 0.353515625, + "learning_rate": 0.00038663551700963294, + "loss": 5.2111, + "step": 261000 + }, + { + "epoch": 1.6737606810253784, + "grad_norm": 0.36328125, + "learning_rate": 0.00038660991455179703, + "loss": 5.2115, + "step": 261500 + }, + { + "epoch": 1.6769609882548724, + "grad_norm": 0.34375, + "learning_rate": 0.00038658431209396106, + "loss": 5.2137, + "step": 262000 + }, + { + "epoch": 1.6801612954843665, + "grad_norm": 0.369140625, + "learning_rate": 0.0003865587096361251, + "loss": 5.2147, + "step": 262500 + }, + { + "epoch": 1.6833616027138605, + "grad_norm": 0.373046875, + "learning_rate": 0.00038653310717828913, + "loss": 5.2157, + "step": 263000 + }, + { + "epoch": 1.6865619099433546, + "grad_norm": 0.369140625, + "learning_rate": 0.00038650750472045317, + "loss": 5.2153, + "step": 263500 + }, + { + "epoch": 1.6897622171728486, + "grad_norm": 0.345703125, + "learning_rate": 0.00038648190226261726, + "loss": 5.2073, + "step": 264000 + }, + { + "epoch": 1.6929625244023425, + "grad_norm": 0.359375, + "learning_rate": 0.0003864562998047813, + "loss": 5.2185, + "step": 264500 + }, + { + "epoch": 1.6961628316318367, + "grad_norm": 0.359375, + "learning_rate": 0.0003864306973469453, + "loss": 5.2131, + "step": 265000 + }, + { + "epoch": 1.6993631388613308, + "grad_norm": 0.38671875, + "learning_rate": 0.00038640509488910936, + "loss": 5.2177, + "step": 265500 + }, + { + "epoch": 1.7025634460908248, + "grad_norm": 0.3984375, + "learning_rate": 0.0003863794924312734, + "loss": 5.2098, + "step": 266000 + }, + { + "epoch": 1.7057637533203187, + "grad_norm": 0.36328125, + "learning_rate": 0.0003863538899734375, + "loss": 5.2088, + "step": 266500 + }, + { + "epoch": 1.7089640605498126, + "grad_norm": 0.35546875, + "learning_rate": 0.0003863282875156015, + "loss": 5.2095, + "step": 267000 + }, + { + "epoch": 1.7121643677793068, + "grad_norm": 0.36328125, + "learning_rate": 0.00038630268505776555, + "loss": 5.2109, + "step": 267500 + }, + { + "epoch": 1.715364675008801, + "grad_norm": 0.359375, + "learning_rate": 0.0003862770825999296, + "loss": 5.2124, + "step": 268000 + }, + { + "epoch": 1.718564982238295, + "grad_norm": 0.36328125, + "learning_rate": 0.00038625148014209367, + "loss": 5.2093, + "step": 268500 + }, + { + "epoch": 1.7217652894677888, + "grad_norm": 0.365234375, + "learning_rate": 0.0003862258776842577, + "loss": 5.2065, + "step": 269000 + }, + { + "epoch": 1.7249655966972828, + "grad_norm": 0.408203125, + "learning_rate": 0.0003862002752264218, + "loss": 5.2177, + "step": 269500 + }, + { + "epoch": 1.728165903926777, + "grad_norm": 0.361328125, + "learning_rate": 0.00038617467276858583, + "loss": 5.2129, + "step": 270000 + }, + { + "epoch": 1.731366211156271, + "grad_norm": 0.376953125, + "learning_rate": 0.00038614907031074986, + "loss": 5.2104, + "step": 270500 + }, + { + "epoch": 1.734566518385765, + "grad_norm": 0.3515625, + "learning_rate": 0.0003861234678529139, + "loss": 5.2146, + "step": 271000 + }, + { + "epoch": 1.737766825615259, + "grad_norm": 0.341796875, + "learning_rate": 0.00038609786539507793, + "loss": 5.21, + "step": 271500 + }, + { + "epoch": 1.7409671328447531, + "grad_norm": 0.349609375, + "learning_rate": 0.000386072262937242, + "loss": 5.2105, + "step": 272000 + }, + { + "epoch": 1.7441674400742473, + "grad_norm": 0.3671875, + "learning_rate": 0.00038604666047940605, + "loss": 5.2113, + "step": 272500 + }, + { + "epoch": 1.7473677473037412, + "grad_norm": 0.38671875, + "learning_rate": 0.0003860210580215701, + "loss": 5.2127, + "step": 273000 + }, + { + "epoch": 1.7505680545332352, + "grad_norm": 0.3671875, + "learning_rate": 0.0003859954555637341, + "loss": 5.2113, + "step": 273500 + }, + { + "epoch": 1.753768361762729, + "grad_norm": 0.345703125, + "learning_rate": 0.00038596985310589816, + "loss": 5.2137, + "step": 274000 + }, + { + "epoch": 1.7569686689922233, + "grad_norm": 0.3515625, + "learning_rate": 0.0003859442506480622, + "loss": 5.2108, + "step": 274500 + }, + { + "epoch": 1.7601689762217174, + "grad_norm": 0.37890625, + "learning_rate": 0.0003859186481902263, + "loss": 5.2108, + "step": 275000 + }, + { + "epoch": 1.7633692834512114, + "grad_norm": 0.345703125, + "learning_rate": 0.0003858930457323903, + "loss": 5.2107, + "step": 275500 + }, + { + "epoch": 1.7665695906807053, + "grad_norm": 0.34765625, + "learning_rate": 0.00038586744327455435, + "loss": 5.2126, + "step": 276000 + }, + { + "epoch": 1.7697698979101992, + "grad_norm": 0.36328125, + "learning_rate": 0.00038584184081671844, + "loss": 5.2125, + "step": 276500 + }, + { + "epoch": 1.7729702051396934, + "grad_norm": 0.384765625, + "learning_rate": 0.00038581623835888247, + "loss": 5.2136, + "step": 277000 + }, + { + "epoch": 1.7761705123691875, + "grad_norm": 0.369140625, + "learning_rate": 0.00038579063590104656, + "loss": 5.2136, + "step": 277500 + }, + { + "epoch": 1.7793708195986815, + "grad_norm": 0.353515625, + "learning_rate": 0.0003857650334432106, + "loss": 5.2113, + "step": 278000 + }, + { + "epoch": 1.7825711268281754, + "grad_norm": 0.36328125, + "learning_rate": 0.00038573943098537463, + "loss": 5.2126, + "step": 278500 + }, + { + "epoch": 1.7857714340576696, + "grad_norm": 0.3359375, + "learning_rate": 0.00038571382852753866, + "loss": 5.2121, + "step": 279000 + }, + { + "epoch": 1.7889717412871635, + "grad_norm": 0.37890625, + "learning_rate": 0.0003856882260697027, + "loss": 5.2061, + "step": 279500 + }, + { + "epoch": 1.7921720485166577, + "grad_norm": 0.380859375, + "learning_rate": 0.00038566262361186673, + "loss": 5.2132, + "step": 280000 + }, + { + "epoch": 1.7953723557461516, + "grad_norm": 0.359375, + "learning_rate": 0.0003856370211540308, + "loss": 5.2158, + "step": 280500 + }, + { + "epoch": 1.7985726629756456, + "grad_norm": 0.369140625, + "learning_rate": 0.00038561141869619485, + "loss": 5.2113, + "step": 281000 + }, + { + "epoch": 1.8017729702051397, + "grad_norm": 0.34375, + "learning_rate": 0.0003855858162383589, + "loss": 5.2072, + "step": 281500 + }, + { + "epoch": 1.8049732774346339, + "grad_norm": 0.384765625, + "learning_rate": 0.0003855602137805229, + "loss": 5.2132, + "step": 282000 + }, + { + "epoch": 1.8081735846641278, + "grad_norm": 0.369140625, + "learning_rate": 0.00038553461132268696, + "loss": 5.2134, + "step": 282500 + }, + { + "epoch": 1.8113738918936217, + "grad_norm": 0.357421875, + "learning_rate": 0.00038550900886485104, + "loss": 5.2122, + "step": 283000 + }, + { + "epoch": 1.8145741991231157, + "grad_norm": 0.375, + "learning_rate": 0.0003854834064070151, + "loss": 5.2105, + "step": 283500 + }, + { + "epoch": 1.8177745063526098, + "grad_norm": 0.3515625, + "learning_rate": 0.00038545780394917917, + "loss": 5.2108, + "step": 284000 + }, + { + "epoch": 1.820974813582104, + "grad_norm": 0.400390625, + "learning_rate": 0.0003854322014913432, + "loss": 5.2122, + "step": 284500 + }, + { + "epoch": 1.824175120811598, + "grad_norm": 0.359375, + "learning_rate": 0.00038540659903350724, + "loss": 5.2076, + "step": 285000 + }, + { + "epoch": 1.8273754280410919, + "grad_norm": 0.359375, + "learning_rate": 0.0003853809965756713, + "loss": 5.2107, + "step": 285500 + }, + { + "epoch": 1.8305757352705858, + "grad_norm": 0.349609375, + "learning_rate": 0.00038535539411783536, + "loss": 5.2128, + "step": 286000 + }, + { + "epoch": 1.83377604250008, + "grad_norm": 0.365234375, + "learning_rate": 0.0003853297916599994, + "loss": 5.2076, + "step": 286500 + }, + { + "epoch": 1.8369763497295741, + "grad_norm": 0.357421875, + "learning_rate": 0.0003853041892021634, + "loss": 5.2113, + "step": 287000 + }, + { + "epoch": 1.840176656959068, + "grad_norm": 0.365234375, + "learning_rate": 0.00038527858674432746, + "loss": 5.2084, + "step": 287500 + }, + { + "epoch": 1.843376964188562, + "grad_norm": 0.375, + "learning_rate": 0.0003852529842864915, + "loss": 5.2152, + "step": 288000 + }, + { + "epoch": 1.8465772714180562, + "grad_norm": 0.384765625, + "learning_rate": 0.0003852273818286556, + "loss": 5.2082, + "step": 288500 + }, + { + "epoch": 1.8497775786475503, + "grad_norm": 0.37109375, + "learning_rate": 0.0003852017793708196, + "loss": 5.2145, + "step": 289000 + }, + { + "epoch": 1.8529778858770443, + "grad_norm": 0.359375, + "learning_rate": 0.00038517617691298365, + "loss": 5.2083, + "step": 289500 + }, + { + "epoch": 1.8561781931065382, + "grad_norm": 0.36328125, + "learning_rate": 0.0003851505744551477, + "loss": 5.2103, + "step": 290000 + }, + { + "epoch": 1.8593785003360321, + "grad_norm": 0.375, + "learning_rate": 0.0003851249719973117, + "loss": 5.2066, + "step": 290500 + }, + { + "epoch": 1.8625788075655263, + "grad_norm": 0.36328125, + "learning_rate": 0.0003850993695394758, + "loss": 5.2152, + "step": 291000 + }, + { + "epoch": 1.8657791147950205, + "grad_norm": 0.357421875, + "learning_rate": 0.00038507376708163984, + "loss": 5.2144, + "step": 291500 + }, + { + "epoch": 1.8689794220245144, + "grad_norm": 0.369140625, + "learning_rate": 0.00038504816462380393, + "loss": 5.2091, + "step": 292000 + }, + { + "epoch": 1.8721797292540083, + "grad_norm": 0.375, + "learning_rate": 0.00038502256216596797, + "loss": 5.2137, + "step": 292500 + }, + { + "epoch": 1.8753800364835023, + "grad_norm": 0.35546875, + "learning_rate": 0.000384996959708132, + "loss": 5.2023, + "step": 293000 + }, + { + "epoch": 1.8785803437129964, + "grad_norm": 0.353515625, + "learning_rate": 0.00038497135725029603, + "loss": 5.2073, + "step": 293500 + }, + { + "epoch": 1.8817806509424906, + "grad_norm": 0.380859375, + "learning_rate": 0.0003849457547924601, + "loss": 5.2089, + "step": 294000 + }, + { + "epoch": 1.8849809581719845, + "grad_norm": 0.408203125, + "learning_rate": 0.00038492015233462416, + "loss": 5.2069, + "step": 294500 + }, + { + "epoch": 1.8881812654014785, + "grad_norm": 0.408203125, + "learning_rate": 0.0003848945498767882, + "loss": 5.2084, + "step": 295000 + }, + { + "epoch": 1.8913815726309726, + "grad_norm": 0.380859375, + "learning_rate": 0.0003848689474189522, + "loss": 5.2065, + "step": 295500 + }, + { + "epoch": 1.8945818798604666, + "grad_norm": 0.3671875, + "learning_rate": 0.00038484334496111626, + "loss": 5.2123, + "step": 296000 + }, + { + "epoch": 1.8977821870899607, + "grad_norm": 0.357421875, + "learning_rate": 0.00038481774250328035, + "loss": 5.2105, + "step": 296500 + }, + { + "epoch": 1.9009824943194547, + "grad_norm": 0.375, + "learning_rate": 0.0003847921400454444, + "loss": 5.2092, + "step": 297000 + }, + { + "epoch": 1.9041828015489486, + "grad_norm": 0.365234375, + "learning_rate": 0.0003847665375876084, + "loss": 5.2042, + "step": 297500 + }, + { + "epoch": 1.9073831087784427, + "grad_norm": 0.3671875, + "learning_rate": 0.00038474093512977245, + "loss": 5.2083, + "step": 298000 + }, + { + "epoch": 1.910583416007937, + "grad_norm": 0.3984375, + "learning_rate": 0.00038471533267193654, + "loss": 5.2129, + "step": 298500 + }, + { + "epoch": 1.9137837232374308, + "grad_norm": 0.373046875, + "learning_rate": 0.0003846897302141006, + "loss": 5.2136, + "step": 299000 + }, + { + "epoch": 1.9169840304669248, + "grad_norm": 0.349609375, + "learning_rate": 0.00038466412775626466, + "loss": 5.2083, + "step": 299500 + }, + { + "epoch": 1.9201843376964187, + "grad_norm": 0.34765625, + "learning_rate": 0.0003846385252984287, + "loss": 5.2071, + "step": 300000 + }, + { + "epoch": 1.9233846449259129, + "grad_norm": 0.365234375, + "learning_rate": 0.00038461292284059273, + "loss": 5.2018, + "step": 300500 + }, + { + "epoch": 1.926584952155407, + "grad_norm": 0.375, + "learning_rate": 0.00038458732038275676, + "loss": 5.2091, + "step": 301000 + }, + { + "epoch": 1.929785259384901, + "grad_norm": 0.39453125, + "learning_rate": 0.0003845617179249208, + "loss": 5.2078, + "step": 301500 + }, + { + "epoch": 1.932985566614395, + "grad_norm": 0.41796875, + "learning_rate": 0.0003845361154670849, + "loss": 5.2102, + "step": 302000 + }, + { + "epoch": 1.9361858738438888, + "grad_norm": 0.369140625, + "learning_rate": 0.0003845105130092489, + "loss": 5.2069, + "step": 302500 + }, + { + "epoch": 1.939386181073383, + "grad_norm": 0.369140625, + "learning_rate": 0.00038448491055141296, + "loss": 5.2084, + "step": 303000 + }, + { + "epoch": 1.9425864883028772, + "grad_norm": 0.373046875, + "learning_rate": 0.000384459308093577, + "loss": 5.208, + "step": 303500 + }, + { + "epoch": 1.945786795532371, + "grad_norm": 0.376953125, + "learning_rate": 0.000384433705635741, + "loss": 5.2096, + "step": 304000 + }, + { + "epoch": 1.948987102761865, + "grad_norm": 0.361328125, + "learning_rate": 0.00038440810317790506, + "loss": 5.208, + "step": 304500 + }, + { + "epoch": 1.9521874099913592, + "grad_norm": 0.353515625, + "learning_rate": 0.00038438250072006915, + "loss": 5.2065, + "step": 305000 + }, + { + "epoch": 1.9553877172208534, + "grad_norm": 0.369140625, + "learning_rate": 0.0003843568982622332, + "loss": 5.2092, + "step": 305500 + }, + { + "epoch": 1.9585880244503473, + "grad_norm": 0.3515625, + "learning_rate": 0.0003843312958043972, + "loss": 5.2149, + "step": 306000 + }, + { + "epoch": 1.9617883316798412, + "grad_norm": 0.390625, + "learning_rate": 0.0003843056933465613, + "loss": 5.2069, + "step": 306500 + }, + { + "epoch": 1.9649886389093352, + "grad_norm": 0.37109375, + "learning_rate": 0.00038428009088872534, + "loss": 5.2079, + "step": 307000 + }, + { + "epoch": 1.9681889461388293, + "grad_norm": 0.37109375, + "learning_rate": 0.0003842544884308894, + "loss": 5.2044, + "step": 307500 + }, + { + "epoch": 1.9713892533683235, + "grad_norm": 0.37109375, + "learning_rate": 0.00038422888597305346, + "loss": 5.2109, + "step": 308000 + }, + { + "epoch": 1.9745895605978174, + "grad_norm": 0.3828125, + "learning_rate": 0.0003842032835152175, + "loss": 5.2061, + "step": 308500 + }, + { + "epoch": 1.9777898678273114, + "grad_norm": 0.357421875, + "learning_rate": 0.00038417768105738153, + "loss": 5.2095, + "step": 309000 + }, + { + "epoch": 1.9809901750568053, + "grad_norm": 0.357421875, + "learning_rate": 0.00038415207859954556, + "loss": 5.2049, + "step": 309500 + }, + { + "epoch": 1.9841904822862995, + "grad_norm": 0.365234375, + "learning_rate": 0.0003841264761417096, + "loss": 5.2041, + "step": 310000 + }, + { + "epoch": 1.9873907895157936, + "grad_norm": 0.361328125, + "learning_rate": 0.0003841008736838737, + "loss": 5.1995, + "step": 310500 + }, + { + "epoch": 1.9905910967452876, + "grad_norm": 0.365234375, + "learning_rate": 0.0003840752712260377, + "loss": 5.2057, + "step": 311000 + }, + { + "epoch": 1.9937914039747815, + "grad_norm": 0.41796875, + "learning_rate": 0.00038404966876820176, + "loss": 5.212, + "step": 311500 + }, + { + "epoch": 1.9969917112042757, + "grad_norm": 0.376953125, + "learning_rate": 0.0003840240663103658, + "loss": 5.2029, + "step": 312000 + }, + { + "epoch": 2.0, + "eval_loss": 5.198747158050537, + "eval_runtime": 1.6976, + "eval_samples_per_second": 589.07, + "eval_steps_per_second": 9.425, + "step": 312470 + }, + { + "epoch": 2.00019201843377, + "grad_norm": 0.376953125, + "learning_rate": 0.0003839984638525298, + "loss": 5.2133, + "step": 312500 + }, + { + "epoch": 2.0033923256632638, + "grad_norm": 0.375, + "learning_rate": 0.0003839728613946939, + "loss": 5.2076, + "step": 313000 + }, + { + "epoch": 2.0065926328927577, + "grad_norm": 0.37109375, + "learning_rate": 0.00038394725893685795, + "loss": 5.202, + "step": 313500 + }, + { + "epoch": 2.0097929401222516, + "grad_norm": 0.392578125, + "learning_rate": 0.000383921656479022, + "loss": 5.203, + "step": 314000 + }, + { + "epoch": 2.0129932473517456, + "grad_norm": 0.373046875, + "learning_rate": 0.00038389605402118607, + "loss": 5.2092, + "step": 314500 + }, + { + "epoch": 2.01619355458124, + "grad_norm": 0.41015625, + "learning_rate": 0.0003838704515633501, + "loss": 5.1966, + "step": 315000 + }, + { + "epoch": 2.019393861810734, + "grad_norm": 0.369140625, + "learning_rate": 0.00038384484910551414, + "loss": 5.2033, + "step": 315500 + }, + { + "epoch": 2.022594169040228, + "grad_norm": 0.376953125, + "learning_rate": 0.0003838192466476782, + "loss": 5.2001, + "step": 316000 + }, + { + "epoch": 2.0257944762697218, + "grad_norm": 0.357421875, + "learning_rate": 0.00038379364418984226, + "loss": 5.2038, + "step": 316500 + }, + { + "epoch": 2.028994783499216, + "grad_norm": 0.376953125, + "learning_rate": 0.0003837680417320063, + "loss": 5.2041, + "step": 317000 + }, + { + "epoch": 2.03219509072871, + "grad_norm": 0.380859375, + "learning_rate": 0.00038374243927417033, + "loss": 5.199, + "step": 317500 + }, + { + "epoch": 2.035395397958204, + "grad_norm": 0.3515625, + "learning_rate": 0.00038371683681633436, + "loss": 5.2118, + "step": 318000 + }, + { + "epoch": 2.038595705187698, + "grad_norm": 0.3984375, + "learning_rate": 0.00038369123435849845, + "loss": 5.1993, + "step": 318500 + }, + { + "epoch": 2.041796012417192, + "grad_norm": 0.380859375, + "learning_rate": 0.0003836656319006625, + "loss": 5.2074, + "step": 319000 + }, + { + "epoch": 2.0449963196466863, + "grad_norm": 0.392578125, + "learning_rate": 0.0003836400294428265, + "loss": 5.2091, + "step": 319500 + }, + { + "epoch": 2.04819662687618, + "grad_norm": 0.400390625, + "learning_rate": 0.00038361442698499055, + "loss": 5.2067, + "step": 320000 + }, + { + "epoch": 2.051396934105674, + "grad_norm": 0.375, + "learning_rate": 0.0003835888245271546, + "loss": 5.2012, + "step": 320500 + }, + { + "epoch": 2.054597241335168, + "grad_norm": 0.349609375, + "learning_rate": 0.0003835632220693187, + "loss": 5.2013, + "step": 321000 + }, + { + "epoch": 2.057797548564662, + "grad_norm": 0.369140625, + "learning_rate": 0.0003835376196114827, + "loss": 5.2057, + "step": 321500 + }, + { + "epoch": 2.0609978557941564, + "grad_norm": 0.376953125, + "learning_rate": 0.0003835120171536468, + "loss": 5.2053, + "step": 322000 + }, + { + "epoch": 2.0641981630236503, + "grad_norm": 0.384765625, + "learning_rate": 0.00038348641469581083, + "loss": 5.2096, + "step": 322500 + }, + { + "epoch": 2.0673984702531443, + "grad_norm": 0.361328125, + "learning_rate": 0.00038346081223797487, + "loss": 5.2038, + "step": 323000 + }, + { + "epoch": 2.070598777482638, + "grad_norm": 0.416015625, + "learning_rate": 0.0003834352097801389, + "loss": 5.2057, + "step": 323500 + }, + { + "epoch": 2.0737990847121326, + "grad_norm": 0.359375, + "learning_rate": 0.000383409607322303, + "loss": 5.2013, + "step": 324000 + }, + { + "epoch": 2.0769993919416265, + "grad_norm": 0.380859375, + "learning_rate": 0.000383384004864467, + "loss": 5.2081, + "step": 324500 + }, + { + "epoch": 2.0801996991711205, + "grad_norm": 0.40234375, + "learning_rate": 0.00038335840240663106, + "loss": 5.2038, + "step": 325000 + }, + { + "epoch": 2.0834000064006144, + "grad_norm": 0.400390625, + "learning_rate": 0.0003833327999487951, + "loss": 5.2032, + "step": 325500 + }, + { + "epoch": 2.0866003136301083, + "grad_norm": 0.39453125, + "learning_rate": 0.00038330719749095913, + "loss": 5.2018, + "step": 326000 + }, + { + "epoch": 2.0898006208596027, + "grad_norm": 0.369140625, + "learning_rate": 0.0003832815950331232, + "loss": 5.2015, + "step": 326500 + }, + { + "epoch": 2.0930009280890967, + "grad_norm": 0.37109375, + "learning_rate": 0.00038325599257528725, + "loss": 5.1985, + "step": 327000 + }, + { + "epoch": 2.0962012353185906, + "grad_norm": 0.373046875, + "learning_rate": 0.0003832303901174513, + "loss": 5.2005, + "step": 327500 + }, + { + "epoch": 2.0994015425480845, + "grad_norm": 0.400390625, + "learning_rate": 0.0003832047876596153, + "loss": 5.2005, + "step": 328000 + }, + { + "epoch": 2.1026018497775785, + "grad_norm": 0.392578125, + "learning_rate": 0.00038317918520177935, + "loss": 5.2106, + "step": 328500 + }, + { + "epoch": 2.105802157007073, + "grad_norm": 0.376953125, + "learning_rate": 0.00038315358274394344, + "loss": 5.2059, + "step": 329000 + }, + { + "epoch": 2.109002464236567, + "grad_norm": 0.37890625, + "learning_rate": 0.0003831279802861075, + "loss": 5.2012, + "step": 329500 + }, + { + "epoch": 2.1122027714660607, + "grad_norm": 0.375, + "learning_rate": 0.00038310237782827156, + "loss": 5.1964, + "step": 330000 + }, + { + "epoch": 2.1154030786955547, + "grad_norm": 0.380859375, + "learning_rate": 0.0003830767753704356, + "loss": 5.2087, + "step": 330500 + }, + { + "epoch": 2.1186033859250486, + "grad_norm": 0.390625, + "learning_rate": 0.00038305117291259963, + "loss": 5.203, + "step": 331000 + }, + { + "epoch": 2.121803693154543, + "grad_norm": 0.369140625, + "learning_rate": 0.00038302557045476367, + "loss": 5.2044, + "step": 331500 + }, + { + "epoch": 2.125004000384037, + "grad_norm": 0.365234375, + "learning_rate": 0.00038299996799692776, + "loss": 5.2065, + "step": 332000 + }, + { + "epoch": 2.128204307613531, + "grad_norm": 0.365234375, + "learning_rate": 0.0003829743655390918, + "loss": 5.2018, + "step": 332500 + }, + { + "epoch": 2.131404614843025, + "grad_norm": 0.353515625, + "learning_rate": 0.0003829487630812558, + "loss": 5.205, + "step": 333000 + }, + { + "epoch": 2.134604922072519, + "grad_norm": 0.400390625, + "learning_rate": 0.00038292316062341986, + "loss": 5.2031, + "step": 333500 + }, + { + "epoch": 2.137805229302013, + "grad_norm": 0.33984375, + "learning_rate": 0.0003828975581655839, + "loss": 5.2081, + "step": 334000 + }, + { + "epoch": 2.141005536531507, + "grad_norm": 0.373046875, + "learning_rate": 0.0003828719557077479, + "loss": 5.2036, + "step": 334500 + }, + { + "epoch": 2.144205843761001, + "grad_norm": 0.3828125, + "learning_rate": 0.000382846353249912, + "loss": 5.2067, + "step": 335000 + }, + { + "epoch": 2.147406150990495, + "grad_norm": 0.38671875, + "learning_rate": 0.00038282075079207605, + "loss": 5.2016, + "step": 335500 + }, + { + "epoch": 2.1506064582199893, + "grad_norm": 0.37109375, + "learning_rate": 0.0003827951483342401, + "loss": 5.2033, + "step": 336000 + }, + { + "epoch": 2.1538067654494832, + "grad_norm": 0.3515625, + "learning_rate": 0.00038276954587640417, + "loss": 5.2013, + "step": 336500 + }, + { + "epoch": 2.157007072678977, + "grad_norm": 0.361328125, + "learning_rate": 0.0003827439434185682, + "loss": 5.1996, + "step": 337000 + }, + { + "epoch": 2.160207379908471, + "grad_norm": 0.369140625, + "learning_rate": 0.0003827183409607323, + "loss": 5.2038, + "step": 337500 + }, + { + "epoch": 2.163407687137965, + "grad_norm": 0.400390625, + "learning_rate": 0.00038269273850289633, + "loss": 5.2004, + "step": 338000 + }, + { + "epoch": 2.1666079943674594, + "grad_norm": 0.44921875, + "learning_rate": 0.00038266713604506036, + "loss": 5.202, + "step": 338500 + }, + { + "epoch": 2.1698083015969534, + "grad_norm": 0.37890625, + "learning_rate": 0.0003826415335872244, + "loss": 5.1946, + "step": 339000 + }, + { + "epoch": 2.1730086088264473, + "grad_norm": 0.38671875, + "learning_rate": 0.00038261593112938843, + "loss": 5.205, + "step": 339500 + }, + { + "epoch": 2.1762089160559412, + "grad_norm": 0.3671875, + "learning_rate": 0.00038259032867155247, + "loss": 5.2002, + "step": 340000 + }, + { + "epoch": 2.179409223285435, + "grad_norm": 0.40234375, + "learning_rate": 0.00038256472621371655, + "loss": 5.2024, + "step": 340500 + }, + { + "epoch": 2.1826095305149296, + "grad_norm": 0.390625, + "learning_rate": 0.0003825391237558806, + "loss": 5.2081, + "step": 341000 + }, + { + "epoch": 2.1858098377444235, + "grad_norm": 0.4453125, + "learning_rate": 0.0003825135212980446, + "loss": 5.2062, + "step": 341500 + }, + { + "epoch": 2.1890101449739174, + "grad_norm": 0.3671875, + "learning_rate": 0.00038248791884020866, + "loss": 5.202, + "step": 342000 + }, + { + "epoch": 2.1922104522034114, + "grad_norm": 0.3828125, + "learning_rate": 0.0003824623163823727, + "loss": 5.2013, + "step": 342500 + }, + { + "epoch": 2.1954107594329058, + "grad_norm": 0.40625, + "learning_rate": 0.0003824367139245368, + "loss": 5.199, + "step": 343000 + }, + { + "epoch": 2.1986110666623997, + "grad_norm": 0.365234375, + "learning_rate": 0.0003824111114667008, + "loss": 5.2029, + "step": 343500 + }, + { + "epoch": 2.2018113738918936, + "grad_norm": 0.400390625, + "learning_rate": 0.00038238550900886485, + "loss": 5.208, + "step": 344000 + }, + { + "epoch": 2.2050116811213876, + "grad_norm": 0.4375, + "learning_rate": 0.00038235990655102894, + "loss": 5.2012, + "step": 344500 + }, + { + "epoch": 2.2082119883508815, + "grad_norm": 0.384765625, + "learning_rate": 0.00038233430409319297, + "loss": 5.1991, + "step": 345000 + }, + { + "epoch": 2.211412295580376, + "grad_norm": 0.3828125, + "learning_rate": 0.000382308701635357, + "loss": 5.2054, + "step": 345500 + }, + { + "epoch": 2.21461260280987, + "grad_norm": 0.390625, + "learning_rate": 0.0003822830991775211, + "loss": 5.2062, + "step": 346000 + }, + { + "epoch": 2.2178129100393638, + "grad_norm": 0.369140625, + "learning_rate": 0.00038225749671968513, + "loss": 5.202, + "step": 346500 + }, + { + "epoch": 2.2210132172688577, + "grad_norm": 0.388671875, + "learning_rate": 0.00038223189426184916, + "loss": 5.1993, + "step": 347000 + }, + { + "epoch": 2.2242135244983516, + "grad_norm": 0.37109375, + "learning_rate": 0.0003822062918040132, + "loss": 5.1995, + "step": 347500 + }, + { + "epoch": 2.227413831727846, + "grad_norm": 0.384765625, + "learning_rate": 0.00038218068934617723, + "loss": 5.1964, + "step": 348000 + }, + { + "epoch": 2.23061413895734, + "grad_norm": 0.375, + "learning_rate": 0.0003821550868883413, + "loss": 5.2061, + "step": 348500 + }, + { + "epoch": 2.233814446186834, + "grad_norm": 0.37890625, + "learning_rate": 0.00038212948443050535, + "loss": 5.2081, + "step": 349000 + }, + { + "epoch": 2.237014753416328, + "grad_norm": 0.435546875, + "learning_rate": 0.0003821038819726694, + "loss": 5.1967, + "step": 349500 + }, + { + "epoch": 2.2402150606458218, + "grad_norm": 0.37109375, + "learning_rate": 0.0003820782795148334, + "loss": 5.2061, + "step": 350000 + }, + { + "epoch": 2.243415367875316, + "grad_norm": 0.435546875, + "learning_rate": 0.00038205267705699746, + "loss": 5.2018, + "step": 350500 + }, + { + "epoch": 2.24661567510481, + "grad_norm": 0.357421875, + "learning_rate": 0.0003820270745991615, + "loss": 5.2016, + "step": 351000 + }, + { + "epoch": 2.249815982334304, + "grad_norm": 0.390625, + "learning_rate": 0.0003820014721413256, + "loss": 5.2083, + "step": 351500 + }, + { + "epoch": 2.253016289563798, + "grad_norm": 0.421875, + "learning_rate": 0.0003819758696834896, + "loss": 5.2076, + "step": 352000 + }, + { + "epoch": 2.2562165967932923, + "grad_norm": 0.38671875, + "learning_rate": 0.0003819502672256537, + "loss": 5.2068, + "step": 352500 + }, + { + "epoch": 2.2594169040227863, + "grad_norm": 0.400390625, + "learning_rate": 0.00038192466476781774, + "loss": 5.2036, + "step": 353000 + }, + { + "epoch": 2.26261721125228, + "grad_norm": 0.37890625, + "learning_rate": 0.00038189906230998177, + "loss": 5.198, + "step": 353500 + }, + { + "epoch": 2.265817518481774, + "grad_norm": 0.40234375, + "learning_rate": 0.00038187345985214586, + "loss": 5.2034, + "step": 354000 + }, + { + "epoch": 2.269017825711268, + "grad_norm": 0.392578125, + "learning_rate": 0.0003818478573943099, + "loss": 5.2037, + "step": 354500 + }, + { + "epoch": 2.2722181329407625, + "grad_norm": 0.357421875, + "learning_rate": 0.0003818222549364739, + "loss": 5.2039, + "step": 355000 + }, + { + "epoch": 2.2754184401702564, + "grad_norm": 0.412109375, + "learning_rate": 0.00038179665247863796, + "loss": 5.2, + "step": 355500 + }, + { + "epoch": 2.2786187473997503, + "grad_norm": 0.369140625, + "learning_rate": 0.000381771050020802, + "loss": 5.2056, + "step": 356000 + }, + { + "epoch": 2.2818190546292443, + "grad_norm": 0.365234375, + "learning_rate": 0.00038174544756296603, + "loss": 5.2023, + "step": 356500 + }, + { + "epoch": 2.2850193618587387, + "grad_norm": 0.3671875, + "learning_rate": 0.0003817198451051301, + "loss": 5.1957, + "step": 357000 + }, + { + "epoch": 2.2882196690882326, + "grad_norm": 0.37109375, + "learning_rate": 0.00038169424264729415, + "loss": 5.2005, + "step": 357500 + }, + { + "epoch": 2.2914199763177265, + "grad_norm": 0.375, + "learning_rate": 0.0003816686401894582, + "loss": 5.2037, + "step": 358000 + }, + { + "epoch": 2.2946202835472205, + "grad_norm": 0.40234375, + "learning_rate": 0.0003816430377316222, + "loss": 5.2036, + "step": 358500 + }, + { + "epoch": 2.2978205907767144, + "grad_norm": 0.375, + "learning_rate": 0.0003816174352737863, + "loss": 5.2034, + "step": 359000 + }, + { + "epoch": 2.3010208980062083, + "grad_norm": 0.3828125, + "learning_rate": 0.00038159183281595034, + "loss": 5.1988, + "step": 359500 + }, + { + "epoch": 2.3042212052357027, + "grad_norm": 0.404296875, + "learning_rate": 0.00038156623035811443, + "loss": 5.2031, + "step": 360000 + }, + { + "epoch": 2.3074215124651967, + "grad_norm": 0.404296875, + "learning_rate": 0.00038154062790027847, + "loss": 5.1989, + "step": 360500 + }, + { + "epoch": 2.3106218196946906, + "grad_norm": 0.392578125, + "learning_rate": 0.0003815150254424425, + "loss": 5.2058, + "step": 361000 + }, + { + "epoch": 2.3138221269241845, + "grad_norm": 0.376953125, + "learning_rate": 0.00038148942298460653, + "loss": 5.2032, + "step": 361500 + }, + { + "epoch": 2.317022434153679, + "grad_norm": 0.396484375, + "learning_rate": 0.0003814638205267706, + "loss": 5.1991, + "step": 362000 + }, + { + "epoch": 2.320222741383173, + "grad_norm": 0.400390625, + "learning_rate": 0.00038143821806893466, + "loss": 5.2017, + "step": 362500 + }, + { + "epoch": 2.323423048612667, + "grad_norm": 0.40234375, + "learning_rate": 0.0003814126156110987, + "loss": 5.2021, + "step": 363000 + }, + { + "epoch": 2.3266233558421607, + "grad_norm": 0.34765625, + "learning_rate": 0.0003813870131532627, + "loss": 5.2023, + "step": 363500 + }, + { + "epoch": 2.3298236630716547, + "grad_norm": 0.392578125, + "learning_rate": 0.00038136141069542676, + "loss": 5.2085, + "step": 364000 + }, + { + "epoch": 2.333023970301149, + "grad_norm": 0.392578125, + "learning_rate": 0.0003813358082375908, + "loss": 5.2003, + "step": 364500 + }, + { + "epoch": 2.336224277530643, + "grad_norm": 0.375, + "learning_rate": 0.0003813102057797549, + "loss": 5.2016, + "step": 365000 + }, + { + "epoch": 2.339424584760137, + "grad_norm": 0.38671875, + "learning_rate": 0.0003812846033219189, + "loss": 5.2058, + "step": 365500 + }, + { + "epoch": 2.342624891989631, + "grad_norm": 0.359375, + "learning_rate": 0.00038125900086408295, + "loss": 5.2025, + "step": 366000 + }, + { + "epoch": 2.3458251992191252, + "grad_norm": 0.390625, + "learning_rate": 0.000381233398406247, + "loss": 5.1953, + "step": 366500 + }, + { + "epoch": 2.349025506448619, + "grad_norm": 0.38671875, + "learning_rate": 0.0003812077959484111, + "loss": 5.2013, + "step": 367000 + }, + { + "epoch": 2.352225813678113, + "grad_norm": 0.3828125, + "learning_rate": 0.0003811821934905751, + "loss": 5.2023, + "step": 367500 + }, + { + "epoch": 2.355426120907607, + "grad_norm": 0.42578125, + "learning_rate": 0.0003811565910327392, + "loss": 5.2069, + "step": 368000 + }, + { + "epoch": 2.358626428137101, + "grad_norm": 0.396484375, + "learning_rate": 0.00038113098857490323, + "loss": 5.2004, + "step": 368500 + }, + { + "epoch": 2.3618267353665954, + "grad_norm": 0.390625, + "learning_rate": 0.00038110538611706727, + "loss": 5.2035, + "step": 369000 + }, + { + "epoch": 2.3650270425960893, + "grad_norm": 0.400390625, + "learning_rate": 0.0003810797836592313, + "loss": 5.2042, + "step": 369500 + }, + { + "epoch": 2.3682273498255833, + "grad_norm": 0.41796875, + "learning_rate": 0.00038105418120139533, + "loss": 5.1985, + "step": 370000 + }, + { + "epoch": 2.371427657055077, + "grad_norm": 0.408203125, + "learning_rate": 0.0003810285787435594, + "loss": 5.2041, + "step": 370500 + }, + { + "epoch": 2.3746279642845716, + "grad_norm": 0.35546875, + "learning_rate": 0.00038100297628572346, + "loss": 5.1956, + "step": 371000 + }, + { + "epoch": 2.3778282715140655, + "grad_norm": 0.37890625, + "learning_rate": 0.0003809773738278875, + "loss": 5.2046, + "step": 371500 + }, + { + "epoch": 2.3810285787435594, + "grad_norm": 0.390625, + "learning_rate": 0.0003809517713700515, + "loss": 5.1996, + "step": 372000 + }, + { + "epoch": 2.3842288859730534, + "grad_norm": 0.4140625, + "learning_rate": 0.00038092616891221556, + "loss": 5.2018, + "step": 372500 + }, + { + "epoch": 2.3874291932025473, + "grad_norm": 0.373046875, + "learning_rate": 0.00038090056645437965, + "loss": 5.2014, + "step": 373000 + }, + { + "epoch": 2.3906295004320413, + "grad_norm": 0.486328125, + "learning_rate": 0.0003808749639965437, + "loss": 5.1966, + "step": 373500 + }, + { + "epoch": 2.3938298076615356, + "grad_norm": 0.419921875, + "learning_rate": 0.0003808493615387077, + "loss": 5.2013, + "step": 374000 + }, + { + "epoch": 2.3970301148910296, + "grad_norm": 0.404296875, + "learning_rate": 0.0003808237590808718, + "loss": 5.1956, + "step": 374500 + }, + { + "epoch": 2.4002304221205235, + "grad_norm": 0.380859375, + "learning_rate": 0.00038079815662303584, + "loss": 5.1954, + "step": 375000 + }, + { + "epoch": 2.4034307293500174, + "grad_norm": 0.38671875, + "learning_rate": 0.0003807725541651999, + "loss": 5.1928, + "step": 375500 + }, + { + "epoch": 2.406631036579512, + "grad_norm": 0.44140625, + "learning_rate": 0.00038074695170736396, + "loss": 5.2002, + "step": 376000 + }, + { + "epoch": 2.4098313438090058, + "grad_norm": 0.486328125, + "learning_rate": 0.000380721349249528, + "loss": 5.1998, + "step": 376500 + }, + { + "epoch": 2.4130316510384997, + "grad_norm": 0.384765625, + "learning_rate": 0.00038069574679169203, + "loss": 5.1989, + "step": 377000 + }, + { + "epoch": 2.4162319582679936, + "grad_norm": 0.392578125, + "learning_rate": 0.00038067014433385606, + "loss": 5.205, + "step": 377500 + }, + { + "epoch": 2.4194322654974876, + "grad_norm": 0.3828125, + "learning_rate": 0.0003806445418760201, + "loss": 5.2056, + "step": 378000 + }, + { + "epoch": 2.422632572726982, + "grad_norm": 0.376953125, + "learning_rate": 0.0003806189394181842, + "loss": 5.2047, + "step": 378500 + }, + { + "epoch": 2.425832879956476, + "grad_norm": 0.3828125, + "learning_rate": 0.0003805933369603482, + "loss": 5.1933, + "step": 379000 + }, + { + "epoch": 2.42903318718597, + "grad_norm": 0.41796875, + "learning_rate": 0.00038056773450251226, + "loss": 5.1982, + "step": 379500 + }, + { + "epoch": 2.4322334944154638, + "grad_norm": 0.435546875, + "learning_rate": 0.0003805421320446763, + "loss": 5.2007, + "step": 380000 + }, + { + "epoch": 2.435433801644958, + "grad_norm": 0.412109375, + "learning_rate": 0.0003805165295868403, + "loss": 5.2001, + "step": 380500 + }, + { + "epoch": 2.438634108874452, + "grad_norm": 0.3828125, + "learning_rate": 0.00038049092712900436, + "loss": 5.2003, + "step": 381000 + }, + { + "epoch": 2.441834416103946, + "grad_norm": 0.3515625, + "learning_rate": 0.00038046532467116845, + "loss": 5.203, + "step": 381500 + }, + { + "epoch": 2.44503472333344, + "grad_norm": 0.41796875, + "learning_rate": 0.0003804397222133325, + "loss": 5.1956, + "step": 382000 + }, + { + "epoch": 2.448235030562934, + "grad_norm": 0.419921875, + "learning_rate": 0.00038041411975549657, + "loss": 5.1983, + "step": 382500 + }, + { + "epoch": 2.451435337792428, + "grad_norm": 0.4296875, + "learning_rate": 0.0003803885172976606, + "loss": 5.199, + "step": 383000 + }, + { + "epoch": 2.454635645021922, + "grad_norm": 0.396484375, + "learning_rate": 0.00038036291483982464, + "loss": 5.203, + "step": 383500 + }, + { + "epoch": 2.457835952251416, + "grad_norm": 0.3828125, + "learning_rate": 0.0003803373123819887, + "loss": 5.2004, + "step": 384000 + }, + { + "epoch": 2.46103625948091, + "grad_norm": 0.384765625, + "learning_rate": 0.00038031170992415276, + "loss": 5.2037, + "step": 384500 + }, + { + "epoch": 2.464236566710404, + "grad_norm": 0.396484375, + "learning_rate": 0.0003802861074663168, + "loss": 5.1961, + "step": 385000 + }, + { + "epoch": 2.4674368739398984, + "grad_norm": 0.392578125, + "learning_rate": 0.00038026050500848083, + "loss": 5.2003, + "step": 385500 + }, + { + "epoch": 2.4706371811693923, + "grad_norm": 0.392578125, + "learning_rate": 0.00038023490255064486, + "loss": 5.2007, + "step": 386000 + }, + { + "epoch": 2.4738374883988863, + "grad_norm": 0.408203125, + "learning_rate": 0.0003802093000928089, + "loss": 5.1987, + "step": 386500 + }, + { + "epoch": 2.4770377956283802, + "grad_norm": 0.40625, + "learning_rate": 0.000380183697634973, + "loss": 5.1996, + "step": 387000 + }, + { + "epoch": 2.480238102857874, + "grad_norm": 0.392578125, + "learning_rate": 0.000380158095177137, + "loss": 5.1935, + "step": 387500 + }, + { + "epoch": 2.4834384100873685, + "grad_norm": 0.427734375, + "learning_rate": 0.00038013249271930105, + "loss": 5.1977, + "step": 388000 + }, + { + "epoch": 2.4866387173168625, + "grad_norm": 0.390625, + "learning_rate": 0.0003801068902614651, + "loss": 5.2041, + "step": 388500 + }, + { + "epoch": 2.4898390245463564, + "grad_norm": 0.400390625, + "learning_rate": 0.0003800812878036291, + "loss": 5.2037, + "step": 389000 + }, + { + "epoch": 2.4930393317758504, + "grad_norm": 0.41796875, + "learning_rate": 0.0003800556853457932, + "loss": 5.2075, + "step": 389500 + }, + { + "epoch": 2.4962396390053447, + "grad_norm": 0.36328125, + "learning_rate": 0.0003800300828879573, + "loss": 5.2056, + "step": 390000 + }, + { + "epoch": 2.4994399462348387, + "grad_norm": 0.40234375, + "learning_rate": 0.00038000448043012133, + "loss": 5.1986, + "step": 390500 + }, + { + "epoch": 2.5026402534643326, + "grad_norm": 0.443359375, + "learning_rate": 0.00037997887797228537, + "loss": 5.2022, + "step": 391000 + }, + { + "epoch": 2.5058405606938265, + "grad_norm": 0.373046875, + "learning_rate": 0.0003799532755144494, + "loss": 5.2009, + "step": 391500 + }, + { + "epoch": 2.5090408679233205, + "grad_norm": 0.412109375, + "learning_rate": 0.00037992767305661344, + "loss": 5.197, + "step": 392000 + }, + { + "epoch": 2.5122411751528144, + "grad_norm": 0.400390625, + "learning_rate": 0.0003799020705987775, + "loss": 5.2044, + "step": 392500 + }, + { + "epoch": 2.515441482382309, + "grad_norm": 0.4375, + "learning_rate": 0.00037987646814094156, + "loss": 5.2045, + "step": 393000 + }, + { + "epoch": 2.5186417896118027, + "grad_norm": 0.37109375, + "learning_rate": 0.0003798508656831056, + "loss": 5.2081, + "step": 393500 + }, + { + "epoch": 2.5218420968412967, + "grad_norm": 0.453125, + "learning_rate": 0.00037982526322526963, + "loss": 5.2002, + "step": 394000 + }, + { + "epoch": 2.525042404070791, + "grad_norm": 0.404296875, + "learning_rate": 0.00037979966076743366, + "loss": 5.1958, + "step": 394500 + }, + { + "epoch": 2.528242711300285, + "grad_norm": 0.3828125, + "learning_rate": 0.00037977405830959775, + "loss": 5.1946, + "step": 395000 + }, + { + "epoch": 2.531443018529779, + "grad_norm": 0.396484375, + "learning_rate": 0.0003797484558517618, + "loss": 5.2012, + "step": 395500 + }, + { + "epoch": 2.534643325759273, + "grad_norm": 0.41015625, + "learning_rate": 0.0003797228533939258, + "loss": 5.1959, + "step": 396000 + }, + { + "epoch": 2.537843632988767, + "grad_norm": 0.388671875, + "learning_rate": 0.00037969725093608985, + "loss": 5.2028, + "step": 396500 + }, + { + "epoch": 2.5410439402182607, + "grad_norm": 0.38671875, + "learning_rate": 0.00037967164847825394, + "loss": 5.1976, + "step": 397000 + }, + { + "epoch": 2.544244247447755, + "grad_norm": 0.396484375, + "learning_rate": 0.000379646046020418, + "loss": 5.1984, + "step": 397500 + }, + { + "epoch": 2.547444554677249, + "grad_norm": 0.39453125, + "learning_rate": 0.00037962044356258206, + "loss": 5.199, + "step": 398000 + }, + { + "epoch": 2.550644861906743, + "grad_norm": 0.419921875, + "learning_rate": 0.0003795948411047461, + "loss": 5.1985, + "step": 398500 + }, + { + "epoch": 2.553845169136237, + "grad_norm": 0.396484375, + "learning_rate": 0.00037956923864691013, + "loss": 5.2035, + "step": 399000 + }, + { + "epoch": 2.5570454763657313, + "grad_norm": 0.44921875, + "learning_rate": 0.00037954363618907417, + "loss": 5.1987, + "step": 399500 + }, + { + "epoch": 2.5602457835952253, + "grad_norm": 0.38671875, + "learning_rate": 0.0003795180337312382, + "loss": 5.2027, + "step": 400000 + }, + { + "epoch": 2.563446090824719, + "grad_norm": 0.423828125, + "learning_rate": 0.0003794924312734023, + "loss": 5.2047, + "step": 400500 + }, + { + "epoch": 2.566646398054213, + "grad_norm": 0.400390625, + "learning_rate": 0.0003794668288155663, + "loss": 5.2031, + "step": 401000 + }, + { + "epoch": 2.569846705283707, + "grad_norm": 0.388671875, + "learning_rate": 0.00037944122635773036, + "loss": 5.2054, + "step": 401500 + }, + { + "epoch": 2.573047012513201, + "grad_norm": 0.40625, + "learning_rate": 0.0003794156238998944, + "loss": 5.203, + "step": 402000 + }, + { + "epoch": 2.5762473197426954, + "grad_norm": 0.3828125, + "learning_rate": 0.0003793900214420584, + "loss": 5.205, + "step": 402500 + }, + { + "epoch": 2.5794476269721893, + "grad_norm": 0.416015625, + "learning_rate": 0.0003793644189842225, + "loss": 5.1995, + "step": 403000 + }, + { + "epoch": 2.5826479342016833, + "grad_norm": 0.37890625, + "learning_rate": 0.00037933881652638655, + "loss": 5.2005, + "step": 403500 + }, + { + "epoch": 2.5858482414311776, + "grad_norm": 0.365234375, + "learning_rate": 0.0003793132140685506, + "loss": 5.2015, + "step": 404000 + }, + { + "epoch": 2.5890485486606716, + "grad_norm": 0.3828125, + "learning_rate": 0.0003792876116107146, + "loss": 5.201, + "step": 404500 + }, + { + "epoch": 2.5922488558901655, + "grad_norm": 0.400390625, + "learning_rate": 0.0003792620091528787, + "loss": 5.2021, + "step": 405000 + }, + { + "epoch": 2.5954491631196595, + "grad_norm": 0.40234375, + "learning_rate": 0.00037923640669504274, + "loss": 5.209, + "step": 405500 + }, + { + "epoch": 2.5986494703491534, + "grad_norm": 0.41796875, + "learning_rate": 0.00037921080423720683, + "loss": 5.1946, + "step": 406000 + }, + { + "epoch": 2.6018497775786473, + "grad_norm": 0.4296875, + "learning_rate": 0.00037918520177937086, + "loss": 5.1998, + "step": 406500 + }, + { + "epoch": 2.6050500848081417, + "grad_norm": 0.39453125, + "learning_rate": 0.0003791595993215349, + "loss": 5.1969, + "step": 407000 + }, + { + "epoch": 2.6082503920376356, + "grad_norm": 0.404296875, + "learning_rate": 0.00037913399686369893, + "loss": 5.1967, + "step": 407500 + }, + { + "epoch": 2.6114506992671296, + "grad_norm": 0.42578125, + "learning_rate": 0.00037910839440586297, + "loss": 5.2025, + "step": 408000 + }, + { + "epoch": 2.6146510064966235, + "grad_norm": 0.408203125, + "learning_rate": 0.00037908279194802705, + "loss": 5.199, + "step": 408500 + }, + { + "epoch": 2.617851313726118, + "grad_norm": 0.390625, + "learning_rate": 0.0003790571894901911, + "loss": 5.2005, + "step": 409000 + }, + { + "epoch": 2.621051620955612, + "grad_norm": 0.40625, + "learning_rate": 0.0003790315870323551, + "loss": 5.1969, + "step": 409500 + }, + { + "epoch": 2.6242519281851058, + "grad_norm": 0.396484375, + "learning_rate": 0.00037900598457451916, + "loss": 5.205, + "step": 410000 + }, + { + "epoch": 2.6274522354145997, + "grad_norm": 0.37890625, + "learning_rate": 0.0003789803821166832, + "loss": 5.2016, + "step": 410500 + }, + { + "epoch": 2.6306525426440936, + "grad_norm": 0.39453125, + "learning_rate": 0.0003789547796588472, + "loss": 5.2007, + "step": 411000 + }, + { + "epoch": 2.6338528498735876, + "grad_norm": 0.38671875, + "learning_rate": 0.0003789291772010113, + "loss": 5.2053, + "step": 411500 + }, + { + "epoch": 2.637053157103082, + "grad_norm": 0.3671875, + "learning_rate": 0.00037890357474317535, + "loss": 5.1981, + "step": 412000 + }, + { + "epoch": 2.640253464332576, + "grad_norm": 0.40234375, + "learning_rate": 0.00037887797228533944, + "loss": 5.2052, + "step": 412500 + }, + { + "epoch": 2.64345377156207, + "grad_norm": 0.38671875, + "learning_rate": 0.00037885236982750347, + "loss": 5.1987, + "step": 413000 + }, + { + "epoch": 2.6466540787915642, + "grad_norm": 0.3828125, + "learning_rate": 0.0003788267673696675, + "loss": 5.1951, + "step": 413500 + }, + { + "epoch": 2.649854386021058, + "grad_norm": 0.392578125, + "learning_rate": 0.0003788011649118316, + "loss": 5.2001, + "step": 414000 + }, + { + "epoch": 2.653054693250552, + "grad_norm": 0.375, + "learning_rate": 0.00037877556245399563, + "loss": 5.2009, + "step": 414500 + }, + { + "epoch": 2.656255000480046, + "grad_norm": 0.3984375, + "learning_rate": 0.00037874995999615966, + "loss": 5.199, + "step": 415000 + }, + { + "epoch": 2.65945530770954, + "grad_norm": 0.396484375, + "learning_rate": 0.0003787243575383237, + "loss": 5.2029, + "step": 415500 + }, + { + "epoch": 2.662655614939034, + "grad_norm": 0.427734375, + "learning_rate": 0.00037869875508048773, + "loss": 5.1956, + "step": 416000 + }, + { + "epoch": 2.6658559221685283, + "grad_norm": 0.40234375, + "learning_rate": 0.00037867315262265177, + "loss": 5.2, + "step": 416500 + }, + { + "epoch": 2.6690562293980222, + "grad_norm": 0.408203125, + "learning_rate": 0.00037864755016481585, + "loss": 5.1974, + "step": 417000 + }, + { + "epoch": 2.672256536627516, + "grad_norm": 0.396484375, + "learning_rate": 0.0003786219477069799, + "loss": 5.1977, + "step": 417500 + }, + { + "epoch": 2.6754568438570105, + "grad_norm": 0.392578125, + "learning_rate": 0.0003785963452491439, + "loss": 5.1954, + "step": 418000 + }, + { + "epoch": 2.6786571510865045, + "grad_norm": 0.4296875, + "learning_rate": 0.00037857074279130796, + "loss": 5.2027, + "step": 418500 + }, + { + "epoch": 2.6818574583159984, + "grad_norm": 0.419921875, + "learning_rate": 0.000378545140333472, + "loss": 5.1943, + "step": 419000 + }, + { + "epoch": 2.6850577655454924, + "grad_norm": 0.404296875, + "learning_rate": 0.0003785195378756361, + "loss": 5.1955, + "step": 419500 + }, + { + "epoch": 2.6882580727749863, + "grad_norm": 0.41015625, + "learning_rate": 0.0003784939354178001, + "loss": 5.206, + "step": 420000 + }, + { + "epoch": 2.6914583800044802, + "grad_norm": 0.38671875, + "learning_rate": 0.0003784683329599642, + "loss": 5.2077, + "step": 420500 + }, + { + "epoch": 2.6946586872339746, + "grad_norm": 0.392578125, + "learning_rate": 0.00037844273050212824, + "loss": 5.2, + "step": 421000 + }, + { + "epoch": 2.6978589944634686, + "grad_norm": 0.384765625, + "learning_rate": 0.00037841712804429227, + "loss": 5.1981, + "step": 421500 + }, + { + "epoch": 2.7010593016929625, + "grad_norm": 0.404296875, + "learning_rate": 0.0003783915255864563, + "loss": 5.2035, + "step": 422000 + }, + { + "epoch": 2.7042596089224564, + "grad_norm": 0.416015625, + "learning_rate": 0.0003783659231286204, + "loss": 5.2, + "step": 422500 + }, + { + "epoch": 2.707459916151951, + "grad_norm": 0.412109375, + "learning_rate": 0.0003783403206707844, + "loss": 5.2004, + "step": 423000 + }, + { + "epoch": 2.7106602233814447, + "grad_norm": 0.3984375, + "learning_rate": 0.00037831471821294846, + "loss": 5.1948, + "step": 423500 + }, + { + "epoch": 2.7138605306109387, + "grad_norm": 0.373046875, + "learning_rate": 0.0003782891157551125, + "loss": 5.1975, + "step": 424000 + }, + { + "epoch": 2.7170608378404326, + "grad_norm": 0.416015625, + "learning_rate": 0.00037826351329727653, + "loss": 5.2018, + "step": 424500 + }, + { + "epoch": 2.7202611450699266, + "grad_norm": 0.41796875, + "learning_rate": 0.0003782379108394406, + "loss": 5.2005, + "step": 425000 + }, + { + "epoch": 2.7234614522994205, + "grad_norm": 0.435546875, + "learning_rate": 0.00037821230838160465, + "loss": 5.2011, + "step": 425500 + }, + { + "epoch": 2.726661759528915, + "grad_norm": 0.419921875, + "learning_rate": 0.0003781867059237687, + "loss": 5.2041, + "step": 426000 + }, + { + "epoch": 2.729862066758409, + "grad_norm": 0.41015625, + "learning_rate": 0.0003781611034659327, + "loss": 5.1967, + "step": 426500 + }, + { + "epoch": 2.7330623739879027, + "grad_norm": 0.388671875, + "learning_rate": 0.00037813550100809676, + "loss": 5.2039, + "step": 427000 + }, + { + "epoch": 2.736262681217397, + "grad_norm": 0.3984375, + "learning_rate": 0.00037810989855026084, + "loss": 5.1936, + "step": 427500 + }, + { + "epoch": 2.739462988446891, + "grad_norm": 0.431640625, + "learning_rate": 0.00037808429609242493, + "loss": 5.1995, + "step": 428000 + }, + { + "epoch": 2.742663295676385, + "grad_norm": 0.4140625, + "learning_rate": 0.00037805869363458897, + "loss": 5.2016, + "step": 428500 + }, + { + "epoch": 2.745863602905879, + "grad_norm": 0.390625, + "learning_rate": 0.000378033091176753, + "loss": 5.1997, + "step": 429000 + }, + { + "epoch": 2.749063910135373, + "grad_norm": 0.421875, + "learning_rate": 0.00037800748871891703, + "loss": 5.2064, + "step": 429500 + }, + { + "epoch": 2.752264217364867, + "grad_norm": 0.43359375, + "learning_rate": 0.00037798188626108107, + "loss": 5.1999, + "step": 430000 + }, + { + "epoch": 2.755464524594361, + "grad_norm": 0.40234375, + "learning_rate": 0.00037795628380324516, + "loss": 5.1978, + "step": 430500 + }, + { + "epoch": 2.758664831823855, + "grad_norm": 0.404296875, + "learning_rate": 0.0003779306813454092, + "loss": 5.1986, + "step": 431000 + }, + { + "epoch": 2.761865139053349, + "grad_norm": 0.390625, + "learning_rate": 0.0003779050788875732, + "loss": 5.202, + "step": 431500 + }, + { + "epoch": 2.765065446282843, + "grad_norm": 0.37890625, + "learning_rate": 0.00037787947642973726, + "loss": 5.1967, + "step": 432000 + }, + { + "epoch": 2.7682657535123374, + "grad_norm": 0.392578125, + "learning_rate": 0.0003778538739719013, + "loss": 5.2028, + "step": 432500 + }, + { + "epoch": 2.7714660607418313, + "grad_norm": 0.4296875, + "learning_rate": 0.0003778282715140654, + "loss": 5.1978, + "step": 433000 + }, + { + "epoch": 2.7746663679713253, + "grad_norm": 0.421875, + "learning_rate": 0.0003778026690562294, + "loss": 5.197, + "step": 433500 + }, + { + "epoch": 2.777866675200819, + "grad_norm": 0.40625, + "learning_rate": 0.00037777706659839345, + "loss": 5.1927, + "step": 434000 + }, + { + "epoch": 2.781066982430313, + "grad_norm": 0.41796875, + "learning_rate": 0.0003777514641405575, + "loss": 5.1988, + "step": 434500 + }, + { + "epoch": 2.784267289659807, + "grad_norm": 0.435546875, + "learning_rate": 0.0003777258616827216, + "loss": 5.1968, + "step": 435000 + }, + { + "epoch": 2.7874675968893015, + "grad_norm": 0.41015625, + "learning_rate": 0.0003777002592248856, + "loss": 5.1956, + "step": 435500 + }, + { + "epoch": 2.7906679041187954, + "grad_norm": 0.4296875, + "learning_rate": 0.0003776746567670497, + "loss": 5.1986, + "step": 436000 + }, + { + "epoch": 2.7938682113482893, + "grad_norm": 0.44140625, + "learning_rate": 0.00037764905430921373, + "loss": 5.1952, + "step": 436500 + }, + { + "epoch": 2.7970685185777837, + "grad_norm": 0.37890625, + "learning_rate": 0.00037762345185137777, + "loss": 5.2054, + "step": 437000 + }, + { + "epoch": 2.8002688258072777, + "grad_norm": 0.416015625, + "learning_rate": 0.0003775978493935418, + "loss": 5.1935, + "step": 437500 + }, + { + "epoch": 2.8034691330367716, + "grad_norm": 0.380859375, + "learning_rate": 0.00037757224693570583, + "loss": 5.1976, + "step": 438000 + }, + { + "epoch": 2.8066694402662655, + "grad_norm": 0.404296875, + "learning_rate": 0.0003775466444778699, + "loss": 5.2001, + "step": 438500 + }, + { + "epoch": 2.8098697474957595, + "grad_norm": 0.46875, + "learning_rate": 0.00037752104202003396, + "loss": 5.1961, + "step": 439000 + }, + { + "epoch": 2.8130700547252534, + "grad_norm": 0.38671875, + "learning_rate": 0.000377495439562198, + "loss": 5.2053, + "step": 439500 + }, + { + "epoch": 2.816270361954748, + "grad_norm": 0.38671875, + "learning_rate": 0.000377469837104362, + "loss": 5.2005, + "step": 440000 + }, + { + "epoch": 2.8194706691842417, + "grad_norm": 0.390625, + "learning_rate": 0.00037744423464652606, + "loss": 5.2043, + "step": 440500 + }, + { + "epoch": 2.8226709764137357, + "grad_norm": 0.421875, + "learning_rate": 0.0003774186321886901, + "loss": 5.1988, + "step": 441000 + }, + { + "epoch": 2.8258712836432296, + "grad_norm": 0.421875, + "learning_rate": 0.0003773930297308542, + "loss": 5.2003, + "step": 441500 + }, + { + "epoch": 2.829071590872724, + "grad_norm": 0.419921875, + "learning_rate": 0.0003773674272730182, + "loss": 5.1983, + "step": 442000 + }, + { + "epoch": 2.832271898102218, + "grad_norm": 0.4296875, + "learning_rate": 0.00037734182481518225, + "loss": 5.1978, + "step": 442500 + }, + { + "epoch": 2.835472205331712, + "grad_norm": 0.42578125, + "learning_rate": 0.00037731622235734634, + "loss": 5.1958, + "step": 443000 + }, + { + "epoch": 2.838672512561206, + "grad_norm": 0.419921875, + "learning_rate": 0.0003772906198995104, + "loss": 5.2009, + "step": 443500 + }, + { + "epoch": 2.8418728197906997, + "grad_norm": 0.39453125, + "learning_rate": 0.00037726501744167446, + "loss": 5.1955, + "step": 444000 + }, + { + "epoch": 2.845073127020194, + "grad_norm": 0.388671875, + "learning_rate": 0.0003772394149838385, + "loss": 5.2014, + "step": 444500 + }, + { + "epoch": 2.848273434249688, + "grad_norm": 0.421875, + "learning_rate": 0.00037721381252600253, + "loss": 5.2015, + "step": 445000 + }, + { + "epoch": 2.851473741479182, + "grad_norm": 0.3984375, + "learning_rate": 0.00037718821006816656, + "loss": 5.1953, + "step": 445500 + }, + { + "epoch": 2.854674048708676, + "grad_norm": 0.3828125, + "learning_rate": 0.0003771626076103306, + "loss": 5.1958, + "step": 446000 + }, + { + "epoch": 2.8578743559381703, + "grad_norm": 0.443359375, + "learning_rate": 0.00037713700515249463, + "loss": 5.2021, + "step": 446500 + }, + { + "epoch": 2.8610746631676642, + "grad_norm": 0.40625, + "learning_rate": 0.0003771114026946587, + "loss": 5.1982, + "step": 447000 + }, + { + "epoch": 2.864274970397158, + "grad_norm": 0.466796875, + "learning_rate": 0.00037708580023682276, + "loss": 5.1954, + "step": 447500 + }, + { + "epoch": 2.867475277626652, + "grad_norm": 0.416015625, + "learning_rate": 0.0003770601977789868, + "loss": 5.1985, + "step": 448000 + }, + { + "epoch": 2.870675584856146, + "grad_norm": 0.439453125, + "learning_rate": 0.0003770345953211508, + "loss": 5.2033, + "step": 448500 + }, + { + "epoch": 2.87387589208564, + "grad_norm": 0.419921875, + "learning_rate": 0.00037700899286331486, + "loss": 5.1984, + "step": 449000 + }, + { + "epoch": 2.8770761993151344, + "grad_norm": 0.404296875, + "learning_rate": 0.00037698339040547895, + "loss": 5.201, + "step": 449500 + }, + { + "epoch": 2.8802765065446283, + "grad_norm": 0.419921875, + "learning_rate": 0.000376957787947643, + "loss": 5.1991, + "step": 450000 + }, + { + "epoch": 2.8834768137741222, + "grad_norm": 0.40234375, + "learning_rate": 0.00037693218548980707, + "loss": 5.1997, + "step": 450500 + }, + { + "epoch": 2.8866771210036166, + "grad_norm": 0.3984375, + "learning_rate": 0.0003769065830319711, + "loss": 5.1972, + "step": 451000 + }, + { + "epoch": 2.8898774282331106, + "grad_norm": 0.3984375, + "learning_rate": 0.00037688098057413514, + "loss": 5.2039, + "step": 451500 + }, + { + "epoch": 2.8930777354626045, + "grad_norm": 0.40234375, + "learning_rate": 0.00037685537811629917, + "loss": 5.1969, + "step": 452000 + }, + { + "epoch": 2.8962780426920984, + "grad_norm": 0.40625, + "learning_rate": 0.00037682977565846326, + "loss": 5.1992, + "step": 452500 + }, + { + "epoch": 2.8994783499215924, + "grad_norm": 0.388671875, + "learning_rate": 0.0003768041732006273, + "loss": 5.1965, + "step": 453000 + }, + { + "epoch": 2.9026786571510863, + "grad_norm": 0.412109375, + "learning_rate": 0.00037677857074279133, + "loss": 5.1972, + "step": 453500 + }, + { + "epoch": 2.9058789643805807, + "grad_norm": 0.458984375, + "learning_rate": 0.00037675296828495536, + "loss": 5.1968, + "step": 454000 + }, + { + "epoch": 2.9090792716100746, + "grad_norm": 0.3984375, + "learning_rate": 0.0003767273658271194, + "loss": 5.1998, + "step": 454500 + }, + { + "epoch": 2.9122795788395686, + "grad_norm": 0.431640625, + "learning_rate": 0.0003767017633692835, + "loss": 5.1934, + "step": 455000 + }, + { + "epoch": 2.9154798860690625, + "grad_norm": 0.3984375, + "learning_rate": 0.0003766761609114475, + "loss": 5.1931, + "step": 455500 + }, + { + "epoch": 2.918680193298557, + "grad_norm": 0.43359375, + "learning_rate": 0.00037665055845361155, + "loss": 5.2004, + "step": 456000 + }, + { + "epoch": 2.921880500528051, + "grad_norm": 0.419921875, + "learning_rate": 0.0003766249559957756, + "loss": 5.2008, + "step": 456500 + }, + { + "epoch": 2.9250808077575448, + "grad_norm": 0.396484375, + "learning_rate": 0.0003765993535379396, + "loss": 5.1973, + "step": 457000 + }, + { + "epoch": 2.9282811149870387, + "grad_norm": 0.40234375, + "learning_rate": 0.0003765737510801037, + "loss": 5.1917, + "step": 457500 + }, + { + "epoch": 2.9314814222165326, + "grad_norm": 0.42578125, + "learning_rate": 0.00037654814862226775, + "loss": 5.1964, + "step": 458000 + }, + { + "epoch": 2.9346817294460266, + "grad_norm": 0.4140625, + "learning_rate": 0.00037652254616443183, + "loss": 5.196, + "step": 458500 + }, + { + "epoch": 2.937882036675521, + "grad_norm": 0.41015625, + "learning_rate": 0.00037649694370659587, + "loss": 5.1973, + "step": 459000 + }, + { + "epoch": 2.941082343905015, + "grad_norm": 0.458984375, + "learning_rate": 0.0003764713412487599, + "loss": 5.203, + "step": 459500 + }, + { + "epoch": 2.944282651134509, + "grad_norm": 0.439453125, + "learning_rate": 0.00037644573879092394, + "loss": 5.1963, + "step": 460000 + }, + { + "epoch": 2.947482958364003, + "grad_norm": 0.44140625, + "learning_rate": 0.000376420136333088, + "loss": 5.1969, + "step": 460500 + }, + { + "epoch": 2.950683265593497, + "grad_norm": 0.494140625, + "learning_rate": 0.00037639453387525206, + "loss": 5.1959, + "step": 461000 + }, + { + "epoch": 2.953883572822991, + "grad_norm": 0.39453125, + "learning_rate": 0.0003763689314174161, + "loss": 5.197, + "step": 461500 + }, + { + "epoch": 2.957083880052485, + "grad_norm": 0.4140625, + "learning_rate": 0.00037634332895958013, + "loss": 5.1998, + "step": 462000 + }, + { + "epoch": 2.960284187281979, + "grad_norm": 0.443359375, + "learning_rate": 0.00037631772650174416, + "loss": 5.2028, + "step": 462500 + }, + { + "epoch": 2.963484494511473, + "grad_norm": 0.447265625, + "learning_rate": 0.0003762921240439082, + "loss": 5.1969, + "step": 463000 + }, + { + "epoch": 2.9666848017409673, + "grad_norm": 0.44921875, + "learning_rate": 0.0003762665215860723, + "loss": 5.202, + "step": 463500 + }, + { + "epoch": 2.969885108970461, + "grad_norm": 0.482421875, + "learning_rate": 0.0003762409191282363, + "loss": 5.1935, + "step": 464000 + }, + { + "epoch": 2.973085416199955, + "grad_norm": 0.416015625, + "learning_rate": 0.00037621531667040035, + "loss": 5.1952, + "step": 464500 + }, + { + "epoch": 2.976285723429449, + "grad_norm": 0.40234375, + "learning_rate": 0.0003761897142125644, + "loss": 5.1999, + "step": 465000 + }, + { + "epoch": 2.9794860306589435, + "grad_norm": 0.4140625, + "learning_rate": 0.0003761641117547285, + "loss": 5.1928, + "step": 465500 + }, + { + "epoch": 2.9826863378884374, + "grad_norm": 0.3984375, + "learning_rate": 0.00037613850929689256, + "loss": 5.1958, + "step": 466000 + }, + { + "epoch": 2.9858866451179313, + "grad_norm": 0.40234375, + "learning_rate": 0.0003761129068390566, + "loss": 5.1954, + "step": 466500 + }, + { + "epoch": 2.9890869523474253, + "grad_norm": 0.41015625, + "learning_rate": 0.00037608730438122063, + "loss": 5.1946, + "step": 467000 + }, + { + "epoch": 2.992287259576919, + "grad_norm": 0.380859375, + "learning_rate": 0.00037606170192338467, + "loss": 5.2003, + "step": 467500 + }, + { + "epoch": 2.995487566806413, + "grad_norm": 0.41015625, + "learning_rate": 0.0003760360994655487, + "loss": 5.1928, + "step": 468000 + }, + { + "epoch": 2.9986878740359075, + "grad_norm": 0.52734375, + "learning_rate": 0.00037601049700771274, + "loss": 5.2001, + "step": 468500 + }, + { + "epoch": 3.0, + "eval_loss": 5.188437461853027, + "eval_runtime": 1.1511, + "eval_samples_per_second": 868.755, + "eval_steps_per_second": 13.9, + "step": 468705 + }, + { + "epoch": 3.0018881812654015, + "grad_norm": 0.435546875, + "learning_rate": 0.0003759848945498768, + "loss": 5.2012, + "step": 469000 + }, + { + "epoch": 3.0050884884948954, + "grad_norm": 0.40625, + "learning_rate": 0.00037595929209204086, + "loss": 5.195, + "step": 469500 + }, + { + "epoch": 3.0082887957243893, + "grad_norm": 0.40234375, + "learning_rate": 0.0003759336896342049, + "loss": 5.1946, + "step": 470000 + }, + { + "epoch": 3.0114891029538837, + "grad_norm": 0.419921875, + "learning_rate": 0.0003759080871763689, + "loss": 5.1945, + "step": 470500 + }, + { + "epoch": 3.0146894101833777, + "grad_norm": 0.41796875, + "learning_rate": 0.00037588248471853296, + "loss": 5.1927, + "step": 471000 + }, + { + "epoch": 3.0178897174128716, + "grad_norm": 0.404296875, + "learning_rate": 0.00037585688226069705, + "loss": 5.194, + "step": 471500 + }, + { + "epoch": 3.0210900246423655, + "grad_norm": 0.423828125, + "learning_rate": 0.0003758312798028611, + "loss": 5.187, + "step": 472000 + }, + { + "epoch": 3.02429033187186, + "grad_norm": 0.37890625, + "learning_rate": 0.0003758056773450251, + "loss": 5.195, + "step": 472500 + }, + { + "epoch": 3.027490639101354, + "grad_norm": 0.41015625, + "learning_rate": 0.0003757800748871892, + "loss": 5.1955, + "step": 473000 + }, + { + "epoch": 3.030690946330848, + "grad_norm": 0.408203125, + "learning_rate": 0.00037575447242935324, + "loss": 5.1973, + "step": 473500 + }, + { + "epoch": 3.0338912535603417, + "grad_norm": 0.38671875, + "learning_rate": 0.00037572886997151733, + "loss": 5.1982, + "step": 474000 + }, + { + "epoch": 3.0370915607898357, + "grad_norm": 0.38671875, + "learning_rate": 0.00037570326751368136, + "loss": 5.1939, + "step": 474500 + }, + { + "epoch": 3.04029186801933, + "grad_norm": 0.45703125, + "learning_rate": 0.0003756776650558454, + "loss": 5.202, + "step": 475000 + }, + { + "epoch": 3.043492175248824, + "grad_norm": 0.4453125, + "learning_rate": 0.00037565206259800943, + "loss": 5.1921, + "step": 475500 + }, + { + "epoch": 3.046692482478318, + "grad_norm": 0.439453125, + "learning_rate": 0.00037562646014017347, + "loss": 5.1992, + "step": 476000 + }, + { + "epoch": 3.049892789707812, + "grad_norm": 0.41796875, + "learning_rate": 0.0003756008576823375, + "loss": 5.1936, + "step": 476500 + }, + { + "epoch": 3.053093096937306, + "grad_norm": 0.421875, + "learning_rate": 0.0003755752552245016, + "loss": 5.1884, + "step": 477000 + }, + { + "epoch": 3.0562934041668, + "grad_norm": 0.404296875, + "learning_rate": 0.0003755496527666656, + "loss": 5.1873, + "step": 477500 + }, + { + "epoch": 3.059493711396294, + "grad_norm": 0.40625, + "learning_rate": 0.00037552405030882966, + "loss": 5.1863, + "step": 478000 + }, + { + "epoch": 3.062694018625788, + "grad_norm": 0.423828125, + "learning_rate": 0.0003754984478509937, + "loss": 5.197, + "step": 478500 + }, + { + "epoch": 3.065894325855282, + "grad_norm": 0.40234375, + "learning_rate": 0.0003754728453931577, + "loss": 5.1983, + "step": 479000 + }, + { + "epoch": 3.069094633084776, + "grad_norm": 0.43359375, + "learning_rate": 0.0003754472429353218, + "loss": 5.1934, + "step": 479500 + }, + { + "epoch": 3.0722949403142703, + "grad_norm": 0.447265625, + "learning_rate": 0.00037542164047748585, + "loss": 5.1938, + "step": 480000 + }, + { + "epoch": 3.0754952475437642, + "grad_norm": 0.4375, + "learning_rate": 0.0003753960380196499, + "loss": 5.1926, + "step": 480500 + }, + { + "epoch": 3.078695554773258, + "grad_norm": 0.431640625, + "learning_rate": 0.00037537043556181397, + "loss": 5.1942, + "step": 481000 + }, + { + "epoch": 3.081895862002752, + "grad_norm": 0.447265625, + "learning_rate": 0.000375344833103978, + "loss": 5.1923, + "step": 481500 + }, + { + "epoch": 3.0850961692322465, + "grad_norm": 0.431640625, + "learning_rate": 0.00037531923064614204, + "loss": 5.1809, + "step": 482000 + }, + { + "epoch": 3.0882964764617404, + "grad_norm": 0.419921875, + "learning_rate": 0.00037529362818830613, + "loss": 5.1921, + "step": 482500 + }, + { + "epoch": 3.0914967836912344, + "grad_norm": 0.419921875, + "learning_rate": 0.00037526802573047016, + "loss": 5.1906, + "step": 483000 + }, + { + "epoch": 3.0946970909207283, + "grad_norm": 0.44140625, + "learning_rate": 0.0003752424232726342, + "loss": 5.1914, + "step": 483500 + }, + { + "epoch": 3.0978973981502222, + "grad_norm": 0.42578125, + "learning_rate": 0.00037521682081479823, + "loss": 5.1966, + "step": 484000 + }, + { + "epoch": 3.1010977053797166, + "grad_norm": 0.408203125, + "learning_rate": 0.00037519121835696227, + "loss": 5.1933, + "step": 484500 + }, + { + "epoch": 3.1042980126092106, + "grad_norm": 0.43359375, + "learning_rate": 0.00037516561589912635, + "loss": 5.1917, + "step": 485000 + }, + { + "epoch": 3.1074983198387045, + "grad_norm": 0.408203125, + "learning_rate": 0.0003751400134412904, + "loss": 5.1991, + "step": 485500 + }, + { + "epoch": 3.1106986270681984, + "grad_norm": 0.423828125, + "learning_rate": 0.0003751144109834544, + "loss": 5.1917, + "step": 486000 + }, + { + "epoch": 3.1138989342976924, + "grad_norm": 0.435546875, + "learning_rate": 0.00037508880852561846, + "loss": 5.192, + "step": 486500 + }, + { + "epoch": 3.1170992415271868, + "grad_norm": 0.41796875, + "learning_rate": 0.0003750632060677825, + "loss": 5.1985, + "step": 487000 + }, + { + "epoch": 3.1202995487566807, + "grad_norm": 0.408203125, + "learning_rate": 0.0003750376036099466, + "loss": 5.1917, + "step": 487500 + }, + { + "epoch": 3.1234998559861746, + "grad_norm": 0.416015625, + "learning_rate": 0.0003750120011521106, + "loss": 5.1907, + "step": 488000 + }, + { + "epoch": 3.1267001632156686, + "grad_norm": 0.408203125, + "learning_rate": 0.0003749863986942747, + "loss": 5.1917, + "step": 488500 + }, + { + "epoch": 3.129900470445163, + "grad_norm": 0.416015625, + "learning_rate": 0.00037496079623643874, + "loss": 5.1896, + "step": 489000 + }, + { + "epoch": 3.133100777674657, + "grad_norm": 0.458984375, + "learning_rate": 0.00037493519377860277, + "loss": 5.1971, + "step": 489500 + }, + { + "epoch": 3.136301084904151, + "grad_norm": 0.4296875, + "learning_rate": 0.0003749095913207668, + "loss": 5.1931, + "step": 490000 + }, + { + "epoch": 3.1395013921336448, + "grad_norm": 0.4140625, + "learning_rate": 0.0003748839888629309, + "loss": 5.1998, + "step": 490500 + }, + { + "epoch": 3.1427016993631387, + "grad_norm": 0.427734375, + "learning_rate": 0.00037485838640509493, + "loss": 5.1847, + "step": 491000 + }, + { + "epoch": 3.145902006592633, + "grad_norm": 0.474609375, + "learning_rate": 0.00037483278394725896, + "loss": 5.1932, + "step": 491500 + }, + { + "epoch": 3.149102313822127, + "grad_norm": 0.419921875, + "learning_rate": 0.000374807181489423, + "loss": 5.1958, + "step": 492000 + }, + { + "epoch": 3.152302621051621, + "grad_norm": 0.4140625, + "learning_rate": 0.00037478157903158703, + "loss": 5.191, + "step": 492500 + }, + { + "epoch": 3.155502928281115, + "grad_norm": 0.419921875, + "learning_rate": 0.00037475597657375106, + "loss": 5.1945, + "step": 493000 + }, + { + "epoch": 3.158703235510609, + "grad_norm": 0.427734375, + "learning_rate": 0.00037473037411591515, + "loss": 5.1945, + "step": 493500 + }, + { + "epoch": 3.161903542740103, + "grad_norm": 0.427734375, + "learning_rate": 0.0003747047716580792, + "loss": 5.1986, + "step": 494000 + }, + { + "epoch": 3.165103849969597, + "grad_norm": 0.400390625, + "learning_rate": 0.0003746791692002432, + "loss": 5.1928, + "step": 494500 + }, + { + "epoch": 3.168304157199091, + "grad_norm": 0.455078125, + "learning_rate": 0.00037465356674240726, + "loss": 5.195, + "step": 495000 + }, + { + "epoch": 3.171504464428585, + "grad_norm": 0.41015625, + "learning_rate": 0.00037462796428457134, + "loss": 5.1943, + "step": 495500 + }, + { + "epoch": 3.174704771658079, + "grad_norm": 0.43359375, + "learning_rate": 0.0003746023618267354, + "loss": 5.1968, + "step": 496000 + }, + { + "epoch": 3.1779050788875733, + "grad_norm": 0.4453125, + "learning_rate": 0.00037457675936889947, + "loss": 5.1964, + "step": 496500 + }, + { + "epoch": 3.1811053861170673, + "grad_norm": 0.443359375, + "learning_rate": 0.0003745511569110635, + "loss": 5.1988, + "step": 497000 + }, + { + "epoch": 3.184305693346561, + "grad_norm": 0.447265625, + "learning_rate": 0.00037452555445322754, + "loss": 5.195, + "step": 497500 + }, + { + "epoch": 3.187506000576055, + "grad_norm": 0.439453125, + "learning_rate": 0.00037449995199539157, + "loss": 5.1972, + "step": 498000 + }, + { + "epoch": 3.1907063078055495, + "grad_norm": 0.421875, + "learning_rate": 0.0003744743495375556, + "loss": 5.1964, + "step": 498500 + }, + { + "epoch": 3.1939066150350435, + "grad_norm": 0.45703125, + "learning_rate": 0.0003744487470797197, + "loss": 5.1935, + "step": 499000 + }, + { + "epoch": 3.1971069222645374, + "grad_norm": 0.40234375, + "learning_rate": 0.0003744231446218837, + "loss": 5.1911, + "step": 499500 + }, + { + "epoch": 3.2003072294940313, + "grad_norm": 0.42578125, + "learning_rate": 0.00037439754216404776, + "loss": 5.1889, + "step": 500000 + }, + { + "epoch": 3.2035075367235253, + "grad_norm": 0.43359375, + "learning_rate": 0.0003743719397062118, + "loss": 5.1899, + "step": 500500 + }, + { + "epoch": 3.2067078439530197, + "grad_norm": 0.4609375, + "learning_rate": 0.00037434633724837583, + "loss": 5.1958, + "step": 501000 + }, + { + "epoch": 3.2099081511825136, + "grad_norm": 0.455078125, + "learning_rate": 0.0003743207347905399, + "loss": 5.1986, + "step": 501500 + }, + { + "epoch": 3.2131084584120075, + "grad_norm": 0.4453125, + "learning_rate": 0.00037429513233270395, + "loss": 5.1947, + "step": 502000 + }, + { + "epoch": 3.2163087656415015, + "grad_norm": 0.4453125, + "learning_rate": 0.000374269529874868, + "loss": 5.188, + "step": 502500 + }, + { + "epoch": 3.2195090728709954, + "grad_norm": 0.3671875, + "learning_rate": 0.0003742439274170321, + "loss": 5.1988, + "step": 503000 + }, + { + "epoch": 3.22270938010049, + "grad_norm": 0.42578125, + "learning_rate": 0.0003742183249591961, + "loss": 5.1933, + "step": 503500 + }, + { + "epoch": 3.2259096873299837, + "grad_norm": 0.38671875, + "learning_rate": 0.00037419272250136014, + "loss": 5.1946, + "step": 504000 + }, + { + "epoch": 3.2291099945594777, + "grad_norm": 0.4609375, + "learning_rate": 0.00037416712004352423, + "loss": 5.1928, + "step": 504500 + }, + { + "epoch": 3.2323103017889716, + "grad_norm": 0.43359375, + "learning_rate": 0.00037414151758568827, + "loss": 5.1958, + "step": 505000 + }, + { + "epoch": 3.2355106090184655, + "grad_norm": 0.416015625, + "learning_rate": 0.0003741159151278523, + "loss": 5.1934, + "step": 505500 + }, + { + "epoch": 3.23871091624796, + "grad_norm": 0.5546875, + "learning_rate": 0.00037409031267001633, + "loss": 5.197, + "step": 506000 + }, + { + "epoch": 3.241911223477454, + "grad_norm": 0.4140625, + "learning_rate": 0.00037406471021218037, + "loss": 5.1905, + "step": 506500 + }, + { + "epoch": 3.245111530706948, + "grad_norm": 0.447265625, + "learning_rate": 0.00037403910775434446, + "loss": 5.1953, + "step": 507000 + }, + { + "epoch": 3.2483118379364417, + "grad_norm": 0.4609375, + "learning_rate": 0.0003740135052965085, + "loss": 5.1952, + "step": 507500 + }, + { + "epoch": 3.251512145165936, + "grad_norm": 0.42578125, + "learning_rate": 0.0003739879028386725, + "loss": 5.1929, + "step": 508000 + }, + { + "epoch": 3.25471245239543, + "grad_norm": 0.435546875, + "learning_rate": 0.00037396230038083656, + "loss": 5.1908, + "step": 508500 + }, + { + "epoch": 3.257912759624924, + "grad_norm": 0.4609375, + "learning_rate": 0.0003739366979230006, + "loss": 5.1931, + "step": 509000 + }, + { + "epoch": 3.261113066854418, + "grad_norm": 0.423828125, + "learning_rate": 0.0003739110954651647, + "loss": 5.1924, + "step": 509500 + }, + { + "epoch": 3.264313374083912, + "grad_norm": 0.4609375, + "learning_rate": 0.0003738854930073287, + "loss": 5.1907, + "step": 510000 + }, + { + "epoch": 3.2675136813134062, + "grad_norm": 0.431640625, + "learning_rate": 0.00037385989054949275, + "loss": 5.1911, + "step": 510500 + }, + { + "epoch": 3.2707139885429, + "grad_norm": 0.443359375, + "learning_rate": 0.00037383428809165684, + "loss": 5.1866, + "step": 511000 + }, + { + "epoch": 3.273914295772394, + "grad_norm": 0.44921875, + "learning_rate": 0.0003738086856338209, + "loss": 5.1897, + "step": 511500 + }, + { + "epoch": 3.277114603001888, + "grad_norm": 0.423828125, + "learning_rate": 0.0003737830831759849, + "loss": 5.1928, + "step": 512000 + }, + { + "epoch": 3.2803149102313824, + "grad_norm": 0.41796875, + "learning_rate": 0.000373757480718149, + "loss": 5.1908, + "step": 512500 + }, + { + "epoch": 3.2835152174608764, + "grad_norm": 0.453125, + "learning_rate": 0.00037373187826031303, + "loss": 5.1901, + "step": 513000 + }, + { + "epoch": 3.2867155246903703, + "grad_norm": 0.453125, + "learning_rate": 0.00037370627580247706, + "loss": 5.1933, + "step": 513500 + }, + { + "epoch": 3.2899158319198643, + "grad_norm": 0.447265625, + "learning_rate": 0.0003736806733446411, + "loss": 5.1916, + "step": 514000 + }, + { + "epoch": 3.293116139149358, + "grad_norm": 0.451171875, + "learning_rate": 0.00037365507088680513, + "loss": 5.1879, + "step": 514500 + }, + { + "epoch": 3.296316446378852, + "grad_norm": 0.462890625, + "learning_rate": 0.0003736294684289692, + "loss": 5.1881, + "step": 515000 + }, + { + "epoch": 3.2995167536083465, + "grad_norm": 0.44921875, + "learning_rate": 0.00037360386597113326, + "loss": 5.1928, + "step": 515500 + }, + { + "epoch": 3.3027170608378404, + "grad_norm": 0.4609375, + "learning_rate": 0.0003735782635132973, + "loss": 5.1954, + "step": 516000 + }, + { + "epoch": 3.3059173680673344, + "grad_norm": 0.42578125, + "learning_rate": 0.0003735526610554613, + "loss": 5.1907, + "step": 516500 + }, + { + "epoch": 3.3091176752968283, + "grad_norm": 0.42578125, + "learning_rate": 0.00037352705859762536, + "loss": 5.1956, + "step": 517000 + }, + { + "epoch": 3.3123179825263227, + "grad_norm": 0.423828125, + "learning_rate": 0.0003735014561397894, + "loss": 5.1939, + "step": 517500 + }, + { + "epoch": 3.3155182897558166, + "grad_norm": 0.408203125, + "learning_rate": 0.0003734758536819535, + "loss": 5.1891, + "step": 518000 + }, + { + "epoch": 3.3187185969853106, + "grad_norm": 0.490234375, + "learning_rate": 0.0003734502512241175, + "loss": 5.1844, + "step": 518500 + }, + { + "epoch": 3.3219189042148045, + "grad_norm": 0.443359375, + "learning_rate": 0.0003734246487662816, + "loss": 5.1886, + "step": 519000 + }, + { + "epoch": 3.3251192114442985, + "grad_norm": 0.458984375, + "learning_rate": 0.00037339904630844564, + "loss": 5.1915, + "step": 519500 + }, + { + "epoch": 3.328319518673793, + "grad_norm": 0.443359375, + "learning_rate": 0.00037337344385060967, + "loss": 5.1936, + "step": 520000 + }, + { + "epoch": 3.3315198259032868, + "grad_norm": 0.44140625, + "learning_rate": 0.00037334784139277376, + "loss": 5.1928, + "step": 520500 + }, + { + "epoch": 3.3347201331327807, + "grad_norm": 0.423828125, + "learning_rate": 0.0003733222389349378, + "loss": 5.1946, + "step": 521000 + }, + { + "epoch": 3.3379204403622746, + "grad_norm": 0.443359375, + "learning_rate": 0.00037329663647710183, + "loss": 5.1969, + "step": 521500 + }, + { + "epoch": 3.341120747591769, + "grad_norm": 0.4375, + "learning_rate": 0.00037327103401926586, + "loss": 5.1986, + "step": 522000 + }, + { + "epoch": 3.344321054821263, + "grad_norm": 0.43359375, + "learning_rate": 0.0003732454315614299, + "loss": 5.1924, + "step": 522500 + }, + { + "epoch": 3.347521362050757, + "grad_norm": 0.478515625, + "learning_rate": 0.00037321982910359393, + "loss": 5.1939, + "step": 523000 + }, + { + "epoch": 3.350721669280251, + "grad_norm": 0.435546875, + "learning_rate": 0.000373194226645758, + "loss": 5.1984, + "step": 523500 + }, + { + "epoch": 3.3539219765097448, + "grad_norm": 0.404296875, + "learning_rate": 0.00037316862418792205, + "loss": 5.1929, + "step": 524000 + }, + { + "epoch": 3.357122283739239, + "grad_norm": 0.4375, + "learning_rate": 0.0003731430217300861, + "loss": 5.1921, + "step": 524500 + }, + { + "epoch": 3.360322590968733, + "grad_norm": 0.408203125, + "learning_rate": 0.0003731174192722501, + "loss": 5.1959, + "step": 525000 + }, + { + "epoch": 3.363522898198227, + "grad_norm": 0.44921875, + "learning_rate": 0.0003730918168144142, + "loss": 5.1947, + "step": 525500 + }, + { + "epoch": 3.366723205427721, + "grad_norm": 0.427734375, + "learning_rate": 0.00037306621435657825, + "loss": 5.1939, + "step": 526000 + }, + { + "epoch": 3.369923512657215, + "grad_norm": 0.416015625, + "learning_rate": 0.00037304061189874233, + "loss": 5.2001, + "step": 526500 + }, + { + "epoch": 3.3731238198867093, + "grad_norm": 0.46875, + "learning_rate": 0.00037301500944090637, + "loss": 5.1944, + "step": 527000 + }, + { + "epoch": 3.376324127116203, + "grad_norm": 0.443359375, + "learning_rate": 0.0003729894069830704, + "loss": 5.1885, + "step": 527500 + }, + { + "epoch": 3.379524434345697, + "grad_norm": 0.408203125, + "learning_rate": 0.00037296380452523444, + "loss": 5.1954, + "step": 528000 + }, + { + "epoch": 3.382724741575191, + "grad_norm": 0.4609375, + "learning_rate": 0.00037293820206739847, + "loss": 5.1898, + "step": 528500 + }, + { + "epoch": 3.385925048804685, + "grad_norm": 0.431640625, + "learning_rate": 0.00037291259960956256, + "loss": 5.1952, + "step": 529000 + }, + { + "epoch": 3.3891253560341794, + "grad_norm": 0.431640625, + "learning_rate": 0.0003728869971517266, + "loss": 5.1911, + "step": 529500 + }, + { + "epoch": 3.3923256632636734, + "grad_norm": 0.439453125, + "learning_rate": 0.00037286139469389063, + "loss": 5.1901, + "step": 530000 + }, + { + "epoch": 3.3955259704931673, + "grad_norm": 0.40625, + "learning_rate": 0.00037283579223605466, + "loss": 5.1941, + "step": 530500 + }, + { + "epoch": 3.3987262777226612, + "grad_norm": 0.416015625, + "learning_rate": 0.0003728101897782187, + "loss": 5.1962, + "step": 531000 + }, + { + "epoch": 3.4019265849521556, + "grad_norm": 0.4296875, + "learning_rate": 0.0003727845873203828, + "loss": 5.1943, + "step": 531500 + }, + { + "epoch": 3.4051268921816495, + "grad_norm": 0.427734375, + "learning_rate": 0.0003727589848625468, + "loss": 5.1894, + "step": 532000 + }, + { + "epoch": 3.4083271994111435, + "grad_norm": 0.419921875, + "learning_rate": 0.00037273338240471085, + "loss": 5.1891, + "step": 532500 + }, + { + "epoch": 3.4115275066406374, + "grad_norm": 0.43359375, + "learning_rate": 0.0003727077799468749, + "loss": 5.1877, + "step": 533000 + }, + { + "epoch": 3.4147278138701314, + "grad_norm": 0.423828125, + "learning_rate": 0.000372682177489039, + "loss": 5.1948, + "step": 533500 + }, + { + "epoch": 3.4179281210996257, + "grad_norm": 0.44140625, + "learning_rate": 0.000372656575031203, + "loss": 5.1937, + "step": 534000 + }, + { + "epoch": 3.4211284283291197, + "grad_norm": 0.4375, + "learning_rate": 0.0003726309725733671, + "loss": 5.1932, + "step": 534500 + }, + { + "epoch": 3.4243287355586136, + "grad_norm": 0.404296875, + "learning_rate": 0.00037260537011553113, + "loss": 5.1979, + "step": 535000 + }, + { + "epoch": 3.4275290427881075, + "grad_norm": 0.466796875, + "learning_rate": 0.00037257976765769517, + "loss": 5.1899, + "step": 535500 + }, + { + "epoch": 3.430729350017602, + "grad_norm": 0.43359375, + "learning_rate": 0.0003725541651998592, + "loss": 5.1883, + "step": 536000 + }, + { + "epoch": 3.433929657247096, + "grad_norm": 0.470703125, + "learning_rate": 0.00037252856274202324, + "loss": 5.1951, + "step": 536500 + }, + { + "epoch": 3.43712996447659, + "grad_norm": 0.462890625, + "learning_rate": 0.0003725029602841873, + "loss": 5.1938, + "step": 537000 + }, + { + "epoch": 3.4403302717060837, + "grad_norm": 0.41015625, + "learning_rate": 0.00037247735782635136, + "loss": 5.1935, + "step": 537500 + }, + { + "epoch": 3.4435305789355777, + "grad_norm": 0.447265625, + "learning_rate": 0.0003724517553685154, + "loss": 5.1865, + "step": 538000 + }, + { + "epoch": 3.4467308861650716, + "grad_norm": 0.423828125, + "learning_rate": 0.00037242615291067943, + "loss": 5.1933, + "step": 538500 + }, + { + "epoch": 3.449931193394566, + "grad_norm": 0.447265625, + "learning_rate": 0.00037240055045284346, + "loss": 5.1946, + "step": 539000 + }, + { + "epoch": 3.45313150062406, + "grad_norm": 0.45703125, + "learning_rate": 0.0003723749479950075, + "loss": 5.1926, + "step": 539500 + }, + { + "epoch": 3.456331807853554, + "grad_norm": 0.451171875, + "learning_rate": 0.0003723493455371716, + "loss": 5.1921, + "step": 540000 + }, + { + "epoch": 3.459532115083048, + "grad_norm": 0.44140625, + "learning_rate": 0.0003723237430793356, + "loss": 5.1952, + "step": 540500 + }, + { + "epoch": 3.462732422312542, + "grad_norm": 0.42578125, + "learning_rate": 0.0003722981406214997, + "loss": 5.1905, + "step": 541000 + }, + { + "epoch": 3.465932729542036, + "grad_norm": 0.427734375, + "learning_rate": 0.00037227253816366374, + "loss": 5.1907, + "step": 541500 + }, + { + "epoch": 3.46913303677153, + "grad_norm": 0.451171875, + "learning_rate": 0.0003722469357058278, + "loss": 5.1925, + "step": 542000 + }, + { + "epoch": 3.472333344001024, + "grad_norm": 0.443359375, + "learning_rate": 0.00037222133324799186, + "loss": 5.1903, + "step": 542500 + }, + { + "epoch": 3.475533651230518, + "grad_norm": 0.45703125, + "learning_rate": 0.0003721957307901559, + "loss": 5.1954, + "step": 543000 + }, + { + "epoch": 3.4787339584600123, + "grad_norm": 0.486328125, + "learning_rate": 0.00037217012833231993, + "loss": 5.1947, + "step": 543500 + }, + { + "epoch": 3.4819342656895063, + "grad_norm": 0.484375, + "learning_rate": 0.00037214452587448397, + "loss": 5.1955, + "step": 544000 + }, + { + "epoch": 3.485134572919, + "grad_norm": 0.48046875, + "learning_rate": 0.000372118923416648, + "loss": 5.195, + "step": 544500 + }, + { + "epoch": 3.488334880148494, + "grad_norm": 0.416015625, + "learning_rate": 0.00037209332095881203, + "loss": 5.1956, + "step": 545000 + }, + { + "epoch": 3.4915351873779885, + "grad_norm": 0.3984375, + "learning_rate": 0.0003720677185009761, + "loss": 5.1975, + "step": 545500 + }, + { + "epoch": 3.4947354946074825, + "grad_norm": 0.439453125, + "learning_rate": 0.00037204211604314016, + "loss": 5.1947, + "step": 546000 + }, + { + "epoch": 3.4979358018369764, + "grad_norm": 0.46484375, + "learning_rate": 0.0003720165135853042, + "loss": 5.1972, + "step": 546500 + }, + { + "epoch": 3.5011361090664703, + "grad_norm": 0.478515625, + "learning_rate": 0.0003719909111274682, + "loss": 5.1907, + "step": 547000 + }, + { + "epoch": 3.5043364162959643, + "grad_norm": 0.41796875, + "learning_rate": 0.00037196530866963226, + "loss": 5.1957, + "step": 547500 + }, + { + "epoch": 3.507536723525458, + "grad_norm": 0.431640625, + "learning_rate": 0.00037193970621179635, + "loss": 5.1908, + "step": 548000 + }, + { + "epoch": 3.5107370307549526, + "grad_norm": 0.45703125, + "learning_rate": 0.0003719141037539604, + "loss": 5.1912, + "step": 548500 + }, + { + "epoch": 3.5139373379844465, + "grad_norm": 0.4296875, + "learning_rate": 0.00037188850129612447, + "loss": 5.1996, + "step": 549000 + }, + { + "epoch": 3.5171376452139405, + "grad_norm": 0.4375, + "learning_rate": 0.0003718628988382885, + "loss": 5.1944, + "step": 549500 + }, + { + "epoch": 3.520337952443435, + "grad_norm": 0.44921875, + "learning_rate": 0.00037183729638045254, + "loss": 5.1949, + "step": 550000 + }, + { + "epoch": 3.5235382596729288, + "grad_norm": 0.41015625, + "learning_rate": 0.00037181169392261663, + "loss": 5.1878, + "step": 550500 + }, + { + "epoch": 3.5267385669024227, + "grad_norm": 0.48046875, + "learning_rate": 0.00037178609146478066, + "loss": 5.1897, + "step": 551000 + }, + { + "epoch": 3.5299388741319166, + "grad_norm": 0.423828125, + "learning_rate": 0.0003717604890069447, + "loss": 5.1953, + "step": 551500 + }, + { + "epoch": 3.5331391813614106, + "grad_norm": 0.455078125, + "learning_rate": 0.00037173488654910873, + "loss": 5.1928, + "step": 552000 + }, + { + "epoch": 3.5363394885909045, + "grad_norm": 0.46875, + "learning_rate": 0.00037170928409127277, + "loss": 5.1918, + "step": 552500 + }, + { + "epoch": 3.539539795820399, + "grad_norm": 0.45703125, + "learning_rate": 0.0003716836816334368, + "loss": 5.1947, + "step": 553000 + }, + { + "epoch": 3.542740103049893, + "grad_norm": 0.423828125, + "learning_rate": 0.0003716580791756009, + "loss": 5.1898, + "step": 553500 + }, + { + "epoch": 3.545940410279387, + "grad_norm": 0.431640625, + "learning_rate": 0.0003716324767177649, + "loss": 5.194, + "step": 554000 + }, + { + "epoch": 3.5491407175088807, + "grad_norm": 0.46484375, + "learning_rate": 0.00037160687425992896, + "loss": 5.1985, + "step": 554500 + }, + { + "epoch": 3.552341024738375, + "grad_norm": 0.431640625, + "learning_rate": 0.000371581271802093, + "loss": 5.1973, + "step": 555000 + }, + { + "epoch": 3.555541331967869, + "grad_norm": 0.41796875, + "learning_rate": 0.000371555669344257, + "loss": 5.1947, + "step": 555500 + }, + { + "epoch": 3.558741639197363, + "grad_norm": 0.458984375, + "learning_rate": 0.0003715300668864211, + "loss": 5.1974, + "step": 556000 + }, + { + "epoch": 3.561941946426857, + "grad_norm": 0.408203125, + "learning_rate": 0.0003715044644285852, + "loss": 5.1936, + "step": 556500 + }, + { + "epoch": 3.565142253656351, + "grad_norm": 0.4375, + "learning_rate": 0.00037147886197074924, + "loss": 5.197, + "step": 557000 + }, + { + "epoch": 3.568342560885845, + "grad_norm": 0.435546875, + "learning_rate": 0.00037145325951291327, + "loss": 5.1962, + "step": 557500 + }, + { + "epoch": 3.571542868115339, + "grad_norm": 0.439453125, + "learning_rate": 0.0003714276570550773, + "loss": 5.1893, + "step": 558000 + }, + { + "epoch": 3.574743175344833, + "grad_norm": 0.474609375, + "learning_rate": 0.00037140205459724134, + "loss": 5.1963, + "step": 558500 + }, + { + "epoch": 3.577943482574327, + "grad_norm": 0.4296875, + "learning_rate": 0.00037137645213940543, + "loss": 5.1885, + "step": 559000 + }, + { + "epoch": 3.5811437898038214, + "grad_norm": 0.451171875, + "learning_rate": 0.00037135084968156946, + "loss": 5.192, + "step": 559500 + }, + { + "epoch": 3.5843440970333154, + "grad_norm": 0.42578125, + "learning_rate": 0.0003713252472237335, + "loss": 5.1898, + "step": 560000 + }, + { + "epoch": 3.5875444042628093, + "grad_norm": 0.458984375, + "learning_rate": 0.00037129964476589753, + "loss": 5.1964, + "step": 560500 + }, + { + "epoch": 3.5907447114923032, + "grad_norm": 0.43359375, + "learning_rate": 0.00037127404230806156, + "loss": 5.1908, + "step": 561000 + }, + { + "epoch": 3.593945018721797, + "grad_norm": 0.44140625, + "learning_rate": 0.00037124843985022565, + "loss": 5.1914, + "step": 561500 + }, + { + "epoch": 3.597145325951291, + "grad_norm": 0.4140625, + "learning_rate": 0.0003712228373923897, + "loss": 5.1917, + "step": 562000 + }, + { + "epoch": 3.6003456331807855, + "grad_norm": 0.416015625, + "learning_rate": 0.0003711972349345537, + "loss": 5.1943, + "step": 562500 + }, + { + "epoch": 3.6035459404102794, + "grad_norm": 0.4453125, + "learning_rate": 0.00037117163247671776, + "loss": 5.1978, + "step": 563000 + }, + { + "epoch": 3.6067462476397734, + "grad_norm": 0.439453125, + "learning_rate": 0.00037114603001888184, + "loss": 5.1876, + "step": 563500 + }, + { + "epoch": 3.6099465548692673, + "grad_norm": 0.41796875, + "learning_rate": 0.0003711204275610459, + "loss": 5.191, + "step": 564000 + }, + { + "epoch": 3.6131468620987617, + "grad_norm": 0.431640625, + "learning_rate": 0.00037109482510320997, + "loss": 5.1959, + "step": 564500 + }, + { + "epoch": 3.6163471693282556, + "grad_norm": 0.44140625, + "learning_rate": 0.000371069222645374, + "loss": 5.1885, + "step": 565000 + }, + { + "epoch": 3.6195474765577496, + "grad_norm": 0.421875, + "learning_rate": 0.00037104362018753804, + "loss": 5.1891, + "step": 565500 + }, + { + "epoch": 3.6227477837872435, + "grad_norm": 0.5, + "learning_rate": 0.00037101801772970207, + "loss": 5.1944, + "step": 566000 + }, + { + "epoch": 3.6259480910167374, + "grad_norm": 0.427734375, + "learning_rate": 0.0003709924152718661, + "loss": 5.1902, + "step": 566500 + }, + { + "epoch": 3.6291483982462314, + "grad_norm": 0.427734375, + "learning_rate": 0.0003709668128140302, + "loss": 5.1885, + "step": 567000 + }, + { + "epoch": 3.6323487054757257, + "grad_norm": 0.51171875, + "learning_rate": 0.0003709412103561942, + "loss": 5.189, + "step": 567500 + }, + { + "epoch": 3.6355490127052197, + "grad_norm": 0.435546875, + "learning_rate": 0.00037091560789835826, + "loss": 5.1871, + "step": 568000 + }, + { + "epoch": 3.6387493199347136, + "grad_norm": 0.423828125, + "learning_rate": 0.0003708900054405223, + "loss": 5.1998, + "step": 568500 + }, + { + "epoch": 3.641949627164208, + "grad_norm": 0.458984375, + "learning_rate": 0.00037086440298268633, + "loss": 5.1883, + "step": 569000 + }, + { + "epoch": 3.645149934393702, + "grad_norm": 0.484375, + "learning_rate": 0.00037083880052485036, + "loss": 5.1916, + "step": 569500 + }, + { + "epoch": 3.648350241623196, + "grad_norm": 0.46484375, + "learning_rate": 0.00037081319806701445, + "loss": 5.1919, + "step": 570000 + }, + { + "epoch": 3.65155054885269, + "grad_norm": 0.48046875, + "learning_rate": 0.0003707875956091785, + "loss": 5.1965, + "step": 570500 + }, + { + "epoch": 3.6547508560821838, + "grad_norm": 0.435546875, + "learning_rate": 0.0003707619931513425, + "loss": 5.1954, + "step": 571000 + }, + { + "epoch": 3.6579511633116777, + "grad_norm": 0.46484375, + "learning_rate": 0.0003707363906935066, + "loss": 5.1963, + "step": 571500 + }, + { + "epoch": 3.661151470541172, + "grad_norm": 0.478515625, + "learning_rate": 0.00037071078823567064, + "loss": 5.1893, + "step": 572000 + }, + { + "epoch": 3.664351777770666, + "grad_norm": 0.451171875, + "learning_rate": 0.00037068518577783473, + "loss": 5.1916, + "step": 572500 + }, + { + "epoch": 3.66755208500016, + "grad_norm": 0.4296875, + "learning_rate": 0.00037065958331999877, + "loss": 5.1885, + "step": 573000 + }, + { + "epoch": 3.670752392229654, + "grad_norm": 0.44921875, + "learning_rate": 0.0003706339808621628, + "loss": 5.1929, + "step": 573500 + }, + { + "epoch": 3.6739526994591483, + "grad_norm": 0.482421875, + "learning_rate": 0.00037060837840432683, + "loss": 5.1891, + "step": 574000 + }, + { + "epoch": 3.677153006688642, + "grad_norm": 0.48046875, + "learning_rate": 0.00037058277594649087, + "loss": 5.1971, + "step": 574500 + }, + { + "epoch": 3.680353313918136, + "grad_norm": 0.46875, + "learning_rate": 0.0003705571734886549, + "loss": 5.201, + "step": 575000 + }, + { + "epoch": 3.68355362114763, + "grad_norm": 0.478515625, + "learning_rate": 0.000370531571030819, + "loss": 5.1911, + "step": 575500 + }, + { + "epoch": 3.686753928377124, + "grad_norm": 0.408203125, + "learning_rate": 0.000370505968572983, + "loss": 5.18, + "step": 576000 + }, + { + "epoch": 3.6899542356066184, + "grad_norm": 0.474609375, + "learning_rate": 0.00037048036611514706, + "loss": 5.1987, + "step": 576500 + }, + { + "epoch": 3.6931545428361123, + "grad_norm": 0.43359375, + "learning_rate": 0.0003704547636573111, + "loss": 5.1972, + "step": 577000 + }, + { + "epoch": 3.6963548500656063, + "grad_norm": 0.470703125, + "learning_rate": 0.00037042916119947513, + "loss": 5.1944, + "step": 577500 + }, + { + "epoch": 3.6995551572951, + "grad_norm": 0.4765625, + "learning_rate": 0.0003704035587416392, + "loss": 5.1943, + "step": 578000 + }, + { + "epoch": 3.7027554645245946, + "grad_norm": 0.412109375, + "learning_rate": 0.00037037795628380325, + "loss": 5.1932, + "step": 578500 + }, + { + "epoch": 3.7059557717540885, + "grad_norm": 0.419921875, + "learning_rate": 0.00037035235382596734, + "loss": 5.198, + "step": 579000 + }, + { + "epoch": 3.7091560789835825, + "grad_norm": 0.484375, + "learning_rate": 0.0003703267513681314, + "loss": 5.201, + "step": 579500 + }, + { + "epoch": 3.7123563862130764, + "grad_norm": 0.451171875, + "learning_rate": 0.0003703011489102954, + "loss": 5.191, + "step": 580000 + }, + { + "epoch": 3.7155566934425703, + "grad_norm": 0.453125, + "learning_rate": 0.00037027554645245944, + "loss": 5.1979, + "step": 580500 + }, + { + "epoch": 3.7187570006720643, + "grad_norm": 0.46875, + "learning_rate": 0.00037024994399462353, + "loss": 5.1952, + "step": 581000 + }, + { + "epoch": 3.7219573079015587, + "grad_norm": 0.443359375, + "learning_rate": 0.00037022434153678756, + "loss": 5.1955, + "step": 581500 + }, + { + "epoch": 3.7251576151310526, + "grad_norm": 0.470703125, + "learning_rate": 0.0003701987390789516, + "loss": 5.1907, + "step": 582000 + }, + { + "epoch": 3.7283579223605465, + "grad_norm": 0.462890625, + "learning_rate": 0.00037017313662111563, + "loss": 5.189, + "step": 582500 + }, + { + "epoch": 3.731558229590041, + "grad_norm": 0.4140625, + "learning_rate": 0.00037014753416327967, + "loss": 5.1898, + "step": 583000 + }, + { + "epoch": 3.734758536819535, + "grad_norm": 0.47265625, + "learning_rate": 0.00037012193170544376, + "loss": 5.1901, + "step": 583500 + }, + { + "epoch": 3.737958844049029, + "grad_norm": 0.439453125, + "learning_rate": 0.0003700963292476078, + "loss": 5.1962, + "step": 584000 + }, + { + "epoch": 3.7411591512785227, + "grad_norm": 0.431640625, + "learning_rate": 0.0003700707267897718, + "loss": 5.1929, + "step": 584500 + }, + { + "epoch": 3.7443594585080167, + "grad_norm": 0.44921875, + "learning_rate": 0.00037004512433193586, + "loss": 5.1978, + "step": 585000 + }, + { + "epoch": 3.7475597657375106, + "grad_norm": 0.44140625, + "learning_rate": 0.0003700195218740999, + "loss": 5.1906, + "step": 585500 + }, + { + "epoch": 3.750760072967005, + "grad_norm": 0.4453125, + "learning_rate": 0.000369993919416264, + "loss": 5.1892, + "step": 586000 + }, + { + "epoch": 3.753960380196499, + "grad_norm": 0.451171875, + "learning_rate": 0.000369968316958428, + "loss": 5.191, + "step": 586500 + }, + { + "epoch": 3.757160687425993, + "grad_norm": 0.470703125, + "learning_rate": 0.0003699427145005921, + "loss": 5.1967, + "step": 587000 + }, + { + "epoch": 3.760360994655487, + "grad_norm": 0.486328125, + "learning_rate": 0.00036991711204275614, + "loss": 5.1956, + "step": 587500 + }, + { + "epoch": 3.763561301884981, + "grad_norm": 0.46484375, + "learning_rate": 0.00036989150958492017, + "loss": 5.1934, + "step": 588000 + }, + { + "epoch": 3.766761609114475, + "grad_norm": 0.5, + "learning_rate": 0.0003698659071270842, + "loss": 5.1938, + "step": 588500 + }, + { + "epoch": 3.769961916343969, + "grad_norm": 0.4609375, + "learning_rate": 0.0003698403046692483, + "loss": 5.1923, + "step": 589000 + }, + { + "epoch": 3.773162223573463, + "grad_norm": 0.44140625, + "learning_rate": 0.00036981470221141233, + "loss": 5.1926, + "step": 589500 + }, + { + "epoch": 3.776362530802957, + "grad_norm": 0.5, + "learning_rate": 0.00036978909975357636, + "loss": 5.1917, + "step": 590000 + }, + { + "epoch": 3.779562838032451, + "grad_norm": 0.427734375, + "learning_rate": 0.0003697634972957404, + "loss": 5.1902, + "step": 590500 + }, + { + "epoch": 3.7827631452619452, + "grad_norm": 0.46875, + "learning_rate": 0.00036973789483790443, + "loss": 5.1998, + "step": 591000 + }, + { + "epoch": 3.785963452491439, + "grad_norm": 0.421875, + "learning_rate": 0.0003697122923800685, + "loss": 5.1894, + "step": 591500 + }, + { + "epoch": 3.789163759720933, + "grad_norm": 0.466796875, + "learning_rate": 0.00036968668992223255, + "loss": 5.1918, + "step": 592000 + }, + { + "epoch": 3.7923640669504275, + "grad_norm": 0.443359375, + "learning_rate": 0.0003696610874643966, + "loss": 5.1876, + "step": 592500 + }, + { + "epoch": 3.7955643741799214, + "grad_norm": 0.51171875, + "learning_rate": 0.0003696354850065606, + "loss": 5.2001, + "step": 593000 + }, + { + "epoch": 3.7987646814094154, + "grad_norm": 0.435546875, + "learning_rate": 0.00036960988254872466, + "loss": 5.1952, + "step": 593500 + }, + { + "epoch": 3.8019649886389093, + "grad_norm": 0.451171875, + "learning_rate": 0.00036958428009088875, + "loss": 5.192, + "step": 594000 + }, + { + "epoch": 3.8051652958684032, + "grad_norm": 0.51171875, + "learning_rate": 0.00036955867763305283, + "loss": 5.1968, + "step": 594500 + }, + { + "epoch": 3.808365603097897, + "grad_norm": 0.455078125, + "learning_rate": 0.00036953307517521687, + "loss": 5.1993, + "step": 595000 + }, + { + "epoch": 3.8115659103273916, + "grad_norm": 0.44921875, + "learning_rate": 0.0003695074727173809, + "loss": 5.1949, + "step": 595500 + }, + { + "epoch": 3.8147662175568855, + "grad_norm": 0.50390625, + "learning_rate": 0.00036948187025954494, + "loss": 5.1955, + "step": 596000 + }, + { + "epoch": 3.8179665247863794, + "grad_norm": 0.4375, + "learning_rate": 0.00036945626780170897, + "loss": 5.1966, + "step": 596500 + }, + { + "epoch": 3.8211668320158734, + "grad_norm": 0.5078125, + "learning_rate": 0.00036943066534387306, + "loss": 5.1922, + "step": 597000 + }, + { + "epoch": 3.8243671392453678, + "grad_norm": 0.453125, + "learning_rate": 0.0003694050628860371, + "loss": 5.188, + "step": 597500 + }, + { + "epoch": 3.8275674464748617, + "grad_norm": 0.404296875, + "learning_rate": 0.00036937946042820113, + "loss": 5.1951, + "step": 598000 + }, + { + "epoch": 3.8307677537043556, + "grad_norm": 0.404296875, + "learning_rate": 0.00036935385797036516, + "loss": 5.1962, + "step": 598500 + }, + { + "epoch": 3.8339680609338496, + "grad_norm": 0.4609375, + "learning_rate": 0.0003693282555125292, + "loss": 5.1961, + "step": 599000 + }, + { + "epoch": 3.8371683681633435, + "grad_norm": 0.4296875, + "learning_rate": 0.00036930265305469323, + "loss": 5.1996, + "step": 599500 + }, + { + "epoch": 3.8403686753928374, + "grad_norm": 0.427734375, + "learning_rate": 0.0003692770505968573, + "loss": 5.1948, + "step": 600000 + }, + { + "epoch": 3.843568982622332, + "grad_norm": 0.46484375, + "learning_rate": 0.00036925144813902135, + "loss": 5.1891, + "step": 600500 + }, + { + "epoch": 3.8467692898518258, + "grad_norm": 0.48046875, + "learning_rate": 0.0003692258456811854, + "loss": 5.1954, + "step": 601000 + }, + { + "epoch": 3.8499695970813197, + "grad_norm": 0.443359375, + "learning_rate": 0.0003692002432233495, + "loss": 5.1881, + "step": 601500 + }, + { + "epoch": 3.853169904310814, + "grad_norm": 0.4296875, + "learning_rate": 0.0003691746407655135, + "loss": 5.1938, + "step": 602000 + }, + { + "epoch": 3.856370211540308, + "grad_norm": 0.447265625, + "learning_rate": 0.0003691490383076776, + "loss": 5.1946, + "step": 602500 + }, + { + "epoch": 3.859570518769802, + "grad_norm": 0.46484375, + "learning_rate": 0.00036912343584984163, + "loss": 5.1926, + "step": 603000 + }, + { + "epoch": 3.862770825999296, + "grad_norm": 0.43359375, + "learning_rate": 0.00036909783339200567, + "loss": 5.1925, + "step": 603500 + }, + { + "epoch": 3.86597113322879, + "grad_norm": 0.435546875, + "learning_rate": 0.0003690722309341697, + "loss": 5.1956, + "step": 604000 + }, + { + "epoch": 3.8691714404582838, + "grad_norm": 0.46484375, + "learning_rate": 0.00036904662847633374, + "loss": 5.1917, + "step": 604500 + }, + { + "epoch": 3.872371747687778, + "grad_norm": 0.51953125, + "learning_rate": 0.00036902102601849777, + "loss": 5.1941, + "step": 605000 + }, + { + "epoch": 3.875572054917272, + "grad_norm": 0.478515625, + "learning_rate": 0.00036899542356066186, + "loss": 5.196, + "step": 605500 + }, + { + "epoch": 3.878772362146766, + "grad_norm": 0.5, + "learning_rate": 0.0003689698211028259, + "loss": 5.1926, + "step": 606000 + }, + { + "epoch": 3.88197266937626, + "grad_norm": 0.490234375, + "learning_rate": 0.00036894421864498993, + "loss": 5.1966, + "step": 606500 + }, + { + "epoch": 3.8851729766057543, + "grad_norm": 0.41796875, + "learning_rate": 0.00036891861618715396, + "loss": 5.1931, + "step": 607000 + }, + { + "epoch": 3.8883732838352483, + "grad_norm": 0.43359375, + "learning_rate": 0.000368893013729318, + "loss": 5.1937, + "step": 607500 + }, + { + "epoch": 3.891573591064742, + "grad_norm": 0.466796875, + "learning_rate": 0.0003688674112714821, + "loss": 5.195, + "step": 608000 + }, + { + "epoch": 3.894773898294236, + "grad_norm": 0.48046875, + "learning_rate": 0.0003688418088136461, + "loss": 5.1918, + "step": 608500 + }, + { + "epoch": 3.89797420552373, + "grad_norm": 0.462890625, + "learning_rate": 0.00036881620635581015, + "loss": 5.1942, + "step": 609000 + }, + { + "epoch": 3.9011745127532245, + "grad_norm": 0.466796875, + "learning_rate": 0.00036879060389797424, + "loss": 5.1918, + "step": 609500 + }, + { + "epoch": 3.9043748199827184, + "grad_norm": 0.44140625, + "learning_rate": 0.0003687650014401383, + "loss": 5.1955, + "step": 610000 + }, + { + "epoch": 3.9075751272122123, + "grad_norm": 0.41015625, + "learning_rate": 0.0003687393989823023, + "loss": 5.19, + "step": 610500 + }, + { + "epoch": 3.9107754344417063, + "grad_norm": 0.4453125, + "learning_rate": 0.0003687137965244664, + "loss": 5.1947, + "step": 611000 + }, + { + "epoch": 3.9139757416712007, + "grad_norm": 0.4765625, + "learning_rate": 0.00036868819406663043, + "loss": 5.1975, + "step": 611500 + }, + { + "epoch": 3.9171760489006946, + "grad_norm": 0.443359375, + "learning_rate": 0.00036866259160879447, + "loss": 5.1929, + "step": 612000 + }, + { + "epoch": 3.9203763561301885, + "grad_norm": 0.42578125, + "learning_rate": 0.0003686369891509585, + "loss": 5.1986, + "step": 612500 + }, + { + "epoch": 3.9235766633596825, + "grad_norm": 0.453125, + "learning_rate": 0.00036861138669312254, + "loss": 5.1918, + "step": 613000 + }, + { + "epoch": 3.9267769705891764, + "grad_norm": 0.435546875, + "learning_rate": 0.0003685857842352866, + "loss": 5.1931, + "step": 613500 + }, + { + "epoch": 3.9299772778186703, + "grad_norm": 0.4453125, + "learning_rate": 0.00036856018177745066, + "loss": 5.1948, + "step": 614000 + }, + { + "epoch": 3.9331775850481647, + "grad_norm": 0.439453125, + "learning_rate": 0.0003685345793196147, + "loss": 5.1881, + "step": 614500 + }, + { + "epoch": 3.9363778922776587, + "grad_norm": 0.42578125, + "learning_rate": 0.0003685089768617787, + "loss": 5.1954, + "step": 615000 + }, + { + "epoch": 3.9395781995071526, + "grad_norm": 0.4453125, + "learning_rate": 0.00036848337440394276, + "loss": 5.1983, + "step": 615500 + }, + { + "epoch": 3.942778506736647, + "grad_norm": 0.470703125, + "learning_rate": 0.00036845777194610685, + "loss": 5.192, + "step": 616000 + }, + { + "epoch": 3.945978813966141, + "grad_norm": 0.4296875, + "learning_rate": 0.0003684321694882709, + "loss": 5.1921, + "step": 616500 + }, + { + "epoch": 3.949179121195635, + "grad_norm": 0.43359375, + "learning_rate": 0.00036840656703043497, + "loss": 5.193, + "step": 617000 + }, + { + "epoch": 3.952379428425129, + "grad_norm": 0.51171875, + "learning_rate": 0.000368380964572599, + "loss": 5.1931, + "step": 617500 + }, + { + "epoch": 3.9555797356546227, + "grad_norm": 0.44140625, + "learning_rate": 0.00036835536211476304, + "loss": 5.1898, + "step": 618000 + }, + { + "epoch": 3.9587800428841167, + "grad_norm": 0.46875, + "learning_rate": 0.0003683297596569271, + "loss": 5.1993, + "step": 618500 + }, + { + "epoch": 3.961980350113611, + "grad_norm": 0.45703125, + "learning_rate": 0.00036830415719909116, + "loss": 5.1947, + "step": 619000 + }, + { + "epoch": 3.965180657343105, + "grad_norm": 0.462890625, + "learning_rate": 0.0003682785547412552, + "loss": 5.1963, + "step": 619500 + }, + { + "epoch": 3.968380964572599, + "grad_norm": 0.49609375, + "learning_rate": 0.00036825295228341923, + "loss": 5.1975, + "step": 620000 + }, + { + "epoch": 3.971581271802093, + "grad_norm": 0.478515625, + "learning_rate": 0.00036822734982558327, + "loss": 5.195, + "step": 620500 + }, + { + "epoch": 3.9747815790315872, + "grad_norm": 0.47265625, + "learning_rate": 0.0003682017473677473, + "loss": 5.1945, + "step": 621000 + }, + { + "epoch": 3.977981886261081, + "grad_norm": 0.45703125, + "learning_rate": 0.0003681761449099114, + "loss": 5.1949, + "step": 621500 + }, + { + "epoch": 3.981182193490575, + "grad_norm": 0.43359375, + "learning_rate": 0.0003681505424520754, + "loss": 5.1965, + "step": 622000 + }, + { + "epoch": 3.984382500720069, + "grad_norm": 0.453125, + "learning_rate": 0.00036812493999423946, + "loss": 5.1927, + "step": 622500 + }, + { + "epoch": 3.987582807949563, + "grad_norm": 0.48828125, + "learning_rate": 0.0003680993375364035, + "loss": 5.1926, + "step": 623000 + }, + { + "epoch": 3.990783115179057, + "grad_norm": 0.453125, + "learning_rate": 0.0003680737350785675, + "loss": 5.1971, + "step": 623500 + }, + { + "epoch": 3.9939834224085513, + "grad_norm": 0.4375, + "learning_rate": 0.0003680481326207316, + "loss": 5.1945, + "step": 624000 + }, + { + "epoch": 3.9971837296380452, + "grad_norm": 0.45703125, + "learning_rate": 0.00036802253016289565, + "loss": 5.1938, + "step": 624500 + }, + { + "epoch": 4.0, + "eval_loss": 5.188127040863037, + "eval_runtime": 1.1918, + "eval_samples_per_second": 839.075, + "eval_steps_per_second": 13.425, + "step": 624940 + }, + { + "epoch": 4.00038403686754, + "grad_norm": 0.431640625, + "learning_rate": 0.00036799692770505974, + "loss": 5.1936, + "step": 625000 + }, + { + "epoch": 4.003584344097034, + "grad_norm": 0.44140625, + "learning_rate": 0.00036797132524722377, + "loss": 5.1842, + "step": 625500 + }, + { + "epoch": 4.0067846513265275, + "grad_norm": 0.443359375, + "learning_rate": 0.0003679457227893878, + "loss": 5.1866, + "step": 626000 + }, + { + "epoch": 4.009984958556021, + "grad_norm": 0.490234375, + "learning_rate": 0.00036792012033155184, + "loss": 5.1875, + "step": 626500 + }, + { + "epoch": 4.013185265785515, + "grad_norm": 0.435546875, + "learning_rate": 0.00036789451787371593, + "loss": 5.1882, + "step": 627000 + }, + { + "epoch": 4.016385573015009, + "grad_norm": 0.431640625, + "learning_rate": 0.00036786891541587996, + "loss": 5.1898, + "step": 627500 + }, + { + "epoch": 4.019585880244503, + "grad_norm": 0.484375, + "learning_rate": 0.000367843312958044, + "loss": 5.1909, + "step": 628000 + }, + { + "epoch": 4.022786187473997, + "grad_norm": 0.4765625, + "learning_rate": 0.00036781771050020803, + "loss": 5.1898, + "step": 628500 + }, + { + "epoch": 4.025986494703491, + "grad_norm": 0.4453125, + "learning_rate": 0.00036779210804237206, + "loss": 5.1957, + "step": 629000 + }, + { + "epoch": 4.029186801932986, + "grad_norm": 0.49609375, + "learning_rate": 0.0003677665055845361, + "loss": 5.1845, + "step": 629500 + }, + { + "epoch": 4.03238710916248, + "grad_norm": 0.470703125, + "learning_rate": 0.0003677409031267002, + "loss": 5.1906, + "step": 630000 + }, + { + "epoch": 4.035587416391974, + "grad_norm": 0.42578125, + "learning_rate": 0.0003677153006688642, + "loss": 5.1994, + "step": 630500 + }, + { + "epoch": 4.038787723621468, + "grad_norm": 0.462890625, + "learning_rate": 0.00036768969821102826, + "loss": 5.1873, + "step": 631000 + }, + { + "epoch": 4.041988030850962, + "grad_norm": 0.50390625, + "learning_rate": 0.0003676640957531923, + "loss": 5.1878, + "step": 631500 + }, + { + "epoch": 4.045188338080456, + "grad_norm": 0.44140625, + "learning_rate": 0.0003676384932953564, + "loss": 5.1948, + "step": 632000 + }, + { + "epoch": 4.04838864530995, + "grad_norm": 0.466796875, + "learning_rate": 0.00036761289083752047, + "loss": 5.1922, + "step": 632500 + }, + { + "epoch": 4.0515889525394435, + "grad_norm": 0.455078125, + "learning_rate": 0.0003675872883796845, + "loss": 5.1802, + "step": 633000 + }, + { + "epoch": 4.0547892597689374, + "grad_norm": 0.443359375, + "learning_rate": 0.00036756168592184854, + "loss": 5.1917, + "step": 633500 + }, + { + "epoch": 4.057989566998432, + "grad_norm": 0.494140625, + "learning_rate": 0.00036753608346401257, + "loss": 5.1942, + "step": 634000 + }, + { + "epoch": 4.061189874227926, + "grad_norm": 0.48828125, + "learning_rate": 0.0003675104810061766, + "loss": 5.1912, + "step": 634500 + }, + { + "epoch": 4.06439018145742, + "grad_norm": 0.494140625, + "learning_rate": 0.00036748487854834064, + "loss": 5.186, + "step": 635000 + }, + { + "epoch": 4.067590488686914, + "grad_norm": 0.47265625, + "learning_rate": 0.0003674592760905047, + "loss": 5.1892, + "step": 635500 + }, + { + "epoch": 4.070790795916408, + "grad_norm": 0.4609375, + "learning_rate": 0.00036743367363266876, + "loss": 5.1912, + "step": 636000 + }, + { + "epoch": 4.073991103145902, + "grad_norm": 0.4453125, + "learning_rate": 0.0003674080711748328, + "loss": 5.1944, + "step": 636500 + }, + { + "epoch": 4.077191410375396, + "grad_norm": 0.447265625, + "learning_rate": 0.00036738246871699683, + "loss": 5.1918, + "step": 637000 + }, + { + "epoch": 4.08039171760489, + "grad_norm": 0.46484375, + "learning_rate": 0.00036735686625916086, + "loss": 5.1935, + "step": 637500 + }, + { + "epoch": 4.083592024834384, + "grad_norm": 0.455078125, + "learning_rate": 0.00036733126380132495, + "loss": 5.1961, + "step": 638000 + }, + { + "epoch": 4.086792332063878, + "grad_norm": 0.439453125, + "learning_rate": 0.000367305661343489, + "loss": 5.1916, + "step": 638500 + }, + { + "epoch": 4.0899926392933725, + "grad_norm": 0.490234375, + "learning_rate": 0.000367280058885653, + "loss": 5.1949, + "step": 639000 + }, + { + "epoch": 4.0931929465228665, + "grad_norm": 0.451171875, + "learning_rate": 0.0003672544564278171, + "loss": 5.1934, + "step": 639500 + }, + { + "epoch": 4.09639325375236, + "grad_norm": 0.447265625, + "learning_rate": 0.00036722885396998114, + "loss": 5.1895, + "step": 640000 + }, + { + "epoch": 4.099593560981854, + "grad_norm": 0.490234375, + "learning_rate": 0.0003672032515121452, + "loss": 5.193, + "step": 640500 + }, + { + "epoch": 4.102793868211348, + "grad_norm": 0.498046875, + "learning_rate": 0.00036717764905430927, + "loss": 5.1871, + "step": 641000 + }, + { + "epoch": 4.105994175440842, + "grad_norm": 0.451171875, + "learning_rate": 0.0003671520465964733, + "loss": 5.1882, + "step": 641500 + }, + { + "epoch": 4.109194482670336, + "grad_norm": 0.50390625, + "learning_rate": 0.00036712644413863733, + "loss": 5.1871, + "step": 642000 + }, + { + "epoch": 4.11239478989983, + "grad_norm": 0.462890625, + "learning_rate": 0.00036710084168080137, + "loss": 5.1922, + "step": 642500 + }, + { + "epoch": 4.115595097129324, + "grad_norm": 0.443359375, + "learning_rate": 0.0003670752392229654, + "loss": 5.186, + "step": 643000 + }, + { + "epoch": 4.118795404358819, + "grad_norm": 0.412109375, + "learning_rate": 0.0003670496367651295, + "loss": 5.1933, + "step": 643500 + }, + { + "epoch": 4.121995711588313, + "grad_norm": 0.486328125, + "learning_rate": 0.0003670240343072935, + "loss": 5.1924, + "step": 644000 + }, + { + "epoch": 4.125196018817807, + "grad_norm": 0.466796875, + "learning_rate": 0.00036699843184945756, + "loss": 5.1947, + "step": 644500 + }, + { + "epoch": 4.128396326047301, + "grad_norm": 0.43359375, + "learning_rate": 0.0003669728293916216, + "loss": 5.2002, + "step": 645000 + }, + { + "epoch": 4.131596633276795, + "grad_norm": 0.40234375, + "learning_rate": 0.00036694722693378563, + "loss": 5.1989, + "step": 645500 + }, + { + "epoch": 4.1347969405062885, + "grad_norm": 0.42578125, + "learning_rate": 0.00036692162447594966, + "loss": 5.1922, + "step": 646000 + }, + { + "epoch": 4.1379972477357825, + "grad_norm": 0.55078125, + "learning_rate": 0.00036689602201811375, + "loss": 5.1884, + "step": 646500 + }, + { + "epoch": 4.141197554965276, + "grad_norm": 0.474609375, + "learning_rate": 0.0003668704195602778, + "loss": 5.1934, + "step": 647000 + }, + { + "epoch": 4.14439786219477, + "grad_norm": 0.47265625, + "learning_rate": 0.0003668448171024419, + "loss": 5.1914, + "step": 647500 + }, + { + "epoch": 4.147598169424265, + "grad_norm": 0.4453125, + "learning_rate": 0.0003668192146446059, + "loss": 5.1949, + "step": 648000 + }, + { + "epoch": 4.150798476653759, + "grad_norm": 0.45703125, + "learning_rate": 0.00036679361218676994, + "loss": 5.1976, + "step": 648500 + }, + { + "epoch": 4.153998783883253, + "grad_norm": 0.43359375, + "learning_rate": 0.00036676800972893403, + "loss": 5.1909, + "step": 649000 + }, + { + "epoch": 4.157199091112747, + "grad_norm": 0.458984375, + "learning_rate": 0.00036674240727109806, + "loss": 5.1887, + "step": 649500 + }, + { + "epoch": 4.160399398342241, + "grad_norm": 0.4609375, + "learning_rate": 0.0003667168048132621, + "loss": 5.1877, + "step": 650000 + }, + { + "epoch": 4.163599705571735, + "grad_norm": 0.447265625, + "learning_rate": 0.00036669120235542613, + "loss": 5.1908, + "step": 650500 + }, + { + "epoch": 4.166800012801229, + "grad_norm": 0.52734375, + "learning_rate": 0.00036666559989759017, + "loss": 5.1921, + "step": 651000 + }, + { + "epoch": 4.170000320030723, + "grad_norm": 0.52734375, + "learning_rate": 0.0003666399974397542, + "loss": 5.1892, + "step": 651500 + }, + { + "epoch": 4.173200627260217, + "grad_norm": 0.48828125, + "learning_rate": 0.0003666143949819183, + "loss": 5.1924, + "step": 652000 + }, + { + "epoch": 4.176400934489711, + "grad_norm": 0.52734375, + "learning_rate": 0.0003665887925240823, + "loss": 5.195, + "step": 652500 + }, + { + "epoch": 4.179601241719205, + "grad_norm": 0.4609375, + "learning_rate": 0.00036656319006624636, + "loss": 5.1932, + "step": 653000 + }, + { + "epoch": 4.182801548948699, + "grad_norm": 0.4609375, + "learning_rate": 0.0003665375876084104, + "loss": 5.1947, + "step": 653500 + }, + { + "epoch": 4.186001856178193, + "grad_norm": 0.46875, + "learning_rate": 0.0003665119851505745, + "loss": 5.1889, + "step": 654000 + }, + { + "epoch": 4.189202163407687, + "grad_norm": 0.470703125, + "learning_rate": 0.0003664863826927385, + "loss": 5.1934, + "step": 654500 + }, + { + "epoch": 4.192402470637181, + "grad_norm": 0.44921875, + "learning_rate": 0.0003664607802349026, + "loss": 5.1974, + "step": 655000 + }, + { + "epoch": 4.195602777866675, + "grad_norm": 0.48828125, + "learning_rate": 0.00036643517777706664, + "loss": 5.1908, + "step": 655500 + }, + { + "epoch": 4.198803085096169, + "grad_norm": 0.423828125, + "learning_rate": 0.00036640957531923067, + "loss": 5.1891, + "step": 656000 + }, + { + "epoch": 4.202003392325663, + "grad_norm": 0.47265625, + "learning_rate": 0.0003663839728613947, + "loss": 5.1873, + "step": 656500 + }, + { + "epoch": 4.205203699555157, + "grad_norm": 0.462890625, + "learning_rate": 0.00036635837040355874, + "loss": 5.1883, + "step": 657000 + }, + { + "epoch": 4.208404006784651, + "grad_norm": 0.435546875, + "learning_rate": 0.00036633276794572283, + "loss": 5.1928, + "step": 657500 + }, + { + "epoch": 4.211604314014146, + "grad_norm": 0.50390625, + "learning_rate": 0.00036630716548788686, + "loss": 5.1884, + "step": 658000 + }, + { + "epoch": 4.21480462124364, + "grad_norm": 0.5078125, + "learning_rate": 0.0003662815630300509, + "loss": 5.1933, + "step": 658500 + }, + { + "epoch": 4.218004928473134, + "grad_norm": 0.52734375, + "learning_rate": 0.00036625596057221493, + "loss": 5.1924, + "step": 659000 + }, + { + "epoch": 4.2212052357026275, + "grad_norm": 0.462890625, + "learning_rate": 0.00036623035811437897, + "loss": 5.1907, + "step": 659500 + }, + { + "epoch": 4.2244055429321214, + "grad_norm": 0.474609375, + "learning_rate": 0.00036620475565654306, + "loss": 5.1954, + "step": 660000 + }, + { + "epoch": 4.227605850161615, + "grad_norm": 0.451171875, + "learning_rate": 0.0003661791531987071, + "loss": 5.1918, + "step": 660500 + }, + { + "epoch": 4.230806157391109, + "grad_norm": 0.4375, + "learning_rate": 0.0003661535507408711, + "loss": 5.1888, + "step": 661000 + }, + { + "epoch": 4.234006464620603, + "grad_norm": 0.47265625, + "learning_rate": 0.00036612794828303516, + "loss": 5.1945, + "step": 661500 + }, + { + "epoch": 4.237206771850097, + "grad_norm": 0.46875, + "learning_rate": 0.00036610234582519925, + "loss": 5.1909, + "step": 662000 + }, + { + "epoch": 4.240407079079592, + "grad_norm": 0.423828125, + "learning_rate": 0.0003660767433673633, + "loss": 5.1898, + "step": 662500 + }, + { + "epoch": 4.243607386309086, + "grad_norm": 0.447265625, + "learning_rate": 0.00036605114090952737, + "loss": 5.1976, + "step": 663000 + }, + { + "epoch": 4.24680769353858, + "grad_norm": 0.5078125, + "learning_rate": 0.0003660255384516914, + "loss": 5.1939, + "step": 663500 + }, + { + "epoch": 4.250008000768074, + "grad_norm": 0.5078125, + "learning_rate": 0.00036599993599385544, + "loss": 5.1878, + "step": 664000 + }, + { + "epoch": 4.253208307997568, + "grad_norm": 0.447265625, + "learning_rate": 0.00036597433353601947, + "loss": 5.1858, + "step": 664500 + }, + { + "epoch": 4.256408615227062, + "grad_norm": 0.5078125, + "learning_rate": 0.0003659487310781835, + "loss": 5.1895, + "step": 665000 + }, + { + "epoch": 4.259608922456556, + "grad_norm": 0.4296875, + "learning_rate": 0.0003659231286203476, + "loss": 5.1937, + "step": 665500 + }, + { + "epoch": 4.26280922968605, + "grad_norm": 0.5078125, + "learning_rate": 0.00036589752616251163, + "loss": 5.1913, + "step": 666000 + }, + { + "epoch": 4.2660095369155435, + "grad_norm": 0.44140625, + "learning_rate": 0.00036587192370467566, + "loss": 5.191, + "step": 666500 + }, + { + "epoch": 4.269209844145038, + "grad_norm": 0.482421875, + "learning_rate": 0.0003658463212468397, + "loss": 5.1996, + "step": 667000 + }, + { + "epoch": 4.272410151374532, + "grad_norm": 0.466796875, + "learning_rate": 0.00036582071878900373, + "loss": 5.1936, + "step": 667500 + }, + { + "epoch": 4.275610458604026, + "grad_norm": 0.51171875, + "learning_rate": 0.0003657951163311678, + "loss": 5.1897, + "step": 668000 + }, + { + "epoch": 4.27881076583352, + "grad_norm": 0.470703125, + "learning_rate": 0.00036576951387333185, + "loss": 5.1917, + "step": 668500 + }, + { + "epoch": 4.282011073063014, + "grad_norm": 0.447265625, + "learning_rate": 0.0003657439114154959, + "loss": 5.1918, + "step": 669000 + }, + { + "epoch": 4.285211380292508, + "grad_norm": 0.486328125, + "learning_rate": 0.00036571830895766, + "loss": 5.1907, + "step": 669500 + }, + { + "epoch": 4.288411687522002, + "grad_norm": 0.466796875, + "learning_rate": 0.000365692706499824, + "loss": 5.1899, + "step": 670000 + }, + { + "epoch": 4.291611994751496, + "grad_norm": 0.482421875, + "learning_rate": 0.00036566710404198805, + "loss": 5.1958, + "step": 670500 + }, + { + "epoch": 4.29481230198099, + "grad_norm": 0.4609375, + "learning_rate": 0.00036564150158415213, + "loss": 5.1896, + "step": 671000 + }, + { + "epoch": 4.298012609210485, + "grad_norm": 0.48828125, + "learning_rate": 0.00036561589912631617, + "loss": 5.197, + "step": 671500 + }, + { + "epoch": 4.301212916439979, + "grad_norm": 0.423828125, + "learning_rate": 0.0003655902966684802, + "loss": 5.1875, + "step": 672000 + }, + { + "epoch": 4.3044132236694725, + "grad_norm": 0.51171875, + "learning_rate": 0.00036556469421064424, + "loss": 5.1866, + "step": 672500 + }, + { + "epoch": 4.3076135308989665, + "grad_norm": 0.41015625, + "learning_rate": 0.00036553909175280827, + "loss": 5.1938, + "step": 673000 + }, + { + "epoch": 4.31081383812846, + "grad_norm": 0.443359375, + "learning_rate": 0.00036551348929497236, + "loss": 5.1941, + "step": 673500 + }, + { + "epoch": 4.314014145357954, + "grad_norm": 0.46484375, + "learning_rate": 0.0003654878868371364, + "loss": 5.1871, + "step": 674000 + }, + { + "epoch": 4.317214452587448, + "grad_norm": 0.458984375, + "learning_rate": 0.00036546228437930043, + "loss": 5.194, + "step": 674500 + }, + { + "epoch": 4.320414759816942, + "grad_norm": 0.47265625, + "learning_rate": 0.00036543668192146446, + "loss": 5.1944, + "step": 675000 + }, + { + "epoch": 4.323615067046436, + "grad_norm": 0.53515625, + "learning_rate": 0.0003654110794636285, + "loss": 5.1897, + "step": 675500 + }, + { + "epoch": 4.32681537427593, + "grad_norm": 0.470703125, + "learning_rate": 0.00036538547700579253, + "loss": 5.195, + "step": 676000 + }, + { + "epoch": 4.330015681505425, + "grad_norm": 0.46875, + "learning_rate": 0.0003653598745479566, + "loss": 5.1907, + "step": 676500 + }, + { + "epoch": 4.333215988734919, + "grad_norm": 0.462890625, + "learning_rate": 0.00036533427209012065, + "loss": 5.1874, + "step": 677000 + }, + { + "epoch": 4.336416295964413, + "grad_norm": 0.45703125, + "learning_rate": 0.00036530866963228474, + "loss": 5.1906, + "step": 677500 + }, + { + "epoch": 4.339616603193907, + "grad_norm": 0.458984375, + "learning_rate": 0.0003652830671744488, + "loss": 5.1959, + "step": 678000 + }, + { + "epoch": 4.342816910423401, + "grad_norm": 0.486328125, + "learning_rate": 0.0003652574647166128, + "loss": 5.1941, + "step": 678500 + }, + { + "epoch": 4.346017217652895, + "grad_norm": 0.474609375, + "learning_rate": 0.0003652318622587769, + "loss": 5.1884, + "step": 679000 + }, + { + "epoch": 4.3492175248823886, + "grad_norm": 0.43359375, + "learning_rate": 0.00036520625980094093, + "loss": 5.19, + "step": 679500 + }, + { + "epoch": 4.3524178321118825, + "grad_norm": 0.5, + "learning_rate": 0.00036518065734310497, + "loss": 5.1904, + "step": 680000 + }, + { + "epoch": 4.355618139341376, + "grad_norm": 0.44140625, + "learning_rate": 0.000365155054885269, + "loss": 5.1916, + "step": 680500 + }, + { + "epoch": 4.35881844657087, + "grad_norm": 0.46484375, + "learning_rate": 0.00036512945242743304, + "loss": 5.1923, + "step": 681000 + }, + { + "epoch": 4.362018753800365, + "grad_norm": 0.490234375, + "learning_rate": 0.00036510384996959707, + "loss": 5.1895, + "step": 681500 + }, + { + "epoch": 4.365219061029859, + "grad_norm": 0.494140625, + "learning_rate": 0.00036507824751176116, + "loss": 5.1908, + "step": 682000 + }, + { + "epoch": 4.368419368259353, + "grad_norm": 0.44140625, + "learning_rate": 0.0003650526450539252, + "loss": 5.1916, + "step": 682500 + }, + { + "epoch": 4.371619675488847, + "grad_norm": 0.46484375, + "learning_rate": 0.0003650270425960892, + "loss": 5.1936, + "step": 683000 + }, + { + "epoch": 4.374819982718341, + "grad_norm": 0.478515625, + "learning_rate": 0.00036500144013825326, + "loss": 5.1916, + "step": 683500 + }, + { + "epoch": 4.378020289947835, + "grad_norm": 0.46484375, + "learning_rate": 0.0003649758376804173, + "loss": 5.1896, + "step": 684000 + }, + { + "epoch": 4.381220597177329, + "grad_norm": 0.486328125, + "learning_rate": 0.0003649502352225814, + "loss": 5.1947, + "step": 684500 + }, + { + "epoch": 4.384420904406823, + "grad_norm": 0.453125, + "learning_rate": 0.0003649246327647454, + "loss": 5.1908, + "step": 685000 + }, + { + "epoch": 4.387621211636317, + "grad_norm": 0.455078125, + "learning_rate": 0.0003648990303069095, + "loss": 5.1974, + "step": 685500 + }, + { + "epoch": 4.3908215188658115, + "grad_norm": 0.515625, + "learning_rate": 0.00036487342784907354, + "loss": 5.1908, + "step": 686000 + }, + { + "epoch": 4.3940218260953054, + "grad_norm": 0.486328125, + "learning_rate": 0.0003648478253912376, + "loss": 5.1929, + "step": 686500 + }, + { + "epoch": 4.397222133324799, + "grad_norm": 0.51171875, + "learning_rate": 0.0003648222229334016, + "loss": 5.1903, + "step": 687000 + }, + { + "epoch": 4.400422440554293, + "grad_norm": 0.5546875, + "learning_rate": 0.0003647966204755657, + "loss": 5.1931, + "step": 687500 + }, + { + "epoch": 4.403622747783787, + "grad_norm": 0.4609375, + "learning_rate": 0.00036477101801772973, + "loss": 5.192, + "step": 688000 + }, + { + "epoch": 4.406823055013281, + "grad_norm": 0.54296875, + "learning_rate": 0.00036474541555989377, + "loss": 5.1902, + "step": 688500 + }, + { + "epoch": 4.410023362242775, + "grad_norm": 0.455078125, + "learning_rate": 0.0003647198131020578, + "loss": 5.1876, + "step": 689000 + }, + { + "epoch": 4.413223669472269, + "grad_norm": 0.486328125, + "learning_rate": 0.00036469421064422183, + "loss": 5.1888, + "step": 689500 + }, + { + "epoch": 4.416423976701763, + "grad_norm": 0.453125, + "learning_rate": 0.0003646686081863859, + "loss": 5.1934, + "step": 690000 + }, + { + "epoch": 4.419624283931258, + "grad_norm": 0.490234375, + "learning_rate": 0.00036464300572854996, + "loss": 5.1933, + "step": 690500 + }, + { + "epoch": 4.422824591160752, + "grad_norm": 0.453125, + "learning_rate": 0.000364617403270714, + "loss": 5.1894, + "step": 691000 + }, + { + "epoch": 4.426024898390246, + "grad_norm": 0.53515625, + "learning_rate": 0.000364591800812878, + "loss": 5.1912, + "step": 691500 + }, + { + "epoch": 4.42922520561974, + "grad_norm": 0.421875, + "learning_rate": 0.0003645661983550421, + "loss": 5.1926, + "step": 692000 + }, + { + "epoch": 4.432425512849234, + "grad_norm": 0.455078125, + "learning_rate": 0.00036454059589720615, + "loss": 5.1901, + "step": 692500 + }, + { + "epoch": 4.4356258200787275, + "grad_norm": 0.482421875, + "learning_rate": 0.00036451499343937024, + "loss": 5.1939, + "step": 693000 + }, + { + "epoch": 4.4388261273082215, + "grad_norm": 0.5078125, + "learning_rate": 0.00036448939098153427, + "loss": 5.1953, + "step": 693500 + }, + { + "epoch": 4.442026434537715, + "grad_norm": 0.5234375, + "learning_rate": 0.0003644637885236983, + "loss": 5.1922, + "step": 694000 + }, + { + "epoch": 4.445226741767209, + "grad_norm": 0.515625, + "learning_rate": 0.00036443818606586234, + "loss": 5.1988, + "step": 694500 + }, + { + "epoch": 4.448427048996703, + "grad_norm": 0.4921875, + "learning_rate": 0.0003644125836080264, + "loss": 5.1935, + "step": 695000 + }, + { + "epoch": 4.451627356226198, + "grad_norm": 0.61328125, + "learning_rate": 0.00036438698115019046, + "loss": 5.1963, + "step": 695500 + }, + { + "epoch": 4.454827663455692, + "grad_norm": 0.45703125, + "learning_rate": 0.0003643613786923545, + "loss": 5.1938, + "step": 696000 + }, + { + "epoch": 4.458027970685186, + "grad_norm": 0.5078125, + "learning_rate": 0.00036433577623451853, + "loss": 5.1942, + "step": 696500 + }, + { + "epoch": 4.46122827791468, + "grad_norm": 0.455078125, + "learning_rate": 0.00036431017377668256, + "loss": 5.1892, + "step": 697000 + }, + { + "epoch": 4.464428585144174, + "grad_norm": 0.51171875, + "learning_rate": 0.0003642845713188466, + "loss": 5.1923, + "step": 697500 + }, + { + "epoch": 4.467628892373668, + "grad_norm": 0.490234375, + "learning_rate": 0.0003642589688610107, + "loss": 5.1957, + "step": 698000 + }, + { + "epoch": 4.470829199603162, + "grad_norm": 0.498046875, + "learning_rate": 0.0003642333664031747, + "loss": 5.1982, + "step": 698500 + }, + { + "epoch": 4.474029506832656, + "grad_norm": 0.4765625, + "learning_rate": 0.00036420776394533876, + "loss": 5.194, + "step": 699000 + }, + { + "epoch": 4.47722981406215, + "grad_norm": 0.4375, + "learning_rate": 0.0003641821614875028, + "loss": 5.1964, + "step": 699500 + }, + { + "epoch": 4.4804301212916435, + "grad_norm": 0.484375, + "learning_rate": 0.0003641565590296669, + "loss": 5.1911, + "step": 700000 + }, + { + "epoch": 4.483630428521138, + "grad_norm": 0.4453125, + "learning_rate": 0.0003641309565718309, + "loss": 5.195, + "step": 700500 + }, + { + "epoch": 4.486830735750632, + "grad_norm": 0.51953125, + "learning_rate": 0.000364105354113995, + "loss": 5.1903, + "step": 701000 + }, + { + "epoch": 4.490031042980126, + "grad_norm": 0.453125, + "learning_rate": 0.00036407975165615904, + "loss": 5.2005, + "step": 701500 + }, + { + "epoch": 4.49323135020962, + "grad_norm": 0.478515625, + "learning_rate": 0.00036405414919832307, + "loss": 5.1917, + "step": 702000 + }, + { + "epoch": 4.496431657439114, + "grad_norm": 0.453125, + "learning_rate": 0.0003640285467404871, + "loss": 5.1933, + "step": 702500 + }, + { + "epoch": 4.499631964668608, + "grad_norm": 0.46484375, + "learning_rate": 0.00036400294428265114, + "loss": 5.1886, + "step": 703000 + }, + { + "epoch": 4.502832271898102, + "grad_norm": 0.443359375, + "learning_rate": 0.0003639773418248152, + "loss": 5.1913, + "step": 703500 + }, + { + "epoch": 4.506032579127596, + "grad_norm": 0.46875, + "learning_rate": 0.00036395173936697926, + "loss": 5.2004, + "step": 704000 + }, + { + "epoch": 4.50923288635709, + "grad_norm": 0.447265625, + "learning_rate": 0.0003639261369091433, + "loss": 5.1908, + "step": 704500 + }, + { + "epoch": 4.512433193586585, + "grad_norm": 0.458984375, + "learning_rate": 0.00036390053445130733, + "loss": 5.1889, + "step": 705000 + }, + { + "epoch": 4.515633500816079, + "grad_norm": 0.4921875, + "learning_rate": 0.00036387493199347136, + "loss": 5.1943, + "step": 705500 + }, + { + "epoch": 4.5188338080455726, + "grad_norm": 0.462890625, + "learning_rate": 0.0003638493295356354, + "loss": 5.1924, + "step": 706000 + }, + { + "epoch": 4.5220341152750665, + "grad_norm": 0.453125, + "learning_rate": 0.0003638237270777995, + "loss": 5.1945, + "step": 706500 + }, + { + "epoch": 4.52523442250456, + "grad_norm": 0.478515625, + "learning_rate": 0.0003637981246199635, + "loss": 5.1936, + "step": 707000 + }, + { + "epoch": 4.528434729734054, + "grad_norm": 0.455078125, + "learning_rate": 0.0003637725221621276, + "loss": 5.1932, + "step": 707500 + }, + { + "epoch": 4.531635036963548, + "grad_norm": 0.451171875, + "learning_rate": 0.00036374691970429164, + "loss": 5.1967, + "step": 708000 + }, + { + "epoch": 4.534835344193042, + "grad_norm": 0.44921875, + "learning_rate": 0.0003637213172464557, + "loss": 5.1921, + "step": 708500 + }, + { + "epoch": 4.538035651422536, + "grad_norm": 0.470703125, + "learning_rate": 0.00036369571478861977, + "loss": 5.1905, + "step": 709000 + }, + { + "epoch": 4.541235958652031, + "grad_norm": 0.4765625, + "learning_rate": 0.0003636701123307838, + "loss": 5.1932, + "step": 709500 + }, + { + "epoch": 4.544436265881525, + "grad_norm": 0.4921875, + "learning_rate": 0.00036364450987294783, + "loss": 5.1955, + "step": 710000 + }, + { + "epoch": 4.547636573111019, + "grad_norm": 0.50390625, + "learning_rate": 0.00036361890741511187, + "loss": 5.1927, + "step": 710500 + }, + { + "epoch": 4.550836880340513, + "grad_norm": 0.466796875, + "learning_rate": 0.0003635933049572759, + "loss": 5.1945, + "step": 711000 + }, + { + "epoch": 4.554037187570007, + "grad_norm": 0.443359375, + "learning_rate": 0.00036356770249943994, + "loss": 5.1902, + "step": 711500 + }, + { + "epoch": 4.557237494799501, + "grad_norm": 0.46484375, + "learning_rate": 0.000363542100041604, + "loss": 5.1927, + "step": 712000 + }, + { + "epoch": 4.560437802028995, + "grad_norm": 0.451171875, + "learning_rate": 0.00036351649758376806, + "loss": 5.1942, + "step": 712500 + }, + { + "epoch": 4.563638109258489, + "grad_norm": 0.447265625, + "learning_rate": 0.0003634908951259321, + "loss": 5.1962, + "step": 713000 + }, + { + "epoch": 4.5668384164879825, + "grad_norm": 0.65625, + "learning_rate": 0.00036346529266809613, + "loss": 5.1954, + "step": 713500 + }, + { + "epoch": 4.570038723717477, + "grad_norm": 0.466796875, + "learning_rate": 0.00036343969021026016, + "loss": 5.1929, + "step": 714000 + }, + { + "epoch": 4.573239030946971, + "grad_norm": 0.453125, + "learning_rate": 0.00036341408775242425, + "loss": 5.1943, + "step": 714500 + }, + { + "epoch": 4.576439338176465, + "grad_norm": 0.46484375, + "learning_rate": 0.0003633884852945883, + "loss": 5.194, + "step": 715000 + }, + { + "epoch": 4.579639645405959, + "grad_norm": 0.5, + "learning_rate": 0.0003633628828367524, + "loss": 5.194, + "step": 715500 + }, + { + "epoch": 4.582839952635453, + "grad_norm": 0.44921875, + "learning_rate": 0.0003633372803789164, + "loss": 5.1945, + "step": 716000 + }, + { + "epoch": 4.586040259864947, + "grad_norm": 0.46484375, + "learning_rate": 0.00036331167792108044, + "loss": 5.1895, + "step": 716500 + }, + { + "epoch": 4.589240567094441, + "grad_norm": 0.482421875, + "learning_rate": 0.0003632860754632445, + "loss": 5.1929, + "step": 717000 + }, + { + "epoch": 4.592440874323935, + "grad_norm": 0.50390625, + "learning_rate": 0.00036326047300540857, + "loss": 5.1933, + "step": 717500 + }, + { + "epoch": 4.595641181553429, + "grad_norm": 0.453125, + "learning_rate": 0.0003632348705475726, + "loss": 5.1961, + "step": 718000 + }, + { + "epoch": 4.598841488782924, + "grad_norm": 0.47265625, + "learning_rate": 0.00036320926808973663, + "loss": 5.1955, + "step": 718500 + }, + { + "epoch": 4.602041796012417, + "grad_norm": 0.49609375, + "learning_rate": 0.00036318366563190067, + "loss": 5.1968, + "step": 719000 + }, + { + "epoch": 4.6052421032419115, + "grad_norm": 0.466796875, + "learning_rate": 0.0003631580631740647, + "loss": 5.195, + "step": 719500 + }, + { + "epoch": 4.6084424104714055, + "grad_norm": 0.44921875, + "learning_rate": 0.0003631324607162288, + "loss": 5.1944, + "step": 720000 + }, + { + "epoch": 4.611642717700899, + "grad_norm": 0.486328125, + "learning_rate": 0.0003631068582583928, + "loss": 5.1937, + "step": 720500 + }, + { + "epoch": 4.614843024930393, + "grad_norm": 0.46875, + "learning_rate": 0.00036308125580055686, + "loss": 5.196, + "step": 721000 + }, + { + "epoch": 4.618043332159887, + "grad_norm": 0.482421875, + "learning_rate": 0.0003630556533427209, + "loss": 5.1996, + "step": 721500 + }, + { + "epoch": 4.621243639389381, + "grad_norm": 0.53125, + "learning_rate": 0.00036303005088488493, + "loss": 5.194, + "step": 722000 + }, + { + "epoch": 4.624443946618875, + "grad_norm": 0.482421875, + "learning_rate": 0.000363004448427049, + "loss": 5.1924, + "step": 722500 + }, + { + "epoch": 4.627644253848369, + "grad_norm": 0.5390625, + "learning_rate": 0.00036297884596921305, + "loss": 5.2027, + "step": 723000 + }, + { + "epoch": 4.630844561077863, + "grad_norm": 0.48828125, + "learning_rate": 0.00036295324351137714, + "loss": 5.199, + "step": 723500 + }, + { + "epoch": 4.634044868307358, + "grad_norm": 0.458984375, + "learning_rate": 0.00036292764105354117, + "loss": 5.1987, + "step": 724000 + }, + { + "epoch": 4.637245175536852, + "grad_norm": 0.41796875, + "learning_rate": 0.0003629020385957052, + "loss": 5.193, + "step": 724500 + }, + { + "epoch": 4.640445482766346, + "grad_norm": 0.515625, + "learning_rate": 0.00036287643613786924, + "loss": 5.1978, + "step": 725000 + }, + { + "epoch": 4.64364578999584, + "grad_norm": 0.470703125, + "learning_rate": 0.00036285083368003333, + "loss": 5.1938, + "step": 725500 + }, + { + "epoch": 4.646846097225334, + "grad_norm": 0.453125, + "learning_rate": 0.00036282523122219736, + "loss": 5.1967, + "step": 726000 + }, + { + "epoch": 4.6500464044548275, + "grad_norm": 0.466796875, + "learning_rate": 0.0003627996287643614, + "loss": 5.1909, + "step": 726500 + }, + { + "epoch": 4.6532467116843215, + "grad_norm": 0.5546875, + "learning_rate": 0.00036277402630652543, + "loss": 5.1929, + "step": 727000 + }, + { + "epoch": 4.656447018913815, + "grad_norm": 0.490234375, + "learning_rate": 0.00036274842384868947, + "loss": 5.1954, + "step": 727500 + }, + { + "epoch": 4.659647326143309, + "grad_norm": 0.486328125, + "learning_rate": 0.0003627228213908535, + "loss": 5.1988, + "step": 728000 + }, + { + "epoch": 4.662847633372804, + "grad_norm": 0.45703125, + "learning_rate": 0.0003626972189330176, + "loss": 5.1884, + "step": 728500 + }, + { + "epoch": 4.666047940602298, + "grad_norm": 0.443359375, + "learning_rate": 0.0003626716164751816, + "loss": 5.1952, + "step": 729000 + }, + { + "epoch": 4.669248247831792, + "grad_norm": 0.453125, + "learning_rate": 0.00036264601401734566, + "loss": 5.1953, + "step": 729500 + }, + { + "epoch": 4.672448555061286, + "grad_norm": 0.494140625, + "learning_rate": 0.00036262041155950975, + "loss": 5.1932, + "step": 730000 + }, + { + "epoch": 4.67564886229078, + "grad_norm": 0.49609375, + "learning_rate": 0.0003625948091016738, + "loss": 5.1955, + "step": 730500 + }, + { + "epoch": 4.678849169520274, + "grad_norm": 0.5078125, + "learning_rate": 0.00036256920664383787, + "loss": 5.1975, + "step": 731000 + }, + { + "epoch": 4.682049476749768, + "grad_norm": 0.4609375, + "learning_rate": 0.0003625436041860019, + "loss": 5.1946, + "step": 731500 + }, + { + "epoch": 4.685249783979262, + "grad_norm": 0.4609375, + "learning_rate": 0.00036251800172816594, + "loss": 5.1981, + "step": 732000 + }, + { + "epoch": 4.688450091208756, + "grad_norm": 0.474609375, + "learning_rate": 0.00036249239927032997, + "loss": 5.1932, + "step": 732500 + }, + { + "epoch": 4.6916503984382505, + "grad_norm": 0.458984375, + "learning_rate": 0.000362466796812494, + "loss": 5.1963, + "step": 733000 + }, + { + "epoch": 4.694850705667744, + "grad_norm": 0.5078125, + "learning_rate": 0.00036244119435465804, + "loss": 5.2029, + "step": 733500 + }, + { + "epoch": 4.698051012897238, + "grad_norm": 0.49609375, + "learning_rate": 0.00036241559189682213, + "loss": 5.1952, + "step": 734000 + }, + { + "epoch": 4.701251320126732, + "grad_norm": 0.482421875, + "learning_rate": 0.00036238998943898616, + "loss": 5.1905, + "step": 734500 + }, + { + "epoch": 4.704451627356226, + "grad_norm": 0.494140625, + "learning_rate": 0.0003623643869811502, + "loss": 5.1941, + "step": 735000 + }, + { + "epoch": 4.70765193458572, + "grad_norm": 0.421875, + "learning_rate": 0.00036233878452331423, + "loss": 5.1893, + "step": 735500 + }, + { + "epoch": 4.710852241815214, + "grad_norm": 0.466796875, + "learning_rate": 0.00036231318206547827, + "loss": 5.1955, + "step": 736000 + }, + { + "epoch": 4.714052549044708, + "grad_norm": 0.47265625, + "learning_rate": 0.00036228757960764235, + "loss": 5.1985, + "step": 736500 + }, + { + "epoch": 4.717252856274202, + "grad_norm": 0.484375, + "learning_rate": 0.0003622619771498064, + "loss": 5.1987, + "step": 737000 + }, + { + "epoch": 4.720453163503697, + "grad_norm": 0.5, + "learning_rate": 0.0003622363746919704, + "loss": 5.1945, + "step": 737500 + }, + { + "epoch": 4.723653470733191, + "grad_norm": 0.458984375, + "learning_rate": 0.0003622107722341345, + "loss": 5.1946, + "step": 738000 + }, + { + "epoch": 4.726853777962685, + "grad_norm": 0.48046875, + "learning_rate": 0.00036218516977629855, + "loss": 5.1921, + "step": 738500 + }, + { + "epoch": 4.730054085192179, + "grad_norm": 0.4609375, + "learning_rate": 0.00036215956731846263, + "loss": 5.1962, + "step": 739000 + }, + { + "epoch": 4.733254392421673, + "grad_norm": 0.490234375, + "learning_rate": 0.00036213396486062667, + "loss": 5.1977, + "step": 739500 + }, + { + "epoch": 4.7364546996511665, + "grad_norm": 0.5234375, + "learning_rate": 0.0003621083624027907, + "loss": 5.1945, + "step": 740000 + }, + { + "epoch": 4.73965500688066, + "grad_norm": 0.49609375, + "learning_rate": 0.00036208275994495474, + "loss": 5.1903, + "step": 740500 + }, + { + "epoch": 4.742855314110154, + "grad_norm": 0.458984375, + "learning_rate": 0.00036205715748711877, + "loss": 5.2021, + "step": 741000 + }, + { + "epoch": 4.746055621339648, + "grad_norm": 0.5, + "learning_rate": 0.0003620315550292828, + "loss": 5.1963, + "step": 741500 + }, + { + "epoch": 4.749255928569143, + "grad_norm": 0.5234375, + "learning_rate": 0.0003620059525714469, + "loss": 5.1947, + "step": 742000 + }, + { + "epoch": 4.752456235798636, + "grad_norm": 0.5234375, + "learning_rate": 0.00036198035011361093, + "loss": 5.1968, + "step": 742500 + }, + { + "epoch": 4.755656543028131, + "grad_norm": 0.482421875, + "learning_rate": 0.00036195474765577496, + "loss": 5.1955, + "step": 743000 + }, + { + "epoch": 4.758856850257625, + "grad_norm": 0.470703125, + "learning_rate": 0.000361929145197939, + "loss": 5.1982, + "step": 743500 + }, + { + "epoch": 4.762057157487119, + "grad_norm": 0.5546875, + "learning_rate": 0.00036190354274010303, + "loss": 5.1986, + "step": 744000 + }, + { + "epoch": 4.765257464716613, + "grad_norm": 0.482421875, + "learning_rate": 0.0003618779402822671, + "loss": 5.1994, + "step": 744500 + }, + { + "epoch": 4.768457771946107, + "grad_norm": 0.498046875, + "learning_rate": 0.00036185233782443115, + "loss": 5.1965, + "step": 745000 + }, + { + "epoch": 4.771658079175601, + "grad_norm": 0.5, + "learning_rate": 0.00036182673536659524, + "loss": 5.195, + "step": 745500 + }, + { + "epoch": 4.774858386405095, + "grad_norm": 0.423828125, + "learning_rate": 0.0003618011329087593, + "loss": 5.1981, + "step": 746000 + }, + { + "epoch": 4.778058693634589, + "grad_norm": 0.4921875, + "learning_rate": 0.0003617755304509233, + "loss": 5.1937, + "step": 746500 + }, + { + "epoch": 4.7812590008640825, + "grad_norm": 0.47265625, + "learning_rate": 0.00036174992799308734, + "loss": 5.1932, + "step": 747000 + }, + { + "epoch": 4.784459308093577, + "grad_norm": 0.54296875, + "learning_rate": 0.00036172432553525143, + "loss": 5.1944, + "step": 747500 + }, + { + "epoch": 4.787659615323071, + "grad_norm": 0.47265625, + "learning_rate": 0.00036169872307741547, + "loss": 5.1957, + "step": 748000 + }, + { + "epoch": 4.790859922552565, + "grad_norm": 0.51171875, + "learning_rate": 0.0003616731206195795, + "loss": 5.1977, + "step": 748500 + }, + { + "epoch": 4.794060229782059, + "grad_norm": 0.478515625, + "learning_rate": 0.00036164751816174354, + "loss": 5.2021, + "step": 749000 + }, + { + "epoch": 4.797260537011553, + "grad_norm": 0.5625, + "learning_rate": 0.00036162191570390757, + "loss": 5.1928, + "step": 749500 + }, + { + "epoch": 4.800460844241047, + "grad_norm": 0.51171875, + "learning_rate": 0.00036159631324607166, + "loss": 5.1915, + "step": 750000 + }, + { + "epoch": 4.803661151470541, + "grad_norm": 0.5234375, + "learning_rate": 0.0003615707107882357, + "loss": 5.1967, + "step": 750500 + }, + { + "epoch": 4.806861458700035, + "grad_norm": 0.515625, + "learning_rate": 0.0003615451083303997, + "loss": 5.1971, + "step": 751000 + }, + { + "epoch": 4.810061765929529, + "grad_norm": 0.462890625, + "learning_rate": 0.00036151950587256376, + "loss": 5.1977, + "step": 751500 + }, + { + "epoch": 4.813262073159024, + "grad_norm": 0.486328125, + "learning_rate": 0.0003614939034147278, + "loss": 5.1976, + "step": 752000 + }, + { + "epoch": 4.816462380388518, + "grad_norm": 0.486328125, + "learning_rate": 0.0003614683009568919, + "loss": 5.19, + "step": 752500 + }, + { + "epoch": 4.8196626876180115, + "grad_norm": 0.474609375, + "learning_rate": 0.0003614426984990559, + "loss": 5.1953, + "step": 753000 + }, + { + "epoch": 4.8228629948475055, + "grad_norm": 0.458984375, + "learning_rate": 0.00036141709604122, + "loss": 5.2064, + "step": 753500 + }, + { + "epoch": 4.826063302076999, + "grad_norm": 0.44140625, + "learning_rate": 0.00036139149358338404, + "loss": 5.1947, + "step": 754000 + }, + { + "epoch": 4.829263609306493, + "grad_norm": 0.48828125, + "learning_rate": 0.0003613658911255481, + "loss": 5.191, + "step": 754500 + }, + { + "epoch": 4.832463916535987, + "grad_norm": 0.46875, + "learning_rate": 0.0003613402886677121, + "loss": 5.1949, + "step": 755000 + }, + { + "epoch": 4.835664223765481, + "grad_norm": 0.498046875, + "learning_rate": 0.0003613146862098762, + "loss": 5.1965, + "step": 755500 + }, + { + "epoch": 4.838864530994975, + "grad_norm": 0.462890625, + "learning_rate": 0.00036128908375204023, + "loss": 5.1932, + "step": 756000 + }, + { + "epoch": 4.84206483822447, + "grad_norm": 0.45703125, + "learning_rate": 0.00036126348129420427, + "loss": 5.1951, + "step": 756500 + }, + { + "epoch": 4.845265145453964, + "grad_norm": 0.48828125, + "learning_rate": 0.0003612378788363683, + "loss": 5.1925, + "step": 757000 + }, + { + "epoch": 4.848465452683458, + "grad_norm": 0.4765625, + "learning_rate": 0.00036121227637853233, + "loss": 5.1934, + "step": 757500 + }, + { + "epoch": 4.851665759912952, + "grad_norm": 0.478515625, + "learning_rate": 0.00036118667392069637, + "loss": 5.1932, + "step": 758000 + }, + { + "epoch": 4.854866067142446, + "grad_norm": 0.53125, + "learning_rate": 0.00036116107146286046, + "loss": 5.1952, + "step": 758500 + }, + { + "epoch": 4.85806637437194, + "grad_norm": 0.515625, + "learning_rate": 0.0003611354690050245, + "loss": 5.1937, + "step": 759000 + }, + { + "epoch": 4.861266681601434, + "grad_norm": 0.462890625, + "learning_rate": 0.0003611098665471885, + "loss": 5.1867, + "step": 759500 + }, + { + "epoch": 4.8644669888309275, + "grad_norm": 0.48046875, + "learning_rate": 0.00036108426408935256, + "loss": 5.1896, + "step": 760000 + }, + { + "epoch": 4.8676672960604215, + "grad_norm": 0.4453125, + "learning_rate": 0.00036105866163151665, + "loss": 5.1969, + "step": 760500 + }, + { + "epoch": 4.870867603289916, + "grad_norm": 0.466796875, + "learning_rate": 0.00036103305917368074, + "loss": 5.1907, + "step": 761000 + }, + { + "epoch": 4.87406791051941, + "grad_norm": 0.546875, + "learning_rate": 0.00036100745671584477, + "loss": 5.1907, + "step": 761500 + }, + { + "epoch": 4.877268217748904, + "grad_norm": 0.7734375, + "learning_rate": 0.0003609818542580088, + "loss": 5.1944, + "step": 762000 + }, + { + "epoch": 4.880468524978398, + "grad_norm": 0.54296875, + "learning_rate": 0.00036095625180017284, + "loss": 5.1974, + "step": 762500 + }, + { + "epoch": 4.883668832207892, + "grad_norm": 0.478515625, + "learning_rate": 0.0003609306493423369, + "loss": 5.1938, + "step": 763000 + }, + { + "epoch": 4.886869139437386, + "grad_norm": 0.462890625, + "learning_rate": 0.0003609050468845009, + "loss": 5.1975, + "step": 763500 + }, + { + "epoch": 4.89006944666688, + "grad_norm": 0.52734375, + "learning_rate": 0.000360879444426665, + "loss": 5.1969, + "step": 764000 + }, + { + "epoch": 4.893269753896374, + "grad_norm": 0.50390625, + "learning_rate": 0.00036085384196882903, + "loss": 5.1967, + "step": 764500 + }, + { + "epoch": 4.896470061125868, + "grad_norm": 0.466796875, + "learning_rate": 0.00036082823951099306, + "loss": 5.1969, + "step": 765000 + }, + { + "epoch": 4.899670368355362, + "grad_norm": 0.46484375, + "learning_rate": 0.0003608026370531571, + "loss": 5.1924, + "step": 765500 + }, + { + "epoch": 4.902870675584856, + "grad_norm": 0.4609375, + "learning_rate": 0.00036077703459532113, + "loss": 5.1943, + "step": 766000 + }, + { + "epoch": 4.9060709828143505, + "grad_norm": 0.466796875, + "learning_rate": 0.0003607514321374852, + "loss": 5.1916, + "step": 766500 + }, + { + "epoch": 4.909271290043844, + "grad_norm": 0.466796875, + "learning_rate": 0.00036072582967964926, + "loss": 5.199, + "step": 767000 + }, + { + "epoch": 4.912471597273338, + "grad_norm": 0.498046875, + "learning_rate": 0.0003607002272218133, + "loss": 5.1961, + "step": 767500 + }, + { + "epoch": 4.915671904502832, + "grad_norm": 0.490234375, + "learning_rate": 0.0003606746247639774, + "loss": 5.196, + "step": 768000 + }, + { + "epoch": 4.918872211732326, + "grad_norm": 0.5078125, + "learning_rate": 0.0003606490223061414, + "loss": 5.1949, + "step": 768500 + }, + { + "epoch": 4.92207251896182, + "grad_norm": 0.5, + "learning_rate": 0.00036062341984830545, + "loss": 5.19, + "step": 769000 + }, + { + "epoch": 4.925272826191314, + "grad_norm": 0.4609375, + "learning_rate": 0.00036059781739046954, + "loss": 5.1954, + "step": 769500 + }, + { + "epoch": 4.928473133420808, + "grad_norm": 0.48828125, + "learning_rate": 0.00036057221493263357, + "loss": 5.201, + "step": 770000 + }, + { + "epoch": 4.931673440650302, + "grad_norm": 0.466796875, + "learning_rate": 0.0003605466124747976, + "loss": 5.1932, + "step": 770500 + }, + { + "epoch": 4.934873747879797, + "grad_norm": 0.50390625, + "learning_rate": 0.00036052101001696164, + "loss": 5.1932, + "step": 771000 + }, + { + "epoch": 4.938074055109291, + "grad_norm": 0.470703125, + "learning_rate": 0.00036049540755912567, + "loss": 5.1997, + "step": 771500 + }, + { + "epoch": 4.941274362338785, + "grad_norm": 0.451171875, + "learning_rate": 0.00036046980510128976, + "loss": 5.1931, + "step": 772000 + }, + { + "epoch": 4.944474669568279, + "grad_norm": 0.51171875, + "learning_rate": 0.0003604442026434538, + "loss": 5.1964, + "step": 772500 + }, + { + "epoch": 4.947674976797773, + "grad_norm": 0.494140625, + "learning_rate": 0.00036041860018561783, + "loss": 5.1998, + "step": 773000 + }, + { + "epoch": 4.9508752840272665, + "grad_norm": 0.451171875, + "learning_rate": 0.00036039299772778186, + "loss": 5.1968, + "step": 773500 + }, + { + "epoch": 4.9540755912567604, + "grad_norm": 0.60546875, + "learning_rate": 0.0003603673952699459, + "loss": 5.1856, + "step": 774000 + }, + { + "epoch": 4.957275898486254, + "grad_norm": 0.486328125, + "learning_rate": 0.00036034179281211, + "loss": 5.1989, + "step": 774500 + }, + { + "epoch": 4.960476205715748, + "grad_norm": 0.49609375, + "learning_rate": 0.000360316190354274, + "loss": 5.1884, + "step": 775000 + }, + { + "epoch": 4.963676512945243, + "grad_norm": 0.474609375, + "learning_rate": 0.00036029058789643806, + "loss": 5.1902, + "step": 775500 + }, + { + "epoch": 4.966876820174737, + "grad_norm": 0.490234375, + "learning_rate": 0.00036026498543860214, + "loss": 5.1938, + "step": 776000 + }, + { + "epoch": 4.970077127404231, + "grad_norm": 0.4609375, + "learning_rate": 0.0003602393829807662, + "loss": 5.1943, + "step": 776500 + }, + { + "epoch": 4.973277434633725, + "grad_norm": 0.478515625, + "learning_rate": 0.0003602137805229302, + "loss": 5.1974, + "step": 777000 + }, + { + "epoch": 4.976477741863219, + "grad_norm": 0.474609375, + "learning_rate": 0.0003601881780650943, + "loss": 5.1969, + "step": 777500 + }, + { + "epoch": 4.979678049092713, + "grad_norm": 0.482421875, + "learning_rate": 0.00036016257560725833, + "loss": 5.1962, + "step": 778000 + }, + { + "epoch": 4.982878356322207, + "grad_norm": 0.49609375, + "learning_rate": 0.00036013697314942237, + "loss": 5.1981, + "step": 778500 + }, + { + "epoch": 4.986078663551701, + "grad_norm": 0.57421875, + "learning_rate": 0.0003601113706915864, + "loss": 5.2013, + "step": 779000 + }, + { + "epoch": 4.989278970781195, + "grad_norm": 0.51171875, + "learning_rate": 0.00036008576823375044, + "loss": 5.1841, + "step": 779500 + }, + { + "epoch": 4.9924792780106895, + "grad_norm": 0.4375, + "learning_rate": 0.0003600601657759145, + "loss": 5.194, + "step": 780000 + }, + { + "epoch": 4.995679585240183, + "grad_norm": 0.462890625, + "learning_rate": 0.00036003456331807856, + "loss": 5.1985, + "step": 780500 + }, + { + "epoch": 4.998879892469677, + "grad_norm": 0.48828125, + "learning_rate": 0.0003600089608602426, + "loss": 5.1993, + "step": 781000 + }, + { + "epoch": 5.0, + "eval_loss": 5.186727523803711, + "eval_runtime": 1.1486, + "eval_samples_per_second": 870.654, + "eval_steps_per_second": 13.93, + "step": 781175 + }, + { + "epoch": 5.002080199699171, + "grad_norm": 0.455078125, + "learning_rate": 0.00035998335840240663, + "loss": 5.196, + "step": 781500 + }, + { + "epoch": 5.005280506928665, + "grad_norm": 0.50390625, + "learning_rate": 0.00035995775594457066, + "loss": 5.1908, + "step": 782000 + }, + { + "epoch": 5.008480814158159, + "grad_norm": 0.447265625, + "learning_rate": 0.00035993215348673475, + "loss": 5.1907, + "step": 782500 + }, + { + "epoch": 5.011681121387653, + "grad_norm": 0.5078125, + "learning_rate": 0.0003599065510288988, + "loss": 5.1877, + "step": 783000 + }, + { + "epoch": 5.014881428617147, + "grad_norm": 0.486328125, + "learning_rate": 0.0003598809485710629, + "loss": 5.1926, + "step": 783500 + }, + { + "epoch": 5.018081735846641, + "grad_norm": 0.5078125, + "learning_rate": 0.0003598553461132269, + "loss": 5.1938, + "step": 784000 + }, + { + "epoch": 5.021282043076135, + "grad_norm": 0.4921875, + "learning_rate": 0.00035982974365539094, + "loss": 5.1873, + "step": 784500 + }, + { + "epoch": 5.02448235030563, + "grad_norm": 0.4765625, + "learning_rate": 0.000359804141197555, + "loss": 5.1941, + "step": 785000 + }, + { + "epoch": 5.027682657535124, + "grad_norm": 0.48046875, + "learning_rate": 0.00035977853873971907, + "loss": 5.1923, + "step": 785500 + }, + { + "epoch": 5.030882964764618, + "grad_norm": 0.4921875, + "learning_rate": 0.0003597529362818831, + "loss": 5.1884, + "step": 786000 + }, + { + "epoch": 5.0340832719941115, + "grad_norm": 0.4453125, + "learning_rate": 0.00035972733382404713, + "loss": 5.1959, + "step": 786500 + }, + { + "epoch": 5.0372835792236055, + "grad_norm": 0.451171875, + "learning_rate": 0.00035970173136621117, + "loss": 5.1889, + "step": 787000 + }, + { + "epoch": 5.040483886453099, + "grad_norm": 0.54296875, + "learning_rate": 0.0003596761289083752, + "loss": 5.195, + "step": 787500 + }, + { + "epoch": 5.043684193682593, + "grad_norm": 0.58203125, + "learning_rate": 0.00035965052645053924, + "loss": 5.1857, + "step": 788000 + }, + { + "epoch": 5.046884500912087, + "grad_norm": 0.51171875, + "learning_rate": 0.0003596249239927033, + "loss": 5.1941, + "step": 788500 + }, + { + "epoch": 5.050084808141581, + "grad_norm": 0.4765625, + "learning_rate": 0.00035959932153486736, + "loss": 5.1934, + "step": 789000 + }, + { + "epoch": 5.053285115371076, + "grad_norm": 0.5625, + "learning_rate": 0.0003595737190770314, + "loss": 5.1901, + "step": 789500 + }, + { + "epoch": 5.05648542260057, + "grad_norm": 0.4921875, + "learning_rate": 0.00035954811661919543, + "loss": 5.1927, + "step": 790000 + }, + { + "epoch": 5.059685729830064, + "grad_norm": 0.474609375, + "learning_rate": 0.0003595225141613595, + "loss": 5.1878, + "step": 790500 + }, + { + "epoch": 5.062886037059558, + "grad_norm": 0.443359375, + "learning_rate": 0.00035949691170352355, + "loss": 5.1916, + "step": 791000 + }, + { + "epoch": 5.066086344289052, + "grad_norm": 0.478515625, + "learning_rate": 0.00035947130924568764, + "loss": 5.1882, + "step": 791500 + }, + { + "epoch": 5.069286651518546, + "grad_norm": 0.5234375, + "learning_rate": 0.0003594457067878517, + "loss": 5.1905, + "step": 792000 + }, + { + "epoch": 5.07248695874804, + "grad_norm": 0.484375, + "learning_rate": 0.0003594201043300157, + "loss": 5.1944, + "step": 792500 + }, + { + "epoch": 5.075687265977534, + "grad_norm": 0.54296875, + "learning_rate": 0.00035939450187217974, + "loss": 5.1941, + "step": 793000 + }, + { + "epoch": 5.0788875732070276, + "grad_norm": 0.54296875, + "learning_rate": 0.0003593688994143438, + "loss": 5.1883, + "step": 793500 + }, + { + "epoch": 5.0820878804365215, + "grad_norm": 0.435546875, + "learning_rate": 0.00035934329695650786, + "loss": 5.1958, + "step": 794000 + }, + { + "epoch": 5.085288187666016, + "grad_norm": 0.4765625, + "learning_rate": 0.0003593176944986719, + "loss": 5.1933, + "step": 794500 + }, + { + "epoch": 5.08848849489551, + "grad_norm": 0.458984375, + "learning_rate": 0.00035929209204083593, + "loss": 5.1934, + "step": 795000 + }, + { + "epoch": 5.091688802125004, + "grad_norm": 0.466796875, + "learning_rate": 0.00035926648958299997, + "loss": 5.187, + "step": 795500 + }, + { + "epoch": 5.094889109354498, + "grad_norm": 0.486328125, + "learning_rate": 0.000359240887125164, + "loss": 5.199, + "step": 796000 + }, + { + "epoch": 5.098089416583992, + "grad_norm": 0.546875, + "learning_rate": 0.0003592152846673281, + "loss": 5.186, + "step": 796500 + }, + { + "epoch": 5.101289723813486, + "grad_norm": 0.462890625, + "learning_rate": 0.0003591896822094921, + "loss": 5.1892, + "step": 797000 + }, + { + "epoch": 5.10449003104298, + "grad_norm": 0.515625, + "learning_rate": 0.00035916407975165616, + "loss": 5.1931, + "step": 797500 + }, + { + "epoch": 5.107690338272474, + "grad_norm": 0.478515625, + "learning_rate": 0.0003591384772938202, + "loss": 5.1888, + "step": 798000 + }, + { + "epoch": 5.110890645501968, + "grad_norm": 0.48828125, + "learning_rate": 0.0003591128748359843, + "loss": 5.1965, + "step": 798500 + }, + { + "epoch": 5.114090952731463, + "grad_norm": 0.57421875, + "learning_rate": 0.0003590872723781483, + "loss": 5.1959, + "step": 799000 + }, + { + "epoch": 5.117291259960957, + "grad_norm": 0.515625, + "learning_rate": 0.0003590616699203124, + "loss": 5.1928, + "step": 799500 + }, + { + "epoch": 5.1204915671904505, + "grad_norm": 0.490234375, + "learning_rate": 0.00035903606746247644, + "loss": 5.1896, + "step": 800000 + }, + { + "epoch": 5.1236918744199444, + "grad_norm": 0.5234375, + "learning_rate": 0.00035901046500464047, + "loss": 5.1934, + "step": 800500 + }, + { + "epoch": 5.126892181649438, + "grad_norm": 0.478515625, + "learning_rate": 0.0003589848625468045, + "loss": 5.1969, + "step": 801000 + }, + { + "epoch": 5.130092488878932, + "grad_norm": 0.498046875, + "learning_rate": 0.00035895926008896854, + "loss": 5.1986, + "step": 801500 + }, + { + "epoch": 5.133292796108426, + "grad_norm": 0.51171875, + "learning_rate": 0.00035893365763113263, + "loss": 5.1884, + "step": 802000 + }, + { + "epoch": 5.13649310333792, + "grad_norm": 0.52734375, + "learning_rate": 0.00035890805517329666, + "loss": 5.195, + "step": 802500 + }, + { + "epoch": 5.139693410567414, + "grad_norm": 0.48828125, + "learning_rate": 0.0003588824527154607, + "loss": 5.1907, + "step": 803000 + }, + { + "epoch": 5.142893717796909, + "grad_norm": 0.458984375, + "learning_rate": 0.00035885685025762473, + "loss": 5.1878, + "step": 803500 + }, + { + "epoch": 5.146094025026403, + "grad_norm": 0.51953125, + "learning_rate": 0.00035883124779978877, + "loss": 5.1876, + "step": 804000 + }, + { + "epoch": 5.149294332255897, + "grad_norm": 0.4765625, + "learning_rate": 0.0003588056453419528, + "loss": 5.19, + "step": 804500 + }, + { + "epoch": 5.152494639485391, + "grad_norm": 0.4765625, + "learning_rate": 0.0003587800428841169, + "loss": 5.196, + "step": 805000 + }, + { + "epoch": 5.155694946714885, + "grad_norm": 0.52734375, + "learning_rate": 0.0003587544404262809, + "loss": 5.1937, + "step": 805500 + }, + { + "epoch": 5.158895253944379, + "grad_norm": 0.482421875, + "learning_rate": 0.000358728837968445, + "loss": 5.191, + "step": 806000 + }, + { + "epoch": 5.162095561173873, + "grad_norm": 0.455078125, + "learning_rate": 0.00035870323551060905, + "loss": 5.191, + "step": 806500 + }, + { + "epoch": 5.1652958684033665, + "grad_norm": 0.5, + "learning_rate": 0.0003586776330527731, + "loss": 5.1922, + "step": 807000 + }, + { + "epoch": 5.1684961756328605, + "grad_norm": 0.54296875, + "learning_rate": 0.00035865203059493717, + "loss": 5.1869, + "step": 807500 + }, + { + "epoch": 5.171696482862354, + "grad_norm": 0.5078125, + "learning_rate": 0.0003586264281371012, + "loss": 5.1968, + "step": 808000 + }, + { + "epoch": 5.174896790091849, + "grad_norm": 0.490234375, + "learning_rate": 0.00035860082567926524, + "loss": 5.189, + "step": 808500 + }, + { + "epoch": 5.178097097321343, + "grad_norm": 0.44140625, + "learning_rate": 0.00035857522322142927, + "loss": 5.1945, + "step": 809000 + }, + { + "epoch": 5.181297404550837, + "grad_norm": 0.4921875, + "learning_rate": 0.0003585496207635933, + "loss": 5.1938, + "step": 809500 + }, + { + "epoch": 5.184497711780331, + "grad_norm": 0.5390625, + "learning_rate": 0.0003585240183057574, + "loss": 5.1902, + "step": 810000 + }, + { + "epoch": 5.187698019009825, + "grad_norm": 0.5, + "learning_rate": 0.00035849841584792143, + "loss": 5.1895, + "step": 810500 + }, + { + "epoch": 5.190898326239319, + "grad_norm": 0.490234375, + "learning_rate": 0.00035847281339008546, + "loss": 5.1867, + "step": 811000 + }, + { + "epoch": 5.194098633468813, + "grad_norm": 0.45703125, + "learning_rate": 0.0003584472109322495, + "loss": 5.1907, + "step": 811500 + }, + { + "epoch": 5.197298940698307, + "grad_norm": 0.56640625, + "learning_rate": 0.00035842160847441353, + "loss": 5.1907, + "step": 812000 + }, + { + "epoch": 5.200499247927801, + "grad_norm": 0.474609375, + "learning_rate": 0.00035839600601657756, + "loss": 5.192, + "step": 812500 + }, + { + "epoch": 5.203699555157295, + "grad_norm": 0.4765625, + "learning_rate": 0.00035837040355874165, + "loss": 5.2016, + "step": 813000 + }, + { + "epoch": 5.2068998623867895, + "grad_norm": 0.443359375, + "learning_rate": 0.0003583448011009057, + "loss": 5.193, + "step": 813500 + }, + { + "epoch": 5.210100169616283, + "grad_norm": 0.51171875, + "learning_rate": 0.0003583191986430698, + "loss": 5.1969, + "step": 814000 + }, + { + "epoch": 5.213300476845777, + "grad_norm": 0.51953125, + "learning_rate": 0.0003582935961852338, + "loss": 5.193, + "step": 814500 + }, + { + "epoch": 5.216500784075271, + "grad_norm": 0.51171875, + "learning_rate": 0.00035826799372739784, + "loss": 5.1904, + "step": 815000 + }, + { + "epoch": 5.219701091304765, + "grad_norm": 0.498046875, + "learning_rate": 0.00035824239126956193, + "loss": 5.1898, + "step": 815500 + }, + { + "epoch": 5.222901398534259, + "grad_norm": 0.48046875, + "learning_rate": 0.00035821678881172597, + "loss": 5.1947, + "step": 816000 + }, + { + "epoch": 5.226101705763753, + "grad_norm": 0.52734375, + "learning_rate": 0.00035819118635389, + "loss": 5.201, + "step": 816500 + }, + { + "epoch": 5.229302012993247, + "grad_norm": 0.494140625, + "learning_rate": 0.00035816558389605404, + "loss": 5.195, + "step": 817000 + }, + { + "epoch": 5.232502320222741, + "grad_norm": 0.484375, + "learning_rate": 0.00035813998143821807, + "loss": 5.1934, + "step": 817500 + }, + { + "epoch": 5.235702627452236, + "grad_norm": 0.51171875, + "learning_rate": 0.0003581143789803821, + "loss": 5.1882, + "step": 818000 + }, + { + "epoch": 5.23890293468173, + "grad_norm": 0.5, + "learning_rate": 0.0003580887765225462, + "loss": 5.1984, + "step": 818500 + }, + { + "epoch": 5.242103241911224, + "grad_norm": 0.4921875, + "learning_rate": 0.0003580631740647102, + "loss": 5.1947, + "step": 819000 + }, + { + "epoch": 5.245303549140718, + "grad_norm": 0.44921875, + "learning_rate": 0.00035803757160687426, + "loss": 5.1932, + "step": 819500 + }, + { + "epoch": 5.2485038563702116, + "grad_norm": 0.5, + "learning_rate": 0.0003580119691490383, + "loss": 5.1865, + "step": 820000 + }, + { + "epoch": 5.2517041635997055, + "grad_norm": 0.53515625, + "learning_rate": 0.0003579863666912024, + "loss": 5.1919, + "step": 820500 + }, + { + "epoch": 5.254904470829199, + "grad_norm": 0.515625, + "learning_rate": 0.0003579607642333664, + "loss": 5.196, + "step": 821000 + }, + { + "epoch": 5.258104778058693, + "grad_norm": 0.51953125, + "learning_rate": 0.0003579351617755305, + "loss": 5.1858, + "step": 821500 + }, + { + "epoch": 5.261305085288187, + "grad_norm": 0.515625, + "learning_rate": 0.00035790955931769454, + "loss": 5.1927, + "step": 822000 + }, + { + "epoch": 5.264505392517682, + "grad_norm": 0.51953125, + "learning_rate": 0.0003578839568598586, + "loss": 5.1908, + "step": 822500 + }, + { + "epoch": 5.267705699747176, + "grad_norm": 0.5390625, + "learning_rate": 0.0003578583544020226, + "loss": 5.1927, + "step": 823000 + }, + { + "epoch": 5.27090600697667, + "grad_norm": 0.52734375, + "learning_rate": 0.00035783275194418664, + "loss": 5.1974, + "step": 823500 + }, + { + "epoch": 5.274106314206164, + "grad_norm": 0.578125, + "learning_rate": 0.00035780714948635073, + "loss": 5.1912, + "step": 824000 + }, + { + "epoch": 5.277306621435658, + "grad_norm": 0.48828125, + "learning_rate": 0.00035778154702851477, + "loss": 5.1906, + "step": 824500 + }, + { + "epoch": 5.280506928665152, + "grad_norm": 0.5078125, + "learning_rate": 0.0003577559445706788, + "loss": 5.1898, + "step": 825000 + }, + { + "epoch": 5.283707235894646, + "grad_norm": 0.50390625, + "learning_rate": 0.00035773034211284283, + "loss": 5.1904, + "step": 825500 + }, + { + "epoch": 5.28690754312414, + "grad_norm": 0.53515625, + "learning_rate": 0.00035770473965500687, + "loss": 5.1934, + "step": 826000 + }, + { + "epoch": 5.290107850353634, + "grad_norm": 0.53125, + "learning_rate": 0.00035767913719717096, + "loss": 5.1942, + "step": 826500 + }, + { + "epoch": 5.2933081575831284, + "grad_norm": 0.51171875, + "learning_rate": 0.000357653534739335, + "loss": 5.1926, + "step": 827000 + }, + { + "epoch": 5.296508464812622, + "grad_norm": 0.5078125, + "learning_rate": 0.000357627932281499, + "loss": 5.1936, + "step": 827500 + }, + { + "epoch": 5.299708772042116, + "grad_norm": 0.47265625, + "learning_rate": 0.00035760232982366306, + "loss": 5.1895, + "step": 828000 + }, + { + "epoch": 5.30290907927161, + "grad_norm": 0.55859375, + "learning_rate": 0.00035757672736582715, + "loss": 5.1925, + "step": 828500 + }, + { + "epoch": 5.306109386501104, + "grad_norm": 0.5078125, + "learning_rate": 0.0003575511249079912, + "loss": 5.196, + "step": 829000 + }, + { + "epoch": 5.309309693730598, + "grad_norm": 0.5546875, + "learning_rate": 0.00035752552245015527, + "loss": 5.1964, + "step": 829500 + }, + { + "epoch": 5.312510000960092, + "grad_norm": 0.474609375, + "learning_rate": 0.0003574999199923193, + "loss": 5.1984, + "step": 830000 + }, + { + "epoch": 5.315710308189586, + "grad_norm": 0.490234375, + "learning_rate": 0.00035747431753448334, + "loss": 5.1861, + "step": 830500 + }, + { + "epoch": 5.31891061541908, + "grad_norm": 0.53515625, + "learning_rate": 0.0003574487150766474, + "loss": 5.1971, + "step": 831000 + }, + { + "epoch": 5.322110922648574, + "grad_norm": 0.453125, + "learning_rate": 0.0003574231126188114, + "loss": 5.2024, + "step": 831500 + }, + { + "epoch": 5.325311229878069, + "grad_norm": 0.498046875, + "learning_rate": 0.0003573975101609755, + "loss": 5.194, + "step": 832000 + }, + { + "epoch": 5.328511537107563, + "grad_norm": 0.48828125, + "learning_rate": 0.00035737190770313953, + "loss": 5.1953, + "step": 832500 + }, + { + "epoch": 5.331711844337057, + "grad_norm": 0.53515625, + "learning_rate": 0.00035734630524530357, + "loss": 5.192, + "step": 833000 + }, + { + "epoch": 5.3349121515665505, + "grad_norm": 0.5, + "learning_rate": 0.0003573207027874676, + "loss": 5.1998, + "step": 833500 + }, + { + "epoch": 5.3381124587960445, + "grad_norm": 0.55859375, + "learning_rate": 0.00035729510032963163, + "loss": 5.1896, + "step": 834000 + }, + { + "epoch": 5.341312766025538, + "grad_norm": 0.53515625, + "learning_rate": 0.00035726949787179567, + "loss": 5.1879, + "step": 834500 + }, + { + "epoch": 5.344513073255032, + "grad_norm": 0.42578125, + "learning_rate": 0.00035724389541395976, + "loss": 5.194, + "step": 835000 + }, + { + "epoch": 5.347713380484526, + "grad_norm": 0.52734375, + "learning_rate": 0.0003572182929561238, + "loss": 5.193, + "step": 835500 + }, + { + "epoch": 5.35091368771402, + "grad_norm": 0.482421875, + "learning_rate": 0.0003571926904982878, + "loss": 5.1903, + "step": 836000 + }, + { + "epoch": 5.354113994943514, + "grad_norm": 0.51171875, + "learning_rate": 0.0003571670880404519, + "loss": 5.1892, + "step": 836500 + }, + { + "epoch": 5.357314302173009, + "grad_norm": 0.48046875, + "learning_rate": 0.00035714148558261595, + "loss": 5.2005, + "step": 837000 + }, + { + "epoch": 5.360514609402503, + "grad_norm": 0.515625, + "learning_rate": 0.00035711588312478004, + "loss": 5.1901, + "step": 837500 + }, + { + "epoch": 5.363714916631997, + "grad_norm": 0.53125, + "learning_rate": 0.00035709028066694407, + "loss": 5.1951, + "step": 838000 + }, + { + "epoch": 5.366915223861491, + "grad_norm": 0.5546875, + "learning_rate": 0.0003570646782091081, + "loss": 5.189, + "step": 838500 + }, + { + "epoch": 5.370115531090985, + "grad_norm": 0.466796875, + "learning_rate": 0.00035703907575127214, + "loss": 5.1957, + "step": 839000 + }, + { + "epoch": 5.373315838320479, + "grad_norm": 0.51953125, + "learning_rate": 0.0003570134732934362, + "loss": 5.1892, + "step": 839500 + }, + { + "epoch": 5.376516145549973, + "grad_norm": 0.4296875, + "learning_rate": 0.0003569878708356002, + "loss": 5.192, + "step": 840000 + }, + { + "epoch": 5.3797164527794665, + "grad_norm": 0.48828125, + "learning_rate": 0.0003569622683777643, + "loss": 5.1936, + "step": 840500 + }, + { + "epoch": 5.3829167600089605, + "grad_norm": 0.53125, + "learning_rate": 0.00035693666591992833, + "loss": 5.1927, + "step": 841000 + }, + { + "epoch": 5.386117067238455, + "grad_norm": 0.51171875, + "learning_rate": 0.00035691106346209236, + "loss": 5.1942, + "step": 841500 + }, + { + "epoch": 5.389317374467949, + "grad_norm": 0.439453125, + "learning_rate": 0.0003568854610042564, + "loss": 5.1916, + "step": 842000 + }, + { + "epoch": 5.392517681697443, + "grad_norm": 0.5390625, + "learning_rate": 0.00035685985854642043, + "loss": 5.1909, + "step": 842500 + }, + { + "epoch": 5.395717988926937, + "grad_norm": 0.478515625, + "learning_rate": 0.0003568342560885845, + "loss": 5.1895, + "step": 843000 + }, + { + "epoch": 5.398918296156431, + "grad_norm": 0.498046875, + "learning_rate": 0.00035680865363074856, + "loss": 5.196, + "step": 843500 + }, + { + "epoch": 5.402118603385925, + "grad_norm": 0.478515625, + "learning_rate": 0.00035678305117291264, + "loss": 5.1934, + "step": 844000 + }, + { + "epoch": 5.405318910615419, + "grad_norm": 0.5, + "learning_rate": 0.0003567574487150767, + "loss": 5.193, + "step": 844500 + }, + { + "epoch": 5.408519217844913, + "grad_norm": 0.474609375, + "learning_rate": 0.0003567318462572407, + "loss": 5.1885, + "step": 845000 + }, + { + "epoch": 5.411719525074407, + "grad_norm": 0.46875, + "learning_rate": 0.00035670624379940475, + "loss": 5.191, + "step": 845500 + }, + { + "epoch": 5.414919832303902, + "grad_norm": 0.5234375, + "learning_rate": 0.00035668064134156883, + "loss": 5.1924, + "step": 846000 + }, + { + "epoch": 5.4181201395333956, + "grad_norm": 0.466796875, + "learning_rate": 0.00035665503888373287, + "loss": 5.1941, + "step": 846500 + }, + { + "epoch": 5.4213204467628895, + "grad_norm": 0.4609375, + "learning_rate": 0.0003566294364258969, + "loss": 5.1947, + "step": 847000 + }, + { + "epoch": 5.424520753992383, + "grad_norm": 0.5, + "learning_rate": 0.00035660383396806094, + "loss": 5.1892, + "step": 847500 + }, + { + "epoch": 5.427721061221877, + "grad_norm": 0.54296875, + "learning_rate": 0.00035657823151022497, + "loss": 5.1924, + "step": 848000 + }, + { + "epoch": 5.430921368451371, + "grad_norm": 0.55078125, + "learning_rate": 0.00035655262905238906, + "loss": 5.1932, + "step": 848500 + }, + { + "epoch": 5.434121675680865, + "grad_norm": 0.498046875, + "learning_rate": 0.0003565270265945531, + "loss": 5.1959, + "step": 849000 + }, + { + "epoch": 5.437321982910359, + "grad_norm": 0.54296875, + "learning_rate": 0.00035650142413671713, + "loss": 5.1937, + "step": 849500 + }, + { + "epoch": 5.440522290139853, + "grad_norm": 0.54296875, + "learning_rate": 0.00035647582167888116, + "loss": 5.1952, + "step": 850000 + }, + { + "epoch": 5.443722597369347, + "grad_norm": 0.48828125, + "learning_rate": 0.0003564502192210452, + "loss": 5.1862, + "step": 850500 + }, + { + "epoch": 5.446922904598842, + "grad_norm": 0.5234375, + "learning_rate": 0.0003564246167632093, + "loss": 5.1865, + "step": 851000 + }, + { + "epoch": 5.450123211828336, + "grad_norm": 0.52734375, + "learning_rate": 0.0003563990143053733, + "loss": 5.1919, + "step": 851500 + }, + { + "epoch": 5.45332351905783, + "grad_norm": 0.57421875, + "learning_rate": 0.0003563734118475374, + "loss": 5.1846, + "step": 852000 + }, + { + "epoch": 5.456523826287324, + "grad_norm": 0.51171875, + "learning_rate": 0.00035634780938970144, + "loss": 5.1946, + "step": 852500 + }, + { + "epoch": 5.459724133516818, + "grad_norm": 0.57421875, + "learning_rate": 0.0003563222069318655, + "loss": 5.188, + "step": 853000 + }, + { + "epoch": 5.462924440746312, + "grad_norm": 0.51953125, + "learning_rate": 0.0003562966044740295, + "loss": 5.189, + "step": 853500 + }, + { + "epoch": 5.4661247479758055, + "grad_norm": 0.51953125, + "learning_rate": 0.0003562710020161936, + "loss": 5.1924, + "step": 854000 + }, + { + "epoch": 5.469325055205299, + "grad_norm": 0.486328125, + "learning_rate": 0.00035624539955835763, + "loss": 5.1974, + "step": 854500 + }, + { + "epoch": 5.472525362434793, + "grad_norm": 0.546875, + "learning_rate": 0.00035621979710052167, + "loss": 5.19, + "step": 855000 + }, + { + "epoch": 5.475725669664287, + "grad_norm": 0.474609375, + "learning_rate": 0.0003561941946426857, + "loss": 5.1927, + "step": 855500 + }, + { + "epoch": 5.478925976893782, + "grad_norm": 0.51171875, + "learning_rate": 0.00035616859218484974, + "loss": 5.1937, + "step": 856000 + }, + { + "epoch": 5.482126284123276, + "grad_norm": 0.51953125, + "learning_rate": 0.0003561429897270138, + "loss": 5.1928, + "step": 856500 + }, + { + "epoch": 5.48532659135277, + "grad_norm": 0.51171875, + "learning_rate": 0.00035611738726917786, + "loss": 5.1935, + "step": 857000 + }, + { + "epoch": 5.488526898582264, + "grad_norm": 0.4765625, + "learning_rate": 0.0003560917848113419, + "loss": 5.1932, + "step": 857500 + }, + { + "epoch": 5.491727205811758, + "grad_norm": 0.51171875, + "learning_rate": 0.00035606618235350593, + "loss": 5.1861, + "step": 858000 + }, + { + "epoch": 5.494927513041252, + "grad_norm": 0.52734375, + "learning_rate": 0.00035604057989567, + "loss": 5.1931, + "step": 858500 + }, + { + "epoch": 5.498127820270746, + "grad_norm": 0.48046875, + "learning_rate": 0.00035601497743783405, + "loss": 5.1891, + "step": 859000 + }, + { + "epoch": 5.50132812750024, + "grad_norm": 0.470703125, + "learning_rate": 0.00035598937497999814, + "loss": 5.1924, + "step": 859500 + }, + { + "epoch": 5.504528434729734, + "grad_norm": 0.515625, + "learning_rate": 0.0003559637725221622, + "loss": 5.1958, + "step": 860000 + }, + { + "epoch": 5.5077287419592285, + "grad_norm": 0.47265625, + "learning_rate": 0.0003559381700643262, + "loss": 5.1897, + "step": 860500 + }, + { + "epoch": 5.510929049188722, + "grad_norm": 0.515625, + "learning_rate": 0.00035591256760649024, + "loss": 5.1907, + "step": 861000 + }, + { + "epoch": 5.514129356418216, + "grad_norm": 0.55859375, + "learning_rate": 0.0003558869651486543, + "loss": 5.1888, + "step": 861500 + }, + { + "epoch": 5.51732966364771, + "grad_norm": 0.5234375, + "learning_rate": 0.00035586136269081836, + "loss": 5.1928, + "step": 862000 + }, + { + "epoch": 5.520529970877204, + "grad_norm": 0.53515625, + "learning_rate": 0.0003558357602329824, + "loss": 5.1948, + "step": 862500 + }, + { + "epoch": 5.523730278106698, + "grad_norm": 0.515625, + "learning_rate": 0.00035581015777514643, + "loss": 5.1949, + "step": 863000 + }, + { + "epoch": 5.526930585336192, + "grad_norm": 0.5390625, + "learning_rate": 0.00035578455531731047, + "loss": 5.1971, + "step": 863500 + }, + { + "epoch": 5.530130892565686, + "grad_norm": 0.5234375, + "learning_rate": 0.0003557589528594745, + "loss": 5.189, + "step": 864000 + }, + { + "epoch": 5.53333119979518, + "grad_norm": 0.5078125, + "learning_rate": 0.00035573335040163854, + "loss": 5.1916, + "step": 864500 + }, + { + "epoch": 5.536531507024675, + "grad_norm": 0.546875, + "learning_rate": 0.0003557077479438026, + "loss": 5.1895, + "step": 865000 + }, + { + "epoch": 5.539731814254169, + "grad_norm": 0.50390625, + "learning_rate": 0.00035568214548596666, + "loss": 5.1928, + "step": 865500 + }, + { + "epoch": 5.542932121483663, + "grad_norm": 0.5390625, + "learning_rate": 0.0003556565430281307, + "loss": 5.1955, + "step": 866000 + }, + { + "epoch": 5.546132428713157, + "grad_norm": 0.4765625, + "learning_rate": 0.0003556309405702948, + "loss": 5.1908, + "step": 866500 + }, + { + "epoch": 5.5493327359426505, + "grad_norm": 0.50390625, + "learning_rate": 0.0003556053381124588, + "loss": 5.1921, + "step": 867000 + }, + { + "epoch": 5.5525330431721445, + "grad_norm": 0.48046875, + "learning_rate": 0.0003555797356546229, + "loss": 5.187, + "step": 867500 + }, + { + "epoch": 5.555733350401638, + "grad_norm": 0.466796875, + "learning_rate": 0.00035555413319678694, + "loss": 5.1901, + "step": 868000 + }, + { + "epoch": 5.558933657631132, + "grad_norm": 0.5078125, + "learning_rate": 0.00035552853073895097, + "loss": 5.1981, + "step": 868500 + }, + { + "epoch": 5.562133964860626, + "grad_norm": 0.55078125, + "learning_rate": 0.000355502928281115, + "loss": 5.1823, + "step": 869000 + }, + { + "epoch": 5.565334272090121, + "grad_norm": 0.51171875, + "learning_rate": 0.00035547732582327904, + "loss": 5.1946, + "step": 869500 + }, + { + "epoch": 5.568534579319615, + "grad_norm": 0.50390625, + "learning_rate": 0.0003554517233654431, + "loss": 5.1921, + "step": 870000 + }, + { + "epoch": 5.571734886549109, + "grad_norm": 0.5, + "learning_rate": 0.00035542612090760716, + "loss": 5.1861, + "step": 870500 + }, + { + "epoch": 5.574935193778603, + "grad_norm": 0.47265625, + "learning_rate": 0.0003554005184497712, + "loss": 5.1876, + "step": 871000 + }, + { + "epoch": 5.578135501008097, + "grad_norm": 0.4765625, + "learning_rate": 0.00035537491599193523, + "loss": 5.1953, + "step": 871500 + }, + { + "epoch": 5.581335808237591, + "grad_norm": 0.490234375, + "learning_rate": 0.00035534931353409927, + "loss": 5.1937, + "step": 872000 + }, + { + "epoch": 5.584536115467085, + "grad_norm": 0.55859375, + "learning_rate": 0.0003553237110762633, + "loss": 5.1894, + "step": 872500 + }, + { + "epoch": 5.587736422696579, + "grad_norm": 0.486328125, + "learning_rate": 0.0003552981086184274, + "loss": 5.1917, + "step": 873000 + }, + { + "epoch": 5.590936729926073, + "grad_norm": 0.494140625, + "learning_rate": 0.0003552725061605914, + "loss": 5.1891, + "step": 873500 + }, + { + "epoch": 5.594137037155567, + "grad_norm": 0.48046875, + "learning_rate": 0.0003552469037027555, + "loss": 5.1961, + "step": 874000 + }, + { + "epoch": 5.5973373443850605, + "grad_norm": 0.52734375, + "learning_rate": 0.00035522130124491955, + "loss": 5.1954, + "step": 874500 + }, + { + "epoch": 5.600537651614555, + "grad_norm": 0.48828125, + "learning_rate": 0.0003551956987870836, + "loss": 5.1918, + "step": 875000 + }, + { + "epoch": 5.603737958844049, + "grad_norm": 0.515625, + "learning_rate": 0.0003551700963292476, + "loss": 5.1932, + "step": 875500 + }, + { + "epoch": 5.606938266073543, + "grad_norm": 0.484375, + "learning_rate": 0.0003551444938714117, + "loss": 5.2006, + "step": 876000 + }, + { + "epoch": 5.610138573303037, + "grad_norm": 0.4921875, + "learning_rate": 0.00035511889141357574, + "loss": 5.1887, + "step": 876500 + }, + { + "epoch": 5.613338880532531, + "grad_norm": 0.52734375, + "learning_rate": 0.00035509328895573977, + "loss": 5.1883, + "step": 877000 + }, + { + "epoch": 5.616539187762025, + "grad_norm": 0.494140625, + "learning_rate": 0.0003550676864979038, + "loss": 5.1887, + "step": 877500 + }, + { + "epoch": 5.619739494991519, + "grad_norm": 0.466796875, + "learning_rate": 0.00035504208404006784, + "loss": 5.1867, + "step": 878000 + }, + { + "epoch": 5.622939802221013, + "grad_norm": 0.52734375, + "learning_rate": 0.00035501648158223193, + "loss": 5.196, + "step": 878500 + }, + { + "epoch": 5.626140109450507, + "grad_norm": 0.5546875, + "learning_rate": 0.00035499087912439596, + "loss": 5.1891, + "step": 879000 + }, + { + "epoch": 5.629340416680002, + "grad_norm": 0.48828125, + "learning_rate": 0.00035496527666656, + "loss": 5.1935, + "step": 879500 + }, + { + "epoch": 5.632540723909496, + "grad_norm": 0.515625, + "learning_rate": 0.00035493967420872403, + "loss": 5.1882, + "step": 880000 + }, + { + "epoch": 5.6357410311389895, + "grad_norm": 0.5234375, + "learning_rate": 0.00035491407175088807, + "loss": 5.1948, + "step": 880500 + }, + { + "epoch": 5.638941338368483, + "grad_norm": 0.462890625, + "learning_rate": 0.00035488846929305215, + "loss": 5.1945, + "step": 881000 + }, + { + "epoch": 5.642141645597977, + "grad_norm": 0.5234375, + "learning_rate": 0.0003548628668352162, + "loss": 5.1885, + "step": 881500 + }, + { + "epoch": 5.645341952827471, + "grad_norm": 0.4765625, + "learning_rate": 0.0003548372643773803, + "loss": 5.1935, + "step": 882000 + }, + { + "epoch": 5.648542260056965, + "grad_norm": 0.515625, + "learning_rate": 0.0003548116619195443, + "loss": 5.1941, + "step": 882500 + }, + { + "epoch": 5.651742567286459, + "grad_norm": 0.59765625, + "learning_rate": 0.00035478605946170834, + "loss": 5.1936, + "step": 883000 + }, + { + "epoch": 5.654942874515953, + "grad_norm": 0.462890625, + "learning_rate": 0.0003547604570038724, + "loss": 5.1873, + "step": 883500 + }, + { + "epoch": 5.658143181745448, + "grad_norm": 0.52734375, + "learning_rate": 0.00035473485454603647, + "loss": 5.1939, + "step": 884000 + }, + { + "epoch": 5.661343488974942, + "grad_norm": 0.5078125, + "learning_rate": 0.0003547092520882005, + "loss": 5.1914, + "step": 884500 + }, + { + "epoch": 5.664543796204436, + "grad_norm": 0.4921875, + "learning_rate": 0.00035468364963036454, + "loss": 5.1913, + "step": 885000 + }, + { + "epoch": 5.66774410343393, + "grad_norm": 0.515625, + "learning_rate": 0.00035465804717252857, + "loss": 5.1865, + "step": 885500 + }, + { + "epoch": 5.670944410663424, + "grad_norm": 0.5078125, + "learning_rate": 0.0003546324447146926, + "loss": 5.191, + "step": 886000 + }, + { + "epoch": 5.674144717892918, + "grad_norm": 0.54296875, + "learning_rate": 0.0003546068422568567, + "loss": 5.1936, + "step": 886500 + }, + { + "epoch": 5.677345025122412, + "grad_norm": 0.62109375, + "learning_rate": 0.0003545812397990207, + "loss": 5.1917, + "step": 887000 + }, + { + "epoch": 5.6805453323519055, + "grad_norm": 0.53125, + "learning_rate": 0.00035455563734118476, + "loss": 5.1941, + "step": 887500 + }, + { + "epoch": 5.6837456395813994, + "grad_norm": 0.48046875, + "learning_rate": 0.0003545300348833488, + "loss": 5.1957, + "step": 888000 + }, + { + "epoch": 5.686945946810894, + "grad_norm": 0.51953125, + "learning_rate": 0.00035450443242551283, + "loss": 5.1946, + "step": 888500 + }, + { + "epoch": 5.690146254040388, + "grad_norm": 0.50390625, + "learning_rate": 0.0003544788299676769, + "loss": 5.1872, + "step": 889000 + }, + { + "epoch": 5.693346561269882, + "grad_norm": 0.48828125, + "learning_rate": 0.00035445322750984095, + "loss": 5.1903, + "step": 889500 + }, + { + "epoch": 5.696546868499376, + "grad_norm": 0.54296875, + "learning_rate": 0.00035442762505200504, + "loss": 5.1979, + "step": 890000 + }, + { + "epoch": 5.69974717572887, + "grad_norm": 0.515625, + "learning_rate": 0.0003544020225941691, + "loss": 5.1933, + "step": 890500 + }, + { + "epoch": 5.702947482958364, + "grad_norm": 0.54296875, + "learning_rate": 0.0003543764201363331, + "loss": 5.1966, + "step": 891000 + }, + { + "epoch": 5.706147790187858, + "grad_norm": 0.50390625, + "learning_rate": 0.00035435081767849714, + "loss": 5.1889, + "step": 891500 + }, + { + "epoch": 5.709348097417352, + "grad_norm": 0.49609375, + "learning_rate": 0.00035432521522066123, + "loss": 5.1904, + "step": 892000 + }, + { + "epoch": 5.712548404646846, + "grad_norm": 0.498046875, + "learning_rate": 0.00035429961276282527, + "loss": 5.1933, + "step": 892500 + }, + { + "epoch": 5.715748711876341, + "grad_norm": 0.5078125, + "learning_rate": 0.0003542740103049893, + "loss": 5.1877, + "step": 893000 + }, + { + "epoch": 5.7189490191058345, + "grad_norm": 0.58203125, + "learning_rate": 0.00035424840784715333, + "loss": 5.1965, + "step": 893500 + }, + { + "epoch": 5.7221493263353285, + "grad_norm": 0.5234375, + "learning_rate": 0.00035422280538931737, + "loss": 5.1922, + "step": 894000 + }, + { + "epoch": 5.725349633564822, + "grad_norm": 0.50390625, + "learning_rate": 0.0003541972029314814, + "loss": 5.1867, + "step": 894500 + }, + { + "epoch": 5.728549940794316, + "grad_norm": 0.6171875, + "learning_rate": 0.0003541716004736455, + "loss": 5.1915, + "step": 895000 + }, + { + "epoch": 5.73175024802381, + "grad_norm": 0.50390625, + "learning_rate": 0.0003541459980158095, + "loss": 5.1982, + "step": 895500 + }, + { + "epoch": 5.734950555253304, + "grad_norm": 0.51953125, + "learning_rate": 0.00035412039555797356, + "loss": 5.1919, + "step": 896000 + }, + { + "epoch": 5.738150862482798, + "grad_norm": 0.48828125, + "learning_rate": 0.00035409479310013765, + "loss": 5.1929, + "step": 896500 + }, + { + "epoch": 5.741351169712292, + "grad_norm": 0.5390625, + "learning_rate": 0.0003540691906423017, + "loss": 5.1914, + "step": 897000 + }, + { + "epoch": 5.744551476941786, + "grad_norm": 0.54296875, + "learning_rate": 0.00035404358818446577, + "loss": 5.1876, + "step": 897500 + }, + { + "epoch": 5.74775178417128, + "grad_norm": 0.5, + "learning_rate": 0.0003540179857266298, + "loss": 5.1937, + "step": 898000 + }, + { + "epoch": 5.750952091400775, + "grad_norm": 0.515625, + "learning_rate": 0.00035399238326879384, + "loss": 5.194, + "step": 898500 + }, + { + "epoch": 5.754152398630269, + "grad_norm": 0.50390625, + "learning_rate": 0.0003539667808109579, + "loss": 5.1912, + "step": 899000 + }, + { + "epoch": 5.757352705859763, + "grad_norm": 0.51171875, + "learning_rate": 0.0003539411783531219, + "loss": 5.1917, + "step": 899500 + }, + { + "epoch": 5.760553013089257, + "grad_norm": 0.5390625, + "learning_rate": 0.00035391557589528594, + "loss": 5.1971, + "step": 900000 + }, + { + "epoch": 5.7637533203187505, + "grad_norm": 0.458984375, + "learning_rate": 0.00035388997343745003, + "loss": 5.1915, + "step": 900500 + }, + { + "epoch": 5.7669536275482445, + "grad_norm": 0.53125, + "learning_rate": 0.00035386437097961407, + "loss": 5.1934, + "step": 901000 + }, + { + "epoch": 5.770153934777738, + "grad_norm": 0.462890625, + "learning_rate": 0.0003538387685217781, + "loss": 5.1906, + "step": 901500 + }, + { + "epoch": 5.773354242007232, + "grad_norm": 0.482421875, + "learning_rate": 0.00035381316606394213, + "loss": 5.195, + "step": 902000 + }, + { + "epoch": 5.776554549236726, + "grad_norm": 0.4921875, + "learning_rate": 0.00035378756360610617, + "loss": 5.1931, + "step": 902500 + }, + { + "epoch": 5.779754856466221, + "grad_norm": 0.51171875, + "learning_rate": 0.00035376196114827026, + "loss": 5.1899, + "step": 903000 + }, + { + "epoch": 5.782955163695715, + "grad_norm": 0.609375, + "learning_rate": 0.0003537363586904343, + "loss": 5.1867, + "step": 903500 + }, + { + "epoch": 5.786155470925209, + "grad_norm": 0.52734375, + "learning_rate": 0.0003537107562325983, + "loss": 5.1938, + "step": 904000 + }, + { + "epoch": 5.789355778154703, + "grad_norm": 0.48828125, + "learning_rate": 0.0003536851537747624, + "loss": 5.1991, + "step": 904500 + }, + { + "epoch": 5.792556085384197, + "grad_norm": 0.51953125, + "learning_rate": 0.00035365955131692645, + "loss": 5.1889, + "step": 905000 + }, + { + "epoch": 5.795756392613691, + "grad_norm": 0.4609375, + "learning_rate": 0.0003536339488590905, + "loss": 5.1888, + "step": 905500 + }, + { + "epoch": 5.798956699843185, + "grad_norm": 0.51171875, + "learning_rate": 0.00035360834640125457, + "loss": 5.1912, + "step": 906000 + }, + { + "epoch": 5.802157007072679, + "grad_norm": 0.6015625, + "learning_rate": 0.0003535827439434186, + "loss": 5.1871, + "step": 906500 + }, + { + "epoch": 5.805357314302173, + "grad_norm": 0.5625, + "learning_rate": 0.00035355714148558264, + "loss": 5.198, + "step": 907000 + }, + { + "epoch": 5.808557621531667, + "grad_norm": 0.58984375, + "learning_rate": 0.0003535315390277467, + "loss": 5.1929, + "step": 907500 + }, + { + "epoch": 5.811757928761161, + "grad_norm": 0.5078125, + "learning_rate": 0.0003535059365699107, + "loss": 5.1934, + "step": 908000 + }, + { + "epoch": 5.814958235990655, + "grad_norm": 0.498046875, + "learning_rate": 0.0003534803341120748, + "loss": 5.1958, + "step": 908500 + }, + { + "epoch": 5.818158543220149, + "grad_norm": 0.55078125, + "learning_rate": 0.00035345473165423883, + "loss": 5.19, + "step": 909000 + }, + { + "epoch": 5.821358850449643, + "grad_norm": 0.52734375, + "learning_rate": 0.00035342912919640286, + "loss": 5.1926, + "step": 909500 + }, + { + "epoch": 5.824559157679137, + "grad_norm": 0.4765625, + "learning_rate": 0.0003534035267385669, + "loss": 5.196, + "step": 910000 + }, + { + "epoch": 5.827759464908631, + "grad_norm": 0.46484375, + "learning_rate": 0.00035337792428073093, + "loss": 5.1921, + "step": 910500 + }, + { + "epoch": 5.830959772138125, + "grad_norm": 0.486328125, + "learning_rate": 0.00035335232182289497, + "loss": 5.1961, + "step": 911000 + }, + { + "epoch": 5.834160079367619, + "grad_norm": 0.50390625, + "learning_rate": 0.00035332671936505906, + "loss": 5.1909, + "step": 911500 + }, + { + "epoch": 5.837360386597114, + "grad_norm": 0.6328125, + "learning_rate": 0.00035330111690722314, + "loss": 5.1908, + "step": 912000 + }, + { + "epoch": 5.840560693826608, + "grad_norm": 0.51171875, + "learning_rate": 0.0003532755144493872, + "loss": 5.199, + "step": 912500 + }, + { + "epoch": 5.843761001056102, + "grad_norm": 0.5390625, + "learning_rate": 0.0003532499119915512, + "loss": 5.1974, + "step": 913000 + }, + { + "epoch": 5.846961308285596, + "grad_norm": 0.51953125, + "learning_rate": 0.00035322430953371525, + "loss": 5.1941, + "step": 913500 + }, + { + "epoch": 5.8501616155150895, + "grad_norm": 0.5234375, + "learning_rate": 0.00035319870707587934, + "loss": 5.1948, + "step": 914000 + }, + { + "epoch": 5.8533619227445834, + "grad_norm": 0.494140625, + "learning_rate": 0.00035317310461804337, + "loss": 5.1903, + "step": 914500 + }, + { + "epoch": 5.856562229974077, + "grad_norm": 0.5078125, + "learning_rate": 0.0003531475021602074, + "loss": 5.1971, + "step": 915000 + }, + { + "epoch": 5.859762537203571, + "grad_norm": 0.490234375, + "learning_rate": 0.00035312189970237144, + "loss": 5.1977, + "step": 915500 + }, + { + "epoch": 5.862962844433065, + "grad_norm": 0.53515625, + "learning_rate": 0.00035309629724453547, + "loss": 5.1835, + "step": 916000 + }, + { + "epoch": 5.86616315166256, + "grad_norm": 0.51171875, + "learning_rate": 0.0003530706947866995, + "loss": 5.1871, + "step": 916500 + }, + { + "epoch": 5.869363458892054, + "grad_norm": 0.546875, + "learning_rate": 0.0003530450923288636, + "loss": 5.1921, + "step": 917000 + }, + { + "epoch": 5.872563766121548, + "grad_norm": 0.48046875, + "learning_rate": 0.00035301948987102763, + "loss": 5.1888, + "step": 917500 + }, + { + "epoch": 5.875764073351042, + "grad_norm": 0.5, + "learning_rate": 0.00035299388741319166, + "loss": 5.1879, + "step": 918000 + }, + { + "epoch": 5.878964380580536, + "grad_norm": 0.55078125, + "learning_rate": 0.0003529682849553557, + "loss": 5.1944, + "step": 918500 + }, + { + "epoch": 5.88216468781003, + "grad_norm": 0.5, + "learning_rate": 0.0003529426824975198, + "loss": 5.1927, + "step": 919000 + }, + { + "epoch": 5.885364995039524, + "grad_norm": 0.54296875, + "learning_rate": 0.0003529170800396838, + "loss": 5.1947, + "step": 919500 + }, + { + "epoch": 5.888565302269018, + "grad_norm": 0.51953125, + "learning_rate": 0.0003528914775818479, + "loss": 5.1938, + "step": 920000 + }, + { + "epoch": 5.891765609498512, + "grad_norm": 0.5234375, + "learning_rate": 0.00035286587512401194, + "loss": 5.1885, + "step": 920500 + }, + { + "epoch": 5.8949659167280055, + "grad_norm": 0.484375, + "learning_rate": 0.000352840272666176, + "loss": 5.193, + "step": 921000 + }, + { + "epoch": 5.8981662239574995, + "grad_norm": 0.53125, + "learning_rate": 0.00035281467020834, + "loss": 5.1907, + "step": 921500 + }, + { + "epoch": 5.901366531186994, + "grad_norm": 0.53125, + "learning_rate": 0.00035278906775050405, + "loss": 5.1884, + "step": 922000 + }, + { + "epoch": 5.904566838416488, + "grad_norm": 0.50390625, + "learning_rate": 0.00035276346529266813, + "loss": 5.1923, + "step": 922500 + }, + { + "epoch": 5.907767145645982, + "grad_norm": 0.4921875, + "learning_rate": 0.00035273786283483217, + "loss": 5.1928, + "step": 923000 + }, + { + "epoch": 5.910967452875476, + "grad_norm": 0.5, + "learning_rate": 0.0003527122603769962, + "loss": 5.1888, + "step": 923500 + }, + { + "epoch": 5.91416776010497, + "grad_norm": 0.48828125, + "learning_rate": 0.00035268665791916024, + "loss": 5.1952, + "step": 924000 + }, + { + "epoch": 5.917368067334464, + "grad_norm": 0.5, + "learning_rate": 0.00035266105546132427, + "loss": 5.1968, + "step": 924500 + }, + { + "epoch": 5.920568374563958, + "grad_norm": 0.5234375, + "learning_rate": 0.00035263545300348836, + "loss": 5.1912, + "step": 925000 + }, + { + "epoch": 5.923768681793452, + "grad_norm": 0.53125, + "learning_rate": 0.0003526098505456524, + "loss": 5.1918, + "step": 925500 + }, + { + "epoch": 5.926968989022946, + "grad_norm": 0.53125, + "learning_rate": 0.00035258424808781643, + "loss": 5.1855, + "step": 926000 + }, + { + "epoch": 5.930169296252441, + "grad_norm": 0.55859375, + "learning_rate": 0.00035255864562998046, + "loss": 5.1912, + "step": 926500 + }, + { + "epoch": 5.9333696034819345, + "grad_norm": 0.482421875, + "learning_rate": 0.00035253304317214455, + "loss": 5.1881, + "step": 927000 + }, + { + "epoch": 5.9365699107114285, + "grad_norm": 0.49609375, + "learning_rate": 0.00035250744071430864, + "loss": 5.1856, + "step": 927500 + }, + { + "epoch": 5.939770217940922, + "grad_norm": 0.51953125, + "learning_rate": 0.0003524818382564727, + "loss": 5.1917, + "step": 928000 + }, + { + "epoch": 5.942970525170416, + "grad_norm": 0.515625, + "learning_rate": 0.0003524562357986367, + "loss": 5.1948, + "step": 928500 + }, + { + "epoch": 5.94617083239991, + "grad_norm": 0.484375, + "learning_rate": 0.00035243063334080074, + "loss": 5.1927, + "step": 929000 + }, + { + "epoch": 5.949371139629404, + "grad_norm": 0.53515625, + "learning_rate": 0.0003524050308829648, + "loss": 5.1937, + "step": 929500 + }, + { + "epoch": 5.952571446858898, + "grad_norm": 0.5234375, + "learning_rate": 0.0003523794284251288, + "loss": 5.189, + "step": 930000 + }, + { + "epoch": 5.955771754088392, + "grad_norm": 0.5, + "learning_rate": 0.0003523538259672929, + "loss": 5.1947, + "step": 930500 + }, + { + "epoch": 5.958972061317887, + "grad_norm": 0.478515625, + "learning_rate": 0.00035232822350945693, + "loss": 5.1924, + "step": 931000 + }, + { + "epoch": 5.962172368547381, + "grad_norm": 0.49609375, + "learning_rate": 0.00035230262105162097, + "loss": 5.193, + "step": 931500 + }, + { + "epoch": 5.965372675776875, + "grad_norm": 0.482421875, + "learning_rate": 0.000352277018593785, + "loss": 5.1917, + "step": 932000 + }, + { + "epoch": 5.968572983006369, + "grad_norm": 0.54296875, + "learning_rate": 0.00035225141613594904, + "loss": 5.1942, + "step": 932500 + }, + { + "epoch": 5.971773290235863, + "grad_norm": 0.50390625, + "learning_rate": 0.0003522258136781131, + "loss": 5.1945, + "step": 933000 + }, + { + "epoch": 5.974973597465357, + "grad_norm": 0.5234375, + "learning_rate": 0.00035220021122027716, + "loss": 5.1909, + "step": 933500 + }, + { + "epoch": 5.9781739046948505, + "grad_norm": 0.56640625, + "learning_rate": 0.0003521746087624412, + "loss": 5.1906, + "step": 934000 + }, + { + "epoch": 5.9813742119243445, + "grad_norm": 0.4921875, + "learning_rate": 0.0003521490063046053, + "loss": 5.1929, + "step": 934500 + }, + { + "epoch": 5.984574519153838, + "grad_norm": 0.55078125, + "learning_rate": 0.0003521234038467693, + "loss": 5.1918, + "step": 935000 + }, + { + "epoch": 5.987774826383333, + "grad_norm": 0.53125, + "learning_rate": 0.00035209780138893335, + "loss": 5.1913, + "step": 935500 + }, + { + "epoch": 5.990975133612827, + "grad_norm": 0.54296875, + "learning_rate": 0.00035207219893109744, + "loss": 5.1875, + "step": 936000 + }, + { + "epoch": 5.994175440842321, + "grad_norm": 0.56640625, + "learning_rate": 0.00035204659647326147, + "loss": 5.1937, + "step": 936500 + }, + { + "epoch": 5.997375748071815, + "grad_norm": 0.5, + "learning_rate": 0.0003520209940154255, + "loss": 5.1945, + "step": 937000 + }, + { + "epoch": 6.0, + "eval_loss": 5.182580947875977, + "eval_runtime": 1.1238, + "eval_samples_per_second": 889.838, + "eval_steps_per_second": 14.237, + "step": 937410 + }, + { + "epoch": 6.000576055301309, + "grad_norm": 0.484375, + "learning_rate": 0.00035199539155758954, + "loss": 5.1917, + "step": 937500 + }, + { + "epoch": 6.003776362530803, + "grad_norm": 0.50390625, + "learning_rate": 0.0003519697890997536, + "loss": 5.1882, + "step": 938000 + }, + { + "epoch": 6.006976669760297, + "grad_norm": 0.5078125, + "learning_rate": 0.00035194418664191766, + "loss": 5.1892, + "step": 938500 + }, + { + "epoch": 6.010176976989791, + "grad_norm": 0.53515625, + "learning_rate": 0.0003519185841840817, + "loss": 5.193, + "step": 939000 + }, + { + "epoch": 6.013377284219285, + "grad_norm": 0.486328125, + "learning_rate": 0.00035189298172624573, + "loss": 5.1884, + "step": 939500 + }, + { + "epoch": 6.016577591448779, + "grad_norm": 0.49609375, + "learning_rate": 0.00035186737926840977, + "loss": 5.1862, + "step": 940000 + }, + { + "epoch": 6.0197778986782735, + "grad_norm": 0.515625, + "learning_rate": 0.0003518417768105738, + "loss": 5.1902, + "step": 940500 + }, + { + "epoch": 6.0229782059077674, + "grad_norm": 0.49609375, + "learning_rate": 0.00035181617435273783, + "loss": 5.1893, + "step": 941000 + }, + { + "epoch": 6.026178513137261, + "grad_norm": 0.51953125, + "learning_rate": 0.0003517905718949019, + "loss": 5.1898, + "step": 941500 + }, + { + "epoch": 6.029378820366755, + "grad_norm": 0.5234375, + "learning_rate": 0.00035176496943706596, + "loss": 5.1912, + "step": 942000 + }, + { + "epoch": 6.032579127596249, + "grad_norm": 0.51953125, + "learning_rate": 0.00035173936697923005, + "loss": 5.1923, + "step": 942500 + }, + { + "epoch": 6.035779434825743, + "grad_norm": 0.55859375, + "learning_rate": 0.0003517137645213941, + "loss": 5.1908, + "step": 943000 + }, + { + "epoch": 6.038979742055237, + "grad_norm": 0.52734375, + "learning_rate": 0.0003516881620635581, + "loss": 5.1907, + "step": 943500 + }, + { + "epoch": 6.042180049284731, + "grad_norm": 0.462890625, + "learning_rate": 0.0003516625596057222, + "loss": 5.1906, + "step": 944000 + }, + { + "epoch": 6.045380356514225, + "grad_norm": 0.515625, + "learning_rate": 0.00035163695714788624, + "loss": 5.1924, + "step": 944500 + }, + { + "epoch": 6.04858066374372, + "grad_norm": 0.5234375, + "learning_rate": 0.00035161135469005027, + "loss": 5.1851, + "step": 945000 + }, + { + "epoch": 6.051780970973214, + "grad_norm": 0.54296875, + "learning_rate": 0.0003515857522322143, + "loss": 5.1892, + "step": 945500 + }, + { + "epoch": 6.054981278202708, + "grad_norm": 0.62109375, + "learning_rate": 0.00035156014977437834, + "loss": 5.1903, + "step": 946000 + }, + { + "epoch": 6.058181585432202, + "grad_norm": 0.49609375, + "learning_rate": 0.0003515345473165424, + "loss": 5.1901, + "step": 946500 + }, + { + "epoch": 6.061381892661696, + "grad_norm": 0.5546875, + "learning_rate": 0.00035150894485870646, + "loss": 5.1916, + "step": 947000 + }, + { + "epoch": 6.0645821998911895, + "grad_norm": 0.515625, + "learning_rate": 0.0003514833424008705, + "loss": 5.1928, + "step": 947500 + }, + { + "epoch": 6.0677825071206835, + "grad_norm": 0.486328125, + "learning_rate": 0.00035145773994303453, + "loss": 5.194, + "step": 948000 + }, + { + "epoch": 6.070982814350177, + "grad_norm": 0.51953125, + "learning_rate": 0.00035143213748519857, + "loss": 5.1887, + "step": 948500 + }, + { + "epoch": 6.074183121579671, + "grad_norm": 0.5546875, + "learning_rate": 0.00035140653502736265, + "loss": 5.1883, + "step": 949000 + }, + { + "epoch": 6.077383428809165, + "grad_norm": 0.5078125, + "learning_rate": 0.0003513809325695267, + "loss": 5.1863, + "step": 949500 + }, + { + "epoch": 6.08058373603866, + "grad_norm": 0.51953125, + "learning_rate": 0.0003513553301116908, + "loss": 5.1892, + "step": 950000 + }, + { + "epoch": 6.083784043268154, + "grad_norm": 0.5546875, + "learning_rate": 0.0003513297276538548, + "loss": 5.1887, + "step": 950500 + }, + { + "epoch": 6.086984350497648, + "grad_norm": 0.51953125, + "learning_rate": 0.00035130412519601884, + "loss": 5.1915, + "step": 951000 + }, + { + "epoch": 6.090184657727142, + "grad_norm": 0.55078125, + "learning_rate": 0.0003512785227381829, + "loss": 5.1833, + "step": 951500 + }, + { + "epoch": 6.093384964956636, + "grad_norm": 0.48046875, + "learning_rate": 0.0003512529202803469, + "loss": 5.1895, + "step": 952000 + }, + { + "epoch": 6.09658527218613, + "grad_norm": 0.51171875, + "learning_rate": 0.000351227317822511, + "loss": 5.1865, + "step": 952500 + }, + { + "epoch": 6.099785579415624, + "grad_norm": 0.51171875, + "learning_rate": 0.00035120171536467504, + "loss": 5.1873, + "step": 953000 + }, + { + "epoch": 6.102985886645118, + "grad_norm": 0.59375, + "learning_rate": 0.00035117611290683907, + "loss": 5.1876, + "step": 953500 + }, + { + "epoch": 6.106186193874612, + "grad_norm": 0.50390625, + "learning_rate": 0.0003511505104490031, + "loss": 5.1868, + "step": 954000 + }, + { + "epoch": 6.109386501104106, + "grad_norm": 0.5703125, + "learning_rate": 0.00035112490799116714, + "loss": 5.1895, + "step": 954500 + }, + { + "epoch": 6.1125868083336, + "grad_norm": 0.4921875, + "learning_rate": 0.00035109930553333123, + "loss": 5.1849, + "step": 955000 + }, + { + "epoch": 6.115787115563094, + "grad_norm": 0.51171875, + "learning_rate": 0.00035107370307549526, + "loss": 5.1908, + "step": 955500 + }, + { + "epoch": 6.118987422792588, + "grad_norm": 0.5, + "learning_rate": 0.0003510481006176593, + "loss": 5.1902, + "step": 956000 + }, + { + "epoch": 6.122187730022082, + "grad_norm": 0.57421875, + "learning_rate": 0.00035102249815982333, + "loss": 5.1848, + "step": 956500 + }, + { + "epoch": 6.125388037251576, + "grad_norm": 0.498046875, + "learning_rate": 0.0003509968957019874, + "loss": 5.1872, + "step": 957000 + }, + { + "epoch": 6.12858834448107, + "grad_norm": 0.5390625, + "learning_rate": 0.00035097129324415145, + "loss": 5.1885, + "step": 957500 + }, + { + "epoch": 6.131788651710564, + "grad_norm": 0.51953125, + "learning_rate": 0.00035094569078631554, + "loss": 5.1939, + "step": 958000 + }, + { + "epoch": 6.134988958940058, + "grad_norm": 0.56640625, + "learning_rate": 0.0003509200883284796, + "loss": 5.1867, + "step": 958500 + }, + { + "epoch": 6.138189266169552, + "grad_norm": 0.51953125, + "learning_rate": 0.0003508944858706436, + "loss": 5.1911, + "step": 959000 + }, + { + "epoch": 6.141389573399047, + "grad_norm": 0.578125, + "learning_rate": 0.00035086888341280764, + "loss": 5.1907, + "step": 959500 + }, + { + "epoch": 6.144589880628541, + "grad_norm": 0.53515625, + "learning_rate": 0.0003508432809549717, + "loss": 5.1826, + "step": 960000 + }, + { + "epoch": 6.1477901878580345, + "grad_norm": 0.546875, + "learning_rate": 0.00035081767849713577, + "loss": 5.1895, + "step": 960500 + }, + { + "epoch": 6.1509904950875285, + "grad_norm": 0.515625, + "learning_rate": 0.0003507920760392998, + "loss": 5.1838, + "step": 961000 + }, + { + "epoch": 6.154190802317022, + "grad_norm": 0.51171875, + "learning_rate": 0.00035076647358146384, + "loss": 5.189, + "step": 961500 + }, + { + "epoch": 6.157391109546516, + "grad_norm": 0.55859375, + "learning_rate": 0.00035074087112362787, + "loss": 5.197, + "step": 962000 + }, + { + "epoch": 6.16059141677601, + "grad_norm": 0.4921875, + "learning_rate": 0.0003507152686657919, + "loss": 5.1883, + "step": 962500 + }, + { + "epoch": 6.163791724005504, + "grad_norm": 0.54296875, + "learning_rate": 0.000350689666207956, + "loss": 5.1886, + "step": 963000 + }, + { + "epoch": 6.166992031234998, + "grad_norm": 0.5, + "learning_rate": 0.00035066406375012, + "loss": 5.1899, + "step": 963500 + }, + { + "epoch": 6.170192338464493, + "grad_norm": 0.5234375, + "learning_rate": 0.00035063846129228406, + "loss": 5.188, + "step": 964000 + }, + { + "epoch": 6.173392645693987, + "grad_norm": 0.5859375, + "learning_rate": 0.0003506128588344481, + "loss": 5.1878, + "step": 964500 + }, + { + "epoch": 6.176592952923481, + "grad_norm": 0.58984375, + "learning_rate": 0.0003505872563766122, + "loss": 5.1893, + "step": 965000 + }, + { + "epoch": 6.179793260152975, + "grad_norm": 0.484375, + "learning_rate": 0.0003505616539187762, + "loss": 5.1954, + "step": 965500 + }, + { + "epoch": 6.182993567382469, + "grad_norm": 0.5, + "learning_rate": 0.0003505360514609403, + "loss": 5.1895, + "step": 966000 + }, + { + "epoch": 6.186193874611963, + "grad_norm": 0.5546875, + "learning_rate": 0.00035051044900310434, + "loss": 5.1938, + "step": 966500 + }, + { + "epoch": 6.189394181841457, + "grad_norm": 0.51171875, + "learning_rate": 0.0003504848465452684, + "loss": 5.1846, + "step": 967000 + }, + { + "epoch": 6.192594489070951, + "grad_norm": 0.546875, + "learning_rate": 0.0003504592440874324, + "loss": 5.1904, + "step": 967500 + }, + { + "epoch": 6.1957947963004445, + "grad_norm": 0.515625, + "learning_rate": 0.00035043364162959644, + "loss": 5.1925, + "step": 968000 + }, + { + "epoch": 6.198995103529938, + "grad_norm": 0.5546875, + "learning_rate": 0.00035040803917176053, + "loss": 5.1875, + "step": 968500 + }, + { + "epoch": 6.202195410759433, + "grad_norm": 0.5078125, + "learning_rate": 0.00035038243671392457, + "loss": 5.1934, + "step": 969000 + }, + { + "epoch": 6.205395717988927, + "grad_norm": 0.51953125, + "learning_rate": 0.0003503568342560886, + "loss": 5.1878, + "step": 969500 + }, + { + "epoch": 6.208596025218421, + "grad_norm": 0.51953125, + "learning_rate": 0.00035033123179825263, + "loss": 5.1826, + "step": 970000 + }, + { + "epoch": 6.211796332447915, + "grad_norm": 0.55859375, + "learning_rate": 0.00035030562934041667, + "loss": 5.1873, + "step": 970500 + }, + { + "epoch": 6.214996639677409, + "grad_norm": 0.54296875, + "learning_rate": 0.0003502800268825807, + "loss": 5.1948, + "step": 971000 + }, + { + "epoch": 6.218196946906903, + "grad_norm": 0.4921875, + "learning_rate": 0.0003502544244247448, + "loss": 5.1897, + "step": 971500 + }, + { + "epoch": 6.221397254136397, + "grad_norm": 0.50390625, + "learning_rate": 0.0003502288219669088, + "loss": 5.19, + "step": 972000 + }, + { + "epoch": 6.224597561365891, + "grad_norm": 0.5703125, + "learning_rate": 0.0003502032195090729, + "loss": 5.1866, + "step": 972500 + }, + { + "epoch": 6.227797868595385, + "grad_norm": 0.50390625, + "learning_rate": 0.00035017761705123695, + "loss": 5.1908, + "step": 973000 + }, + { + "epoch": 6.23099817582488, + "grad_norm": 0.52734375, + "learning_rate": 0.000350152014593401, + "loss": 5.1909, + "step": 973500 + }, + { + "epoch": 6.2341984830543735, + "grad_norm": 0.54296875, + "learning_rate": 0.00035012641213556507, + "loss": 5.1882, + "step": 974000 + }, + { + "epoch": 6.2373987902838675, + "grad_norm": 0.58984375, + "learning_rate": 0.0003501008096777291, + "loss": 5.1924, + "step": 974500 + }, + { + "epoch": 6.240599097513361, + "grad_norm": 0.5703125, + "learning_rate": 0.00035007520721989314, + "loss": 5.1889, + "step": 975000 + }, + { + "epoch": 6.243799404742855, + "grad_norm": 0.51171875, + "learning_rate": 0.0003500496047620572, + "loss": 5.1881, + "step": 975500 + }, + { + "epoch": 6.246999711972349, + "grad_norm": 0.5859375, + "learning_rate": 0.0003500240023042212, + "loss": 5.188, + "step": 976000 + }, + { + "epoch": 6.250200019201843, + "grad_norm": 0.5078125, + "learning_rate": 0.00034999839984638524, + "loss": 5.1906, + "step": 976500 + }, + { + "epoch": 6.253400326431337, + "grad_norm": 0.48828125, + "learning_rate": 0.00034997279738854933, + "loss": 5.184, + "step": 977000 + }, + { + "epoch": 6.256600633660831, + "grad_norm": 0.52734375, + "learning_rate": 0.00034994719493071336, + "loss": 5.1946, + "step": 977500 + }, + { + "epoch": 6.259800940890326, + "grad_norm": 0.52734375, + "learning_rate": 0.0003499215924728774, + "loss": 5.1867, + "step": 978000 + }, + { + "epoch": 6.26300124811982, + "grad_norm": 0.5078125, + "learning_rate": 0.00034989599001504143, + "loss": 5.1909, + "step": 978500 + }, + { + "epoch": 6.266201555349314, + "grad_norm": 0.54296875, + "learning_rate": 0.00034987038755720547, + "loss": 5.192, + "step": 979000 + }, + { + "epoch": 6.269401862578808, + "grad_norm": 0.515625, + "learning_rate": 0.00034984478509936956, + "loss": 5.1862, + "step": 979500 + }, + { + "epoch": 6.272602169808302, + "grad_norm": 0.478515625, + "learning_rate": 0.0003498191826415336, + "loss": 5.1925, + "step": 980000 + }, + { + "epoch": 6.275802477037796, + "grad_norm": 0.490234375, + "learning_rate": 0.0003497935801836977, + "loss": 5.1927, + "step": 980500 + }, + { + "epoch": 6.2790027842672895, + "grad_norm": 0.50390625, + "learning_rate": 0.0003497679777258617, + "loss": 5.1869, + "step": 981000 + }, + { + "epoch": 6.2822030914967835, + "grad_norm": 0.515625, + "learning_rate": 0.00034974237526802575, + "loss": 5.1929, + "step": 981500 + }, + { + "epoch": 6.285403398726277, + "grad_norm": 0.52734375, + "learning_rate": 0.0003497167728101898, + "loss": 5.1874, + "step": 982000 + }, + { + "epoch": 6.288603705955771, + "grad_norm": 0.57421875, + "learning_rate": 0.00034969117035235387, + "loss": 5.1839, + "step": 982500 + }, + { + "epoch": 6.291804013185266, + "grad_norm": 0.57421875, + "learning_rate": 0.0003496655678945179, + "loss": 5.1902, + "step": 983000 + }, + { + "epoch": 6.29500432041476, + "grad_norm": 0.54296875, + "learning_rate": 0.00034963996543668194, + "loss": 5.1861, + "step": 983500 + }, + { + "epoch": 6.298204627644254, + "grad_norm": 0.5703125, + "learning_rate": 0.00034961436297884597, + "loss": 5.1926, + "step": 984000 + }, + { + "epoch": 6.301404934873748, + "grad_norm": 0.490234375, + "learning_rate": 0.00034958876052101, + "loss": 5.1846, + "step": 984500 + }, + { + "epoch": 6.304605242103242, + "grad_norm": 0.53125, + "learning_rate": 0.0003495631580631741, + "loss": 5.1901, + "step": 985000 + }, + { + "epoch": 6.307805549332736, + "grad_norm": 0.515625, + "learning_rate": 0.00034953755560533813, + "loss": 5.1898, + "step": 985500 + }, + { + "epoch": 6.31100585656223, + "grad_norm": 0.53515625, + "learning_rate": 0.00034951195314750216, + "loss": 5.1932, + "step": 986000 + }, + { + "epoch": 6.314206163791724, + "grad_norm": 0.66015625, + "learning_rate": 0.0003494863506896662, + "loss": 5.186, + "step": 986500 + }, + { + "epoch": 6.317406471021218, + "grad_norm": 0.482421875, + "learning_rate": 0.0003494607482318303, + "loss": 5.192, + "step": 987000 + }, + { + "epoch": 6.320606778250712, + "grad_norm": 0.57421875, + "learning_rate": 0.0003494351457739943, + "loss": 5.193, + "step": 987500 + }, + { + "epoch": 6.323807085480206, + "grad_norm": 0.5234375, + "learning_rate": 0.0003494095433161584, + "loss": 5.1917, + "step": 988000 + }, + { + "epoch": 6.3270073927097, + "grad_norm": 0.5234375, + "learning_rate": 0.00034938394085832244, + "loss": 5.188, + "step": 988500 + }, + { + "epoch": 6.330207699939194, + "grad_norm": 0.625, + "learning_rate": 0.0003493583384004865, + "loss": 5.19, + "step": 989000 + }, + { + "epoch": 6.333408007168688, + "grad_norm": 0.484375, + "learning_rate": 0.0003493327359426505, + "loss": 5.1942, + "step": 989500 + }, + { + "epoch": 6.336608314398182, + "grad_norm": 0.5625, + "learning_rate": 0.00034930713348481455, + "loss": 5.1932, + "step": 990000 + }, + { + "epoch": 6.339808621627676, + "grad_norm": 0.5234375, + "learning_rate": 0.00034928153102697863, + "loss": 5.189, + "step": 990500 + }, + { + "epoch": 6.34300892885717, + "grad_norm": 0.5625, + "learning_rate": 0.00034925592856914267, + "loss": 5.1888, + "step": 991000 + }, + { + "epoch": 6.346209236086664, + "grad_norm": 0.51171875, + "learning_rate": 0.0003492303261113067, + "loss": 5.1849, + "step": 991500 + }, + { + "epoch": 6.349409543316158, + "grad_norm": 0.49609375, + "learning_rate": 0.00034920472365347074, + "loss": 5.1969, + "step": 992000 + }, + { + "epoch": 6.352609850545653, + "grad_norm": 0.5234375, + "learning_rate": 0.00034917912119563477, + "loss": 5.1877, + "step": 992500 + }, + { + "epoch": 6.355810157775147, + "grad_norm": 0.5234375, + "learning_rate": 0.0003491535187377988, + "loss": 5.1914, + "step": 993000 + }, + { + "epoch": 6.359010465004641, + "grad_norm": 0.5234375, + "learning_rate": 0.0003491279162799629, + "loss": 5.1909, + "step": 993500 + }, + { + "epoch": 6.362210772234135, + "grad_norm": 0.515625, + "learning_rate": 0.00034910231382212693, + "loss": 5.1861, + "step": 994000 + }, + { + "epoch": 6.3654110794636285, + "grad_norm": 0.5, + "learning_rate": 0.00034907671136429096, + "loss": 5.1896, + "step": 994500 + }, + { + "epoch": 6.368611386693122, + "grad_norm": 0.5859375, + "learning_rate": 0.00034905110890645505, + "loss": 5.1852, + "step": 995000 + }, + { + "epoch": 6.371811693922616, + "grad_norm": 0.5078125, + "learning_rate": 0.0003490255064486191, + "loss": 5.1914, + "step": 995500 + }, + { + "epoch": 6.37501200115211, + "grad_norm": 0.498046875, + "learning_rate": 0.0003489999039907832, + "loss": 5.1831, + "step": 996000 + }, + { + "epoch": 6.378212308381604, + "grad_norm": 0.490234375, + "learning_rate": 0.0003489743015329472, + "loss": 5.1936, + "step": 996500 + }, + { + "epoch": 6.381412615611099, + "grad_norm": 0.55859375, + "learning_rate": 0.00034894869907511124, + "loss": 5.1896, + "step": 997000 + }, + { + "epoch": 6.384612922840593, + "grad_norm": 0.4609375, + "learning_rate": 0.0003489230966172753, + "loss": 5.1879, + "step": 997500 + }, + { + "epoch": 6.387813230070087, + "grad_norm": 0.578125, + "learning_rate": 0.0003488974941594393, + "loss": 5.1874, + "step": 998000 + }, + { + "epoch": 6.391013537299581, + "grad_norm": 0.55859375, + "learning_rate": 0.0003488718917016034, + "loss": 5.1846, + "step": 998500 + }, + { + "epoch": 6.394213844529075, + "grad_norm": 0.5703125, + "learning_rate": 0.00034884628924376743, + "loss": 5.1941, + "step": 999000 + }, + { + "epoch": 6.397414151758569, + "grad_norm": 0.58203125, + "learning_rate": 0.00034882068678593147, + "loss": 5.1869, + "step": 999500 + }, + { + "epoch": 6.400614458988063, + "grad_norm": 0.4921875, + "learning_rate": 0.0003487950843280955, + "loss": 5.1922, + "step": 1000000 + }, + { + "epoch": 6.403814766217557, + "grad_norm": 0.48828125, + "learning_rate": 0.00034876948187025954, + "loss": 5.1877, + "step": 1000500 + }, + { + "epoch": 6.407015073447051, + "grad_norm": 0.59375, + "learning_rate": 0.00034874387941242357, + "loss": 5.1912, + "step": 1001000 + }, + { + "epoch": 6.410215380676545, + "grad_norm": 0.5078125, + "learning_rate": 0.00034871827695458766, + "loss": 5.1865, + "step": 1001500 + }, + { + "epoch": 6.413415687906039, + "grad_norm": 0.50390625, + "learning_rate": 0.0003486926744967517, + "loss": 5.1884, + "step": 1002000 + }, + { + "epoch": 6.416615995135533, + "grad_norm": 0.55859375, + "learning_rate": 0.00034866707203891573, + "loss": 5.1913, + "step": 1002500 + }, + { + "epoch": 6.419816302365027, + "grad_norm": 0.5625, + "learning_rate": 0.0003486414695810798, + "loss": 5.1852, + "step": 1003000 + }, + { + "epoch": 6.423016609594521, + "grad_norm": 0.5390625, + "learning_rate": 0.00034861586712324385, + "loss": 5.1951, + "step": 1003500 + }, + { + "epoch": 6.426216916824015, + "grad_norm": 0.51171875, + "learning_rate": 0.00034859026466540794, + "loss": 5.1873, + "step": 1004000 + }, + { + "epoch": 6.429417224053509, + "grad_norm": 0.515625, + "learning_rate": 0.00034856466220757197, + "loss": 5.1911, + "step": 1004500 + }, + { + "epoch": 6.432617531283003, + "grad_norm": 0.58984375, + "learning_rate": 0.000348539059749736, + "loss": 5.1974, + "step": 1005000 + }, + { + "epoch": 6.435817838512497, + "grad_norm": 0.5078125, + "learning_rate": 0.00034851345729190004, + "loss": 5.1876, + "step": 1005500 + }, + { + "epoch": 6.439018145741991, + "grad_norm": 0.5625, + "learning_rate": 0.0003484878548340641, + "loss": 5.19, + "step": 1006000 + }, + { + "epoch": 6.442218452971486, + "grad_norm": 0.6015625, + "learning_rate": 0.0003484622523762281, + "loss": 5.1888, + "step": 1006500 + }, + { + "epoch": 6.44541876020098, + "grad_norm": 0.5390625, + "learning_rate": 0.0003484366499183922, + "loss": 5.1857, + "step": 1007000 + }, + { + "epoch": 6.4486190674304735, + "grad_norm": 0.5234375, + "learning_rate": 0.00034841104746055623, + "loss": 5.1881, + "step": 1007500 + }, + { + "epoch": 6.4518193746599675, + "grad_norm": 0.51953125, + "learning_rate": 0.00034838544500272027, + "loss": 5.1899, + "step": 1008000 + }, + { + "epoch": 6.455019681889461, + "grad_norm": 0.515625, + "learning_rate": 0.0003483598425448843, + "loss": 5.1891, + "step": 1008500 + }, + { + "epoch": 6.458219989118955, + "grad_norm": 0.48828125, + "learning_rate": 0.00034833424008704833, + "loss": 5.1972, + "step": 1009000 + }, + { + "epoch": 6.461420296348449, + "grad_norm": 0.59375, + "learning_rate": 0.0003483086376292124, + "loss": 5.1911, + "step": 1009500 + }, + { + "epoch": 6.464620603577943, + "grad_norm": 0.53515625, + "learning_rate": 0.00034828303517137646, + "loss": 5.1908, + "step": 1010000 + }, + { + "epoch": 6.467820910807437, + "grad_norm": 0.546875, + "learning_rate": 0.00034825743271354055, + "loss": 5.1851, + "step": 1010500 + }, + { + "epoch": 6.471021218036931, + "grad_norm": 0.54296875, + "learning_rate": 0.0003482318302557046, + "loss": 5.1934, + "step": 1011000 + }, + { + "epoch": 6.474221525266426, + "grad_norm": 0.53125, + "learning_rate": 0.0003482062277978686, + "loss": 5.1904, + "step": 1011500 + }, + { + "epoch": 6.47742183249592, + "grad_norm": 0.5390625, + "learning_rate": 0.00034818062534003265, + "loss": 5.1897, + "step": 1012000 + }, + { + "epoch": 6.480622139725414, + "grad_norm": 0.5234375, + "learning_rate": 0.00034815502288219674, + "loss": 5.1882, + "step": 1012500 + }, + { + "epoch": 6.483822446954908, + "grad_norm": 0.5078125, + "learning_rate": 0.00034812942042436077, + "loss": 5.1908, + "step": 1013000 + }, + { + "epoch": 6.487022754184402, + "grad_norm": 0.51171875, + "learning_rate": 0.0003481038179665248, + "loss": 5.1889, + "step": 1013500 + }, + { + "epoch": 6.490223061413896, + "grad_norm": 0.53125, + "learning_rate": 0.00034807821550868884, + "loss": 5.1961, + "step": 1014000 + }, + { + "epoch": 6.4934233686433895, + "grad_norm": 0.474609375, + "learning_rate": 0.0003480526130508529, + "loss": 5.1916, + "step": 1014500 + }, + { + "epoch": 6.4966236758728835, + "grad_norm": 0.4921875, + "learning_rate": 0.00034802701059301696, + "loss": 5.1997, + "step": 1015000 + }, + { + "epoch": 6.499823983102377, + "grad_norm": 0.5625, + "learning_rate": 0.000348001408135181, + "loss": 5.1884, + "step": 1015500 + }, + { + "epoch": 6.503024290331872, + "grad_norm": 0.52734375, + "learning_rate": 0.00034797580567734503, + "loss": 5.1854, + "step": 1016000 + }, + { + "epoch": 6.506224597561366, + "grad_norm": 0.52734375, + "learning_rate": 0.00034795020321950907, + "loss": 5.1893, + "step": 1016500 + }, + { + "epoch": 6.50942490479086, + "grad_norm": 0.4921875, + "learning_rate": 0.0003479246007616731, + "loss": 5.1924, + "step": 1017000 + }, + { + "epoch": 6.512625212020354, + "grad_norm": 0.51953125, + "learning_rate": 0.0003478989983038372, + "loss": 5.1975, + "step": 1017500 + }, + { + "epoch": 6.515825519249848, + "grad_norm": 0.52734375, + "learning_rate": 0.0003478733958460012, + "loss": 5.1912, + "step": 1018000 + }, + { + "epoch": 6.519025826479342, + "grad_norm": 0.671875, + "learning_rate": 0.0003478477933881653, + "loss": 5.1807, + "step": 1018500 + }, + { + "epoch": 6.522226133708836, + "grad_norm": 0.51953125, + "learning_rate": 0.00034782219093032935, + "loss": 5.1958, + "step": 1019000 + }, + { + "epoch": 6.52542644093833, + "grad_norm": 0.54296875, + "learning_rate": 0.0003477965884724934, + "loss": 5.1963, + "step": 1019500 + }, + { + "epoch": 6.528626748167824, + "grad_norm": 0.57421875, + "learning_rate": 0.0003477709860146574, + "loss": 5.1893, + "step": 1020000 + }, + { + "epoch": 6.531827055397319, + "grad_norm": 0.5, + "learning_rate": 0.0003477453835568215, + "loss": 5.1881, + "step": 1020500 + }, + { + "epoch": 6.5350273626268125, + "grad_norm": 0.56640625, + "learning_rate": 0.00034771978109898554, + "loss": 5.1903, + "step": 1021000 + }, + { + "epoch": 6.538227669856306, + "grad_norm": 0.515625, + "learning_rate": 0.00034769417864114957, + "loss": 5.193, + "step": 1021500 + }, + { + "epoch": 6.5414279770858, + "grad_norm": 0.58984375, + "learning_rate": 0.0003476685761833136, + "loss": 5.195, + "step": 1022000 + }, + { + "epoch": 6.544628284315294, + "grad_norm": 0.4765625, + "learning_rate": 0.00034764297372547764, + "loss": 5.1893, + "step": 1022500 + }, + { + "epoch": 6.547828591544788, + "grad_norm": 0.53125, + "learning_rate": 0.0003476173712676417, + "loss": 5.1921, + "step": 1023000 + }, + { + "epoch": 6.551028898774282, + "grad_norm": 0.52734375, + "learning_rate": 0.00034759176880980576, + "loss": 5.1918, + "step": 1023500 + }, + { + "epoch": 6.554229206003776, + "grad_norm": 0.5078125, + "learning_rate": 0.0003475661663519698, + "loss": 5.1978, + "step": 1024000 + }, + { + "epoch": 6.55742951323327, + "grad_norm": 0.58203125, + "learning_rate": 0.00034754056389413383, + "loss": 5.189, + "step": 1024500 + }, + { + "epoch": 6.560629820462765, + "grad_norm": 0.49609375, + "learning_rate": 0.0003475149614362979, + "loss": 5.1892, + "step": 1025000 + }, + { + "epoch": 6.563830127692259, + "grad_norm": 0.55859375, + "learning_rate": 0.00034748935897846195, + "loss": 5.1897, + "step": 1025500 + }, + { + "epoch": 6.567030434921753, + "grad_norm": 0.4921875, + "learning_rate": 0.00034746375652062604, + "loss": 5.1944, + "step": 1026000 + }, + { + "epoch": 6.570230742151247, + "grad_norm": 0.48828125, + "learning_rate": 0.0003474381540627901, + "loss": 5.192, + "step": 1026500 + }, + { + "epoch": 6.573431049380741, + "grad_norm": 0.55078125, + "learning_rate": 0.0003474125516049541, + "loss": 5.1942, + "step": 1027000 + }, + { + "epoch": 6.576631356610235, + "grad_norm": 0.53515625, + "learning_rate": 0.00034738694914711814, + "loss": 5.185, + "step": 1027500 + }, + { + "epoch": 6.5798316638397285, + "grad_norm": 0.51171875, + "learning_rate": 0.0003473613466892822, + "loss": 5.1918, + "step": 1028000 + }, + { + "epoch": 6.583031971069222, + "grad_norm": 0.546875, + "learning_rate": 0.0003473357442314462, + "loss": 5.1896, + "step": 1028500 + }, + { + "epoch": 6.586232278298716, + "grad_norm": 0.46875, + "learning_rate": 0.0003473101417736103, + "loss": 5.1934, + "step": 1029000 + }, + { + "epoch": 6.589432585528211, + "grad_norm": 0.578125, + "learning_rate": 0.00034728453931577434, + "loss": 5.1935, + "step": 1029500 + }, + { + "epoch": 6.592632892757704, + "grad_norm": 0.53515625, + "learning_rate": 0.00034725893685793837, + "loss": 5.1943, + "step": 1030000 + }, + { + "epoch": 6.595833199987199, + "grad_norm": 0.51171875, + "learning_rate": 0.0003472333344001024, + "loss": 5.1917, + "step": 1030500 + }, + { + "epoch": 6.599033507216693, + "grad_norm": 0.5625, + "learning_rate": 0.00034720773194226644, + "loss": 5.1918, + "step": 1031000 + }, + { + "epoch": 6.602233814446187, + "grad_norm": 0.55859375, + "learning_rate": 0.0003471821294844305, + "loss": 5.1897, + "step": 1031500 + }, + { + "epoch": 6.605434121675681, + "grad_norm": 0.474609375, + "learning_rate": 0.00034715652702659456, + "loss": 5.1899, + "step": 1032000 + }, + { + "epoch": 6.608634428905175, + "grad_norm": 0.52734375, + "learning_rate": 0.0003471309245687586, + "loss": 5.1846, + "step": 1032500 + }, + { + "epoch": 6.611834736134669, + "grad_norm": 0.5390625, + "learning_rate": 0.0003471053221109227, + "loss": 5.1905, + "step": 1033000 + }, + { + "epoch": 6.615035043364163, + "grad_norm": 0.55859375, + "learning_rate": 0.0003470797196530867, + "loss": 5.1933, + "step": 1033500 + }, + { + "epoch": 6.618235350593657, + "grad_norm": 0.52734375, + "learning_rate": 0.00034705411719525075, + "loss": 5.197, + "step": 1034000 + }, + { + "epoch": 6.621435657823151, + "grad_norm": 0.52734375, + "learning_rate": 0.00034702851473741484, + "loss": 5.1931, + "step": 1034500 + }, + { + "epoch": 6.624635965052645, + "grad_norm": 0.54296875, + "learning_rate": 0.0003470029122795789, + "loss": 5.1942, + "step": 1035000 + }, + { + "epoch": 6.627836272282139, + "grad_norm": 0.4765625, + "learning_rate": 0.0003469773098217429, + "loss": 5.1872, + "step": 1035500 + }, + { + "epoch": 6.631036579511633, + "grad_norm": 0.56640625, + "learning_rate": 0.00034695170736390694, + "loss": 5.1943, + "step": 1036000 + }, + { + "epoch": 6.634236886741127, + "grad_norm": 0.52734375, + "learning_rate": 0.000346926104906071, + "loss": 5.1812, + "step": 1036500 + }, + { + "epoch": 6.637437193970621, + "grad_norm": 0.53515625, + "learning_rate": 0.00034690050244823507, + "loss": 5.1939, + "step": 1037000 + }, + { + "epoch": 6.640637501200115, + "grad_norm": 0.55078125, + "learning_rate": 0.0003468748999903991, + "loss": 5.1886, + "step": 1037500 + }, + { + "epoch": 6.643837808429609, + "grad_norm": 0.578125, + "learning_rate": 0.00034684929753256313, + "loss": 5.1897, + "step": 1038000 + }, + { + "epoch": 6.647038115659103, + "grad_norm": 0.52734375, + "learning_rate": 0.00034682369507472717, + "loss": 5.1931, + "step": 1038500 + }, + { + "epoch": 6.650238422888597, + "grad_norm": 0.53125, + "learning_rate": 0.0003467980926168912, + "loss": 5.1965, + "step": 1039000 + }, + { + "epoch": 6.653438730118092, + "grad_norm": 0.53125, + "learning_rate": 0.0003467724901590553, + "loss": 5.1879, + "step": 1039500 + }, + { + "epoch": 6.656639037347586, + "grad_norm": 0.58984375, + "learning_rate": 0.0003467468877012193, + "loss": 5.1892, + "step": 1040000 + }, + { + "epoch": 6.65983934457708, + "grad_norm": 0.5703125, + "learning_rate": 0.0003467212852433834, + "loss": 5.1931, + "step": 1040500 + }, + { + "epoch": 6.6630396518065735, + "grad_norm": 0.640625, + "learning_rate": 0.00034669568278554745, + "loss": 5.1952, + "step": 1041000 + }, + { + "epoch": 6.6662399590360675, + "grad_norm": 0.55078125, + "learning_rate": 0.0003466700803277115, + "loss": 5.1862, + "step": 1041500 + }, + { + "epoch": 6.669440266265561, + "grad_norm": 0.5078125, + "learning_rate": 0.0003466444778698755, + "loss": 5.1895, + "step": 1042000 + }, + { + "epoch": 6.672640573495055, + "grad_norm": 0.53515625, + "learning_rate": 0.0003466188754120396, + "loss": 5.1934, + "step": 1042500 + }, + { + "epoch": 6.675840880724549, + "grad_norm": 0.63671875, + "learning_rate": 0.00034659327295420364, + "loss": 5.1865, + "step": 1043000 + }, + { + "epoch": 6.679041187954043, + "grad_norm": 0.5390625, + "learning_rate": 0.0003465676704963677, + "loss": 5.1901, + "step": 1043500 + }, + { + "epoch": 6.682241495183538, + "grad_norm": 0.48046875, + "learning_rate": 0.0003465420680385317, + "loss": 5.1898, + "step": 1044000 + }, + { + "epoch": 6.685441802413032, + "grad_norm": 0.546875, + "learning_rate": 0.00034651646558069574, + "loss": 5.1869, + "step": 1044500 + }, + { + "epoch": 6.688642109642526, + "grad_norm": 0.53515625, + "learning_rate": 0.00034649086312285983, + "loss": 5.1884, + "step": 1045000 + }, + { + "epoch": 6.69184241687202, + "grad_norm": 0.53125, + "learning_rate": 0.00034646526066502386, + "loss": 5.1853, + "step": 1045500 + }, + { + "epoch": 6.695042724101514, + "grad_norm": 0.5390625, + "learning_rate": 0.0003464396582071879, + "loss": 5.1859, + "step": 1046000 + }, + { + "epoch": 6.698243031331008, + "grad_norm": 0.546875, + "learning_rate": 0.00034641405574935193, + "loss": 5.1969, + "step": 1046500 + }, + { + "epoch": 6.701443338560502, + "grad_norm": 0.59375, + "learning_rate": 0.00034638845329151597, + "loss": 5.1975, + "step": 1047000 + }, + { + "epoch": 6.704643645789996, + "grad_norm": 0.49609375, + "learning_rate": 0.00034636285083368006, + "loss": 5.1999, + "step": 1047500 + }, + { + "epoch": 6.7078439530194895, + "grad_norm": 0.5390625, + "learning_rate": 0.0003463372483758441, + "loss": 5.1876, + "step": 1048000 + }, + { + "epoch": 6.711044260248984, + "grad_norm": 0.50390625, + "learning_rate": 0.0003463116459180082, + "loss": 5.1959, + "step": 1048500 + }, + { + "epoch": 6.714244567478478, + "grad_norm": 0.5234375, + "learning_rate": 0.0003462860434601722, + "loss": 5.1904, + "step": 1049000 + }, + { + "epoch": 6.717444874707972, + "grad_norm": 0.55859375, + "learning_rate": 0.00034626044100233625, + "loss": 5.1928, + "step": 1049500 + }, + { + "epoch": 6.720645181937466, + "grad_norm": 0.6640625, + "learning_rate": 0.0003462348385445003, + "loss": 5.1965, + "step": 1050000 + }, + { + "epoch": 6.72384548916696, + "grad_norm": 0.5390625, + "learning_rate": 0.00034620923608666437, + "loss": 5.1949, + "step": 1050500 + }, + { + "epoch": 6.727045796396454, + "grad_norm": 0.61328125, + "learning_rate": 0.0003461836336288284, + "loss": 5.187, + "step": 1051000 + }, + { + "epoch": 6.730246103625948, + "grad_norm": 0.5078125, + "learning_rate": 0.00034615803117099244, + "loss": 5.1885, + "step": 1051500 + }, + { + "epoch": 6.733446410855442, + "grad_norm": 0.59375, + "learning_rate": 0.00034613242871315647, + "loss": 5.1862, + "step": 1052000 + }, + { + "epoch": 6.736646718084936, + "grad_norm": 0.54296875, + "learning_rate": 0.0003461068262553205, + "loss": 5.1924, + "step": 1052500 + }, + { + "epoch": 6.73984702531443, + "grad_norm": 0.48828125, + "learning_rate": 0.00034608122379748454, + "loss": 5.1912, + "step": 1053000 + }, + { + "epoch": 6.743047332543924, + "grad_norm": 0.59765625, + "learning_rate": 0.00034605562133964863, + "loss": 5.1894, + "step": 1053500 + }, + { + "epoch": 6.746247639773419, + "grad_norm": 0.59375, + "learning_rate": 0.00034603001888181266, + "loss": 5.1912, + "step": 1054000 + }, + { + "epoch": 6.7494479470029125, + "grad_norm": 0.5390625, + "learning_rate": 0.0003460044164239767, + "loss": 5.1874, + "step": 1054500 + }, + { + "epoch": 6.752648254232406, + "grad_norm": 0.52734375, + "learning_rate": 0.00034597881396614073, + "loss": 5.1947, + "step": 1055000 + }, + { + "epoch": 6.7558485614619, + "grad_norm": 0.498046875, + "learning_rate": 0.0003459532115083048, + "loss": 5.1855, + "step": 1055500 + }, + { + "epoch": 6.759048868691394, + "grad_norm": 0.56640625, + "learning_rate": 0.00034592760905046885, + "loss": 5.1935, + "step": 1056000 + }, + { + "epoch": 6.762249175920888, + "grad_norm": 0.55859375, + "learning_rate": 0.00034590200659263294, + "loss": 5.1905, + "step": 1056500 + }, + { + "epoch": 6.765449483150382, + "grad_norm": 0.5234375, + "learning_rate": 0.000345876404134797, + "loss": 5.1933, + "step": 1057000 + }, + { + "epoch": 6.768649790379876, + "grad_norm": 0.62890625, + "learning_rate": 0.000345850801676961, + "loss": 5.1927, + "step": 1057500 + }, + { + "epoch": 6.77185009760937, + "grad_norm": 0.5234375, + "learning_rate": 0.00034582519921912505, + "loss": 5.1847, + "step": 1058000 + }, + { + "epoch": 6.775050404838865, + "grad_norm": 0.5078125, + "learning_rate": 0.0003457995967612891, + "loss": 5.1947, + "step": 1058500 + }, + { + "epoch": 6.778250712068359, + "grad_norm": 0.57421875, + "learning_rate": 0.00034577399430345317, + "loss": 5.1914, + "step": 1059000 + }, + { + "epoch": 6.781451019297853, + "grad_norm": 0.50390625, + "learning_rate": 0.0003457483918456172, + "loss": 5.1856, + "step": 1059500 + }, + { + "epoch": 6.784651326527347, + "grad_norm": 0.53515625, + "learning_rate": 0.00034572278938778124, + "loss": 5.1888, + "step": 1060000 + }, + { + "epoch": 6.787851633756841, + "grad_norm": 0.5078125, + "learning_rate": 0.00034569718692994527, + "loss": 5.1896, + "step": 1060500 + }, + { + "epoch": 6.791051940986335, + "grad_norm": 0.494140625, + "learning_rate": 0.0003456715844721093, + "loss": 5.1867, + "step": 1061000 + }, + { + "epoch": 6.7942522482158285, + "grad_norm": 0.58203125, + "learning_rate": 0.0003456459820142734, + "loss": 5.1903, + "step": 1061500 + }, + { + "epoch": 6.7974525554453225, + "grad_norm": 0.53125, + "learning_rate": 0.00034562037955643743, + "loss": 5.1917, + "step": 1062000 + }, + { + "epoch": 6.800652862674816, + "grad_norm": 0.56640625, + "learning_rate": 0.00034559477709860146, + "loss": 5.1901, + "step": 1062500 + }, + { + "epoch": 6.803853169904311, + "grad_norm": 0.53125, + "learning_rate": 0.00034556917464076555, + "loss": 5.1923, + "step": 1063000 + }, + { + "epoch": 6.807053477133805, + "grad_norm": 0.62890625, + "learning_rate": 0.0003455435721829296, + "loss": 5.1872, + "step": 1063500 + }, + { + "epoch": 6.810253784363299, + "grad_norm": 0.55859375, + "learning_rate": 0.0003455179697250936, + "loss": 5.1963, + "step": 1064000 + }, + { + "epoch": 6.813454091592793, + "grad_norm": 0.51953125, + "learning_rate": 0.0003454923672672577, + "loss": 5.1925, + "step": 1064500 + }, + { + "epoch": 6.816654398822287, + "grad_norm": 0.5390625, + "learning_rate": 0.00034546676480942174, + "loss": 5.188, + "step": 1065000 + }, + { + "epoch": 6.819854706051781, + "grad_norm": 0.53125, + "learning_rate": 0.0003454411623515858, + "loss": 5.1882, + "step": 1065500 + }, + { + "epoch": 6.823055013281275, + "grad_norm": 0.58984375, + "learning_rate": 0.0003454155598937498, + "loss": 5.1874, + "step": 1066000 + }, + { + "epoch": 6.826255320510769, + "grad_norm": 0.494140625, + "learning_rate": 0.00034538995743591384, + "loss": 5.1914, + "step": 1066500 + }, + { + "epoch": 6.829455627740263, + "grad_norm": 0.55078125, + "learning_rate": 0.00034536435497807793, + "loss": 5.1978, + "step": 1067000 + }, + { + "epoch": 6.8326559349697575, + "grad_norm": 0.60546875, + "learning_rate": 0.00034533875252024197, + "loss": 5.1938, + "step": 1067500 + }, + { + "epoch": 6.8358562421992515, + "grad_norm": 0.53125, + "learning_rate": 0.000345313150062406, + "loss": 5.1847, + "step": 1068000 + }, + { + "epoch": 6.839056549428745, + "grad_norm": 0.5546875, + "learning_rate": 0.00034528754760457004, + "loss": 5.1907, + "step": 1068500 + }, + { + "epoch": 6.842256856658239, + "grad_norm": 0.578125, + "learning_rate": 0.00034526194514673407, + "loss": 5.1946, + "step": 1069000 + }, + { + "epoch": 6.845457163887733, + "grad_norm": 0.4765625, + "learning_rate": 0.0003452363426888981, + "loss": 5.1881, + "step": 1069500 + }, + { + "epoch": 6.848657471117227, + "grad_norm": 0.55078125, + "learning_rate": 0.0003452107402310622, + "loss": 5.1939, + "step": 1070000 + }, + { + "epoch": 6.851857778346721, + "grad_norm": 0.51953125, + "learning_rate": 0.00034518513777322623, + "loss": 5.1925, + "step": 1070500 + }, + { + "epoch": 6.855058085576215, + "grad_norm": 0.5546875, + "learning_rate": 0.0003451595353153903, + "loss": 5.1939, + "step": 1071000 + }, + { + "epoch": 6.858258392805709, + "grad_norm": 0.5078125, + "learning_rate": 0.00034513393285755435, + "loss": 5.1893, + "step": 1071500 + }, + { + "epoch": 6.861458700035204, + "grad_norm": 0.58984375, + "learning_rate": 0.0003451083303997184, + "loss": 5.1903, + "step": 1072000 + }, + { + "epoch": 6.864659007264698, + "grad_norm": 0.5625, + "learning_rate": 0.00034508272794188247, + "loss": 5.1848, + "step": 1072500 + }, + { + "epoch": 6.867859314494192, + "grad_norm": 0.515625, + "learning_rate": 0.0003450571254840465, + "loss": 5.1885, + "step": 1073000 + }, + { + "epoch": 6.871059621723686, + "grad_norm": 0.5703125, + "learning_rate": 0.00034503152302621054, + "loss": 5.1888, + "step": 1073500 + }, + { + "epoch": 6.87425992895318, + "grad_norm": 0.6171875, + "learning_rate": 0.0003450059205683746, + "loss": 5.1897, + "step": 1074000 + }, + { + "epoch": 6.8774602361826735, + "grad_norm": 0.5859375, + "learning_rate": 0.0003449803181105386, + "loss": 5.1896, + "step": 1074500 + }, + { + "epoch": 6.8806605434121675, + "grad_norm": 0.640625, + "learning_rate": 0.0003449547156527027, + "loss": 5.1949, + "step": 1075000 + }, + { + "epoch": 6.883860850641661, + "grad_norm": 0.58984375, + "learning_rate": 0.00034492911319486673, + "loss": 5.1969, + "step": 1075500 + }, + { + "epoch": 6.887061157871155, + "grad_norm": 0.54296875, + "learning_rate": 0.00034490351073703077, + "loss": 5.1871, + "step": 1076000 + }, + { + "epoch": 6.890261465100649, + "grad_norm": 0.546875, + "learning_rate": 0.0003448779082791948, + "loss": 5.1961, + "step": 1076500 + }, + { + "epoch": 6.893461772330143, + "grad_norm": 0.5859375, + "learning_rate": 0.00034485230582135884, + "loss": 5.1902, + "step": 1077000 + }, + { + "epoch": 6.896662079559638, + "grad_norm": 0.52734375, + "learning_rate": 0.00034482670336352287, + "loss": 5.1938, + "step": 1077500 + }, + { + "epoch": 6.899862386789132, + "grad_norm": 0.54296875, + "learning_rate": 0.00034480110090568696, + "loss": 5.1915, + "step": 1078000 + }, + { + "epoch": 6.903062694018626, + "grad_norm": 0.5, + "learning_rate": 0.00034477549844785105, + "loss": 5.1905, + "step": 1078500 + }, + { + "epoch": 6.90626300124812, + "grad_norm": 0.546875, + "learning_rate": 0.0003447498959900151, + "loss": 5.1938, + "step": 1079000 + }, + { + "epoch": 6.909463308477614, + "grad_norm": 0.53515625, + "learning_rate": 0.0003447242935321791, + "loss": 5.1896, + "step": 1079500 + }, + { + "epoch": 6.912663615707108, + "grad_norm": 0.50390625, + "learning_rate": 0.00034469869107434315, + "loss": 5.1903, + "step": 1080000 + }, + { + "epoch": 6.915863922936602, + "grad_norm": 0.57421875, + "learning_rate": 0.00034467308861650724, + "loss": 5.195, + "step": 1080500 + }, + { + "epoch": 6.919064230166096, + "grad_norm": 0.49609375, + "learning_rate": 0.00034464748615867127, + "loss": 5.1908, + "step": 1081000 + }, + { + "epoch": 6.92226453739559, + "grad_norm": 0.5625, + "learning_rate": 0.0003446218837008353, + "loss": 5.1953, + "step": 1081500 + }, + { + "epoch": 6.925464844625084, + "grad_norm": 0.5234375, + "learning_rate": 0.00034459628124299934, + "loss": 5.1934, + "step": 1082000 + }, + { + "epoch": 6.928665151854578, + "grad_norm": 0.546875, + "learning_rate": 0.0003445706787851634, + "loss": 5.1893, + "step": 1082500 + }, + { + "epoch": 6.931865459084072, + "grad_norm": 0.5859375, + "learning_rate": 0.0003445450763273274, + "loss": 5.1902, + "step": 1083000 + }, + { + "epoch": 6.935065766313566, + "grad_norm": 0.55859375, + "learning_rate": 0.0003445194738694915, + "loss": 5.1884, + "step": 1083500 + }, + { + "epoch": 6.93826607354306, + "grad_norm": 0.51953125, + "learning_rate": 0.00034449387141165553, + "loss": 5.1871, + "step": 1084000 + }, + { + "epoch": 6.941466380772554, + "grad_norm": 0.62890625, + "learning_rate": 0.00034446826895381957, + "loss": 5.1905, + "step": 1084500 + }, + { + "epoch": 6.944666688002048, + "grad_norm": 0.53125, + "learning_rate": 0.0003444426664959836, + "loss": 5.1862, + "step": 1085000 + }, + { + "epoch": 6.947866995231542, + "grad_norm": 0.546875, + "learning_rate": 0.0003444170640381477, + "loss": 5.1877, + "step": 1085500 + }, + { + "epoch": 6.951067302461036, + "grad_norm": 0.625, + "learning_rate": 0.0003443914615803117, + "loss": 5.1917, + "step": 1086000 + }, + { + "epoch": 6.954267609690531, + "grad_norm": 0.5390625, + "learning_rate": 0.0003443658591224758, + "loss": 5.1928, + "step": 1086500 + }, + { + "epoch": 6.957467916920025, + "grad_norm": 0.5390625, + "learning_rate": 0.00034434025666463985, + "loss": 5.1903, + "step": 1087000 + }, + { + "epoch": 6.960668224149519, + "grad_norm": 0.52734375, + "learning_rate": 0.0003443146542068039, + "loss": 5.1885, + "step": 1087500 + }, + { + "epoch": 6.9638685313790125, + "grad_norm": 0.546875, + "learning_rate": 0.0003442890517489679, + "loss": 5.1883, + "step": 1088000 + }, + { + "epoch": 6.9670688386085065, + "grad_norm": 0.5859375, + "learning_rate": 0.00034426344929113195, + "loss": 5.184, + "step": 1088500 + }, + { + "epoch": 6.970269145838, + "grad_norm": 0.57421875, + "learning_rate": 0.00034423784683329604, + "loss": 5.1899, + "step": 1089000 + }, + { + "epoch": 6.973469453067494, + "grad_norm": 0.59765625, + "learning_rate": 0.00034421224437546007, + "loss": 5.1946, + "step": 1089500 + }, + { + "epoch": 6.976669760296988, + "grad_norm": 0.5625, + "learning_rate": 0.0003441866419176241, + "loss": 5.1946, + "step": 1090000 + }, + { + "epoch": 6.979870067526482, + "grad_norm": 0.494140625, + "learning_rate": 0.00034416103945978814, + "loss": 5.1905, + "step": 1090500 + }, + { + "epoch": 6.983070374755977, + "grad_norm": 0.515625, + "learning_rate": 0.0003441354370019522, + "loss": 5.1917, + "step": 1091000 + }, + { + "epoch": 6.986270681985471, + "grad_norm": 0.52734375, + "learning_rate": 0.00034410983454411626, + "loss": 5.1895, + "step": 1091500 + }, + { + "epoch": 6.989470989214965, + "grad_norm": 0.52734375, + "learning_rate": 0.0003440842320862803, + "loss": 5.1894, + "step": 1092000 + }, + { + "epoch": 6.992671296444459, + "grad_norm": 0.58203125, + "learning_rate": 0.00034405862962844433, + "loss": 5.1908, + "step": 1092500 + }, + { + "epoch": 6.995871603673953, + "grad_norm": 0.578125, + "learning_rate": 0.00034403302717060836, + "loss": 5.1952, + "step": 1093000 + }, + { + "epoch": 6.999071910903447, + "grad_norm": 0.57421875, + "learning_rate": 0.00034400742471277245, + "loss": 5.1905, + "step": 1093500 + }, + { + "epoch": 7.0, + "eval_loss": 5.1858015060424805, + "eval_runtime": 1.1382, + "eval_samples_per_second": 878.603, + "eval_steps_per_second": 14.058, + "step": 1093645 + } + ], + "logging_steps": 500, + "max_steps": 7811750, + "num_input_tokens_seen": 0, + "num_train_epochs": 50, + "save_steps": 500, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 3, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.473295707163853e+19, + "train_batch_size": 64, + "trial_name": null, + "trial_params": null +}