diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..a87e3954609fae2a4b9bcab1ab6d7a527d91d943 Binary files /dev/null and b/.DS_Store differ diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70 --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "gaussian", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d4507be88293606900a542574f09ce87322b185b --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e566c61bed3b05cde1ebb3a325febfce0fafe3d3f3e648acb48bf8bd33cedb20 +size 5919456 diff --git a/checkpoints/.DS_Store b/checkpoints/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..319da2ad88eb2d840c5fe704eb9e32c055d29681 Binary files /dev/null and b/checkpoints/.DS_Store differ diff --git a/checkpoints/checkpoint-1000/.DS_Store b/checkpoints/checkpoint-1000/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 Binary files /dev/null and b/checkpoints/checkpoint-1000/.DS_Store differ diff --git a/checkpoints/checkpoint-1000/adapter_config.json b/checkpoints/checkpoint-1000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70 --- /dev/null +++ b/checkpoints/checkpoint-1000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "gaussian", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoints/checkpoint-1000/adapter_model.safetensors b/checkpoints/checkpoint-1000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ada62a9056d2bc3e2d2cbb277e551f55e6b4aa0b --- /dev/null +++ b/checkpoints/checkpoint-1000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7021d0f53b2af54bc0fb2b36f002603368f00cc5e37cf8ffa4d35e14e850cfc +size 5919456 diff --git a/checkpoints/checkpoint-1000/optimizer.pt b/checkpoints/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e68730afa32e5f46d317ce668b5fa018beb68976 --- /dev/null +++ b/checkpoints/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cae149eaee16a95e002bf921dbb3bd8869b631f62813911b3680edf18d7ff723 +size 11930938 diff --git a/checkpoints/checkpoint-1000/rng_state_0.pth b/checkpoints/checkpoint-1000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..63c03aea1b9f7d3ab56583f85946ad1c03bba717 --- /dev/null +++ b/checkpoints/checkpoint-1000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:287014c23e0c4e9613e974e969516cd0fc0bdce46daf1c6f1c0b66e3eb091e50 +size 15024 diff --git a/checkpoints/checkpoint-1000/rng_state_1.pth b/checkpoints/checkpoint-1000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..fced8d099bdfbcd4544fef78e3314059f786c5cd --- /dev/null +++ b/checkpoints/checkpoint-1000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ab87bd92d10c436f5b79e618401a8481e12e99083829692c2fc0de84edcad99 +size 15024 diff --git a/checkpoints/checkpoint-1000/rng_state_2.pth b/checkpoints/checkpoint-1000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..10c8b81e903b01c272edbdb09a6d11cdd8111bb9 --- /dev/null +++ b/checkpoints/checkpoint-1000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8553566dff78f7c07a5a3e5517a0a640c6ef80a0e95eb328fed4c566945f6fd0 +size 15024 diff --git a/checkpoints/checkpoint-1000/rng_state_3.pth b/checkpoints/checkpoint-1000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..0cc13eb6a1f6df01cb785380fb1717d312f38f54 --- /dev/null +++ b/checkpoints/checkpoint-1000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60c899ef713ad40446331266b18149afa90e55eed1210839243a923ea8aa772d +size 15024 diff --git a/checkpoints/checkpoint-1000/scheduler.pt b/checkpoints/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a0d6f7f9da279b71ea03802b24c083ad94591354 --- /dev/null +++ b/checkpoints/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feeecf82b24e0ddf3c8d8285a678fe39a1184c1c961cf677b5ac8d36409a9a05 +size 1064 diff --git a/checkpoints/checkpoint-1000/trainer_state.json b/checkpoints/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..390fc3ea2539619e97f2debda2245ee0140a82df --- /dev/null +++ b/checkpoints/checkpoint-1000/trainer_state.json @@ -0,0 +1,721 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1015950421619425, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001015950421619425, + "grad_norm": 17.625, + "learning_rate": 5e-06, + "loss": 3.4264, + "step": 10 + }, + { + "epoch": 0.00203190084323885, + "grad_norm": 12.5625, + "learning_rate": 1e-05, + "loss": 3.432, + "step": 20 + }, + { + "epoch": 0.003047851264858275, + "grad_norm": 14.0625, + "learning_rate": 1.5e-05, + "loss": 3.23, + "step": 30 + }, + { + "epoch": 0.0040638016864777, + "grad_norm": 12.4375, + "learning_rate": 2e-05, + "loss": 2.9762, + "step": 40 + }, + { + "epoch": 0.005079752108097125, + "grad_norm": 10.0625, + "learning_rate": 2.5e-05, + "loss": 2.6173, + "step": 50 + }, + { + "epoch": 0.00609570252971655, + "grad_norm": 10.1875, + "learning_rate": 3e-05, + "loss": 2.2004, + "step": 60 + }, + { + "epoch": 0.007111652951335975, + "grad_norm": 7.03125, + "learning_rate": 3.5e-05, + "loss": 1.4176, + "step": 70 + }, + { + "epoch": 0.0081276033729554, + "grad_norm": 4.375, + "learning_rate": 4e-05, + "loss": 1.0122, + "step": 80 + }, + { + "epoch": 0.009143553794574825, + "grad_norm": 6.5625, + "learning_rate": 4.5e-05, + "loss": 0.9116, + "step": 90 + }, + { + "epoch": 0.01015950421619425, + "grad_norm": 5.28125, + "learning_rate": 5e-05, + "loss": 0.6832, + "step": 100 + }, + { + "epoch": 0.011175454637813675, + "grad_norm": 5.5, + "learning_rate": 4.9999870035728426e-05, + "loss": 0.7355, + "step": 110 + }, + { + "epoch": 0.0121914050594331, + "grad_norm": 5.1875, + "learning_rate": 4.9999480144264944e-05, + "loss": 0.6673, + "step": 120 + }, + { + "epoch": 0.013207355481052525, + "grad_norm": 4.5, + "learning_rate": 4.9998830329663314e-05, + "loss": 0.6792, + "step": 130 + }, + { + "epoch": 0.01422330590267195, + "grad_norm": 3.9375, + "learning_rate": 4.9997920598679756e-05, + "loss": 0.6207, + "step": 140 + }, + { + "epoch": 0.015239256324291375, + "grad_norm": 3.15625, + "learning_rate": 4.999675096077286e-05, + "loss": 0.483, + "step": 150 + }, + { + "epoch": 0.0162552067459108, + "grad_norm": 5.28125, + "learning_rate": 4.999532142810354e-05, + "loss": 0.5319, + "step": 160 + }, + { + "epoch": 0.017271157167530225, + "grad_norm": 4.59375, + "learning_rate": 4.999363201553483e-05, + "loss": 0.6052, + "step": 170 + }, + { + "epoch": 0.01828710758914965, + "grad_norm": 5.03125, + "learning_rate": 4.9991682740631794e-05, + "loss": 0.4258, + "step": 180 + }, + { + "epoch": 0.019303058010769075, + "grad_norm": 3.859375, + "learning_rate": 4.998947362366133e-05, + "loss": 0.4309, + "step": 190 + }, + { + "epoch": 0.0203190084323885, + "grad_norm": 3.328125, + "learning_rate": 4.998700468759193e-05, + "loss": 0.3957, + "step": 200 + }, + { + "epoch": 0.021334958854007924, + "grad_norm": 4.9375, + "learning_rate": 4.9984275958093475e-05, + "loss": 0.4777, + "step": 210 + }, + { + "epoch": 0.02235090927562735, + "grad_norm": 4.78125, + "learning_rate": 4.998128746353695e-05, + "loss": 0.3549, + "step": 220 + }, + { + "epoch": 0.023366859697246774, + "grad_norm": 4.0625, + "learning_rate": 4.997803923499417e-05, + "loss": 0.4447, + "step": 230 + }, + { + "epoch": 0.0243828101188662, + "grad_norm": 6.375, + "learning_rate": 4.99745313062374e-05, + "loss": 0.3808, + "step": 240 + }, + { + "epoch": 0.025398760540485624, + "grad_norm": 3.59375, + "learning_rate": 4.99707637137391e-05, + "loss": 0.3827, + "step": 250 + }, + { + "epoch": 0.02641471096210505, + "grad_norm": 3.015625, + "learning_rate": 4.996673649667145e-05, + "loss": 0.3694, + "step": 260 + }, + { + "epoch": 0.027430661383724474, + "grad_norm": 2.296875, + "learning_rate": 4.9962449696906e-05, + "loss": 0.3586, + "step": 270 + }, + { + "epoch": 0.0284466118053439, + "grad_norm": 4.125, + "learning_rate": 4.9957903359013214e-05, + "loss": 0.3832, + "step": 280 + }, + { + "epoch": 0.029462562226963324, + "grad_norm": 3.296875, + "learning_rate": 4.995309753026201e-05, + "loss": 0.328, + "step": 290 + }, + { + "epoch": 0.03047851264858275, + "grad_norm": 4.5, + "learning_rate": 4.994803226061927e-05, + "loss": 0.3667, + "step": 300 + }, + { + "epoch": 0.03149446307020217, + "grad_norm": 4.3125, + "learning_rate": 4.994270760274933e-05, + "loss": 0.3811, + "step": 310 + }, + { + "epoch": 0.0325104134918216, + "grad_norm": 3.421875, + "learning_rate": 4.99371236120134e-05, + "loss": 0.3065, + "step": 320 + }, + { + "epoch": 0.03352636391344102, + "grad_norm": 4.6875, + "learning_rate": 4.993128034646902e-05, + "loss": 0.4177, + "step": 330 + }, + { + "epoch": 0.03454231433506045, + "grad_norm": 3.046875, + "learning_rate": 4.992517786686947e-05, + "loss": 0.33, + "step": 340 + }, + { + "epoch": 0.03555826475667987, + "grad_norm": 1.8828125, + "learning_rate": 4.9918816236663077e-05, + "loss": 0.3287, + "step": 350 + }, + { + "epoch": 0.0365742151782993, + "grad_norm": 3.8125, + "learning_rate": 4.991219552199262e-05, + "loss": 0.2934, + "step": 360 + }, + { + "epoch": 0.03759016559991872, + "grad_norm": 4.28125, + "learning_rate": 4.99053157916946e-05, + "loss": 0.3176, + "step": 370 + }, + { + "epoch": 0.03860611602153815, + "grad_norm": 2.609375, + "learning_rate": 4.989817711729856e-05, + "loss": 0.3318, + "step": 380 + }, + { + "epoch": 0.03962206644315757, + "grad_norm": 2.375, + "learning_rate": 4.98907795730263e-05, + "loss": 0.3234, + "step": 390 + }, + { + "epoch": 0.040638016864777, + "grad_norm": 4.46875, + "learning_rate": 4.988312323579114e-05, + "loss": 0.267, + "step": 400 + }, + { + "epoch": 0.04165396728639642, + "grad_norm": 3.75, + "learning_rate": 4.98752081851971e-05, + "loss": 0.3081, + "step": 410 + }, + { + "epoch": 0.04266991770801585, + "grad_norm": 2.203125, + "learning_rate": 4.986703450353809e-05, + "loss": 0.2917, + "step": 420 + }, + { + "epoch": 0.04368586812963527, + "grad_norm": 1.6015625, + "learning_rate": 4.985860227579703e-05, + "loss": 0.2805, + "step": 430 + }, + { + "epoch": 0.0447018185512547, + "grad_norm": 3.140625, + "learning_rate": 4.984991158964499e-05, + "loss": 0.3534, + "step": 440 + }, + { + "epoch": 0.04571776897287412, + "grad_norm": 3.296875, + "learning_rate": 4.9840962535440265e-05, + "loss": 0.335, + "step": 450 + }, + { + "epoch": 0.04673371939449355, + "grad_norm": 3.25, + "learning_rate": 4.983175520622744e-05, + "loss": 0.2544, + "step": 460 + }, + { + "epoch": 0.04774966981611297, + "grad_norm": 2.25, + "learning_rate": 4.982228969773642e-05, + "loss": 0.3449, + "step": 470 + }, + { + "epoch": 0.0487656202377324, + "grad_norm": 4.9375, + "learning_rate": 4.9812566108381435e-05, + "loss": 0.2964, + "step": 480 + }, + { + "epoch": 0.04978157065935182, + "grad_norm": 1.5703125, + "learning_rate": 4.9802584539260035e-05, + "loss": 0.2799, + "step": 490 + }, + { + "epoch": 0.05079752108097125, + "grad_norm": 2.828125, + "learning_rate": 4.979234509415199e-05, + "loss": 0.3231, + "step": 500 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 2.9375, + "learning_rate": 4.978184787951828e-05, + "loss": 0.2943, + "step": 510 + }, + { + "epoch": 0.0528294219242101, + "grad_norm": 2.34375, + "learning_rate": 4.977109300449992e-05, + "loss": 0.2705, + "step": 520 + }, + { + "epoch": 0.05384537234582952, + "grad_norm": 3.140625, + "learning_rate": 4.9760080580916876e-05, + "loss": 0.2998, + "step": 530 + }, + { + "epoch": 0.05486132276744895, + "grad_norm": 3.5625, + "learning_rate": 4.974881072326688e-05, + "loss": 0.2595, + "step": 540 + }, + { + "epoch": 0.05587727318906837, + "grad_norm": 4.25, + "learning_rate": 4.9737283548724236e-05, + "loss": 0.2803, + "step": 550 + }, + { + "epoch": 0.0568932236106878, + "grad_norm": 4.0625, + "learning_rate": 4.97254991771386e-05, + "loss": 0.3511, + "step": 560 + }, + { + "epoch": 0.05790917403230722, + "grad_norm": 2.515625, + "learning_rate": 4.971345773103377e-05, + "loss": 0.312, + "step": 570 + }, + { + "epoch": 0.05892512445392665, + "grad_norm": 3.21875, + "learning_rate": 4.9701159335606365e-05, + "loss": 0.2482, + "step": 580 + }, + { + "epoch": 0.05994107487554607, + "grad_norm": 5.5, + "learning_rate": 4.968860411872454e-05, + "loss": 0.2537, + "step": 590 + }, + { + "epoch": 0.0609570252971655, + "grad_norm": 3.546875, + "learning_rate": 4.967579221092666e-05, + "loss": 0.3125, + "step": 600 + }, + { + "epoch": 0.06197297571878492, + "grad_norm": 2.984375, + "learning_rate": 4.966272374541996e-05, + "loss": 0.2354, + "step": 610 + }, + { + "epoch": 0.06298892614040434, + "grad_norm": 3.6875, + "learning_rate": 4.964939885807912e-05, + "loss": 0.3213, + "step": 620 + }, + { + "epoch": 0.06400487656202378, + "grad_norm": 2.140625, + "learning_rate": 4.9635817687444876e-05, + "loss": 0.3003, + "step": 630 + }, + { + "epoch": 0.0650208269836432, + "grad_norm": 3.484375, + "learning_rate": 4.962198037472259e-05, + "loss": 0.2996, + "step": 640 + }, + { + "epoch": 0.06603677740526262, + "grad_norm": 3.21875, + "learning_rate": 4.9607887063780776e-05, + "loss": 0.2257, + "step": 650 + }, + { + "epoch": 0.06705272782688204, + "grad_norm": 5.375, + "learning_rate": 4.9593537901149564e-05, + "loss": 0.223, + "step": 660 + }, + { + "epoch": 0.06806867824850148, + "grad_norm": 4.1875, + "learning_rate": 4.957893303601924e-05, + "loss": 0.3407, + "step": 670 + }, + { + "epoch": 0.0690846286701209, + "grad_norm": 3.328125, + "learning_rate": 4.956407262023866e-05, + "loss": 0.2589, + "step": 680 + }, + { + "epoch": 0.07010057909174032, + "grad_norm": 2.953125, + "learning_rate": 4.954895680831367e-05, + "loss": 0.2949, + "step": 690 + }, + { + "epoch": 0.07111652951335974, + "grad_norm": 4.0625, + "learning_rate": 4.9533585757405506e-05, + "loss": 0.2995, + "step": 700 + }, + { + "epoch": 0.07213247993497918, + "grad_norm": 4.625, + "learning_rate": 4.951795962732917e-05, + "loss": 0.2894, + "step": 710 + }, + { + "epoch": 0.0731484303565986, + "grad_norm": 3.0, + "learning_rate": 4.9502078580551755e-05, + "loss": 0.3082, + "step": 720 + }, + { + "epoch": 0.07416438077821802, + "grad_norm": 3.65625, + "learning_rate": 4.9485942782190734e-05, + "loss": 0.2308, + "step": 730 + }, + { + "epoch": 0.07518033119983744, + "grad_norm": 4.78125, + "learning_rate": 4.9469552400012306e-05, + "loss": 0.2272, + "step": 740 + }, + { + "epoch": 0.07619628162145688, + "grad_norm": 4.25, + "learning_rate": 4.94529076044296e-05, + "loss": 0.2701, + "step": 750 + }, + { + "epoch": 0.0772122320430763, + "grad_norm": 3.140625, + "learning_rate": 4.94360085685009e-05, + "loss": 0.2686, + "step": 760 + }, + { + "epoch": 0.07822818246469572, + "grad_norm": 0.765625, + "learning_rate": 4.9418855467927894e-05, + "loss": 0.2051, + "step": 770 + }, + { + "epoch": 0.07924413288631514, + "grad_norm": 1.796875, + "learning_rate": 4.940144848105379e-05, + "loss": 0.2267, + "step": 780 + }, + { + "epoch": 0.08026008330793458, + "grad_norm": 4.5625, + "learning_rate": 4.93837877888615e-05, + "loss": 0.2597, + "step": 790 + }, + { + "epoch": 0.081276033729554, + "grad_norm": 3.03125, + "learning_rate": 4.9365873574971745e-05, + "loss": 0.3701, + "step": 800 + }, + { + "epoch": 0.08229198415117342, + "grad_norm": 4.5625, + "learning_rate": 4.9347706025641136e-05, + "loss": 0.2559, + "step": 810 + }, + { + "epoch": 0.08330793457279284, + "grad_norm": 3.90625, + "learning_rate": 4.9329285329760275e-05, + "loss": 0.2799, + "step": 820 + }, + { + "epoch": 0.08432388499441228, + "grad_norm": 3.140625, + "learning_rate": 4.9310611678851735e-05, + "loss": 0.2866, + "step": 830 + }, + { + "epoch": 0.0853398354160317, + "grad_norm": 2.46875, + "learning_rate": 4.929168526706811e-05, + "loss": 0.3105, + "step": 840 + }, + { + "epoch": 0.08635578583765112, + "grad_norm": 13.625, + "learning_rate": 4.927250629119e-05, + "loss": 0.2454, + "step": 850 + }, + { + "epoch": 0.08737173625927054, + "grad_norm": 3.921875, + "learning_rate": 4.9253074950623925e-05, + "loss": 0.2424, + "step": 860 + }, + { + "epoch": 0.08838768668088998, + "grad_norm": 2.90625, + "learning_rate": 4.9233391447400286e-05, + "loss": 0.2481, + "step": 870 + }, + { + "epoch": 0.0894036371025094, + "grad_norm": 2.96875, + "learning_rate": 4.921345598617125e-05, + "loss": 0.2231, + "step": 880 + }, + { + "epoch": 0.09041958752412882, + "grad_norm": 5.375, + "learning_rate": 4.9193268774208654e-05, + "loss": 0.3447, + "step": 890 + }, + { + "epoch": 0.09143553794574824, + "grad_norm": 2.0, + "learning_rate": 4.9172830021401785e-05, + "loss": 0.229, + "step": 900 + }, + { + "epoch": 0.09245148836736768, + "grad_norm": 3.1875, + "learning_rate": 4.9152139940255245e-05, + "loss": 0.2122, + "step": 910 + }, + { + "epoch": 0.0934674387889871, + "grad_norm": 3.40625, + "learning_rate": 4.913119874588677e-05, + "loss": 0.2386, + "step": 920 + }, + { + "epoch": 0.09448338921060652, + "grad_norm": 1.4609375, + "learning_rate": 4.911000665602489e-05, + "loss": 0.1944, + "step": 930 + }, + { + "epoch": 0.09549933963222594, + "grad_norm": 5.0625, + "learning_rate": 4.9088563891006786e-05, + "loss": 0.2038, + "step": 940 + }, + { + "epoch": 0.09651529005384538, + "grad_norm": 4.53125, + "learning_rate": 4.906687067377592e-05, + "loss": 0.3122, + "step": 950 + }, + { + "epoch": 0.0975312404754648, + "grad_norm": 2.84375, + "learning_rate": 4.904492722987976e-05, + "loss": 0.3157, + "step": 960 + }, + { + "epoch": 0.09854719089708422, + "grad_norm": 2.171875, + "learning_rate": 4.902273378746738e-05, + "loss": 0.3077, + "step": 970 + }, + { + "epoch": 0.09956314131870364, + "grad_norm": 2.84375, + "learning_rate": 4.9000290577287165e-05, + "loss": 0.2756, + "step": 980 + }, + { + "epoch": 0.10057909174032308, + "grad_norm": 0.99609375, + "learning_rate": 4.897759783268434e-05, + "loss": 0.2915, + "step": 990 + }, + { + "epoch": 0.1015950421619425, + "grad_norm": 3.53125, + "learning_rate": 4.895465578959859e-05, + "loss": 0.2052, + "step": 1000 + } + ], + "logging_steps": 10, + "max_steps": 9843, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoints/checkpoint-1000/training_args.bin b/checkpoints/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bb0a4ec10ab29ca1942aecdfa4212d352e373f1 --- /dev/null +++ b/checkpoints/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 +size 4984 diff --git a/checkpoints/checkpoint-1500/.DS_Store b/checkpoints/checkpoint-1500/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 Binary files /dev/null and b/checkpoints/checkpoint-1500/.DS_Store differ diff --git a/checkpoints/checkpoint-1500/adapter_config.json b/checkpoints/checkpoint-1500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70 --- /dev/null +++ b/checkpoints/checkpoint-1500/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "gaussian", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoints/checkpoint-1500/adapter_model.safetensors b/checkpoints/checkpoint-1500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..019ae5acc3c39c11dcf29cb8a850039b38bc7fa3 --- /dev/null +++ b/checkpoints/checkpoint-1500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0429f41a7679282d393919c7b40c076ed2bde721f9f95c9c14f1f57fa0b63f6 +size 5919456 diff --git a/checkpoints/checkpoint-1500/optimizer.pt b/checkpoints/checkpoint-1500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8ea20aac602d99eebb3facca793da7481fb829f --- /dev/null +++ b/checkpoints/checkpoint-1500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1cffc930b18a6507977b7928c70647d37429193d760799a24e4aa466c787fd3 +size 11930938 diff --git a/checkpoints/checkpoint-1500/rng_state_0.pth b/checkpoints/checkpoint-1500/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..dd23fb6362f7699ea3bcc22f01e5b102e65d2118 --- /dev/null +++ b/checkpoints/checkpoint-1500/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af2583d0490e9c0c8a5832ae63a5d486d9951078b0ec9594ee0125a1807e7528 +size 15024 diff --git a/checkpoints/checkpoint-1500/rng_state_1.pth b/checkpoints/checkpoint-1500/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d2851666bc50646272a2546dd1712a3eea0259bd --- /dev/null +++ b/checkpoints/checkpoint-1500/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af9fd6e5a8f29754daaa4f1a3c57f904108d748a165ca0e1ad16571d90e39fa3 +size 15024 diff --git a/checkpoints/checkpoint-1500/rng_state_2.pth b/checkpoints/checkpoint-1500/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..ac093b6104318e553a760be63eb06cd0d3f1ec17 --- /dev/null +++ b/checkpoints/checkpoint-1500/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd55da1784f3e2dba7244fb29d3bbc59fbefd6b4bb1357a4ded5822c60485304 +size 15024 diff --git a/checkpoints/checkpoint-1500/rng_state_3.pth b/checkpoints/checkpoint-1500/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..7ddbc69dcf0ed5d1b65deb88a8f1131eccc34e07 --- /dev/null +++ b/checkpoints/checkpoint-1500/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b10bab8fcd02c56978a924d6d0be691a36e321523009f8caa015318d52823f2 +size 15024 diff --git a/checkpoints/checkpoint-1500/scheduler.pt b/checkpoints/checkpoint-1500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..51876d3aaf0be1aae071cb0c017d51b61448d7ed --- /dev/null +++ b/checkpoints/checkpoint-1500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68a6cef91a2e578166a3e870fa8585312c20b665553a57976a77b1e7d2ca0ef3 +size 1064 diff --git a/checkpoints/checkpoint-1500/trainer_state.json b/checkpoints/checkpoint-1500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..228276cc158ac36aef26a0b87e0e57950412ba46 --- /dev/null +++ b/checkpoints/checkpoint-1500/trainer_state.json @@ -0,0 +1,1071 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.15239256324291375, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001015950421619425, + "grad_norm": 17.625, + "learning_rate": 5e-06, + "loss": 3.4264, + "step": 10 + }, + { + "epoch": 0.00203190084323885, + "grad_norm": 12.5625, + "learning_rate": 1e-05, + "loss": 3.432, + "step": 20 + }, + { + "epoch": 0.003047851264858275, + "grad_norm": 14.0625, + "learning_rate": 1.5e-05, + "loss": 3.23, + "step": 30 + }, + { + "epoch": 0.0040638016864777, + "grad_norm": 12.4375, + "learning_rate": 2e-05, + "loss": 2.9762, + "step": 40 + }, + { + "epoch": 0.005079752108097125, + "grad_norm": 10.0625, + "learning_rate": 2.5e-05, + "loss": 2.6173, + "step": 50 + }, + { + "epoch": 0.00609570252971655, + "grad_norm": 10.1875, + "learning_rate": 3e-05, + "loss": 2.2004, + "step": 60 + }, + { + "epoch": 0.007111652951335975, + "grad_norm": 7.03125, + "learning_rate": 3.5e-05, + "loss": 1.4176, + "step": 70 + }, + { + "epoch": 0.0081276033729554, + "grad_norm": 4.375, + "learning_rate": 4e-05, + "loss": 1.0122, + "step": 80 + }, + { + "epoch": 0.009143553794574825, + "grad_norm": 6.5625, + "learning_rate": 4.5e-05, + "loss": 0.9116, + "step": 90 + }, + { + "epoch": 0.01015950421619425, + "grad_norm": 5.28125, + "learning_rate": 5e-05, + "loss": 0.6832, + "step": 100 + }, + { + "epoch": 0.011175454637813675, + "grad_norm": 5.5, + "learning_rate": 4.9999870035728426e-05, + "loss": 0.7355, + "step": 110 + }, + { + "epoch": 0.0121914050594331, + "grad_norm": 5.1875, + "learning_rate": 4.9999480144264944e-05, + "loss": 0.6673, + "step": 120 + }, + { + "epoch": 0.013207355481052525, + "grad_norm": 4.5, + "learning_rate": 4.9998830329663314e-05, + "loss": 0.6792, + "step": 130 + }, + { + "epoch": 0.01422330590267195, + "grad_norm": 3.9375, + "learning_rate": 4.9997920598679756e-05, + "loss": 0.6207, + "step": 140 + }, + { + "epoch": 0.015239256324291375, + "grad_norm": 3.15625, + "learning_rate": 4.999675096077286e-05, + "loss": 0.483, + "step": 150 + }, + { + "epoch": 0.0162552067459108, + "grad_norm": 5.28125, + "learning_rate": 4.999532142810354e-05, + "loss": 0.5319, + "step": 160 + }, + { + "epoch": 0.017271157167530225, + "grad_norm": 4.59375, + "learning_rate": 4.999363201553483e-05, + "loss": 0.6052, + "step": 170 + }, + { + "epoch": 0.01828710758914965, + "grad_norm": 5.03125, + "learning_rate": 4.9991682740631794e-05, + "loss": 0.4258, + "step": 180 + }, + { + "epoch": 0.019303058010769075, + "grad_norm": 3.859375, + "learning_rate": 4.998947362366133e-05, + "loss": 0.4309, + "step": 190 + }, + { + "epoch": 0.0203190084323885, + "grad_norm": 3.328125, + "learning_rate": 4.998700468759193e-05, + "loss": 0.3957, + "step": 200 + }, + { + "epoch": 0.021334958854007924, + "grad_norm": 4.9375, + "learning_rate": 4.9984275958093475e-05, + "loss": 0.4777, + "step": 210 + }, + { + "epoch": 0.02235090927562735, + "grad_norm": 4.78125, + "learning_rate": 4.998128746353695e-05, + "loss": 0.3549, + "step": 220 + }, + { + "epoch": 0.023366859697246774, + "grad_norm": 4.0625, + "learning_rate": 4.997803923499417e-05, + "loss": 0.4447, + "step": 230 + }, + { + "epoch": 0.0243828101188662, + "grad_norm": 6.375, + "learning_rate": 4.99745313062374e-05, + "loss": 0.3808, + "step": 240 + }, + { + "epoch": 0.025398760540485624, + "grad_norm": 3.59375, + "learning_rate": 4.99707637137391e-05, + "loss": 0.3827, + "step": 250 + }, + { + "epoch": 0.02641471096210505, + "grad_norm": 3.015625, + "learning_rate": 4.996673649667145e-05, + "loss": 0.3694, + "step": 260 + }, + { + "epoch": 0.027430661383724474, + "grad_norm": 2.296875, + "learning_rate": 4.9962449696906e-05, + "loss": 0.3586, + "step": 270 + }, + { + "epoch": 0.0284466118053439, + "grad_norm": 4.125, + "learning_rate": 4.9957903359013214e-05, + "loss": 0.3832, + "step": 280 + }, + { + "epoch": 0.029462562226963324, + "grad_norm": 3.296875, + "learning_rate": 4.995309753026201e-05, + "loss": 0.328, + "step": 290 + }, + { + "epoch": 0.03047851264858275, + "grad_norm": 4.5, + "learning_rate": 4.994803226061927e-05, + "loss": 0.3667, + "step": 300 + }, + { + "epoch": 0.03149446307020217, + "grad_norm": 4.3125, + "learning_rate": 4.994270760274933e-05, + "loss": 0.3811, + "step": 310 + }, + { + "epoch": 0.0325104134918216, + "grad_norm": 3.421875, + "learning_rate": 4.99371236120134e-05, + "loss": 0.3065, + "step": 320 + }, + { + "epoch": 0.03352636391344102, + "grad_norm": 4.6875, + "learning_rate": 4.993128034646902e-05, + "loss": 0.4177, + "step": 330 + }, + { + "epoch": 0.03454231433506045, + "grad_norm": 3.046875, + "learning_rate": 4.992517786686947e-05, + "loss": 0.33, + "step": 340 + }, + { + "epoch": 0.03555826475667987, + "grad_norm": 1.8828125, + "learning_rate": 4.9918816236663077e-05, + "loss": 0.3287, + "step": 350 + }, + { + "epoch": 0.0365742151782993, + "grad_norm": 3.8125, + "learning_rate": 4.991219552199262e-05, + "loss": 0.2934, + "step": 360 + }, + { + "epoch": 0.03759016559991872, + "grad_norm": 4.28125, + "learning_rate": 4.99053157916946e-05, + "loss": 0.3176, + "step": 370 + }, + { + "epoch": 0.03860611602153815, + "grad_norm": 2.609375, + "learning_rate": 4.989817711729856e-05, + "loss": 0.3318, + "step": 380 + }, + { + "epoch": 0.03962206644315757, + "grad_norm": 2.375, + "learning_rate": 4.98907795730263e-05, + "loss": 0.3234, + "step": 390 + }, + { + "epoch": 0.040638016864777, + "grad_norm": 4.46875, + "learning_rate": 4.988312323579114e-05, + "loss": 0.267, + "step": 400 + }, + { + "epoch": 0.04165396728639642, + "grad_norm": 3.75, + "learning_rate": 4.98752081851971e-05, + "loss": 0.3081, + "step": 410 + }, + { + "epoch": 0.04266991770801585, + "grad_norm": 2.203125, + "learning_rate": 4.986703450353809e-05, + "loss": 0.2917, + "step": 420 + }, + { + "epoch": 0.04368586812963527, + "grad_norm": 1.6015625, + "learning_rate": 4.985860227579703e-05, + "loss": 0.2805, + "step": 430 + }, + { + "epoch": 0.0447018185512547, + "grad_norm": 3.140625, + "learning_rate": 4.984991158964499e-05, + "loss": 0.3534, + "step": 440 + }, + { + "epoch": 0.04571776897287412, + "grad_norm": 3.296875, + "learning_rate": 4.9840962535440265e-05, + "loss": 0.335, + "step": 450 + }, + { + "epoch": 0.04673371939449355, + "grad_norm": 3.25, + "learning_rate": 4.983175520622744e-05, + "loss": 0.2544, + "step": 460 + }, + { + "epoch": 0.04774966981611297, + "grad_norm": 2.25, + "learning_rate": 4.982228969773642e-05, + "loss": 0.3449, + "step": 470 + }, + { + "epoch": 0.0487656202377324, + "grad_norm": 4.9375, + "learning_rate": 4.9812566108381435e-05, + "loss": 0.2964, + "step": 480 + }, + { + "epoch": 0.04978157065935182, + "grad_norm": 1.5703125, + "learning_rate": 4.9802584539260035e-05, + "loss": 0.2799, + "step": 490 + }, + { + "epoch": 0.05079752108097125, + "grad_norm": 2.828125, + "learning_rate": 4.979234509415199e-05, + "loss": 0.3231, + "step": 500 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 2.9375, + "learning_rate": 4.978184787951828e-05, + "loss": 0.2943, + "step": 510 + }, + { + "epoch": 0.0528294219242101, + "grad_norm": 2.34375, + "learning_rate": 4.977109300449992e-05, + "loss": 0.2705, + "step": 520 + }, + { + "epoch": 0.05384537234582952, + "grad_norm": 3.140625, + "learning_rate": 4.9760080580916876e-05, + "loss": 0.2998, + "step": 530 + }, + { + "epoch": 0.05486132276744895, + "grad_norm": 3.5625, + "learning_rate": 4.974881072326688e-05, + "loss": 0.2595, + "step": 540 + }, + { + "epoch": 0.05587727318906837, + "grad_norm": 4.25, + "learning_rate": 4.9737283548724236e-05, + "loss": 0.2803, + "step": 550 + }, + { + "epoch": 0.0568932236106878, + "grad_norm": 4.0625, + "learning_rate": 4.97254991771386e-05, + "loss": 0.3511, + "step": 560 + }, + { + "epoch": 0.05790917403230722, + "grad_norm": 2.515625, + "learning_rate": 4.971345773103377e-05, + "loss": 0.312, + "step": 570 + }, + { + "epoch": 0.05892512445392665, + "grad_norm": 3.21875, + "learning_rate": 4.9701159335606365e-05, + "loss": 0.2482, + "step": 580 + }, + { + "epoch": 0.05994107487554607, + "grad_norm": 5.5, + "learning_rate": 4.968860411872454e-05, + "loss": 0.2537, + "step": 590 + }, + { + "epoch": 0.0609570252971655, + "grad_norm": 3.546875, + "learning_rate": 4.967579221092666e-05, + "loss": 0.3125, + "step": 600 + }, + { + "epoch": 0.06197297571878492, + "grad_norm": 2.984375, + "learning_rate": 4.966272374541996e-05, + "loss": 0.2354, + "step": 610 + }, + { + "epoch": 0.06298892614040434, + "grad_norm": 3.6875, + "learning_rate": 4.964939885807912e-05, + "loss": 0.3213, + "step": 620 + }, + { + "epoch": 0.06400487656202378, + "grad_norm": 2.140625, + "learning_rate": 4.9635817687444876e-05, + "loss": 0.3003, + "step": 630 + }, + { + "epoch": 0.0650208269836432, + "grad_norm": 3.484375, + "learning_rate": 4.962198037472259e-05, + "loss": 0.2996, + "step": 640 + }, + { + "epoch": 0.06603677740526262, + "grad_norm": 3.21875, + "learning_rate": 4.9607887063780776e-05, + "loss": 0.2257, + "step": 650 + }, + { + "epoch": 0.06705272782688204, + "grad_norm": 5.375, + "learning_rate": 4.9593537901149564e-05, + "loss": 0.223, + "step": 660 + }, + { + "epoch": 0.06806867824850148, + "grad_norm": 4.1875, + "learning_rate": 4.957893303601924e-05, + "loss": 0.3407, + "step": 670 + }, + { + "epoch": 0.0690846286701209, + "grad_norm": 3.328125, + "learning_rate": 4.956407262023866e-05, + "loss": 0.2589, + "step": 680 + }, + { + "epoch": 0.07010057909174032, + "grad_norm": 2.953125, + "learning_rate": 4.954895680831367e-05, + "loss": 0.2949, + "step": 690 + }, + { + "epoch": 0.07111652951335974, + "grad_norm": 4.0625, + "learning_rate": 4.9533585757405506e-05, + "loss": 0.2995, + "step": 700 + }, + { + "epoch": 0.07213247993497918, + "grad_norm": 4.625, + "learning_rate": 4.951795962732917e-05, + "loss": 0.2894, + "step": 710 + }, + { + "epoch": 0.0731484303565986, + "grad_norm": 3.0, + "learning_rate": 4.9502078580551755e-05, + "loss": 0.3082, + "step": 720 + }, + { + "epoch": 0.07416438077821802, + "grad_norm": 3.65625, + "learning_rate": 4.9485942782190734e-05, + "loss": 0.2308, + "step": 730 + }, + { + "epoch": 0.07518033119983744, + "grad_norm": 4.78125, + "learning_rate": 4.9469552400012306e-05, + "loss": 0.2272, + "step": 740 + }, + { + "epoch": 0.07619628162145688, + "grad_norm": 4.25, + "learning_rate": 4.94529076044296e-05, + "loss": 0.2701, + "step": 750 + }, + { + "epoch": 0.0772122320430763, + "grad_norm": 3.140625, + "learning_rate": 4.94360085685009e-05, + "loss": 0.2686, + "step": 760 + }, + { + "epoch": 0.07822818246469572, + "grad_norm": 0.765625, + "learning_rate": 4.9418855467927894e-05, + "loss": 0.2051, + "step": 770 + }, + { + "epoch": 0.07924413288631514, + "grad_norm": 1.796875, + "learning_rate": 4.940144848105379e-05, + "loss": 0.2267, + "step": 780 + }, + { + "epoch": 0.08026008330793458, + "grad_norm": 4.5625, + "learning_rate": 4.93837877888615e-05, + "loss": 0.2597, + "step": 790 + }, + { + "epoch": 0.081276033729554, + "grad_norm": 3.03125, + "learning_rate": 4.9365873574971745e-05, + "loss": 0.3701, + "step": 800 + }, + { + "epoch": 0.08229198415117342, + "grad_norm": 4.5625, + "learning_rate": 4.9347706025641136e-05, + "loss": 0.2559, + "step": 810 + }, + { + "epoch": 0.08330793457279284, + "grad_norm": 3.90625, + "learning_rate": 4.9329285329760275e-05, + "loss": 0.2799, + "step": 820 + }, + { + "epoch": 0.08432388499441228, + "grad_norm": 3.140625, + "learning_rate": 4.9310611678851735e-05, + "loss": 0.2866, + "step": 830 + }, + { + "epoch": 0.0853398354160317, + "grad_norm": 2.46875, + "learning_rate": 4.929168526706811e-05, + "loss": 0.3105, + "step": 840 + }, + { + "epoch": 0.08635578583765112, + "grad_norm": 13.625, + "learning_rate": 4.927250629119e-05, + "loss": 0.2454, + "step": 850 + }, + { + "epoch": 0.08737173625927054, + "grad_norm": 3.921875, + "learning_rate": 4.9253074950623925e-05, + "loss": 0.2424, + "step": 860 + }, + { + "epoch": 0.08838768668088998, + "grad_norm": 2.90625, + "learning_rate": 4.9233391447400286e-05, + "loss": 0.2481, + "step": 870 + }, + { + "epoch": 0.0894036371025094, + "grad_norm": 2.96875, + "learning_rate": 4.921345598617125e-05, + "loss": 0.2231, + "step": 880 + }, + { + "epoch": 0.09041958752412882, + "grad_norm": 5.375, + "learning_rate": 4.9193268774208654e-05, + "loss": 0.3447, + "step": 890 + }, + { + "epoch": 0.09143553794574824, + "grad_norm": 2.0, + "learning_rate": 4.9172830021401785e-05, + "loss": 0.229, + "step": 900 + }, + { + "epoch": 0.09245148836736768, + "grad_norm": 3.1875, + "learning_rate": 4.9152139940255245e-05, + "loss": 0.2122, + "step": 910 + }, + { + "epoch": 0.0934674387889871, + "grad_norm": 3.40625, + "learning_rate": 4.913119874588677e-05, + "loss": 0.2386, + "step": 920 + }, + { + "epoch": 0.09448338921060652, + "grad_norm": 1.4609375, + "learning_rate": 4.911000665602489e-05, + "loss": 0.1944, + "step": 930 + }, + { + "epoch": 0.09549933963222594, + "grad_norm": 5.0625, + "learning_rate": 4.9088563891006786e-05, + "loss": 0.2038, + "step": 940 + }, + { + "epoch": 0.09651529005384538, + "grad_norm": 4.53125, + "learning_rate": 4.906687067377592e-05, + "loss": 0.3122, + "step": 950 + }, + { + "epoch": 0.0975312404754648, + "grad_norm": 2.84375, + "learning_rate": 4.904492722987976e-05, + "loss": 0.3157, + "step": 960 + }, + { + "epoch": 0.09854719089708422, + "grad_norm": 2.171875, + "learning_rate": 4.902273378746738e-05, + "loss": 0.3077, + "step": 970 + }, + { + "epoch": 0.09956314131870364, + "grad_norm": 2.84375, + "learning_rate": 4.9000290577287165e-05, + "loss": 0.2756, + "step": 980 + }, + { + "epoch": 0.10057909174032308, + "grad_norm": 0.99609375, + "learning_rate": 4.897759783268434e-05, + "loss": 0.2915, + "step": 990 + }, + { + "epoch": 0.1015950421619425, + "grad_norm": 3.53125, + "learning_rate": 4.895465578959859e-05, + "loss": 0.2052, + "step": 1000 + }, + { + "epoch": 0.10261099258356192, + "grad_norm": 4.0, + "learning_rate": 4.893146468656159e-05, + "loss": 0.2499, + "step": 1010 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 1.65625, + "learning_rate": 4.890802476469452e-05, + "loss": 0.278, + "step": 1020 + }, + { + "epoch": 0.10464289342680078, + "grad_norm": 3.625, + "learning_rate": 4.888433626770558e-05, + "loss": 0.2143, + "step": 1030 + }, + { + "epoch": 0.1056588438484202, + "grad_norm": 5.0625, + "learning_rate": 4.886039944188741e-05, + "loss": 0.2878, + "step": 1040 + }, + { + "epoch": 0.10667479427003962, + "grad_norm": 4.5, + "learning_rate": 4.883621453611461e-05, + "loss": 0.2744, + "step": 1050 + }, + { + "epoch": 0.10769074469165904, + "grad_norm": 4.5625, + "learning_rate": 4.881178180184106e-05, + "loss": 0.2734, + "step": 1060 + }, + { + "epoch": 0.10870669511327848, + "grad_norm": 3.125, + "learning_rate": 4.878710149309735e-05, + "loss": 0.3574, + "step": 1070 + }, + { + "epoch": 0.1097226455348979, + "grad_norm": 3.0625, + "learning_rate": 4.876217386648816e-05, + "loss": 0.2625, + "step": 1080 + }, + { + "epoch": 0.11073859595651732, + "grad_norm": 4.0625, + "learning_rate": 4.873699918118955e-05, + "loss": 0.2437, + "step": 1090 + }, + { + "epoch": 0.11175454637813674, + "grad_norm": 1.59375, + "learning_rate": 4.87115776989463e-05, + "loss": 0.2051, + "step": 1100 + }, + { + "epoch": 0.11277049679975618, + "grad_norm": 4.375, + "learning_rate": 4.8685909684069153e-05, + "loss": 0.1727, + "step": 1110 + }, + { + "epoch": 0.1137864472213756, + "grad_norm": 2.28125, + "learning_rate": 4.865999540343211e-05, + "loss": 0.2256, + "step": 1120 + }, + { + "epoch": 0.11480239764299502, + "grad_norm": 2.265625, + "learning_rate": 4.86338351264696e-05, + "loss": 0.3529, + "step": 1130 + }, + { + "epoch": 0.11581834806461444, + "grad_norm": 2.34375, + "learning_rate": 4.8607429125173754e-05, + "loss": 0.2113, + "step": 1140 + }, + { + "epoch": 0.11683429848623388, + "grad_norm": 0.7578125, + "learning_rate": 4.858077767409149e-05, + "loss": 0.2759, + "step": 1150 + }, + { + "epoch": 0.1178502489078533, + "grad_norm": 3.640625, + "learning_rate": 4.855388105032174e-05, + "loss": 0.2482, + "step": 1160 + }, + { + "epoch": 0.11886619932947272, + "grad_norm": 3.5, + "learning_rate": 4.852673953351249e-05, + "loss": 0.1865, + "step": 1170 + }, + { + "epoch": 0.11988214975109214, + "grad_norm": 3.75, + "learning_rate": 4.849935340585796e-05, + "loss": 0.2659, + "step": 1180 + }, + { + "epoch": 0.12089810017271158, + "grad_norm": 3.375, + "learning_rate": 4.8471722952095586e-05, + "loss": 0.1506, + "step": 1190 + }, + { + "epoch": 0.121914050594331, + "grad_norm": 3.34375, + "learning_rate": 4.844384845950312e-05, + "loss": 0.307, + "step": 1200 + }, + { + "epoch": 0.12293000101595042, + "grad_norm": 1.578125, + "learning_rate": 4.841573021789561e-05, + "loss": 0.1952, + "step": 1210 + }, + { + "epoch": 0.12394595143756984, + "grad_norm": 1.2890625, + "learning_rate": 4.838736851962239e-05, + "loss": 0.1779, + "step": 1220 + }, + { + "epoch": 0.12496190185918928, + "grad_norm": 1.265625, + "learning_rate": 4.835876365956408e-05, + "loss": 0.1235, + "step": 1230 + }, + { + "epoch": 0.12597785228080868, + "grad_norm": 1.9609375, + "learning_rate": 4.8329915935129436e-05, + "loss": 0.1876, + "step": 1240 + }, + { + "epoch": 0.12699380270242813, + "grad_norm": 1.6328125, + "learning_rate": 4.830082564625235e-05, + "loss": 0.2188, + "step": 1250 + }, + { + "epoch": 0.12800975312404755, + "grad_norm": 3.96875, + "learning_rate": 4.8271493095388684e-05, + "loss": 0.2622, + "step": 1260 + }, + { + "epoch": 0.12902570354566698, + "grad_norm": 3.765625, + "learning_rate": 4.824191858751312e-05, + "loss": 0.2724, + "step": 1270 + }, + { + "epoch": 0.1300416539672864, + "grad_norm": 5.59375, + "learning_rate": 4.821210243011601e-05, + "loss": 0.2413, + "step": 1280 + }, + { + "epoch": 0.13105760438890582, + "grad_norm": 3.34375, + "learning_rate": 4.818204493320016e-05, + "loss": 0.2618, + "step": 1290 + }, + { + "epoch": 0.13207355481052524, + "grad_norm": 2.78125, + "learning_rate": 4.8151746409277634e-05, + "loss": 0.2295, + "step": 1300 + }, + { + "epoch": 0.13308950523214466, + "grad_norm": 3.1875, + "learning_rate": 4.8121207173366484e-05, + "loss": 0.2733, + "step": 1310 + }, + { + "epoch": 0.13410545565376408, + "grad_norm": 2.28125, + "learning_rate": 4.809042754298746e-05, + "loss": 0.2311, + "step": 1320 + }, + { + "epoch": 0.13512140607538353, + "grad_norm": 2.171875, + "learning_rate": 4.805940783816075e-05, + "loss": 0.2059, + "step": 1330 + }, + { + "epoch": 0.13613735649700295, + "grad_norm": 2.796875, + "learning_rate": 4.8028148381402625e-05, + "loss": 0.2102, + "step": 1340 + }, + { + "epoch": 0.13715330691862238, + "grad_norm": 2.96875, + "learning_rate": 4.7996649497722084e-05, + "loss": 0.2708, + "step": 1350 + }, + { + "epoch": 0.1381692573402418, + "grad_norm": 2.4375, + "learning_rate": 4.7964911514617485e-05, + "loss": 0.2429, + "step": 1360 + }, + { + "epoch": 0.13918520776186122, + "grad_norm": 5.8125, + "learning_rate": 4.793293476207312e-05, + "loss": 0.2725, + "step": 1370 + }, + { + "epoch": 0.14020115818348064, + "grad_norm": 2.40625, + "learning_rate": 4.790071957255585e-05, + "loss": 0.2098, + "step": 1380 + }, + { + "epoch": 0.14121710860510006, + "grad_norm": 4.25, + "learning_rate": 4.786826628101154e-05, + "loss": 0.2101, + "step": 1390 + }, + { + "epoch": 0.14223305902671948, + "grad_norm": 2.578125, + "learning_rate": 4.783557522486167e-05, + "loss": 0.2624, + "step": 1400 + }, + { + "epoch": 0.14324900944833893, + "grad_norm": 3.125, + "learning_rate": 4.780264674399978e-05, + "loss": 0.2518, + "step": 1410 + }, + { + "epoch": 0.14426495986995835, + "grad_norm": 3.671875, + "learning_rate": 4.7769481180787966e-05, + "loss": 0.3112, + "step": 1420 + }, + { + "epoch": 0.14528091029157778, + "grad_norm": 3.984375, + "learning_rate": 4.773607888005327e-05, + "loss": 0.2747, + "step": 1430 + }, + { + "epoch": 0.1462968607131972, + "grad_norm": 3.234375, + "learning_rate": 4.770244018908416e-05, + "loss": 0.1572, + "step": 1440 + }, + { + "epoch": 0.14731281113481662, + "grad_norm": 4.09375, + "learning_rate": 4.766856545762687e-05, + "loss": 0.2148, + "step": 1450 + }, + { + "epoch": 0.14832876155643604, + "grad_norm": 1.6875, + "learning_rate": 4.763445503788178e-05, + "loss": 0.2531, + "step": 1460 + }, + { + "epoch": 0.14934471197805546, + "grad_norm": 2.375, + "learning_rate": 4.760010928449976e-05, + "loss": 0.199, + "step": 1470 + }, + { + "epoch": 0.15036066239967488, + "grad_norm": 4.6875, + "learning_rate": 4.7565528554578485e-05, + "loss": 0.2366, + "step": 1480 + }, + { + "epoch": 0.15137661282129433, + "grad_norm": 5.4375, + "learning_rate": 4.75307132076587e-05, + "loss": 0.1862, + "step": 1490 + }, + { + "epoch": 0.15239256324291375, + "grad_norm": 2.484375, + "learning_rate": 4.749566360572049e-05, + "loss": 0.2143, + "step": 1500 + } + ], + "logging_steps": 10, + "max_steps": 9843, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoints/checkpoint-1500/training_args.bin b/checkpoints/checkpoint-1500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bb0a4ec10ab29ca1942aecdfa4212d352e373f1 --- /dev/null +++ b/checkpoints/checkpoint-1500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 +size 4984 diff --git a/checkpoints/checkpoint-2000/.DS_Store b/checkpoints/checkpoint-2000/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 Binary files /dev/null and b/checkpoints/checkpoint-2000/.DS_Store differ diff --git a/checkpoints/checkpoint-2000/adapter_config.json b/checkpoints/checkpoint-2000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70 --- /dev/null +++ b/checkpoints/checkpoint-2000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "gaussian", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoints/checkpoint-2000/adapter_model.safetensors b/checkpoints/checkpoint-2000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4c42278d439b28a8eb84d8f3d7b93d55df223451 --- /dev/null +++ b/checkpoints/checkpoint-2000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f511e364452e06b28c03299a544dc2d5fd730fdfd45007cd4ec530e30144139 +size 5919456 diff --git a/checkpoints/checkpoint-2000/optimizer.pt b/checkpoints/checkpoint-2000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..27718bc4fc2e46b271db6199e877b9ca8ddec162 --- /dev/null +++ b/checkpoints/checkpoint-2000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4121e055d593f91209d1cfca077444bb962a8848054f778e04d22d7e269a91e3 +size 11930938 diff --git a/checkpoints/checkpoint-2000/rng_state_0.pth b/checkpoints/checkpoint-2000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..3b1b550785550980db2dab7c4db776074d507397 --- /dev/null +++ b/checkpoints/checkpoint-2000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7814979e8476866bd135a55e64d375b840c3c1436aa60c3d69ece3f0a10c3408 +size 15024 diff --git a/checkpoints/checkpoint-2000/rng_state_1.pth b/checkpoints/checkpoint-2000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7a3ed4fbc5afe1f89fd45284b352bf27deedb8ea --- /dev/null +++ b/checkpoints/checkpoint-2000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4b8a9f5b33e3bc1f4b31217176bbabc65ace6c56a7bf77b1b7153dc062ba709 +size 15024 diff --git a/checkpoints/checkpoint-2000/rng_state_2.pth b/checkpoints/checkpoint-2000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6f9d319e7d0f09d65d040354c5e0320b59dec0f0 --- /dev/null +++ b/checkpoints/checkpoint-2000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70445c98497e5f1d7bea8f44a93bc0211d0226177b834ac2949fe24e3b538d05 +size 15024 diff --git a/checkpoints/checkpoint-2000/rng_state_3.pth b/checkpoints/checkpoint-2000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..c23f68acc06264aeb9b09bf60269640ce8b147c8 --- /dev/null +++ b/checkpoints/checkpoint-2000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c244dce3fafef8abbf503718fb81319ff12831928a8845134b78a845e0c6e14 +size 15024 diff --git a/checkpoints/checkpoint-2000/scheduler.pt b/checkpoints/checkpoint-2000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c133553a2b6f7143f327ed61a33a83fcc4a43b74 --- /dev/null +++ b/checkpoints/checkpoint-2000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9cf822c73733a63e3bebbb2802abc0acedf3d9423d9f25c6bb103f490864c06 +size 1064 diff --git a/checkpoints/checkpoint-2000/trainer_state.json b/checkpoints/checkpoint-2000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e7a9301a9ac4e7f3efde74d978b93fbd74160b85 --- /dev/null +++ b/checkpoints/checkpoint-2000/trainer_state.json @@ -0,0 +1,1421 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.203190084323885, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001015950421619425, + "grad_norm": 17.625, + "learning_rate": 5e-06, + "loss": 3.4264, + "step": 10 + }, + { + "epoch": 0.00203190084323885, + "grad_norm": 12.5625, + "learning_rate": 1e-05, + "loss": 3.432, + "step": 20 + }, + { + "epoch": 0.003047851264858275, + "grad_norm": 14.0625, + "learning_rate": 1.5e-05, + "loss": 3.23, + "step": 30 + }, + { + "epoch": 0.0040638016864777, + "grad_norm": 12.4375, + "learning_rate": 2e-05, + "loss": 2.9762, + "step": 40 + }, + { + "epoch": 0.005079752108097125, + "grad_norm": 10.0625, + "learning_rate": 2.5e-05, + "loss": 2.6173, + "step": 50 + }, + { + "epoch": 0.00609570252971655, + "grad_norm": 10.1875, + "learning_rate": 3e-05, + "loss": 2.2004, + "step": 60 + }, + { + "epoch": 0.007111652951335975, + "grad_norm": 7.03125, + "learning_rate": 3.5e-05, + "loss": 1.4176, + "step": 70 + }, + { + "epoch": 0.0081276033729554, + "grad_norm": 4.375, + "learning_rate": 4e-05, + "loss": 1.0122, + "step": 80 + }, + { + "epoch": 0.009143553794574825, + "grad_norm": 6.5625, + "learning_rate": 4.5e-05, + "loss": 0.9116, + "step": 90 + }, + { + "epoch": 0.01015950421619425, + "grad_norm": 5.28125, + "learning_rate": 5e-05, + "loss": 0.6832, + "step": 100 + }, + { + "epoch": 0.011175454637813675, + "grad_norm": 5.5, + "learning_rate": 4.9999870035728426e-05, + "loss": 0.7355, + "step": 110 + }, + { + "epoch": 0.0121914050594331, + "grad_norm": 5.1875, + "learning_rate": 4.9999480144264944e-05, + "loss": 0.6673, + "step": 120 + }, + { + "epoch": 0.013207355481052525, + "grad_norm": 4.5, + "learning_rate": 4.9998830329663314e-05, + "loss": 0.6792, + "step": 130 + }, + { + "epoch": 0.01422330590267195, + "grad_norm": 3.9375, + "learning_rate": 4.9997920598679756e-05, + "loss": 0.6207, + "step": 140 + }, + { + "epoch": 0.015239256324291375, + "grad_norm": 3.15625, + "learning_rate": 4.999675096077286e-05, + "loss": 0.483, + "step": 150 + }, + { + "epoch": 0.0162552067459108, + "grad_norm": 5.28125, + "learning_rate": 4.999532142810354e-05, + "loss": 0.5319, + "step": 160 + }, + { + "epoch": 0.017271157167530225, + "grad_norm": 4.59375, + "learning_rate": 4.999363201553483e-05, + "loss": 0.6052, + "step": 170 + }, + { + "epoch": 0.01828710758914965, + "grad_norm": 5.03125, + "learning_rate": 4.9991682740631794e-05, + "loss": 0.4258, + "step": 180 + }, + { + "epoch": 0.019303058010769075, + "grad_norm": 3.859375, + "learning_rate": 4.998947362366133e-05, + "loss": 0.4309, + "step": 190 + }, + { + "epoch": 0.0203190084323885, + "grad_norm": 3.328125, + "learning_rate": 4.998700468759193e-05, + "loss": 0.3957, + "step": 200 + }, + { + "epoch": 0.021334958854007924, + "grad_norm": 4.9375, + "learning_rate": 4.9984275958093475e-05, + "loss": 0.4777, + "step": 210 + }, + { + "epoch": 0.02235090927562735, + "grad_norm": 4.78125, + "learning_rate": 4.998128746353695e-05, + "loss": 0.3549, + "step": 220 + }, + { + "epoch": 0.023366859697246774, + "grad_norm": 4.0625, + "learning_rate": 4.997803923499417e-05, + "loss": 0.4447, + "step": 230 + }, + { + "epoch": 0.0243828101188662, + "grad_norm": 6.375, + "learning_rate": 4.99745313062374e-05, + "loss": 0.3808, + "step": 240 + }, + { + "epoch": 0.025398760540485624, + "grad_norm": 3.59375, + "learning_rate": 4.99707637137391e-05, + "loss": 0.3827, + "step": 250 + }, + { + "epoch": 0.02641471096210505, + "grad_norm": 3.015625, + "learning_rate": 4.996673649667145e-05, + "loss": 0.3694, + "step": 260 + }, + { + "epoch": 0.027430661383724474, + "grad_norm": 2.296875, + "learning_rate": 4.9962449696906e-05, + "loss": 0.3586, + "step": 270 + }, + { + "epoch": 0.0284466118053439, + "grad_norm": 4.125, + "learning_rate": 4.9957903359013214e-05, + "loss": 0.3832, + "step": 280 + }, + { + "epoch": 0.029462562226963324, + "grad_norm": 3.296875, + "learning_rate": 4.995309753026201e-05, + "loss": 0.328, + "step": 290 + }, + { + "epoch": 0.03047851264858275, + "grad_norm": 4.5, + "learning_rate": 4.994803226061927e-05, + "loss": 0.3667, + "step": 300 + }, + { + "epoch": 0.03149446307020217, + "grad_norm": 4.3125, + "learning_rate": 4.994270760274933e-05, + "loss": 0.3811, + "step": 310 + }, + { + "epoch": 0.0325104134918216, + "grad_norm": 3.421875, + "learning_rate": 4.99371236120134e-05, + "loss": 0.3065, + "step": 320 + }, + { + "epoch": 0.03352636391344102, + "grad_norm": 4.6875, + "learning_rate": 4.993128034646902e-05, + "loss": 0.4177, + "step": 330 + }, + { + "epoch": 0.03454231433506045, + "grad_norm": 3.046875, + "learning_rate": 4.992517786686947e-05, + "loss": 0.33, + "step": 340 + }, + { + "epoch": 0.03555826475667987, + "grad_norm": 1.8828125, + "learning_rate": 4.9918816236663077e-05, + "loss": 0.3287, + "step": 350 + }, + { + "epoch": 0.0365742151782993, + "grad_norm": 3.8125, + "learning_rate": 4.991219552199262e-05, + "loss": 0.2934, + "step": 360 + }, + { + "epoch": 0.03759016559991872, + "grad_norm": 4.28125, + "learning_rate": 4.99053157916946e-05, + "loss": 0.3176, + "step": 370 + }, + { + "epoch": 0.03860611602153815, + "grad_norm": 2.609375, + "learning_rate": 4.989817711729856e-05, + "loss": 0.3318, + "step": 380 + }, + { + "epoch": 0.03962206644315757, + "grad_norm": 2.375, + "learning_rate": 4.98907795730263e-05, + "loss": 0.3234, + "step": 390 + }, + { + "epoch": 0.040638016864777, + "grad_norm": 4.46875, + "learning_rate": 4.988312323579114e-05, + "loss": 0.267, + "step": 400 + }, + { + "epoch": 0.04165396728639642, + "grad_norm": 3.75, + "learning_rate": 4.98752081851971e-05, + "loss": 0.3081, + "step": 410 + }, + { + "epoch": 0.04266991770801585, + "grad_norm": 2.203125, + "learning_rate": 4.986703450353809e-05, + "loss": 0.2917, + "step": 420 + }, + { + "epoch": 0.04368586812963527, + "grad_norm": 1.6015625, + "learning_rate": 4.985860227579703e-05, + "loss": 0.2805, + "step": 430 + }, + { + "epoch": 0.0447018185512547, + "grad_norm": 3.140625, + "learning_rate": 4.984991158964499e-05, + "loss": 0.3534, + "step": 440 + }, + { + "epoch": 0.04571776897287412, + "grad_norm": 3.296875, + "learning_rate": 4.9840962535440265e-05, + "loss": 0.335, + "step": 450 + }, + { + "epoch": 0.04673371939449355, + "grad_norm": 3.25, + "learning_rate": 4.983175520622744e-05, + "loss": 0.2544, + "step": 460 + }, + { + "epoch": 0.04774966981611297, + "grad_norm": 2.25, + "learning_rate": 4.982228969773642e-05, + "loss": 0.3449, + "step": 470 + }, + { + "epoch": 0.0487656202377324, + "grad_norm": 4.9375, + "learning_rate": 4.9812566108381435e-05, + "loss": 0.2964, + "step": 480 + }, + { + "epoch": 0.04978157065935182, + "grad_norm": 1.5703125, + "learning_rate": 4.9802584539260035e-05, + "loss": 0.2799, + "step": 490 + }, + { + "epoch": 0.05079752108097125, + "grad_norm": 2.828125, + "learning_rate": 4.979234509415199e-05, + "loss": 0.3231, + "step": 500 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 2.9375, + "learning_rate": 4.978184787951828e-05, + "loss": 0.2943, + "step": 510 + }, + { + "epoch": 0.0528294219242101, + "grad_norm": 2.34375, + "learning_rate": 4.977109300449992e-05, + "loss": 0.2705, + "step": 520 + }, + { + "epoch": 0.05384537234582952, + "grad_norm": 3.140625, + "learning_rate": 4.9760080580916876e-05, + "loss": 0.2998, + "step": 530 + }, + { + "epoch": 0.05486132276744895, + "grad_norm": 3.5625, + "learning_rate": 4.974881072326688e-05, + "loss": 0.2595, + "step": 540 + }, + { + "epoch": 0.05587727318906837, + "grad_norm": 4.25, + "learning_rate": 4.9737283548724236e-05, + "loss": 0.2803, + "step": 550 + }, + { + "epoch": 0.0568932236106878, + "grad_norm": 4.0625, + "learning_rate": 4.97254991771386e-05, + "loss": 0.3511, + "step": 560 + }, + { + "epoch": 0.05790917403230722, + "grad_norm": 2.515625, + "learning_rate": 4.971345773103377e-05, + "loss": 0.312, + "step": 570 + }, + { + "epoch": 0.05892512445392665, + "grad_norm": 3.21875, + "learning_rate": 4.9701159335606365e-05, + "loss": 0.2482, + "step": 580 + }, + { + "epoch": 0.05994107487554607, + "grad_norm": 5.5, + "learning_rate": 4.968860411872454e-05, + "loss": 0.2537, + "step": 590 + }, + { + "epoch": 0.0609570252971655, + "grad_norm": 3.546875, + "learning_rate": 4.967579221092666e-05, + "loss": 0.3125, + "step": 600 + }, + { + "epoch": 0.06197297571878492, + "grad_norm": 2.984375, + "learning_rate": 4.966272374541996e-05, + "loss": 0.2354, + "step": 610 + }, + { + "epoch": 0.06298892614040434, + "grad_norm": 3.6875, + "learning_rate": 4.964939885807912e-05, + "loss": 0.3213, + "step": 620 + }, + { + "epoch": 0.06400487656202378, + "grad_norm": 2.140625, + "learning_rate": 4.9635817687444876e-05, + "loss": 0.3003, + "step": 630 + }, + { + "epoch": 0.0650208269836432, + "grad_norm": 3.484375, + "learning_rate": 4.962198037472259e-05, + "loss": 0.2996, + "step": 640 + }, + { + "epoch": 0.06603677740526262, + "grad_norm": 3.21875, + "learning_rate": 4.9607887063780776e-05, + "loss": 0.2257, + "step": 650 + }, + { + "epoch": 0.06705272782688204, + "grad_norm": 5.375, + "learning_rate": 4.9593537901149564e-05, + "loss": 0.223, + "step": 660 + }, + { + "epoch": 0.06806867824850148, + "grad_norm": 4.1875, + "learning_rate": 4.957893303601924e-05, + "loss": 0.3407, + "step": 670 + }, + { + "epoch": 0.0690846286701209, + "grad_norm": 3.328125, + "learning_rate": 4.956407262023866e-05, + "loss": 0.2589, + "step": 680 + }, + { + "epoch": 0.07010057909174032, + "grad_norm": 2.953125, + "learning_rate": 4.954895680831367e-05, + "loss": 0.2949, + "step": 690 + }, + { + "epoch": 0.07111652951335974, + "grad_norm": 4.0625, + "learning_rate": 4.9533585757405506e-05, + "loss": 0.2995, + "step": 700 + }, + { + "epoch": 0.07213247993497918, + "grad_norm": 4.625, + "learning_rate": 4.951795962732917e-05, + "loss": 0.2894, + "step": 710 + }, + { + "epoch": 0.0731484303565986, + "grad_norm": 3.0, + "learning_rate": 4.9502078580551755e-05, + "loss": 0.3082, + "step": 720 + }, + { + "epoch": 0.07416438077821802, + "grad_norm": 3.65625, + "learning_rate": 4.9485942782190734e-05, + "loss": 0.2308, + "step": 730 + }, + { + "epoch": 0.07518033119983744, + "grad_norm": 4.78125, + "learning_rate": 4.9469552400012306e-05, + "loss": 0.2272, + "step": 740 + }, + { + "epoch": 0.07619628162145688, + "grad_norm": 4.25, + "learning_rate": 4.94529076044296e-05, + "loss": 0.2701, + "step": 750 + }, + { + "epoch": 0.0772122320430763, + "grad_norm": 3.140625, + "learning_rate": 4.94360085685009e-05, + "loss": 0.2686, + "step": 760 + }, + { + "epoch": 0.07822818246469572, + "grad_norm": 0.765625, + "learning_rate": 4.9418855467927894e-05, + "loss": 0.2051, + "step": 770 + }, + { + "epoch": 0.07924413288631514, + "grad_norm": 1.796875, + "learning_rate": 4.940144848105379e-05, + "loss": 0.2267, + "step": 780 + }, + { + "epoch": 0.08026008330793458, + "grad_norm": 4.5625, + "learning_rate": 4.93837877888615e-05, + "loss": 0.2597, + "step": 790 + }, + { + "epoch": 0.081276033729554, + "grad_norm": 3.03125, + "learning_rate": 4.9365873574971745e-05, + "loss": 0.3701, + "step": 800 + }, + { + "epoch": 0.08229198415117342, + "grad_norm": 4.5625, + "learning_rate": 4.9347706025641136e-05, + "loss": 0.2559, + "step": 810 + }, + { + "epoch": 0.08330793457279284, + "grad_norm": 3.90625, + "learning_rate": 4.9329285329760275e-05, + "loss": 0.2799, + "step": 820 + }, + { + "epoch": 0.08432388499441228, + "grad_norm": 3.140625, + "learning_rate": 4.9310611678851735e-05, + "loss": 0.2866, + "step": 830 + }, + { + "epoch": 0.0853398354160317, + "grad_norm": 2.46875, + "learning_rate": 4.929168526706811e-05, + "loss": 0.3105, + "step": 840 + }, + { + "epoch": 0.08635578583765112, + "grad_norm": 13.625, + "learning_rate": 4.927250629119e-05, + "loss": 0.2454, + "step": 850 + }, + { + "epoch": 0.08737173625927054, + "grad_norm": 3.921875, + "learning_rate": 4.9253074950623925e-05, + "loss": 0.2424, + "step": 860 + }, + { + "epoch": 0.08838768668088998, + "grad_norm": 2.90625, + "learning_rate": 4.9233391447400286e-05, + "loss": 0.2481, + "step": 870 + }, + { + "epoch": 0.0894036371025094, + "grad_norm": 2.96875, + "learning_rate": 4.921345598617125e-05, + "loss": 0.2231, + "step": 880 + }, + { + "epoch": 0.09041958752412882, + "grad_norm": 5.375, + "learning_rate": 4.9193268774208654e-05, + "loss": 0.3447, + "step": 890 + }, + { + "epoch": 0.09143553794574824, + "grad_norm": 2.0, + "learning_rate": 4.9172830021401785e-05, + "loss": 0.229, + "step": 900 + }, + { + "epoch": 0.09245148836736768, + "grad_norm": 3.1875, + "learning_rate": 4.9152139940255245e-05, + "loss": 0.2122, + "step": 910 + }, + { + "epoch": 0.0934674387889871, + "grad_norm": 3.40625, + "learning_rate": 4.913119874588677e-05, + "loss": 0.2386, + "step": 920 + }, + { + "epoch": 0.09448338921060652, + "grad_norm": 1.4609375, + "learning_rate": 4.911000665602489e-05, + "loss": 0.1944, + "step": 930 + }, + { + "epoch": 0.09549933963222594, + "grad_norm": 5.0625, + "learning_rate": 4.9088563891006786e-05, + "loss": 0.2038, + "step": 940 + }, + { + "epoch": 0.09651529005384538, + "grad_norm": 4.53125, + "learning_rate": 4.906687067377592e-05, + "loss": 0.3122, + "step": 950 + }, + { + "epoch": 0.0975312404754648, + "grad_norm": 2.84375, + "learning_rate": 4.904492722987976e-05, + "loss": 0.3157, + "step": 960 + }, + { + "epoch": 0.09854719089708422, + "grad_norm": 2.171875, + "learning_rate": 4.902273378746738e-05, + "loss": 0.3077, + "step": 970 + }, + { + "epoch": 0.09956314131870364, + "grad_norm": 2.84375, + "learning_rate": 4.9000290577287165e-05, + "loss": 0.2756, + "step": 980 + }, + { + "epoch": 0.10057909174032308, + "grad_norm": 0.99609375, + "learning_rate": 4.897759783268434e-05, + "loss": 0.2915, + "step": 990 + }, + { + "epoch": 0.1015950421619425, + "grad_norm": 3.53125, + "learning_rate": 4.895465578959859e-05, + "loss": 0.2052, + "step": 1000 + }, + { + "epoch": 0.10261099258356192, + "grad_norm": 4.0, + "learning_rate": 4.893146468656159e-05, + "loss": 0.2499, + "step": 1010 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 1.65625, + "learning_rate": 4.890802476469452e-05, + "loss": 0.278, + "step": 1020 + }, + { + "epoch": 0.10464289342680078, + "grad_norm": 3.625, + "learning_rate": 4.888433626770558e-05, + "loss": 0.2143, + "step": 1030 + }, + { + "epoch": 0.1056588438484202, + "grad_norm": 5.0625, + "learning_rate": 4.886039944188741e-05, + "loss": 0.2878, + "step": 1040 + }, + { + "epoch": 0.10667479427003962, + "grad_norm": 4.5, + "learning_rate": 4.883621453611461e-05, + "loss": 0.2744, + "step": 1050 + }, + { + "epoch": 0.10769074469165904, + "grad_norm": 4.5625, + "learning_rate": 4.881178180184106e-05, + "loss": 0.2734, + "step": 1060 + }, + { + "epoch": 0.10870669511327848, + "grad_norm": 3.125, + "learning_rate": 4.878710149309735e-05, + "loss": 0.3574, + "step": 1070 + }, + { + "epoch": 0.1097226455348979, + "grad_norm": 3.0625, + "learning_rate": 4.876217386648816e-05, + "loss": 0.2625, + "step": 1080 + }, + { + "epoch": 0.11073859595651732, + "grad_norm": 4.0625, + "learning_rate": 4.873699918118955e-05, + "loss": 0.2437, + "step": 1090 + }, + { + "epoch": 0.11175454637813674, + "grad_norm": 1.59375, + "learning_rate": 4.87115776989463e-05, + "loss": 0.2051, + "step": 1100 + }, + { + "epoch": 0.11277049679975618, + "grad_norm": 4.375, + "learning_rate": 4.8685909684069153e-05, + "loss": 0.1727, + "step": 1110 + }, + { + "epoch": 0.1137864472213756, + "grad_norm": 2.28125, + "learning_rate": 4.865999540343211e-05, + "loss": 0.2256, + "step": 1120 + }, + { + "epoch": 0.11480239764299502, + "grad_norm": 2.265625, + "learning_rate": 4.86338351264696e-05, + "loss": 0.3529, + "step": 1130 + }, + { + "epoch": 0.11581834806461444, + "grad_norm": 2.34375, + "learning_rate": 4.8607429125173754e-05, + "loss": 0.2113, + "step": 1140 + }, + { + "epoch": 0.11683429848623388, + "grad_norm": 0.7578125, + "learning_rate": 4.858077767409149e-05, + "loss": 0.2759, + "step": 1150 + }, + { + "epoch": 0.1178502489078533, + "grad_norm": 3.640625, + "learning_rate": 4.855388105032174e-05, + "loss": 0.2482, + "step": 1160 + }, + { + "epoch": 0.11886619932947272, + "grad_norm": 3.5, + "learning_rate": 4.852673953351249e-05, + "loss": 0.1865, + "step": 1170 + }, + { + "epoch": 0.11988214975109214, + "grad_norm": 3.75, + "learning_rate": 4.849935340585796e-05, + "loss": 0.2659, + "step": 1180 + }, + { + "epoch": 0.12089810017271158, + "grad_norm": 3.375, + "learning_rate": 4.8471722952095586e-05, + "loss": 0.1506, + "step": 1190 + }, + { + "epoch": 0.121914050594331, + "grad_norm": 3.34375, + "learning_rate": 4.844384845950312e-05, + "loss": 0.307, + "step": 1200 + }, + { + "epoch": 0.12293000101595042, + "grad_norm": 1.578125, + "learning_rate": 4.841573021789561e-05, + "loss": 0.1952, + "step": 1210 + }, + { + "epoch": 0.12394595143756984, + "grad_norm": 1.2890625, + "learning_rate": 4.838736851962239e-05, + "loss": 0.1779, + "step": 1220 + }, + { + "epoch": 0.12496190185918928, + "grad_norm": 1.265625, + "learning_rate": 4.835876365956408e-05, + "loss": 0.1235, + "step": 1230 + }, + { + "epoch": 0.12597785228080868, + "grad_norm": 1.9609375, + "learning_rate": 4.8329915935129436e-05, + "loss": 0.1876, + "step": 1240 + }, + { + "epoch": 0.12699380270242813, + "grad_norm": 1.6328125, + "learning_rate": 4.830082564625235e-05, + "loss": 0.2188, + "step": 1250 + }, + { + "epoch": 0.12800975312404755, + "grad_norm": 3.96875, + "learning_rate": 4.8271493095388684e-05, + "loss": 0.2622, + "step": 1260 + }, + { + "epoch": 0.12902570354566698, + "grad_norm": 3.765625, + "learning_rate": 4.824191858751312e-05, + "loss": 0.2724, + "step": 1270 + }, + { + "epoch": 0.1300416539672864, + "grad_norm": 5.59375, + "learning_rate": 4.821210243011601e-05, + "loss": 0.2413, + "step": 1280 + }, + { + "epoch": 0.13105760438890582, + "grad_norm": 3.34375, + "learning_rate": 4.818204493320016e-05, + "loss": 0.2618, + "step": 1290 + }, + { + "epoch": 0.13207355481052524, + "grad_norm": 2.78125, + "learning_rate": 4.8151746409277634e-05, + "loss": 0.2295, + "step": 1300 + }, + { + "epoch": 0.13308950523214466, + "grad_norm": 3.1875, + "learning_rate": 4.8121207173366484e-05, + "loss": 0.2733, + "step": 1310 + }, + { + "epoch": 0.13410545565376408, + "grad_norm": 2.28125, + "learning_rate": 4.809042754298746e-05, + "loss": 0.2311, + "step": 1320 + }, + { + "epoch": 0.13512140607538353, + "grad_norm": 2.171875, + "learning_rate": 4.805940783816075e-05, + "loss": 0.2059, + "step": 1330 + }, + { + "epoch": 0.13613735649700295, + "grad_norm": 2.796875, + "learning_rate": 4.8028148381402625e-05, + "loss": 0.2102, + "step": 1340 + }, + { + "epoch": 0.13715330691862238, + "grad_norm": 2.96875, + "learning_rate": 4.7996649497722084e-05, + "loss": 0.2708, + "step": 1350 + }, + { + "epoch": 0.1381692573402418, + "grad_norm": 2.4375, + "learning_rate": 4.7964911514617485e-05, + "loss": 0.2429, + "step": 1360 + }, + { + "epoch": 0.13918520776186122, + "grad_norm": 5.8125, + "learning_rate": 4.793293476207312e-05, + "loss": 0.2725, + "step": 1370 + }, + { + "epoch": 0.14020115818348064, + "grad_norm": 2.40625, + "learning_rate": 4.790071957255585e-05, + "loss": 0.2098, + "step": 1380 + }, + { + "epoch": 0.14121710860510006, + "grad_norm": 4.25, + "learning_rate": 4.786826628101154e-05, + "loss": 0.2101, + "step": 1390 + }, + { + "epoch": 0.14223305902671948, + "grad_norm": 2.578125, + "learning_rate": 4.783557522486167e-05, + "loss": 0.2624, + "step": 1400 + }, + { + "epoch": 0.14324900944833893, + "grad_norm": 3.125, + "learning_rate": 4.780264674399978e-05, + "loss": 0.2518, + "step": 1410 + }, + { + "epoch": 0.14426495986995835, + "grad_norm": 3.671875, + "learning_rate": 4.7769481180787966e-05, + "loss": 0.3112, + "step": 1420 + }, + { + "epoch": 0.14528091029157778, + "grad_norm": 3.984375, + "learning_rate": 4.773607888005327e-05, + "loss": 0.2747, + "step": 1430 + }, + { + "epoch": 0.1462968607131972, + "grad_norm": 3.234375, + "learning_rate": 4.770244018908416e-05, + "loss": 0.1572, + "step": 1440 + }, + { + "epoch": 0.14731281113481662, + "grad_norm": 4.09375, + "learning_rate": 4.766856545762687e-05, + "loss": 0.2148, + "step": 1450 + }, + { + "epoch": 0.14832876155643604, + "grad_norm": 1.6875, + "learning_rate": 4.763445503788178e-05, + "loss": 0.2531, + "step": 1460 + }, + { + "epoch": 0.14934471197805546, + "grad_norm": 2.375, + "learning_rate": 4.760010928449976e-05, + "loss": 0.199, + "step": 1470 + }, + { + "epoch": 0.15036066239967488, + "grad_norm": 4.6875, + "learning_rate": 4.7565528554578485e-05, + "loss": 0.2366, + "step": 1480 + }, + { + "epoch": 0.15137661282129433, + "grad_norm": 5.4375, + "learning_rate": 4.75307132076587e-05, + "loss": 0.1862, + "step": 1490 + }, + { + "epoch": 0.15239256324291375, + "grad_norm": 2.484375, + "learning_rate": 4.749566360572049e-05, + "loss": 0.2143, + "step": 1500 + }, + { + "epoch": 0.15340851366453317, + "grad_norm": 2.1875, + "learning_rate": 4.746038011317955e-05, + "loss": 0.1877, + "step": 1510 + }, + { + "epoch": 0.1544244640861526, + "grad_norm": 2.84375, + "learning_rate": 4.742486309688333e-05, + "loss": 0.2831, + "step": 1520 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 2.015625, + "learning_rate": 4.738911292610732e-05, + "loss": 0.1708, + "step": 1530 + }, + { + "epoch": 0.15645636492939144, + "grad_norm": 3.953125, + "learning_rate": 4.735312997255107e-05, + "loss": 0.192, + "step": 1540 + }, + { + "epoch": 0.15747231535101086, + "grad_norm": 2.09375, + "learning_rate": 4.7316914610334475e-05, + "loss": 0.2586, + "step": 1550 + }, + { + "epoch": 0.15848826577263028, + "grad_norm": 3.6875, + "learning_rate": 4.728046721599378e-05, + "loss": 0.2141, + "step": 1560 + }, + { + "epoch": 0.15950421619424973, + "grad_norm": 2.9375, + "learning_rate": 4.724378816847771e-05, + "loss": 0.193, + "step": 1570 + }, + { + "epoch": 0.16052016661586915, + "grad_norm": 1.5625, + "learning_rate": 4.720687784914352e-05, + "loss": 0.191, + "step": 1580 + }, + { + "epoch": 0.16153611703748857, + "grad_norm": 3.75, + "learning_rate": 4.716973664175304e-05, + "loss": 0.2172, + "step": 1590 + }, + { + "epoch": 0.162552067459108, + "grad_norm": 3.125, + "learning_rate": 4.7132364932468645e-05, + "loss": 0.2134, + "step": 1600 + }, + { + "epoch": 0.16356801788072742, + "grad_norm": 4.09375, + "learning_rate": 4.709476310984932e-05, + "loss": 0.2055, + "step": 1610 + }, + { + "epoch": 0.16458396830234684, + "grad_norm": 3.875, + "learning_rate": 4.705693156484652e-05, + "loss": 0.2136, + "step": 1620 + }, + { + "epoch": 0.16559991872396626, + "grad_norm": 1.1796875, + "learning_rate": 4.7018870690800196e-05, + "loss": 0.1471, + "step": 1630 + }, + { + "epoch": 0.16661586914558568, + "grad_norm": 2.5, + "learning_rate": 4.698058088343465e-05, + "loss": 0.2308, + "step": 1640 + }, + { + "epoch": 0.16763181956720513, + "grad_norm": 1.390625, + "learning_rate": 4.6942062540854425e-05, + "loss": 0.2456, + "step": 1650 + }, + { + "epoch": 0.16864776998882455, + "grad_norm": 3.125, + "learning_rate": 4.69033160635402e-05, + "loss": 0.2654, + "step": 1660 + }, + { + "epoch": 0.16966372041044397, + "grad_norm": 3.984375, + "learning_rate": 4.6864341854344587e-05, + "loss": 0.2226, + "step": 1670 + }, + { + "epoch": 0.1706796708320634, + "grad_norm": 2.328125, + "learning_rate": 4.682514031848795e-05, + "loss": 0.2438, + "step": 1680 + }, + { + "epoch": 0.17169562125368282, + "grad_norm": 3.078125, + "learning_rate": 4.678571186355423e-05, + "loss": 0.1889, + "step": 1690 + }, + { + "epoch": 0.17271157167530224, + "grad_norm": 3.328125, + "learning_rate": 4.6746056899486644e-05, + "loss": 0.2117, + "step": 1700 + }, + { + "epoch": 0.17372752209692166, + "grad_norm": 2.78125, + "learning_rate": 4.67061758385835e-05, + "loss": 0.1953, + "step": 1710 + }, + { + "epoch": 0.17474347251854108, + "grad_norm": 3.09375, + "learning_rate": 4.6666069095493816e-05, + "loss": 0.1844, + "step": 1720 + }, + { + "epoch": 0.17575942294016053, + "grad_norm": 3.234375, + "learning_rate": 4.662573708721309e-05, + "loss": 0.2774, + "step": 1730 + }, + { + "epoch": 0.17677537336177995, + "grad_norm": 4.03125, + "learning_rate": 4.658518023307894e-05, + "loss": 0.2527, + "step": 1740 + }, + { + "epoch": 0.17779132378339937, + "grad_norm": 3.21875, + "learning_rate": 4.654439895476671e-05, + "loss": 0.2164, + "step": 1750 + }, + { + "epoch": 0.1788072742050188, + "grad_norm": 2.390625, + "learning_rate": 4.6503393676285146e-05, + "loss": 0.2424, + "step": 1760 + }, + { + "epoch": 0.17982322462663822, + "grad_norm": 1.8359375, + "learning_rate": 4.646216482397192e-05, + "loss": 0.2428, + "step": 1770 + }, + { + "epoch": 0.18083917504825764, + "grad_norm": 2.796875, + "learning_rate": 4.6420712826489275e-05, + "loss": 0.2155, + "step": 1780 + }, + { + "epoch": 0.18185512546987706, + "grad_norm": 0.69921875, + "learning_rate": 4.6379038114819485e-05, + "loss": 0.1544, + "step": 1790 + }, + { + "epoch": 0.18287107589149648, + "grad_norm": 3.40625, + "learning_rate": 4.6337141122260444e-05, + "loss": 0.2029, + "step": 1800 + }, + { + "epoch": 0.18388702631311593, + "grad_norm": 2.359375, + "learning_rate": 4.629502228442112e-05, + "loss": 0.1489, + "step": 1810 + }, + { + "epoch": 0.18490297673473535, + "grad_norm": 1.4453125, + "learning_rate": 4.6252682039217045e-05, + "loss": 0.2101, + "step": 1820 + }, + { + "epoch": 0.18591892715635477, + "grad_norm": 2.71875, + "learning_rate": 4.621012082686573e-05, + "loss": 0.2076, + "step": 1830 + }, + { + "epoch": 0.1869348775779742, + "grad_norm": 3.0625, + "learning_rate": 4.616733908988216e-05, + "loss": 0.2719, + "step": 1840 + }, + { + "epoch": 0.18795082799959362, + "grad_norm": 1.953125, + "learning_rate": 4.612433727307409e-05, + "loss": 0.2105, + "step": 1850 + }, + { + "epoch": 0.18896677842121304, + "grad_norm": 3.46875, + "learning_rate": 4.608111582353751e-05, + "loss": 0.1877, + "step": 1860 + }, + { + "epoch": 0.18998272884283246, + "grad_norm": 2.546875, + "learning_rate": 4.603767519065197e-05, + "loss": 0.2238, + "step": 1870 + }, + { + "epoch": 0.19099867926445188, + "grad_norm": 1.5703125, + "learning_rate": 4.599401582607589e-05, + "loss": 0.243, + "step": 1880 + }, + { + "epoch": 0.19201462968607133, + "grad_norm": 2.4375, + "learning_rate": 4.595013818374185e-05, + "loss": 0.1867, + "step": 1890 + }, + { + "epoch": 0.19303058010769075, + "grad_norm": 2.203125, + "learning_rate": 4.5906042719851925e-05, + "loss": 0.1994, + "step": 1900 + }, + { + "epoch": 0.19404653052931017, + "grad_norm": 3.984375, + "learning_rate": 4.586172989287291e-05, + "loss": 0.1899, + "step": 1910 + }, + { + "epoch": 0.1950624809509296, + "grad_norm": 2.6875, + "learning_rate": 4.5817200163531534e-05, + "loss": 0.2528, + "step": 1920 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 2.71875, + "learning_rate": 4.577245399480972e-05, + "loss": 0.2336, + "step": 1930 + }, + { + "epoch": 0.19709438179416844, + "grad_norm": 2.640625, + "learning_rate": 4.5727491851939715e-05, + "loss": 0.2204, + "step": 1940 + }, + { + "epoch": 0.19811033221578786, + "grad_norm": 1.78125, + "learning_rate": 4.568231420239929e-05, + "loss": 0.1656, + "step": 1950 + }, + { + "epoch": 0.19912628263740728, + "grad_norm": 3.15625, + "learning_rate": 4.563692151590687e-05, + "loss": 0.2105, + "step": 1960 + }, + { + "epoch": 0.20014223305902673, + "grad_norm": 1.3671875, + "learning_rate": 4.5591314264416666e-05, + "loss": 0.1464, + "step": 1970 + }, + { + "epoch": 0.20115818348064615, + "grad_norm": 4.25, + "learning_rate": 4.554549292211371e-05, + "loss": 0.2103, + "step": 1980 + }, + { + "epoch": 0.20217413390226557, + "grad_norm": 2.984375, + "learning_rate": 4.549945796540901e-05, + "loss": 0.144, + "step": 1990 + }, + { + "epoch": 0.203190084323885, + "grad_norm": 1.859375, + "learning_rate": 4.545320987293453e-05, + "loss": 0.1963, + "step": 2000 + } + ], + "logging_steps": 10, + "max_steps": 9843, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoints/checkpoint-2000/training_args.bin b/checkpoints/checkpoint-2000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bb0a4ec10ab29ca1942aecdfa4212d352e373f1 --- /dev/null +++ b/checkpoints/checkpoint-2000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 +size 4984 diff --git a/checkpoints/checkpoint-2500/.DS_Store b/checkpoints/checkpoint-2500/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 Binary files /dev/null and b/checkpoints/checkpoint-2500/.DS_Store differ diff --git a/checkpoints/checkpoint-2500/adapter_config.json b/checkpoints/checkpoint-2500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70 --- /dev/null +++ b/checkpoints/checkpoint-2500/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "gaussian", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoints/checkpoint-2500/adapter_model.safetensors b/checkpoints/checkpoint-2500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b72c393034479ae51f68bfdb488baeb0eca04321 --- /dev/null +++ b/checkpoints/checkpoint-2500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56179446117ddaff95696487189a032d48a215d8456339c2ae5eda1870df93b9 +size 5919456 diff --git a/checkpoints/checkpoint-2500/optimizer.pt b/checkpoints/checkpoint-2500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0b52ed1ee54b291770f15300bf140925bca02fde --- /dev/null +++ b/checkpoints/checkpoint-2500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd04baeeb8fe514953f04bca66abb944e11003693bffa63c977e38849273cb49 +size 11930938 diff --git a/checkpoints/checkpoint-2500/rng_state_0.pth b/checkpoints/checkpoint-2500/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..17380c6e5ac5082c526e6f93eccf203432124116 --- /dev/null +++ b/checkpoints/checkpoint-2500/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:460a81dc2425be030afb2c9930b7e9fc9de54ad9cf988c330851fefe47a118c2 +size 15024 diff --git a/checkpoints/checkpoint-2500/rng_state_1.pth b/checkpoints/checkpoint-2500/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..38dd76771028ab39d8a7742325c670728c9d9b17 --- /dev/null +++ b/checkpoints/checkpoint-2500/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f39c713bcb800d5d9d55e44b2d2e744bbee1e449a935b9a681868e507ac58f86 +size 15024 diff --git a/checkpoints/checkpoint-2500/rng_state_2.pth b/checkpoints/checkpoint-2500/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..ba8fc1fc2b8078c75a9a126da3a3f68cd3411b42 --- /dev/null +++ b/checkpoints/checkpoint-2500/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6b1c81cf4c5daaf0fe1a5aa4e87259ede9f631f0867a6d085b8eba9a03f7275 +size 15024 diff --git a/checkpoints/checkpoint-2500/rng_state_3.pth b/checkpoints/checkpoint-2500/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..7f389e22cd4c2984db2f53ef472201c762aff9e6 --- /dev/null +++ b/checkpoints/checkpoint-2500/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7ae3d71695a016b734120d0e84d8510429872cccd4cdfc5ad051249bdceb709 +size 15024 diff --git a/checkpoints/checkpoint-2500/scheduler.pt b/checkpoints/checkpoint-2500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cfe96bf12bb338be15a0e572c935955e022dddb6 --- /dev/null +++ b/checkpoints/checkpoint-2500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0992ea36b796a9efd875f53514a1a7b72426fb94846549bcad84ea3eae0acee +size 1064 diff --git a/checkpoints/checkpoint-2500/trainer_state.json b/checkpoints/checkpoint-2500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..085f59bbcf897cba748b6ce5aee019fdbd65bb0d --- /dev/null +++ b/checkpoints/checkpoint-2500/trainer_state.json @@ -0,0 +1,1771 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.25398760540485626, + "eval_steps": 500, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001015950421619425, + "grad_norm": 17.625, + "learning_rate": 5e-06, + "loss": 3.4264, + "step": 10 + }, + { + "epoch": 0.00203190084323885, + "grad_norm": 12.5625, + "learning_rate": 1e-05, + "loss": 3.432, + "step": 20 + }, + { + "epoch": 0.003047851264858275, + "grad_norm": 14.0625, + "learning_rate": 1.5e-05, + "loss": 3.23, + "step": 30 + }, + { + "epoch": 0.0040638016864777, + "grad_norm": 12.4375, + "learning_rate": 2e-05, + "loss": 2.9762, + "step": 40 + }, + { + "epoch": 0.005079752108097125, + "grad_norm": 10.0625, + "learning_rate": 2.5e-05, + "loss": 2.6173, + "step": 50 + }, + { + "epoch": 0.00609570252971655, + "grad_norm": 10.1875, + "learning_rate": 3e-05, + "loss": 2.2004, + "step": 60 + }, + { + "epoch": 0.007111652951335975, + "grad_norm": 7.03125, + "learning_rate": 3.5e-05, + "loss": 1.4176, + "step": 70 + }, + { + "epoch": 0.0081276033729554, + "grad_norm": 4.375, + "learning_rate": 4e-05, + "loss": 1.0122, + "step": 80 + }, + { + "epoch": 0.009143553794574825, + "grad_norm": 6.5625, + "learning_rate": 4.5e-05, + "loss": 0.9116, + "step": 90 + }, + { + "epoch": 0.01015950421619425, + "grad_norm": 5.28125, + "learning_rate": 5e-05, + "loss": 0.6832, + "step": 100 + }, + { + "epoch": 0.011175454637813675, + "grad_norm": 5.5, + "learning_rate": 4.9999870035728426e-05, + "loss": 0.7355, + "step": 110 + }, + { + "epoch": 0.0121914050594331, + "grad_norm": 5.1875, + "learning_rate": 4.9999480144264944e-05, + "loss": 0.6673, + "step": 120 + }, + { + "epoch": 0.013207355481052525, + "grad_norm": 4.5, + "learning_rate": 4.9998830329663314e-05, + "loss": 0.6792, + "step": 130 + }, + { + "epoch": 0.01422330590267195, + "grad_norm": 3.9375, + "learning_rate": 4.9997920598679756e-05, + "loss": 0.6207, + "step": 140 + }, + { + "epoch": 0.015239256324291375, + "grad_norm": 3.15625, + "learning_rate": 4.999675096077286e-05, + "loss": 0.483, + "step": 150 + }, + { + "epoch": 0.0162552067459108, + "grad_norm": 5.28125, + "learning_rate": 4.999532142810354e-05, + "loss": 0.5319, + "step": 160 + }, + { + "epoch": 0.017271157167530225, + "grad_norm": 4.59375, + "learning_rate": 4.999363201553483e-05, + "loss": 0.6052, + "step": 170 + }, + { + "epoch": 0.01828710758914965, + "grad_norm": 5.03125, + "learning_rate": 4.9991682740631794e-05, + "loss": 0.4258, + "step": 180 + }, + { + "epoch": 0.019303058010769075, + "grad_norm": 3.859375, + "learning_rate": 4.998947362366133e-05, + "loss": 0.4309, + "step": 190 + }, + { + "epoch": 0.0203190084323885, + "grad_norm": 3.328125, + "learning_rate": 4.998700468759193e-05, + "loss": 0.3957, + "step": 200 + }, + { + "epoch": 0.021334958854007924, + "grad_norm": 4.9375, + "learning_rate": 4.9984275958093475e-05, + "loss": 0.4777, + "step": 210 + }, + { + "epoch": 0.02235090927562735, + "grad_norm": 4.78125, + "learning_rate": 4.998128746353695e-05, + "loss": 0.3549, + "step": 220 + }, + { + "epoch": 0.023366859697246774, + "grad_norm": 4.0625, + "learning_rate": 4.997803923499417e-05, + "loss": 0.4447, + "step": 230 + }, + { + "epoch": 0.0243828101188662, + "grad_norm": 6.375, + "learning_rate": 4.99745313062374e-05, + "loss": 0.3808, + "step": 240 + }, + { + "epoch": 0.025398760540485624, + "grad_norm": 3.59375, + "learning_rate": 4.99707637137391e-05, + "loss": 0.3827, + "step": 250 + }, + { + "epoch": 0.02641471096210505, + "grad_norm": 3.015625, + "learning_rate": 4.996673649667145e-05, + "loss": 0.3694, + "step": 260 + }, + { + "epoch": 0.027430661383724474, + "grad_norm": 2.296875, + "learning_rate": 4.9962449696906e-05, + "loss": 0.3586, + "step": 270 + }, + { + "epoch": 0.0284466118053439, + "grad_norm": 4.125, + "learning_rate": 4.9957903359013214e-05, + "loss": 0.3832, + "step": 280 + }, + { + "epoch": 0.029462562226963324, + "grad_norm": 3.296875, + "learning_rate": 4.995309753026201e-05, + "loss": 0.328, + "step": 290 + }, + { + "epoch": 0.03047851264858275, + "grad_norm": 4.5, + "learning_rate": 4.994803226061927e-05, + "loss": 0.3667, + "step": 300 + }, + { + "epoch": 0.03149446307020217, + "grad_norm": 4.3125, + "learning_rate": 4.994270760274933e-05, + "loss": 0.3811, + "step": 310 + }, + { + "epoch": 0.0325104134918216, + "grad_norm": 3.421875, + "learning_rate": 4.99371236120134e-05, + "loss": 0.3065, + "step": 320 + }, + { + "epoch": 0.03352636391344102, + "grad_norm": 4.6875, + "learning_rate": 4.993128034646902e-05, + "loss": 0.4177, + "step": 330 + }, + { + "epoch": 0.03454231433506045, + "grad_norm": 3.046875, + "learning_rate": 4.992517786686947e-05, + "loss": 0.33, + "step": 340 + }, + { + "epoch": 0.03555826475667987, + "grad_norm": 1.8828125, + "learning_rate": 4.9918816236663077e-05, + "loss": 0.3287, + "step": 350 + }, + { + "epoch": 0.0365742151782993, + "grad_norm": 3.8125, + "learning_rate": 4.991219552199262e-05, + "loss": 0.2934, + "step": 360 + }, + { + "epoch": 0.03759016559991872, + "grad_norm": 4.28125, + "learning_rate": 4.99053157916946e-05, + "loss": 0.3176, + "step": 370 + }, + { + "epoch": 0.03860611602153815, + "grad_norm": 2.609375, + "learning_rate": 4.989817711729856e-05, + "loss": 0.3318, + "step": 380 + }, + { + "epoch": 0.03962206644315757, + "grad_norm": 2.375, + "learning_rate": 4.98907795730263e-05, + "loss": 0.3234, + "step": 390 + }, + { + "epoch": 0.040638016864777, + "grad_norm": 4.46875, + "learning_rate": 4.988312323579114e-05, + "loss": 0.267, + "step": 400 + }, + { + "epoch": 0.04165396728639642, + "grad_norm": 3.75, + "learning_rate": 4.98752081851971e-05, + "loss": 0.3081, + "step": 410 + }, + { + "epoch": 0.04266991770801585, + "grad_norm": 2.203125, + "learning_rate": 4.986703450353809e-05, + "loss": 0.2917, + "step": 420 + }, + { + "epoch": 0.04368586812963527, + "grad_norm": 1.6015625, + "learning_rate": 4.985860227579703e-05, + "loss": 0.2805, + "step": 430 + }, + { + "epoch": 0.0447018185512547, + "grad_norm": 3.140625, + "learning_rate": 4.984991158964499e-05, + "loss": 0.3534, + "step": 440 + }, + { + "epoch": 0.04571776897287412, + "grad_norm": 3.296875, + "learning_rate": 4.9840962535440265e-05, + "loss": 0.335, + "step": 450 + }, + { + "epoch": 0.04673371939449355, + "grad_norm": 3.25, + "learning_rate": 4.983175520622744e-05, + "loss": 0.2544, + "step": 460 + }, + { + "epoch": 0.04774966981611297, + "grad_norm": 2.25, + "learning_rate": 4.982228969773642e-05, + "loss": 0.3449, + "step": 470 + }, + { + "epoch": 0.0487656202377324, + "grad_norm": 4.9375, + "learning_rate": 4.9812566108381435e-05, + "loss": 0.2964, + "step": 480 + }, + { + "epoch": 0.04978157065935182, + "grad_norm": 1.5703125, + "learning_rate": 4.9802584539260035e-05, + "loss": 0.2799, + "step": 490 + }, + { + "epoch": 0.05079752108097125, + "grad_norm": 2.828125, + "learning_rate": 4.979234509415199e-05, + "loss": 0.3231, + "step": 500 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 2.9375, + "learning_rate": 4.978184787951828e-05, + "loss": 0.2943, + "step": 510 + }, + { + "epoch": 0.0528294219242101, + "grad_norm": 2.34375, + "learning_rate": 4.977109300449992e-05, + "loss": 0.2705, + "step": 520 + }, + { + "epoch": 0.05384537234582952, + "grad_norm": 3.140625, + "learning_rate": 4.9760080580916876e-05, + "loss": 0.2998, + "step": 530 + }, + { + "epoch": 0.05486132276744895, + "grad_norm": 3.5625, + "learning_rate": 4.974881072326688e-05, + "loss": 0.2595, + "step": 540 + }, + { + "epoch": 0.05587727318906837, + "grad_norm": 4.25, + "learning_rate": 4.9737283548724236e-05, + "loss": 0.2803, + "step": 550 + }, + { + "epoch": 0.0568932236106878, + "grad_norm": 4.0625, + "learning_rate": 4.97254991771386e-05, + "loss": 0.3511, + "step": 560 + }, + { + "epoch": 0.05790917403230722, + "grad_norm": 2.515625, + "learning_rate": 4.971345773103377e-05, + "loss": 0.312, + "step": 570 + }, + { + "epoch": 0.05892512445392665, + "grad_norm": 3.21875, + "learning_rate": 4.9701159335606365e-05, + "loss": 0.2482, + "step": 580 + }, + { + "epoch": 0.05994107487554607, + "grad_norm": 5.5, + "learning_rate": 4.968860411872454e-05, + "loss": 0.2537, + "step": 590 + }, + { + "epoch": 0.0609570252971655, + "grad_norm": 3.546875, + "learning_rate": 4.967579221092666e-05, + "loss": 0.3125, + "step": 600 + }, + { + "epoch": 0.06197297571878492, + "grad_norm": 2.984375, + "learning_rate": 4.966272374541996e-05, + "loss": 0.2354, + "step": 610 + }, + { + "epoch": 0.06298892614040434, + "grad_norm": 3.6875, + "learning_rate": 4.964939885807912e-05, + "loss": 0.3213, + "step": 620 + }, + { + "epoch": 0.06400487656202378, + "grad_norm": 2.140625, + "learning_rate": 4.9635817687444876e-05, + "loss": 0.3003, + "step": 630 + }, + { + "epoch": 0.0650208269836432, + "grad_norm": 3.484375, + "learning_rate": 4.962198037472259e-05, + "loss": 0.2996, + "step": 640 + }, + { + "epoch": 0.06603677740526262, + "grad_norm": 3.21875, + "learning_rate": 4.9607887063780776e-05, + "loss": 0.2257, + "step": 650 + }, + { + "epoch": 0.06705272782688204, + "grad_norm": 5.375, + "learning_rate": 4.9593537901149564e-05, + "loss": 0.223, + "step": 660 + }, + { + "epoch": 0.06806867824850148, + "grad_norm": 4.1875, + "learning_rate": 4.957893303601924e-05, + "loss": 0.3407, + "step": 670 + }, + { + "epoch": 0.0690846286701209, + "grad_norm": 3.328125, + "learning_rate": 4.956407262023866e-05, + "loss": 0.2589, + "step": 680 + }, + { + "epoch": 0.07010057909174032, + "grad_norm": 2.953125, + "learning_rate": 4.954895680831367e-05, + "loss": 0.2949, + "step": 690 + }, + { + "epoch": 0.07111652951335974, + "grad_norm": 4.0625, + "learning_rate": 4.9533585757405506e-05, + "loss": 0.2995, + "step": 700 + }, + { + "epoch": 0.07213247993497918, + "grad_norm": 4.625, + "learning_rate": 4.951795962732917e-05, + "loss": 0.2894, + "step": 710 + }, + { + "epoch": 0.0731484303565986, + "grad_norm": 3.0, + "learning_rate": 4.9502078580551755e-05, + "loss": 0.3082, + "step": 720 + }, + { + "epoch": 0.07416438077821802, + "grad_norm": 3.65625, + "learning_rate": 4.9485942782190734e-05, + "loss": 0.2308, + "step": 730 + }, + { + "epoch": 0.07518033119983744, + "grad_norm": 4.78125, + "learning_rate": 4.9469552400012306e-05, + "loss": 0.2272, + "step": 740 + }, + { + "epoch": 0.07619628162145688, + "grad_norm": 4.25, + "learning_rate": 4.94529076044296e-05, + "loss": 0.2701, + "step": 750 + }, + { + "epoch": 0.0772122320430763, + "grad_norm": 3.140625, + "learning_rate": 4.94360085685009e-05, + "loss": 0.2686, + "step": 760 + }, + { + "epoch": 0.07822818246469572, + "grad_norm": 0.765625, + "learning_rate": 4.9418855467927894e-05, + "loss": 0.2051, + "step": 770 + }, + { + "epoch": 0.07924413288631514, + "grad_norm": 1.796875, + "learning_rate": 4.940144848105379e-05, + "loss": 0.2267, + "step": 780 + }, + { + "epoch": 0.08026008330793458, + "grad_norm": 4.5625, + "learning_rate": 4.93837877888615e-05, + "loss": 0.2597, + "step": 790 + }, + { + "epoch": 0.081276033729554, + "grad_norm": 3.03125, + "learning_rate": 4.9365873574971745e-05, + "loss": 0.3701, + "step": 800 + }, + { + "epoch": 0.08229198415117342, + "grad_norm": 4.5625, + "learning_rate": 4.9347706025641136e-05, + "loss": 0.2559, + "step": 810 + }, + { + "epoch": 0.08330793457279284, + "grad_norm": 3.90625, + "learning_rate": 4.9329285329760275e-05, + "loss": 0.2799, + "step": 820 + }, + { + "epoch": 0.08432388499441228, + "grad_norm": 3.140625, + "learning_rate": 4.9310611678851735e-05, + "loss": 0.2866, + "step": 830 + }, + { + "epoch": 0.0853398354160317, + "grad_norm": 2.46875, + "learning_rate": 4.929168526706811e-05, + "loss": 0.3105, + "step": 840 + }, + { + "epoch": 0.08635578583765112, + "grad_norm": 13.625, + "learning_rate": 4.927250629119e-05, + "loss": 0.2454, + "step": 850 + }, + { + "epoch": 0.08737173625927054, + "grad_norm": 3.921875, + "learning_rate": 4.9253074950623925e-05, + "loss": 0.2424, + "step": 860 + }, + { + "epoch": 0.08838768668088998, + "grad_norm": 2.90625, + "learning_rate": 4.9233391447400286e-05, + "loss": 0.2481, + "step": 870 + }, + { + "epoch": 0.0894036371025094, + "grad_norm": 2.96875, + "learning_rate": 4.921345598617125e-05, + "loss": 0.2231, + "step": 880 + }, + { + "epoch": 0.09041958752412882, + "grad_norm": 5.375, + "learning_rate": 4.9193268774208654e-05, + "loss": 0.3447, + "step": 890 + }, + { + "epoch": 0.09143553794574824, + "grad_norm": 2.0, + "learning_rate": 4.9172830021401785e-05, + "loss": 0.229, + "step": 900 + }, + { + "epoch": 0.09245148836736768, + "grad_norm": 3.1875, + "learning_rate": 4.9152139940255245e-05, + "loss": 0.2122, + "step": 910 + }, + { + "epoch": 0.0934674387889871, + "grad_norm": 3.40625, + "learning_rate": 4.913119874588677e-05, + "loss": 0.2386, + "step": 920 + }, + { + "epoch": 0.09448338921060652, + "grad_norm": 1.4609375, + "learning_rate": 4.911000665602489e-05, + "loss": 0.1944, + "step": 930 + }, + { + "epoch": 0.09549933963222594, + "grad_norm": 5.0625, + "learning_rate": 4.9088563891006786e-05, + "loss": 0.2038, + "step": 940 + }, + { + "epoch": 0.09651529005384538, + "grad_norm": 4.53125, + "learning_rate": 4.906687067377592e-05, + "loss": 0.3122, + "step": 950 + }, + { + "epoch": 0.0975312404754648, + "grad_norm": 2.84375, + "learning_rate": 4.904492722987976e-05, + "loss": 0.3157, + "step": 960 + }, + { + "epoch": 0.09854719089708422, + "grad_norm": 2.171875, + "learning_rate": 4.902273378746738e-05, + "loss": 0.3077, + "step": 970 + }, + { + "epoch": 0.09956314131870364, + "grad_norm": 2.84375, + "learning_rate": 4.9000290577287165e-05, + "loss": 0.2756, + "step": 980 + }, + { + "epoch": 0.10057909174032308, + "grad_norm": 0.99609375, + "learning_rate": 4.897759783268434e-05, + "loss": 0.2915, + "step": 990 + }, + { + "epoch": 0.1015950421619425, + "grad_norm": 3.53125, + "learning_rate": 4.895465578959859e-05, + "loss": 0.2052, + "step": 1000 + }, + { + "epoch": 0.10261099258356192, + "grad_norm": 4.0, + "learning_rate": 4.893146468656159e-05, + "loss": 0.2499, + "step": 1010 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 1.65625, + "learning_rate": 4.890802476469452e-05, + "loss": 0.278, + "step": 1020 + }, + { + "epoch": 0.10464289342680078, + "grad_norm": 3.625, + "learning_rate": 4.888433626770558e-05, + "loss": 0.2143, + "step": 1030 + }, + { + "epoch": 0.1056588438484202, + "grad_norm": 5.0625, + "learning_rate": 4.886039944188741e-05, + "loss": 0.2878, + "step": 1040 + }, + { + "epoch": 0.10667479427003962, + "grad_norm": 4.5, + "learning_rate": 4.883621453611461e-05, + "loss": 0.2744, + "step": 1050 + }, + { + "epoch": 0.10769074469165904, + "grad_norm": 4.5625, + "learning_rate": 4.881178180184106e-05, + "loss": 0.2734, + "step": 1060 + }, + { + "epoch": 0.10870669511327848, + "grad_norm": 3.125, + "learning_rate": 4.878710149309735e-05, + "loss": 0.3574, + "step": 1070 + }, + { + "epoch": 0.1097226455348979, + "grad_norm": 3.0625, + "learning_rate": 4.876217386648816e-05, + "loss": 0.2625, + "step": 1080 + }, + { + "epoch": 0.11073859595651732, + "grad_norm": 4.0625, + "learning_rate": 4.873699918118955e-05, + "loss": 0.2437, + "step": 1090 + }, + { + "epoch": 0.11175454637813674, + "grad_norm": 1.59375, + "learning_rate": 4.87115776989463e-05, + "loss": 0.2051, + "step": 1100 + }, + { + "epoch": 0.11277049679975618, + "grad_norm": 4.375, + "learning_rate": 4.8685909684069153e-05, + "loss": 0.1727, + "step": 1110 + }, + { + "epoch": 0.1137864472213756, + "grad_norm": 2.28125, + "learning_rate": 4.865999540343211e-05, + "loss": 0.2256, + "step": 1120 + }, + { + "epoch": 0.11480239764299502, + "grad_norm": 2.265625, + "learning_rate": 4.86338351264696e-05, + "loss": 0.3529, + "step": 1130 + }, + { + "epoch": 0.11581834806461444, + "grad_norm": 2.34375, + "learning_rate": 4.8607429125173754e-05, + "loss": 0.2113, + "step": 1140 + }, + { + "epoch": 0.11683429848623388, + "grad_norm": 0.7578125, + "learning_rate": 4.858077767409149e-05, + "loss": 0.2759, + "step": 1150 + }, + { + "epoch": 0.1178502489078533, + "grad_norm": 3.640625, + "learning_rate": 4.855388105032174e-05, + "loss": 0.2482, + "step": 1160 + }, + { + "epoch": 0.11886619932947272, + "grad_norm": 3.5, + "learning_rate": 4.852673953351249e-05, + "loss": 0.1865, + "step": 1170 + }, + { + "epoch": 0.11988214975109214, + "grad_norm": 3.75, + "learning_rate": 4.849935340585796e-05, + "loss": 0.2659, + "step": 1180 + }, + { + "epoch": 0.12089810017271158, + "grad_norm": 3.375, + "learning_rate": 4.8471722952095586e-05, + "loss": 0.1506, + "step": 1190 + }, + { + "epoch": 0.121914050594331, + "grad_norm": 3.34375, + "learning_rate": 4.844384845950312e-05, + "loss": 0.307, + "step": 1200 + }, + { + "epoch": 0.12293000101595042, + "grad_norm": 1.578125, + "learning_rate": 4.841573021789561e-05, + "loss": 0.1952, + "step": 1210 + }, + { + "epoch": 0.12394595143756984, + "grad_norm": 1.2890625, + "learning_rate": 4.838736851962239e-05, + "loss": 0.1779, + "step": 1220 + }, + { + "epoch": 0.12496190185918928, + "grad_norm": 1.265625, + "learning_rate": 4.835876365956408e-05, + "loss": 0.1235, + "step": 1230 + }, + { + "epoch": 0.12597785228080868, + "grad_norm": 1.9609375, + "learning_rate": 4.8329915935129436e-05, + "loss": 0.1876, + "step": 1240 + }, + { + "epoch": 0.12699380270242813, + "grad_norm": 1.6328125, + "learning_rate": 4.830082564625235e-05, + "loss": 0.2188, + "step": 1250 + }, + { + "epoch": 0.12800975312404755, + "grad_norm": 3.96875, + "learning_rate": 4.8271493095388684e-05, + "loss": 0.2622, + "step": 1260 + }, + { + "epoch": 0.12902570354566698, + "grad_norm": 3.765625, + "learning_rate": 4.824191858751312e-05, + "loss": 0.2724, + "step": 1270 + }, + { + "epoch": 0.1300416539672864, + "grad_norm": 5.59375, + "learning_rate": 4.821210243011601e-05, + "loss": 0.2413, + "step": 1280 + }, + { + "epoch": 0.13105760438890582, + "grad_norm": 3.34375, + "learning_rate": 4.818204493320016e-05, + "loss": 0.2618, + "step": 1290 + }, + { + "epoch": 0.13207355481052524, + "grad_norm": 2.78125, + "learning_rate": 4.8151746409277634e-05, + "loss": 0.2295, + "step": 1300 + }, + { + "epoch": 0.13308950523214466, + "grad_norm": 3.1875, + "learning_rate": 4.8121207173366484e-05, + "loss": 0.2733, + "step": 1310 + }, + { + "epoch": 0.13410545565376408, + "grad_norm": 2.28125, + "learning_rate": 4.809042754298746e-05, + "loss": 0.2311, + "step": 1320 + }, + { + "epoch": 0.13512140607538353, + "grad_norm": 2.171875, + "learning_rate": 4.805940783816075e-05, + "loss": 0.2059, + "step": 1330 + }, + { + "epoch": 0.13613735649700295, + "grad_norm": 2.796875, + "learning_rate": 4.8028148381402625e-05, + "loss": 0.2102, + "step": 1340 + }, + { + "epoch": 0.13715330691862238, + "grad_norm": 2.96875, + "learning_rate": 4.7996649497722084e-05, + "loss": 0.2708, + "step": 1350 + }, + { + "epoch": 0.1381692573402418, + "grad_norm": 2.4375, + "learning_rate": 4.7964911514617485e-05, + "loss": 0.2429, + "step": 1360 + }, + { + "epoch": 0.13918520776186122, + "grad_norm": 5.8125, + "learning_rate": 4.793293476207312e-05, + "loss": 0.2725, + "step": 1370 + }, + { + "epoch": 0.14020115818348064, + "grad_norm": 2.40625, + "learning_rate": 4.790071957255585e-05, + "loss": 0.2098, + "step": 1380 + }, + { + "epoch": 0.14121710860510006, + "grad_norm": 4.25, + "learning_rate": 4.786826628101154e-05, + "loss": 0.2101, + "step": 1390 + }, + { + "epoch": 0.14223305902671948, + "grad_norm": 2.578125, + "learning_rate": 4.783557522486167e-05, + "loss": 0.2624, + "step": 1400 + }, + { + "epoch": 0.14324900944833893, + "grad_norm": 3.125, + "learning_rate": 4.780264674399978e-05, + "loss": 0.2518, + "step": 1410 + }, + { + "epoch": 0.14426495986995835, + "grad_norm": 3.671875, + "learning_rate": 4.7769481180787966e-05, + "loss": 0.3112, + "step": 1420 + }, + { + "epoch": 0.14528091029157778, + "grad_norm": 3.984375, + "learning_rate": 4.773607888005327e-05, + "loss": 0.2747, + "step": 1430 + }, + { + "epoch": 0.1462968607131972, + "grad_norm": 3.234375, + "learning_rate": 4.770244018908416e-05, + "loss": 0.1572, + "step": 1440 + }, + { + "epoch": 0.14731281113481662, + "grad_norm": 4.09375, + "learning_rate": 4.766856545762687e-05, + "loss": 0.2148, + "step": 1450 + }, + { + "epoch": 0.14832876155643604, + "grad_norm": 1.6875, + "learning_rate": 4.763445503788178e-05, + "loss": 0.2531, + "step": 1460 + }, + { + "epoch": 0.14934471197805546, + "grad_norm": 2.375, + "learning_rate": 4.760010928449976e-05, + "loss": 0.199, + "step": 1470 + }, + { + "epoch": 0.15036066239967488, + "grad_norm": 4.6875, + "learning_rate": 4.7565528554578485e-05, + "loss": 0.2366, + "step": 1480 + }, + { + "epoch": 0.15137661282129433, + "grad_norm": 5.4375, + "learning_rate": 4.75307132076587e-05, + "loss": 0.1862, + "step": 1490 + }, + { + "epoch": 0.15239256324291375, + "grad_norm": 2.484375, + "learning_rate": 4.749566360572049e-05, + "loss": 0.2143, + "step": 1500 + }, + { + "epoch": 0.15340851366453317, + "grad_norm": 2.1875, + "learning_rate": 4.746038011317955e-05, + "loss": 0.1877, + "step": 1510 + }, + { + "epoch": 0.1544244640861526, + "grad_norm": 2.84375, + "learning_rate": 4.742486309688333e-05, + "loss": 0.2831, + "step": 1520 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 2.015625, + "learning_rate": 4.738911292610732e-05, + "loss": 0.1708, + "step": 1530 + }, + { + "epoch": 0.15645636492939144, + "grad_norm": 3.953125, + "learning_rate": 4.735312997255107e-05, + "loss": 0.192, + "step": 1540 + }, + { + "epoch": 0.15747231535101086, + "grad_norm": 2.09375, + "learning_rate": 4.7316914610334475e-05, + "loss": 0.2586, + "step": 1550 + }, + { + "epoch": 0.15848826577263028, + "grad_norm": 3.6875, + "learning_rate": 4.728046721599378e-05, + "loss": 0.2141, + "step": 1560 + }, + { + "epoch": 0.15950421619424973, + "grad_norm": 2.9375, + "learning_rate": 4.724378816847771e-05, + "loss": 0.193, + "step": 1570 + }, + { + "epoch": 0.16052016661586915, + "grad_norm": 1.5625, + "learning_rate": 4.720687784914352e-05, + "loss": 0.191, + "step": 1580 + }, + { + "epoch": 0.16153611703748857, + "grad_norm": 3.75, + "learning_rate": 4.716973664175304e-05, + "loss": 0.2172, + "step": 1590 + }, + { + "epoch": 0.162552067459108, + "grad_norm": 3.125, + "learning_rate": 4.7132364932468645e-05, + "loss": 0.2134, + "step": 1600 + }, + { + "epoch": 0.16356801788072742, + "grad_norm": 4.09375, + "learning_rate": 4.709476310984932e-05, + "loss": 0.2055, + "step": 1610 + }, + { + "epoch": 0.16458396830234684, + "grad_norm": 3.875, + "learning_rate": 4.705693156484652e-05, + "loss": 0.2136, + "step": 1620 + }, + { + "epoch": 0.16559991872396626, + "grad_norm": 1.1796875, + "learning_rate": 4.7018870690800196e-05, + "loss": 0.1471, + "step": 1630 + }, + { + "epoch": 0.16661586914558568, + "grad_norm": 2.5, + "learning_rate": 4.698058088343465e-05, + "loss": 0.2308, + "step": 1640 + }, + { + "epoch": 0.16763181956720513, + "grad_norm": 1.390625, + "learning_rate": 4.6942062540854425e-05, + "loss": 0.2456, + "step": 1650 + }, + { + "epoch": 0.16864776998882455, + "grad_norm": 3.125, + "learning_rate": 4.69033160635402e-05, + "loss": 0.2654, + "step": 1660 + }, + { + "epoch": 0.16966372041044397, + "grad_norm": 3.984375, + "learning_rate": 4.6864341854344587e-05, + "loss": 0.2226, + "step": 1670 + }, + { + "epoch": 0.1706796708320634, + "grad_norm": 2.328125, + "learning_rate": 4.682514031848795e-05, + "loss": 0.2438, + "step": 1680 + }, + { + "epoch": 0.17169562125368282, + "grad_norm": 3.078125, + "learning_rate": 4.678571186355423e-05, + "loss": 0.1889, + "step": 1690 + }, + { + "epoch": 0.17271157167530224, + "grad_norm": 3.328125, + "learning_rate": 4.6746056899486644e-05, + "loss": 0.2117, + "step": 1700 + }, + { + "epoch": 0.17372752209692166, + "grad_norm": 2.78125, + "learning_rate": 4.67061758385835e-05, + "loss": 0.1953, + "step": 1710 + }, + { + "epoch": 0.17474347251854108, + "grad_norm": 3.09375, + "learning_rate": 4.6666069095493816e-05, + "loss": 0.1844, + "step": 1720 + }, + { + "epoch": 0.17575942294016053, + "grad_norm": 3.234375, + "learning_rate": 4.662573708721309e-05, + "loss": 0.2774, + "step": 1730 + }, + { + "epoch": 0.17677537336177995, + "grad_norm": 4.03125, + "learning_rate": 4.658518023307894e-05, + "loss": 0.2527, + "step": 1740 + }, + { + "epoch": 0.17779132378339937, + "grad_norm": 3.21875, + "learning_rate": 4.654439895476671e-05, + "loss": 0.2164, + "step": 1750 + }, + { + "epoch": 0.1788072742050188, + "grad_norm": 2.390625, + "learning_rate": 4.6503393676285146e-05, + "loss": 0.2424, + "step": 1760 + }, + { + "epoch": 0.17982322462663822, + "grad_norm": 1.8359375, + "learning_rate": 4.646216482397192e-05, + "loss": 0.2428, + "step": 1770 + }, + { + "epoch": 0.18083917504825764, + "grad_norm": 2.796875, + "learning_rate": 4.6420712826489275e-05, + "loss": 0.2155, + "step": 1780 + }, + { + "epoch": 0.18185512546987706, + "grad_norm": 0.69921875, + "learning_rate": 4.6379038114819485e-05, + "loss": 0.1544, + "step": 1790 + }, + { + "epoch": 0.18287107589149648, + "grad_norm": 3.40625, + "learning_rate": 4.6337141122260444e-05, + "loss": 0.2029, + "step": 1800 + }, + { + "epoch": 0.18388702631311593, + "grad_norm": 2.359375, + "learning_rate": 4.629502228442112e-05, + "loss": 0.1489, + "step": 1810 + }, + { + "epoch": 0.18490297673473535, + "grad_norm": 1.4453125, + "learning_rate": 4.6252682039217045e-05, + "loss": 0.2101, + "step": 1820 + }, + { + "epoch": 0.18591892715635477, + "grad_norm": 2.71875, + "learning_rate": 4.621012082686573e-05, + "loss": 0.2076, + "step": 1830 + }, + { + "epoch": 0.1869348775779742, + "grad_norm": 3.0625, + "learning_rate": 4.616733908988216e-05, + "loss": 0.2719, + "step": 1840 + }, + { + "epoch": 0.18795082799959362, + "grad_norm": 1.953125, + "learning_rate": 4.612433727307409e-05, + "loss": 0.2105, + "step": 1850 + }, + { + "epoch": 0.18896677842121304, + "grad_norm": 3.46875, + "learning_rate": 4.608111582353751e-05, + "loss": 0.1877, + "step": 1860 + }, + { + "epoch": 0.18998272884283246, + "grad_norm": 2.546875, + "learning_rate": 4.603767519065197e-05, + "loss": 0.2238, + "step": 1870 + }, + { + "epoch": 0.19099867926445188, + "grad_norm": 1.5703125, + "learning_rate": 4.599401582607589e-05, + "loss": 0.243, + "step": 1880 + }, + { + "epoch": 0.19201462968607133, + "grad_norm": 2.4375, + "learning_rate": 4.595013818374185e-05, + "loss": 0.1867, + "step": 1890 + }, + { + "epoch": 0.19303058010769075, + "grad_norm": 2.203125, + "learning_rate": 4.5906042719851925e-05, + "loss": 0.1994, + "step": 1900 + }, + { + "epoch": 0.19404653052931017, + "grad_norm": 3.984375, + "learning_rate": 4.586172989287291e-05, + "loss": 0.1899, + "step": 1910 + }, + { + "epoch": 0.1950624809509296, + "grad_norm": 2.6875, + "learning_rate": 4.5817200163531534e-05, + "loss": 0.2528, + "step": 1920 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 2.71875, + "learning_rate": 4.577245399480972e-05, + "loss": 0.2336, + "step": 1930 + }, + { + "epoch": 0.19709438179416844, + "grad_norm": 2.640625, + "learning_rate": 4.5727491851939715e-05, + "loss": 0.2204, + "step": 1940 + }, + { + "epoch": 0.19811033221578786, + "grad_norm": 1.78125, + "learning_rate": 4.568231420239929e-05, + "loss": 0.1656, + "step": 1950 + }, + { + "epoch": 0.19912628263740728, + "grad_norm": 3.15625, + "learning_rate": 4.563692151590687e-05, + "loss": 0.2105, + "step": 1960 + }, + { + "epoch": 0.20014223305902673, + "grad_norm": 1.3671875, + "learning_rate": 4.5591314264416666e-05, + "loss": 0.1464, + "step": 1970 + }, + { + "epoch": 0.20115818348064615, + "grad_norm": 4.25, + "learning_rate": 4.554549292211371e-05, + "loss": 0.2103, + "step": 1980 + }, + { + "epoch": 0.20217413390226557, + "grad_norm": 2.984375, + "learning_rate": 4.549945796540901e-05, + "loss": 0.144, + "step": 1990 + }, + { + "epoch": 0.203190084323885, + "grad_norm": 1.859375, + "learning_rate": 4.545320987293453e-05, + "loss": 0.1963, + "step": 2000 + }, + { + "epoch": 0.20420603474550442, + "grad_norm": 1.078125, + "learning_rate": 4.540674912553824e-05, + "loss": 0.2115, + "step": 2010 + }, + { + "epoch": 0.20522198516712384, + "grad_norm": 4.25, + "learning_rate": 4.536007620627911e-05, + "loss": 0.1682, + "step": 2020 + }, + { + "epoch": 0.20623793558874326, + "grad_norm": 2.71875, + "learning_rate": 4.531319160042212e-05, + "loss": 0.1992, + "step": 2030 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 1.2890625, + "learning_rate": 4.5266095795433126e-05, + "loss": 0.1134, + "step": 2040 + }, + { + "epoch": 0.20826983643198213, + "grad_norm": 3.296875, + "learning_rate": 4.5218789280973925e-05, + "loss": 0.1474, + "step": 2050 + }, + { + "epoch": 0.20928578685360155, + "grad_norm": 1.9375, + "learning_rate": 4.5171272548897024e-05, + "loss": 0.1955, + "step": 2060 + }, + { + "epoch": 0.21030173727522097, + "grad_norm": 2.734375, + "learning_rate": 4.512354609324063e-05, + "loss": 0.2042, + "step": 2070 + }, + { + "epoch": 0.2113176876968404, + "grad_norm": 2.921875, + "learning_rate": 4.507561041022347e-05, + "loss": 0.2174, + "step": 2080 + }, + { + "epoch": 0.21233363811845982, + "grad_norm": 2.40625, + "learning_rate": 4.502746599823963e-05, + "loss": 0.2634, + "step": 2090 + }, + { + "epoch": 0.21334958854007924, + "grad_norm": 1.71875, + "learning_rate": 4.497911335785339e-05, + "loss": 0.1884, + "step": 2100 + }, + { + "epoch": 0.21436553896169866, + "grad_norm": 0.79296875, + "learning_rate": 4.4930552991794e-05, + "loss": 0.1872, + "step": 2110 + }, + { + "epoch": 0.21538148938331808, + "grad_norm": 3.171875, + "learning_rate": 4.4881785404950474e-05, + "loss": 0.2233, + "step": 2120 + }, + { + "epoch": 0.21639743980493753, + "grad_norm": 2.59375, + "learning_rate": 4.483281110436631e-05, + "loss": 0.2374, + "step": 2130 + }, + { + "epoch": 0.21741339022655695, + "grad_norm": 3.328125, + "learning_rate": 4.478363059923426e-05, + "loss": 0.2545, + "step": 2140 + }, + { + "epoch": 0.21842934064817637, + "grad_norm": 2.3125, + "learning_rate": 4.4734244400891014e-05, + "loss": 0.2063, + "step": 2150 + }, + { + "epoch": 0.2194452910697958, + "grad_norm": 3.40625, + "learning_rate": 4.4684653022811865e-05, + "loss": 0.1219, + "step": 2160 + }, + { + "epoch": 0.22046124149141522, + "grad_norm": 4.1875, + "learning_rate": 4.463485698060541e-05, + "loss": 0.2805, + "step": 2170 + }, + { + "epoch": 0.22147719191303464, + "grad_norm": 2.3125, + "learning_rate": 4.458485679200814e-05, + "loss": 0.1998, + "step": 2180 + }, + { + "epoch": 0.22249314233465406, + "grad_norm": 3.578125, + "learning_rate": 4.453465297687912e-05, + "loss": 0.2489, + "step": 2190 + }, + { + "epoch": 0.22350909275627348, + "grad_norm": 2.59375, + "learning_rate": 4.448424605719452e-05, + "loss": 0.2731, + "step": 2200 + }, + { + "epoch": 0.22452504317789293, + "grad_norm": 3.28125, + "learning_rate": 4.443363655704224e-05, + "loss": 0.2425, + "step": 2210 + }, + { + "epoch": 0.22554099359951235, + "grad_norm": 2.78125, + "learning_rate": 4.438282500261641e-05, + "loss": 0.2938, + "step": 2220 + }, + { + "epoch": 0.22655694402113177, + "grad_norm": 1.1953125, + "learning_rate": 4.433181192221197e-05, + "loss": 0.1728, + "step": 2230 + }, + { + "epoch": 0.2275728944427512, + "grad_norm": 1.34375, + "learning_rate": 4.4280597846219155e-05, + "loss": 0.216, + "step": 2240 + }, + { + "epoch": 0.22858884486437062, + "grad_norm": 1.8515625, + "learning_rate": 4.422918330711796e-05, + "loss": 0.1612, + "step": 2250 + }, + { + "epoch": 0.22960479528599004, + "grad_norm": 1.90625, + "learning_rate": 4.417756883947263e-05, + "loss": 0.107, + "step": 2260 + }, + { + "epoch": 0.23062074570760946, + "grad_norm": 3.375, + "learning_rate": 4.412575497992611e-05, + "loss": 0.1756, + "step": 2270 + }, + { + "epoch": 0.23163669612922888, + "grad_norm": 4.375, + "learning_rate": 4.407374226719445e-05, + "loss": 0.234, + "step": 2280 + }, + { + "epoch": 0.23265264655084833, + "grad_norm": 3.25, + "learning_rate": 4.402153124206119e-05, + "loss": 0.2144, + "step": 2290 + }, + { + "epoch": 0.23366859697246775, + "grad_norm": 1.703125, + "learning_rate": 4.396912244737173e-05, + "loss": 0.1696, + "step": 2300 + }, + { + "epoch": 0.23468454739408717, + "grad_norm": 2.84375, + "learning_rate": 4.391651642802778e-05, + "loss": 0.2506, + "step": 2310 + }, + { + "epoch": 0.2357004978157066, + "grad_norm": 4.5, + "learning_rate": 4.386371373098155e-05, + "loss": 0.1686, + "step": 2320 + }, + { + "epoch": 0.23671644823732602, + "grad_norm": 2.515625, + "learning_rate": 4.381071490523018e-05, + "loss": 0.2403, + "step": 2330 + }, + { + "epoch": 0.23773239865894544, + "grad_norm": 4.4375, + "learning_rate": 4.3757520501809955e-05, + "loss": 0.1611, + "step": 2340 + }, + { + "epoch": 0.23874834908056486, + "grad_norm": 1.609375, + "learning_rate": 4.370413107379065e-05, + "loss": 0.1698, + "step": 2350 + }, + { + "epoch": 0.23976429950218428, + "grad_norm": 4.96875, + "learning_rate": 4.36505471762697e-05, + "loss": 0.1928, + "step": 2360 + }, + { + "epoch": 0.24078024992380373, + "grad_norm": 0.8984375, + "learning_rate": 4.3596769366366474e-05, + "loss": 0.2035, + "step": 2370 + }, + { + "epoch": 0.24179620034542315, + "grad_norm": 5.75, + "learning_rate": 4.354279820321649e-05, + "loss": 0.16, + "step": 2380 + }, + { + "epoch": 0.24281215076704257, + "grad_norm": 1.9453125, + "learning_rate": 4.34886342479656e-05, + "loss": 0.1851, + "step": 2390 + }, + { + "epoch": 0.243828101188662, + "grad_norm": 1.015625, + "learning_rate": 4.34342780637641e-05, + "loss": 0.1726, + "step": 2400 + }, + { + "epoch": 0.24484405161028142, + "grad_norm": 4.59375, + "learning_rate": 4.337973021576095e-05, + "loss": 0.2847, + "step": 2410 + }, + { + "epoch": 0.24586000203190084, + "grad_norm": 1.03125, + "learning_rate": 4.3324991271097846e-05, + "loss": 0.2528, + "step": 2420 + }, + { + "epoch": 0.24687595245352026, + "grad_norm": 2.1875, + "learning_rate": 4.3270061798903374e-05, + "loss": 0.1573, + "step": 2430 + }, + { + "epoch": 0.24789190287513968, + "grad_norm": 0.98046875, + "learning_rate": 4.321494237028701e-05, + "loss": 0.1703, + "step": 2440 + }, + { + "epoch": 0.24890785329675913, + "grad_norm": 3.8125, + "learning_rate": 4.31596335583333e-05, + "loss": 0.2613, + "step": 2450 + }, + { + "epoch": 0.24992380371837855, + "grad_norm": 4.0625, + "learning_rate": 4.310413593809579e-05, + "loss": 0.22, + "step": 2460 + }, + { + "epoch": 0.250939754139998, + "grad_norm": 3.15625, + "learning_rate": 4.304845008659108e-05, + "loss": 0.1263, + "step": 2470 + }, + { + "epoch": 0.25195570456161737, + "grad_norm": 3.046875, + "learning_rate": 4.2992576582792895e-05, + "loss": 0.1639, + "step": 2480 + }, + { + "epoch": 0.2529716549832368, + "grad_norm": 9.8125, + "learning_rate": 4.293651600762595e-05, + "loss": 0.2681, + "step": 2490 + }, + { + "epoch": 0.25398760540485626, + "grad_norm": 3.734375, + "learning_rate": 4.288026894395999e-05, + "loss": 0.2292, + "step": 2500 + } + ], + "logging_steps": 10, + "max_steps": 9843, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoints/checkpoint-2500/training_args.bin b/checkpoints/checkpoint-2500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bb0a4ec10ab29ca1942aecdfa4212d352e373f1 --- /dev/null +++ b/checkpoints/checkpoint-2500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 +size 4984 diff --git a/checkpoints/checkpoint-3000/.DS_Store b/checkpoints/checkpoint-3000/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 Binary files /dev/null and b/checkpoints/checkpoint-3000/.DS_Store differ diff --git a/checkpoints/checkpoint-3000/adapter_config.json b/checkpoints/checkpoint-3000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70 --- /dev/null +++ b/checkpoints/checkpoint-3000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "gaussian", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoints/checkpoint-3000/adapter_model.safetensors b/checkpoints/checkpoint-3000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f00b979e893c419aa35f668c6083983f38a702aa --- /dev/null +++ b/checkpoints/checkpoint-3000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24a237f0667aea50351b9a62bb2ddead2e09a567815c526d250be96397fa3798 +size 5919456 diff --git a/checkpoints/checkpoint-3000/optimizer.pt b/checkpoints/checkpoint-3000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5dfd65a95d4b75ec54a7c63d0fd72ce2529b8de1 --- /dev/null +++ b/checkpoints/checkpoint-3000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb7f56c657475efd0fee1d492240e0c5e5d10ed8ec46630177e04192f2950d28 +size 11930938 diff --git a/checkpoints/checkpoint-3000/rng_state_0.pth b/checkpoints/checkpoint-3000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..fd0da84997e5db079fe7df2a31aeef3efb336831 --- /dev/null +++ b/checkpoints/checkpoint-3000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75565d94ecdf86007209f425495e6767052cfd0684203dcaf72c23b7c2dc2740 +size 15024 diff --git a/checkpoints/checkpoint-3000/rng_state_1.pth b/checkpoints/checkpoint-3000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..9582aaaef271b2855ca000b54510b991df4f866d --- /dev/null +++ b/checkpoints/checkpoint-3000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f986f3238448cdfd898312547f70e6317c7c4c40d4f49e7782f5380769c5f64 +size 15024 diff --git a/checkpoints/checkpoint-3000/rng_state_2.pth b/checkpoints/checkpoint-3000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..0673c505e8967506093c68f5b6c6140a292635b5 --- /dev/null +++ b/checkpoints/checkpoint-3000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:113f522722f00018489ce132b51c24506ef9b8b849c0c2a783913565b35f6cba +size 15024 diff --git a/checkpoints/checkpoint-3000/rng_state_3.pth b/checkpoints/checkpoint-3000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..af1dca7b2f752be096e9e7b1b84f0d1897e9f780 --- /dev/null +++ b/checkpoints/checkpoint-3000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1134f64cfe9f598e01f68bfb6bc3e69460c4b0c7ef7617bf88d7f539babada99 +size 15024 diff --git a/checkpoints/checkpoint-3000/scheduler.pt b/checkpoints/checkpoint-3000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..78eed6ed36accb49b60ba0c6db316b6bf427c96b --- /dev/null +++ b/checkpoints/checkpoint-3000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c5a5e7ee1063db317ee978d346cf3b726f5d9858cb8bd50568280ada9f34910 +size 1064 diff --git a/checkpoints/checkpoint-3000/trainer_state.json b/checkpoints/checkpoint-3000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b813032acb2fc29774636a82567e34aeaaddb1b4 --- /dev/null +++ b/checkpoints/checkpoint-3000/trainer_state.json @@ -0,0 +1,2121 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3047851264858275, + "eval_steps": 500, + "global_step": 3000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001015950421619425, + "grad_norm": 17.625, + "learning_rate": 5e-06, + "loss": 3.4264, + "step": 10 + }, + { + "epoch": 0.00203190084323885, + "grad_norm": 12.5625, + "learning_rate": 1e-05, + "loss": 3.432, + "step": 20 + }, + { + "epoch": 0.003047851264858275, + "grad_norm": 14.0625, + "learning_rate": 1.5e-05, + "loss": 3.23, + "step": 30 + }, + { + "epoch": 0.0040638016864777, + "grad_norm": 12.4375, + "learning_rate": 2e-05, + "loss": 2.9762, + "step": 40 + }, + { + "epoch": 0.005079752108097125, + "grad_norm": 10.0625, + "learning_rate": 2.5e-05, + "loss": 2.6173, + "step": 50 + }, + { + "epoch": 0.00609570252971655, + "grad_norm": 10.1875, + "learning_rate": 3e-05, + "loss": 2.2004, + "step": 60 + }, + { + "epoch": 0.007111652951335975, + "grad_norm": 7.03125, + "learning_rate": 3.5e-05, + "loss": 1.4176, + "step": 70 + }, + { + "epoch": 0.0081276033729554, + "grad_norm": 4.375, + "learning_rate": 4e-05, + "loss": 1.0122, + "step": 80 + }, + { + "epoch": 0.009143553794574825, + "grad_norm": 6.5625, + "learning_rate": 4.5e-05, + "loss": 0.9116, + "step": 90 + }, + { + "epoch": 0.01015950421619425, + "grad_norm": 5.28125, + "learning_rate": 5e-05, + "loss": 0.6832, + "step": 100 + }, + { + "epoch": 0.011175454637813675, + "grad_norm": 5.5, + "learning_rate": 4.9999870035728426e-05, + "loss": 0.7355, + "step": 110 + }, + { + "epoch": 0.0121914050594331, + "grad_norm": 5.1875, + "learning_rate": 4.9999480144264944e-05, + "loss": 0.6673, + "step": 120 + }, + { + "epoch": 0.013207355481052525, + "grad_norm": 4.5, + "learning_rate": 4.9998830329663314e-05, + "loss": 0.6792, + "step": 130 + }, + { + "epoch": 0.01422330590267195, + "grad_norm": 3.9375, + "learning_rate": 4.9997920598679756e-05, + "loss": 0.6207, + "step": 140 + }, + { + "epoch": 0.015239256324291375, + "grad_norm": 3.15625, + "learning_rate": 4.999675096077286e-05, + "loss": 0.483, + "step": 150 + }, + { + "epoch": 0.0162552067459108, + "grad_norm": 5.28125, + "learning_rate": 4.999532142810354e-05, + "loss": 0.5319, + "step": 160 + }, + { + "epoch": 0.017271157167530225, + "grad_norm": 4.59375, + "learning_rate": 4.999363201553483e-05, + "loss": 0.6052, + "step": 170 + }, + { + "epoch": 0.01828710758914965, + "grad_norm": 5.03125, + "learning_rate": 4.9991682740631794e-05, + "loss": 0.4258, + "step": 180 + }, + { + "epoch": 0.019303058010769075, + "grad_norm": 3.859375, + "learning_rate": 4.998947362366133e-05, + "loss": 0.4309, + "step": 190 + }, + { + "epoch": 0.0203190084323885, + "grad_norm": 3.328125, + "learning_rate": 4.998700468759193e-05, + "loss": 0.3957, + "step": 200 + }, + { + "epoch": 0.021334958854007924, + "grad_norm": 4.9375, + "learning_rate": 4.9984275958093475e-05, + "loss": 0.4777, + "step": 210 + }, + { + "epoch": 0.02235090927562735, + "grad_norm": 4.78125, + "learning_rate": 4.998128746353695e-05, + "loss": 0.3549, + "step": 220 + }, + { + "epoch": 0.023366859697246774, + "grad_norm": 4.0625, + "learning_rate": 4.997803923499417e-05, + "loss": 0.4447, + "step": 230 + }, + { + "epoch": 0.0243828101188662, + "grad_norm": 6.375, + "learning_rate": 4.99745313062374e-05, + "loss": 0.3808, + "step": 240 + }, + { + "epoch": 0.025398760540485624, + "grad_norm": 3.59375, + "learning_rate": 4.99707637137391e-05, + "loss": 0.3827, + "step": 250 + }, + { + "epoch": 0.02641471096210505, + "grad_norm": 3.015625, + "learning_rate": 4.996673649667145e-05, + "loss": 0.3694, + "step": 260 + }, + { + "epoch": 0.027430661383724474, + "grad_norm": 2.296875, + "learning_rate": 4.9962449696906e-05, + "loss": 0.3586, + "step": 270 + }, + { + "epoch": 0.0284466118053439, + "grad_norm": 4.125, + "learning_rate": 4.9957903359013214e-05, + "loss": 0.3832, + "step": 280 + }, + { + "epoch": 0.029462562226963324, + "grad_norm": 3.296875, + "learning_rate": 4.995309753026201e-05, + "loss": 0.328, + "step": 290 + }, + { + "epoch": 0.03047851264858275, + "grad_norm": 4.5, + "learning_rate": 4.994803226061927e-05, + "loss": 0.3667, + "step": 300 + }, + { + "epoch": 0.03149446307020217, + "grad_norm": 4.3125, + "learning_rate": 4.994270760274933e-05, + "loss": 0.3811, + "step": 310 + }, + { + "epoch": 0.0325104134918216, + "grad_norm": 3.421875, + "learning_rate": 4.99371236120134e-05, + "loss": 0.3065, + "step": 320 + }, + { + "epoch": 0.03352636391344102, + "grad_norm": 4.6875, + "learning_rate": 4.993128034646902e-05, + "loss": 0.4177, + "step": 330 + }, + { + "epoch": 0.03454231433506045, + "grad_norm": 3.046875, + "learning_rate": 4.992517786686947e-05, + "loss": 0.33, + "step": 340 + }, + { + "epoch": 0.03555826475667987, + "grad_norm": 1.8828125, + "learning_rate": 4.9918816236663077e-05, + "loss": 0.3287, + "step": 350 + }, + { + "epoch": 0.0365742151782993, + "grad_norm": 3.8125, + "learning_rate": 4.991219552199262e-05, + "loss": 0.2934, + "step": 360 + }, + { + "epoch": 0.03759016559991872, + "grad_norm": 4.28125, + "learning_rate": 4.99053157916946e-05, + "loss": 0.3176, + "step": 370 + }, + { + "epoch": 0.03860611602153815, + "grad_norm": 2.609375, + "learning_rate": 4.989817711729856e-05, + "loss": 0.3318, + "step": 380 + }, + { + "epoch": 0.03962206644315757, + "grad_norm": 2.375, + "learning_rate": 4.98907795730263e-05, + "loss": 0.3234, + "step": 390 + }, + { + "epoch": 0.040638016864777, + "grad_norm": 4.46875, + "learning_rate": 4.988312323579114e-05, + "loss": 0.267, + "step": 400 + }, + { + "epoch": 0.04165396728639642, + "grad_norm": 3.75, + "learning_rate": 4.98752081851971e-05, + "loss": 0.3081, + "step": 410 + }, + { + "epoch": 0.04266991770801585, + "grad_norm": 2.203125, + "learning_rate": 4.986703450353809e-05, + "loss": 0.2917, + "step": 420 + }, + { + "epoch": 0.04368586812963527, + "grad_norm": 1.6015625, + "learning_rate": 4.985860227579703e-05, + "loss": 0.2805, + "step": 430 + }, + { + "epoch": 0.0447018185512547, + "grad_norm": 3.140625, + "learning_rate": 4.984991158964499e-05, + "loss": 0.3534, + "step": 440 + }, + { + "epoch": 0.04571776897287412, + "grad_norm": 3.296875, + "learning_rate": 4.9840962535440265e-05, + "loss": 0.335, + "step": 450 + }, + { + "epoch": 0.04673371939449355, + "grad_norm": 3.25, + "learning_rate": 4.983175520622744e-05, + "loss": 0.2544, + "step": 460 + }, + { + "epoch": 0.04774966981611297, + "grad_norm": 2.25, + "learning_rate": 4.982228969773642e-05, + "loss": 0.3449, + "step": 470 + }, + { + "epoch": 0.0487656202377324, + "grad_norm": 4.9375, + "learning_rate": 4.9812566108381435e-05, + "loss": 0.2964, + "step": 480 + }, + { + "epoch": 0.04978157065935182, + "grad_norm": 1.5703125, + "learning_rate": 4.9802584539260035e-05, + "loss": 0.2799, + "step": 490 + }, + { + "epoch": 0.05079752108097125, + "grad_norm": 2.828125, + "learning_rate": 4.979234509415199e-05, + "loss": 0.3231, + "step": 500 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 2.9375, + "learning_rate": 4.978184787951828e-05, + "loss": 0.2943, + "step": 510 + }, + { + "epoch": 0.0528294219242101, + "grad_norm": 2.34375, + "learning_rate": 4.977109300449992e-05, + "loss": 0.2705, + "step": 520 + }, + { + "epoch": 0.05384537234582952, + "grad_norm": 3.140625, + "learning_rate": 4.9760080580916876e-05, + "loss": 0.2998, + "step": 530 + }, + { + "epoch": 0.05486132276744895, + "grad_norm": 3.5625, + "learning_rate": 4.974881072326688e-05, + "loss": 0.2595, + "step": 540 + }, + { + "epoch": 0.05587727318906837, + "grad_norm": 4.25, + "learning_rate": 4.9737283548724236e-05, + "loss": 0.2803, + "step": 550 + }, + { + "epoch": 0.0568932236106878, + "grad_norm": 4.0625, + "learning_rate": 4.97254991771386e-05, + "loss": 0.3511, + "step": 560 + }, + { + "epoch": 0.05790917403230722, + "grad_norm": 2.515625, + "learning_rate": 4.971345773103377e-05, + "loss": 0.312, + "step": 570 + }, + { + "epoch": 0.05892512445392665, + "grad_norm": 3.21875, + "learning_rate": 4.9701159335606365e-05, + "loss": 0.2482, + "step": 580 + }, + { + "epoch": 0.05994107487554607, + "grad_norm": 5.5, + "learning_rate": 4.968860411872454e-05, + "loss": 0.2537, + "step": 590 + }, + { + "epoch": 0.0609570252971655, + "grad_norm": 3.546875, + "learning_rate": 4.967579221092666e-05, + "loss": 0.3125, + "step": 600 + }, + { + "epoch": 0.06197297571878492, + "grad_norm": 2.984375, + "learning_rate": 4.966272374541996e-05, + "loss": 0.2354, + "step": 610 + }, + { + "epoch": 0.06298892614040434, + "grad_norm": 3.6875, + "learning_rate": 4.964939885807912e-05, + "loss": 0.3213, + "step": 620 + }, + { + "epoch": 0.06400487656202378, + "grad_norm": 2.140625, + "learning_rate": 4.9635817687444876e-05, + "loss": 0.3003, + "step": 630 + }, + { + "epoch": 0.0650208269836432, + "grad_norm": 3.484375, + "learning_rate": 4.962198037472259e-05, + "loss": 0.2996, + "step": 640 + }, + { + "epoch": 0.06603677740526262, + "grad_norm": 3.21875, + "learning_rate": 4.9607887063780776e-05, + "loss": 0.2257, + "step": 650 + }, + { + "epoch": 0.06705272782688204, + "grad_norm": 5.375, + "learning_rate": 4.9593537901149564e-05, + "loss": 0.223, + "step": 660 + }, + { + "epoch": 0.06806867824850148, + "grad_norm": 4.1875, + "learning_rate": 4.957893303601924e-05, + "loss": 0.3407, + "step": 670 + }, + { + "epoch": 0.0690846286701209, + "grad_norm": 3.328125, + "learning_rate": 4.956407262023866e-05, + "loss": 0.2589, + "step": 680 + }, + { + "epoch": 0.07010057909174032, + "grad_norm": 2.953125, + "learning_rate": 4.954895680831367e-05, + "loss": 0.2949, + "step": 690 + }, + { + "epoch": 0.07111652951335974, + "grad_norm": 4.0625, + "learning_rate": 4.9533585757405506e-05, + "loss": 0.2995, + "step": 700 + }, + { + "epoch": 0.07213247993497918, + "grad_norm": 4.625, + "learning_rate": 4.951795962732917e-05, + "loss": 0.2894, + "step": 710 + }, + { + "epoch": 0.0731484303565986, + "grad_norm": 3.0, + "learning_rate": 4.9502078580551755e-05, + "loss": 0.3082, + "step": 720 + }, + { + "epoch": 0.07416438077821802, + "grad_norm": 3.65625, + "learning_rate": 4.9485942782190734e-05, + "loss": 0.2308, + "step": 730 + }, + { + "epoch": 0.07518033119983744, + "grad_norm": 4.78125, + "learning_rate": 4.9469552400012306e-05, + "loss": 0.2272, + "step": 740 + }, + { + "epoch": 0.07619628162145688, + "grad_norm": 4.25, + "learning_rate": 4.94529076044296e-05, + "loss": 0.2701, + "step": 750 + }, + { + "epoch": 0.0772122320430763, + "grad_norm": 3.140625, + "learning_rate": 4.94360085685009e-05, + "loss": 0.2686, + "step": 760 + }, + { + "epoch": 0.07822818246469572, + "grad_norm": 0.765625, + "learning_rate": 4.9418855467927894e-05, + "loss": 0.2051, + "step": 770 + }, + { + "epoch": 0.07924413288631514, + "grad_norm": 1.796875, + "learning_rate": 4.940144848105379e-05, + "loss": 0.2267, + "step": 780 + }, + { + "epoch": 0.08026008330793458, + "grad_norm": 4.5625, + "learning_rate": 4.93837877888615e-05, + "loss": 0.2597, + "step": 790 + }, + { + "epoch": 0.081276033729554, + "grad_norm": 3.03125, + "learning_rate": 4.9365873574971745e-05, + "loss": 0.3701, + "step": 800 + }, + { + "epoch": 0.08229198415117342, + "grad_norm": 4.5625, + "learning_rate": 4.9347706025641136e-05, + "loss": 0.2559, + "step": 810 + }, + { + "epoch": 0.08330793457279284, + "grad_norm": 3.90625, + "learning_rate": 4.9329285329760275e-05, + "loss": 0.2799, + "step": 820 + }, + { + "epoch": 0.08432388499441228, + "grad_norm": 3.140625, + "learning_rate": 4.9310611678851735e-05, + "loss": 0.2866, + "step": 830 + }, + { + "epoch": 0.0853398354160317, + "grad_norm": 2.46875, + "learning_rate": 4.929168526706811e-05, + "loss": 0.3105, + "step": 840 + }, + { + "epoch": 0.08635578583765112, + "grad_norm": 13.625, + "learning_rate": 4.927250629119e-05, + "loss": 0.2454, + "step": 850 + }, + { + "epoch": 0.08737173625927054, + "grad_norm": 3.921875, + "learning_rate": 4.9253074950623925e-05, + "loss": 0.2424, + "step": 860 + }, + { + "epoch": 0.08838768668088998, + "grad_norm": 2.90625, + "learning_rate": 4.9233391447400286e-05, + "loss": 0.2481, + "step": 870 + }, + { + "epoch": 0.0894036371025094, + "grad_norm": 2.96875, + "learning_rate": 4.921345598617125e-05, + "loss": 0.2231, + "step": 880 + }, + { + "epoch": 0.09041958752412882, + "grad_norm": 5.375, + "learning_rate": 4.9193268774208654e-05, + "loss": 0.3447, + "step": 890 + }, + { + "epoch": 0.09143553794574824, + "grad_norm": 2.0, + "learning_rate": 4.9172830021401785e-05, + "loss": 0.229, + "step": 900 + }, + { + "epoch": 0.09245148836736768, + "grad_norm": 3.1875, + "learning_rate": 4.9152139940255245e-05, + "loss": 0.2122, + "step": 910 + }, + { + "epoch": 0.0934674387889871, + "grad_norm": 3.40625, + "learning_rate": 4.913119874588677e-05, + "loss": 0.2386, + "step": 920 + }, + { + "epoch": 0.09448338921060652, + "grad_norm": 1.4609375, + "learning_rate": 4.911000665602489e-05, + "loss": 0.1944, + "step": 930 + }, + { + "epoch": 0.09549933963222594, + "grad_norm": 5.0625, + "learning_rate": 4.9088563891006786e-05, + "loss": 0.2038, + "step": 940 + }, + { + "epoch": 0.09651529005384538, + "grad_norm": 4.53125, + "learning_rate": 4.906687067377592e-05, + "loss": 0.3122, + "step": 950 + }, + { + "epoch": 0.0975312404754648, + "grad_norm": 2.84375, + "learning_rate": 4.904492722987976e-05, + "loss": 0.3157, + "step": 960 + }, + { + "epoch": 0.09854719089708422, + "grad_norm": 2.171875, + "learning_rate": 4.902273378746738e-05, + "loss": 0.3077, + "step": 970 + }, + { + "epoch": 0.09956314131870364, + "grad_norm": 2.84375, + "learning_rate": 4.9000290577287165e-05, + "loss": 0.2756, + "step": 980 + }, + { + "epoch": 0.10057909174032308, + "grad_norm": 0.99609375, + "learning_rate": 4.897759783268434e-05, + "loss": 0.2915, + "step": 990 + }, + { + "epoch": 0.1015950421619425, + "grad_norm": 3.53125, + "learning_rate": 4.895465578959859e-05, + "loss": 0.2052, + "step": 1000 + }, + { + "epoch": 0.10261099258356192, + "grad_norm": 4.0, + "learning_rate": 4.893146468656159e-05, + "loss": 0.2499, + "step": 1010 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 1.65625, + "learning_rate": 4.890802476469452e-05, + "loss": 0.278, + "step": 1020 + }, + { + "epoch": 0.10464289342680078, + "grad_norm": 3.625, + "learning_rate": 4.888433626770558e-05, + "loss": 0.2143, + "step": 1030 + }, + { + "epoch": 0.1056588438484202, + "grad_norm": 5.0625, + "learning_rate": 4.886039944188741e-05, + "loss": 0.2878, + "step": 1040 + }, + { + "epoch": 0.10667479427003962, + "grad_norm": 4.5, + "learning_rate": 4.883621453611461e-05, + "loss": 0.2744, + "step": 1050 + }, + { + "epoch": 0.10769074469165904, + "grad_norm": 4.5625, + "learning_rate": 4.881178180184106e-05, + "loss": 0.2734, + "step": 1060 + }, + { + "epoch": 0.10870669511327848, + "grad_norm": 3.125, + "learning_rate": 4.878710149309735e-05, + "loss": 0.3574, + "step": 1070 + }, + { + "epoch": 0.1097226455348979, + "grad_norm": 3.0625, + "learning_rate": 4.876217386648816e-05, + "loss": 0.2625, + "step": 1080 + }, + { + "epoch": 0.11073859595651732, + "grad_norm": 4.0625, + "learning_rate": 4.873699918118955e-05, + "loss": 0.2437, + "step": 1090 + }, + { + "epoch": 0.11175454637813674, + "grad_norm": 1.59375, + "learning_rate": 4.87115776989463e-05, + "loss": 0.2051, + "step": 1100 + }, + { + "epoch": 0.11277049679975618, + "grad_norm": 4.375, + "learning_rate": 4.8685909684069153e-05, + "loss": 0.1727, + "step": 1110 + }, + { + "epoch": 0.1137864472213756, + "grad_norm": 2.28125, + "learning_rate": 4.865999540343211e-05, + "loss": 0.2256, + "step": 1120 + }, + { + "epoch": 0.11480239764299502, + "grad_norm": 2.265625, + "learning_rate": 4.86338351264696e-05, + "loss": 0.3529, + "step": 1130 + }, + { + "epoch": 0.11581834806461444, + "grad_norm": 2.34375, + "learning_rate": 4.8607429125173754e-05, + "loss": 0.2113, + "step": 1140 + }, + { + "epoch": 0.11683429848623388, + "grad_norm": 0.7578125, + "learning_rate": 4.858077767409149e-05, + "loss": 0.2759, + "step": 1150 + }, + { + "epoch": 0.1178502489078533, + "grad_norm": 3.640625, + "learning_rate": 4.855388105032174e-05, + "loss": 0.2482, + "step": 1160 + }, + { + "epoch": 0.11886619932947272, + "grad_norm": 3.5, + "learning_rate": 4.852673953351249e-05, + "loss": 0.1865, + "step": 1170 + }, + { + "epoch": 0.11988214975109214, + "grad_norm": 3.75, + "learning_rate": 4.849935340585796e-05, + "loss": 0.2659, + "step": 1180 + }, + { + "epoch": 0.12089810017271158, + "grad_norm": 3.375, + "learning_rate": 4.8471722952095586e-05, + "loss": 0.1506, + "step": 1190 + }, + { + "epoch": 0.121914050594331, + "grad_norm": 3.34375, + "learning_rate": 4.844384845950312e-05, + "loss": 0.307, + "step": 1200 + }, + { + "epoch": 0.12293000101595042, + "grad_norm": 1.578125, + "learning_rate": 4.841573021789561e-05, + "loss": 0.1952, + "step": 1210 + }, + { + "epoch": 0.12394595143756984, + "grad_norm": 1.2890625, + "learning_rate": 4.838736851962239e-05, + "loss": 0.1779, + "step": 1220 + }, + { + "epoch": 0.12496190185918928, + "grad_norm": 1.265625, + "learning_rate": 4.835876365956408e-05, + "loss": 0.1235, + "step": 1230 + }, + { + "epoch": 0.12597785228080868, + "grad_norm": 1.9609375, + "learning_rate": 4.8329915935129436e-05, + "loss": 0.1876, + "step": 1240 + }, + { + "epoch": 0.12699380270242813, + "grad_norm": 1.6328125, + "learning_rate": 4.830082564625235e-05, + "loss": 0.2188, + "step": 1250 + }, + { + "epoch": 0.12800975312404755, + "grad_norm": 3.96875, + "learning_rate": 4.8271493095388684e-05, + "loss": 0.2622, + "step": 1260 + }, + { + "epoch": 0.12902570354566698, + "grad_norm": 3.765625, + "learning_rate": 4.824191858751312e-05, + "loss": 0.2724, + "step": 1270 + }, + { + "epoch": 0.1300416539672864, + "grad_norm": 5.59375, + "learning_rate": 4.821210243011601e-05, + "loss": 0.2413, + "step": 1280 + }, + { + "epoch": 0.13105760438890582, + "grad_norm": 3.34375, + "learning_rate": 4.818204493320016e-05, + "loss": 0.2618, + "step": 1290 + }, + { + "epoch": 0.13207355481052524, + "grad_norm": 2.78125, + "learning_rate": 4.8151746409277634e-05, + "loss": 0.2295, + "step": 1300 + }, + { + "epoch": 0.13308950523214466, + "grad_norm": 3.1875, + "learning_rate": 4.8121207173366484e-05, + "loss": 0.2733, + "step": 1310 + }, + { + "epoch": 0.13410545565376408, + "grad_norm": 2.28125, + "learning_rate": 4.809042754298746e-05, + "loss": 0.2311, + "step": 1320 + }, + { + "epoch": 0.13512140607538353, + "grad_norm": 2.171875, + "learning_rate": 4.805940783816075e-05, + "loss": 0.2059, + "step": 1330 + }, + { + "epoch": 0.13613735649700295, + "grad_norm": 2.796875, + "learning_rate": 4.8028148381402625e-05, + "loss": 0.2102, + "step": 1340 + }, + { + "epoch": 0.13715330691862238, + "grad_norm": 2.96875, + "learning_rate": 4.7996649497722084e-05, + "loss": 0.2708, + "step": 1350 + }, + { + "epoch": 0.1381692573402418, + "grad_norm": 2.4375, + "learning_rate": 4.7964911514617485e-05, + "loss": 0.2429, + "step": 1360 + }, + { + "epoch": 0.13918520776186122, + "grad_norm": 5.8125, + "learning_rate": 4.793293476207312e-05, + "loss": 0.2725, + "step": 1370 + }, + { + "epoch": 0.14020115818348064, + "grad_norm": 2.40625, + "learning_rate": 4.790071957255585e-05, + "loss": 0.2098, + "step": 1380 + }, + { + "epoch": 0.14121710860510006, + "grad_norm": 4.25, + "learning_rate": 4.786826628101154e-05, + "loss": 0.2101, + "step": 1390 + }, + { + "epoch": 0.14223305902671948, + "grad_norm": 2.578125, + "learning_rate": 4.783557522486167e-05, + "loss": 0.2624, + "step": 1400 + }, + { + "epoch": 0.14324900944833893, + "grad_norm": 3.125, + "learning_rate": 4.780264674399978e-05, + "loss": 0.2518, + "step": 1410 + }, + { + "epoch": 0.14426495986995835, + "grad_norm": 3.671875, + "learning_rate": 4.7769481180787966e-05, + "loss": 0.3112, + "step": 1420 + }, + { + "epoch": 0.14528091029157778, + "grad_norm": 3.984375, + "learning_rate": 4.773607888005327e-05, + "loss": 0.2747, + "step": 1430 + }, + { + "epoch": 0.1462968607131972, + "grad_norm": 3.234375, + "learning_rate": 4.770244018908416e-05, + "loss": 0.1572, + "step": 1440 + }, + { + "epoch": 0.14731281113481662, + "grad_norm": 4.09375, + "learning_rate": 4.766856545762687e-05, + "loss": 0.2148, + "step": 1450 + }, + { + "epoch": 0.14832876155643604, + "grad_norm": 1.6875, + "learning_rate": 4.763445503788178e-05, + "loss": 0.2531, + "step": 1460 + }, + { + "epoch": 0.14934471197805546, + "grad_norm": 2.375, + "learning_rate": 4.760010928449976e-05, + "loss": 0.199, + "step": 1470 + }, + { + "epoch": 0.15036066239967488, + "grad_norm": 4.6875, + "learning_rate": 4.7565528554578485e-05, + "loss": 0.2366, + "step": 1480 + }, + { + "epoch": 0.15137661282129433, + "grad_norm": 5.4375, + "learning_rate": 4.75307132076587e-05, + "loss": 0.1862, + "step": 1490 + }, + { + "epoch": 0.15239256324291375, + "grad_norm": 2.484375, + "learning_rate": 4.749566360572049e-05, + "loss": 0.2143, + "step": 1500 + }, + { + "epoch": 0.15340851366453317, + "grad_norm": 2.1875, + "learning_rate": 4.746038011317955e-05, + "loss": 0.1877, + "step": 1510 + }, + { + "epoch": 0.1544244640861526, + "grad_norm": 2.84375, + "learning_rate": 4.742486309688333e-05, + "loss": 0.2831, + "step": 1520 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 2.015625, + "learning_rate": 4.738911292610732e-05, + "loss": 0.1708, + "step": 1530 + }, + { + "epoch": 0.15645636492939144, + "grad_norm": 3.953125, + "learning_rate": 4.735312997255107e-05, + "loss": 0.192, + "step": 1540 + }, + { + "epoch": 0.15747231535101086, + "grad_norm": 2.09375, + "learning_rate": 4.7316914610334475e-05, + "loss": 0.2586, + "step": 1550 + }, + { + "epoch": 0.15848826577263028, + "grad_norm": 3.6875, + "learning_rate": 4.728046721599378e-05, + "loss": 0.2141, + "step": 1560 + }, + { + "epoch": 0.15950421619424973, + "grad_norm": 2.9375, + "learning_rate": 4.724378816847771e-05, + "loss": 0.193, + "step": 1570 + }, + { + "epoch": 0.16052016661586915, + "grad_norm": 1.5625, + "learning_rate": 4.720687784914352e-05, + "loss": 0.191, + "step": 1580 + }, + { + "epoch": 0.16153611703748857, + "grad_norm": 3.75, + "learning_rate": 4.716973664175304e-05, + "loss": 0.2172, + "step": 1590 + }, + { + "epoch": 0.162552067459108, + "grad_norm": 3.125, + "learning_rate": 4.7132364932468645e-05, + "loss": 0.2134, + "step": 1600 + }, + { + "epoch": 0.16356801788072742, + "grad_norm": 4.09375, + "learning_rate": 4.709476310984932e-05, + "loss": 0.2055, + "step": 1610 + }, + { + "epoch": 0.16458396830234684, + "grad_norm": 3.875, + "learning_rate": 4.705693156484652e-05, + "loss": 0.2136, + "step": 1620 + }, + { + "epoch": 0.16559991872396626, + "grad_norm": 1.1796875, + "learning_rate": 4.7018870690800196e-05, + "loss": 0.1471, + "step": 1630 + }, + { + "epoch": 0.16661586914558568, + "grad_norm": 2.5, + "learning_rate": 4.698058088343465e-05, + "loss": 0.2308, + "step": 1640 + }, + { + "epoch": 0.16763181956720513, + "grad_norm": 1.390625, + "learning_rate": 4.6942062540854425e-05, + "loss": 0.2456, + "step": 1650 + }, + { + "epoch": 0.16864776998882455, + "grad_norm": 3.125, + "learning_rate": 4.69033160635402e-05, + "loss": 0.2654, + "step": 1660 + }, + { + "epoch": 0.16966372041044397, + "grad_norm": 3.984375, + "learning_rate": 4.6864341854344587e-05, + "loss": 0.2226, + "step": 1670 + }, + { + "epoch": 0.1706796708320634, + "grad_norm": 2.328125, + "learning_rate": 4.682514031848795e-05, + "loss": 0.2438, + "step": 1680 + }, + { + "epoch": 0.17169562125368282, + "grad_norm": 3.078125, + "learning_rate": 4.678571186355423e-05, + "loss": 0.1889, + "step": 1690 + }, + { + "epoch": 0.17271157167530224, + "grad_norm": 3.328125, + "learning_rate": 4.6746056899486644e-05, + "loss": 0.2117, + "step": 1700 + }, + { + "epoch": 0.17372752209692166, + "grad_norm": 2.78125, + "learning_rate": 4.67061758385835e-05, + "loss": 0.1953, + "step": 1710 + }, + { + "epoch": 0.17474347251854108, + "grad_norm": 3.09375, + "learning_rate": 4.6666069095493816e-05, + "loss": 0.1844, + "step": 1720 + }, + { + "epoch": 0.17575942294016053, + "grad_norm": 3.234375, + "learning_rate": 4.662573708721309e-05, + "loss": 0.2774, + "step": 1730 + }, + { + "epoch": 0.17677537336177995, + "grad_norm": 4.03125, + "learning_rate": 4.658518023307894e-05, + "loss": 0.2527, + "step": 1740 + }, + { + "epoch": 0.17779132378339937, + "grad_norm": 3.21875, + "learning_rate": 4.654439895476671e-05, + "loss": 0.2164, + "step": 1750 + }, + { + "epoch": 0.1788072742050188, + "grad_norm": 2.390625, + "learning_rate": 4.6503393676285146e-05, + "loss": 0.2424, + "step": 1760 + }, + { + "epoch": 0.17982322462663822, + "grad_norm": 1.8359375, + "learning_rate": 4.646216482397192e-05, + "loss": 0.2428, + "step": 1770 + }, + { + "epoch": 0.18083917504825764, + "grad_norm": 2.796875, + "learning_rate": 4.6420712826489275e-05, + "loss": 0.2155, + "step": 1780 + }, + { + "epoch": 0.18185512546987706, + "grad_norm": 0.69921875, + "learning_rate": 4.6379038114819485e-05, + "loss": 0.1544, + "step": 1790 + }, + { + "epoch": 0.18287107589149648, + "grad_norm": 3.40625, + "learning_rate": 4.6337141122260444e-05, + "loss": 0.2029, + "step": 1800 + }, + { + "epoch": 0.18388702631311593, + "grad_norm": 2.359375, + "learning_rate": 4.629502228442112e-05, + "loss": 0.1489, + "step": 1810 + }, + { + "epoch": 0.18490297673473535, + "grad_norm": 1.4453125, + "learning_rate": 4.6252682039217045e-05, + "loss": 0.2101, + "step": 1820 + }, + { + "epoch": 0.18591892715635477, + "grad_norm": 2.71875, + "learning_rate": 4.621012082686573e-05, + "loss": 0.2076, + "step": 1830 + }, + { + "epoch": 0.1869348775779742, + "grad_norm": 3.0625, + "learning_rate": 4.616733908988216e-05, + "loss": 0.2719, + "step": 1840 + }, + { + "epoch": 0.18795082799959362, + "grad_norm": 1.953125, + "learning_rate": 4.612433727307409e-05, + "loss": 0.2105, + "step": 1850 + }, + { + "epoch": 0.18896677842121304, + "grad_norm": 3.46875, + "learning_rate": 4.608111582353751e-05, + "loss": 0.1877, + "step": 1860 + }, + { + "epoch": 0.18998272884283246, + "grad_norm": 2.546875, + "learning_rate": 4.603767519065197e-05, + "loss": 0.2238, + "step": 1870 + }, + { + "epoch": 0.19099867926445188, + "grad_norm": 1.5703125, + "learning_rate": 4.599401582607589e-05, + "loss": 0.243, + "step": 1880 + }, + { + "epoch": 0.19201462968607133, + "grad_norm": 2.4375, + "learning_rate": 4.595013818374185e-05, + "loss": 0.1867, + "step": 1890 + }, + { + "epoch": 0.19303058010769075, + "grad_norm": 2.203125, + "learning_rate": 4.5906042719851925e-05, + "loss": 0.1994, + "step": 1900 + }, + { + "epoch": 0.19404653052931017, + "grad_norm": 3.984375, + "learning_rate": 4.586172989287291e-05, + "loss": 0.1899, + "step": 1910 + }, + { + "epoch": 0.1950624809509296, + "grad_norm": 2.6875, + "learning_rate": 4.5817200163531534e-05, + "loss": 0.2528, + "step": 1920 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 2.71875, + "learning_rate": 4.577245399480972e-05, + "loss": 0.2336, + "step": 1930 + }, + { + "epoch": 0.19709438179416844, + "grad_norm": 2.640625, + "learning_rate": 4.5727491851939715e-05, + "loss": 0.2204, + "step": 1940 + }, + { + "epoch": 0.19811033221578786, + "grad_norm": 1.78125, + "learning_rate": 4.568231420239929e-05, + "loss": 0.1656, + "step": 1950 + }, + { + "epoch": 0.19912628263740728, + "grad_norm": 3.15625, + "learning_rate": 4.563692151590687e-05, + "loss": 0.2105, + "step": 1960 + }, + { + "epoch": 0.20014223305902673, + "grad_norm": 1.3671875, + "learning_rate": 4.5591314264416666e-05, + "loss": 0.1464, + "step": 1970 + }, + { + "epoch": 0.20115818348064615, + "grad_norm": 4.25, + "learning_rate": 4.554549292211371e-05, + "loss": 0.2103, + "step": 1980 + }, + { + "epoch": 0.20217413390226557, + "grad_norm": 2.984375, + "learning_rate": 4.549945796540901e-05, + "loss": 0.144, + "step": 1990 + }, + { + "epoch": 0.203190084323885, + "grad_norm": 1.859375, + "learning_rate": 4.545320987293453e-05, + "loss": 0.1963, + "step": 2000 + }, + { + "epoch": 0.20420603474550442, + "grad_norm": 1.078125, + "learning_rate": 4.540674912553824e-05, + "loss": 0.2115, + "step": 2010 + }, + { + "epoch": 0.20522198516712384, + "grad_norm": 4.25, + "learning_rate": 4.536007620627911e-05, + "loss": 0.1682, + "step": 2020 + }, + { + "epoch": 0.20623793558874326, + "grad_norm": 2.71875, + "learning_rate": 4.531319160042212e-05, + "loss": 0.1992, + "step": 2030 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 1.2890625, + "learning_rate": 4.5266095795433126e-05, + "loss": 0.1134, + "step": 2040 + }, + { + "epoch": 0.20826983643198213, + "grad_norm": 3.296875, + "learning_rate": 4.5218789280973925e-05, + "loss": 0.1474, + "step": 2050 + }, + { + "epoch": 0.20928578685360155, + "grad_norm": 1.9375, + "learning_rate": 4.5171272548897024e-05, + "loss": 0.1955, + "step": 2060 + }, + { + "epoch": 0.21030173727522097, + "grad_norm": 2.734375, + "learning_rate": 4.512354609324063e-05, + "loss": 0.2042, + "step": 2070 + }, + { + "epoch": 0.2113176876968404, + "grad_norm": 2.921875, + "learning_rate": 4.507561041022347e-05, + "loss": 0.2174, + "step": 2080 + }, + { + "epoch": 0.21233363811845982, + "grad_norm": 2.40625, + "learning_rate": 4.502746599823963e-05, + "loss": 0.2634, + "step": 2090 + }, + { + "epoch": 0.21334958854007924, + "grad_norm": 1.71875, + "learning_rate": 4.497911335785339e-05, + "loss": 0.1884, + "step": 2100 + }, + { + "epoch": 0.21436553896169866, + "grad_norm": 0.79296875, + "learning_rate": 4.4930552991794e-05, + "loss": 0.1872, + "step": 2110 + }, + { + "epoch": 0.21538148938331808, + "grad_norm": 3.171875, + "learning_rate": 4.4881785404950474e-05, + "loss": 0.2233, + "step": 2120 + }, + { + "epoch": 0.21639743980493753, + "grad_norm": 2.59375, + "learning_rate": 4.483281110436631e-05, + "loss": 0.2374, + "step": 2130 + }, + { + "epoch": 0.21741339022655695, + "grad_norm": 3.328125, + "learning_rate": 4.478363059923426e-05, + "loss": 0.2545, + "step": 2140 + }, + { + "epoch": 0.21842934064817637, + "grad_norm": 2.3125, + "learning_rate": 4.4734244400891014e-05, + "loss": 0.2063, + "step": 2150 + }, + { + "epoch": 0.2194452910697958, + "grad_norm": 3.40625, + "learning_rate": 4.4684653022811865e-05, + "loss": 0.1219, + "step": 2160 + }, + { + "epoch": 0.22046124149141522, + "grad_norm": 4.1875, + "learning_rate": 4.463485698060541e-05, + "loss": 0.2805, + "step": 2170 + }, + { + "epoch": 0.22147719191303464, + "grad_norm": 2.3125, + "learning_rate": 4.458485679200814e-05, + "loss": 0.1998, + "step": 2180 + }, + { + "epoch": 0.22249314233465406, + "grad_norm": 3.578125, + "learning_rate": 4.453465297687912e-05, + "loss": 0.2489, + "step": 2190 + }, + { + "epoch": 0.22350909275627348, + "grad_norm": 2.59375, + "learning_rate": 4.448424605719452e-05, + "loss": 0.2731, + "step": 2200 + }, + { + "epoch": 0.22452504317789293, + "grad_norm": 3.28125, + "learning_rate": 4.443363655704224e-05, + "loss": 0.2425, + "step": 2210 + }, + { + "epoch": 0.22554099359951235, + "grad_norm": 2.78125, + "learning_rate": 4.438282500261641e-05, + "loss": 0.2938, + "step": 2220 + }, + { + "epoch": 0.22655694402113177, + "grad_norm": 1.1953125, + "learning_rate": 4.433181192221197e-05, + "loss": 0.1728, + "step": 2230 + }, + { + "epoch": 0.2275728944427512, + "grad_norm": 1.34375, + "learning_rate": 4.4280597846219155e-05, + "loss": 0.216, + "step": 2240 + }, + { + "epoch": 0.22858884486437062, + "grad_norm": 1.8515625, + "learning_rate": 4.422918330711796e-05, + "loss": 0.1612, + "step": 2250 + }, + { + "epoch": 0.22960479528599004, + "grad_norm": 1.90625, + "learning_rate": 4.417756883947263e-05, + "loss": 0.107, + "step": 2260 + }, + { + "epoch": 0.23062074570760946, + "grad_norm": 3.375, + "learning_rate": 4.412575497992611e-05, + "loss": 0.1756, + "step": 2270 + }, + { + "epoch": 0.23163669612922888, + "grad_norm": 4.375, + "learning_rate": 4.407374226719445e-05, + "loss": 0.234, + "step": 2280 + }, + { + "epoch": 0.23265264655084833, + "grad_norm": 3.25, + "learning_rate": 4.402153124206119e-05, + "loss": 0.2144, + "step": 2290 + }, + { + "epoch": 0.23366859697246775, + "grad_norm": 1.703125, + "learning_rate": 4.396912244737173e-05, + "loss": 0.1696, + "step": 2300 + }, + { + "epoch": 0.23468454739408717, + "grad_norm": 2.84375, + "learning_rate": 4.391651642802778e-05, + "loss": 0.2506, + "step": 2310 + }, + { + "epoch": 0.2357004978157066, + "grad_norm": 4.5, + "learning_rate": 4.386371373098155e-05, + "loss": 0.1686, + "step": 2320 + }, + { + "epoch": 0.23671644823732602, + "grad_norm": 2.515625, + "learning_rate": 4.381071490523018e-05, + "loss": 0.2403, + "step": 2330 + }, + { + "epoch": 0.23773239865894544, + "grad_norm": 4.4375, + "learning_rate": 4.3757520501809955e-05, + "loss": 0.1611, + "step": 2340 + }, + { + "epoch": 0.23874834908056486, + "grad_norm": 1.609375, + "learning_rate": 4.370413107379065e-05, + "loss": 0.1698, + "step": 2350 + }, + { + "epoch": 0.23976429950218428, + "grad_norm": 4.96875, + "learning_rate": 4.36505471762697e-05, + "loss": 0.1928, + "step": 2360 + }, + { + "epoch": 0.24078024992380373, + "grad_norm": 0.8984375, + "learning_rate": 4.3596769366366474e-05, + "loss": 0.2035, + "step": 2370 + }, + { + "epoch": 0.24179620034542315, + "grad_norm": 5.75, + "learning_rate": 4.354279820321649e-05, + "loss": 0.16, + "step": 2380 + }, + { + "epoch": 0.24281215076704257, + "grad_norm": 1.9453125, + "learning_rate": 4.34886342479656e-05, + "loss": 0.1851, + "step": 2390 + }, + { + "epoch": 0.243828101188662, + "grad_norm": 1.015625, + "learning_rate": 4.34342780637641e-05, + "loss": 0.1726, + "step": 2400 + }, + { + "epoch": 0.24484405161028142, + "grad_norm": 4.59375, + "learning_rate": 4.337973021576095e-05, + "loss": 0.2847, + "step": 2410 + }, + { + "epoch": 0.24586000203190084, + "grad_norm": 1.03125, + "learning_rate": 4.3324991271097846e-05, + "loss": 0.2528, + "step": 2420 + }, + { + "epoch": 0.24687595245352026, + "grad_norm": 2.1875, + "learning_rate": 4.3270061798903374e-05, + "loss": 0.1573, + "step": 2430 + }, + { + "epoch": 0.24789190287513968, + "grad_norm": 0.98046875, + "learning_rate": 4.321494237028701e-05, + "loss": 0.1703, + "step": 2440 + }, + { + "epoch": 0.24890785329675913, + "grad_norm": 3.8125, + "learning_rate": 4.31596335583333e-05, + "loss": 0.2613, + "step": 2450 + }, + { + "epoch": 0.24992380371837855, + "grad_norm": 4.0625, + "learning_rate": 4.310413593809579e-05, + "loss": 0.22, + "step": 2460 + }, + { + "epoch": 0.250939754139998, + "grad_norm": 3.15625, + "learning_rate": 4.304845008659108e-05, + "loss": 0.1263, + "step": 2470 + }, + { + "epoch": 0.25195570456161737, + "grad_norm": 3.046875, + "learning_rate": 4.2992576582792895e-05, + "loss": 0.1639, + "step": 2480 + }, + { + "epoch": 0.2529716549832368, + "grad_norm": 9.8125, + "learning_rate": 4.293651600762595e-05, + "loss": 0.2681, + "step": 2490 + }, + { + "epoch": 0.25398760540485626, + "grad_norm": 3.734375, + "learning_rate": 4.288026894395999e-05, + "loss": 0.2292, + "step": 2500 + }, + { + "epoch": 0.25500355582647566, + "grad_norm": 0.455078125, + "learning_rate": 4.2823835976603723e-05, + "loss": 0.2324, + "step": 2510 + }, + { + "epoch": 0.2560195062480951, + "grad_norm": 5.625, + "learning_rate": 4.276721769229869e-05, + "loss": 0.1834, + "step": 2520 + }, + { + "epoch": 0.2570354566697145, + "grad_norm": 1.3671875, + "learning_rate": 4.271041467971323e-05, + "loss": 0.1826, + "step": 2530 + }, + { + "epoch": 0.25805140709133395, + "grad_norm": 5.0625, + "learning_rate": 4.265342752943632e-05, + "loss": 0.2463, + "step": 2540 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 2.859375, + "learning_rate": 4.2596256833971425e-05, + "loss": 0.2598, + "step": 2550 + }, + { + "epoch": 0.2600833079345728, + "grad_norm": 1.8515625, + "learning_rate": 4.2538903187730374e-05, + "loss": 0.1148, + "step": 2560 + }, + { + "epoch": 0.26109925835619224, + "grad_norm": 2.71875, + "learning_rate": 4.248136718702716e-05, + "loss": 0.2123, + "step": 2570 + }, + { + "epoch": 0.26211520877781164, + "grad_norm": 4.5625, + "learning_rate": 4.242364943007172e-05, + "loss": 0.2369, + "step": 2580 + }, + { + "epoch": 0.2631311591994311, + "grad_norm": 2.296875, + "learning_rate": 4.236575051696377e-05, + "loss": 0.261, + "step": 2590 + }, + { + "epoch": 0.2641471096210505, + "grad_norm": 2.75, + "learning_rate": 4.2307671049686514e-05, + "loss": 0.1564, + "step": 2600 + }, + { + "epoch": 0.26516306004266993, + "grad_norm": 3.5, + "learning_rate": 4.2249411632100396e-05, + "loss": 0.1563, + "step": 2610 + }, + { + "epoch": 0.2661790104642893, + "grad_norm": 2.84375, + "learning_rate": 4.219097286993684e-05, + "loss": 0.1697, + "step": 2620 + }, + { + "epoch": 0.26719496088590877, + "grad_norm": 2.125, + "learning_rate": 4.2132355370791946e-05, + "loss": 0.1844, + "step": 2630 + }, + { + "epoch": 0.26821091130752817, + "grad_norm": 4.03125, + "learning_rate": 4.2073559744120156e-05, + "loss": 0.2144, + "step": 2640 + }, + { + "epoch": 0.2692268617291476, + "grad_norm": 2.375, + "learning_rate": 4.201458660122793e-05, + "loss": 0.2013, + "step": 2650 + }, + { + "epoch": 0.27024281215076706, + "grad_norm": 3.625, + "learning_rate": 4.1955436555267393e-05, + "loss": 0.2166, + "step": 2660 + }, + { + "epoch": 0.27125876257238646, + "grad_norm": 0.328125, + "learning_rate": 4.189611022122997e-05, + "loss": 0.1934, + "step": 2670 + }, + { + "epoch": 0.2722747129940059, + "grad_norm": 2.75, + "learning_rate": 4.1836608215939944e-05, + "loss": 0.2157, + "step": 2680 + }, + { + "epoch": 0.2732906634156253, + "grad_norm": 3.5, + "learning_rate": 4.17769311580481e-05, + "loss": 0.18, + "step": 2690 + }, + { + "epoch": 0.27430661383724475, + "grad_norm": 2.109375, + "learning_rate": 4.171707966802528e-05, + "loss": 0.2178, + "step": 2700 + }, + { + "epoch": 0.27532256425886414, + "grad_norm": 4.65625, + "learning_rate": 4.16570543681559e-05, + "loss": 0.1896, + "step": 2710 + }, + { + "epoch": 0.2763385146804836, + "grad_norm": 4.8125, + "learning_rate": 4.159685588253151e-05, + "loss": 0.1322, + "step": 2720 + }, + { + "epoch": 0.27735446510210304, + "grad_norm": 3.9375, + "learning_rate": 4.153648483704429e-05, + "loss": 0.184, + "step": 2730 + }, + { + "epoch": 0.27837041552372244, + "grad_norm": 4.53125, + "learning_rate": 4.147594185938057e-05, + "loss": 0.2451, + "step": 2740 + }, + { + "epoch": 0.2793863659453419, + "grad_norm": 1.0390625, + "learning_rate": 4.141522757901426e-05, + "loss": 0.2367, + "step": 2750 + }, + { + "epoch": 0.2804023163669613, + "grad_norm": 3.375, + "learning_rate": 4.1354342627200345e-05, + "loss": 0.179, + "step": 2760 + }, + { + "epoch": 0.28141826678858073, + "grad_norm": 2.953125, + "learning_rate": 4.1293287636968286e-05, + "loss": 0.1396, + "step": 2770 + }, + { + "epoch": 0.2824342172102001, + "grad_norm": 2.546875, + "learning_rate": 4.1232063243115485e-05, + "loss": 0.1963, + "step": 2780 + }, + { + "epoch": 0.28345016763181957, + "grad_norm": 5.09375, + "learning_rate": 4.117067008220063e-05, + "loss": 0.2457, + "step": 2790 + }, + { + "epoch": 0.28446611805343897, + "grad_norm": 2.046875, + "learning_rate": 4.110910879253712e-05, + "loss": 0.2262, + "step": 2800 + }, + { + "epoch": 0.2854820684750584, + "grad_norm": 2.1875, + "learning_rate": 4.104738001418641e-05, + "loss": 0.2499, + "step": 2810 + }, + { + "epoch": 0.28649801889667786, + "grad_norm": 2.59375, + "learning_rate": 4.098548438895135e-05, + "loss": 0.1667, + "step": 2820 + }, + { + "epoch": 0.28751396931829726, + "grad_norm": 2.875, + "learning_rate": 4.092342256036954e-05, + "loss": 0.2288, + "step": 2830 + }, + { + "epoch": 0.2885299197399167, + "grad_norm": 3.015625, + "learning_rate": 4.086119517370659e-05, + "loss": 0.2038, + "step": 2840 + }, + { + "epoch": 0.2895458701615361, + "grad_norm": 3.53125, + "learning_rate": 4.0798802875949485e-05, + "loss": 0.181, + "step": 2850 + }, + { + "epoch": 0.29056182058315555, + "grad_norm": 2.296875, + "learning_rate": 4.073624631579975e-05, + "loss": 0.1886, + "step": 2860 + }, + { + "epoch": 0.29157777100477494, + "grad_norm": 3.609375, + "learning_rate": 4.067352614366685e-05, + "loss": 0.2053, + "step": 2870 + }, + { + "epoch": 0.2925937214263944, + "grad_norm": 2.328125, + "learning_rate": 4.061064301166128e-05, + "loss": 0.1409, + "step": 2880 + }, + { + "epoch": 0.29360967184801384, + "grad_norm": 4.9375, + "learning_rate": 4.054759757358787e-05, + "loss": 0.184, + "step": 2890 + }, + { + "epoch": 0.29462562226963324, + "grad_norm": 4.6875, + "learning_rate": 4.048439048493898e-05, + "loss": 0.2306, + "step": 2900 + }, + { + "epoch": 0.2956415726912527, + "grad_norm": 4.09375, + "learning_rate": 4.0421022402887676e-05, + "loss": 0.1914, + "step": 2910 + }, + { + "epoch": 0.2966575231128721, + "grad_norm": 2.3125, + "learning_rate": 4.035749398628088e-05, + "loss": 0.1653, + "step": 2920 + }, + { + "epoch": 0.29767347353449153, + "grad_norm": 2.515625, + "learning_rate": 4.029380589563256e-05, + "loss": 0.1941, + "step": 2930 + }, + { + "epoch": 0.2986894239561109, + "grad_norm": 1.78125, + "learning_rate": 4.02299587931168e-05, + "loss": 0.1117, + "step": 2940 + }, + { + "epoch": 0.29970537437773037, + "grad_norm": 0.8359375, + "learning_rate": 4.0165953342560974e-05, + "loss": 0.1605, + "step": 2950 + }, + { + "epoch": 0.30072132479934977, + "grad_norm": 3.046875, + "learning_rate": 4.010179020943884e-05, + "loss": 0.1726, + "step": 2960 + }, + { + "epoch": 0.3017372752209692, + "grad_norm": 3.453125, + "learning_rate": 4.003747006086357e-05, + "loss": 0.2208, + "step": 2970 + }, + { + "epoch": 0.30275322564258866, + "grad_norm": 2.515625, + "learning_rate": 3.9972993565580866e-05, + "loss": 0.1325, + "step": 2980 + }, + { + "epoch": 0.30376917606420806, + "grad_norm": 3.046875, + "learning_rate": 3.9908361393962e-05, + "loss": 0.2014, + "step": 2990 + }, + { + "epoch": 0.3047851264858275, + "grad_norm": 2.28125, + "learning_rate": 3.984357421799681e-05, + "loss": 0.165, + "step": 3000 + } + ], + "logging_steps": 10, + "max_steps": 9843, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoints/checkpoint-3000/training_args.bin b/checkpoints/checkpoint-3000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bb0a4ec10ab29ca1942aecdfa4212d352e373f1 --- /dev/null +++ b/checkpoints/checkpoint-3000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 +size 4984 diff --git a/checkpoints/checkpoint-3500/adapter_config.json b/checkpoints/checkpoint-3500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70 --- /dev/null +++ b/checkpoints/checkpoint-3500/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "gaussian", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoints/checkpoint-3500/adapter_model.safetensors b/checkpoints/checkpoint-3500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..def26c035ec01dbe6399e1deda89633a3951ba73 --- /dev/null +++ b/checkpoints/checkpoint-3500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbf46c695f24f5160623672ddb7b6e3230a79b1c62b8448e3dedb83005ca9935 +size 5919456 diff --git a/checkpoints/checkpoint-3500/optimizer.pt b/checkpoints/checkpoint-3500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c107db7039ba7af8dcfa32fd7aa7a8fbd21c74f --- /dev/null +++ b/checkpoints/checkpoint-3500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22f791945b5138ee83b222ac582e4f208ff52e423ba51084873ae89d31ac1845 +size 11930938 diff --git a/checkpoints/checkpoint-3500/rng_state_0.pth b/checkpoints/checkpoint-3500/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..6e2e5670f0b559873d46a22a975469d24350dae5 --- /dev/null +++ b/checkpoints/checkpoint-3500/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:937c0b4f9e0770f6eca6c4cdbab48674491808073f9a0008242c7747205b4b0a +size 15024 diff --git a/checkpoints/checkpoint-3500/rng_state_1.pth b/checkpoints/checkpoint-3500/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7eb9d5104e7e96e55a0cd774e3249194f48334fc --- /dev/null +++ b/checkpoints/checkpoint-3500/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db86a3a2768366792f057841f721820dff6815adc16d464af7828df941223c01 +size 15024 diff --git a/checkpoints/checkpoint-3500/rng_state_2.pth b/checkpoints/checkpoint-3500/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..fda380880dfe7ab57f96a6a7914e3ae9c2b57485 --- /dev/null +++ b/checkpoints/checkpoint-3500/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fc2e3f389bdedf3f7c940ce7cfd74a54d5ef411c9271a0b570f0f8078360d61 +size 15024 diff --git a/checkpoints/checkpoint-3500/rng_state_3.pth b/checkpoints/checkpoint-3500/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..8422b115f583e37a495ec690fab1f7a2441ec0a8 --- /dev/null +++ b/checkpoints/checkpoint-3500/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:700a170dc517eaa370c876455bedd04e0b25e7140ee505e562880ac0c268e199 +size 15024 diff --git a/checkpoints/checkpoint-3500/scheduler.pt b/checkpoints/checkpoint-3500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c79541e6a9cb3467a5d3ceaa6d2065ec4525bfb --- /dev/null +++ b/checkpoints/checkpoint-3500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:586f3b2ba8da5de2fa8f36edc9313f3dbd8056e41828a6210b6c8e0807689cc2 +size 1064 diff --git a/checkpoints/checkpoint-3500/trainer_state.json b/checkpoints/checkpoint-3500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d8f21bb043f14ed3122d67861c839124c000b40f --- /dev/null +++ b/checkpoints/checkpoint-3500/trainer_state.json @@ -0,0 +1,2471 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.35558264756679875, + "eval_steps": 500, + "global_step": 3500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001015950421619425, + "grad_norm": 17.625, + "learning_rate": 5e-06, + "loss": 3.4264, + "step": 10 + }, + { + "epoch": 0.00203190084323885, + "grad_norm": 12.5625, + "learning_rate": 1e-05, + "loss": 3.432, + "step": 20 + }, + { + "epoch": 0.003047851264858275, + "grad_norm": 14.0625, + "learning_rate": 1.5e-05, + "loss": 3.23, + "step": 30 + }, + { + "epoch": 0.0040638016864777, + "grad_norm": 12.4375, + "learning_rate": 2e-05, + "loss": 2.9762, + "step": 40 + }, + { + "epoch": 0.005079752108097125, + "grad_norm": 10.0625, + "learning_rate": 2.5e-05, + "loss": 2.6173, + "step": 50 + }, + { + "epoch": 0.00609570252971655, + "grad_norm": 10.1875, + "learning_rate": 3e-05, + "loss": 2.2004, + "step": 60 + }, + { + "epoch": 0.007111652951335975, + "grad_norm": 7.03125, + "learning_rate": 3.5e-05, + "loss": 1.4176, + "step": 70 + }, + { + "epoch": 0.0081276033729554, + "grad_norm": 4.375, + "learning_rate": 4e-05, + "loss": 1.0122, + "step": 80 + }, + { + "epoch": 0.009143553794574825, + "grad_norm": 6.5625, + "learning_rate": 4.5e-05, + "loss": 0.9116, + "step": 90 + }, + { + "epoch": 0.01015950421619425, + "grad_norm": 5.28125, + "learning_rate": 5e-05, + "loss": 0.6832, + "step": 100 + }, + { + "epoch": 0.011175454637813675, + "grad_norm": 5.5, + "learning_rate": 4.9999870035728426e-05, + "loss": 0.7355, + "step": 110 + }, + { + "epoch": 0.0121914050594331, + "grad_norm": 5.1875, + "learning_rate": 4.9999480144264944e-05, + "loss": 0.6673, + "step": 120 + }, + { + "epoch": 0.013207355481052525, + "grad_norm": 4.5, + "learning_rate": 4.9998830329663314e-05, + "loss": 0.6792, + "step": 130 + }, + { + "epoch": 0.01422330590267195, + "grad_norm": 3.9375, + "learning_rate": 4.9997920598679756e-05, + "loss": 0.6207, + "step": 140 + }, + { + "epoch": 0.015239256324291375, + "grad_norm": 3.15625, + "learning_rate": 4.999675096077286e-05, + "loss": 0.483, + "step": 150 + }, + { + "epoch": 0.0162552067459108, + "grad_norm": 5.28125, + "learning_rate": 4.999532142810354e-05, + "loss": 0.5319, + "step": 160 + }, + { + "epoch": 0.017271157167530225, + "grad_norm": 4.59375, + "learning_rate": 4.999363201553483e-05, + "loss": 0.6052, + "step": 170 + }, + { + "epoch": 0.01828710758914965, + "grad_norm": 5.03125, + "learning_rate": 4.9991682740631794e-05, + "loss": 0.4258, + "step": 180 + }, + { + "epoch": 0.019303058010769075, + "grad_norm": 3.859375, + "learning_rate": 4.998947362366133e-05, + "loss": 0.4309, + "step": 190 + }, + { + "epoch": 0.0203190084323885, + "grad_norm": 3.328125, + "learning_rate": 4.998700468759193e-05, + "loss": 0.3957, + "step": 200 + }, + { + "epoch": 0.021334958854007924, + "grad_norm": 4.9375, + "learning_rate": 4.9984275958093475e-05, + "loss": 0.4777, + "step": 210 + }, + { + "epoch": 0.02235090927562735, + "grad_norm": 4.78125, + "learning_rate": 4.998128746353695e-05, + "loss": 0.3549, + "step": 220 + }, + { + "epoch": 0.023366859697246774, + "grad_norm": 4.0625, + "learning_rate": 4.997803923499417e-05, + "loss": 0.4447, + "step": 230 + }, + { + "epoch": 0.0243828101188662, + "grad_norm": 6.375, + "learning_rate": 4.99745313062374e-05, + "loss": 0.3808, + "step": 240 + }, + { + "epoch": 0.025398760540485624, + "grad_norm": 3.59375, + "learning_rate": 4.99707637137391e-05, + "loss": 0.3827, + "step": 250 + }, + { + "epoch": 0.02641471096210505, + "grad_norm": 3.015625, + "learning_rate": 4.996673649667145e-05, + "loss": 0.3694, + "step": 260 + }, + { + "epoch": 0.027430661383724474, + "grad_norm": 2.296875, + "learning_rate": 4.9962449696906e-05, + "loss": 0.3586, + "step": 270 + }, + { + "epoch": 0.0284466118053439, + "grad_norm": 4.125, + "learning_rate": 4.9957903359013214e-05, + "loss": 0.3832, + "step": 280 + }, + { + "epoch": 0.029462562226963324, + "grad_norm": 3.296875, + "learning_rate": 4.995309753026201e-05, + "loss": 0.328, + "step": 290 + }, + { + "epoch": 0.03047851264858275, + "grad_norm": 4.5, + "learning_rate": 4.994803226061927e-05, + "loss": 0.3667, + "step": 300 + }, + { + "epoch": 0.03149446307020217, + "grad_norm": 4.3125, + "learning_rate": 4.994270760274933e-05, + "loss": 0.3811, + "step": 310 + }, + { + "epoch": 0.0325104134918216, + "grad_norm": 3.421875, + "learning_rate": 4.99371236120134e-05, + "loss": 0.3065, + "step": 320 + }, + { + "epoch": 0.03352636391344102, + "grad_norm": 4.6875, + "learning_rate": 4.993128034646902e-05, + "loss": 0.4177, + "step": 330 + }, + { + "epoch": 0.03454231433506045, + "grad_norm": 3.046875, + "learning_rate": 4.992517786686947e-05, + "loss": 0.33, + "step": 340 + }, + { + "epoch": 0.03555826475667987, + "grad_norm": 1.8828125, + "learning_rate": 4.9918816236663077e-05, + "loss": 0.3287, + "step": 350 + }, + { + "epoch": 0.0365742151782993, + "grad_norm": 3.8125, + "learning_rate": 4.991219552199262e-05, + "loss": 0.2934, + "step": 360 + }, + { + "epoch": 0.03759016559991872, + "grad_norm": 4.28125, + "learning_rate": 4.99053157916946e-05, + "loss": 0.3176, + "step": 370 + }, + { + "epoch": 0.03860611602153815, + "grad_norm": 2.609375, + "learning_rate": 4.989817711729856e-05, + "loss": 0.3318, + "step": 380 + }, + { + "epoch": 0.03962206644315757, + "grad_norm": 2.375, + "learning_rate": 4.98907795730263e-05, + "loss": 0.3234, + "step": 390 + }, + { + "epoch": 0.040638016864777, + "grad_norm": 4.46875, + "learning_rate": 4.988312323579114e-05, + "loss": 0.267, + "step": 400 + }, + { + "epoch": 0.04165396728639642, + "grad_norm": 3.75, + "learning_rate": 4.98752081851971e-05, + "loss": 0.3081, + "step": 410 + }, + { + "epoch": 0.04266991770801585, + "grad_norm": 2.203125, + "learning_rate": 4.986703450353809e-05, + "loss": 0.2917, + "step": 420 + }, + { + "epoch": 0.04368586812963527, + "grad_norm": 1.6015625, + "learning_rate": 4.985860227579703e-05, + "loss": 0.2805, + "step": 430 + }, + { + "epoch": 0.0447018185512547, + "grad_norm": 3.140625, + "learning_rate": 4.984991158964499e-05, + "loss": 0.3534, + "step": 440 + }, + { + "epoch": 0.04571776897287412, + "grad_norm": 3.296875, + "learning_rate": 4.9840962535440265e-05, + "loss": 0.335, + "step": 450 + }, + { + "epoch": 0.04673371939449355, + "grad_norm": 3.25, + "learning_rate": 4.983175520622744e-05, + "loss": 0.2544, + "step": 460 + }, + { + "epoch": 0.04774966981611297, + "grad_norm": 2.25, + "learning_rate": 4.982228969773642e-05, + "loss": 0.3449, + "step": 470 + }, + { + "epoch": 0.0487656202377324, + "grad_norm": 4.9375, + "learning_rate": 4.9812566108381435e-05, + "loss": 0.2964, + "step": 480 + }, + { + "epoch": 0.04978157065935182, + "grad_norm": 1.5703125, + "learning_rate": 4.9802584539260035e-05, + "loss": 0.2799, + "step": 490 + }, + { + "epoch": 0.05079752108097125, + "grad_norm": 2.828125, + "learning_rate": 4.979234509415199e-05, + "loss": 0.3231, + "step": 500 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 2.9375, + "learning_rate": 4.978184787951828e-05, + "loss": 0.2943, + "step": 510 + }, + { + "epoch": 0.0528294219242101, + "grad_norm": 2.34375, + "learning_rate": 4.977109300449992e-05, + "loss": 0.2705, + "step": 520 + }, + { + "epoch": 0.05384537234582952, + "grad_norm": 3.140625, + "learning_rate": 4.9760080580916876e-05, + "loss": 0.2998, + "step": 530 + }, + { + "epoch": 0.05486132276744895, + "grad_norm": 3.5625, + "learning_rate": 4.974881072326688e-05, + "loss": 0.2595, + "step": 540 + }, + { + "epoch": 0.05587727318906837, + "grad_norm": 4.25, + "learning_rate": 4.9737283548724236e-05, + "loss": 0.2803, + "step": 550 + }, + { + "epoch": 0.0568932236106878, + "grad_norm": 4.0625, + "learning_rate": 4.97254991771386e-05, + "loss": 0.3511, + "step": 560 + }, + { + "epoch": 0.05790917403230722, + "grad_norm": 2.515625, + "learning_rate": 4.971345773103377e-05, + "loss": 0.312, + "step": 570 + }, + { + "epoch": 0.05892512445392665, + "grad_norm": 3.21875, + "learning_rate": 4.9701159335606365e-05, + "loss": 0.2482, + "step": 580 + }, + { + "epoch": 0.05994107487554607, + "grad_norm": 5.5, + "learning_rate": 4.968860411872454e-05, + "loss": 0.2537, + "step": 590 + }, + { + "epoch": 0.0609570252971655, + "grad_norm": 3.546875, + "learning_rate": 4.967579221092666e-05, + "loss": 0.3125, + "step": 600 + }, + { + "epoch": 0.06197297571878492, + "grad_norm": 2.984375, + "learning_rate": 4.966272374541996e-05, + "loss": 0.2354, + "step": 610 + }, + { + "epoch": 0.06298892614040434, + "grad_norm": 3.6875, + "learning_rate": 4.964939885807912e-05, + "loss": 0.3213, + "step": 620 + }, + { + "epoch": 0.06400487656202378, + "grad_norm": 2.140625, + "learning_rate": 4.9635817687444876e-05, + "loss": 0.3003, + "step": 630 + }, + { + "epoch": 0.0650208269836432, + "grad_norm": 3.484375, + "learning_rate": 4.962198037472259e-05, + "loss": 0.2996, + "step": 640 + }, + { + "epoch": 0.06603677740526262, + "grad_norm": 3.21875, + "learning_rate": 4.9607887063780776e-05, + "loss": 0.2257, + "step": 650 + }, + { + "epoch": 0.06705272782688204, + "grad_norm": 5.375, + "learning_rate": 4.9593537901149564e-05, + "loss": 0.223, + "step": 660 + }, + { + "epoch": 0.06806867824850148, + "grad_norm": 4.1875, + "learning_rate": 4.957893303601924e-05, + "loss": 0.3407, + "step": 670 + }, + { + "epoch": 0.0690846286701209, + "grad_norm": 3.328125, + "learning_rate": 4.956407262023866e-05, + "loss": 0.2589, + "step": 680 + }, + { + "epoch": 0.07010057909174032, + "grad_norm": 2.953125, + "learning_rate": 4.954895680831367e-05, + "loss": 0.2949, + "step": 690 + }, + { + "epoch": 0.07111652951335974, + "grad_norm": 4.0625, + "learning_rate": 4.9533585757405506e-05, + "loss": 0.2995, + "step": 700 + }, + { + "epoch": 0.07213247993497918, + "grad_norm": 4.625, + "learning_rate": 4.951795962732917e-05, + "loss": 0.2894, + "step": 710 + }, + { + "epoch": 0.0731484303565986, + "grad_norm": 3.0, + "learning_rate": 4.9502078580551755e-05, + "loss": 0.3082, + "step": 720 + }, + { + "epoch": 0.07416438077821802, + "grad_norm": 3.65625, + "learning_rate": 4.9485942782190734e-05, + "loss": 0.2308, + "step": 730 + }, + { + "epoch": 0.07518033119983744, + "grad_norm": 4.78125, + "learning_rate": 4.9469552400012306e-05, + "loss": 0.2272, + "step": 740 + }, + { + "epoch": 0.07619628162145688, + "grad_norm": 4.25, + "learning_rate": 4.94529076044296e-05, + "loss": 0.2701, + "step": 750 + }, + { + "epoch": 0.0772122320430763, + "grad_norm": 3.140625, + "learning_rate": 4.94360085685009e-05, + "loss": 0.2686, + "step": 760 + }, + { + "epoch": 0.07822818246469572, + "grad_norm": 0.765625, + "learning_rate": 4.9418855467927894e-05, + "loss": 0.2051, + "step": 770 + }, + { + "epoch": 0.07924413288631514, + "grad_norm": 1.796875, + "learning_rate": 4.940144848105379e-05, + "loss": 0.2267, + "step": 780 + }, + { + "epoch": 0.08026008330793458, + "grad_norm": 4.5625, + "learning_rate": 4.93837877888615e-05, + "loss": 0.2597, + "step": 790 + }, + { + "epoch": 0.081276033729554, + "grad_norm": 3.03125, + "learning_rate": 4.9365873574971745e-05, + "loss": 0.3701, + "step": 800 + }, + { + "epoch": 0.08229198415117342, + "grad_norm": 4.5625, + "learning_rate": 4.9347706025641136e-05, + "loss": 0.2559, + "step": 810 + }, + { + "epoch": 0.08330793457279284, + "grad_norm": 3.90625, + "learning_rate": 4.9329285329760275e-05, + "loss": 0.2799, + "step": 820 + }, + { + "epoch": 0.08432388499441228, + "grad_norm": 3.140625, + "learning_rate": 4.9310611678851735e-05, + "loss": 0.2866, + "step": 830 + }, + { + "epoch": 0.0853398354160317, + "grad_norm": 2.46875, + "learning_rate": 4.929168526706811e-05, + "loss": 0.3105, + "step": 840 + }, + { + "epoch": 0.08635578583765112, + "grad_norm": 13.625, + "learning_rate": 4.927250629119e-05, + "loss": 0.2454, + "step": 850 + }, + { + "epoch": 0.08737173625927054, + "grad_norm": 3.921875, + "learning_rate": 4.9253074950623925e-05, + "loss": 0.2424, + "step": 860 + }, + { + "epoch": 0.08838768668088998, + "grad_norm": 2.90625, + "learning_rate": 4.9233391447400286e-05, + "loss": 0.2481, + "step": 870 + }, + { + "epoch": 0.0894036371025094, + "grad_norm": 2.96875, + "learning_rate": 4.921345598617125e-05, + "loss": 0.2231, + "step": 880 + }, + { + "epoch": 0.09041958752412882, + "grad_norm": 5.375, + "learning_rate": 4.9193268774208654e-05, + "loss": 0.3447, + "step": 890 + }, + { + "epoch": 0.09143553794574824, + "grad_norm": 2.0, + "learning_rate": 4.9172830021401785e-05, + "loss": 0.229, + "step": 900 + }, + { + "epoch": 0.09245148836736768, + "grad_norm": 3.1875, + "learning_rate": 4.9152139940255245e-05, + "loss": 0.2122, + "step": 910 + }, + { + "epoch": 0.0934674387889871, + "grad_norm": 3.40625, + "learning_rate": 4.913119874588677e-05, + "loss": 0.2386, + "step": 920 + }, + { + "epoch": 0.09448338921060652, + "grad_norm": 1.4609375, + "learning_rate": 4.911000665602489e-05, + "loss": 0.1944, + "step": 930 + }, + { + "epoch": 0.09549933963222594, + "grad_norm": 5.0625, + "learning_rate": 4.9088563891006786e-05, + "loss": 0.2038, + "step": 940 + }, + { + "epoch": 0.09651529005384538, + "grad_norm": 4.53125, + "learning_rate": 4.906687067377592e-05, + "loss": 0.3122, + "step": 950 + }, + { + "epoch": 0.0975312404754648, + "grad_norm": 2.84375, + "learning_rate": 4.904492722987976e-05, + "loss": 0.3157, + "step": 960 + }, + { + "epoch": 0.09854719089708422, + "grad_norm": 2.171875, + "learning_rate": 4.902273378746738e-05, + "loss": 0.3077, + "step": 970 + }, + { + "epoch": 0.09956314131870364, + "grad_norm": 2.84375, + "learning_rate": 4.9000290577287165e-05, + "loss": 0.2756, + "step": 980 + }, + { + "epoch": 0.10057909174032308, + "grad_norm": 0.99609375, + "learning_rate": 4.897759783268434e-05, + "loss": 0.2915, + "step": 990 + }, + { + "epoch": 0.1015950421619425, + "grad_norm": 3.53125, + "learning_rate": 4.895465578959859e-05, + "loss": 0.2052, + "step": 1000 + }, + { + "epoch": 0.10261099258356192, + "grad_norm": 4.0, + "learning_rate": 4.893146468656159e-05, + "loss": 0.2499, + "step": 1010 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 1.65625, + "learning_rate": 4.890802476469452e-05, + "loss": 0.278, + "step": 1020 + }, + { + "epoch": 0.10464289342680078, + "grad_norm": 3.625, + "learning_rate": 4.888433626770558e-05, + "loss": 0.2143, + "step": 1030 + }, + { + "epoch": 0.1056588438484202, + "grad_norm": 5.0625, + "learning_rate": 4.886039944188741e-05, + "loss": 0.2878, + "step": 1040 + }, + { + "epoch": 0.10667479427003962, + "grad_norm": 4.5, + "learning_rate": 4.883621453611461e-05, + "loss": 0.2744, + "step": 1050 + }, + { + "epoch": 0.10769074469165904, + "grad_norm": 4.5625, + "learning_rate": 4.881178180184106e-05, + "loss": 0.2734, + "step": 1060 + }, + { + "epoch": 0.10870669511327848, + "grad_norm": 3.125, + "learning_rate": 4.878710149309735e-05, + "loss": 0.3574, + "step": 1070 + }, + { + "epoch": 0.1097226455348979, + "grad_norm": 3.0625, + "learning_rate": 4.876217386648816e-05, + "loss": 0.2625, + "step": 1080 + }, + { + "epoch": 0.11073859595651732, + "grad_norm": 4.0625, + "learning_rate": 4.873699918118955e-05, + "loss": 0.2437, + "step": 1090 + }, + { + "epoch": 0.11175454637813674, + "grad_norm": 1.59375, + "learning_rate": 4.87115776989463e-05, + "loss": 0.2051, + "step": 1100 + }, + { + "epoch": 0.11277049679975618, + "grad_norm": 4.375, + "learning_rate": 4.8685909684069153e-05, + "loss": 0.1727, + "step": 1110 + }, + { + "epoch": 0.1137864472213756, + "grad_norm": 2.28125, + "learning_rate": 4.865999540343211e-05, + "loss": 0.2256, + "step": 1120 + }, + { + "epoch": 0.11480239764299502, + "grad_norm": 2.265625, + "learning_rate": 4.86338351264696e-05, + "loss": 0.3529, + "step": 1130 + }, + { + "epoch": 0.11581834806461444, + "grad_norm": 2.34375, + "learning_rate": 4.8607429125173754e-05, + "loss": 0.2113, + "step": 1140 + }, + { + "epoch": 0.11683429848623388, + "grad_norm": 0.7578125, + "learning_rate": 4.858077767409149e-05, + "loss": 0.2759, + "step": 1150 + }, + { + "epoch": 0.1178502489078533, + "grad_norm": 3.640625, + "learning_rate": 4.855388105032174e-05, + "loss": 0.2482, + "step": 1160 + }, + { + "epoch": 0.11886619932947272, + "grad_norm": 3.5, + "learning_rate": 4.852673953351249e-05, + "loss": 0.1865, + "step": 1170 + }, + { + "epoch": 0.11988214975109214, + "grad_norm": 3.75, + "learning_rate": 4.849935340585796e-05, + "loss": 0.2659, + "step": 1180 + }, + { + "epoch": 0.12089810017271158, + "grad_norm": 3.375, + "learning_rate": 4.8471722952095586e-05, + "loss": 0.1506, + "step": 1190 + }, + { + "epoch": 0.121914050594331, + "grad_norm": 3.34375, + "learning_rate": 4.844384845950312e-05, + "loss": 0.307, + "step": 1200 + }, + { + "epoch": 0.12293000101595042, + "grad_norm": 1.578125, + "learning_rate": 4.841573021789561e-05, + "loss": 0.1952, + "step": 1210 + }, + { + "epoch": 0.12394595143756984, + "grad_norm": 1.2890625, + "learning_rate": 4.838736851962239e-05, + "loss": 0.1779, + "step": 1220 + }, + { + "epoch": 0.12496190185918928, + "grad_norm": 1.265625, + "learning_rate": 4.835876365956408e-05, + "loss": 0.1235, + "step": 1230 + }, + { + "epoch": 0.12597785228080868, + "grad_norm": 1.9609375, + "learning_rate": 4.8329915935129436e-05, + "loss": 0.1876, + "step": 1240 + }, + { + "epoch": 0.12699380270242813, + "grad_norm": 1.6328125, + "learning_rate": 4.830082564625235e-05, + "loss": 0.2188, + "step": 1250 + }, + { + "epoch": 0.12800975312404755, + "grad_norm": 3.96875, + "learning_rate": 4.8271493095388684e-05, + "loss": 0.2622, + "step": 1260 + }, + { + "epoch": 0.12902570354566698, + "grad_norm": 3.765625, + "learning_rate": 4.824191858751312e-05, + "loss": 0.2724, + "step": 1270 + }, + { + "epoch": 0.1300416539672864, + "grad_norm": 5.59375, + "learning_rate": 4.821210243011601e-05, + "loss": 0.2413, + "step": 1280 + }, + { + "epoch": 0.13105760438890582, + "grad_norm": 3.34375, + "learning_rate": 4.818204493320016e-05, + "loss": 0.2618, + "step": 1290 + }, + { + "epoch": 0.13207355481052524, + "grad_norm": 2.78125, + "learning_rate": 4.8151746409277634e-05, + "loss": 0.2295, + "step": 1300 + }, + { + "epoch": 0.13308950523214466, + "grad_norm": 3.1875, + "learning_rate": 4.8121207173366484e-05, + "loss": 0.2733, + "step": 1310 + }, + { + "epoch": 0.13410545565376408, + "grad_norm": 2.28125, + "learning_rate": 4.809042754298746e-05, + "loss": 0.2311, + "step": 1320 + }, + { + "epoch": 0.13512140607538353, + "grad_norm": 2.171875, + "learning_rate": 4.805940783816075e-05, + "loss": 0.2059, + "step": 1330 + }, + { + "epoch": 0.13613735649700295, + "grad_norm": 2.796875, + "learning_rate": 4.8028148381402625e-05, + "loss": 0.2102, + "step": 1340 + }, + { + "epoch": 0.13715330691862238, + "grad_norm": 2.96875, + "learning_rate": 4.7996649497722084e-05, + "loss": 0.2708, + "step": 1350 + }, + { + "epoch": 0.1381692573402418, + "grad_norm": 2.4375, + "learning_rate": 4.7964911514617485e-05, + "loss": 0.2429, + "step": 1360 + }, + { + "epoch": 0.13918520776186122, + "grad_norm": 5.8125, + "learning_rate": 4.793293476207312e-05, + "loss": 0.2725, + "step": 1370 + }, + { + "epoch": 0.14020115818348064, + "grad_norm": 2.40625, + "learning_rate": 4.790071957255585e-05, + "loss": 0.2098, + "step": 1380 + }, + { + "epoch": 0.14121710860510006, + "grad_norm": 4.25, + "learning_rate": 4.786826628101154e-05, + "loss": 0.2101, + "step": 1390 + }, + { + "epoch": 0.14223305902671948, + "grad_norm": 2.578125, + "learning_rate": 4.783557522486167e-05, + "loss": 0.2624, + "step": 1400 + }, + { + "epoch": 0.14324900944833893, + "grad_norm": 3.125, + "learning_rate": 4.780264674399978e-05, + "loss": 0.2518, + "step": 1410 + }, + { + "epoch": 0.14426495986995835, + "grad_norm": 3.671875, + "learning_rate": 4.7769481180787966e-05, + "loss": 0.3112, + "step": 1420 + }, + { + "epoch": 0.14528091029157778, + "grad_norm": 3.984375, + "learning_rate": 4.773607888005327e-05, + "loss": 0.2747, + "step": 1430 + }, + { + "epoch": 0.1462968607131972, + "grad_norm": 3.234375, + "learning_rate": 4.770244018908416e-05, + "loss": 0.1572, + "step": 1440 + }, + { + "epoch": 0.14731281113481662, + "grad_norm": 4.09375, + "learning_rate": 4.766856545762687e-05, + "loss": 0.2148, + "step": 1450 + }, + { + "epoch": 0.14832876155643604, + "grad_norm": 1.6875, + "learning_rate": 4.763445503788178e-05, + "loss": 0.2531, + "step": 1460 + }, + { + "epoch": 0.14934471197805546, + "grad_norm": 2.375, + "learning_rate": 4.760010928449976e-05, + "loss": 0.199, + "step": 1470 + }, + { + "epoch": 0.15036066239967488, + "grad_norm": 4.6875, + "learning_rate": 4.7565528554578485e-05, + "loss": 0.2366, + "step": 1480 + }, + { + "epoch": 0.15137661282129433, + "grad_norm": 5.4375, + "learning_rate": 4.75307132076587e-05, + "loss": 0.1862, + "step": 1490 + }, + { + "epoch": 0.15239256324291375, + "grad_norm": 2.484375, + "learning_rate": 4.749566360572049e-05, + "loss": 0.2143, + "step": 1500 + }, + { + "epoch": 0.15340851366453317, + "grad_norm": 2.1875, + "learning_rate": 4.746038011317955e-05, + "loss": 0.1877, + "step": 1510 + }, + { + "epoch": 0.1544244640861526, + "grad_norm": 2.84375, + "learning_rate": 4.742486309688333e-05, + "loss": 0.2831, + "step": 1520 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 2.015625, + "learning_rate": 4.738911292610732e-05, + "loss": 0.1708, + "step": 1530 + }, + { + "epoch": 0.15645636492939144, + "grad_norm": 3.953125, + "learning_rate": 4.735312997255107e-05, + "loss": 0.192, + "step": 1540 + }, + { + "epoch": 0.15747231535101086, + "grad_norm": 2.09375, + "learning_rate": 4.7316914610334475e-05, + "loss": 0.2586, + "step": 1550 + }, + { + "epoch": 0.15848826577263028, + "grad_norm": 3.6875, + "learning_rate": 4.728046721599378e-05, + "loss": 0.2141, + "step": 1560 + }, + { + "epoch": 0.15950421619424973, + "grad_norm": 2.9375, + "learning_rate": 4.724378816847771e-05, + "loss": 0.193, + "step": 1570 + }, + { + "epoch": 0.16052016661586915, + "grad_norm": 1.5625, + "learning_rate": 4.720687784914352e-05, + "loss": 0.191, + "step": 1580 + }, + { + "epoch": 0.16153611703748857, + "grad_norm": 3.75, + "learning_rate": 4.716973664175304e-05, + "loss": 0.2172, + "step": 1590 + }, + { + "epoch": 0.162552067459108, + "grad_norm": 3.125, + "learning_rate": 4.7132364932468645e-05, + "loss": 0.2134, + "step": 1600 + }, + { + "epoch": 0.16356801788072742, + "grad_norm": 4.09375, + "learning_rate": 4.709476310984932e-05, + "loss": 0.2055, + "step": 1610 + }, + { + "epoch": 0.16458396830234684, + "grad_norm": 3.875, + "learning_rate": 4.705693156484652e-05, + "loss": 0.2136, + "step": 1620 + }, + { + "epoch": 0.16559991872396626, + "grad_norm": 1.1796875, + "learning_rate": 4.7018870690800196e-05, + "loss": 0.1471, + "step": 1630 + }, + { + "epoch": 0.16661586914558568, + "grad_norm": 2.5, + "learning_rate": 4.698058088343465e-05, + "loss": 0.2308, + "step": 1640 + }, + { + "epoch": 0.16763181956720513, + "grad_norm": 1.390625, + "learning_rate": 4.6942062540854425e-05, + "loss": 0.2456, + "step": 1650 + }, + { + "epoch": 0.16864776998882455, + "grad_norm": 3.125, + "learning_rate": 4.69033160635402e-05, + "loss": 0.2654, + "step": 1660 + }, + { + "epoch": 0.16966372041044397, + "grad_norm": 3.984375, + "learning_rate": 4.6864341854344587e-05, + "loss": 0.2226, + "step": 1670 + }, + { + "epoch": 0.1706796708320634, + "grad_norm": 2.328125, + "learning_rate": 4.682514031848795e-05, + "loss": 0.2438, + "step": 1680 + }, + { + "epoch": 0.17169562125368282, + "grad_norm": 3.078125, + "learning_rate": 4.678571186355423e-05, + "loss": 0.1889, + "step": 1690 + }, + { + "epoch": 0.17271157167530224, + "grad_norm": 3.328125, + "learning_rate": 4.6746056899486644e-05, + "loss": 0.2117, + "step": 1700 + }, + { + "epoch": 0.17372752209692166, + "grad_norm": 2.78125, + "learning_rate": 4.67061758385835e-05, + "loss": 0.1953, + "step": 1710 + }, + { + "epoch": 0.17474347251854108, + "grad_norm": 3.09375, + "learning_rate": 4.6666069095493816e-05, + "loss": 0.1844, + "step": 1720 + }, + { + "epoch": 0.17575942294016053, + "grad_norm": 3.234375, + "learning_rate": 4.662573708721309e-05, + "loss": 0.2774, + "step": 1730 + }, + { + "epoch": 0.17677537336177995, + "grad_norm": 4.03125, + "learning_rate": 4.658518023307894e-05, + "loss": 0.2527, + "step": 1740 + }, + { + "epoch": 0.17779132378339937, + "grad_norm": 3.21875, + "learning_rate": 4.654439895476671e-05, + "loss": 0.2164, + "step": 1750 + }, + { + "epoch": 0.1788072742050188, + "grad_norm": 2.390625, + "learning_rate": 4.6503393676285146e-05, + "loss": 0.2424, + "step": 1760 + }, + { + "epoch": 0.17982322462663822, + "grad_norm": 1.8359375, + "learning_rate": 4.646216482397192e-05, + "loss": 0.2428, + "step": 1770 + }, + { + "epoch": 0.18083917504825764, + "grad_norm": 2.796875, + "learning_rate": 4.6420712826489275e-05, + "loss": 0.2155, + "step": 1780 + }, + { + "epoch": 0.18185512546987706, + "grad_norm": 0.69921875, + "learning_rate": 4.6379038114819485e-05, + "loss": 0.1544, + "step": 1790 + }, + { + "epoch": 0.18287107589149648, + "grad_norm": 3.40625, + "learning_rate": 4.6337141122260444e-05, + "loss": 0.2029, + "step": 1800 + }, + { + "epoch": 0.18388702631311593, + "grad_norm": 2.359375, + "learning_rate": 4.629502228442112e-05, + "loss": 0.1489, + "step": 1810 + }, + { + "epoch": 0.18490297673473535, + "grad_norm": 1.4453125, + "learning_rate": 4.6252682039217045e-05, + "loss": 0.2101, + "step": 1820 + }, + { + "epoch": 0.18591892715635477, + "grad_norm": 2.71875, + "learning_rate": 4.621012082686573e-05, + "loss": 0.2076, + "step": 1830 + }, + { + "epoch": 0.1869348775779742, + "grad_norm": 3.0625, + "learning_rate": 4.616733908988216e-05, + "loss": 0.2719, + "step": 1840 + }, + { + "epoch": 0.18795082799959362, + "grad_norm": 1.953125, + "learning_rate": 4.612433727307409e-05, + "loss": 0.2105, + "step": 1850 + }, + { + "epoch": 0.18896677842121304, + "grad_norm": 3.46875, + "learning_rate": 4.608111582353751e-05, + "loss": 0.1877, + "step": 1860 + }, + { + "epoch": 0.18998272884283246, + "grad_norm": 2.546875, + "learning_rate": 4.603767519065197e-05, + "loss": 0.2238, + "step": 1870 + }, + { + "epoch": 0.19099867926445188, + "grad_norm": 1.5703125, + "learning_rate": 4.599401582607589e-05, + "loss": 0.243, + "step": 1880 + }, + { + "epoch": 0.19201462968607133, + "grad_norm": 2.4375, + "learning_rate": 4.595013818374185e-05, + "loss": 0.1867, + "step": 1890 + }, + { + "epoch": 0.19303058010769075, + "grad_norm": 2.203125, + "learning_rate": 4.5906042719851925e-05, + "loss": 0.1994, + "step": 1900 + }, + { + "epoch": 0.19404653052931017, + "grad_norm": 3.984375, + "learning_rate": 4.586172989287291e-05, + "loss": 0.1899, + "step": 1910 + }, + { + "epoch": 0.1950624809509296, + "grad_norm": 2.6875, + "learning_rate": 4.5817200163531534e-05, + "loss": 0.2528, + "step": 1920 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 2.71875, + "learning_rate": 4.577245399480972e-05, + "loss": 0.2336, + "step": 1930 + }, + { + "epoch": 0.19709438179416844, + "grad_norm": 2.640625, + "learning_rate": 4.5727491851939715e-05, + "loss": 0.2204, + "step": 1940 + }, + { + "epoch": 0.19811033221578786, + "grad_norm": 1.78125, + "learning_rate": 4.568231420239929e-05, + "loss": 0.1656, + "step": 1950 + }, + { + "epoch": 0.19912628263740728, + "grad_norm": 3.15625, + "learning_rate": 4.563692151590687e-05, + "loss": 0.2105, + "step": 1960 + }, + { + "epoch": 0.20014223305902673, + "grad_norm": 1.3671875, + "learning_rate": 4.5591314264416666e-05, + "loss": 0.1464, + "step": 1970 + }, + { + "epoch": 0.20115818348064615, + "grad_norm": 4.25, + "learning_rate": 4.554549292211371e-05, + "loss": 0.2103, + "step": 1980 + }, + { + "epoch": 0.20217413390226557, + "grad_norm": 2.984375, + "learning_rate": 4.549945796540901e-05, + "loss": 0.144, + "step": 1990 + }, + { + "epoch": 0.203190084323885, + "grad_norm": 1.859375, + "learning_rate": 4.545320987293453e-05, + "loss": 0.1963, + "step": 2000 + }, + { + "epoch": 0.20420603474550442, + "grad_norm": 1.078125, + "learning_rate": 4.540674912553824e-05, + "loss": 0.2115, + "step": 2010 + }, + { + "epoch": 0.20522198516712384, + "grad_norm": 4.25, + "learning_rate": 4.536007620627911e-05, + "loss": 0.1682, + "step": 2020 + }, + { + "epoch": 0.20623793558874326, + "grad_norm": 2.71875, + "learning_rate": 4.531319160042212e-05, + "loss": 0.1992, + "step": 2030 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 1.2890625, + "learning_rate": 4.5266095795433126e-05, + "loss": 0.1134, + "step": 2040 + }, + { + "epoch": 0.20826983643198213, + "grad_norm": 3.296875, + "learning_rate": 4.5218789280973925e-05, + "loss": 0.1474, + "step": 2050 + }, + { + "epoch": 0.20928578685360155, + "grad_norm": 1.9375, + "learning_rate": 4.5171272548897024e-05, + "loss": 0.1955, + "step": 2060 + }, + { + "epoch": 0.21030173727522097, + "grad_norm": 2.734375, + "learning_rate": 4.512354609324063e-05, + "loss": 0.2042, + "step": 2070 + }, + { + "epoch": 0.2113176876968404, + "grad_norm": 2.921875, + "learning_rate": 4.507561041022347e-05, + "loss": 0.2174, + "step": 2080 + }, + { + "epoch": 0.21233363811845982, + "grad_norm": 2.40625, + "learning_rate": 4.502746599823963e-05, + "loss": 0.2634, + "step": 2090 + }, + { + "epoch": 0.21334958854007924, + "grad_norm": 1.71875, + "learning_rate": 4.497911335785339e-05, + "loss": 0.1884, + "step": 2100 + }, + { + "epoch": 0.21436553896169866, + "grad_norm": 0.79296875, + "learning_rate": 4.4930552991794e-05, + "loss": 0.1872, + "step": 2110 + }, + { + "epoch": 0.21538148938331808, + "grad_norm": 3.171875, + "learning_rate": 4.4881785404950474e-05, + "loss": 0.2233, + "step": 2120 + }, + { + "epoch": 0.21639743980493753, + "grad_norm": 2.59375, + "learning_rate": 4.483281110436631e-05, + "loss": 0.2374, + "step": 2130 + }, + { + "epoch": 0.21741339022655695, + "grad_norm": 3.328125, + "learning_rate": 4.478363059923426e-05, + "loss": 0.2545, + "step": 2140 + }, + { + "epoch": 0.21842934064817637, + "grad_norm": 2.3125, + "learning_rate": 4.4734244400891014e-05, + "loss": 0.2063, + "step": 2150 + }, + { + "epoch": 0.2194452910697958, + "grad_norm": 3.40625, + "learning_rate": 4.4684653022811865e-05, + "loss": 0.1219, + "step": 2160 + }, + { + "epoch": 0.22046124149141522, + "grad_norm": 4.1875, + "learning_rate": 4.463485698060541e-05, + "loss": 0.2805, + "step": 2170 + }, + { + "epoch": 0.22147719191303464, + "grad_norm": 2.3125, + "learning_rate": 4.458485679200814e-05, + "loss": 0.1998, + "step": 2180 + }, + { + "epoch": 0.22249314233465406, + "grad_norm": 3.578125, + "learning_rate": 4.453465297687912e-05, + "loss": 0.2489, + "step": 2190 + }, + { + "epoch": 0.22350909275627348, + "grad_norm": 2.59375, + "learning_rate": 4.448424605719452e-05, + "loss": 0.2731, + "step": 2200 + }, + { + "epoch": 0.22452504317789293, + "grad_norm": 3.28125, + "learning_rate": 4.443363655704224e-05, + "loss": 0.2425, + "step": 2210 + }, + { + "epoch": 0.22554099359951235, + "grad_norm": 2.78125, + "learning_rate": 4.438282500261641e-05, + "loss": 0.2938, + "step": 2220 + }, + { + "epoch": 0.22655694402113177, + "grad_norm": 1.1953125, + "learning_rate": 4.433181192221197e-05, + "loss": 0.1728, + "step": 2230 + }, + { + "epoch": 0.2275728944427512, + "grad_norm": 1.34375, + "learning_rate": 4.4280597846219155e-05, + "loss": 0.216, + "step": 2240 + }, + { + "epoch": 0.22858884486437062, + "grad_norm": 1.8515625, + "learning_rate": 4.422918330711796e-05, + "loss": 0.1612, + "step": 2250 + }, + { + "epoch": 0.22960479528599004, + "grad_norm": 1.90625, + "learning_rate": 4.417756883947263e-05, + "loss": 0.107, + "step": 2260 + }, + { + "epoch": 0.23062074570760946, + "grad_norm": 3.375, + "learning_rate": 4.412575497992611e-05, + "loss": 0.1756, + "step": 2270 + }, + { + "epoch": 0.23163669612922888, + "grad_norm": 4.375, + "learning_rate": 4.407374226719445e-05, + "loss": 0.234, + "step": 2280 + }, + { + "epoch": 0.23265264655084833, + "grad_norm": 3.25, + "learning_rate": 4.402153124206119e-05, + "loss": 0.2144, + "step": 2290 + }, + { + "epoch": 0.23366859697246775, + "grad_norm": 1.703125, + "learning_rate": 4.396912244737173e-05, + "loss": 0.1696, + "step": 2300 + }, + { + "epoch": 0.23468454739408717, + "grad_norm": 2.84375, + "learning_rate": 4.391651642802778e-05, + "loss": 0.2506, + "step": 2310 + }, + { + "epoch": 0.2357004978157066, + "grad_norm": 4.5, + "learning_rate": 4.386371373098155e-05, + "loss": 0.1686, + "step": 2320 + }, + { + "epoch": 0.23671644823732602, + "grad_norm": 2.515625, + "learning_rate": 4.381071490523018e-05, + "loss": 0.2403, + "step": 2330 + }, + { + "epoch": 0.23773239865894544, + "grad_norm": 4.4375, + "learning_rate": 4.3757520501809955e-05, + "loss": 0.1611, + "step": 2340 + }, + { + "epoch": 0.23874834908056486, + "grad_norm": 1.609375, + "learning_rate": 4.370413107379065e-05, + "loss": 0.1698, + "step": 2350 + }, + { + "epoch": 0.23976429950218428, + "grad_norm": 4.96875, + "learning_rate": 4.36505471762697e-05, + "loss": 0.1928, + "step": 2360 + }, + { + "epoch": 0.24078024992380373, + "grad_norm": 0.8984375, + "learning_rate": 4.3596769366366474e-05, + "loss": 0.2035, + "step": 2370 + }, + { + "epoch": 0.24179620034542315, + "grad_norm": 5.75, + "learning_rate": 4.354279820321649e-05, + "loss": 0.16, + "step": 2380 + }, + { + "epoch": 0.24281215076704257, + "grad_norm": 1.9453125, + "learning_rate": 4.34886342479656e-05, + "loss": 0.1851, + "step": 2390 + }, + { + "epoch": 0.243828101188662, + "grad_norm": 1.015625, + "learning_rate": 4.34342780637641e-05, + "loss": 0.1726, + "step": 2400 + }, + { + "epoch": 0.24484405161028142, + "grad_norm": 4.59375, + "learning_rate": 4.337973021576095e-05, + "loss": 0.2847, + "step": 2410 + }, + { + "epoch": 0.24586000203190084, + "grad_norm": 1.03125, + "learning_rate": 4.3324991271097846e-05, + "loss": 0.2528, + "step": 2420 + }, + { + "epoch": 0.24687595245352026, + "grad_norm": 2.1875, + "learning_rate": 4.3270061798903374e-05, + "loss": 0.1573, + "step": 2430 + }, + { + "epoch": 0.24789190287513968, + "grad_norm": 0.98046875, + "learning_rate": 4.321494237028701e-05, + "loss": 0.1703, + "step": 2440 + }, + { + "epoch": 0.24890785329675913, + "grad_norm": 3.8125, + "learning_rate": 4.31596335583333e-05, + "loss": 0.2613, + "step": 2450 + }, + { + "epoch": 0.24992380371837855, + "grad_norm": 4.0625, + "learning_rate": 4.310413593809579e-05, + "loss": 0.22, + "step": 2460 + }, + { + "epoch": 0.250939754139998, + "grad_norm": 3.15625, + "learning_rate": 4.304845008659108e-05, + "loss": 0.1263, + "step": 2470 + }, + { + "epoch": 0.25195570456161737, + "grad_norm": 3.046875, + "learning_rate": 4.2992576582792895e-05, + "loss": 0.1639, + "step": 2480 + }, + { + "epoch": 0.2529716549832368, + "grad_norm": 9.8125, + "learning_rate": 4.293651600762595e-05, + "loss": 0.2681, + "step": 2490 + }, + { + "epoch": 0.25398760540485626, + "grad_norm": 3.734375, + "learning_rate": 4.288026894395999e-05, + "loss": 0.2292, + "step": 2500 + }, + { + "epoch": 0.25500355582647566, + "grad_norm": 0.455078125, + "learning_rate": 4.2823835976603723e-05, + "loss": 0.2324, + "step": 2510 + }, + { + "epoch": 0.2560195062480951, + "grad_norm": 5.625, + "learning_rate": 4.276721769229869e-05, + "loss": 0.1834, + "step": 2520 + }, + { + "epoch": 0.2570354566697145, + "grad_norm": 1.3671875, + "learning_rate": 4.271041467971323e-05, + "loss": 0.1826, + "step": 2530 + }, + { + "epoch": 0.25805140709133395, + "grad_norm": 5.0625, + "learning_rate": 4.265342752943632e-05, + "loss": 0.2463, + "step": 2540 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 2.859375, + "learning_rate": 4.2596256833971425e-05, + "loss": 0.2598, + "step": 2550 + }, + { + "epoch": 0.2600833079345728, + "grad_norm": 1.8515625, + "learning_rate": 4.2538903187730374e-05, + "loss": 0.1148, + "step": 2560 + }, + { + "epoch": 0.26109925835619224, + "grad_norm": 2.71875, + "learning_rate": 4.248136718702716e-05, + "loss": 0.2123, + "step": 2570 + }, + { + "epoch": 0.26211520877781164, + "grad_norm": 4.5625, + "learning_rate": 4.242364943007172e-05, + "loss": 0.2369, + "step": 2580 + }, + { + "epoch": 0.2631311591994311, + "grad_norm": 2.296875, + "learning_rate": 4.236575051696377e-05, + "loss": 0.261, + "step": 2590 + }, + { + "epoch": 0.2641471096210505, + "grad_norm": 2.75, + "learning_rate": 4.2307671049686514e-05, + "loss": 0.1564, + "step": 2600 + }, + { + "epoch": 0.26516306004266993, + "grad_norm": 3.5, + "learning_rate": 4.2249411632100396e-05, + "loss": 0.1563, + "step": 2610 + }, + { + "epoch": 0.2661790104642893, + "grad_norm": 2.84375, + "learning_rate": 4.219097286993684e-05, + "loss": 0.1697, + "step": 2620 + }, + { + "epoch": 0.26719496088590877, + "grad_norm": 2.125, + "learning_rate": 4.2132355370791946e-05, + "loss": 0.1844, + "step": 2630 + }, + { + "epoch": 0.26821091130752817, + "grad_norm": 4.03125, + "learning_rate": 4.2073559744120156e-05, + "loss": 0.2144, + "step": 2640 + }, + { + "epoch": 0.2692268617291476, + "grad_norm": 2.375, + "learning_rate": 4.201458660122793e-05, + "loss": 0.2013, + "step": 2650 + }, + { + "epoch": 0.27024281215076706, + "grad_norm": 3.625, + "learning_rate": 4.1955436555267393e-05, + "loss": 0.2166, + "step": 2660 + }, + { + "epoch": 0.27125876257238646, + "grad_norm": 0.328125, + "learning_rate": 4.189611022122997e-05, + "loss": 0.1934, + "step": 2670 + }, + { + "epoch": 0.2722747129940059, + "grad_norm": 2.75, + "learning_rate": 4.1836608215939944e-05, + "loss": 0.2157, + "step": 2680 + }, + { + "epoch": 0.2732906634156253, + "grad_norm": 3.5, + "learning_rate": 4.17769311580481e-05, + "loss": 0.18, + "step": 2690 + }, + { + "epoch": 0.27430661383724475, + "grad_norm": 2.109375, + "learning_rate": 4.171707966802528e-05, + "loss": 0.2178, + "step": 2700 + }, + { + "epoch": 0.27532256425886414, + "grad_norm": 4.65625, + "learning_rate": 4.16570543681559e-05, + "loss": 0.1896, + "step": 2710 + }, + { + "epoch": 0.2763385146804836, + "grad_norm": 4.8125, + "learning_rate": 4.159685588253151e-05, + "loss": 0.1322, + "step": 2720 + }, + { + "epoch": 0.27735446510210304, + "grad_norm": 3.9375, + "learning_rate": 4.153648483704429e-05, + "loss": 0.184, + "step": 2730 + }, + { + "epoch": 0.27837041552372244, + "grad_norm": 4.53125, + "learning_rate": 4.147594185938057e-05, + "loss": 0.2451, + "step": 2740 + }, + { + "epoch": 0.2793863659453419, + "grad_norm": 1.0390625, + "learning_rate": 4.141522757901426e-05, + "loss": 0.2367, + "step": 2750 + }, + { + "epoch": 0.2804023163669613, + "grad_norm": 3.375, + "learning_rate": 4.1354342627200345e-05, + "loss": 0.179, + "step": 2760 + }, + { + "epoch": 0.28141826678858073, + "grad_norm": 2.953125, + "learning_rate": 4.1293287636968286e-05, + "loss": 0.1396, + "step": 2770 + }, + { + "epoch": 0.2824342172102001, + "grad_norm": 2.546875, + "learning_rate": 4.1232063243115485e-05, + "loss": 0.1963, + "step": 2780 + }, + { + "epoch": 0.28345016763181957, + "grad_norm": 5.09375, + "learning_rate": 4.117067008220063e-05, + "loss": 0.2457, + "step": 2790 + }, + { + "epoch": 0.28446611805343897, + "grad_norm": 2.046875, + "learning_rate": 4.110910879253712e-05, + "loss": 0.2262, + "step": 2800 + }, + { + "epoch": 0.2854820684750584, + "grad_norm": 2.1875, + "learning_rate": 4.104738001418641e-05, + "loss": 0.2499, + "step": 2810 + }, + { + "epoch": 0.28649801889667786, + "grad_norm": 2.59375, + "learning_rate": 4.098548438895135e-05, + "loss": 0.1667, + "step": 2820 + }, + { + "epoch": 0.28751396931829726, + "grad_norm": 2.875, + "learning_rate": 4.092342256036954e-05, + "loss": 0.2288, + "step": 2830 + }, + { + "epoch": 0.2885299197399167, + "grad_norm": 3.015625, + "learning_rate": 4.086119517370659e-05, + "loss": 0.2038, + "step": 2840 + }, + { + "epoch": 0.2895458701615361, + "grad_norm": 3.53125, + "learning_rate": 4.0798802875949485e-05, + "loss": 0.181, + "step": 2850 + }, + { + "epoch": 0.29056182058315555, + "grad_norm": 2.296875, + "learning_rate": 4.073624631579975e-05, + "loss": 0.1886, + "step": 2860 + }, + { + "epoch": 0.29157777100477494, + "grad_norm": 3.609375, + "learning_rate": 4.067352614366685e-05, + "loss": 0.2053, + "step": 2870 + }, + { + "epoch": 0.2925937214263944, + "grad_norm": 2.328125, + "learning_rate": 4.061064301166128e-05, + "loss": 0.1409, + "step": 2880 + }, + { + "epoch": 0.29360967184801384, + "grad_norm": 4.9375, + "learning_rate": 4.054759757358787e-05, + "loss": 0.184, + "step": 2890 + }, + { + "epoch": 0.29462562226963324, + "grad_norm": 4.6875, + "learning_rate": 4.048439048493898e-05, + "loss": 0.2306, + "step": 2900 + }, + { + "epoch": 0.2956415726912527, + "grad_norm": 4.09375, + "learning_rate": 4.0421022402887676e-05, + "loss": 0.1914, + "step": 2910 + }, + { + "epoch": 0.2966575231128721, + "grad_norm": 2.3125, + "learning_rate": 4.035749398628088e-05, + "loss": 0.1653, + "step": 2920 + }, + { + "epoch": 0.29767347353449153, + "grad_norm": 2.515625, + "learning_rate": 4.029380589563256e-05, + "loss": 0.1941, + "step": 2930 + }, + { + "epoch": 0.2986894239561109, + "grad_norm": 1.78125, + "learning_rate": 4.02299587931168e-05, + "loss": 0.1117, + "step": 2940 + }, + { + "epoch": 0.29970537437773037, + "grad_norm": 0.8359375, + "learning_rate": 4.0165953342560974e-05, + "loss": 0.1605, + "step": 2950 + }, + { + "epoch": 0.30072132479934977, + "grad_norm": 3.046875, + "learning_rate": 4.010179020943884e-05, + "loss": 0.1726, + "step": 2960 + }, + { + "epoch": 0.3017372752209692, + "grad_norm": 3.453125, + "learning_rate": 4.003747006086357e-05, + "loss": 0.2208, + "step": 2970 + }, + { + "epoch": 0.30275322564258866, + "grad_norm": 2.515625, + "learning_rate": 3.9972993565580866e-05, + "loss": 0.1325, + "step": 2980 + }, + { + "epoch": 0.30376917606420806, + "grad_norm": 3.046875, + "learning_rate": 3.9908361393962e-05, + "loss": 0.2014, + "step": 2990 + }, + { + "epoch": 0.3047851264858275, + "grad_norm": 2.28125, + "learning_rate": 3.984357421799681e-05, + "loss": 0.165, + "step": 3000 + }, + { + "epoch": 0.3058010769074469, + "grad_norm": 5.09375, + "learning_rate": 3.9778632711286756e-05, + "loss": 0.212, + "step": 3010 + }, + { + "epoch": 0.30681702732906635, + "grad_norm": 4.25, + "learning_rate": 3.971353754903788e-05, + "loss": 0.2388, + "step": 3020 + }, + { + "epoch": 0.30783297775068574, + "grad_norm": 2.34375, + "learning_rate": 3.964828940805381e-05, + "loss": 0.2175, + "step": 3030 + }, + { + "epoch": 0.3088489281723052, + "grad_norm": 4.09375, + "learning_rate": 3.95828889667287e-05, + "loss": 0.2088, + "step": 3040 + }, + { + "epoch": 0.30986487859392464, + "grad_norm": 2.359375, + "learning_rate": 3.9517336905040244e-05, + "loss": 0.1913, + "step": 3050 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 1.1640625, + "learning_rate": 3.9451633904542483e-05, + "loss": 0.2185, + "step": 3060 + }, + { + "epoch": 0.3118967794371635, + "grad_norm": 2.59375, + "learning_rate": 3.9385780648358846e-05, + "loss": 0.2072, + "step": 3070 + }, + { + "epoch": 0.3129127298587829, + "grad_norm": 3.015625, + "learning_rate": 3.9319777821174955e-05, + "loss": 0.1902, + "step": 3080 + }, + { + "epoch": 0.31392868028040233, + "grad_norm": 2.375, + "learning_rate": 3.925362610923158e-05, + "loss": 0.259, + "step": 3090 + }, + { + "epoch": 0.3149446307020217, + "grad_norm": 4.65625, + "learning_rate": 3.918732620031742e-05, + "loss": 0.2026, + "step": 3100 + }, + { + "epoch": 0.31596058112364117, + "grad_norm": 2.1875, + "learning_rate": 3.912087878376205e-05, + "loss": 0.1478, + "step": 3110 + }, + { + "epoch": 0.31697653154526056, + "grad_norm": 2.34375, + "learning_rate": 3.905428455042865e-05, + "loss": 0.167, + "step": 3120 + }, + { + "epoch": 0.31799248196688, + "grad_norm": 2.390625, + "learning_rate": 3.898754419270693e-05, + "loss": 0.1629, + "step": 3130 + }, + { + "epoch": 0.31900843238849946, + "grad_norm": 1.546875, + "learning_rate": 3.892065840450583e-05, + "loss": 0.1308, + "step": 3140 + }, + { + "epoch": 0.32002438281011886, + "grad_norm": 4.625, + "learning_rate": 3.885362788124637e-05, + "loss": 0.2008, + "step": 3150 + }, + { + "epoch": 0.3210403332317383, + "grad_norm": 3.8125, + "learning_rate": 3.8786453319854396e-05, + "loss": 0.2225, + "step": 3160 + }, + { + "epoch": 0.3220562836533577, + "grad_norm": 3.015625, + "learning_rate": 3.8719135418753366e-05, + "loss": 0.2243, + "step": 3170 + }, + { + "epoch": 0.32307223407497715, + "grad_norm": 5.6875, + "learning_rate": 3.865167487785702e-05, + "loss": 0.1981, + "step": 3180 + }, + { + "epoch": 0.32408818449659654, + "grad_norm": 4.84375, + "learning_rate": 3.8584072398562164e-05, + "loss": 0.2031, + "step": 3190 + }, + { + "epoch": 0.325104134918216, + "grad_norm": 4.0625, + "learning_rate": 3.851632868374136e-05, + "loss": 0.1621, + "step": 3200 + }, + { + "epoch": 0.32612008533983544, + "grad_norm": 3.421875, + "learning_rate": 3.844844443773562e-05, + "loss": 0.1674, + "step": 3210 + }, + { + "epoch": 0.32713603576145484, + "grad_norm": 1.3671875, + "learning_rate": 3.8380420366347046e-05, + "loss": 0.1502, + "step": 3220 + }, + { + "epoch": 0.3281519861830743, + "grad_norm": 3.734375, + "learning_rate": 3.831225717683157e-05, + "loss": 0.1868, + "step": 3230 + }, + { + "epoch": 0.3291679366046937, + "grad_norm": 2.703125, + "learning_rate": 3.8243955577891534e-05, + "loss": 0.1818, + "step": 3240 + }, + { + "epoch": 0.3301838870263131, + "grad_norm": 3.796875, + "learning_rate": 3.8175516279668335e-05, + "loss": 0.2215, + "step": 3250 + }, + { + "epoch": 0.3311998374479325, + "grad_norm": 3.203125, + "learning_rate": 3.810693999373505e-05, + "loss": 0.2544, + "step": 3260 + }, + { + "epoch": 0.33221578786955197, + "grad_norm": 4.0, + "learning_rate": 3.8038227433089056e-05, + "loss": 0.1175, + "step": 3270 + }, + { + "epoch": 0.33323173829117136, + "grad_norm": 3.625, + "learning_rate": 3.796937931214458e-05, + "loss": 0.2213, + "step": 3280 + }, + { + "epoch": 0.3342476887127908, + "grad_norm": 1.7265625, + "learning_rate": 3.7900396346725296e-05, + "loss": 0.1711, + "step": 3290 + }, + { + "epoch": 0.33526363913441026, + "grad_norm": 3.140625, + "learning_rate": 3.783127925405686e-05, + "loss": 0.2628, + "step": 3300 + }, + { + "epoch": 0.33627958955602966, + "grad_norm": 2.1875, + "learning_rate": 3.77620287527595e-05, + "loss": 0.1671, + "step": 3310 + }, + { + "epoch": 0.3372955399776491, + "grad_norm": 5.28125, + "learning_rate": 3.769264556284048e-05, + "loss": 0.2109, + "step": 3320 + }, + { + "epoch": 0.3383114903992685, + "grad_norm": 2.875, + "learning_rate": 3.762313040568665e-05, + "loss": 0.1978, + "step": 3330 + }, + { + "epoch": 0.33932744082088795, + "grad_norm": 2.234375, + "learning_rate": 3.755348400405697e-05, + "loss": 0.1275, + "step": 3340 + }, + { + "epoch": 0.34034339124250734, + "grad_norm": 1.9453125, + "learning_rate": 3.7483707082074945e-05, + "loss": 0.1482, + "step": 3350 + }, + { + "epoch": 0.3413593416641268, + "grad_norm": 5.40625, + "learning_rate": 3.741380036522111e-05, + "loss": 0.1933, + "step": 3360 + }, + { + "epoch": 0.34237529208574624, + "grad_norm": 4.53125, + "learning_rate": 3.734376458032551e-05, + "loss": 0.1925, + "step": 3370 + }, + { + "epoch": 0.34339124250736563, + "grad_norm": 4.0625, + "learning_rate": 3.727360045556014e-05, + "loss": 0.2297, + "step": 3380 + }, + { + "epoch": 0.3444071929289851, + "grad_norm": 2.53125, + "learning_rate": 3.7203308720431336e-05, + "loss": 0.1704, + "step": 3390 + }, + { + "epoch": 0.3454231433506045, + "grad_norm": 1.859375, + "learning_rate": 3.7132890105772234e-05, + "loss": 0.258, + "step": 3400 + }, + { + "epoch": 0.3464390937722239, + "grad_norm": 3.90625, + "learning_rate": 3.706234534373515e-05, + "loss": 0.2376, + "step": 3410 + }, + { + "epoch": 0.3474550441938433, + "grad_norm": 1.1015625, + "learning_rate": 3.6991675167783985e-05, + "loss": 0.2403, + "step": 3420 + }, + { + "epoch": 0.34847099461546277, + "grad_norm": 1.1640625, + "learning_rate": 3.6920880312686556e-05, + "loss": 0.1642, + "step": 3430 + }, + { + "epoch": 0.34948694503708216, + "grad_norm": 2.875, + "learning_rate": 3.684996151450702e-05, + "loss": 0.1455, + "step": 3440 + }, + { + "epoch": 0.3505028954587016, + "grad_norm": 0.59765625, + "learning_rate": 3.6778919510598155e-05, + "loss": 0.2175, + "step": 3450 + }, + { + "epoch": 0.35151884588032106, + "grad_norm": 0.93359375, + "learning_rate": 3.670775503959376e-05, + "loss": 0.1858, + "step": 3460 + }, + { + "epoch": 0.35253479630194046, + "grad_norm": 4.1875, + "learning_rate": 3.6636468841400917e-05, + "loss": 0.1911, + "step": 3470 + }, + { + "epoch": 0.3535507467235599, + "grad_norm": 3.734375, + "learning_rate": 3.656506165719233e-05, + "loss": 0.2114, + "step": 3480 + }, + { + "epoch": 0.3545666971451793, + "grad_norm": 1.171875, + "learning_rate": 3.649353422939863e-05, + "loss": 0.1841, + "step": 3490 + }, + { + "epoch": 0.35558264756679875, + "grad_norm": 2.53125, + "learning_rate": 3.6421887301700615e-05, + "loss": 0.1505, + "step": 3500 + } + ], + "logging_steps": 10, + "max_steps": 9843, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoints/checkpoint-3500/training_args.bin b/checkpoints/checkpoint-3500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bb0a4ec10ab29ca1942aecdfa4212d352e373f1 --- /dev/null +++ b/checkpoints/checkpoint-3500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 +size 4984 diff --git a/checkpoints/checkpoint-4000/adapter_config.json b/checkpoints/checkpoint-4000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70 --- /dev/null +++ b/checkpoints/checkpoint-4000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "gaussian", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoints/checkpoint-4000/adapter_model.safetensors b/checkpoints/checkpoint-4000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0010e62e10adb58b59a4171d87f3fa37c74f4b27 --- /dev/null +++ b/checkpoints/checkpoint-4000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bfc0a45e20d761b2e036c7f419351073780fc963f478f6726d61bf6913308b5 +size 5919456 diff --git a/checkpoints/checkpoint-4000/optimizer.pt b/checkpoints/checkpoint-4000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7c063fe1aecc2f351c80b43394373815b139c4c7 --- /dev/null +++ b/checkpoints/checkpoint-4000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:030a97b642b9ca1bca5998b5c77cfdcf063fd297b4aded2538f62462ffcb0d5f +size 11930938 diff --git a/checkpoints/checkpoint-4000/rng_state_0.pth b/checkpoints/checkpoint-4000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5c7dcab2c1fa96e39da4e4616e5e8b282a2f6923 --- /dev/null +++ b/checkpoints/checkpoint-4000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdcaaa7cfa8093e078769990ecfb571790fea6d1f1143531eadfe0ef4d53d941 +size 15024 diff --git a/checkpoints/checkpoint-4000/rng_state_1.pth b/checkpoints/checkpoint-4000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..db8664d3862d07cb99bf09b73c3eb39bf3a8cfb0 --- /dev/null +++ b/checkpoints/checkpoint-4000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1e818c945017fe6ffde32b0e04843796059b044ff660b74cf4c5e8397ef5bc3 +size 15024 diff --git a/checkpoints/checkpoint-4000/rng_state_2.pth b/checkpoints/checkpoint-4000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..485a59f7e6666a852992eb89a47736cd878a4cf8 --- /dev/null +++ b/checkpoints/checkpoint-4000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2847ec9ff5d020b7601c16b4794123d4fc5216fa782ba5350f8d8d42d63dec99 +size 15024 diff --git a/checkpoints/checkpoint-4000/rng_state_3.pth b/checkpoints/checkpoint-4000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9f9eb06dbb4d30c19a4bbda9fb0234299c087a7 --- /dev/null +++ b/checkpoints/checkpoint-4000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d3b7dd272ec99203448b0a7997c01b2d5e414ac34fcaf4a77f23495f1864166 +size 15024 diff --git a/checkpoints/checkpoint-4000/scheduler.pt b/checkpoints/checkpoint-4000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f9cc3da0d207e2f0ffb59d7ccbff46198b1d23f --- /dev/null +++ b/checkpoints/checkpoint-4000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa29240df220021e7e33aee35824e879cf6ebdc5ef315b12a0da849e15ca9816 +size 1064 diff --git a/checkpoints/checkpoint-4000/trainer_state.json b/checkpoints/checkpoint-4000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f1ed37679ac992e6c6dcdfdce95ee04973dd71d7 --- /dev/null +++ b/checkpoints/checkpoint-4000/trainer_state.json @@ -0,0 +1,2821 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.40638016864777, + "eval_steps": 500, + "global_step": 4000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001015950421619425, + "grad_norm": 17.625, + "learning_rate": 5e-06, + "loss": 3.4264, + "step": 10 + }, + { + "epoch": 0.00203190084323885, + "grad_norm": 12.5625, + "learning_rate": 1e-05, + "loss": 3.432, + "step": 20 + }, + { + "epoch": 0.003047851264858275, + "grad_norm": 14.0625, + "learning_rate": 1.5e-05, + "loss": 3.23, + "step": 30 + }, + { + "epoch": 0.0040638016864777, + "grad_norm": 12.4375, + "learning_rate": 2e-05, + "loss": 2.9762, + "step": 40 + }, + { + "epoch": 0.005079752108097125, + "grad_norm": 10.0625, + "learning_rate": 2.5e-05, + "loss": 2.6173, + "step": 50 + }, + { + "epoch": 0.00609570252971655, + "grad_norm": 10.1875, + "learning_rate": 3e-05, + "loss": 2.2004, + "step": 60 + }, + { + "epoch": 0.007111652951335975, + "grad_norm": 7.03125, + "learning_rate": 3.5e-05, + "loss": 1.4176, + "step": 70 + }, + { + "epoch": 0.0081276033729554, + "grad_norm": 4.375, + "learning_rate": 4e-05, + "loss": 1.0122, + "step": 80 + }, + { + "epoch": 0.009143553794574825, + "grad_norm": 6.5625, + "learning_rate": 4.5e-05, + "loss": 0.9116, + "step": 90 + }, + { + "epoch": 0.01015950421619425, + "grad_norm": 5.28125, + "learning_rate": 5e-05, + "loss": 0.6832, + "step": 100 + }, + { + "epoch": 0.011175454637813675, + "grad_norm": 5.5, + "learning_rate": 4.9999870035728426e-05, + "loss": 0.7355, + "step": 110 + }, + { + "epoch": 0.0121914050594331, + "grad_norm": 5.1875, + "learning_rate": 4.9999480144264944e-05, + "loss": 0.6673, + "step": 120 + }, + { + "epoch": 0.013207355481052525, + "grad_norm": 4.5, + "learning_rate": 4.9998830329663314e-05, + "loss": 0.6792, + "step": 130 + }, + { + "epoch": 0.01422330590267195, + "grad_norm": 3.9375, + "learning_rate": 4.9997920598679756e-05, + "loss": 0.6207, + "step": 140 + }, + { + "epoch": 0.015239256324291375, + "grad_norm": 3.15625, + "learning_rate": 4.999675096077286e-05, + "loss": 0.483, + "step": 150 + }, + { + "epoch": 0.0162552067459108, + "grad_norm": 5.28125, + "learning_rate": 4.999532142810354e-05, + "loss": 0.5319, + "step": 160 + }, + { + "epoch": 0.017271157167530225, + "grad_norm": 4.59375, + "learning_rate": 4.999363201553483e-05, + "loss": 0.6052, + "step": 170 + }, + { + "epoch": 0.01828710758914965, + "grad_norm": 5.03125, + "learning_rate": 4.9991682740631794e-05, + "loss": 0.4258, + "step": 180 + }, + { + "epoch": 0.019303058010769075, + "grad_norm": 3.859375, + "learning_rate": 4.998947362366133e-05, + "loss": 0.4309, + "step": 190 + }, + { + "epoch": 0.0203190084323885, + "grad_norm": 3.328125, + "learning_rate": 4.998700468759193e-05, + "loss": 0.3957, + "step": 200 + }, + { + "epoch": 0.021334958854007924, + "grad_norm": 4.9375, + "learning_rate": 4.9984275958093475e-05, + "loss": 0.4777, + "step": 210 + }, + { + "epoch": 0.02235090927562735, + "grad_norm": 4.78125, + "learning_rate": 4.998128746353695e-05, + "loss": 0.3549, + "step": 220 + }, + { + "epoch": 0.023366859697246774, + "grad_norm": 4.0625, + "learning_rate": 4.997803923499417e-05, + "loss": 0.4447, + "step": 230 + }, + { + "epoch": 0.0243828101188662, + "grad_norm": 6.375, + "learning_rate": 4.99745313062374e-05, + "loss": 0.3808, + "step": 240 + }, + { + "epoch": 0.025398760540485624, + "grad_norm": 3.59375, + "learning_rate": 4.99707637137391e-05, + "loss": 0.3827, + "step": 250 + }, + { + "epoch": 0.02641471096210505, + "grad_norm": 3.015625, + "learning_rate": 4.996673649667145e-05, + "loss": 0.3694, + "step": 260 + }, + { + "epoch": 0.027430661383724474, + "grad_norm": 2.296875, + "learning_rate": 4.9962449696906e-05, + "loss": 0.3586, + "step": 270 + }, + { + "epoch": 0.0284466118053439, + "grad_norm": 4.125, + "learning_rate": 4.9957903359013214e-05, + "loss": 0.3832, + "step": 280 + }, + { + "epoch": 0.029462562226963324, + "grad_norm": 3.296875, + "learning_rate": 4.995309753026201e-05, + "loss": 0.328, + "step": 290 + }, + { + "epoch": 0.03047851264858275, + "grad_norm": 4.5, + "learning_rate": 4.994803226061927e-05, + "loss": 0.3667, + "step": 300 + }, + { + "epoch": 0.03149446307020217, + "grad_norm": 4.3125, + "learning_rate": 4.994270760274933e-05, + "loss": 0.3811, + "step": 310 + }, + { + "epoch": 0.0325104134918216, + "grad_norm": 3.421875, + "learning_rate": 4.99371236120134e-05, + "loss": 0.3065, + "step": 320 + }, + { + "epoch": 0.03352636391344102, + "grad_norm": 4.6875, + "learning_rate": 4.993128034646902e-05, + "loss": 0.4177, + "step": 330 + }, + { + "epoch": 0.03454231433506045, + "grad_norm": 3.046875, + "learning_rate": 4.992517786686947e-05, + "loss": 0.33, + "step": 340 + }, + { + "epoch": 0.03555826475667987, + "grad_norm": 1.8828125, + "learning_rate": 4.9918816236663077e-05, + "loss": 0.3287, + "step": 350 + }, + { + "epoch": 0.0365742151782993, + "grad_norm": 3.8125, + "learning_rate": 4.991219552199262e-05, + "loss": 0.2934, + "step": 360 + }, + { + "epoch": 0.03759016559991872, + "grad_norm": 4.28125, + "learning_rate": 4.99053157916946e-05, + "loss": 0.3176, + "step": 370 + }, + { + "epoch": 0.03860611602153815, + "grad_norm": 2.609375, + "learning_rate": 4.989817711729856e-05, + "loss": 0.3318, + "step": 380 + }, + { + "epoch": 0.03962206644315757, + "grad_norm": 2.375, + "learning_rate": 4.98907795730263e-05, + "loss": 0.3234, + "step": 390 + }, + { + "epoch": 0.040638016864777, + "grad_norm": 4.46875, + "learning_rate": 4.988312323579114e-05, + "loss": 0.267, + "step": 400 + }, + { + "epoch": 0.04165396728639642, + "grad_norm": 3.75, + "learning_rate": 4.98752081851971e-05, + "loss": 0.3081, + "step": 410 + }, + { + "epoch": 0.04266991770801585, + "grad_norm": 2.203125, + "learning_rate": 4.986703450353809e-05, + "loss": 0.2917, + "step": 420 + }, + { + "epoch": 0.04368586812963527, + "grad_norm": 1.6015625, + "learning_rate": 4.985860227579703e-05, + "loss": 0.2805, + "step": 430 + }, + { + "epoch": 0.0447018185512547, + "grad_norm": 3.140625, + "learning_rate": 4.984991158964499e-05, + "loss": 0.3534, + "step": 440 + }, + { + "epoch": 0.04571776897287412, + "grad_norm": 3.296875, + "learning_rate": 4.9840962535440265e-05, + "loss": 0.335, + "step": 450 + }, + { + "epoch": 0.04673371939449355, + "grad_norm": 3.25, + "learning_rate": 4.983175520622744e-05, + "loss": 0.2544, + "step": 460 + }, + { + "epoch": 0.04774966981611297, + "grad_norm": 2.25, + "learning_rate": 4.982228969773642e-05, + "loss": 0.3449, + "step": 470 + }, + { + "epoch": 0.0487656202377324, + "grad_norm": 4.9375, + "learning_rate": 4.9812566108381435e-05, + "loss": 0.2964, + "step": 480 + }, + { + "epoch": 0.04978157065935182, + "grad_norm": 1.5703125, + "learning_rate": 4.9802584539260035e-05, + "loss": 0.2799, + "step": 490 + }, + { + "epoch": 0.05079752108097125, + "grad_norm": 2.828125, + "learning_rate": 4.979234509415199e-05, + "loss": 0.3231, + "step": 500 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 2.9375, + "learning_rate": 4.978184787951828e-05, + "loss": 0.2943, + "step": 510 + }, + { + "epoch": 0.0528294219242101, + "grad_norm": 2.34375, + "learning_rate": 4.977109300449992e-05, + "loss": 0.2705, + "step": 520 + }, + { + "epoch": 0.05384537234582952, + "grad_norm": 3.140625, + "learning_rate": 4.9760080580916876e-05, + "loss": 0.2998, + "step": 530 + }, + { + "epoch": 0.05486132276744895, + "grad_norm": 3.5625, + "learning_rate": 4.974881072326688e-05, + "loss": 0.2595, + "step": 540 + }, + { + "epoch": 0.05587727318906837, + "grad_norm": 4.25, + "learning_rate": 4.9737283548724236e-05, + "loss": 0.2803, + "step": 550 + }, + { + "epoch": 0.0568932236106878, + "grad_norm": 4.0625, + "learning_rate": 4.97254991771386e-05, + "loss": 0.3511, + "step": 560 + }, + { + "epoch": 0.05790917403230722, + "grad_norm": 2.515625, + "learning_rate": 4.971345773103377e-05, + "loss": 0.312, + "step": 570 + }, + { + "epoch": 0.05892512445392665, + "grad_norm": 3.21875, + "learning_rate": 4.9701159335606365e-05, + "loss": 0.2482, + "step": 580 + }, + { + "epoch": 0.05994107487554607, + "grad_norm": 5.5, + "learning_rate": 4.968860411872454e-05, + "loss": 0.2537, + "step": 590 + }, + { + "epoch": 0.0609570252971655, + "grad_norm": 3.546875, + "learning_rate": 4.967579221092666e-05, + "loss": 0.3125, + "step": 600 + }, + { + "epoch": 0.06197297571878492, + "grad_norm": 2.984375, + "learning_rate": 4.966272374541996e-05, + "loss": 0.2354, + "step": 610 + }, + { + "epoch": 0.06298892614040434, + "grad_norm": 3.6875, + "learning_rate": 4.964939885807912e-05, + "loss": 0.3213, + "step": 620 + }, + { + "epoch": 0.06400487656202378, + "grad_norm": 2.140625, + "learning_rate": 4.9635817687444876e-05, + "loss": 0.3003, + "step": 630 + }, + { + "epoch": 0.0650208269836432, + "grad_norm": 3.484375, + "learning_rate": 4.962198037472259e-05, + "loss": 0.2996, + "step": 640 + }, + { + "epoch": 0.06603677740526262, + "grad_norm": 3.21875, + "learning_rate": 4.9607887063780776e-05, + "loss": 0.2257, + "step": 650 + }, + { + "epoch": 0.06705272782688204, + "grad_norm": 5.375, + "learning_rate": 4.9593537901149564e-05, + "loss": 0.223, + "step": 660 + }, + { + "epoch": 0.06806867824850148, + "grad_norm": 4.1875, + "learning_rate": 4.957893303601924e-05, + "loss": 0.3407, + "step": 670 + }, + { + "epoch": 0.0690846286701209, + "grad_norm": 3.328125, + "learning_rate": 4.956407262023866e-05, + "loss": 0.2589, + "step": 680 + }, + { + "epoch": 0.07010057909174032, + "grad_norm": 2.953125, + "learning_rate": 4.954895680831367e-05, + "loss": 0.2949, + "step": 690 + }, + { + "epoch": 0.07111652951335974, + "grad_norm": 4.0625, + "learning_rate": 4.9533585757405506e-05, + "loss": 0.2995, + "step": 700 + }, + { + "epoch": 0.07213247993497918, + "grad_norm": 4.625, + "learning_rate": 4.951795962732917e-05, + "loss": 0.2894, + "step": 710 + }, + { + "epoch": 0.0731484303565986, + "grad_norm": 3.0, + "learning_rate": 4.9502078580551755e-05, + "loss": 0.3082, + "step": 720 + }, + { + "epoch": 0.07416438077821802, + "grad_norm": 3.65625, + "learning_rate": 4.9485942782190734e-05, + "loss": 0.2308, + "step": 730 + }, + { + "epoch": 0.07518033119983744, + "grad_norm": 4.78125, + "learning_rate": 4.9469552400012306e-05, + "loss": 0.2272, + "step": 740 + }, + { + "epoch": 0.07619628162145688, + "grad_norm": 4.25, + "learning_rate": 4.94529076044296e-05, + "loss": 0.2701, + "step": 750 + }, + { + "epoch": 0.0772122320430763, + "grad_norm": 3.140625, + "learning_rate": 4.94360085685009e-05, + "loss": 0.2686, + "step": 760 + }, + { + "epoch": 0.07822818246469572, + "grad_norm": 0.765625, + "learning_rate": 4.9418855467927894e-05, + "loss": 0.2051, + "step": 770 + }, + { + "epoch": 0.07924413288631514, + "grad_norm": 1.796875, + "learning_rate": 4.940144848105379e-05, + "loss": 0.2267, + "step": 780 + }, + { + "epoch": 0.08026008330793458, + "grad_norm": 4.5625, + "learning_rate": 4.93837877888615e-05, + "loss": 0.2597, + "step": 790 + }, + { + "epoch": 0.081276033729554, + "grad_norm": 3.03125, + "learning_rate": 4.9365873574971745e-05, + "loss": 0.3701, + "step": 800 + }, + { + "epoch": 0.08229198415117342, + "grad_norm": 4.5625, + "learning_rate": 4.9347706025641136e-05, + "loss": 0.2559, + "step": 810 + }, + { + "epoch": 0.08330793457279284, + "grad_norm": 3.90625, + "learning_rate": 4.9329285329760275e-05, + "loss": 0.2799, + "step": 820 + }, + { + "epoch": 0.08432388499441228, + "grad_norm": 3.140625, + "learning_rate": 4.9310611678851735e-05, + "loss": 0.2866, + "step": 830 + }, + { + "epoch": 0.0853398354160317, + "grad_norm": 2.46875, + "learning_rate": 4.929168526706811e-05, + "loss": 0.3105, + "step": 840 + }, + { + "epoch": 0.08635578583765112, + "grad_norm": 13.625, + "learning_rate": 4.927250629119e-05, + "loss": 0.2454, + "step": 850 + }, + { + "epoch": 0.08737173625927054, + "grad_norm": 3.921875, + "learning_rate": 4.9253074950623925e-05, + "loss": 0.2424, + "step": 860 + }, + { + "epoch": 0.08838768668088998, + "grad_norm": 2.90625, + "learning_rate": 4.9233391447400286e-05, + "loss": 0.2481, + "step": 870 + }, + { + "epoch": 0.0894036371025094, + "grad_norm": 2.96875, + "learning_rate": 4.921345598617125e-05, + "loss": 0.2231, + "step": 880 + }, + { + "epoch": 0.09041958752412882, + "grad_norm": 5.375, + "learning_rate": 4.9193268774208654e-05, + "loss": 0.3447, + "step": 890 + }, + { + "epoch": 0.09143553794574824, + "grad_norm": 2.0, + "learning_rate": 4.9172830021401785e-05, + "loss": 0.229, + "step": 900 + }, + { + "epoch": 0.09245148836736768, + "grad_norm": 3.1875, + "learning_rate": 4.9152139940255245e-05, + "loss": 0.2122, + "step": 910 + }, + { + "epoch": 0.0934674387889871, + "grad_norm": 3.40625, + "learning_rate": 4.913119874588677e-05, + "loss": 0.2386, + "step": 920 + }, + { + "epoch": 0.09448338921060652, + "grad_norm": 1.4609375, + "learning_rate": 4.911000665602489e-05, + "loss": 0.1944, + "step": 930 + }, + { + "epoch": 0.09549933963222594, + "grad_norm": 5.0625, + "learning_rate": 4.9088563891006786e-05, + "loss": 0.2038, + "step": 940 + }, + { + "epoch": 0.09651529005384538, + "grad_norm": 4.53125, + "learning_rate": 4.906687067377592e-05, + "loss": 0.3122, + "step": 950 + }, + { + "epoch": 0.0975312404754648, + "grad_norm": 2.84375, + "learning_rate": 4.904492722987976e-05, + "loss": 0.3157, + "step": 960 + }, + { + "epoch": 0.09854719089708422, + "grad_norm": 2.171875, + "learning_rate": 4.902273378746738e-05, + "loss": 0.3077, + "step": 970 + }, + { + "epoch": 0.09956314131870364, + "grad_norm": 2.84375, + "learning_rate": 4.9000290577287165e-05, + "loss": 0.2756, + "step": 980 + }, + { + "epoch": 0.10057909174032308, + "grad_norm": 0.99609375, + "learning_rate": 4.897759783268434e-05, + "loss": 0.2915, + "step": 990 + }, + { + "epoch": 0.1015950421619425, + "grad_norm": 3.53125, + "learning_rate": 4.895465578959859e-05, + "loss": 0.2052, + "step": 1000 + }, + { + "epoch": 0.10261099258356192, + "grad_norm": 4.0, + "learning_rate": 4.893146468656159e-05, + "loss": 0.2499, + "step": 1010 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 1.65625, + "learning_rate": 4.890802476469452e-05, + "loss": 0.278, + "step": 1020 + }, + { + "epoch": 0.10464289342680078, + "grad_norm": 3.625, + "learning_rate": 4.888433626770558e-05, + "loss": 0.2143, + "step": 1030 + }, + { + "epoch": 0.1056588438484202, + "grad_norm": 5.0625, + "learning_rate": 4.886039944188741e-05, + "loss": 0.2878, + "step": 1040 + }, + { + "epoch": 0.10667479427003962, + "grad_norm": 4.5, + "learning_rate": 4.883621453611461e-05, + "loss": 0.2744, + "step": 1050 + }, + { + "epoch": 0.10769074469165904, + "grad_norm": 4.5625, + "learning_rate": 4.881178180184106e-05, + "loss": 0.2734, + "step": 1060 + }, + { + "epoch": 0.10870669511327848, + "grad_norm": 3.125, + "learning_rate": 4.878710149309735e-05, + "loss": 0.3574, + "step": 1070 + }, + { + "epoch": 0.1097226455348979, + "grad_norm": 3.0625, + "learning_rate": 4.876217386648816e-05, + "loss": 0.2625, + "step": 1080 + }, + { + "epoch": 0.11073859595651732, + "grad_norm": 4.0625, + "learning_rate": 4.873699918118955e-05, + "loss": 0.2437, + "step": 1090 + }, + { + "epoch": 0.11175454637813674, + "grad_norm": 1.59375, + "learning_rate": 4.87115776989463e-05, + "loss": 0.2051, + "step": 1100 + }, + { + "epoch": 0.11277049679975618, + "grad_norm": 4.375, + "learning_rate": 4.8685909684069153e-05, + "loss": 0.1727, + "step": 1110 + }, + { + "epoch": 0.1137864472213756, + "grad_norm": 2.28125, + "learning_rate": 4.865999540343211e-05, + "loss": 0.2256, + "step": 1120 + }, + { + "epoch": 0.11480239764299502, + "grad_norm": 2.265625, + "learning_rate": 4.86338351264696e-05, + "loss": 0.3529, + "step": 1130 + }, + { + "epoch": 0.11581834806461444, + "grad_norm": 2.34375, + "learning_rate": 4.8607429125173754e-05, + "loss": 0.2113, + "step": 1140 + }, + { + "epoch": 0.11683429848623388, + "grad_norm": 0.7578125, + "learning_rate": 4.858077767409149e-05, + "loss": 0.2759, + "step": 1150 + }, + { + "epoch": 0.1178502489078533, + "grad_norm": 3.640625, + "learning_rate": 4.855388105032174e-05, + "loss": 0.2482, + "step": 1160 + }, + { + "epoch": 0.11886619932947272, + "grad_norm": 3.5, + "learning_rate": 4.852673953351249e-05, + "loss": 0.1865, + "step": 1170 + }, + { + "epoch": 0.11988214975109214, + "grad_norm": 3.75, + "learning_rate": 4.849935340585796e-05, + "loss": 0.2659, + "step": 1180 + }, + { + "epoch": 0.12089810017271158, + "grad_norm": 3.375, + "learning_rate": 4.8471722952095586e-05, + "loss": 0.1506, + "step": 1190 + }, + { + "epoch": 0.121914050594331, + "grad_norm": 3.34375, + "learning_rate": 4.844384845950312e-05, + "loss": 0.307, + "step": 1200 + }, + { + "epoch": 0.12293000101595042, + "grad_norm": 1.578125, + "learning_rate": 4.841573021789561e-05, + "loss": 0.1952, + "step": 1210 + }, + { + "epoch": 0.12394595143756984, + "grad_norm": 1.2890625, + "learning_rate": 4.838736851962239e-05, + "loss": 0.1779, + "step": 1220 + }, + { + "epoch": 0.12496190185918928, + "grad_norm": 1.265625, + "learning_rate": 4.835876365956408e-05, + "loss": 0.1235, + "step": 1230 + }, + { + "epoch": 0.12597785228080868, + "grad_norm": 1.9609375, + "learning_rate": 4.8329915935129436e-05, + "loss": 0.1876, + "step": 1240 + }, + { + "epoch": 0.12699380270242813, + "grad_norm": 1.6328125, + "learning_rate": 4.830082564625235e-05, + "loss": 0.2188, + "step": 1250 + }, + { + "epoch": 0.12800975312404755, + "grad_norm": 3.96875, + "learning_rate": 4.8271493095388684e-05, + "loss": 0.2622, + "step": 1260 + }, + { + "epoch": 0.12902570354566698, + "grad_norm": 3.765625, + "learning_rate": 4.824191858751312e-05, + "loss": 0.2724, + "step": 1270 + }, + { + "epoch": 0.1300416539672864, + "grad_norm": 5.59375, + "learning_rate": 4.821210243011601e-05, + "loss": 0.2413, + "step": 1280 + }, + { + "epoch": 0.13105760438890582, + "grad_norm": 3.34375, + "learning_rate": 4.818204493320016e-05, + "loss": 0.2618, + "step": 1290 + }, + { + "epoch": 0.13207355481052524, + "grad_norm": 2.78125, + "learning_rate": 4.8151746409277634e-05, + "loss": 0.2295, + "step": 1300 + }, + { + "epoch": 0.13308950523214466, + "grad_norm": 3.1875, + "learning_rate": 4.8121207173366484e-05, + "loss": 0.2733, + "step": 1310 + }, + { + "epoch": 0.13410545565376408, + "grad_norm": 2.28125, + "learning_rate": 4.809042754298746e-05, + "loss": 0.2311, + "step": 1320 + }, + { + "epoch": 0.13512140607538353, + "grad_norm": 2.171875, + "learning_rate": 4.805940783816075e-05, + "loss": 0.2059, + "step": 1330 + }, + { + "epoch": 0.13613735649700295, + "grad_norm": 2.796875, + "learning_rate": 4.8028148381402625e-05, + "loss": 0.2102, + "step": 1340 + }, + { + "epoch": 0.13715330691862238, + "grad_norm": 2.96875, + "learning_rate": 4.7996649497722084e-05, + "loss": 0.2708, + "step": 1350 + }, + { + "epoch": 0.1381692573402418, + "grad_norm": 2.4375, + "learning_rate": 4.7964911514617485e-05, + "loss": 0.2429, + "step": 1360 + }, + { + "epoch": 0.13918520776186122, + "grad_norm": 5.8125, + "learning_rate": 4.793293476207312e-05, + "loss": 0.2725, + "step": 1370 + }, + { + "epoch": 0.14020115818348064, + "grad_norm": 2.40625, + "learning_rate": 4.790071957255585e-05, + "loss": 0.2098, + "step": 1380 + }, + { + "epoch": 0.14121710860510006, + "grad_norm": 4.25, + "learning_rate": 4.786826628101154e-05, + "loss": 0.2101, + "step": 1390 + }, + { + "epoch": 0.14223305902671948, + "grad_norm": 2.578125, + "learning_rate": 4.783557522486167e-05, + "loss": 0.2624, + "step": 1400 + }, + { + "epoch": 0.14324900944833893, + "grad_norm": 3.125, + "learning_rate": 4.780264674399978e-05, + "loss": 0.2518, + "step": 1410 + }, + { + "epoch": 0.14426495986995835, + "grad_norm": 3.671875, + "learning_rate": 4.7769481180787966e-05, + "loss": 0.3112, + "step": 1420 + }, + { + "epoch": 0.14528091029157778, + "grad_norm": 3.984375, + "learning_rate": 4.773607888005327e-05, + "loss": 0.2747, + "step": 1430 + }, + { + "epoch": 0.1462968607131972, + "grad_norm": 3.234375, + "learning_rate": 4.770244018908416e-05, + "loss": 0.1572, + "step": 1440 + }, + { + "epoch": 0.14731281113481662, + "grad_norm": 4.09375, + "learning_rate": 4.766856545762687e-05, + "loss": 0.2148, + "step": 1450 + }, + { + "epoch": 0.14832876155643604, + "grad_norm": 1.6875, + "learning_rate": 4.763445503788178e-05, + "loss": 0.2531, + "step": 1460 + }, + { + "epoch": 0.14934471197805546, + "grad_norm": 2.375, + "learning_rate": 4.760010928449976e-05, + "loss": 0.199, + "step": 1470 + }, + { + "epoch": 0.15036066239967488, + "grad_norm": 4.6875, + "learning_rate": 4.7565528554578485e-05, + "loss": 0.2366, + "step": 1480 + }, + { + "epoch": 0.15137661282129433, + "grad_norm": 5.4375, + "learning_rate": 4.75307132076587e-05, + "loss": 0.1862, + "step": 1490 + }, + { + "epoch": 0.15239256324291375, + "grad_norm": 2.484375, + "learning_rate": 4.749566360572049e-05, + "loss": 0.2143, + "step": 1500 + }, + { + "epoch": 0.15340851366453317, + "grad_norm": 2.1875, + "learning_rate": 4.746038011317955e-05, + "loss": 0.1877, + "step": 1510 + }, + { + "epoch": 0.1544244640861526, + "grad_norm": 2.84375, + "learning_rate": 4.742486309688333e-05, + "loss": 0.2831, + "step": 1520 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 2.015625, + "learning_rate": 4.738911292610732e-05, + "loss": 0.1708, + "step": 1530 + }, + { + "epoch": 0.15645636492939144, + "grad_norm": 3.953125, + "learning_rate": 4.735312997255107e-05, + "loss": 0.192, + "step": 1540 + }, + { + "epoch": 0.15747231535101086, + "grad_norm": 2.09375, + "learning_rate": 4.7316914610334475e-05, + "loss": 0.2586, + "step": 1550 + }, + { + "epoch": 0.15848826577263028, + "grad_norm": 3.6875, + "learning_rate": 4.728046721599378e-05, + "loss": 0.2141, + "step": 1560 + }, + { + "epoch": 0.15950421619424973, + "grad_norm": 2.9375, + "learning_rate": 4.724378816847771e-05, + "loss": 0.193, + "step": 1570 + }, + { + "epoch": 0.16052016661586915, + "grad_norm": 1.5625, + "learning_rate": 4.720687784914352e-05, + "loss": 0.191, + "step": 1580 + }, + { + "epoch": 0.16153611703748857, + "grad_norm": 3.75, + "learning_rate": 4.716973664175304e-05, + "loss": 0.2172, + "step": 1590 + }, + { + "epoch": 0.162552067459108, + "grad_norm": 3.125, + "learning_rate": 4.7132364932468645e-05, + "loss": 0.2134, + "step": 1600 + }, + { + "epoch": 0.16356801788072742, + "grad_norm": 4.09375, + "learning_rate": 4.709476310984932e-05, + "loss": 0.2055, + "step": 1610 + }, + { + "epoch": 0.16458396830234684, + "grad_norm": 3.875, + "learning_rate": 4.705693156484652e-05, + "loss": 0.2136, + "step": 1620 + }, + { + "epoch": 0.16559991872396626, + "grad_norm": 1.1796875, + "learning_rate": 4.7018870690800196e-05, + "loss": 0.1471, + "step": 1630 + }, + { + "epoch": 0.16661586914558568, + "grad_norm": 2.5, + "learning_rate": 4.698058088343465e-05, + "loss": 0.2308, + "step": 1640 + }, + { + "epoch": 0.16763181956720513, + "grad_norm": 1.390625, + "learning_rate": 4.6942062540854425e-05, + "loss": 0.2456, + "step": 1650 + }, + { + "epoch": 0.16864776998882455, + "grad_norm": 3.125, + "learning_rate": 4.69033160635402e-05, + "loss": 0.2654, + "step": 1660 + }, + { + "epoch": 0.16966372041044397, + "grad_norm": 3.984375, + "learning_rate": 4.6864341854344587e-05, + "loss": 0.2226, + "step": 1670 + }, + { + "epoch": 0.1706796708320634, + "grad_norm": 2.328125, + "learning_rate": 4.682514031848795e-05, + "loss": 0.2438, + "step": 1680 + }, + { + "epoch": 0.17169562125368282, + "grad_norm": 3.078125, + "learning_rate": 4.678571186355423e-05, + "loss": 0.1889, + "step": 1690 + }, + { + "epoch": 0.17271157167530224, + "grad_norm": 3.328125, + "learning_rate": 4.6746056899486644e-05, + "loss": 0.2117, + "step": 1700 + }, + { + "epoch": 0.17372752209692166, + "grad_norm": 2.78125, + "learning_rate": 4.67061758385835e-05, + "loss": 0.1953, + "step": 1710 + }, + { + "epoch": 0.17474347251854108, + "grad_norm": 3.09375, + "learning_rate": 4.6666069095493816e-05, + "loss": 0.1844, + "step": 1720 + }, + { + "epoch": 0.17575942294016053, + "grad_norm": 3.234375, + "learning_rate": 4.662573708721309e-05, + "loss": 0.2774, + "step": 1730 + }, + { + "epoch": 0.17677537336177995, + "grad_norm": 4.03125, + "learning_rate": 4.658518023307894e-05, + "loss": 0.2527, + "step": 1740 + }, + { + "epoch": 0.17779132378339937, + "grad_norm": 3.21875, + "learning_rate": 4.654439895476671e-05, + "loss": 0.2164, + "step": 1750 + }, + { + "epoch": 0.1788072742050188, + "grad_norm": 2.390625, + "learning_rate": 4.6503393676285146e-05, + "loss": 0.2424, + "step": 1760 + }, + { + "epoch": 0.17982322462663822, + "grad_norm": 1.8359375, + "learning_rate": 4.646216482397192e-05, + "loss": 0.2428, + "step": 1770 + }, + { + "epoch": 0.18083917504825764, + "grad_norm": 2.796875, + "learning_rate": 4.6420712826489275e-05, + "loss": 0.2155, + "step": 1780 + }, + { + "epoch": 0.18185512546987706, + "grad_norm": 0.69921875, + "learning_rate": 4.6379038114819485e-05, + "loss": 0.1544, + "step": 1790 + }, + { + "epoch": 0.18287107589149648, + "grad_norm": 3.40625, + "learning_rate": 4.6337141122260444e-05, + "loss": 0.2029, + "step": 1800 + }, + { + "epoch": 0.18388702631311593, + "grad_norm": 2.359375, + "learning_rate": 4.629502228442112e-05, + "loss": 0.1489, + "step": 1810 + }, + { + "epoch": 0.18490297673473535, + "grad_norm": 1.4453125, + "learning_rate": 4.6252682039217045e-05, + "loss": 0.2101, + "step": 1820 + }, + { + "epoch": 0.18591892715635477, + "grad_norm": 2.71875, + "learning_rate": 4.621012082686573e-05, + "loss": 0.2076, + "step": 1830 + }, + { + "epoch": 0.1869348775779742, + "grad_norm": 3.0625, + "learning_rate": 4.616733908988216e-05, + "loss": 0.2719, + "step": 1840 + }, + { + "epoch": 0.18795082799959362, + "grad_norm": 1.953125, + "learning_rate": 4.612433727307409e-05, + "loss": 0.2105, + "step": 1850 + }, + { + "epoch": 0.18896677842121304, + "grad_norm": 3.46875, + "learning_rate": 4.608111582353751e-05, + "loss": 0.1877, + "step": 1860 + }, + { + "epoch": 0.18998272884283246, + "grad_norm": 2.546875, + "learning_rate": 4.603767519065197e-05, + "loss": 0.2238, + "step": 1870 + }, + { + "epoch": 0.19099867926445188, + "grad_norm": 1.5703125, + "learning_rate": 4.599401582607589e-05, + "loss": 0.243, + "step": 1880 + }, + { + "epoch": 0.19201462968607133, + "grad_norm": 2.4375, + "learning_rate": 4.595013818374185e-05, + "loss": 0.1867, + "step": 1890 + }, + { + "epoch": 0.19303058010769075, + "grad_norm": 2.203125, + "learning_rate": 4.5906042719851925e-05, + "loss": 0.1994, + "step": 1900 + }, + { + "epoch": 0.19404653052931017, + "grad_norm": 3.984375, + "learning_rate": 4.586172989287291e-05, + "loss": 0.1899, + "step": 1910 + }, + { + "epoch": 0.1950624809509296, + "grad_norm": 2.6875, + "learning_rate": 4.5817200163531534e-05, + "loss": 0.2528, + "step": 1920 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 2.71875, + "learning_rate": 4.577245399480972e-05, + "loss": 0.2336, + "step": 1930 + }, + { + "epoch": 0.19709438179416844, + "grad_norm": 2.640625, + "learning_rate": 4.5727491851939715e-05, + "loss": 0.2204, + "step": 1940 + }, + { + "epoch": 0.19811033221578786, + "grad_norm": 1.78125, + "learning_rate": 4.568231420239929e-05, + "loss": 0.1656, + "step": 1950 + }, + { + "epoch": 0.19912628263740728, + "grad_norm": 3.15625, + "learning_rate": 4.563692151590687e-05, + "loss": 0.2105, + "step": 1960 + }, + { + "epoch": 0.20014223305902673, + "grad_norm": 1.3671875, + "learning_rate": 4.5591314264416666e-05, + "loss": 0.1464, + "step": 1970 + }, + { + "epoch": 0.20115818348064615, + "grad_norm": 4.25, + "learning_rate": 4.554549292211371e-05, + "loss": 0.2103, + "step": 1980 + }, + { + "epoch": 0.20217413390226557, + "grad_norm": 2.984375, + "learning_rate": 4.549945796540901e-05, + "loss": 0.144, + "step": 1990 + }, + { + "epoch": 0.203190084323885, + "grad_norm": 1.859375, + "learning_rate": 4.545320987293453e-05, + "loss": 0.1963, + "step": 2000 + }, + { + "epoch": 0.20420603474550442, + "grad_norm": 1.078125, + "learning_rate": 4.540674912553824e-05, + "loss": 0.2115, + "step": 2010 + }, + { + "epoch": 0.20522198516712384, + "grad_norm": 4.25, + "learning_rate": 4.536007620627911e-05, + "loss": 0.1682, + "step": 2020 + }, + { + "epoch": 0.20623793558874326, + "grad_norm": 2.71875, + "learning_rate": 4.531319160042212e-05, + "loss": 0.1992, + "step": 2030 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 1.2890625, + "learning_rate": 4.5266095795433126e-05, + "loss": 0.1134, + "step": 2040 + }, + { + "epoch": 0.20826983643198213, + "grad_norm": 3.296875, + "learning_rate": 4.5218789280973925e-05, + "loss": 0.1474, + "step": 2050 + }, + { + "epoch": 0.20928578685360155, + "grad_norm": 1.9375, + "learning_rate": 4.5171272548897024e-05, + "loss": 0.1955, + "step": 2060 + }, + { + "epoch": 0.21030173727522097, + "grad_norm": 2.734375, + "learning_rate": 4.512354609324063e-05, + "loss": 0.2042, + "step": 2070 + }, + { + "epoch": 0.2113176876968404, + "grad_norm": 2.921875, + "learning_rate": 4.507561041022347e-05, + "loss": 0.2174, + "step": 2080 + }, + { + "epoch": 0.21233363811845982, + "grad_norm": 2.40625, + "learning_rate": 4.502746599823963e-05, + "loss": 0.2634, + "step": 2090 + }, + { + "epoch": 0.21334958854007924, + "grad_norm": 1.71875, + "learning_rate": 4.497911335785339e-05, + "loss": 0.1884, + "step": 2100 + }, + { + "epoch": 0.21436553896169866, + "grad_norm": 0.79296875, + "learning_rate": 4.4930552991794e-05, + "loss": 0.1872, + "step": 2110 + }, + { + "epoch": 0.21538148938331808, + "grad_norm": 3.171875, + "learning_rate": 4.4881785404950474e-05, + "loss": 0.2233, + "step": 2120 + }, + { + "epoch": 0.21639743980493753, + "grad_norm": 2.59375, + "learning_rate": 4.483281110436631e-05, + "loss": 0.2374, + "step": 2130 + }, + { + "epoch": 0.21741339022655695, + "grad_norm": 3.328125, + "learning_rate": 4.478363059923426e-05, + "loss": 0.2545, + "step": 2140 + }, + { + "epoch": 0.21842934064817637, + "grad_norm": 2.3125, + "learning_rate": 4.4734244400891014e-05, + "loss": 0.2063, + "step": 2150 + }, + { + "epoch": 0.2194452910697958, + "grad_norm": 3.40625, + "learning_rate": 4.4684653022811865e-05, + "loss": 0.1219, + "step": 2160 + }, + { + "epoch": 0.22046124149141522, + "grad_norm": 4.1875, + "learning_rate": 4.463485698060541e-05, + "loss": 0.2805, + "step": 2170 + }, + { + "epoch": 0.22147719191303464, + "grad_norm": 2.3125, + "learning_rate": 4.458485679200814e-05, + "loss": 0.1998, + "step": 2180 + }, + { + "epoch": 0.22249314233465406, + "grad_norm": 3.578125, + "learning_rate": 4.453465297687912e-05, + "loss": 0.2489, + "step": 2190 + }, + { + "epoch": 0.22350909275627348, + "grad_norm": 2.59375, + "learning_rate": 4.448424605719452e-05, + "loss": 0.2731, + "step": 2200 + }, + { + "epoch": 0.22452504317789293, + "grad_norm": 3.28125, + "learning_rate": 4.443363655704224e-05, + "loss": 0.2425, + "step": 2210 + }, + { + "epoch": 0.22554099359951235, + "grad_norm": 2.78125, + "learning_rate": 4.438282500261641e-05, + "loss": 0.2938, + "step": 2220 + }, + { + "epoch": 0.22655694402113177, + "grad_norm": 1.1953125, + "learning_rate": 4.433181192221197e-05, + "loss": 0.1728, + "step": 2230 + }, + { + "epoch": 0.2275728944427512, + "grad_norm": 1.34375, + "learning_rate": 4.4280597846219155e-05, + "loss": 0.216, + "step": 2240 + }, + { + "epoch": 0.22858884486437062, + "grad_norm": 1.8515625, + "learning_rate": 4.422918330711796e-05, + "loss": 0.1612, + "step": 2250 + }, + { + "epoch": 0.22960479528599004, + "grad_norm": 1.90625, + "learning_rate": 4.417756883947263e-05, + "loss": 0.107, + "step": 2260 + }, + { + "epoch": 0.23062074570760946, + "grad_norm": 3.375, + "learning_rate": 4.412575497992611e-05, + "loss": 0.1756, + "step": 2270 + }, + { + "epoch": 0.23163669612922888, + "grad_norm": 4.375, + "learning_rate": 4.407374226719445e-05, + "loss": 0.234, + "step": 2280 + }, + { + "epoch": 0.23265264655084833, + "grad_norm": 3.25, + "learning_rate": 4.402153124206119e-05, + "loss": 0.2144, + "step": 2290 + }, + { + "epoch": 0.23366859697246775, + "grad_norm": 1.703125, + "learning_rate": 4.396912244737173e-05, + "loss": 0.1696, + "step": 2300 + }, + { + "epoch": 0.23468454739408717, + "grad_norm": 2.84375, + "learning_rate": 4.391651642802778e-05, + "loss": 0.2506, + "step": 2310 + }, + { + "epoch": 0.2357004978157066, + "grad_norm": 4.5, + "learning_rate": 4.386371373098155e-05, + "loss": 0.1686, + "step": 2320 + }, + { + "epoch": 0.23671644823732602, + "grad_norm": 2.515625, + "learning_rate": 4.381071490523018e-05, + "loss": 0.2403, + "step": 2330 + }, + { + "epoch": 0.23773239865894544, + "grad_norm": 4.4375, + "learning_rate": 4.3757520501809955e-05, + "loss": 0.1611, + "step": 2340 + }, + { + "epoch": 0.23874834908056486, + "grad_norm": 1.609375, + "learning_rate": 4.370413107379065e-05, + "loss": 0.1698, + "step": 2350 + }, + { + "epoch": 0.23976429950218428, + "grad_norm": 4.96875, + "learning_rate": 4.36505471762697e-05, + "loss": 0.1928, + "step": 2360 + }, + { + "epoch": 0.24078024992380373, + "grad_norm": 0.8984375, + "learning_rate": 4.3596769366366474e-05, + "loss": 0.2035, + "step": 2370 + }, + { + "epoch": 0.24179620034542315, + "grad_norm": 5.75, + "learning_rate": 4.354279820321649e-05, + "loss": 0.16, + "step": 2380 + }, + { + "epoch": 0.24281215076704257, + "grad_norm": 1.9453125, + "learning_rate": 4.34886342479656e-05, + "loss": 0.1851, + "step": 2390 + }, + { + "epoch": 0.243828101188662, + "grad_norm": 1.015625, + "learning_rate": 4.34342780637641e-05, + "loss": 0.1726, + "step": 2400 + }, + { + "epoch": 0.24484405161028142, + "grad_norm": 4.59375, + "learning_rate": 4.337973021576095e-05, + "loss": 0.2847, + "step": 2410 + }, + { + "epoch": 0.24586000203190084, + "grad_norm": 1.03125, + "learning_rate": 4.3324991271097846e-05, + "loss": 0.2528, + "step": 2420 + }, + { + "epoch": 0.24687595245352026, + "grad_norm": 2.1875, + "learning_rate": 4.3270061798903374e-05, + "loss": 0.1573, + "step": 2430 + }, + { + "epoch": 0.24789190287513968, + "grad_norm": 0.98046875, + "learning_rate": 4.321494237028701e-05, + "loss": 0.1703, + "step": 2440 + }, + { + "epoch": 0.24890785329675913, + "grad_norm": 3.8125, + "learning_rate": 4.31596335583333e-05, + "loss": 0.2613, + "step": 2450 + }, + { + "epoch": 0.24992380371837855, + "grad_norm": 4.0625, + "learning_rate": 4.310413593809579e-05, + "loss": 0.22, + "step": 2460 + }, + { + "epoch": 0.250939754139998, + "grad_norm": 3.15625, + "learning_rate": 4.304845008659108e-05, + "loss": 0.1263, + "step": 2470 + }, + { + "epoch": 0.25195570456161737, + "grad_norm": 3.046875, + "learning_rate": 4.2992576582792895e-05, + "loss": 0.1639, + "step": 2480 + }, + { + "epoch": 0.2529716549832368, + "grad_norm": 9.8125, + "learning_rate": 4.293651600762595e-05, + "loss": 0.2681, + "step": 2490 + }, + { + "epoch": 0.25398760540485626, + "grad_norm": 3.734375, + "learning_rate": 4.288026894395999e-05, + "loss": 0.2292, + "step": 2500 + }, + { + "epoch": 0.25500355582647566, + "grad_norm": 0.455078125, + "learning_rate": 4.2823835976603723e-05, + "loss": 0.2324, + "step": 2510 + }, + { + "epoch": 0.2560195062480951, + "grad_norm": 5.625, + "learning_rate": 4.276721769229869e-05, + "loss": 0.1834, + "step": 2520 + }, + { + "epoch": 0.2570354566697145, + "grad_norm": 1.3671875, + "learning_rate": 4.271041467971323e-05, + "loss": 0.1826, + "step": 2530 + }, + { + "epoch": 0.25805140709133395, + "grad_norm": 5.0625, + "learning_rate": 4.265342752943632e-05, + "loss": 0.2463, + "step": 2540 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 2.859375, + "learning_rate": 4.2596256833971425e-05, + "loss": 0.2598, + "step": 2550 + }, + { + "epoch": 0.2600833079345728, + "grad_norm": 1.8515625, + "learning_rate": 4.2538903187730374e-05, + "loss": 0.1148, + "step": 2560 + }, + { + "epoch": 0.26109925835619224, + "grad_norm": 2.71875, + "learning_rate": 4.248136718702716e-05, + "loss": 0.2123, + "step": 2570 + }, + { + "epoch": 0.26211520877781164, + "grad_norm": 4.5625, + "learning_rate": 4.242364943007172e-05, + "loss": 0.2369, + "step": 2580 + }, + { + "epoch": 0.2631311591994311, + "grad_norm": 2.296875, + "learning_rate": 4.236575051696377e-05, + "loss": 0.261, + "step": 2590 + }, + { + "epoch": 0.2641471096210505, + "grad_norm": 2.75, + "learning_rate": 4.2307671049686514e-05, + "loss": 0.1564, + "step": 2600 + }, + { + "epoch": 0.26516306004266993, + "grad_norm": 3.5, + "learning_rate": 4.2249411632100396e-05, + "loss": 0.1563, + "step": 2610 + }, + { + "epoch": 0.2661790104642893, + "grad_norm": 2.84375, + "learning_rate": 4.219097286993684e-05, + "loss": 0.1697, + "step": 2620 + }, + { + "epoch": 0.26719496088590877, + "grad_norm": 2.125, + "learning_rate": 4.2132355370791946e-05, + "loss": 0.1844, + "step": 2630 + }, + { + "epoch": 0.26821091130752817, + "grad_norm": 4.03125, + "learning_rate": 4.2073559744120156e-05, + "loss": 0.2144, + "step": 2640 + }, + { + "epoch": 0.2692268617291476, + "grad_norm": 2.375, + "learning_rate": 4.201458660122793e-05, + "loss": 0.2013, + "step": 2650 + }, + { + "epoch": 0.27024281215076706, + "grad_norm": 3.625, + "learning_rate": 4.1955436555267393e-05, + "loss": 0.2166, + "step": 2660 + }, + { + "epoch": 0.27125876257238646, + "grad_norm": 0.328125, + "learning_rate": 4.189611022122997e-05, + "loss": 0.1934, + "step": 2670 + }, + { + "epoch": 0.2722747129940059, + "grad_norm": 2.75, + "learning_rate": 4.1836608215939944e-05, + "loss": 0.2157, + "step": 2680 + }, + { + "epoch": 0.2732906634156253, + "grad_norm": 3.5, + "learning_rate": 4.17769311580481e-05, + "loss": 0.18, + "step": 2690 + }, + { + "epoch": 0.27430661383724475, + "grad_norm": 2.109375, + "learning_rate": 4.171707966802528e-05, + "loss": 0.2178, + "step": 2700 + }, + { + "epoch": 0.27532256425886414, + "grad_norm": 4.65625, + "learning_rate": 4.16570543681559e-05, + "loss": 0.1896, + "step": 2710 + }, + { + "epoch": 0.2763385146804836, + "grad_norm": 4.8125, + "learning_rate": 4.159685588253151e-05, + "loss": 0.1322, + "step": 2720 + }, + { + "epoch": 0.27735446510210304, + "grad_norm": 3.9375, + "learning_rate": 4.153648483704429e-05, + "loss": 0.184, + "step": 2730 + }, + { + "epoch": 0.27837041552372244, + "grad_norm": 4.53125, + "learning_rate": 4.147594185938057e-05, + "loss": 0.2451, + "step": 2740 + }, + { + "epoch": 0.2793863659453419, + "grad_norm": 1.0390625, + "learning_rate": 4.141522757901426e-05, + "loss": 0.2367, + "step": 2750 + }, + { + "epoch": 0.2804023163669613, + "grad_norm": 3.375, + "learning_rate": 4.1354342627200345e-05, + "loss": 0.179, + "step": 2760 + }, + { + "epoch": 0.28141826678858073, + "grad_norm": 2.953125, + "learning_rate": 4.1293287636968286e-05, + "loss": 0.1396, + "step": 2770 + }, + { + "epoch": 0.2824342172102001, + "grad_norm": 2.546875, + "learning_rate": 4.1232063243115485e-05, + "loss": 0.1963, + "step": 2780 + }, + { + "epoch": 0.28345016763181957, + "grad_norm": 5.09375, + "learning_rate": 4.117067008220063e-05, + "loss": 0.2457, + "step": 2790 + }, + { + "epoch": 0.28446611805343897, + "grad_norm": 2.046875, + "learning_rate": 4.110910879253712e-05, + "loss": 0.2262, + "step": 2800 + }, + { + "epoch": 0.2854820684750584, + "grad_norm": 2.1875, + "learning_rate": 4.104738001418641e-05, + "loss": 0.2499, + "step": 2810 + }, + { + "epoch": 0.28649801889667786, + "grad_norm": 2.59375, + "learning_rate": 4.098548438895135e-05, + "loss": 0.1667, + "step": 2820 + }, + { + "epoch": 0.28751396931829726, + "grad_norm": 2.875, + "learning_rate": 4.092342256036954e-05, + "loss": 0.2288, + "step": 2830 + }, + { + "epoch": 0.2885299197399167, + "grad_norm": 3.015625, + "learning_rate": 4.086119517370659e-05, + "loss": 0.2038, + "step": 2840 + }, + { + "epoch": 0.2895458701615361, + "grad_norm": 3.53125, + "learning_rate": 4.0798802875949485e-05, + "loss": 0.181, + "step": 2850 + }, + { + "epoch": 0.29056182058315555, + "grad_norm": 2.296875, + "learning_rate": 4.073624631579975e-05, + "loss": 0.1886, + "step": 2860 + }, + { + "epoch": 0.29157777100477494, + "grad_norm": 3.609375, + "learning_rate": 4.067352614366685e-05, + "loss": 0.2053, + "step": 2870 + }, + { + "epoch": 0.2925937214263944, + "grad_norm": 2.328125, + "learning_rate": 4.061064301166128e-05, + "loss": 0.1409, + "step": 2880 + }, + { + "epoch": 0.29360967184801384, + "grad_norm": 4.9375, + "learning_rate": 4.054759757358787e-05, + "loss": 0.184, + "step": 2890 + }, + { + "epoch": 0.29462562226963324, + "grad_norm": 4.6875, + "learning_rate": 4.048439048493898e-05, + "loss": 0.2306, + "step": 2900 + }, + { + "epoch": 0.2956415726912527, + "grad_norm": 4.09375, + "learning_rate": 4.0421022402887676e-05, + "loss": 0.1914, + "step": 2910 + }, + { + "epoch": 0.2966575231128721, + "grad_norm": 2.3125, + "learning_rate": 4.035749398628088e-05, + "loss": 0.1653, + "step": 2920 + }, + { + "epoch": 0.29767347353449153, + "grad_norm": 2.515625, + "learning_rate": 4.029380589563256e-05, + "loss": 0.1941, + "step": 2930 + }, + { + "epoch": 0.2986894239561109, + "grad_norm": 1.78125, + "learning_rate": 4.02299587931168e-05, + "loss": 0.1117, + "step": 2940 + }, + { + "epoch": 0.29970537437773037, + "grad_norm": 0.8359375, + "learning_rate": 4.0165953342560974e-05, + "loss": 0.1605, + "step": 2950 + }, + { + "epoch": 0.30072132479934977, + "grad_norm": 3.046875, + "learning_rate": 4.010179020943884e-05, + "loss": 0.1726, + "step": 2960 + }, + { + "epoch": 0.3017372752209692, + "grad_norm": 3.453125, + "learning_rate": 4.003747006086357e-05, + "loss": 0.2208, + "step": 2970 + }, + { + "epoch": 0.30275322564258866, + "grad_norm": 2.515625, + "learning_rate": 3.9972993565580866e-05, + "loss": 0.1325, + "step": 2980 + }, + { + "epoch": 0.30376917606420806, + "grad_norm": 3.046875, + "learning_rate": 3.9908361393962e-05, + "loss": 0.2014, + "step": 2990 + }, + { + "epoch": 0.3047851264858275, + "grad_norm": 2.28125, + "learning_rate": 3.984357421799681e-05, + "loss": 0.165, + "step": 3000 + }, + { + "epoch": 0.3058010769074469, + "grad_norm": 5.09375, + "learning_rate": 3.9778632711286756e-05, + "loss": 0.212, + "step": 3010 + }, + { + "epoch": 0.30681702732906635, + "grad_norm": 4.25, + "learning_rate": 3.971353754903788e-05, + "loss": 0.2388, + "step": 3020 + }, + { + "epoch": 0.30783297775068574, + "grad_norm": 2.34375, + "learning_rate": 3.964828940805381e-05, + "loss": 0.2175, + "step": 3030 + }, + { + "epoch": 0.3088489281723052, + "grad_norm": 4.09375, + "learning_rate": 3.95828889667287e-05, + "loss": 0.2088, + "step": 3040 + }, + { + "epoch": 0.30986487859392464, + "grad_norm": 2.359375, + "learning_rate": 3.9517336905040244e-05, + "loss": 0.1913, + "step": 3050 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 1.1640625, + "learning_rate": 3.9451633904542483e-05, + "loss": 0.2185, + "step": 3060 + }, + { + "epoch": 0.3118967794371635, + "grad_norm": 2.59375, + "learning_rate": 3.9385780648358846e-05, + "loss": 0.2072, + "step": 3070 + }, + { + "epoch": 0.3129127298587829, + "grad_norm": 3.015625, + "learning_rate": 3.9319777821174955e-05, + "loss": 0.1902, + "step": 3080 + }, + { + "epoch": 0.31392868028040233, + "grad_norm": 2.375, + "learning_rate": 3.925362610923158e-05, + "loss": 0.259, + "step": 3090 + }, + { + "epoch": 0.3149446307020217, + "grad_norm": 4.65625, + "learning_rate": 3.918732620031742e-05, + "loss": 0.2026, + "step": 3100 + }, + { + "epoch": 0.31596058112364117, + "grad_norm": 2.1875, + "learning_rate": 3.912087878376205e-05, + "loss": 0.1478, + "step": 3110 + }, + { + "epoch": 0.31697653154526056, + "grad_norm": 2.34375, + "learning_rate": 3.905428455042865e-05, + "loss": 0.167, + "step": 3120 + }, + { + "epoch": 0.31799248196688, + "grad_norm": 2.390625, + "learning_rate": 3.898754419270693e-05, + "loss": 0.1629, + "step": 3130 + }, + { + "epoch": 0.31900843238849946, + "grad_norm": 1.546875, + "learning_rate": 3.892065840450583e-05, + "loss": 0.1308, + "step": 3140 + }, + { + "epoch": 0.32002438281011886, + "grad_norm": 4.625, + "learning_rate": 3.885362788124637e-05, + "loss": 0.2008, + "step": 3150 + }, + { + "epoch": 0.3210403332317383, + "grad_norm": 3.8125, + "learning_rate": 3.8786453319854396e-05, + "loss": 0.2225, + "step": 3160 + }, + { + "epoch": 0.3220562836533577, + "grad_norm": 3.015625, + "learning_rate": 3.8719135418753366e-05, + "loss": 0.2243, + "step": 3170 + }, + { + "epoch": 0.32307223407497715, + "grad_norm": 5.6875, + "learning_rate": 3.865167487785702e-05, + "loss": 0.1981, + "step": 3180 + }, + { + "epoch": 0.32408818449659654, + "grad_norm": 4.84375, + "learning_rate": 3.8584072398562164e-05, + "loss": 0.2031, + "step": 3190 + }, + { + "epoch": 0.325104134918216, + "grad_norm": 4.0625, + "learning_rate": 3.851632868374136e-05, + "loss": 0.1621, + "step": 3200 + }, + { + "epoch": 0.32612008533983544, + "grad_norm": 3.421875, + "learning_rate": 3.844844443773562e-05, + "loss": 0.1674, + "step": 3210 + }, + { + "epoch": 0.32713603576145484, + "grad_norm": 1.3671875, + "learning_rate": 3.8380420366347046e-05, + "loss": 0.1502, + "step": 3220 + }, + { + "epoch": 0.3281519861830743, + "grad_norm": 3.734375, + "learning_rate": 3.831225717683157e-05, + "loss": 0.1868, + "step": 3230 + }, + { + "epoch": 0.3291679366046937, + "grad_norm": 2.703125, + "learning_rate": 3.8243955577891534e-05, + "loss": 0.1818, + "step": 3240 + }, + { + "epoch": 0.3301838870263131, + "grad_norm": 3.796875, + "learning_rate": 3.8175516279668335e-05, + "loss": 0.2215, + "step": 3250 + }, + { + "epoch": 0.3311998374479325, + "grad_norm": 3.203125, + "learning_rate": 3.810693999373505e-05, + "loss": 0.2544, + "step": 3260 + }, + { + "epoch": 0.33221578786955197, + "grad_norm": 4.0, + "learning_rate": 3.8038227433089056e-05, + "loss": 0.1175, + "step": 3270 + }, + { + "epoch": 0.33323173829117136, + "grad_norm": 3.625, + "learning_rate": 3.796937931214458e-05, + "loss": 0.2213, + "step": 3280 + }, + { + "epoch": 0.3342476887127908, + "grad_norm": 1.7265625, + "learning_rate": 3.7900396346725296e-05, + "loss": 0.1711, + "step": 3290 + }, + { + "epoch": 0.33526363913441026, + "grad_norm": 3.140625, + "learning_rate": 3.783127925405686e-05, + "loss": 0.2628, + "step": 3300 + }, + { + "epoch": 0.33627958955602966, + "grad_norm": 2.1875, + "learning_rate": 3.77620287527595e-05, + "loss": 0.1671, + "step": 3310 + }, + { + "epoch": 0.3372955399776491, + "grad_norm": 5.28125, + "learning_rate": 3.769264556284048e-05, + "loss": 0.2109, + "step": 3320 + }, + { + "epoch": 0.3383114903992685, + "grad_norm": 2.875, + "learning_rate": 3.762313040568665e-05, + "loss": 0.1978, + "step": 3330 + }, + { + "epoch": 0.33932744082088795, + "grad_norm": 2.234375, + "learning_rate": 3.755348400405697e-05, + "loss": 0.1275, + "step": 3340 + }, + { + "epoch": 0.34034339124250734, + "grad_norm": 1.9453125, + "learning_rate": 3.7483707082074945e-05, + "loss": 0.1482, + "step": 3350 + }, + { + "epoch": 0.3413593416641268, + "grad_norm": 5.40625, + "learning_rate": 3.741380036522111e-05, + "loss": 0.1933, + "step": 3360 + }, + { + "epoch": 0.34237529208574624, + "grad_norm": 4.53125, + "learning_rate": 3.734376458032551e-05, + "loss": 0.1925, + "step": 3370 + }, + { + "epoch": 0.34339124250736563, + "grad_norm": 4.0625, + "learning_rate": 3.727360045556014e-05, + "loss": 0.2297, + "step": 3380 + }, + { + "epoch": 0.3444071929289851, + "grad_norm": 2.53125, + "learning_rate": 3.7203308720431336e-05, + "loss": 0.1704, + "step": 3390 + }, + { + "epoch": 0.3454231433506045, + "grad_norm": 1.859375, + "learning_rate": 3.7132890105772234e-05, + "loss": 0.258, + "step": 3400 + }, + { + "epoch": 0.3464390937722239, + "grad_norm": 3.90625, + "learning_rate": 3.706234534373515e-05, + "loss": 0.2376, + "step": 3410 + }, + { + "epoch": 0.3474550441938433, + "grad_norm": 1.1015625, + "learning_rate": 3.6991675167783985e-05, + "loss": 0.2403, + "step": 3420 + }, + { + "epoch": 0.34847099461546277, + "grad_norm": 1.1640625, + "learning_rate": 3.6920880312686556e-05, + "loss": 0.1642, + "step": 3430 + }, + { + "epoch": 0.34948694503708216, + "grad_norm": 2.875, + "learning_rate": 3.684996151450702e-05, + "loss": 0.1455, + "step": 3440 + }, + { + "epoch": 0.3505028954587016, + "grad_norm": 0.59765625, + "learning_rate": 3.6778919510598155e-05, + "loss": 0.2175, + "step": 3450 + }, + { + "epoch": 0.35151884588032106, + "grad_norm": 0.93359375, + "learning_rate": 3.670775503959376e-05, + "loss": 0.1858, + "step": 3460 + }, + { + "epoch": 0.35253479630194046, + "grad_norm": 4.1875, + "learning_rate": 3.6636468841400917e-05, + "loss": 0.1911, + "step": 3470 + }, + { + "epoch": 0.3535507467235599, + "grad_norm": 3.734375, + "learning_rate": 3.656506165719233e-05, + "loss": 0.2114, + "step": 3480 + }, + { + "epoch": 0.3545666971451793, + "grad_norm": 1.171875, + "learning_rate": 3.649353422939863e-05, + "loss": 0.1841, + "step": 3490 + }, + { + "epoch": 0.35558264756679875, + "grad_norm": 2.53125, + "learning_rate": 3.6421887301700615e-05, + "loss": 0.1505, + "step": 3500 + }, + { + "epoch": 0.35659859798841814, + "grad_norm": 4.9375, + "learning_rate": 3.6350121619021524e-05, + "loss": 0.2625, + "step": 3510 + }, + { + "epoch": 0.3576145484100376, + "grad_norm": 5.25, + "learning_rate": 3.627823792751936e-05, + "loss": 0.1676, + "step": 3520 + }, + { + "epoch": 0.35863049883165704, + "grad_norm": 1.09375, + "learning_rate": 3.620623697457905e-05, + "loss": 0.1963, + "step": 3530 + }, + { + "epoch": 0.35964644925327643, + "grad_norm": 4.03125, + "learning_rate": 3.613411950880468e-05, + "loss": 0.2048, + "step": 3540 + }, + { + "epoch": 0.3606623996748959, + "grad_norm": 4.40625, + "learning_rate": 3.606188628001178e-05, + "loss": 0.226, + "step": 3550 + }, + { + "epoch": 0.3616783500965153, + "grad_norm": 2.375, + "learning_rate": 3.598953803921947e-05, + "loss": 0.1884, + "step": 3560 + }, + { + "epoch": 0.3626943005181347, + "grad_norm": 3.21875, + "learning_rate": 3.591707553864266e-05, + "loss": 0.224, + "step": 3570 + }, + { + "epoch": 0.3637102509397541, + "grad_norm": 3.5625, + "learning_rate": 3.584449953168423e-05, + "loss": 0.1866, + "step": 3580 + }, + { + "epoch": 0.36472620136137357, + "grad_norm": 2.359375, + "learning_rate": 3.577181077292722e-05, + "loss": 0.1663, + "step": 3590 + }, + { + "epoch": 0.36574215178299296, + "grad_norm": 5.0, + "learning_rate": 3.569901001812696e-05, + "loss": 0.2032, + "step": 3600 + }, + { + "epoch": 0.3667581022046124, + "grad_norm": 1.953125, + "learning_rate": 3.562609802420321e-05, + "loss": 0.2395, + "step": 3610 + }, + { + "epoch": 0.36777405262623186, + "grad_norm": 3.796875, + "learning_rate": 3.555307554923229e-05, + "loss": 0.1799, + "step": 3620 + }, + { + "epoch": 0.36879000304785126, + "grad_norm": 4.4375, + "learning_rate": 3.547994335243925e-05, + "loss": 0.1771, + "step": 3630 + }, + { + "epoch": 0.3698059534694707, + "grad_norm": 1.890625, + "learning_rate": 3.540670219418989e-05, + "loss": 0.2123, + "step": 3640 + }, + { + "epoch": 0.3708219038910901, + "grad_norm": 4.03125, + "learning_rate": 3.53333528359829e-05, + "loss": 0.2159, + "step": 3650 + }, + { + "epoch": 0.37183785431270955, + "grad_norm": 3.265625, + "learning_rate": 3.525989604044198e-05, + "loss": 0.2749, + "step": 3660 + }, + { + "epoch": 0.37285380473432894, + "grad_norm": 1.4375, + "learning_rate": 3.5186332571307826e-05, + "loss": 0.1613, + "step": 3670 + }, + { + "epoch": 0.3738697551559484, + "grad_norm": 3.984375, + "learning_rate": 3.511266319343025e-05, + "loss": 0.1877, + "step": 3680 + }, + { + "epoch": 0.37488570557756784, + "grad_norm": 2.203125, + "learning_rate": 3.503888867276022e-05, + "loss": 0.2185, + "step": 3690 + }, + { + "epoch": 0.37590165599918723, + "grad_norm": 1.5078125, + "learning_rate": 3.4965009776341894e-05, + "loss": 0.2195, + "step": 3700 + }, + { + "epoch": 0.3769176064208067, + "grad_norm": 4.375, + "learning_rate": 3.489102727230461e-05, + "loss": 0.2344, + "step": 3710 + }, + { + "epoch": 0.3779335568424261, + "grad_norm": 2.984375, + "learning_rate": 3.481694192985496e-05, + "loss": 0.1863, + "step": 3720 + }, + { + "epoch": 0.3789495072640455, + "grad_norm": 1.1328125, + "learning_rate": 3.474275451926875e-05, + "loss": 0.1894, + "step": 3730 + }, + { + "epoch": 0.3799654576856649, + "grad_norm": 2.265625, + "learning_rate": 3.4668465811883e-05, + "loss": 0.2127, + "step": 3740 + }, + { + "epoch": 0.38098140810728437, + "grad_norm": 2.921875, + "learning_rate": 3.4594076580087914e-05, + "loss": 0.2125, + "step": 3750 + }, + { + "epoch": 0.38199735852890376, + "grad_norm": 2.390625, + "learning_rate": 3.451958759731889e-05, + "loss": 0.1801, + "step": 3760 + }, + { + "epoch": 0.3830133089505232, + "grad_norm": 3.046875, + "learning_rate": 3.4444999638048456e-05, + "loss": 0.1949, + "step": 3770 + }, + { + "epoch": 0.38402925937214266, + "grad_norm": 2.890625, + "learning_rate": 3.437031347777817e-05, + "loss": 0.2719, + "step": 3780 + }, + { + "epoch": 0.38504520979376206, + "grad_norm": 3.9375, + "learning_rate": 3.4295529893030634e-05, + "loss": 0.1697, + "step": 3790 + }, + { + "epoch": 0.3860611602153815, + "grad_norm": 2.0625, + "learning_rate": 3.422064966134138e-05, + "loss": 0.1557, + "step": 3800 + }, + { + "epoch": 0.3870771106370009, + "grad_norm": 2.234375, + "learning_rate": 3.4145673561250794e-05, + "loss": 0.2129, + "step": 3810 + }, + { + "epoch": 0.38809306105862035, + "grad_norm": 4.96875, + "learning_rate": 3.4070602372296e-05, + "loss": 0.2068, + "step": 3820 + }, + { + "epoch": 0.38910901148023974, + "grad_norm": 2.234375, + "learning_rate": 3.39954368750028e-05, + "loss": 0.1634, + "step": 3830 + }, + { + "epoch": 0.3901249619018592, + "grad_norm": 1.75, + "learning_rate": 3.392017785087752e-05, + "loss": 0.2299, + "step": 3840 + }, + { + "epoch": 0.39114091232347864, + "grad_norm": 3.90625, + "learning_rate": 3.38448260823989e-05, + "loss": 0.1585, + "step": 3850 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 2.8125, + "learning_rate": 3.376938235300996e-05, + "loss": 0.2382, + "step": 3860 + }, + { + "epoch": 0.3931728131667175, + "grad_norm": 5.375, + "learning_rate": 3.369384744710984e-05, + "loss": 0.1987, + "step": 3870 + }, + { + "epoch": 0.3941887635883369, + "grad_norm": 2.578125, + "learning_rate": 3.361822215004566e-05, + "loss": 0.2316, + "step": 3880 + }, + { + "epoch": 0.3952047140099563, + "grad_norm": 2.0, + "learning_rate": 3.354250724810436e-05, + "loss": 0.2019, + "step": 3890 + }, + { + "epoch": 0.3962206644315757, + "grad_norm": 2.3125, + "learning_rate": 3.34667035285045e-05, + "loss": 0.187, + "step": 3900 + }, + { + "epoch": 0.39723661485319517, + "grad_norm": 3.53125, + "learning_rate": 3.339081177938811e-05, + "loss": 0.2353, + "step": 3910 + }, + { + "epoch": 0.39825256527481456, + "grad_norm": 1.9609375, + "learning_rate": 3.331483278981244e-05, + "loss": 0.2078, + "step": 3920 + }, + { + "epoch": 0.399268515696434, + "grad_norm": 1.2109375, + "learning_rate": 3.323876734974183e-05, + "loss": 0.1761, + "step": 3930 + }, + { + "epoch": 0.40028446611805346, + "grad_norm": 4.0625, + "learning_rate": 3.316261625003943e-05, + "loss": 0.2081, + "step": 3940 + }, + { + "epoch": 0.40130041653967286, + "grad_norm": 1.953125, + "learning_rate": 3.308638028245902e-05, + "loss": 0.2087, + "step": 3950 + }, + { + "epoch": 0.4023163669612923, + "grad_norm": 2.390625, + "learning_rate": 3.301006023963676e-05, + "loss": 0.1579, + "step": 3960 + }, + { + "epoch": 0.4033323173829117, + "grad_norm": 3.53125, + "learning_rate": 3.293365691508295e-05, + "loss": 0.1904, + "step": 3970 + }, + { + "epoch": 0.40434826780453115, + "grad_norm": 3.0, + "learning_rate": 3.285717110317379e-05, + "loss": 0.1991, + "step": 3980 + }, + { + "epoch": 0.40536421822615054, + "grad_norm": 7.21875, + "learning_rate": 3.27806035991431e-05, + "loss": 0.1445, + "step": 3990 + }, + { + "epoch": 0.40638016864777, + "grad_norm": 1.0859375, + "learning_rate": 3.2703955199074075e-05, + "loss": 0.2393, + "step": 4000 + } + ], + "logging_steps": 10, + "max_steps": 9843, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoints/checkpoint-4000/training_args.bin b/checkpoints/checkpoint-4000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bb0a4ec10ab29ca1942aecdfa4212d352e373f1 --- /dev/null +++ b/checkpoints/checkpoint-4000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 +size 4984 diff --git a/checkpoints/checkpoint-4500/adapter_config.json b/checkpoints/checkpoint-4500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70 --- /dev/null +++ b/checkpoints/checkpoint-4500/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "gaussian", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoints/checkpoint-4500/adapter_model.safetensors b/checkpoints/checkpoint-4500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..7aa3071a8b60a1911581ad4fc94db44a264cd1e1 --- /dev/null +++ b/checkpoints/checkpoint-4500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7c6963c0425b0c2a06a71e6d47d4fbd98e2a133e8d06948e611eee31aa731b5 +size 5919456 diff --git a/checkpoints/checkpoint-4500/optimizer.pt b/checkpoints/checkpoint-4500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0cf2fc1750b9ba6635b3270b2477e9c69023282 --- /dev/null +++ b/checkpoints/checkpoint-4500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0ca7ae9cafbccffe17ee656b749d23931f579d38d02f6c6b7b74c6a48567952 +size 11930938 diff --git a/checkpoints/checkpoint-4500/rng_state_0.pth b/checkpoints/checkpoint-4500/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..79c43bfab83f7a282b4b114757cd92312afef77e --- /dev/null +++ b/checkpoints/checkpoint-4500/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7db56d699f6c4a471d74081289b1dc738167e2c8cb96555bccff59ced1cca60e +size 15024 diff --git a/checkpoints/checkpoint-4500/rng_state_1.pth b/checkpoints/checkpoint-4500/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0c66d8f438b500c92afd06878b0368f9291d1969 --- /dev/null +++ b/checkpoints/checkpoint-4500/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fcf572c6921c759e3118791b83271fd15d5f57461b5f6071b7e35d40b08d7c33 +size 15024 diff --git a/checkpoints/checkpoint-4500/rng_state_2.pth b/checkpoints/checkpoint-4500/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..b19443304b49a814d70104106104e9252ab03822 --- /dev/null +++ b/checkpoints/checkpoint-4500/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce499e2b0e0eea0c44c5e4fe25e271573ced5a8fefd0acff896293acd6102969 +size 15024 diff --git a/checkpoints/checkpoint-4500/rng_state_3.pth b/checkpoints/checkpoint-4500/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..118e3785c401a599c8cce9ab1d24766b140cee99 --- /dev/null +++ b/checkpoints/checkpoint-4500/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b4a17f4cb7c3a2fc3e2abbc1b812e3de44942fd00e096dc1860327c3e174cf1 +size 15024 diff --git a/checkpoints/checkpoint-4500/scheduler.pt b/checkpoints/checkpoint-4500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3cf7e49f7f8f9c6c02c1fea57551f9b19ff36b35 --- /dev/null +++ b/checkpoints/checkpoint-4500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:157e40aa454761c5046fdbfd6ead1c155f32a8d8a91eba044250c5c2d1b7f7fa +size 1064 diff --git a/checkpoints/checkpoint-4500/trainer_state.json b/checkpoints/checkpoint-4500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f17c9ab3c88953159e52b1f4f07808a0b09ac082 --- /dev/null +++ b/checkpoints/checkpoint-4500/trainer_state.json @@ -0,0 +1,3171 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.45717768972874123, + "eval_steps": 500, + "global_step": 4500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001015950421619425, + "grad_norm": 17.625, + "learning_rate": 5e-06, + "loss": 3.4264, + "step": 10 + }, + { + "epoch": 0.00203190084323885, + "grad_norm": 12.5625, + "learning_rate": 1e-05, + "loss": 3.432, + "step": 20 + }, + { + "epoch": 0.003047851264858275, + "grad_norm": 14.0625, + "learning_rate": 1.5e-05, + "loss": 3.23, + "step": 30 + }, + { + "epoch": 0.0040638016864777, + "grad_norm": 12.4375, + "learning_rate": 2e-05, + "loss": 2.9762, + "step": 40 + }, + { + "epoch": 0.005079752108097125, + "grad_norm": 10.0625, + "learning_rate": 2.5e-05, + "loss": 2.6173, + "step": 50 + }, + { + "epoch": 0.00609570252971655, + "grad_norm": 10.1875, + "learning_rate": 3e-05, + "loss": 2.2004, + "step": 60 + }, + { + "epoch": 0.007111652951335975, + "grad_norm": 7.03125, + "learning_rate": 3.5e-05, + "loss": 1.4176, + "step": 70 + }, + { + "epoch": 0.0081276033729554, + "grad_norm": 4.375, + "learning_rate": 4e-05, + "loss": 1.0122, + "step": 80 + }, + { + "epoch": 0.009143553794574825, + "grad_norm": 6.5625, + "learning_rate": 4.5e-05, + "loss": 0.9116, + "step": 90 + }, + { + "epoch": 0.01015950421619425, + "grad_norm": 5.28125, + "learning_rate": 5e-05, + "loss": 0.6832, + "step": 100 + }, + { + "epoch": 0.011175454637813675, + "grad_norm": 5.5, + "learning_rate": 4.9999870035728426e-05, + "loss": 0.7355, + "step": 110 + }, + { + "epoch": 0.0121914050594331, + "grad_norm": 5.1875, + "learning_rate": 4.9999480144264944e-05, + "loss": 0.6673, + "step": 120 + }, + { + "epoch": 0.013207355481052525, + "grad_norm": 4.5, + "learning_rate": 4.9998830329663314e-05, + "loss": 0.6792, + "step": 130 + }, + { + "epoch": 0.01422330590267195, + "grad_norm": 3.9375, + "learning_rate": 4.9997920598679756e-05, + "loss": 0.6207, + "step": 140 + }, + { + "epoch": 0.015239256324291375, + "grad_norm": 3.15625, + "learning_rate": 4.999675096077286e-05, + "loss": 0.483, + "step": 150 + }, + { + "epoch": 0.0162552067459108, + "grad_norm": 5.28125, + "learning_rate": 4.999532142810354e-05, + "loss": 0.5319, + "step": 160 + }, + { + "epoch": 0.017271157167530225, + "grad_norm": 4.59375, + "learning_rate": 4.999363201553483e-05, + "loss": 0.6052, + "step": 170 + }, + { + "epoch": 0.01828710758914965, + "grad_norm": 5.03125, + "learning_rate": 4.9991682740631794e-05, + "loss": 0.4258, + "step": 180 + }, + { + "epoch": 0.019303058010769075, + "grad_norm": 3.859375, + "learning_rate": 4.998947362366133e-05, + "loss": 0.4309, + "step": 190 + }, + { + "epoch": 0.0203190084323885, + "grad_norm": 3.328125, + "learning_rate": 4.998700468759193e-05, + "loss": 0.3957, + "step": 200 + }, + { + "epoch": 0.021334958854007924, + "grad_norm": 4.9375, + "learning_rate": 4.9984275958093475e-05, + "loss": 0.4777, + "step": 210 + }, + { + "epoch": 0.02235090927562735, + "grad_norm": 4.78125, + "learning_rate": 4.998128746353695e-05, + "loss": 0.3549, + "step": 220 + }, + { + "epoch": 0.023366859697246774, + "grad_norm": 4.0625, + "learning_rate": 4.997803923499417e-05, + "loss": 0.4447, + "step": 230 + }, + { + "epoch": 0.0243828101188662, + "grad_norm": 6.375, + "learning_rate": 4.99745313062374e-05, + "loss": 0.3808, + "step": 240 + }, + { + "epoch": 0.025398760540485624, + "grad_norm": 3.59375, + "learning_rate": 4.99707637137391e-05, + "loss": 0.3827, + "step": 250 + }, + { + "epoch": 0.02641471096210505, + "grad_norm": 3.015625, + "learning_rate": 4.996673649667145e-05, + "loss": 0.3694, + "step": 260 + }, + { + "epoch": 0.027430661383724474, + "grad_norm": 2.296875, + "learning_rate": 4.9962449696906e-05, + "loss": 0.3586, + "step": 270 + }, + { + "epoch": 0.0284466118053439, + "grad_norm": 4.125, + "learning_rate": 4.9957903359013214e-05, + "loss": 0.3832, + "step": 280 + }, + { + "epoch": 0.029462562226963324, + "grad_norm": 3.296875, + "learning_rate": 4.995309753026201e-05, + "loss": 0.328, + "step": 290 + }, + { + "epoch": 0.03047851264858275, + "grad_norm": 4.5, + "learning_rate": 4.994803226061927e-05, + "loss": 0.3667, + "step": 300 + }, + { + "epoch": 0.03149446307020217, + "grad_norm": 4.3125, + "learning_rate": 4.994270760274933e-05, + "loss": 0.3811, + "step": 310 + }, + { + "epoch": 0.0325104134918216, + "grad_norm": 3.421875, + "learning_rate": 4.99371236120134e-05, + "loss": 0.3065, + "step": 320 + }, + { + "epoch": 0.03352636391344102, + "grad_norm": 4.6875, + "learning_rate": 4.993128034646902e-05, + "loss": 0.4177, + "step": 330 + }, + { + "epoch": 0.03454231433506045, + "grad_norm": 3.046875, + "learning_rate": 4.992517786686947e-05, + "loss": 0.33, + "step": 340 + }, + { + "epoch": 0.03555826475667987, + "grad_norm": 1.8828125, + "learning_rate": 4.9918816236663077e-05, + "loss": 0.3287, + "step": 350 + }, + { + "epoch": 0.0365742151782993, + "grad_norm": 3.8125, + "learning_rate": 4.991219552199262e-05, + "loss": 0.2934, + "step": 360 + }, + { + "epoch": 0.03759016559991872, + "grad_norm": 4.28125, + "learning_rate": 4.99053157916946e-05, + "loss": 0.3176, + "step": 370 + }, + { + "epoch": 0.03860611602153815, + "grad_norm": 2.609375, + "learning_rate": 4.989817711729856e-05, + "loss": 0.3318, + "step": 380 + }, + { + "epoch": 0.03962206644315757, + "grad_norm": 2.375, + "learning_rate": 4.98907795730263e-05, + "loss": 0.3234, + "step": 390 + }, + { + "epoch": 0.040638016864777, + "grad_norm": 4.46875, + "learning_rate": 4.988312323579114e-05, + "loss": 0.267, + "step": 400 + }, + { + "epoch": 0.04165396728639642, + "grad_norm": 3.75, + "learning_rate": 4.98752081851971e-05, + "loss": 0.3081, + "step": 410 + }, + { + "epoch": 0.04266991770801585, + "grad_norm": 2.203125, + "learning_rate": 4.986703450353809e-05, + "loss": 0.2917, + "step": 420 + }, + { + "epoch": 0.04368586812963527, + "grad_norm": 1.6015625, + "learning_rate": 4.985860227579703e-05, + "loss": 0.2805, + "step": 430 + }, + { + "epoch": 0.0447018185512547, + "grad_norm": 3.140625, + "learning_rate": 4.984991158964499e-05, + "loss": 0.3534, + "step": 440 + }, + { + "epoch": 0.04571776897287412, + "grad_norm": 3.296875, + "learning_rate": 4.9840962535440265e-05, + "loss": 0.335, + "step": 450 + }, + { + "epoch": 0.04673371939449355, + "grad_norm": 3.25, + "learning_rate": 4.983175520622744e-05, + "loss": 0.2544, + "step": 460 + }, + { + "epoch": 0.04774966981611297, + "grad_norm": 2.25, + "learning_rate": 4.982228969773642e-05, + "loss": 0.3449, + "step": 470 + }, + { + "epoch": 0.0487656202377324, + "grad_norm": 4.9375, + "learning_rate": 4.9812566108381435e-05, + "loss": 0.2964, + "step": 480 + }, + { + "epoch": 0.04978157065935182, + "grad_norm": 1.5703125, + "learning_rate": 4.9802584539260035e-05, + "loss": 0.2799, + "step": 490 + }, + { + "epoch": 0.05079752108097125, + "grad_norm": 2.828125, + "learning_rate": 4.979234509415199e-05, + "loss": 0.3231, + "step": 500 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 2.9375, + "learning_rate": 4.978184787951828e-05, + "loss": 0.2943, + "step": 510 + }, + { + "epoch": 0.0528294219242101, + "grad_norm": 2.34375, + "learning_rate": 4.977109300449992e-05, + "loss": 0.2705, + "step": 520 + }, + { + "epoch": 0.05384537234582952, + "grad_norm": 3.140625, + "learning_rate": 4.9760080580916876e-05, + "loss": 0.2998, + "step": 530 + }, + { + "epoch": 0.05486132276744895, + "grad_norm": 3.5625, + "learning_rate": 4.974881072326688e-05, + "loss": 0.2595, + "step": 540 + }, + { + "epoch": 0.05587727318906837, + "grad_norm": 4.25, + "learning_rate": 4.9737283548724236e-05, + "loss": 0.2803, + "step": 550 + }, + { + "epoch": 0.0568932236106878, + "grad_norm": 4.0625, + "learning_rate": 4.97254991771386e-05, + "loss": 0.3511, + "step": 560 + }, + { + "epoch": 0.05790917403230722, + "grad_norm": 2.515625, + "learning_rate": 4.971345773103377e-05, + "loss": 0.312, + "step": 570 + }, + { + "epoch": 0.05892512445392665, + "grad_norm": 3.21875, + "learning_rate": 4.9701159335606365e-05, + "loss": 0.2482, + "step": 580 + }, + { + "epoch": 0.05994107487554607, + "grad_norm": 5.5, + "learning_rate": 4.968860411872454e-05, + "loss": 0.2537, + "step": 590 + }, + { + "epoch": 0.0609570252971655, + "grad_norm": 3.546875, + "learning_rate": 4.967579221092666e-05, + "loss": 0.3125, + "step": 600 + }, + { + "epoch": 0.06197297571878492, + "grad_norm": 2.984375, + "learning_rate": 4.966272374541996e-05, + "loss": 0.2354, + "step": 610 + }, + { + "epoch": 0.06298892614040434, + "grad_norm": 3.6875, + "learning_rate": 4.964939885807912e-05, + "loss": 0.3213, + "step": 620 + }, + { + "epoch": 0.06400487656202378, + "grad_norm": 2.140625, + "learning_rate": 4.9635817687444876e-05, + "loss": 0.3003, + "step": 630 + }, + { + "epoch": 0.0650208269836432, + "grad_norm": 3.484375, + "learning_rate": 4.962198037472259e-05, + "loss": 0.2996, + "step": 640 + }, + { + "epoch": 0.06603677740526262, + "grad_norm": 3.21875, + "learning_rate": 4.9607887063780776e-05, + "loss": 0.2257, + "step": 650 + }, + { + "epoch": 0.06705272782688204, + "grad_norm": 5.375, + "learning_rate": 4.9593537901149564e-05, + "loss": 0.223, + "step": 660 + }, + { + "epoch": 0.06806867824850148, + "grad_norm": 4.1875, + "learning_rate": 4.957893303601924e-05, + "loss": 0.3407, + "step": 670 + }, + { + "epoch": 0.0690846286701209, + "grad_norm": 3.328125, + "learning_rate": 4.956407262023866e-05, + "loss": 0.2589, + "step": 680 + }, + { + "epoch": 0.07010057909174032, + "grad_norm": 2.953125, + "learning_rate": 4.954895680831367e-05, + "loss": 0.2949, + "step": 690 + }, + { + "epoch": 0.07111652951335974, + "grad_norm": 4.0625, + "learning_rate": 4.9533585757405506e-05, + "loss": 0.2995, + "step": 700 + }, + { + "epoch": 0.07213247993497918, + "grad_norm": 4.625, + "learning_rate": 4.951795962732917e-05, + "loss": 0.2894, + "step": 710 + }, + { + "epoch": 0.0731484303565986, + "grad_norm": 3.0, + "learning_rate": 4.9502078580551755e-05, + "loss": 0.3082, + "step": 720 + }, + { + "epoch": 0.07416438077821802, + "grad_norm": 3.65625, + "learning_rate": 4.9485942782190734e-05, + "loss": 0.2308, + "step": 730 + }, + { + "epoch": 0.07518033119983744, + "grad_norm": 4.78125, + "learning_rate": 4.9469552400012306e-05, + "loss": 0.2272, + "step": 740 + }, + { + "epoch": 0.07619628162145688, + "grad_norm": 4.25, + "learning_rate": 4.94529076044296e-05, + "loss": 0.2701, + "step": 750 + }, + { + "epoch": 0.0772122320430763, + "grad_norm": 3.140625, + "learning_rate": 4.94360085685009e-05, + "loss": 0.2686, + "step": 760 + }, + { + "epoch": 0.07822818246469572, + "grad_norm": 0.765625, + "learning_rate": 4.9418855467927894e-05, + "loss": 0.2051, + "step": 770 + }, + { + "epoch": 0.07924413288631514, + "grad_norm": 1.796875, + "learning_rate": 4.940144848105379e-05, + "loss": 0.2267, + "step": 780 + }, + { + "epoch": 0.08026008330793458, + "grad_norm": 4.5625, + "learning_rate": 4.93837877888615e-05, + "loss": 0.2597, + "step": 790 + }, + { + "epoch": 0.081276033729554, + "grad_norm": 3.03125, + "learning_rate": 4.9365873574971745e-05, + "loss": 0.3701, + "step": 800 + }, + { + "epoch": 0.08229198415117342, + "grad_norm": 4.5625, + "learning_rate": 4.9347706025641136e-05, + "loss": 0.2559, + "step": 810 + }, + { + "epoch": 0.08330793457279284, + "grad_norm": 3.90625, + "learning_rate": 4.9329285329760275e-05, + "loss": 0.2799, + "step": 820 + }, + { + "epoch": 0.08432388499441228, + "grad_norm": 3.140625, + "learning_rate": 4.9310611678851735e-05, + "loss": 0.2866, + "step": 830 + }, + { + "epoch": 0.0853398354160317, + "grad_norm": 2.46875, + "learning_rate": 4.929168526706811e-05, + "loss": 0.3105, + "step": 840 + }, + { + "epoch": 0.08635578583765112, + "grad_norm": 13.625, + "learning_rate": 4.927250629119e-05, + "loss": 0.2454, + "step": 850 + }, + { + "epoch": 0.08737173625927054, + "grad_norm": 3.921875, + "learning_rate": 4.9253074950623925e-05, + "loss": 0.2424, + "step": 860 + }, + { + "epoch": 0.08838768668088998, + "grad_norm": 2.90625, + "learning_rate": 4.9233391447400286e-05, + "loss": 0.2481, + "step": 870 + }, + { + "epoch": 0.0894036371025094, + "grad_norm": 2.96875, + "learning_rate": 4.921345598617125e-05, + "loss": 0.2231, + "step": 880 + }, + { + "epoch": 0.09041958752412882, + "grad_norm": 5.375, + "learning_rate": 4.9193268774208654e-05, + "loss": 0.3447, + "step": 890 + }, + { + "epoch": 0.09143553794574824, + "grad_norm": 2.0, + "learning_rate": 4.9172830021401785e-05, + "loss": 0.229, + "step": 900 + }, + { + "epoch": 0.09245148836736768, + "grad_norm": 3.1875, + "learning_rate": 4.9152139940255245e-05, + "loss": 0.2122, + "step": 910 + }, + { + "epoch": 0.0934674387889871, + "grad_norm": 3.40625, + "learning_rate": 4.913119874588677e-05, + "loss": 0.2386, + "step": 920 + }, + { + "epoch": 0.09448338921060652, + "grad_norm": 1.4609375, + "learning_rate": 4.911000665602489e-05, + "loss": 0.1944, + "step": 930 + }, + { + "epoch": 0.09549933963222594, + "grad_norm": 5.0625, + "learning_rate": 4.9088563891006786e-05, + "loss": 0.2038, + "step": 940 + }, + { + "epoch": 0.09651529005384538, + "grad_norm": 4.53125, + "learning_rate": 4.906687067377592e-05, + "loss": 0.3122, + "step": 950 + }, + { + "epoch": 0.0975312404754648, + "grad_norm": 2.84375, + "learning_rate": 4.904492722987976e-05, + "loss": 0.3157, + "step": 960 + }, + { + "epoch": 0.09854719089708422, + "grad_norm": 2.171875, + "learning_rate": 4.902273378746738e-05, + "loss": 0.3077, + "step": 970 + }, + { + "epoch": 0.09956314131870364, + "grad_norm": 2.84375, + "learning_rate": 4.9000290577287165e-05, + "loss": 0.2756, + "step": 980 + }, + { + "epoch": 0.10057909174032308, + "grad_norm": 0.99609375, + "learning_rate": 4.897759783268434e-05, + "loss": 0.2915, + "step": 990 + }, + { + "epoch": 0.1015950421619425, + "grad_norm": 3.53125, + "learning_rate": 4.895465578959859e-05, + "loss": 0.2052, + "step": 1000 + }, + { + "epoch": 0.10261099258356192, + "grad_norm": 4.0, + "learning_rate": 4.893146468656159e-05, + "loss": 0.2499, + "step": 1010 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 1.65625, + "learning_rate": 4.890802476469452e-05, + "loss": 0.278, + "step": 1020 + }, + { + "epoch": 0.10464289342680078, + "grad_norm": 3.625, + "learning_rate": 4.888433626770558e-05, + "loss": 0.2143, + "step": 1030 + }, + { + "epoch": 0.1056588438484202, + "grad_norm": 5.0625, + "learning_rate": 4.886039944188741e-05, + "loss": 0.2878, + "step": 1040 + }, + { + "epoch": 0.10667479427003962, + "grad_norm": 4.5, + "learning_rate": 4.883621453611461e-05, + "loss": 0.2744, + "step": 1050 + }, + { + "epoch": 0.10769074469165904, + "grad_norm": 4.5625, + "learning_rate": 4.881178180184106e-05, + "loss": 0.2734, + "step": 1060 + }, + { + "epoch": 0.10870669511327848, + "grad_norm": 3.125, + "learning_rate": 4.878710149309735e-05, + "loss": 0.3574, + "step": 1070 + }, + { + "epoch": 0.1097226455348979, + "grad_norm": 3.0625, + "learning_rate": 4.876217386648816e-05, + "loss": 0.2625, + "step": 1080 + }, + { + "epoch": 0.11073859595651732, + "grad_norm": 4.0625, + "learning_rate": 4.873699918118955e-05, + "loss": 0.2437, + "step": 1090 + }, + { + "epoch": 0.11175454637813674, + "grad_norm": 1.59375, + "learning_rate": 4.87115776989463e-05, + "loss": 0.2051, + "step": 1100 + }, + { + "epoch": 0.11277049679975618, + "grad_norm": 4.375, + "learning_rate": 4.8685909684069153e-05, + "loss": 0.1727, + "step": 1110 + }, + { + "epoch": 0.1137864472213756, + "grad_norm": 2.28125, + "learning_rate": 4.865999540343211e-05, + "loss": 0.2256, + "step": 1120 + }, + { + "epoch": 0.11480239764299502, + "grad_norm": 2.265625, + "learning_rate": 4.86338351264696e-05, + "loss": 0.3529, + "step": 1130 + }, + { + "epoch": 0.11581834806461444, + "grad_norm": 2.34375, + "learning_rate": 4.8607429125173754e-05, + "loss": 0.2113, + "step": 1140 + }, + { + "epoch": 0.11683429848623388, + "grad_norm": 0.7578125, + "learning_rate": 4.858077767409149e-05, + "loss": 0.2759, + "step": 1150 + }, + { + "epoch": 0.1178502489078533, + "grad_norm": 3.640625, + "learning_rate": 4.855388105032174e-05, + "loss": 0.2482, + "step": 1160 + }, + { + "epoch": 0.11886619932947272, + "grad_norm": 3.5, + "learning_rate": 4.852673953351249e-05, + "loss": 0.1865, + "step": 1170 + }, + { + "epoch": 0.11988214975109214, + "grad_norm": 3.75, + "learning_rate": 4.849935340585796e-05, + "loss": 0.2659, + "step": 1180 + }, + { + "epoch": 0.12089810017271158, + "grad_norm": 3.375, + "learning_rate": 4.8471722952095586e-05, + "loss": 0.1506, + "step": 1190 + }, + { + "epoch": 0.121914050594331, + "grad_norm": 3.34375, + "learning_rate": 4.844384845950312e-05, + "loss": 0.307, + "step": 1200 + }, + { + "epoch": 0.12293000101595042, + "grad_norm": 1.578125, + "learning_rate": 4.841573021789561e-05, + "loss": 0.1952, + "step": 1210 + }, + { + "epoch": 0.12394595143756984, + "grad_norm": 1.2890625, + "learning_rate": 4.838736851962239e-05, + "loss": 0.1779, + "step": 1220 + }, + { + "epoch": 0.12496190185918928, + "grad_norm": 1.265625, + "learning_rate": 4.835876365956408e-05, + "loss": 0.1235, + "step": 1230 + }, + { + "epoch": 0.12597785228080868, + "grad_norm": 1.9609375, + "learning_rate": 4.8329915935129436e-05, + "loss": 0.1876, + "step": 1240 + }, + { + "epoch": 0.12699380270242813, + "grad_norm": 1.6328125, + "learning_rate": 4.830082564625235e-05, + "loss": 0.2188, + "step": 1250 + }, + { + "epoch": 0.12800975312404755, + "grad_norm": 3.96875, + "learning_rate": 4.8271493095388684e-05, + "loss": 0.2622, + "step": 1260 + }, + { + "epoch": 0.12902570354566698, + "grad_norm": 3.765625, + "learning_rate": 4.824191858751312e-05, + "loss": 0.2724, + "step": 1270 + }, + { + "epoch": 0.1300416539672864, + "grad_norm": 5.59375, + "learning_rate": 4.821210243011601e-05, + "loss": 0.2413, + "step": 1280 + }, + { + "epoch": 0.13105760438890582, + "grad_norm": 3.34375, + "learning_rate": 4.818204493320016e-05, + "loss": 0.2618, + "step": 1290 + }, + { + "epoch": 0.13207355481052524, + "grad_norm": 2.78125, + "learning_rate": 4.8151746409277634e-05, + "loss": 0.2295, + "step": 1300 + }, + { + "epoch": 0.13308950523214466, + "grad_norm": 3.1875, + "learning_rate": 4.8121207173366484e-05, + "loss": 0.2733, + "step": 1310 + }, + { + "epoch": 0.13410545565376408, + "grad_norm": 2.28125, + "learning_rate": 4.809042754298746e-05, + "loss": 0.2311, + "step": 1320 + }, + { + "epoch": 0.13512140607538353, + "grad_norm": 2.171875, + "learning_rate": 4.805940783816075e-05, + "loss": 0.2059, + "step": 1330 + }, + { + "epoch": 0.13613735649700295, + "grad_norm": 2.796875, + "learning_rate": 4.8028148381402625e-05, + "loss": 0.2102, + "step": 1340 + }, + { + "epoch": 0.13715330691862238, + "grad_norm": 2.96875, + "learning_rate": 4.7996649497722084e-05, + "loss": 0.2708, + "step": 1350 + }, + { + "epoch": 0.1381692573402418, + "grad_norm": 2.4375, + "learning_rate": 4.7964911514617485e-05, + "loss": 0.2429, + "step": 1360 + }, + { + "epoch": 0.13918520776186122, + "grad_norm": 5.8125, + "learning_rate": 4.793293476207312e-05, + "loss": 0.2725, + "step": 1370 + }, + { + "epoch": 0.14020115818348064, + "grad_norm": 2.40625, + "learning_rate": 4.790071957255585e-05, + "loss": 0.2098, + "step": 1380 + }, + { + "epoch": 0.14121710860510006, + "grad_norm": 4.25, + "learning_rate": 4.786826628101154e-05, + "loss": 0.2101, + "step": 1390 + }, + { + "epoch": 0.14223305902671948, + "grad_norm": 2.578125, + "learning_rate": 4.783557522486167e-05, + "loss": 0.2624, + "step": 1400 + }, + { + "epoch": 0.14324900944833893, + "grad_norm": 3.125, + "learning_rate": 4.780264674399978e-05, + "loss": 0.2518, + "step": 1410 + }, + { + "epoch": 0.14426495986995835, + "grad_norm": 3.671875, + "learning_rate": 4.7769481180787966e-05, + "loss": 0.3112, + "step": 1420 + }, + { + "epoch": 0.14528091029157778, + "grad_norm": 3.984375, + "learning_rate": 4.773607888005327e-05, + "loss": 0.2747, + "step": 1430 + }, + { + "epoch": 0.1462968607131972, + "grad_norm": 3.234375, + "learning_rate": 4.770244018908416e-05, + "loss": 0.1572, + "step": 1440 + }, + { + "epoch": 0.14731281113481662, + "grad_norm": 4.09375, + "learning_rate": 4.766856545762687e-05, + "loss": 0.2148, + "step": 1450 + }, + { + "epoch": 0.14832876155643604, + "grad_norm": 1.6875, + "learning_rate": 4.763445503788178e-05, + "loss": 0.2531, + "step": 1460 + }, + { + "epoch": 0.14934471197805546, + "grad_norm": 2.375, + "learning_rate": 4.760010928449976e-05, + "loss": 0.199, + "step": 1470 + }, + { + "epoch": 0.15036066239967488, + "grad_norm": 4.6875, + "learning_rate": 4.7565528554578485e-05, + "loss": 0.2366, + "step": 1480 + }, + { + "epoch": 0.15137661282129433, + "grad_norm": 5.4375, + "learning_rate": 4.75307132076587e-05, + "loss": 0.1862, + "step": 1490 + }, + { + "epoch": 0.15239256324291375, + "grad_norm": 2.484375, + "learning_rate": 4.749566360572049e-05, + "loss": 0.2143, + "step": 1500 + }, + { + "epoch": 0.15340851366453317, + "grad_norm": 2.1875, + "learning_rate": 4.746038011317955e-05, + "loss": 0.1877, + "step": 1510 + }, + { + "epoch": 0.1544244640861526, + "grad_norm": 2.84375, + "learning_rate": 4.742486309688333e-05, + "loss": 0.2831, + "step": 1520 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 2.015625, + "learning_rate": 4.738911292610732e-05, + "loss": 0.1708, + "step": 1530 + }, + { + "epoch": 0.15645636492939144, + "grad_norm": 3.953125, + "learning_rate": 4.735312997255107e-05, + "loss": 0.192, + "step": 1540 + }, + { + "epoch": 0.15747231535101086, + "grad_norm": 2.09375, + "learning_rate": 4.7316914610334475e-05, + "loss": 0.2586, + "step": 1550 + }, + { + "epoch": 0.15848826577263028, + "grad_norm": 3.6875, + "learning_rate": 4.728046721599378e-05, + "loss": 0.2141, + "step": 1560 + }, + { + "epoch": 0.15950421619424973, + "grad_norm": 2.9375, + "learning_rate": 4.724378816847771e-05, + "loss": 0.193, + "step": 1570 + }, + { + "epoch": 0.16052016661586915, + "grad_norm": 1.5625, + "learning_rate": 4.720687784914352e-05, + "loss": 0.191, + "step": 1580 + }, + { + "epoch": 0.16153611703748857, + "grad_norm": 3.75, + "learning_rate": 4.716973664175304e-05, + "loss": 0.2172, + "step": 1590 + }, + { + "epoch": 0.162552067459108, + "grad_norm": 3.125, + "learning_rate": 4.7132364932468645e-05, + "loss": 0.2134, + "step": 1600 + }, + { + "epoch": 0.16356801788072742, + "grad_norm": 4.09375, + "learning_rate": 4.709476310984932e-05, + "loss": 0.2055, + "step": 1610 + }, + { + "epoch": 0.16458396830234684, + "grad_norm": 3.875, + "learning_rate": 4.705693156484652e-05, + "loss": 0.2136, + "step": 1620 + }, + { + "epoch": 0.16559991872396626, + "grad_norm": 1.1796875, + "learning_rate": 4.7018870690800196e-05, + "loss": 0.1471, + "step": 1630 + }, + { + "epoch": 0.16661586914558568, + "grad_norm": 2.5, + "learning_rate": 4.698058088343465e-05, + "loss": 0.2308, + "step": 1640 + }, + { + "epoch": 0.16763181956720513, + "grad_norm": 1.390625, + "learning_rate": 4.6942062540854425e-05, + "loss": 0.2456, + "step": 1650 + }, + { + "epoch": 0.16864776998882455, + "grad_norm": 3.125, + "learning_rate": 4.69033160635402e-05, + "loss": 0.2654, + "step": 1660 + }, + { + "epoch": 0.16966372041044397, + "grad_norm": 3.984375, + "learning_rate": 4.6864341854344587e-05, + "loss": 0.2226, + "step": 1670 + }, + { + "epoch": 0.1706796708320634, + "grad_norm": 2.328125, + "learning_rate": 4.682514031848795e-05, + "loss": 0.2438, + "step": 1680 + }, + { + "epoch": 0.17169562125368282, + "grad_norm": 3.078125, + "learning_rate": 4.678571186355423e-05, + "loss": 0.1889, + "step": 1690 + }, + { + "epoch": 0.17271157167530224, + "grad_norm": 3.328125, + "learning_rate": 4.6746056899486644e-05, + "loss": 0.2117, + "step": 1700 + }, + { + "epoch": 0.17372752209692166, + "grad_norm": 2.78125, + "learning_rate": 4.67061758385835e-05, + "loss": 0.1953, + "step": 1710 + }, + { + "epoch": 0.17474347251854108, + "grad_norm": 3.09375, + "learning_rate": 4.6666069095493816e-05, + "loss": 0.1844, + "step": 1720 + }, + { + "epoch": 0.17575942294016053, + "grad_norm": 3.234375, + "learning_rate": 4.662573708721309e-05, + "loss": 0.2774, + "step": 1730 + }, + { + "epoch": 0.17677537336177995, + "grad_norm": 4.03125, + "learning_rate": 4.658518023307894e-05, + "loss": 0.2527, + "step": 1740 + }, + { + "epoch": 0.17779132378339937, + "grad_norm": 3.21875, + "learning_rate": 4.654439895476671e-05, + "loss": 0.2164, + "step": 1750 + }, + { + "epoch": 0.1788072742050188, + "grad_norm": 2.390625, + "learning_rate": 4.6503393676285146e-05, + "loss": 0.2424, + "step": 1760 + }, + { + "epoch": 0.17982322462663822, + "grad_norm": 1.8359375, + "learning_rate": 4.646216482397192e-05, + "loss": 0.2428, + "step": 1770 + }, + { + "epoch": 0.18083917504825764, + "grad_norm": 2.796875, + "learning_rate": 4.6420712826489275e-05, + "loss": 0.2155, + "step": 1780 + }, + { + "epoch": 0.18185512546987706, + "grad_norm": 0.69921875, + "learning_rate": 4.6379038114819485e-05, + "loss": 0.1544, + "step": 1790 + }, + { + "epoch": 0.18287107589149648, + "grad_norm": 3.40625, + "learning_rate": 4.6337141122260444e-05, + "loss": 0.2029, + "step": 1800 + }, + { + "epoch": 0.18388702631311593, + "grad_norm": 2.359375, + "learning_rate": 4.629502228442112e-05, + "loss": 0.1489, + "step": 1810 + }, + { + "epoch": 0.18490297673473535, + "grad_norm": 1.4453125, + "learning_rate": 4.6252682039217045e-05, + "loss": 0.2101, + "step": 1820 + }, + { + "epoch": 0.18591892715635477, + "grad_norm": 2.71875, + "learning_rate": 4.621012082686573e-05, + "loss": 0.2076, + "step": 1830 + }, + { + "epoch": 0.1869348775779742, + "grad_norm": 3.0625, + "learning_rate": 4.616733908988216e-05, + "loss": 0.2719, + "step": 1840 + }, + { + "epoch": 0.18795082799959362, + "grad_norm": 1.953125, + "learning_rate": 4.612433727307409e-05, + "loss": 0.2105, + "step": 1850 + }, + { + "epoch": 0.18896677842121304, + "grad_norm": 3.46875, + "learning_rate": 4.608111582353751e-05, + "loss": 0.1877, + "step": 1860 + }, + { + "epoch": 0.18998272884283246, + "grad_norm": 2.546875, + "learning_rate": 4.603767519065197e-05, + "loss": 0.2238, + "step": 1870 + }, + { + "epoch": 0.19099867926445188, + "grad_norm": 1.5703125, + "learning_rate": 4.599401582607589e-05, + "loss": 0.243, + "step": 1880 + }, + { + "epoch": 0.19201462968607133, + "grad_norm": 2.4375, + "learning_rate": 4.595013818374185e-05, + "loss": 0.1867, + "step": 1890 + }, + { + "epoch": 0.19303058010769075, + "grad_norm": 2.203125, + "learning_rate": 4.5906042719851925e-05, + "loss": 0.1994, + "step": 1900 + }, + { + "epoch": 0.19404653052931017, + "grad_norm": 3.984375, + "learning_rate": 4.586172989287291e-05, + "loss": 0.1899, + "step": 1910 + }, + { + "epoch": 0.1950624809509296, + "grad_norm": 2.6875, + "learning_rate": 4.5817200163531534e-05, + "loss": 0.2528, + "step": 1920 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 2.71875, + "learning_rate": 4.577245399480972e-05, + "loss": 0.2336, + "step": 1930 + }, + { + "epoch": 0.19709438179416844, + "grad_norm": 2.640625, + "learning_rate": 4.5727491851939715e-05, + "loss": 0.2204, + "step": 1940 + }, + { + "epoch": 0.19811033221578786, + "grad_norm": 1.78125, + "learning_rate": 4.568231420239929e-05, + "loss": 0.1656, + "step": 1950 + }, + { + "epoch": 0.19912628263740728, + "grad_norm": 3.15625, + "learning_rate": 4.563692151590687e-05, + "loss": 0.2105, + "step": 1960 + }, + { + "epoch": 0.20014223305902673, + "grad_norm": 1.3671875, + "learning_rate": 4.5591314264416666e-05, + "loss": 0.1464, + "step": 1970 + }, + { + "epoch": 0.20115818348064615, + "grad_norm": 4.25, + "learning_rate": 4.554549292211371e-05, + "loss": 0.2103, + "step": 1980 + }, + { + "epoch": 0.20217413390226557, + "grad_norm": 2.984375, + "learning_rate": 4.549945796540901e-05, + "loss": 0.144, + "step": 1990 + }, + { + "epoch": 0.203190084323885, + "grad_norm": 1.859375, + "learning_rate": 4.545320987293453e-05, + "loss": 0.1963, + "step": 2000 + }, + { + "epoch": 0.20420603474550442, + "grad_norm": 1.078125, + "learning_rate": 4.540674912553824e-05, + "loss": 0.2115, + "step": 2010 + }, + { + "epoch": 0.20522198516712384, + "grad_norm": 4.25, + "learning_rate": 4.536007620627911e-05, + "loss": 0.1682, + "step": 2020 + }, + { + "epoch": 0.20623793558874326, + "grad_norm": 2.71875, + "learning_rate": 4.531319160042212e-05, + "loss": 0.1992, + "step": 2030 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 1.2890625, + "learning_rate": 4.5266095795433126e-05, + "loss": 0.1134, + "step": 2040 + }, + { + "epoch": 0.20826983643198213, + "grad_norm": 3.296875, + "learning_rate": 4.5218789280973925e-05, + "loss": 0.1474, + "step": 2050 + }, + { + "epoch": 0.20928578685360155, + "grad_norm": 1.9375, + "learning_rate": 4.5171272548897024e-05, + "loss": 0.1955, + "step": 2060 + }, + { + "epoch": 0.21030173727522097, + "grad_norm": 2.734375, + "learning_rate": 4.512354609324063e-05, + "loss": 0.2042, + "step": 2070 + }, + { + "epoch": 0.2113176876968404, + "grad_norm": 2.921875, + "learning_rate": 4.507561041022347e-05, + "loss": 0.2174, + "step": 2080 + }, + { + "epoch": 0.21233363811845982, + "grad_norm": 2.40625, + "learning_rate": 4.502746599823963e-05, + "loss": 0.2634, + "step": 2090 + }, + { + "epoch": 0.21334958854007924, + "grad_norm": 1.71875, + "learning_rate": 4.497911335785339e-05, + "loss": 0.1884, + "step": 2100 + }, + { + "epoch": 0.21436553896169866, + "grad_norm": 0.79296875, + "learning_rate": 4.4930552991794e-05, + "loss": 0.1872, + "step": 2110 + }, + { + "epoch": 0.21538148938331808, + "grad_norm": 3.171875, + "learning_rate": 4.4881785404950474e-05, + "loss": 0.2233, + "step": 2120 + }, + { + "epoch": 0.21639743980493753, + "grad_norm": 2.59375, + "learning_rate": 4.483281110436631e-05, + "loss": 0.2374, + "step": 2130 + }, + { + "epoch": 0.21741339022655695, + "grad_norm": 3.328125, + "learning_rate": 4.478363059923426e-05, + "loss": 0.2545, + "step": 2140 + }, + { + "epoch": 0.21842934064817637, + "grad_norm": 2.3125, + "learning_rate": 4.4734244400891014e-05, + "loss": 0.2063, + "step": 2150 + }, + { + "epoch": 0.2194452910697958, + "grad_norm": 3.40625, + "learning_rate": 4.4684653022811865e-05, + "loss": 0.1219, + "step": 2160 + }, + { + "epoch": 0.22046124149141522, + "grad_norm": 4.1875, + "learning_rate": 4.463485698060541e-05, + "loss": 0.2805, + "step": 2170 + }, + { + "epoch": 0.22147719191303464, + "grad_norm": 2.3125, + "learning_rate": 4.458485679200814e-05, + "loss": 0.1998, + "step": 2180 + }, + { + "epoch": 0.22249314233465406, + "grad_norm": 3.578125, + "learning_rate": 4.453465297687912e-05, + "loss": 0.2489, + "step": 2190 + }, + { + "epoch": 0.22350909275627348, + "grad_norm": 2.59375, + "learning_rate": 4.448424605719452e-05, + "loss": 0.2731, + "step": 2200 + }, + { + "epoch": 0.22452504317789293, + "grad_norm": 3.28125, + "learning_rate": 4.443363655704224e-05, + "loss": 0.2425, + "step": 2210 + }, + { + "epoch": 0.22554099359951235, + "grad_norm": 2.78125, + "learning_rate": 4.438282500261641e-05, + "loss": 0.2938, + "step": 2220 + }, + { + "epoch": 0.22655694402113177, + "grad_norm": 1.1953125, + "learning_rate": 4.433181192221197e-05, + "loss": 0.1728, + "step": 2230 + }, + { + "epoch": 0.2275728944427512, + "grad_norm": 1.34375, + "learning_rate": 4.4280597846219155e-05, + "loss": 0.216, + "step": 2240 + }, + { + "epoch": 0.22858884486437062, + "grad_norm": 1.8515625, + "learning_rate": 4.422918330711796e-05, + "loss": 0.1612, + "step": 2250 + }, + { + "epoch": 0.22960479528599004, + "grad_norm": 1.90625, + "learning_rate": 4.417756883947263e-05, + "loss": 0.107, + "step": 2260 + }, + { + "epoch": 0.23062074570760946, + "grad_norm": 3.375, + "learning_rate": 4.412575497992611e-05, + "loss": 0.1756, + "step": 2270 + }, + { + "epoch": 0.23163669612922888, + "grad_norm": 4.375, + "learning_rate": 4.407374226719445e-05, + "loss": 0.234, + "step": 2280 + }, + { + "epoch": 0.23265264655084833, + "grad_norm": 3.25, + "learning_rate": 4.402153124206119e-05, + "loss": 0.2144, + "step": 2290 + }, + { + "epoch": 0.23366859697246775, + "grad_norm": 1.703125, + "learning_rate": 4.396912244737173e-05, + "loss": 0.1696, + "step": 2300 + }, + { + "epoch": 0.23468454739408717, + "grad_norm": 2.84375, + "learning_rate": 4.391651642802778e-05, + "loss": 0.2506, + "step": 2310 + }, + { + "epoch": 0.2357004978157066, + "grad_norm": 4.5, + "learning_rate": 4.386371373098155e-05, + "loss": 0.1686, + "step": 2320 + }, + { + "epoch": 0.23671644823732602, + "grad_norm": 2.515625, + "learning_rate": 4.381071490523018e-05, + "loss": 0.2403, + "step": 2330 + }, + { + "epoch": 0.23773239865894544, + "grad_norm": 4.4375, + "learning_rate": 4.3757520501809955e-05, + "loss": 0.1611, + "step": 2340 + }, + { + "epoch": 0.23874834908056486, + "grad_norm": 1.609375, + "learning_rate": 4.370413107379065e-05, + "loss": 0.1698, + "step": 2350 + }, + { + "epoch": 0.23976429950218428, + "grad_norm": 4.96875, + "learning_rate": 4.36505471762697e-05, + "loss": 0.1928, + "step": 2360 + }, + { + "epoch": 0.24078024992380373, + "grad_norm": 0.8984375, + "learning_rate": 4.3596769366366474e-05, + "loss": 0.2035, + "step": 2370 + }, + { + "epoch": 0.24179620034542315, + "grad_norm": 5.75, + "learning_rate": 4.354279820321649e-05, + "loss": 0.16, + "step": 2380 + }, + { + "epoch": 0.24281215076704257, + "grad_norm": 1.9453125, + "learning_rate": 4.34886342479656e-05, + "loss": 0.1851, + "step": 2390 + }, + { + "epoch": 0.243828101188662, + "grad_norm": 1.015625, + "learning_rate": 4.34342780637641e-05, + "loss": 0.1726, + "step": 2400 + }, + { + "epoch": 0.24484405161028142, + "grad_norm": 4.59375, + "learning_rate": 4.337973021576095e-05, + "loss": 0.2847, + "step": 2410 + }, + { + "epoch": 0.24586000203190084, + "grad_norm": 1.03125, + "learning_rate": 4.3324991271097846e-05, + "loss": 0.2528, + "step": 2420 + }, + { + "epoch": 0.24687595245352026, + "grad_norm": 2.1875, + "learning_rate": 4.3270061798903374e-05, + "loss": 0.1573, + "step": 2430 + }, + { + "epoch": 0.24789190287513968, + "grad_norm": 0.98046875, + "learning_rate": 4.321494237028701e-05, + "loss": 0.1703, + "step": 2440 + }, + { + "epoch": 0.24890785329675913, + "grad_norm": 3.8125, + "learning_rate": 4.31596335583333e-05, + "loss": 0.2613, + "step": 2450 + }, + { + "epoch": 0.24992380371837855, + "grad_norm": 4.0625, + "learning_rate": 4.310413593809579e-05, + "loss": 0.22, + "step": 2460 + }, + { + "epoch": 0.250939754139998, + "grad_norm": 3.15625, + "learning_rate": 4.304845008659108e-05, + "loss": 0.1263, + "step": 2470 + }, + { + "epoch": 0.25195570456161737, + "grad_norm": 3.046875, + "learning_rate": 4.2992576582792895e-05, + "loss": 0.1639, + "step": 2480 + }, + { + "epoch": 0.2529716549832368, + "grad_norm": 9.8125, + "learning_rate": 4.293651600762595e-05, + "loss": 0.2681, + "step": 2490 + }, + { + "epoch": 0.25398760540485626, + "grad_norm": 3.734375, + "learning_rate": 4.288026894395999e-05, + "loss": 0.2292, + "step": 2500 + }, + { + "epoch": 0.25500355582647566, + "grad_norm": 0.455078125, + "learning_rate": 4.2823835976603723e-05, + "loss": 0.2324, + "step": 2510 + }, + { + "epoch": 0.2560195062480951, + "grad_norm": 5.625, + "learning_rate": 4.276721769229869e-05, + "loss": 0.1834, + "step": 2520 + }, + { + "epoch": 0.2570354566697145, + "grad_norm": 1.3671875, + "learning_rate": 4.271041467971323e-05, + "loss": 0.1826, + "step": 2530 + }, + { + "epoch": 0.25805140709133395, + "grad_norm": 5.0625, + "learning_rate": 4.265342752943632e-05, + "loss": 0.2463, + "step": 2540 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 2.859375, + "learning_rate": 4.2596256833971425e-05, + "loss": 0.2598, + "step": 2550 + }, + { + "epoch": 0.2600833079345728, + "grad_norm": 1.8515625, + "learning_rate": 4.2538903187730374e-05, + "loss": 0.1148, + "step": 2560 + }, + { + "epoch": 0.26109925835619224, + "grad_norm": 2.71875, + "learning_rate": 4.248136718702716e-05, + "loss": 0.2123, + "step": 2570 + }, + { + "epoch": 0.26211520877781164, + "grad_norm": 4.5625, + "learning_rate": 4.242364943007172e-05, + "loss": 0.2369, + "step": 2580 + }, + { + "epoch": 0.2631311591994311, + "grad_norm": 2.296875, + "learning_rate": 4.236575051696377e-05, + "loss": 0.261, + "step": 2590 + }, + { + "epoch": 0.2641471096210505, + "grad_norm": 2.75, + "learning_rate": 4.2307671049686514e-05, + "loss": 0.1564, + "step": 2600 + }, + { + "epoch": 0.26516306004266993, + "grad_norm": 3.5, + "learning_rate": 4.2249411632100396e-05, + "loss": 0.1563, + "step": 2610 + }, + { + "epoch": 0.2661790104642893, + "grad_norm": 2.84375, + "learning_rate": 4.219097286993684e-05, + "loss": 0.1697, + "step": 2620 + }, + { + "epoch": 0.26719496088590877, + "grad_norm": 2.125, + "learning_rate": 4.2132355370791946e-05, + "loss": 0.1844, + "step": 2630 + }, + { + "epoch": 0.26821091130752817, + "grad_norm": 4.03125, + "learning_rate": 4.2073559744120156e-05, + "loss": 0.2144, + "step": 2640 + }, + { + "epoch": 0.2692268617291476, + "grad_norm": 2.375, + "learning_rate": 4.201458660122793e-05, + "loss": 0.2013, + "step": 2650 + }, + { + "epoch": 0.27024281215076706, + "grad_norm": 3.625, + "learning_rate": 4.1955436555267393e-05, + "loss": 0.2166, + "step": 2660 + }, + { + "epoch": 0.27125876257238646, + "grad_norm": 0.328125, + "learning_rate": 4.189611022122997e-05, + "loss": 0.1934, + "step": 2670 + }, + { + "epoch": 0.2722747129940059, + "grad_norm": 2.75, + "learning_rate": 4.1836608215939944e-05, + "loss": 0.2157, + "step": 2680 + }, + { + "epoch": 0.2732906634156253, + "grad_norm": 3.5, + "learning_rate": 4.17769311580481e-05, + "loss": 0.18, + "step": 2690 + }, + { + "epoch": 0.27430661383724475, + "grad_norm": 2.109375, + "learning_rate": 4.171707966802528e-05, + "loss": 0.2178, + "step": 2700 + }, + { + "epoch": 0.27532256425886414, + "grad_norm": 4.65625, + "learning_rate": 4.16570543681559e-05, + "loss": 0.1896, + "step": 2710 + }, + { + "epoch": 0.2763385146804836, + "grad_norm": 4.8125, + "learning_rate": 4.159685588253151e-05, + "loss": 0.1322, + "step": 2720 + }, + { + "epoch": 0.27735446510210304, + "grad_norm": 3.9375, + "learning_rate": 4.153648483704429e-05, + "loss": 0.184, + "step": 2730 + }, + { + "epoch": 0.27837041552372244, + "grad_norm": 4.53125, + "learning_rate": 4.147594185938057e-05, + "loss": 0.2451, + "step": 2740 + }, + { + "epoch": 0.2793863659453419, + "grad_norm": 1.0390625, + "learning_rate": 4.141522757901426e-05, + "loss": 0.2367, + "step": 2750 + }, + { + "epoch": 0.2804023163669613, + "grad_norm": 3.375, + "learning_rate": 4.1354342627200345e-05, + "loss": 0.179, + "step": 2760 + }, + { + "epoch": 0.28141826678858073, + "grad_norm": 2.953125, + "learning_rate": 4.1293287636968286e-05, + "loss": 0.1396, + "step": 2770 + }, + { + "epoch": 0.2824342172102001, + "grad_norm": 2.546875, + "learning_rate": 4.1232063243115485e-05, + "loss": 0.1963, + "step": 2780 + }, + { + "epoch": 0.28345016763181957, + "grad_norm": 5.09375, + "learning_rate": 4.117067008220063e-05, + "loss": 0.2457, + "step": 2790 + }, + { + "epoch": 0.28446611805343897, + "grad_norm": 2.046875, + "learning_rate": 4.110910879253712e-05, + "loss": 0.2262, + "step": 2800 + }, + { + "epoch": 0.2854820684750584, + "grad_norm": 2.1875, + "learning_rate": 4.104738001418641e-05, + "loss": 0.2499, + "step": 2810 + }, + { + "epoch": 0.28649801889667786, + "grad_norm": 2.59375, + "learning_rate": 4.098548438895135e-05, + "loss": 0.1667, + "step": 2820 + }, + { + "epoch": 0.28751396931829726, + "grad_norm": 2.875, + "learning_rate": 4.092342256036954e-05, + "loss": 0.2288, + "step": 2830 + }, + { + "epoch": 0.2885299197399167, + "grad_norm": 3.015625, + "learning_rate": 4.086119517370659e-05, + "loss": 0.2038, + "step": 2840 + }, + { + "epoch": 0.2895458701615361, + "grad_norm": 3.53125, + "learning_rate": 4.0798802875949485e-05, + "loss": 0.181, + "step": 2850 + }, + { + "epoch": 0.29056182058315555, + "grad_norm": 2.296875, + "learning_rate": 4.073624631579975e-05, + "loss": 0.1886, + "step": 2860 + }, + { + "epoch": 0.29157777100477494, + "grad_norm": 3.609375, + "learning_rate": 4.067352614366685e-05, + "loss": 0.2053, + "step": 2870 + }, + { + "epoch": 0.2925937214263944, + "grad_norm": 2.328125, + "learning_rate": 4.061064301166128e-05, + "loss": 0.1409, + "step": 2880 + }, + { + "epoch": 0.29360967184801384, + "grad_norm": 4.9375, + "learning_rate": 4.054759757358787e-05, + "loss": 0.184, + "step": 2890 + }, + { + "epoch": 0.29462562226963324, + "grad_norm": 4.6875, + "learning_rate": 4.048439048493898e-05, + "loss": 0.2306, + "step": 2900 + }, + { + "epoch": 0.2956415726912527, + "grad_norm": 4.09375, + "learning_rate": 4.0421022402887676e-05, + "loss": 0.1914, + "step": 2910 + }, + { + "epoch": 0.2966575231128721, + "grad_norm": 2.3125, + "learning_rate": 4.035749398628088e-05, + "loss": 0.1653, + "step": 2920 + }, + { + "epoch": 0.29767347353449153, + "grad_norm": 2.515625, + "learning_rate": 4.029380589563256e-05, + "loss": 0.1941, + "step": 2930 + }, + { + "epoch": 0.2986894239561109, + "grad_norm": 1.78125, + "learning_rate": 4.02299587931168e-05, + "loss": 0.1117, + "step": 2940 + }, + { + "epoch": 0.29970537437773037, + "grad_norm": 0.8359375, + "learning_rate": 4.0165953342560974e-05, + "loss": 0.1605, + "step": 2950 + }, + { + "epoch": 0.30072132479934977, + "grad_norm": 3.046875, + "learning_rate": 4.010179020943884e-05, + "loss": 0.1726, + "step": 2960 + }, + { + "epoch": 0.3017372752209692, + "grad_norm": 3.453125, + "learning_rate": 4.003747006086357e-05, + "loss": 0.2208, + "step": 2970 + }, + { + "epoch": 0.30275322564258866, + "grad_norm": 2.515625, + "learning_rate": 3.9972993565580866e-05, + "loss": 0.1325, + "step": 2980 + }, + { + "epoch": 0.30376917606420806, + "grad_norm": 3.046875, + "learning_rate": 3.9908361393962e-05, + "loss": 0.2014, + "step": 2990 + }, + { + "epoch": 0.3047851264858275, + "grad_norm": 2.28125, + "learning_rate": 3.984357421799681e-05, + "loss": 0.165, + "step": 3000 + }, + { + "epoch": 0.3058010769074469, + "grad_norm": 5.09375, + "learning_rate": 3.9778632711286756e-05, + "loss": 0.212, + "step": 3010 + }, + { + "epoch": 0.30681702732906635, + "grad_norm": 4.25, + "learning_rate": 3.971353754903788e-05, + "loss": 0.2388, + "step": 3020 + }, + { + "epoch": 0.30783297775068574, + "grad_norm": 2.34375, + "learning_rate": 3.964828940805381e-05, + "loss": 0.2175, + "step": 3030 + }, + { + "epoch": 0.3088489281723052, + "grad_norm": 4.09375, + "learning_rate": 3.95828889667287e-05, + "loss": 0.2088, + "step": 3040 + }, + { + "epoch": 0.30986487859392464, + "grad_norm": 2.359375, + "learning_rate": 3.9517336905040244e-05, + "loss": 0.1913, + "step": 3050 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 1.1640625, + "learning_rate": 3.9451633904542483e-05, + "loss": 0.2185, + "step": 3060 + }, + { + "epoch": 0.3118967794371635, + "grad_norm": 2.59375, + "learning_rate": 3.9385780648358846e-05, + "loss": 0.2072, + "step": 3070 + }, + { + "epoch": 0.3129127298587829, + "grad_norm": 3.015625, + "learning_rate": 3.9319777821174955e-05, + "loss": 0.1902, + "step": 3080 + }, + { + "epoch": 0.31392868028040233, + "grad_norm": 2.375, + "learning_rate": 3.925362610923158e-05, + "loss": 0.259, + "step": 3090 + }, + { + "epoch": 0.3149446307020217, + "grad_norm": 4.65625, + "learning_rate": 3.918732620031742e-05, + "loss": 0.2026, + "step": 3100 + }, + { + "epoch": 0.31596058112364117, + "grad_norm": 2.1875, + "learning_rate": 3.912087878376205e-05, + "loss": 0.1478, + "step": 3110 + }, + { + "epoch": 0.31697653154526056, + "grad_norm": 2.34375, + "learning_rate": 3.905428455042865e-05, + "loss": 0.167, + "step": 3120 + }, + { + "epoch": 0.31799248196688, + "grad_norm": 2.390625, + "learning_rate": 3.898754419270693e-05, + "loss": 0.1629, + "step": 3130 + }, + { + "epoch": 0.31900843238849946, + "grad_norm": 1.546875, + "learning_rate": 3.892065840450583e-05, + "loss": 0.1308, + "step": 3140 + }, + { + "epoch": 0.32002438281011886, + "grad_norm": 4.625, + "learning_rate": 3.885362788124637e-05, + "loss": 0.2008, + "step": 3150 + }, + { + "epoch": 0.3210403332317383, + "grad_norm": 3.8125, + "learning_rate": 3.8786453319854396e-05, + "loss": 0.2225, + "step": 3160 + }, + { + "epoch": 0.3220562836533577, + "grad_norm": 3.015625, + "learning_rate": 3.8719135418753366e-05, + "loss": 0.2243, + "step": 3170 + }, + { + "epoch": 0.32307223407497715, + "grad_norm": 5.6875, + "learning_rate": 3.865167487785702e-05, + "loss": 0.1981, + "step": 3180 + }, + { + "epoch": 0.32408818449659654, + "grad_norm": 4.84375, + "learning_rate": 3.8584072398562164e-05, + "loss": 0.2031, + "step": 3190 + }, + { + "epoch": 0.325104134918216, + "grad_norm": 4.0625, + "learning_rate": 3.851632868374136e-05, + "loss": 0.1621, + "step": 3200 + }, + { + "epoch": 0.32612008533983544, + "grad_norm": 3.421875, + "learning_rate": 3.844844443773562e-05, + "loss": 0.1674, + "step": 3210 + }, + { + "epoch": 0.32713603576145484, + "grad_norm": 1.3671875, + "learning_rate": 3.8380420366347046e-05, + "loss": 0.1502, + "step": 3220 + }, + { + "epoch": 0.3281519861830743, + "grad_norm": 3.734375, + "learning_rate": 3.831225717683157e-05, + "loss": 0.1868, + "step": 3230 + }, + { + "epoch": 0.3291679366046937, + "grad_norm": 2.703125, + "learning_rate": 3.8243955577891534e-05, + "loss": 0.1818, + "step": 3240 + }, + { + "epoch": 0.3301838870263131, + "grad_norm": 3.796875, + "learning_rate": 3.8175516279668335e-05, + "loss": 0.2215, + "step": 3250 + }, + { + "epoch": 0.3311998374479325, + "grad_norm": 3.203125, + "learning_rate": 3.810693999373505e-05, + "loss": 0.2544, + "step": 3260 + }, + { + "epoch": 0.33221578786955197, + "grad_norm": 4.0, + "learning_rate": 3.8038227433089056e-05, + "loss": 0.1175, + "step": 3270 + }, + { + "epoch": 0.33323173829117136, + "grad_norm": 3.625, + "learning_rate": 3.796937931214458e-05, + "loss": 0.2213, + "step": 3280 + }, + { + "epoch": 0.3342476887127908, + "grad_norm": 1.7265625, + "learning_rate": 3.7900396346725296e-05, + "loss": 0.1711, + "step": 3290 + }, + { + "epoch": 0.33526363913441026, + "grad_norm": 3.140625, + "learning_rate": 3.783127925405686e-05, + "loss": 0.2628, + "step": 3300 + }, + { + "epoch": 0.33627958955602966, + "grad_norm": 2.1875, + "learning_rate": 3.77620287527595e-05, + "loss": 0.1671, + "step": 3310 + }, + { + "epoch": 0.3372955399776491, + "grad_norm": 5.28125, + "learning_rate": 3.769264556284048e-05, + "loss": 0.2109, + "step": 3320 + }, + { + "epoch": 0.3383114903992685, + "grad_norm": 2.875, + "learning_rate": 3.762313040568665e-05, + "loss": 0.1978, + "step": 3330 + }, + { + "epoch": 0.33932744082088795, + "grad_norm": 2.234375, + "learning_rate": 3.755348400405697e-05, + "loss": 0.1275, + "step": 3340 + }, + { + "epoch": 0.34034339124250734, + "grad_norm": 1.9453125, + "learning_rate": 3.7483707082074945e-05, + "loss": 0.1482, + "step": 3350 + }, + { + "epoch": 0.3413593416641268, + "grad_norm": 5.40625, + "learning_rate": 3.741380036522111e-05, + "loss": 0.1933, + "step": 3360 + }, + { + "epoch": 0.34237529208574624, + "grad_norm": 4.53125, + "learning_rate": 3.734376458032551e-05, + "loss": 0.1925, + "step": 3370 + }, + { + "epoch": 0.34339124250736563, + "grad_norm": 4.0625, + "learning_rate": 3.727360045556014e-05, + "loss": 0.2297, + "step": 3380 + }, + { + "epoch": 0.3444071929289851, + "grad_norm": 2.53125, + "learning_rate": 3.7203308720431336e-05, + "loss": 0.1704, + "step": 3390 + }, + { + "epoch": 0.3454231433506045, + "grad_norm": 1.859375, + "learning_rate": 3.7132890105772234e-05, + "loss": 0.258, + "step": 3400 + }, + { + "epoch": 0.3464390937722239, + "grad_norm": 3.90625, + "learning_rate": 3.706234534373515e-05, + "loss": 0.2376, + "step": 3410 + }, + { + "epoch": 0.3474550441938433, + "grad_norm": 1.1015625, + "learning_rate": 3.6991675167783985e-05, + "loss": 0.2403, + "step": 3420 + }, + { + "epoch": 0.34847099461546277, + "grad_norm": 1.1640625, + "learning_rate": 3.6920880312686556e-05, + "loss": 0.1642, + "step": 3430 + }, + { + "epoch": 0.34948694503708216, + "grad_norm": 2.875, + "learning_rate": 3.684996151450702e-05, + "loss": 0.1455, + "step": 3440 + }, + { + "epoch": 0.3505028954587016, + "grad_norm": 0.59765625, + "learning_rate": 3.6778919510598155e-05, + "loss": 0.2175, + "step": 3450 + }, + { + "epoch": 0.35151884588032106, + "grad_norm": 0.93359375, + "learning_rate": 3.670775503959376e-05, + "loss": 0.1858, + "step": 3460 + }, + { + "epoch": 0.35253479630194046, + "grad_norm": 4.1875, + "learning_rate": 3.6636468841400917e-05, + "loss": 0.1911, + "step": 3470 + }, + { + "epoch": 0.3535507467235599, + "grad_norm": 3.734375, + "learning_rate": 3.656506165719233e-05, + "loss": 0.2114, + "step": 3480 + }, + { + "epoch": 0.3545666971451793, + "grad_norm": 1.171875, + "learning_rate": 3.649353422939863e-05, + "loss": 0.1841, + "step": 3490 + }, + { + "epoch": 0.35558264756679875, + "grad_norm": 2.53125, + "learning_rate": 3.6421887301700615e-05, + "loss": 0.1505, + "step": 3500 + }, + { + "epoch": 0.35659859798841814, + "grad_norm": 4.9375, + "learning_rate": 3.6350121619021524e-05, + "loss": 0.2625, + "step": 3510 + }, + { + "epoch": 0.3576145484100376, + "grad_norm": 5.25, + "learning_rate": 3.627823792751936e-05, + "loss": 0.1676, + "step": 3520 + }, + { + "epoch": 0.35863049883165704, + "grad_norm": 1.09375, + "learning_rate": 3.620623697457905e-05, + "loss": 0.1963, + "step": 3530 + }, + { + "epoch": 0.35964644925327643, + "grad_norm": 4.03125, + "learning_rate": 3.613411950880468e-05, + "loss": 0.2048, + "step": 3540 + }, + { + "epoch": 0.3606623996748959, + "grad_norm": 4.40625, + "learning_rate": 3.606188628001178e-05, + "loss": 0.226, + "step": 3550 + }, + { + "epoch": 0.3616783500965153, + "grad_norm": 2.375, + "learning_rate": 3.598953803921947e-05, + "loss": 0.1884, + "step": 3560 + }, + { + "epoch": 0.3626943005181347, + "grad_norm": 3.21875, + "learning_rate": 3.591707553864266e-05, + "loss": 0.224, + "step": 3570 + }, + { + "epoch": 0.3637102509397541, + "grad_norm": 3.5625, + "learning_rate": 3.584449953168423e-05, + "loss": 0.1866, + "step": 3580 + }, + { + "epoch": 0.36472620136137357, + "grad_norm": 2.359375, + "learning_rate": 3.577181077292722e-05, + "loss": 0.1663, + "step": 3590 + }, + { + "epoch": 0.36574215178299296, + "grad_norm": 5.0, + "learning_rate": 3.569901001812696e-05, + "loss": 0.2032, + "step": 3600 + }, + { + "epoch": 0.3667581022046124, + "grad_norm": 1.953125, + "learning_rate": 3.562609802420321e-05, + "loss": 0.2395, + "step": 3610 + }, + { + "epoch": 0.36777405262623186, + "grad_norm": 3.796875, + "learning_rate": 3.555307554923229e-05, + "loss": 0.1799, + "step": 3620 + }, + { + "epoch": 0.36879000304785126, + "grad_norm": 4.4375, + "learning_rate": 3.547994335243925e-05, + "loss": 0.1771, + "step": 3630 + }, + { + "epoch": 0.3698059534694707, + "grad_norm": 1.890625, + "learning_rate": 3.540670219418989e-05, + "loss": 0.2123, + "step": 3640 + }, + { + "epoch": 0.3708219038910901, + "grad_norm": 4.03125, + "learning_rate": 3.53333528359829e-05, + "loss": 0.2159, + "step": 3650 + }, + { + "epoch": 0.37183785431270955, + "grad_norm": 3.265625, + "learning_rate": 3.525989604044198e-05, + "loss": 0.2749, + "step": 3660 + }, + { + "epoch": 0.37285380473432894, + "grad_norm": 1.4375, + "learning_rate": 3.5186332571307826e-05, + "loss": 0.1613, + "step": 3670 + }, + { + "epoch": 0.3738697551559484, + "grad_norm": 3.984375, + "learning_rate": 3.511266319343025e-05, + "loss": 0.1877, + "step": 3680 + }, + { + "epoch": 0.37488570557756784, + "grad_norm": 2.203125, + "learning_rate": 3.503888867276022e-05, + "loss": 0.2185, + "step": 3690 + }, + { + "epoch": 0.37590165599918723, + "grad_norm": 1.5078125, + "learning_rate": 3.4965009776341894e-05, + "loss": 0.2195, + "step": 3700 + }, + { + "epoch": 0.3769176064208067, + "grad_norm": 4.375, + "learning_rate": 3.489102727230461e-05, + "loss": 0.2344, + "step": 3710 + }, + { + "epoch": 0.3779335568424261, + "grad_norm": 2.984375, + "learning_rate": 3.481694192985496e-05, + "loss": 0.1863, + "step": 3720 + }, + { + "epoch": 0.3789495072640455, + "grad_norm": 1.1328125, + "learning_rate": 3.474275451926875e-05, + "loss": 0.1894, + "step": 3730 + }, + { + "epoch": 0.3799654576856649, + "grad_norm": 2.265625, + "learning_rate": 3.4668465811883e-05, + "loss": 0.2127, + "step": 3740 + }, + { + "epoch": 0.38098140810728437, + "grad_norm": 2.921875, + "learning_rate": 3.4594076580087914e-05, + "loss": 0.2125, + "step": 3750 + }, + { + "epoch": 0.38199735852890376, + "grad_norm": 2.390625, + "learning_rate": 3.451958759731889e-05, + "loss": 0.1801, + "step": 3760 + }, + { + "epoch": 0.3830133089505232, + "grad_norm": 3.046875, + "learning_rate": 3.4444999638048456e-05, + "loss": 0.1949, + "step": 3770 + }, + { + "epoch": 0.38402925937214266, + "grad_norm": 2.890625, + "learning_rate": 3.437031347777817e-05, + "loss": 0.2719, + "step": 3780 + }, + { + "epoch": 0.38504520979376206, + "grad_norm": 3.9375, + "learning_rate": 3.4295529893030634e-05, + "loss": 0.1697, + "step": 3790 + }, + { + "epoch": 0.3860611602153815, + "grad_norm": 2.0625, + "learning_rate": 3.422064966134138e-05, + "loss": 0.1557, + "step": 3800 + }, + { + "epoch": 0.3870771106370009, + "grad_norm": 2.234375, + "learning_rate": 3.4145673561250794e-05, + "loss": 0.2129, + "step": 3810 + }, + { + "epoch": 0.38809306105862035, + "grad_norm": 4.96875, + "learning_rate": 3.4070602372296e-05, + "loss": 0.2068, + "step": 3820 + }, + { + "epoch": 0.38910901148023974, + "grad_norm": 2.234375, + "learning_rate": 3.39954368750028e-05, + "loss": 0.1634, + "step": 3830 + }, + { + "epoch": 0.3901249619018592, + "grad_norm": 1.75, + "learning_rate": 3.392017785087752e-05, + "loss": 0.2299, + "step": 3840 + }, + { + "epoch": 0.39114091232347864, + "grad_norm": 3.90625, + "learning_rate": 3.38448260823989e-05, + "loss": 0.1585, + "step": 3850 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 2.8125, + "learning_rate": 3.376938235300996e-05, + "loss": 0.2382, + "step": 3860 + }, + { + "epoch": 0.3931728131667175, + "grad_norm": 5.375, + "learning_rate": 3.369384744710984e-05, + "loss": 0.1987, + "step": 3870 + }, + { + "epoch": 0.3941887635883369, + "grad_norm": 2.578125, + "learning_rate": 3.361822215004566e-05, + "loss": 0.2316, + "step": 3880 + }, + { + "epoch": 0.3952047140099563, + "grad_norm": 2.0, + "learning_rate": 3.354250724810436e-05, + "loss": 0.2019, + "step": 3890 + }, + { + "epoch": 0.3962206644315757, + "grad_norm": 2.3125, + "learning_rate": 3.34667035285045e-05, + "loss": 0.187, + "step": 3900 + }, + { + "epoch": 0.39723661485319517, + "grad_norm": 3.53125, + "learning_rate": 3.339081177938811e-05, + "loss": 0.2353, + "step": 3910 + }, + { + "epoch": 0.39825256527481456, + "grad_norm": 1.9609375, + "learning_rate": 3.331483278981244e-05, + "loss": 0.2078, + "step": 3920 + }, + { + "epoch": 0.399268515696434, + "grad_norm": 1.2109375, + "learning_rate": 3.323876734974183e-05, + "loss": 0.1761, + "step": 3930 + }, + { + "epoch": 0.40028446611805346, + "grad_norm": 4.0625, + "learning_rate": 3.316261625003943e-05, + "loss": 0.2081, + "step": 3940 + }, + { + "epoch": 0.40130041653967286, + "grad_norm": 1.953125, + "learning_rate": 3.308638028245902e-05, + "loss": 0.2087, + "step": 3950 + }, + { + "epoch": 0.4023163669612923, + "grad_norm": 2.390625, + "learning_rate": 3.301006023963676e-05, + "loss": 0.1579, + "step": 3960 + }, + { + "epoch": 0.4033323173829117, + "grad_norm": 3.53125, + "learning_rate": 3.293365691508295e-05, + "loss": 0.1904, + "step": 3970 + }, + { + "epoch": 0.40434826780453115, + "grad_norm": 3.0, + "learning_rate": 3.285717110317379e-05, + "loss": 0.1991, + "step": 3980 + }, + { + "epoch": 0.40536421822615054, + "grad_norm": 7.21875, + "learning_rate": 3.27806035991431e-05, + "loss": 0.1445, + "step": 3990 + }, + { + "epoch": 0.40638016864777, + "grad_norm": 1.0859375, + "learning_rate": 3.2703955199074075e-05, + "loss": 0.2393, + "step": 4000 + }, + { + "epoch": 0.40739611906938944, + "grad_norm": 4.5625, + "learning_rate": 3.262722669989098e-05, + "loss": 0.1789, + "step": 4010 + }, + { + "epoch": 0.40841206949100883, + "grad_norm": 3.09375, + "learning_rate": 3.255041889935092e-05, + "loss": 0.1511, + "step": 4020 + }, + { + "epoch": 0.4094280199126283, + "grad_norm": 1.90625, + "learning_rate": 3.247353259603547e-05, + "loss": 0.2066, + "step": 4030 + }, + { + "epoch": 0.4104439703342477, + "grad_norm": 2.28125, + "learning_rate": 3.239656858934242e-05, + "loss": 0.1564, + "step": 4040 + }, + { + "epoch": 0.4114599207558671, + "grad_norm": 2.609375, + "learning_rate": 3.231952767947746e-05, + "loss": 0.1503, + "step": 4050 + }, + { + "epoch": 0.4124758711774865, + "grad_norm": 1.4453125, + "learning_rate": 3.2242410667445844e-05, + "loss": 0.1633, + "step": 4060 + }, + { + "epoch": 0.41349182159910597, + "grad_norm": 3.015625, + "learning_rate": 3.2165218355044076e-05, + "loss": 0.1492, + "step": 4070 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 3.234375, + "learning_rate": 3.2087951544851566e-05, + "loss": 0.3051, + "step": 4080 + }, + { + "epoch": 0.4155237224423448, + "grad_norm": 2.9375, + "learning_rate": 3.20106110402223e-05, + "loss": 0.2229, + "step": 4090 + }, + { + "epoch": 0.41653967286396426, + "grad_norm": 3.171875, + "learning_rate": 3.1933197645276455e-05, + "loss": 0.2224, + "step": 4100 + }, + { + "epoch": 0.41755562328558365, + "grad_norm": 2.09375, + "learning_rate": 3.185571216489209e-05, + "loss": 0.1297, + "step": 4110 + }, + { + "epoch": 0.4185715737072031, + "grad_norm": 3.625, + "learning_rate": 3.177815540469669e-05, + "loss": 0.2074, + "step": 4120 + }, + { + "epoch": 0.4195875241288225, + "grad_norm": 2.296875, + "learning_rate": 3.1700528171058916e-05, + "loss": 0.1949, + "step": 4130 + }, + { + "epoch": 0.42060347455044195, + "grad_norm": 3.8125, + "learning_rate": 3.162283127108011e-05, + "loss": 0.1661, + "step": 4140 + }, + { + "epoch": 0.42161942497206134, + "grad_norm": 2.5, + "learning_rate": 3.154506551258594e-05, + "loss": 0.2275, + "step": 4150 + }, + { + "epoch": 0.4226353753936808, + "grad_norm": 2.96875, + "learning_rate": 3.146723170411804e-05, + "loss": 0.2242, + "step": 4160 + }, + { + "epoch": 0.42365132581530024, + "grad_norm": 6.625, + "learning_rate": 3.138933065492552e-05, + "loss": 0.1897, + "step": 4170 + }, + { + "epoch": 0.42466727623691963, + "grad_norm": 0.8515625, + "learning_rate": 3.131136317495665e-05, + "loss": 0.1629, + "step": 4180 + }, + { + "epoch": 0.4256832266585391, + "grad_norm": 0.94140625, + "learning_rate": 3.1233330074850364e-05, + "loss": 0.1535, + "step": 4190 + }, + { + "epoch": 0.4266991770801585, + "grad_norm": 2.6875, + "learning_rate": 3.115523216592786e-05, + "loss": 0.2494, + "step": 4200 + }, + { + "epoch": 0.4277151275017779, + "grad_norm": 2.578125, + "learning_rate": 3.107707026018417e-05, + "loss": 0.1705, + "step": 4210 + }, + { + "epoch": 0.4287310779233973, + "grad_norm": 3.0625, + "learning_rate": 3.09988451702797e-05, + "loss": 0.1507, + "step": 4220 + }, + { + "epoch": 0.42974702834501677, + "grad_norm": 2.421875, + "learning_rate": 3.0920557709531804e-05, + "loss": 0.3071, + "step": 4230 + }, + { + "epoch": 0.43076297876663616, + "grad_norm": 3.640625, + "learning_rate": 3.0842208691906306e-05, + "loss": 0.199, + "step": 4240 + }, + { + "epoch": 0.4317789291882556, + "grad_norm": 3.5625, + "learning_rate": 3.076379893200904e-05, + "loss": 0.1987, + "step": 4250 + }, + { + "epoch": 0.43279487960987506, + "grad_norm": 3.65625, + "learning_rate": 3.068532924507739e-05, + "loss": 0.1945, + "step": 4260 + }, + { + "epoch": 0.43381083003149445, + "grad_norm": 5.875, + "learning_rate": 3.060680044697183e-05, + "loss": 0.1937, + "step": 4270 + }, + { + "epoch": 0.4348267804531139, + "grad_norm": 2.859375, + "learning_rate": 3.052821335416739e-05, + "loss": 0.1643, + "step": 4280 + }, + { + "epoch": 0.4358427308747333, + "grad_norm": 3.296875, + "learning_rate": 3.0449568783745203e-05, + "loss": 0.1455, + "step": 4290 + }, + { + "epoch": 0.43685868129635275, + "grad_norm": 0.427734375, + "learning_rate": 3.0370867553384023e-05, + "loss": 0.1891, + "step": 4300 + }, + { + "epoch": 0.43787463171797214, + "grad_norm": 0.361328125, + "learning_rate": 3.029211048135171e-05, + "loss": 0.1377, + "step": 4310 + }, + { + "epoch": 0.4388905821395916, + "grad_norm": 1.8203125, + "learning_rate": 3.021329838649668e-05, + "loss": 0.2194, + "step": 4320 + }, + { + "epoch": 0.43990653256121104, + "grad_norm": 1.8828125, + "learning_rate": 3.0134432088239462e-05, + "loss": 0.1915, + "step": 4330 + }, + { + "epoch": 0.44092248298283043, + "grad_norm": 2.015625, + "learning_rate": 3.0055512406564146e-05, + "loss": 0.1794, + "step": 4340 + }, + { + "epoch": 0.4419384334044499, + "grad_norm": 2.546875, + "learning_rate": 2.9976540162009836e-05, + "loss": 0.2154, + "step": 4350 + }, + { + "epoch": 0.4429543838260693, + "grad_norm": 4.09375, + "learning_rate": 2.9897516175662155e-05, + "loss": 0.1861, + "step": 4360 + }, + { + "epoch": 0.4439703342476887, + "grad_norm": 3.953125, + "learning_rate": 2.9818441269144693e-05, + "loss": 0.1857, + "step": 4370 + }, + { + "epoch": 0.4449862846693081, + "grad_norm": 2.234375, + "learning_rate": 2.9739316264610452e-05, + "loss": 0.1493, + "step": 4380 + }, + { + "epoch": 0.44600223509092757, + "grad_norm": 1.109375, + "learning_rate": 2.966014198473332e-05, + "loss": 0.186, + "step": 4390 + }, + { + "epoch": 0.44701818551254696, + "grad_norm": 4.5625, + "learning_rate": 2.9580919252699502e-05, + "loss": 0.1963, + "step": 4400 + }, + { + "epoch": 0.4480341359341664, + "grad_norm": 7.3125, + "learning_rate": 2.9501648892198984e-05, + "loss": 0.2882, + "step": 4410 + }, + { + "epoch": 0.44905008635578586, + "grad_norm": 3.03125, + "learning_rate": 2.942233172741693e-05, + "loss": 0.2154, + "step": 4420 + }, + { + "epoch": 0.45006603677740525, + "grad_norm": 2.421875, + "learning_rate": 2.934296858302515e-05, + "loss": 0.2228, + "step": 4430 + }, + { + "epoch": 0.4510819871990247, + "grad_norm": 1.6015625, + "learning_rate": 2.9263560284173485e-05, + "loss": 0.1637, + "step": 4440 + }, + { + "epoch": 0.4520979376206441, + "grad_norm": 4.5, + "learning_rate": 2.91841076564813e-05, + "loss": 0.1396, + "step": 4450 + }, + { + "epoch": 0.45311388804226355, + "grad_norm": 1.9609375, + "learning_rate": 2.9104611526028808e-05, + "loss": 0.186, + "step": 4460 + }, + { + "epoch": 0.45412983846388294, + "grad_norm": 2.046875, + "learning_rate": 2.902507271934855e-05, + "loss": 0.1706, + "step": 4470 + }, + { + "epoch": 0.4551457888855024, + "grad_norm": 2.390625, + "learning_rate": 2.8945492063416768e-05, + "loss": 0.2191, + "step": 4480 + }, + { + "epoch": 0.45616173930712184, + "grad_norm": 2.734375, + "learning_rate": 2.8865870385644823e-05, + "loss": 0.1651, + "step": 4490 + }, + { + "epoch": 0.45717768972874123, + "grad_norm": 4.4375, + "learning_rate": 2.8786208513870583e-05, + "loss": 0.1907, + "step": 4500 + } + ], + "logging_steps": 10, + "max_steps": 9843, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoints/checkpoint-4500/training_args.bin b/checkpoints/checkpoint-4500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bb0a4ec10ab29ca1942aecdfa4212d352e373f1 --- /dev/null +++ b/checkpoints/checkpoint-4500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 +size 4984 diff --git a/checkpoints/checkpoint-500/.DS_Store b/checkpoints/checkpoint-500/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 Binary files /dev/null and b/checkpoints/checkpoint-500/.DS_Store differ diff --git a/checkpoints/checkpoint-500/adapter_config.json b/checkpoints/checkpoint-500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70 --- /dev/null +++ b/checkpoints/checkpoint-500/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "gaussian", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoints/checkpoint-500/adapter_model.safetensors b/checkpoints/checkpoint-500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e9303fa912a4ef3e12e8166a7375958634ce8f36 --- /dev/null +++ b/checkpoints/checkpoint-500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe875d0568aa37f2927a8a3eb8f6951cb2a06db4bb4acf3f08994191b8fcd074 +size 5919456 diff --git a/checkpoints/checkpoint-500/optimizer.pt b/checkpoints/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..df3daed252c5700e51cc06cd0106fb5befa94f45 --- /dev/null +++ b/checkpoints/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcefad058bb98bc378f909e2924345eaa383cbd31ac20a7c96a0144fbdad481b +size 11930938 diff --git a/checkpoints/checkpoint-500/rng_state_0.pth b/checkpoints/checkpoint-500/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5e831911e803d34db0dee43e81dbe00dd8ce41a1 --- /dev/null +++ b/checkpoints/checkpoint-500/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:809bd09d083f3cc7af22a0d5fa482e3b4a80ee095c2d25606491adff7d437298 +size 15024 diff --git a/checkpoints/checkpoint-500/rng_state_1.pth b/checkpoints/checkpoint-500/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..5fc931d40d8b42b24ee9524277102c98e47a7f1c --- /dev/null +++ b/checkpoints/checkpoint-500/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f59c5bba33ae6a197dd330c2dad3e367df715420869ffee03f041f6bf374bd47 +size 15024 diff --git a/checkpoints/checkpoint-500/rng_state_2.pth b/checkpoints/checkpoint-500/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..db1a2781c331b27ce633c771e2eb5ef32c951b3b --- /dev/null +++ b/checkpoints/checkpoint-500/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55e1bb7909b9abc9e4e3f3acf48720cbfc308fbfb27de6a9043a72cfc99abc32 +size 15024 diff --git a/checkpoints/checkpoint-500/rng_state_3.pth b/checkpoints/checkpoint-500/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..58211359f16a7adad0b07c2acd96237202353fe7 --- /dev/null +++ b/checkpoints/checkpoint-500/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ade95ac7fe747f684693248dcde4845f243707debd6487625345278fdd7ad55 +size 15024 diff --git a/checkpoints/checkpoint-500/scheduler.pt b/checkpoints/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f50d6e23087c0212f07d2b373bd3914a857dbcf4 --- /dev/null +++ b/checkpoints/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9494fbf9d2b6d20c7d21fb5ce991f87947f9899a55c5171fe969b0e4974d2103 +size 1064 diff --git a/checkpoints/checkpoint-500/trainer_state.json b/checkpoints/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b0d2b5be8b9988aeafaf4997b3b16453e990e766 --- /dev/null +++ b/checkpoints/checkpoint-500/trainer_state.json @@ -0,0 +1,371 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.05079752108097125, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001015950421619425, + "grad_norm": 17.625, + "learning_rate": 5e-06, + "loss": 3.4264, + "step": 10 + }, + { + "epoch": 0.00203190084323885, + "grad_norm": 12.5625, + "learning_rate": 1e-05, + "loss": 3.432, + "step": 20 + }, + { + "epoch": 0.003047851264858275, + "grad_norm": 14.0625, + "learning_rate": 1.5e-05, + "loss": 3.23, + "step": 30 + }, + { + "epoch": 0.0040638016864777, + "grad_norm": 12.4375, + "learning_rate": 2e-05, + "loss": 2.9762, + "step": 40 + }, + { + "epoch": 0.005079752108097125, + "grad_norm": 10.0625, + "learning_rate": 2.5e-05, + "loss": 2.6173, + "step": 50 + }, + { + "epoch": 0.00609570252971655, + "grad_norm": 10.1875, + "learning_rate": 3e-05, + "loss": 2.2004, + "step": 60 + }, + { + "epoch": 0.007111652951335975, + "grad_norm": 7.03125, + "learning_rate": 3.5e-05, + "loss": 1.4176, + "step": 70 + }, + { + "epoch": 0.0081276033729554, + "grad_norm": 4.375, + "learning_rate": 4e-05, + "loss": 1.0122, + "step": 80 + }, + { + "epoch": 0.009143553794574825, + "grad_norm": 6.5625, + "learning_rate": 4.5e-05, + "loss": 0.9116, + "step": 90 + }, + { + "epoch": 0.01015950421619425, + "grad_norm": 5.28125, + "learning_rate": 5e-05, + "loss": 0.6832, + "step": 100 + }, + { + "epoch": 0.011175454637813675, + "grad_norm": 5.5, + "learning_rate": 4.9999870035728426e-05, + "loss": 0.7355, + "step": 110 + }, + { + "epoch": 0.0121914050594331, + "grad_norm": 5.1875, + "learning_rate": 4.9999480144264944e-05, + "loss": 0.6673, + "step": 120 + }, + { + "epoch": 0.013207355481052525, + "grad_norm": 4.5, + "learning_rate": 4.9998830329663314e-05, + "loss": 0.6792, + "step": 130 + }, + { + "epoch": 0.01422330590267195, + "grad_norm": 3.9375, + "learning_rate": 4.9997920598679756e-05, + "loss": 0.6207, + "step": 140 + }, + { + "epoch": 0.015239256324291375, + "grad_norm": 3.15625, + "learning_rate": 4.999675096077286e-05, + "loss": 0.483, + "step": 150 + }, + { + "epoch": 0.0162552067459108, + "grad_norm": 5.28125, + "learning_rate": 4.999532142810354e-05, + "loss": 0.5319, + "step": 160 + }, + { + "epoch": 0.017271157167530225, + "grad_norm": 4.59375, + "learning_rate": 4.999363201553483e-05, + "loss": 0.6052, + "step": 170 + }, + { + "epoch": 0.01828710758914965, + "grad_norm": 5.03125, + "learning_rate": 4.9991682740631794e-05, + "loss": 0.4258, + "step": 180 + }, + { + "epoch": 0.019303058010769075, + "grad_norm": 3.859375, + "learning_rate": 4.998947362366133e-05, + "loss": 0.4309, + "step": 190 + }, + { + "epoch": 0.0203190084323885, + "grad_norm": 3.328125, + "learning_rate": 4.998700468759193e-05, + "loss": 0.3957, + "step": 200 + }, + { + "epoch": 0.021334958854007924, + "grad_norm": 4.9375, + "learning_rate": 4.9984275958093475e-05, + "loss": 0.4777, + "step": 210 + }, + { + "epoch": 0.02235090927562735, + "grad_norm": 4.78125, + "learning_rate": 4.998128746353695e-05, + "loss": 0.3549, + "step": 220 + }, + { + "epoch": 0.023366859697246774, + "grad_norm": 4.0625, + "learning_rate": 4.997803923499417e-05, + "loss": 0.4447, + "step": 230 + }, + { + "epoch": 0.0243828101188662, + "grad_norm": 6.375, + "learning_rate": 4.99745313062374e-05, + "loss": 0.3808, + "step": 240 + }, + { + "epoch": 0.025398760540485624, + "grad_norm": 3.59375, + "learning_rate": 4.99707637137391e-05, + "loss": 0.3827, + "step": 250 + }, + { + "epoch": 0.02641471096210505, + "grad_norm": 3.015625, + "learning_rate": 4.996673649667145e-05, + "loss": 0.3694, + "step": 260 + }, + { + "epoch": 0.027430661383724474, + "grad_norm": 2.296875, + "learning_rate": 4.9962449696906e-05, + "loss": 0.3586, + "step": 270 + }, + { + "epoch": 0.0284466118053439, + "grad_norm": 4.125, + "learning_rate": 4.9957903359013214e-05, + "loss": 0.3832, + "step": 280 + }, + { + "epoch": 0.029462562226963324, + "grad_norm": 3.296875, + "learning_rate": 4.995309753026201e-05, + "loss": 0.328, + "step": 290 + }, + { + "epoch": 0.03047851264858275, + "grad_norm": 4.5, + "learning_rate": 4.994803226061927e-05, + "loss": 0.3667, + "step": 300 + }, + { + "epoch": 0.03149446307020217, + "grad_norm": 4.3125, + "learning_rate": 4.994270760274933e-05, + "loss": 0.3811, + "step": 310 + }, + { + "epoch": 0.0325104134918216, + "grad_norm": 3.421875, + "learning_rate": 4.99371236120134e-05, + "loss": 0.3065, + "step": 320 + }, + { + "epoch": 0.03352636391344102, + "grad_norm": 4.6875, + "learning_rate": 4.993128034646902e-05, + "loss": 0.4177, + "step": 330 + }, + { + "epoch": 0.03454231433506045, + "grad_norm": 3.046875, + "learning_rate": 4.992517786686947e-05, + "loss": 0.33, + "step": 340 + }, + { + "epoch": 0.03555826475667987, + "grad_norm": 1.8828125, + "learning_rate": 4.9918816236663077e-05, + "loss": 0.3287, + "step": 350 + }, + { + "epoch": 0.0365742151782993, + "grad_norm": 3.8125, + "learning_rate": 4.991219552199262e-05, + "loss": 0.2934, + "step": 360 + }, + { + "epoch": 0.03759016559991872, + "grad_norm": 4.28125, + "learning_rate": 4.99053157916946e-05, + "loss": 0.3176, + "step": 370 + }, + { + "epoch": 0.03860611602153815, + "grad_norm": 2.609375, + "learning_rate": 4.989817711729856e-05, + "loss": 0.3318, + "step": 380 + }, + { + "epoch": 0.03962206644315757, + "grad_norm": 2.375, + "learning_rate": 4.98907795730263e-05, + "loss": 0.3234, + "step": 390 + }, + { + "epoch": 0.040638016864777, + "grad_norm": 4.46875, + "learning_rate": 4.988312323579114e-05, + "loss": 0.267, + "step": 400 + }, + { + "epoch": 0.04165396728639642, + "grad_norm": 3.75, + "learning_rate": 4.98752081851971e-05, + "loss": 0.3081, + "step": 410 + }, + { + "epoch": 0.04266991770801585, + "grad_norm": 2.203125, + "learning_rate": 4.986703450353809e-05, + "loss": 0.2917, + "step": 420 + }, + { + "epoch": 0.04368586812963527, + "grad_norm": 1.6015625, + "learning_rate": 4.985860227579703e-05, + "loss": 0.2805, + "step": 430 + }, + { + "epoch": 0.0447018185512547, + "grad_norm": 3.140625, + "learning_rate": 4.984991158964499e-05, + "loss": 0.3534, + "step": 440 + }, + { + "epoch": 0.04571776897287412, + "grad_norm": 3.296875, + "learning_rate": 4.9840962535440265e-05, + "loss": 0.335, + "step": 450 + }, + { + "epoch": 0.04673371939449355, + "grad_norm": 3.25, + "learning_rate": 4.983175520622744e-05, + "loss": 0.2544, + "step": 460 + }, + { + "epoch": 0.04774966981611297, + "grad_norm": 2.25, + "learning_rate": 4.982228969773642e-05, + "loss": 0.3449, + "step": 470 + }, + { + "epoch": 0.0487656202377324, + "grad_norm": 4.9375, + "learning_rate": 4.9812566108381435e-05, + "loss": 0.2964, + "step": 480 + }, + { + "epoch": 0.04978157065935182, + "grad_norm": 1.5703125, + "learning_rate": 4.9802584539260035e-05, + "loss": 0.2799, + "step": 490 + }, + { + "epoch": 0.05079752108097125, + "grad_norm": 2.828125, + "learning_rate": 4.979234509415199e-05, + "loss": 0.3231, + "step": 500 + } + ], + "logging_steps": 10, + "max_steps": 9843, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoints/checkpoint-500/training_args.bin b/checkpoints/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bb0a4ec10ab29ca1942aecdfa4212d352e373f1 --- /dev/null +++ b/checkpoints/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 +size 4984 diff --git a/checkpoints/checkpoint-5000/adapter_config.json b/checkpoints/checkpoint-5000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70 --- /dev/null +++ b/checkpoints/checkpoint-5000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "gaussian", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoints/checkpoint-5000/adapter_model.safetensors b/checkpoints/checkpoint-5000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c5323954094d8a6496c38b30ffa0df0ac0cfffe8 --- /dev/null +++ b/checkpoints/checkpoint-5000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ae5f313d7b029213719465291718ddf1e014b263490e78fc2a5169bf0b5252a +size 5919456 diff --git a/checkpoints/checkpoint-5000/optimizer.pt b/checkpoints/checkpoint-5000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..a79946ce5dd1275af1622625dd5f1f7e9f277b2d --- /dev/null +++ b/checkpoints/checkpoint-5000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12db3a411b1a777bb2de716e77eb87fa0eb100990039bc8de878ad51ab65b732 +size 11930938 diff --git a/checkpoints/checkpoint-5000/rng_state_0.pth b/checkpoints/checkpoint-5000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..3af645b6e41cbac80e3d515e0208c0dfd154af79 --- /dev/null +++ b/checkpoints/checkpoint-5000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67bfaa3beff54b75efd8c9a44ed5971676113e2db1f831aa1c6af1ce6a9caa0b +size 15024 diff --git a/checkpoints/checkpoint-5000/rng_state_1.pth b/checkpoints/checkpoint-5000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..597c43e1460e8c299d649484b08ee0173163f16f --- /dev/null +++ b/checkpoints/checkpoint-5000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41ed0b09eacb9d8a8c6e72cf8f95eee4f179b7f40a1ead1975ec33f39a10b112 +size 15024 diff --git a/checkpoints/checkpoint-5000/rng_state_2.pth b/checkpoints/checkpoint-5000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..c826825016b292604ccf1c156229766246fb116e --- /dev/null +++ b/checkpoints/checkpoint-5000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8d837f21251c93900eb66777d51bff1fbc27c649b7852fa3ff8933fb4214304 +size 15024 diff --git a/checkpoints/checkpoint-5000/rng_state_3.pth b/checkpoints/checkpoint-5000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..c344167e5746676b6dde2a30375e5d12f6600233 --- /dev/null +++ b/checkpoints/checkpoint-5000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efef25c82d2ac74d69a7a158e6fb9e4ce251ed5ebd2235b890bf805ca5dc093e +size 15024 diff --git a/checkpoints/checkpoint-5000/scheduler.pt b/checkpoints/checkpoint-5000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c0604ddc2c73d6b2fa706115b186b8876b4db0f1 --- /dev/null +++ b/checkpoints/checkpoint-5000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee42deb5c1610056c95ecf4b744f4223a6749192630e5e7100203f9bf0540ef4 +size 1064 diff --git a/checkpoints/checkpoint-5000/trainer_state.json b/checkpoints/checkpoint-5000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..360c1e39d3d8b26d00d06e5b1ec148375669ff0d --- /dev/null +++ b/checkpoints/checkpoint-5000/trainer_state.json @@ -0,0 +1,3521 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5079752108097125, + "eval_steps": 500, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001015950421619425, + "grad_norm": 17.625, + "learning_rate": 5e-06, + "loss": 3.4264, + "step": 10 + }, + { + "epoch": 0.00203190084323885, + "grad_norm": 12.5625, + "learning_rate": 1e-05, + "loss": 3.432, + "step": 20 + }, + { + "epoch": 0.003047851264858275, + "grad_norm": 14.0625, + "learning_rate": 1.5e-05, + "loss": 3.23, + "step": 30 + }, + { + "epoch": 0.0040638016864777, + "grad_norm": 12.4375, + "learning_rate": 2e-05, + "loss": 2.9762, + "step": 40 + }, + { + "epoch": 0.005079752108097125, + "grad_norm": 10.0625, + "learning_rate": 2.5e-05, + "loss": 2.6173, + "step": 50 + }, + { + "epoch": 0.00609570252971655, + "grad_norm": 10.1875, + "learning_rate": 3e-05, + "loss": 2.2004, + "step": 60 + }, + { + "epoch": 0.007111652951335975, + "grad_norm": 7.03125, + "learning_rate": 3.5e-05, + "loss": 1.4176, + "step": 70 + }, + { + "epoch": 0.0081276033729554, + "grad_norm": 4.375, + "learning_rate": 4e-05, + "loss": 1.0122, + "step": 80 + }, + { + "epoch": 0.009143553794574825, + "grad_norm": 6.5625, + "learning_rate": 4.5e-05, + "loss": 0.9116, + "step": 90 + }, + { + "epoch": 0.01015950421619425, + "grad_norm": 5.28125, + "learning_rate": 5e-05, + "loss": 0.6832, + "step": 100 + }, + { + "epoch": 0.011175454637813675, + "grad_norm": 5.5, + "learning_rate": 4.9999870035728426e-05, + "loss": 0.7355, + "step": 110 + }, + { + "epoch": 0.0121914050594331, + "grad_norm": 5.1875, + "learning_rate": 4.9999480144264944e-05, + "loss": 0.6673, + "step": 120 + }, + { + "epoch": 0.013207355481052525, + "grad_norm": 4.5, + "learning_rate": 4.9998830329663314e-05, + "loss": 0.6792, + "step": 130 + }, + { + "epoch": 0.01422330590267195, + "grad_norm": 3.9375, + "learning_rate": 4.9997920598679756e-05, + "loss": 0.6207, + "step": 140 + }, + { + "epoch": 0.015239256324291375, + "grad_norm": 3.15625, + "learning_rate": 4.999675096077286e-05, + "loss": 0.483, + "step": 150 + }, + { + "epoch": 0.0162552067459108, + "grad_norm": 5.28125, + "learning_rate": 4.999532142810354e-05, + "loss": 0.5319, + "step": 160 + }, + { + "epoch": 0.017271157167530225, + "grad_norm": 4.59375, + "learning_rate": 4.999363201553483e-05, + "loss": 0.6052, + "step": 170 + }, + { + "epoch": 0.01828710758914965, + "grad_norm": 5.03125, + "learning_rate": 4.9991682740631794e-05, + "loss": 0.4258, + "step": 180 + }, + { + "epoch": 0.019303058010769075, + "grad_norm": 3.859375, + "learning_rate": 4.998947362366133e-05, + "loss": 0.4309, + "step": 190 + }, + { + "epoch": 0.0203190084323885, + "grad_norm": 3.328125, + "learning_rate": 4.998700468759193e-05, + "loss": 0.3957, + "step": 200 + }, + { + "epoch": 0.021334958854007924, + "grad_norm": 4.9375, + "learning_rate": 4.9984275958093475e-05, + "loss": 0.4777, + "step": 210 + }, + { + "epoch": 0.02235090927562735, + "grad_norm": 4.78125, + "learning_rate": 4.998128746353695e-05, + "loss": 0.3549, + "step": 220 + }, + { + "epoch": 0.023366859697246774, + "grad_norm": 4.0625, + "learning_rate": 4.997803923499417e-05, + "loss": 0.4447, + "step": 230 + }, + { + "epoch": 0.0243828101188662, + "grad_norm": 6.375, + "learning_rate": 4.99745313062374e-05, + "loss": 0.3808, + "step": 240 + }, + { + "epoch": 0.025398760540485624, + "grad_norm": 3.59375, + "learning_rate": 4.99707637137391e-05, + "loss": 0.3827, + "step": 250 + }, + { + "epoch": 0.02641471096210505, + "grad_norm": 3.015625, + "learning_rate": 4.996673649667145e-05, + "loss": 0.3694, + "step": 260 + }, + { + "epoch": 0.027430661383724474, + "grad_norm": 2.296875, + "learning_rate": 4.9962449696906e-05, + "loss": 0.3586, + "step": 270 + }, + { + "epoch": 0.0284466118053439, + "grad_norm": 4.125, + "learning_rate": 4.9957903359013214e-05, + "loss": 0.3832, + "step": 280 + }, + { + "epoch": 0.029462562226963324, + "grad_norm": 3.296875, + "learning_rate": 4.995309753026201e-05, + "loss": 0.328, + "step": 290 + }, + { + "epoch": 0.03047851264858275, + "grad_norm": 4.5, + "learning_rate": 4.994803226061927e-05, + "loss": 0.3667, + "step": 300 + }, + { + "epoch": 0.03149446307020217, + "grad_norm": 4.3125, + "learning_rate": 4.994270760274933e-05, + "loss": 0.3811, + "step": 310 + }, + { + "epoch": 0.0325104134918216, + "grad_norm": 3.421875, + "learning_rate": 4.99371236120134e-05, + "loss": 0.3065, + "step": 320 + }, + { + "epoch": 0.03352636391344102, + "grad_norm": 4.6875, + "learning_rate": 4.993128034646902e-05, + "loss": 0.4177, + "step": 330 + }, + { + "epoch": 0.03454231433506045, + "grad_norm": 3.046875, + "learning_rate": 4.992517786686947e-05, + "loss": 0.33, + "step": 340 + }, + { + "epoch": 0.03555826475667987, + "grad_norm": 1.8828125, + "learning_rate": 4.9918816236663077e-05, + "loss": 0.3287, + "step": 350 + }, + { + "epoch": 0.0365742151782993, + "grad_norm": 3.8125, + "learning_rate": 4.991219552199262e-05, + "loss": 0.2934, + "step": 360 + }, + { + "epoch": 0.03759016559991872, + "grad_norm": 4.28125, + "learning_rate": 4.99053157916946e-05, + "loss": 0.3176, + "step": 370 + }, + { + "epoch": 0.03860611602153815, + "grad_norm": 2.609375, + "learning_rate": 4.989817711729856e-05, + "loss": 0.3318, + "step": 380 + }, + { + "epoch": 0.03962206644315757, + "grad_norm": 2.375, + "learning_rate": 4.98907795730263e-05, + "loss": 0.3234, + "step": 390 + }, + { + "epoch": 0.040638016864777, + "grad_norm": 4.46875, + "learning_rate": 4.988312323579114e-05, + "loss": 0.267, + "step": 400 + }, + { + "epoch": 0.04165396728639642, + "grad_norm": 3.75, + "learning_rate": 4.98752081851971e-05, + "loss": 0.3081, + "step": 410 + }, + { + "epoch": 0.04266991770801585, + "grad_norm": 2.203125, + "learning_rate": 4.986703450353809e-05, + "loss": 0.2917, + "step": 420 + }, + { + "epoch": 0.04368586812963527, + "grad_norm": 1.6015625, + "learning_rate": 4.985860227579703e-05, + "loss": 0.2805, + "step": 430 + }, + { + "epoch": 0.0447018185512547, + "grad_norm": 3.140625, + "learning_rate": 4.984991158964499e-05, + "loss": 0.3534, + "step": 440 + }, + { + "epoch": 0.04571776897287412, + "grad_norm": 3.296875, + "learning_rate": 4.9840962535440265e-05, + "loss": 0.335, + "step": 450 + }, + { + "epoch": 0.04673371939449355, + "grad_norm": 3.25, + "learning_rate": 4.983175520622744e-05, + "loss": 0.2544, + "step": 460 + }, + { + "epoch": 0.04774966981611297, + "grad_norm": 2.25, + "learning_rate": 4.982228969773642e-05, + "loss": 0.3449, + "step": 470 + }, + { + "epoch": 0.0487656202377324, + "grad_norm": 4.9375, + "learning_rate": 4.9812566108381435e-05, + "loss": 0.2964, + "step": 480 + }, + { + "epoch": 0.04978157065935182, + "grad_norm": 1.5703125, + "learning_rate": 4.9802584539260035e-05, + "loss": 0.2799, + "step": 490 + }, + { + "epoch": 0.05079752108097125, + "grad_norm": 2.828125, + "learning_rate": 4.979234509415199e-05, + "loss": 0.3231, + "step": 500 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 2.9375, + "learning_rate": 4.978184787951828e-05, + "loss": 0.2943, + "step": 510 + }, + { + "epoch": 0.0528294219242101, + "grad_norm": 2.34375, + "learning_rate": 4.977109300449992e-05, + "loss": 0.2705, + "step": 520 + }, + { + "epoch": 0.05384537234582952, + "grad_norm": 3.140625, + "learning_rate": 4.9760080580916876e-05, + "loss": 0.2998, + "step": 530 + }, + { + "epoch": 0.05486132276744895, + "grad_norm": 3.5625, + "learning_rate": 4.974881072326688e-05, + "loss": 0.2595, + "step": 540 + }, + { + "epoch": 0.05587727318906837, + "grad_norm": 4.25, + "learning_rate": 4.9737283548724236e-05, + "loss": 0.2803, + "step": 550 + }, + { + "epoch": 0.0568932236106878, + "grad_norm": 4.0625, + "learning_rate": 4.97254991771386e-05, + "loss": 0.3511, + "step": 560 + }, + { + "epoch": 0.05790917403230722, + "grad_norm": 2.515625, + "learning_rate": 4.971345773103377e-05, + "loss": 0.312, + "step": 570 + }, + { + "epoch": 0.05892512445392665, + "grad_norm": 3.21875, + "learning_rate": 4.9701159335606365e-05, + "loss": 0.2482, + "step": 580 + }, + { + "epoch": 0.05994107487554607, + "grad_norm": 5.5, + "learning_rate": 4.968860411872454e-05, + "loss": 0.2537, + "step": 590 + }, + { + "epoch": 0.0609570252971655, + "grad_norm": 3.546875, + "learning_rate": 4.967579221092666e-05, + "loss": 0.3125, + "step": 600 + }, + { + "epoch": 0.06197297571878492, + "grad_norm": 2.984375, + "learning_rate": 4.966272374541996e-05, + "loss": 0.2354, + "step": 610 + }, + { + "epoch": 0.06298892614040434, + "grad_norm": 3.6875, + "learning_rate": 4.964939885807912e-05, + "loss": 0.3213, + "step": 620 + }, + { + "epoch": 0.06400487656202378, + "grad_norm": 2.140625, + "learning_rate": 4.9635817687444876e-05, + "loss": 0.3003, + "step": 630 + }, + { + "epoch": 0.0650208269836432, + "grad_norm": 3.484375, + "learning_rate": 4.962198037472259e-05, + "loss": 0.2996, + "step": 640 + }, + { + "epoch": 0.06603677740526262, + "grad_norm": 3.21875, + "learning_rate": 4.9607887063780776e-05, + "loss": 0.2257, + "step": 650 + }, + { + "epoch": 0.06705272782688204, + "grad_norm": 5.375, + "learning_rate": 4.9593537901149564e-05, + "loss": 0.223, + "step": 660 + }, + { + "epoch": 0.06806867824850148, + "grad_norm": 4.1875, + "learning_rate": 4.957893303601924e-05, + "loss": 0.3407, + "step": 670 + }, + { + "epoch": 0.0690846286701209, + "grad_norm": 3.328125, + "learning_rate": 4.956407262023866e-05, + "loss": 0.2589, + "step": 680 + }, + { + "epoch": 0.07010057909174032, + "grad_norm": 2.953125, + "learning_rate": 4.954895680831367e-05, + "loss": 0.2949, + "step": 690 + }, + { + "epoch": 0.07111652951335974, + "grad_norm": 4.0625, + "learning_rate": 4.9533585757405506e-05, + "loss": 0.2995, + "step": 700 + }, + { + "epoch": 0.07213247993497918, + "grad_norm": 4.625, + "learning_rate": 4.951795962732917e-05, + "loss": 0.2894, + "step": 710 + }, + { + "epoch": 0.0731484303565986, + "grad_norm": 3.0, + "learning_rate": 4.9502078580551755e-05, + "loss": 0.3082, + "step": 720 + }, + { + "epoch": 0.07416438077821802, + "grad_norm": 3.65625, + "learning_rate": 4.9485942782190734e-05, + "loss": 0.2308, + "step": 730 + }, + { + "epoch": 0.07518033119983744, + "grad_norm": 4.78125, + "learning_rate": 4.9469552400012306e-05, + "loss": 0.2272, + "step": 740 + }, + { + "epoch": 0.07619628162145688, + "grad_norm": 4.25, + "learning_rate": 4.94529076044296e-05, + "loss": 0.2701, + "step": 750 + }, + { + "epoch": 0.0772122320430763, + "grad_norm": 3.140625, + "learning_rate": 4.94360085685009e-05, + "loss": 0.2686, + "step": 760 + }, + { + "epoch": 0.07822818246469572, + "grad_norm": 0.765625, + "learning_rate": 4.9418855467927894e-05, + "loss": 0.2051, + "step": 770 + }, + { + "epoch": 0.07924413288631514, + "grad_norm": 1.796875, + "learning_rate": 4.940144848105379e-05, + "loss": 0.2267, + "step": 780 + }, + { + "epoch": 0.08026008330793458, + "grad_norm": 4.5625, + "learning_rate": 4.93837877888615e-05, + "loss": 0.2597, + "step": 790 + }, + { + "epoch": 0.081276033729554, + "grad_norm": 3.03125, + "learning_rate": 4.9365873574971745e-05, + "loss": 0.3701, + "step": 800 + }, + { + "epoch": 0.08229198415117342, + "grad_norm": 4.5625, + "learning_rate": 4.9347706025641136e-05, + "loss": 0.2559, + "step": 810 + }, + { + "epoch": 0.08330793457279284, + "grad_norm": 3.90625, + "learning_rate": 4.9329285329760275e-05, + "loss": 0.2799, + "step": 820 + }, + { + "epoch": 0.08432388499441228, + "grad_norm": 3.140625, + "learning_rate": 4.9310611678851735e-05, + "loss": 0.2866, + "step": 830 + }, + { + "epoch": 0.0853398354160317, + "grad_norm": 2.46875, + "learning_rate": 4.929168526706811e-05, + "loss": 0.3105, + "step": 840 + }, + { + "epoch": 0.08635578583765112, + "grad_norm": 13.625, + "learning_rate": 4.927250629119e-05, + "loss": 0.2454, + "step": 850 + }, + { + "epoch": 0.08737173625927054, + "grad_norm": 3.921875, + "learning_rate": 4.9253074950623925e-05, + "loss": 0.2424, + "step": 860 + }, + { + "epoch": 0.08838768668088998, + "grad_norm": 2.90625, + "learning_rate": 4.9233391447400286e-05, + "loss": 0.2481, + "step": 870 + }, + { + "epoch": 0.0894036371025094, + "grad_norm": 2.96875, + "learning_rate": 4.921345598617125e-05, + "loss": 0.2231, + "step": 880 + }, + { + "epoch": 0.09041958752412882, + "grad_norm": 5.375, + "learning_rate": 4.9193268774208654e-05, + "loss": 0.3447, + "step": 890 + }, + { + "epoch": 0.09143553794574824, + "grad_norm": 2.0, + "learning_rate": 4.9172830021401785e-05, + "loss": 0.229, + "step": 900 + }, + { + "epoch": 0.09245148836736768, + "grad_norm": 3.1875, + "learning_rate": 4.9152139940255245e-05, + "loss": 0.2122, + "step": 910 + }, + { + "epoch": 0.0934674387889871, + "grad_norm": 3.40625, + "learning_rate": 4.913119874588677e-05, + "loss": 0.2386, + "step": 920 + }, + { + "epoch": 0.09448338921060652, + "grad_norm": 1.4609375, + "learning_rate": 4.911000665602489e-05, + "loss": 0.1944, + "step": 930 + }, + { + "epoch": 0.09549933963222594, + "grad_norm": 5.0625, + "learning_rate": 4.9088563891006786e-05, + "loss": 0.2038, + "step": 940 + }, + { + "epoch": 0.09651529005384538, + "grad_norm": 4.53125, + "learning_rate": 4.906687067377592e-05, + "loss": 0.3122, + "step": 950 + }, + { + "epoch": 0.0975312404754648, + "grad_norm": 2.84375, + "learning_rate": 4.904492722987976e-05, + "loss": 0.3157, + "step": 960 + }, + { + "epoch": 0.09854719089708422, + "grad_norm": 2.171875, + "learning_rate": 4.902273378746738e-05, + "loss": 0.3077, + "step": 970 + }, + { + "epoch": 0.09956314131870364, + "grad_norm": 2.84375, + "learning_rate": 4.9000290577287165e-05, + "loss": 0.2756, + "step": 980 + }, + { + "epoch": 0.10057909174032308, + "grad_norm": 0.99609375, + "learning_rate": 4.897759783268434e-05, + "loss": 0.2915, + "step": 990 + }, + { + "epoch": 0.1015950421619425, + "grad_norm": 3.53125, + "learning_rate": 4.895465578959859e-05, + "loss": 0.2052, + "step": 1000 + }, + { + "epoch": 0.10261099258356192, + "grad_norm": 4.0, + "learning_rate": 4.893146468656159e-05, + "loss": 0.2499, + "step": 1010 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 1.65625, + "learning_rate": 4.890802476469452e-05, + "loss": 0.278, + "step": 1020 + }, + { + "epoch": 0.10464289342680078, + "grad_norm": 3.625, + "learning_rate": 4.888433626770558e-05, + "loss": 0.2143, + "step": 1030 + }, + { + "epoch": 0.1056588438484202, + "grad_norm": 5.0625, + "learning_rate": 4.886039944188741e-05, + "loss": 0.2878, + "step": 1040 + }, + { + "epoch": 0.10667479427003962, + "grad_norm": 4.5, + "learning_rate": 4.883621453611461e-05, + "loss": 0.2744, + "step": 1050 + }, + { + "epoch": 0.10769074469165904, + "grad_norm": 4.5625, + "learning_rate": 4.881178180184106e-05, + "loss": 0.2734, + "step": 1060 + }, + { + "epoch": 0.10870669511327848, + "grad_norm": 3.125, + "learning_rate": 4.878710149309735e-05, + "loss": 0.3574, + "step": 1070 + }, + { + "epoch": 0.1097226455348979, + "grad_norm": 3.0625, + "learning_rate": 4.876217386648816e-05, + "loss": 0.2625, + "step": 1080 + }, + { + "epoch": 0.11073859595651732, + "grad_norm": 4.0625, + "learning_rate": 4.873699918118955e-05, + "loss": 0.2437, + "step": 1090 + }, + { + "epoch": 0.11175454637813674, + "grad_norm": 1.59375, + "learning_rate": 4.87115776989463e-05, + "loss": 0.2051, + "step": 1100 + }, + { + "epoch": 0.11277049679975618, + "grad_norm": 4.375, + "learning_rate": 4.8685909684069153e-05, + "loss": 0.1727, + "step": 1110 + }, + { + "epoch": 0.1137864472213756, + "grad_norm": 2.28125, + "learning_rate": 4.865999540343211e-05, + "loss": 0.2256, + "step": 1120 + }, + { + "epoch": 0.11480239764299502, + "grad_norm": 2.265625, + "learning_rate": 4.86338351264696e-05, + "loss": 0.3529, + "step": 1130 + }, + { + "epoch": 0.11581834806461444, + "grad_norm": 2.34375, + "learning_rate": 4.8607429125173754e-05, + "loss": 0.2113, + "step": 1140 + }, + { + "epoch": 0.11683429848623388, + "grad_norm": 0.7578125, + "learning_rate": 4.858077767409149e-05, + "loss": 0.2759, + "step": 1150 + }, + { + "epoch": 0.1178502489078533, + "grad_norm": 3.640625, + "learning_rate": 4.855388105032174e-05, + "loss": 0.2482, + "step": 1160 + }, + { + "epoch": 0.11886619932947272, + "grad_norm": 3.5, + "learning_rate": 4.852673953351249e-05, + "loss": 0.1865, + "step": 1170 + }, + { + "epoch": 0.11988214975109214, + "grad_norm": 3.75, + "learning_rate": 4.849935340585796e-05, + "loss": 0.2659, + "step": 1180 + }, + { + "epoch": 0.12089810017271158, + "grad_norm": 3.375, + "learning_rate": 4.8471722952095586e-05, + "loss": 0.1506, + "step": 1190 + }, + { + "epoch": 0.121914050594331, + "grad_norm": 3.34375, + "learning_rate": 4.844384845950312e-05, + "loss": 0.307, + "step": 1200 + }, + { + "epoch": 0.12293000101595042, + "grad_norm": 1.578125, + "learning_rate": 4.841573021789561e-05, + "loss": 0.1952, + "step": 1210 + }, + { + "epoch": 0.12394595143756984, + "grad_norm": 1.2890625, + "learning_rate": 4.838736851962239e-05, + "loss": 0.1779, + "step": 1220 + }, + { + "epoch": 0.12496190185918928, + "grad_norm": 1.265625, + "learning_rate": 4.835876365956408e-05, + "loss": 0.1235, + "step": 1230 + }, + { + "epoch": 0.12597785228080868, + "grad_norm": 1.9609375, + "learning_rate": 4.8329915935129436e-05, + "loss": 0.1876, + "step": 1240 + }, + { + "epoch": 0.12699380270242813, + "grad_norm": 1.6328125, + "learning_rate": 4.830082564625235e-05, + "loss": 0.2188, + "step": 1250 + }, + { + "epoch": 0.12800975312404755, + "grad_norm": 3.96875, + "learning_rate": 4.8271493095388684e-05, + "loss": 0.2622, + "step": 1260 + }, + { + "epoch": 0.12902570354566698, + "grad_norm": 3.765625, + "learning_rate": 4.824191858751312e-05, + "loss": 0.2724, + "step": 1270 + }, + { + "epoch": 0.1300416539672864, + "grad_norm": 5.59375, + "learning_rate": 4.821210243011601e-05, + "loss": 0.2413, + "step": 1280 + }, + { + "epoch": 0.13105760438890582, + "grad_norm": 3.34375, + "learning_rate": 4.818204493320016e-05, + "loss": 0.2618, + "step": 1290 + }, + { + "epoch": 0.13207355481052524, + "grad_norm": 2.78125, + "learning_rate": 4.8151746409277634e-05, + "loss": 0.2295, + "step": 1300 + }, + { + "epoch": 0.13308950523214466, + "grad_norm": 3.1875, + "learning_rate": 4.8121207173366484e-05, + "loss": 0.2733, + "step": 1310 + }, + { + "epoch": 0.13410545565376408, + "grad_norm": 2.28125, + "learning_rate": 4.809042754298746e-05, + "loss": 0.2311, + "step": 1320 + }, + { + "epoch": 0.13512140607538353, + "grad_norm": 2.171875, + "learning_rate": 4.805940783816075e-05, + "loss": 0.2059, + "step": 1330 + }, + { + "epoch": 0.13613735649700295, + "grad_norm": 2.796875, + "learning_rate": 4.8028148381402625e-05, + "loss": 0.2102, + "step": 1340 + }, + { + "epoch": 0.13715330691862238, + "grad_norm": 2.96875, + "learning_rate": 4.7996649497722084e-05, + "loss": 0.2708, + "step": 1350 + }, + { + "epoch": 0.1381692573402418, + "grad_norm": 2.4375, + "learning_rate": 4.7964911514617485e-05, + "loss": 0.2429, + "step": 1360 + }, + { + "epoch": 0.13918520776186122, + "grad_norm": 5.8125, + "learning_rate": 4.793293476207312e-05, + "loss": 0.2725, + "step": 1370 + }, + { + "epoch": 0.14020115818348064, + "grad_norm": 2.40625, + "learning_rate": 4.790071957255585e-05, + "loss": 0.2098, + "step": 1380 + }, + { + "epoch": 0.14121710860510006, + "grad_norm": 4.25, + "learning_rate": 4.786826628101154e-05, + "loss": 0.2101, + "step": 1390 + }, + { + "epoch": 0.14223305902671948, + "grad_norm": 2.578125, + "learning_rate": 4.783557522486167e-05, + "loss": 0.2624, + "step": 1400 + }, + { + "epoch": 0.14324900944833893, + "grad_norm": 3.125, + "learning_rate": 4.780264674399978e-05, + "loss": 0.2518, + "step": 1410 + }, + { + "epoch": 0.14426495986995835, + "grad_norm": 3.671875, + "learning_rate": 4.7769481180787966e-05, + "loss": 0.3112, + "step": 1420 + }, + { + "epoch": 0.14528091029157778, + "grad_norm": 3.984375, + "learning_rate": 4.773607888005327e-05, + "loss": 0.2747, + "step": 1430 + }, + { + "epoch": 0.1462968607131972, + "grad_norm": 3.234375, + "learning_rate": 4.770244018908416e-05, + "loss": 0.1572, + "step": 1440 + }, + { + "epoch": 0.14731281113481662, + "grad_norm": 4.09375, + "learning_rate": 4.766856545762687e-05, + "loss": 0.2148, + "step": 1450 + }, + { + "epoch": 0.14832876155643604, + "grad_norm": 1.6875, + "learning_rate": 4.763445503788178e-05, + "loss": 0.2531, + "step": 1460 + }, + { + "epoch": 0.14934471197805546, + "grad_norm": 2.375, + "learning_rate": 4.760010928449976e-05, + "loss": 0.199, + "step": 1470 + }, + { + "epoch": 0.15036066239967488, + "grad_norm": 4.6875, + "learning_rate": 4.7565528554578485e-05, + "loss": 0.2366, + "step": 1480 + }, + { + "epoch": 0.15137661282129433, + "grad_norm": 5.4375, + "learning_rate": 4.75307132076587e-05, + "loss": 0.1862, + "step": 1490 + }, + { + "epoch": 0.15239256324291375, + "grad_norm": 2.484375, + "learning_rate": 4.749566360572049e-05, + "loss": 0.2143, + "step": 1500 + }, + { + "epoch": 0.15340851366453317, + "grad_norm": 2.1875, + "learning_rate": 4.746038011317955e-05, + "loss": 0.1877, + "step": 1510 + }, + { + "epoch": 0.1544244640861526, + "grad_norm": 2.84375, + "learning_rate": 4.742486309688333e-05, + "loss": 0.2831, + "step": 1520 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 2.015625, + "learning_rate": 4.738911292610732e-05, + "loss": 0.1708, + "step": 1530 + }, + { + "epoch": 0.15645636492939144, + "grad_norm": 3.953125, + "learning_rate": 4.735312997255107e-05, + "loss": 0.192, + "step": 1540 + }, + { + "epoch": 0.15747231535101086, + "grad_norm": 2.09375, + "learning_rate": 4.7316914610334475e-05, + "loss": 0.2586, + "step": 1550 + }, + { + "epoch": 0.15848826577263028, + "grad_norm": 3.6875, + "learning_rate": 4.728046721599378e-05, + "loss": 0.2141, + "step": 1560 + }, + { + "epoch": 0.15950421619424973, + "grad_norm": 2.9375, + "learning_rate": 4.724378816847771e-05, + "loss": 0.193, + "step": 1570 + }, + { + "epoch": 0.16052016661586915, + "grad_norm": 1.5625, + "learning_rate": 4.720687784914352e-05, + "loss": 0.191, + "step": 1580 + }, + { + "epoch": 0.16153611703748857, + "grad_norm": 3.75, + "learning_rate": 4.716973664175304e-05, + "loss": 0.2172, + "step": 1590 + }, + { + "epoch": 0.162552067459108, + "grad_norm": 3.125, + "learning_rate": 4.7132364932468645e-05, + "loss": 0.2134, + "step": 1600 + }, + { + "epoch": 0.16356801788072742, + "grad_norm": 4.09375, + "learning_rate": 4.709476310984932e-05, + "loss": 0.2055, + "step": 1610 + }, + { + "epoch": 0.16458396830234684, + "grad_norm": 3.875, + "learning_rate": 4.705693156484652e-05, + "loss": 0.2136, + "step": 1620 + }, + { + "epoch": 0.16559991872396626, + "grad_norm": 1.1796875, + "learning_rate": 4.7018870690800196e-05, + "loss": 0.1471, + "step": 1630 + }, + { + "epoch": 0.16661586914558568, + "grad_norm": 2.5, + "learning_rate": 4.698058088343465e-05, + "loss": 0.2308, + "step": 1640 + }, + { + "epoch": 0.16763181956720513, + "grad_norm": 1.390625, + "learning_rate": 4.6942062540854425e-05, + "loss": 0.2456, + "step": 1650 + }, + { + "epoch": 0.16864776998882455, + "grad_norm": 3.125, + "learning_rate": 4.69033160635402e-05, + "loss": 0.2654, + "step": 1660 + }, + { + "epoch": 0.16966372041044397, + "grad_norm": 3.984375, + "learning_rate": 4.6864341854344587e-05, + "loss": 0.2226, + "step": 1670 + }, + { + "epoch": 0.1706796708320634, + "grad_norm": 2.328125, + "learning_rate": 4.682514031848795e-05, + "loss": 0.2438, + "step": 1680 + }, + { + "epoch": 0.17169562125368282, + "grad_norm": 3.078125, + "learning_rate": 4.678571186355423e-05, + "loss": 0.1889, + "step": 1690 + }, + { + "epoch": 0.17271157167530224, + "grad_norm": 3.328125, + "learning_rate": 4.6746056899486644e-05, + "loss": 0.2117, + "step": 1700 + }, + { + "epoch": 0.17372752209692166, + "grad_norm": 2.78125, + "learning_rate": 4.67061758385835e-05, + "loss": 0.1953, + "step": 1710 + }, + { + "epoch": 0.17474347251854108, + "grad_norm": 3.09375, + "learning_rate": 4.6666069095493816e-05, + "loss": 0.1844, + "step": 1720 + }, + { + "epoch": 0.17575942294016053, + "grad_norm": 3.234375, + "learning_rate": 4.662573708721309e-05, + "loss": 0.2774, + "step": 1730 + }, + { + "epoch": 0.17677537336177995, + "grad_norm": 4.03125, + "learning_rate": 4.658518023307894e-05, + "loss": 0.2527, + "step": 1740 + }, + { + "epoch": 0.17779132378339937, + "grad_norm": 3.21875, + "learning_rate": 4.654439895476671e-05, + "loss": 0.2164, + "step": 1750 + }, + { + "epoch": 0.1788072742050188, + "grad_norm": 2.390625, + "learning_rate": 4.6503393676285146e-05, + "loss": 0.2424, + "step": 1760 + }, + { + "epoch": 0.17982322462663822, + "grad_norm": 1.8359375, + "learning_rate": 4.646216482397192e-05, + "loss": 0.2428, + "step": 1770 + }, + { + "epoch": 0.18083917504825764, + "grad_norm": 2.796875, + "learning_rate": 4.6420712826489275e-05, + "loss": 0.2155, + "step": 1780 + }, + { + "epoch": 0.18185512546987706, + "grad_norm": 0.69921875, + "learning_rate": 4.6379038114819485e-05, + "loss": 0.1544, + "step": 1790 + }, + { + "epoch": 0.18287107589149648, + "grad_norm": 3.40625, + "learning_rate": 4.6337141122260444e-05, + "loss": 0.2029, + "step": 1800 + }, + { + "epoch": 0.18388702631311593, + "grad_norm": 2.359375, + "learning_rate": 4.629502228442112e-05, + "loss": 0.1489, + "step": 1810 + }, + { + "epoch": 0.18490297673473535, + "grad_norm": 1.4453125, + "learning_rate": 4.6252682039217045e-05, + "loss": 0.2101, + "step": 1820 + }, + { + "epoch": 0.18591892715635477, + "grad_norm": 2.71875, + "learning_rate": 4.621012082686573e-05, + "loss": 0.2076, + "step": 1830 + }, + { + "epoch": 0.1869348775779742, + "grad_norm": 3.0625, + "learning_rate": 4.616733908988216e-05, + "loss": 0.2719, + "step": 1840 + }, + { + "epoch": 0.18795082799959362, + "grad_norm": 1.953125, + "learning_rate": 4.612433727307409e-05, + "loss": 0.2105, + "step": 1850 + }, + { + "epoch": 0.18896677842121304, + "grad_norm": 3.46875, + "learning_rate": 4.608111582353751e-05, + "loss": 0.1877, + "step": 1860 + }, + { + "epoch": 0.18998272884283246, + "grad_norm": 2.546875, + "learning_rate": 4.603767519065197e-05, + "loss": 0.2238, + "step": 1870 + }, + { + "epoch": 0.19099867926445188, + "grad_norm": 1.5703125, + "learning_rate": 4.599401582607589e-05, + "loss": 0.243, + "step": 1880 + }, + { + "epoch": 0.19201462968607133, + "grad_norm": 2.4375, + "learning_rate": 4.595013818374185e-05, + "loss": 0.1867, + "step": 1890 + }, + { + "epoch": 0.19303058010769075, + "grad_norm": 2.203125, + "learning_rate": 4.5906042719851925e-05, + "loss": 0.1994, + "step": 1900 + }, + { + "epoch": 0.19404653052931017, + "grad_norm": 3.984375, + "learning_rate": 4.586172989287291e-05, + "loss": 0.1899, + "step": 1910 + }, + { + "epoch": 0.1950624809509296, + "grad_norm": 2.6875, + "learning_rate": 4.5817200163531534e-05, + "loss": 0.2528, + "step": 1920 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 2.71875, + "learning_rate": 4.577245399480972e-05, + "loss": 0.2336, + "step": 1930 + }, + { + "epoch": 0.19709438179416844, + "grad_norm": 2.640625, + "learning_rate": 4.5727491851939715e-05, + "loss": 0.2204, + "step": 1940 + }, + { + "epoch": 0.19811033221578786, + "grad_norm": 1.78125, + "learning_rate": 4.568231420239929e-05, + "loss": 0.1656, + "step": 1950 + }, + { + "epoch": 0.19912628263740728, + "grad_norm": 3.15625, + "learning_rate": 4.563692151590687e-05, + "loss": 0.2105, + "step": 1960 + }, + { + "epoch": 0.20014223305902673, + "grad_norm": 1.3671875, + "learning_rate": 4.5591314264416666e-05, + "loss": 0.1464, + "step": 1970 + }, + { + "epoch": 0.20115818348064615, + "grad_norm": 4.25, + "learning_rate": 4.554549292211371e-05, + "loss": 0.2103, + "step": 1980 + }, + { + "epoch": 0.20217413390226557, + "grad_norm": 2.984375, + "learning_rate": 4.549945796540901e-05, + "loss": 0.144, + "step": 1990 + }, + { + "epoch": 0.203190084323885, + "grad_norm": 1.859375, + "learning_rate": 4.545320987293453e-05, + "loss": 0.1963, + "step": 2000 + }, + { + "epoch": 0.20420603474550442, + "grad_norm": 1.078125, + "learning_rate": 4.540674912553824e-05, + "loss": 0.2115, + "step": 2010 + }, + { + "epoch": 0.20522198516712384, + "grad_norm": 4.25, + "learning_rate": 4.536007620627911e-05, + "loss": 0.1682, + "step": 2020 + }, + { + "epoch": 0.20623793558874326, + "grad_norm": 2.71875, + "learning_rate": 4.531319160042212e-05, + "loss": 0.1992, + "step": 2030 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 1.2890625, + "learning_rate": 4.5266095795433126e-05, + "loss": 0.1134, + "step": 2040 + }, + { + "epoch": 0.20826983643198213, + "grad_norm": 3.296875, + "learning_rate": 4.5218789280973925e-05, + "loss": 0.1474, + "step": 2050 + }, + { + "epoch": 0.20928578685360155, + "grad_norm": 1.9375, + "learning_rate": 4.5171272548897024e-05, + "loss": 0.1955, + "step": 2060 + }, + { + "epoch": 0.21030173727522097, + "grad_norm": 2.734375, + "learning_rate": 4.512354609324063e-05, + "loss": 0.2042, + "step": 2070 + }, + { + "epoch": 0.2113176876968404, + "grad_norm": 2.921875, + "learning_rate": 4.507561041022347e-05, + "loss": 0.2174, + "step": 2080 + }, + { + "epoch": 0.21233363811845982, + "grad_norm": 2.40625, + "learning_rate": 4.502746599823963e-05, + "loss": 0.2634, + "step": 2090 + }, + { + "epoch": 0.21334958854007924, + "grad_norm": 1.71875, + "learning_rate": 4.497911335785339e-05, + "loss": 0.1884, + "step": 2100 + }, + { + "epoch": 0.21436553896169866, + "grad_norm": 0.79296875, + "learning_rate": 4.4930552991794e-05, + "loss": 0.1872, + "step": 2110 + }, + { + "epoch": 0.21538148938331808, + "grad_norm": 3.171875, + "learning_rate": 4.4881785404950474e-05, + "loss": 0.2233, + "step": 2120 + }, + { + "epoch": 0.21639743980493753, + "grad_norm": 2.59375, + "learning_rate": 4.483281110436631e-05, + "loss": 0.2374, + "step": 2130 + }, + { + "epoch": 0.21741339022655695, + "grad_norm": 3.328125, + "learning_rate": 4.478363059923426e-05, + "loss": 0.2545, + "step": 2140 + }, + { + "epoch": 0.21842934064817637, + "grad_norm": 2.3125, + "learning_rate": 4.4734244400891014e-05, + "loss": 0.2063, + "step": 2150 + }, + { + "epoch": 0.2194452910697958, + "grad_norm": 3.40625, + "learning_rate": 4.4684653022811865e-05, + "loss": 0.1219, + "step": 2160 + }, + { + "epoch": 0.22046124149141522, + "grad_norm": 4.1875, + "learning_rate": 4.463485698060541e-05, + "loss": 0.2805, + "step": 2170 + }, + { + "epoch": 0.22147719191303464, + "grad_norm": 2.3125, + "learning_rate": 4.458485679200814e-05, + "loss": 0.1998, + "step": 2180 + }, + { + "epoch": 0.22249314233465406, + "grad_norm": 3.578125, + "learning_rate": 4.453465297687912e-05, + "loss": 0.2489, + "step": 2190 + }, + { + "epoch": 0.22350909275627348, + "grad_norm": 2.59375, + "learning_rate": 4.448424605719452e-05, + "loss": 0.2731, + "step": 2200 + }, + { + "epoch": 0.22452504317789293, + "grad_norm": 3.28125, + "learning_rate": 4.443363655704224e-05, + "loss": 0.2425, + "step": 2210 + }, + { + "epoch": 0.22554099359951235, + "grad_norm": 2.78125, + "learning_rate": 4.438282500261641e-05, + "loss": 0.2938, + "step": 2220 + }, + { + "epoch": 0.22655694402113177, + "grad_norm": 1.1953125, + "learning_rate": 4.433181192221197e-05, + "loss": 0.1728, + "step": 2230 + }, + { + "epoch": 0.2275728944427512, + "grad_norm": 1.34375, + "learning_rate": 4.4280597846219155e-05, + "loss": 0.216, + "step": 2240 + }, + { + "epoch": 0.22858884486437062, + "grad_norm": 1.8515625, + "learning_rate": 4.422918330711796e-05, + "loss": 0.1612, + "step": 2250 + }, + { + "epoch": 0.22960479528599004, + "grad_norm": 1.90625, + "learning_rate": 4.417756883947263e-05, + "loss": 0.107, + "step": 2260 + }, + { + "epoch": 0.23062074570760946, + "grad_norm": 3.375, + "learning_rate": 4.412575497992611e-05, + "loss": 0.1756, + "step": 2270 + }, + { + "epoch": 0.23163669612922888, + "grad_norm": 4.375, + "learning_rate": 4.407374226719445e-05, + "loss": 0.234, + "step": 2280 + }, + { + "epoch": 0.23265264655084833, + "grad_norm": 3.25, + "learning_rate": 4.402153124206119e-05, + "loss": 0.2144, + "step": 2290 + }, + { + "epoch": 0.23366859697246775, + "grad_norm": 1.703125, + "learning_rate": 4.396912244737173e-05, + "loss": 0.1696, + "step": 2300 + }, + { + "epoch": 0.23468454739408717, + "grad_norm": 2.84375, + "learning_rate": 4.391651642802778e-05, + "loss": 0.2506, + "step": 2310 + }, + { + "epoch": 0.2357004978157066, + "grad_norm": 4.5, + "learning_rate": 4.386371373098155e-05, + "loss": 0.1686, + "step": 2320 + }, + { + "epoch": 0.23671644823732602, + "grad_norm": 2.515625, + "learning_rate": 4.381071490523018e-05, + "loss": 0.2403, + "step": 2330 + }, + { + "epoch": 0.23773239865894544, + "grad_norm": 4.4375, + "learning_rate": 4.3757520501809955e-05, + "loss": 0.1611, + "step": 2340 + }, + { + "epoch": 0.23874834908056486, + "grad_norm": 1.609375, + "learning_rate": 4.370413107379065e-05, + "loss": 0.1698, + "step": 2350 + }, + { + "epoch": 0.23976429950218428, + "grad_norm": 4.96875, + "learning_rate": 4.36505471762697e-05, + "loss": 0.1928, + "step": 2360 + }, + { + "epoch": 0.24078024992380373, + "grad_norm": 0.8984375, + "learning_rate": 4.3596769366366474e-05, + "loss": 0.2035, + "step": 2370 + }, + { + "epoch": 0.24179620034542315, + "grad_norm": 5.75, + "learning_rate": 4.354279820321649e-05, + "loss": 0.16, + "step": 2380 + }, + { + "epoch": 0.24281215076704257, + "grad_norm": 1.9453125, + "learning_rate": 4.34886342479656e-05, + "loss": 0.1851, + "step": 2390 + }, + { + "epoch": 0.243828101188662, + "grad_norm": 1.015625, + "learning_rate": 4.34342780637641e-05, + "loss": 0.1726, + "step": 2400 + }, + { + "epoch": 0.24484405161028142, + "grad_norm": 4.59375, + "learning_rate": 4.337973021576095e-05, + "loss": 0.2847, + "step": 2410 + }, + { + "epoch": 0.24586000203190084, + "grad_norm": 1.03125, + "learning_rate": 4.3324991271097846e-05, + "loss": 0.2528, + "step": 2420 + }, + { + "epoch": 0.24687595245352026, + "grad_norm": 2.1875, + "learning_rate": 4.3270061798903374e-05, + "loss": 0.1573, + "step": 2430 + }, + { + "epoch": 0.24789190287513968, + "grad_norm": 0.98046875, + "learning_rate": 4.321494237028701e-05, + "loss": 0.1703, + "step": 2440 + }, + { + "epoch": 0.24890785329675913, + "grad_norm": 3.8125, + "learning_rate": 4.31596335583333e-05, + "loss": 0.2613, + "step": 2450 + }, + { + "epoch": 0.24992380371837855, + "grad_norm": 4.0625, + "learning_rate": 4.310413593809579e-05, + "loss": 0.22, + "step": 2460 + }, + { + "epoch": 0.250939754139998, + "grad_norm": 3.15625, + "learning_rate": 4.304845008659108e-05, + "loss": 0.1263, + "step": 2470 + }, + { + "epoch": 0.25195570456161737, + "grad_norm": 3.046875, + "learning_rate": 4.2992576582792895e-05, + "loss": 0.1639, + "step": 2480 + }, + { + "epoch": 0.2529716549832368, + "grad_norm": 9.8125, + "learning_rate": 4.293651600762595e-05, + "loss": 0.2681, + "step": 2490 + }, + { + "epoch": 0.25398760540485626, + "grad_norm": 3.734375, + "learning_rate": 4.288026894395999e-05, + "loss": 0.2292, + "step": 2500 + }, + { + "epoch": 0.25500355582647566, + "grad_norm": 0.455078125, + "learning_rate": 4.2823835976603723e-05, + "loss": 0.2324, + "step": 2510 + }, + { + "epoch": 0.2560195062480951, + "grad_norm": 5.625, + "learning_rate": 4.276721769229869e-05, + "loss": 0.1834, + "step": 2520 + }, + { + "epoch": 0.2570354566697145, + "grad_norm": 1.3671875, + "learning_rate": 4.271041467971323e-05, + "loss": 0.1826, + "step": 2530 + }, + { + "epoch": 0.25805140709133395, + "grad_norm": 5.0625, + "learning_rate": 4.265342752943632e-05, + "loss": 0.2463, + "step": 2540 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 2.859375, + "learning_rate": 4.2596256833971425e-05, + "loss": 0.2598, + "step": 2550 + }, + { + "epoch": 0.2600833079345728, + "grad_norm": 1.8515625, + "learning_rate": 4.2538903187730374e-05, + "loss": 0.1148, + "step": 2560 + }, + { + "epoch": 0.26109925835619224, + "grad_norm": 2.71875, + "learning_rate": 4.248136718702716e-05, + "loss": 0.2123, + "step": 2570 + }, + { + "epoch": 0.26211520877781164, + "grad_norm": 4.5625, + "learning_rate": 4.242364943007172e-05, + "loss": 0.2369, + "step": 2580 + }, + { + "epoch": 0.2631311591994311, + "grad_norm": 2.296875, + "learning_rate": 4.236575051696377e-05, + "loss": 0.261, + "step": 2590 + }, + { + "epoch": 0.2641471096210505, + "grad_norm": 2.75, + "learning_rate": 4.2307671049686514e-05, + "loss": 0.1564, + "step": 2600 + }, + { + "epoch": 0.26516306004266993, + "grad_norm": 3.5, + "learning_rate": 4.2249411632100396e-05, + "loss": 0.1563, + "step": 2610 + }, + { + "epoch": 0.2661790104642893, + "grad_norm": 2.84375, + "learning_rate": 4.219097286993684e-05, + "loss": 0.1697, + "step": 2620 + }, + { + "epoch": 0.26719496088590877, + "grad_norm": 2.125, + "learning_rate": 4.2132355370791946e-05, + "loss": 0.1844, + "step": 2630 + }, + { + "epoch": 0.26821091130752817, + "grad_norm": 4.03125, + "learning_rate": 4.2073559744120156e-05, + "loss": 0.2144, + "step": 2640 + }, + { + "epoch": 0.2692268617291476, + "grad_norm": 2.375, + "learning_rate": 4.201458660122793e-05, + "loss": 0.2013, + "step": 2650 + }, + { + "epoch": 0.27024281215076706, + "grad_norm": 3.625, + "learning_rate": 4.1955436555267393e-05, + "loss": 0.2166, + "step": 2660 + }, + { + "epoch": 0.27125876257238646, + "grad_norm": 0.328125, + "learning_rate": 4.189611022122997e-05, + "loss": 0.1934, + "step": 2670 + }, + { + "epoch": 0.2722747129940059, + "grad_norm": 2.75, + "learning_rate": 4.1836608215939944e-05, + "loss": 0.2157, + "step": 2680 + }, + { + "epoch": 0.2732906634156253, + "grad_norm": 3.5, + "learning_rate": 4.17769311580481e-05, + "loss": 0.18, + "step": 2690 + }, + { + "epoch": 0.27430661383724475, + "grad_norm": 2.109375, + "learning_rate": 4.171707966802528e-05, + "loss": 0.2178, + "step": 2700 + }, + { + "epoch": 0.27532256425886414, + "grad_norm": 4.65625, + "learning_rate": 4.16570543681559e-05, + "loss": 0.1896, + "step": 2710 + }, + { + "epoch": 0.2763385146804836, + "grad_norm": 4.8125, + "learning_rate": 4.159685588253151e-05, + "loss": 0.1322, + "step": 2720 + }, + { + "epoch": 0.27735446510210304, + "grad_norm": 3.9375, + "learning_rate": 4.153648483704429e-05, + "loss": 0.184, + "step": 2730 + }, + { + "epoch": 0.27837041552372244, + "grad_norm": 4.53125, + "learning_rate": 4.147594185938057e-05, + "loss": 0.2451, + "step": 2740 + }, + { + "epoch": 0.2793863659453419, + "grad_norm": 1.0390625, + "learning_rate": 4.141522757901426e-05, + "loss": 0.2367, + "step": 2750 + }, + { + "epoch": 0.2804023163669613, + "grad_norm": 3.375, + "learning_rate": 4.1354342627200345e-05, + "loss": 0.179, + "step": 2760 + }, + { + "epoch": 0.28141826678858073, + "grad_norm": 2.953125, + "learning_rate": 4.1293287636968286e-05, + "loss": 0.1396, + "step": 2770 + }, + { + "epoch": 0.2824342172102001, + "grad_norm": 2.546875, + "learning_rate": 4.1232063243115485e-05, + "loss": 0.1963, + "step": 2780 + }, + { + "epoch": 0.28345016763181957, + "grad_norm": 5.09375, + "learning_rate": 4.117067008220063e-05, + "loss": 0.2457, + "step": 2790 + }, + { + "epoch": 0.28446611805343897, + "grad_norm": 2.046875, + "learning_rate": 4.110910879253712e-05, + "loss": 0.2262, + "step": 2800 + }, + { + "epoch": 0.2854820684750584, + "grad_norm": 2.1875, + "learning_rate": 4.104738001418641e-05, + "loss": 0.2499, + "step": 2810 + }, + { + "epoch": 0.28649801889667786, + "grad_norm": 2.59375, + "learning_rate": 4.098548438895135e-05, + "loss": 0.1667, + "step": 2820 + }, + { + "epoch": 0.28751396931829726, + "grad_norm": 2.875, + "learning_rate": 4.092342256036954e-05, + "loss": 0.2288, + "step": 2830 + }, + { + "epoch": 0.2885299197399167, + "grad_norm": 3.015625, + "learning_rate": 4.086119517370659e-05, + "loss": 0.2038, + "step": 2840 + }, + { + "epoch": 0.2895458701615361, + "grad_norm": 3.53125, + "learning_rate": 4.0798802875949485e-05, + "loss": 0.181, + "step": 2850 + }, + { + "epoch": 0.29056182058315555, + "grad_norm": 2.296875, + "learning_rate": 4.073624631579975e-05, + "loss": 0.1886, + "step": 2860 + }, + { + "epoch": 0.29157777100477494, + "grad_norm": 3.609375, + "learning_rate": 4.067352614366685e-05, + "loss": 0.2053, + "step": 2870 + }, + { + "epoch": 0.2925937214263944, + "grad_norm": 2.328125, + "learning_rate": 4.061064301166128e-05, + "loss": 0.1409, + "step": 2880 + }, + { + "epoch": 0.29360967184801384, + "grad_norm": 4.9375, + "learning_rate": 4.054759757358787e-05, + "loss": 0.184, + "step": 2890 + }, + { + "epoch": 0.29462562226963324, + "grad_norm": 4.6875, + "learning_rate": 4.048439048493898e-05, + "loss": 0.2306, + "step": 2900 + }, + { + "epoch": 0.2956415726912527, + "grad_norm": 4.09375, + "learning_rate": 4.0421022402887676e-05, + "loss": 0.1914, + "step": 2910 + }, + { + "epoch": 0.2966575231128721, + "grad_norm": 2.3125, + "learning_rate": 4.035749398628088e-05, + "loss": 0.1653, + "step": 2920 + }, + { + "epoch": 0.29767347353449153, + "grad_norm": 2.515625, + "learning_rate": 4.029380589563256e-05, + "loss": 0.1941, + "step": 2930 + }, + { + "epoch": 0.2986894239561109, + "grad_norm": 1.78125, + "learning_rate": 4.02299587931168e-05, + "loss": 0.1117, + "step": 2940 + }, + { + "epoch": 0.29970537437773037, + "grad_norm": 0.8359375, + "learning_rate": 4.0165953342560974e-05, + "loss": 0.1605, + "step": 2950 + }, + { + "epoch": 0.30072132479934977, + "grad_norm": 3.046875, + "learning_rate": 4.010179020943884e-05, + "loss": 0.1726, + "step": 2960 + }, + { + "epoch": 0.3017372752209692, + "grad_norm": 3.453125, + "learning_rate": 4.003747006086357e-05, + "loss": 0.2208, + "step": 2970 + }, + { + "epoch": 0.30275322564258866, + "grad_norm": 2.515625, + "learning_rate": 3.9972993565580866e-05, + "loss": 0.1325, + "step": 2980 + }, + { + "epoch": 0.30376917606420806, + "grad_norm": 3.046875, + "learning_rate": 3.9908361393962e-05, + "loss": 0.2014, + "step": 2990 + }, + { + "epoch": 0.3047851264858275, + "grad_norm": 2.28125, + "learning_rate": 3.984357421799681e-05, + "loss": 0.165, + "step": 3000 + }, + { + "epoch": 0.3058010769074469, + "grad_norm": 5.09375, + "learning_rate": 3.9778632711286756e-05, + "loss": 0.212, + "step": 3010 + }, + { + "epoch": 0.30681702732906635, + "grad_norm": 4.25, + "learning_rate": 3.971353754903788e-05, + "loss": 0.2388, + "step": 3020 + }, + { + "epoch": 0.30783297775068574, + "grad_norm": 2.34375, + "learning_rate": 3.964828940805381e-05, + "loss": 0.2175, + "step": 3030 + }, + { + "epoch": 0.3088489281723052, + "grad_norm": 4.09375, + "learning_rate": 3.95828889667287e-05, + "loss": 0.2088, + "step": 3040 + }, + { + "epoch": 0.30986487859392464, + "grad_norm": 2.359375, + "learning_rate": 3.9517336905040244e-05, + "loss": 0.1913, + "step": 3050 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 1.1640625, + "learning_rate": 3.9451633904542483e-05, + "loss": 0.2185, + "step": 3060 + }, + { + "epoch": 0.3118967794371635, + "grad_norm": 2.59375, + "learning_rate": 3.9385780648358846e-05, + "loss": 0.2072, + "step": 3070 + }, + { + "epoch": 0.3129127298587829, + "grad_norm": 3.015625, + "learning_rate": 3.9319777821174955e-05, + "loss": 0.1902, + "step": 3080 + }, + { + "epoch": 0.31392868028040233, + "grad_norm": 2.375, + "learning_rate": 3.925362610923158e-05, + "loss": 0.259, + "step": 3090 + }, + { + "epoch": 0.3149446307020217, + "grad_norm": 4.65625, + "learning_rate": 3.918732620031742e-05, + "loss": 0.2026, + "step": 3100 + }, + { + "epoch": 0.31596058112364117, + "grad_norm": 2.1875, + "learning_rate": 3.912087878376205e-05, + "loss": 0.1478, + "step": 3110 + }, + { + "epoch": 0.31697653154526056, + "grad_norm": 2.34375, + "learning_rate": 3.905428455042865e-05, + "loss": 0.167, + "step": 3120 + }, + { + "epoch": 0.31799248196688, + "grad_norm": 2.390625, + "learning_rate": 3.898754419270693e-05, + "loss": 0.1629, + "step": 3130 + }, + { + "epoch": 0.31900843238849946, + "grad_norm": 1.546875, + "learning_rate": 3.892065840450583e-05, + "loss": 0.1308, + "step": 3140 + }, + { + "epoch": 0.32002438281011886, + "grad_norm": 4.625, + "learning_rate": 3.885362788124637e-05, + "loss": 0.2008, + "step": 3150 + }, + { + "epoch": 0.3210403332317383, + "grad_norm": 3.8125, + "learning_rate": 3.8786453319854396e-05, + "loss": 0.2225, + "step": 3160 + }, + { + "epoch": 0.3220562836533577, + "grad_norm": 3.015625, + "learning_rate": 3.8719135418753366e-05, + "loss": 0.2243, + "step": 3170 + }, + { + "epoch": 0.32307223407497715, + "grad_norm": 5.6875, + "learning_rate": 3.865167487785702e-05, + "loss": 0.1981, + "step": 3180 + }, + { + "epoch": 0.32408818449659654, + "grad_norm": 4.84375, + "learning_rate": 3.8584072398562164e-05, + "loss": 0.2031, + "step": 3190 + }, + { + "epoch": 0.325104134918216, + "grad_norm": 4.0625, + "learning_rate": 3.851632868374136e-05, + "loss": 0.1621, + "step": 3200 + }, + { + "epoch": 0.32612008533983544, + "grad_norm": 3.421875, + "learning_rate": 3.844844443773562e-05, + "loss": 0.1674, + "step": 3210 + }, + { + "epoch": 0.32713603576145484, + "grad_norm": 1.3671875, + "learning_rate": 3.8380420366347046e-05, + "loss": 0.1502, + "step": 3220 + }, + { + "epoch": 0.3281519861830743, + "grad_norm": 3.734375, + "learning_rate": 3.831225717683157e-05, + "loss": 0.1868, + "step": 3230 + }, + { + "epoch": 0.3291679366046937, + "grad_norm": 2.703125, + "learning_rate": 3.8243955577891534e-05, + "loss": 0.1818, + "step": 3240 + }, + { + "epoch": 0.3301838870263131, + "grad_norm": 3.796875, + "learning_rate": 3.8175516279668335e-05, + "loss": 0.2215, + "step": 3250 + }, + { + "epoch": 0.3311998374479325, + "grad_norm": 3.203125, + "learning_rate": 3.810693999373505e-05, + "loss": 0.2544, + "step": 3260 + }, + { + "epoch": 0.33221578786955197, + "grad_norm": 4.0, + "learning_rate": 3.8038227433089056e-05, + "loss": 0.1175, + "step": 3270 + }, + { + "epoch": 0.33323173829117136, + "grad_norm": 3.625, + "learning_rate": 3.796937931214458e-05, + "loss": 0.2213, + "step": 3280 + }, + { + "epoch": 0.3342476887127908, + "grad_norm": 1.7265625, + "learning_rate": 3.7900396346725296e-05, + "loss": 0.1711, + "step": 3290 + }, + { + "epoch": 0.33526363913441026, + "grad_norm": 3.140625, + "learning_rate": 3.783127925405686e-05, + "loss": 0.2628, + "step": 3300 + }, + { + "epoch": 0.33627958955602966, + "grad_norm": 2.1875, + "learning_rate": 3.77620287527595e-05, + "loss": 0.1671, + "step": 3310 + }, + { + "epoch": 0.3372955399776491, + "grad_norm": 5.28125, + "learning_rate": 3.769264556284048e-05, + "loss": 0.2109, + "step": 3320 + }, + { + "epoch": 0.3383114903992685, + "grad_norm": 2.875, + "learning_rate": 3.762313040568665e-05, + "loss": 0.1978, + "step": 3330 + }, + { + "epoch": 0.33932744082088795, + "grad_norm": 2.234375, + "learning_rate": 3.755348400405697e-05, + "loss": 0.1275, + "step": 3340 + }, + { + "epoch": 0.34034339124250734, + "grad_norm": 1.9453125, + "learning_rate": 3.7483707082074945e-05, + "loss": 0.1482, + "step": 3350 + }, + { + "epoch": 0.3413593416641268, + "grad_norm": 5.40625, + "learning_rate": 3.741380036522111e-05, + "loss": 0.1933, + "step": 3360 + }, + { + "epoch": 0.34237529208574624, + "grad_norm": 4.53125, + "learning_rate": 3.734376458032551e-05, + "loss": 0.1925, + "step": 3370 + }, + { + "epoch": 0.34339124250736563, + "grad_norm": 4.0625, + "learning_rate": 3.727360045556014e-05, + "loss": 0.2297, + "step": 3380 + }, + { + "epoch": 0.3444071929289851, + "grad_norm": 2.53125, + "learning_rate": 3.7203308720431336e-05, + "loss": 0.1704, + "step": 3390 + }, + { + "epoch": 0.3454231433506045, + "grad_norm": 1.859375, + "learning_rate": 3.7132890105772234e-05, + "loss": 0.258, + "step": 3400 + }, + { + "epoch": 0.3464390937722239, + "grad_norm": 3.90625, + "learning_rate": 3.706234534373515e-05, + "loss": 0.2376, + "step": 3410 + }, + { + "epoch": 0.3474550441938433, + "grad_norm": 1.1015625, + "learning_rate": 3.6991675167783985e-05, + "loss": 0.2403, + "step": 3420 + }, + { + "epoch": 0.34847099461546277, + "grad_norm": 1.1640625, + "learning_rate": 3.6920880312686556e-05, + "loss": 0.1642, + "step": 3430 + }, + { + "epoch": 0.34948694503708216, + "grad_norm": 2.875, + "learning_rate": 3.684996151450702e-05, + "loss": 0.1455, + "step": 3440 + }, + { + "epoch": 0.3505028954587016, + "grad_norm": 0.59765625, + "learning_rate": 3.6778919510598155e-05, + "loss": 0.2175, + "step": 3450 + }, + { + "epoch": 0.35151884588032106, + "grad_norm": 0.93359375, + "learning_rate": 3.670775503959376e-05, + "loss": 0.1858, + "step": 3460 + }, + { + "epoch": 0.35253479630194046, + "grad_norm": 4.1875, + "learning_rate": 3.6636468841400917e-05, + "loss": 0.1911, + "step": 3470 + }, + { + "epoch": 0.3535507467235599, + "grad_norm": 3.734375, + "learning_rate": 3.656506165719233e-05, + "loss": 0.2114, + "step": 3480 + }, + { + "epoch": 0.3545666971451793, + "grad_norm": 1.171875, + "learning_rate": 3.649353422939863e-05, + "loss": 0.1841, + "step": 3490 + }, + { + "epoch": 0.35558264756679875, + "grad_norm": 2.53125, + "learning_rate": 3.6421887301700615e-05, + "loss": 0.1505, + "step": 3500 + }, + { + "epoch": 0.35659859798841814, + "grad_norm": 4.9375, + "learning_rate": 3.6350121619021524e-05, + "loss": 0.2625, + "step": 3510 + }, + { + "epoch": 0.3576145484100376, + "grad_norm": 5.25, + "learning_rate": 3.627823792751936e-05, + "loss": 0.1676, + "step": 3520 + }, + { + "epoch": 0.35863049883165704, + "grad_norm": 1.09375, + "learning_rate": 3.620623697457905e-05, + "loss": 0.1963, + "step": 3530 + }, + { + "epoch": 0.35964644925327643, + "grad_norm": 4.03125, + "learning_rate": 3.613411950880468e-05, + "loss": 0.2048, + "step": 3540 + }, + { + "epoch": 0.3606623996748959, + "grad_norm": 4.40625, + "learning_rate": 3.606188628001178e-05, + "loss": 0.226, + "step": 3550 + }, + { + "epoch": 0.3616783500965153, + "grad_norm": 2.375, + "learning_rate": 3.598953803921947e-05, + "loss": 0.1884, + "step": 3560 + }, + { + "epoch": 0.3626943005181347, + "grad_norm": 3.21875, + "learning_rate": 3.591707553864266e-05, + "loss": 0.224, + "step": 3570 + }, + { + "epoch": 0.3637102509397541, + "grad_norm": 3.5625, + "learning_rate": 3.584449953168423e-05, + "loss": 0.1866, + "step": 3580 + }, + { + "epoch": 0.36472620136137357, + "grad_norm": 2.359375, + "learning_rate": 3.577181077292722e-05, + "loss": 0.1663, + "step": 3590 + }, + { + "epoch": 0.36574215178299296, + "grad_norm": 5.0, + "learning_rate": 3.569901001812696e-05, + "loss": 0.2032, + "step": 3600 + }, + { + "epoch": 0.3667581022046124, + "grad_norm": 1.953125, + "learning_rate": 3.562609802420321e-05, + "loss": 0.2395, + "step": 3610 + }, + { + "epoch": 0.36777405262623186, + "grad_norm": 3.796875, + "learning_rate": 3.555307554923229e-05, + "loss": 0.1799, + "step": 3620 + }, + { + "epoch": 0.36879000304785126, + "grad_norm": 4.4375, + "learning_rate": 3.547994335243925e-05, + "loss": 0.1771, + "step": 3630 + }, + { + "epoch": 0.3698059534694707, + "grad_norm": 1.890625, + "learning_rate": 3.540670219418989e-05, + "loss": 0.2123, + "step": 3640 + }, + { + "epoch": 0.3708219038910901, + "grad_norm": 4.03125, + "learning_rate": 3.53333528359829e-05, + "loss": 0.2159, + "step": 3650 + }, + { + "epoch": 0.37183785431270955, + "grad_norm": 3.265625, + "learning_rate": 3.525989604044198e-05, + "loss": 0.2749, + "step": 3660 + }, + { + "epoch": 0.37285380473432894, + "grad_norm": 1.4375, + "learning_rate": 3.5186332571307826e-05, + "loss": 0.1613, + "step": 3670 + }, + { + "epoch": 0.3738697551559484, + "grad_norm": 3.984375, + "learning_rate": 3.511266319343025e-05, + "loss": 0.1877, + "step": 3680 + }, + { + "epoch": 0.37488570557756784, + "grad_norm": 2.203125, + "learning_rate": 3.503888867276022e-05, + "loss": 0.2185, + "step": 3690 + }, + { + "epoch": 0.37590165599918723, + "grad_norm": 1.5078125, + "learning_rate": 3.4965009776341894e-05, + "loss": 0.2195, + "step": 3700 + }, + { + "epoch": 0.3769176064208067, + "grad_norm": 4.375, + "learning_rate": 3.489102727230461e-05, + "loss": 0.2344, + "step": 3710 + }, + { + "epoch": 0.3779335568424261, + "grad_norm": 2.984375, + "learning_rate": 3.481694192985496e-05, + "loss": 0.1863, + "step": 3720 + }, + { + "epoch": 0.3789495072640455, + "grad_norm": 1.1328125, + "learning_rate": 3.474275451926875e-05, + "loss": 0.1894, + "step": 3730 + }, + { + "epoch": 0.3799654576856649, + "grad_norm": 2.265625, + "learning_rate": 3.4668465811883e-05, + "loss": 0.2127, + "step": 3740 + }, + { + "epoch": 0.38098140810728437, + "grad_norm": 2.921875, + "learning_rate": 3.4594076580087914e-05, + "loss": 0.2125, + "step": 3750 + }, + { + "epoch": 0.38199735852890376, + "grad_norm": 2.390625, + "learning_rate": 3.451958759731889e-05, + "loss": 0.1801, + "step": 3760 + }, + { + "epoch": 0.3830133089505232, + "grad_norm": 3.046875, + "learning_rate": 3.4444999638048456e-05, + "loss": 0.1949, + "step": 3770 + }, + { + "epoch": 0.38402925937214266, + "grad_norm": 2.890625, + "learning_rate": 3.437031347777817e-05, + "loss": 0.2719, + "step": 3780 + }, + { + "epoch": 0.38504520979376206, + "grad_norm": 3.9375, + "learning_rate": 3.4295529893030634e-05, + "loss": 0.1697, + "step": 3790 + }, + { + "epoch": 0.3860611602153815, + "grad_norm": 2.0625, + "learning_rate": 3.422064966134138e-05, + "loss": 0.1557, + "step": 3800 + }, + { + "epoch": 0.3870771106370009, + "grad_norm": 2.234375, + "learning_rate": 3.4145673561250794e-05, + "loss": 0.2129, + "step": 3810 + }, + { + "epoch": 0.38809306105862035, + "grad_norm": 4.96875, + "learning_rate": 3.4070602372296e-05, + "loss": 0.2068, + "step": 3820 + }, + { + "epoch": 0.38910901148023974, + "grad_norm": 2.234375, + "learning_rate": 3.39954368750028e-05, + "loss": 0.1634, + "step": 3830 + }, + { + "epoch": 0.3901249619018592, + "grad_norm": 1.75, + "learning_rate": 3.392017785087752e-05, + "loss": 0.2299, + "step": 3840 + }, + { + "epoch": 0.39114091232347864, + "grad_norm": 3.90625, + "learning_rate": 3.38448260823989e-05, + "loss": 0.1585, + "step": 3850 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 2.8125, + "learning_rate": 3.376938235300996e-05, + "loss": 0.2382, + "step": 3860 + }, + { + "epoch": 0.3931728131667175, + "grad_norm": 5.375, + "learning_rate": 3.369384744710984e-05, + "loss": 0.1987, + "step": 3870 + }, + { + "epoch": 0.3941887635883369, + "grad_norm": 2.578125, + "learning_rate": 3.361822215004566e-05, + "loss": 0.2316, + "step": 3880 + }, + { + "epoch": 0.3952047140099563, + "grad_norm": 2.0, + "learning_rate": 3.354250724810436e-05, + "loss": 0.2019, + "step": 3890 + }, + { + "epoch": 0.3962206644315757, + "grad_norm": 2.3125, + "learning_rate": 3.34667035285045e-05, + "loss": 0.187, + "step": 3900 + }, + { + "epoch": 0.39723661485319517, + "grad_norm": 3.53125, + "learning_rate": 3.339081177938811e-05, + "loss": 0.2353, + "step": 3910 + }, + { + "epoch": 0.39825256527481456, + "grad_norm": 1.9609375, + "learning_rate": 3.331483278981244e-05, + "loss": 0.2078, + "step": 3920 + }, + { + "epoch": 0.399268515696434, + "grad_norm": 1.2109375, + "learning_rate": 3.323876734974183e-05, + "loss": 0.1761, + "step": 3930 + }, + { + "epoch": 0.40028446611805346, + "grad_norm": 4.0625, + "learning_rate": 3.316261625003943e-05, + "loss": 0.2081, + "step": 3940 + }, + { + "epoch": 0.40130041653967286, + "grad_norm": 1.953125, + "learning_rate": 3.308638028245902e-05, + "loss": 0.2087, + "step": 3950 + }, + { + "epoch": 0.4023163669612923, + "grad_norm": 2.390625, + "learning_rate": 3.301006023963676e-05, + "loss": 0.1579, + "step": 3960 + }, + { + "epoch": 0.4033323173829117, + "grad_norm": 3.53125, + "learning_rate": 3.293365691508295e-05, + "loss": 0.1904, + "step": 3970 + }, + { + "epoch": 0.40434826780453115, + "grad_norm": 3.0, + "learning_rate": 3.285717110317379e-05, + "loss": 0.1991, + "step": 3980 + }, + { + "epoch": 0.40536421822615054, + "grad_norm": 7.21875, + "learning_rate": 3.27806035991431e-05, + "loss": 0.1445, + "step": 3990 + }, + { + "epoch": 0.40638016864777, + "grad_norm": 1.0859375, + "learning_rate": 3.2703955199074075e-05, + "loss": 0.2393, + "step": 4000 + }, + { + "epoch": 0.40739611906938944, + "grad_norm": 4.5625, + "learning_rate": 3.262722669989098e-05, + "loss": 0.1789, + "step": 4010 + }, + { + "epoch": 0.40841206949100883, + "grad_norm": 3.09375, + "learning_rate": 3.255041889935092e-05, + "loss": 0.1511, + "step": 4020 + }, + { + "epoch": 0.4094280199126283, + "grad_norm": 1.90625, + "learning_rate": 3.247353259603547e-05, + "loss": 0.2066, + "step": 4030 + }, + { + "epoch": 0.4104439703342477, + "grad_norm": 2.28125, + "learning_rate": 3.239656858934242e-05, + "loss": 0.1564, + "step": 4040 + }, + { + "epoch": 0.4114599207558671, + "grad_norm": 2.609375, + "learning_rate": 3.231952767947746e-05, + "loss": 0.1503, + "step": 4050 + }, + { + "epoch": 0.4124758711774865, + "grad_norm": 1.4453125, + "learning_rate": 3.2242410667445844e-05, + "loss": 0.1633, + "step": 4060 + }, + { + "epoch": 0.41349182159910597, + "grad_norm": 3.015625, + "learning_rate": 3.2165218355044076e-05, + "loss": 0.1492, + "step": 4070 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 3.234375, + "learning_rate": 3.2087951544851566e-05, + "loss": 0.3051, + "step": 4080 + }, + { + "epoch": 0.4155237224423448, + "grad_norm": 2.9375, + "learning_rate": 3.20106110402223e-05, + "loss": 0.2229, + "step": 4090 + }, + { + "epoch": 0.41653967286396426, + "grad_norm": 3.171875, + "learning_rate": 3.1933197645276455e-05, + "loss": 0.2224, + "step": 4100 + }, + { + "epoch": 0.41755562328558365, + "grad_norm": 2.09375, + "learning_rate": 3.185571216489209e-05, + "loss": 0.1297, + "step": 4110 + }, + { + "epoch": 0.4185715737072031, + "grad_norm": 3.625, + "learning_rate": 3.177815540469669e-05, + "loss": 0.2074, + "step": 4120 + }, + { + "epoch": 0.4195875241288225, + "grad_norm": 2.296875, + "learning_rate": 3.1700528171058916e-05, + "loss": 0.1949, + "step": 4130 + }, + { + "epoch": 0.42060347455044195, + "grad_norm": 3.8125, + "learning_rate": 3.162283127108011e-05, + "loss": 0.1661, + "step": 4140 + }, + { + "epoch": 0.42161942497206134, + "grad_norm": 2.5, + "learning_rate": 3.154506551258594e-05, + "loss": 0.2275, + "step": 4150 + }, + { + "epoch": 0.4226353753936808, + "grad_norm": 2.96875, + "learning_rate": 3.146723170411804e-05, + "loss": 0.2242, + "step": 4160 + }, + { + "epoch": 0.42365132581530024, + "grad_norm": 6.625, + "learning_rate": 3.138933065492552e-05, + "loss": 0.1897, + "step": 4170 + }, + { + "epoch": 0.42466727623691963, + "grad_norm": 0.8515625, + "learning_rate": 3.131136317495665e-05, + "loss": 0.1629, + "step": 4180 + }, + { + "epoch": 0.4256832266585391, + "grad_norm": 0.94140625, + "learning_rate": 3.1233330074850364e-05, + "loss": 0.1535, + "step": 4190 + }, + { + "epoch": 0.4266991770801585, + "grad_norm": 2.6875, + "learning_rate": 3.115523216592786e-05, + "loss": 0.2494, + "step": 4200 + }, + { + "epoch": 0.4277151275017779, + "grad_norm": 2.578125, + "learning_rate": 3.107707026018417e-05, + "loss": 0.1705, + "step": 4210 + }, + { + "epoch": 0.4287310779233973, + "grad_norm": 3.0625, + "learning_rate": 3.09988451702797e-05, + "loss": 0.1507, + "step": 4220 + }, + { + "epoch": 0.42974702834501677, + "grad_norm": 2.421875, + "learning_rate": 3.0920557709531804e-05, + "loss": 0.3071, + "step": 4230 + }, + { + "epoch": 0.43076297876663616, + "grad_norm": 3.640625, + "learning_rate": 3.0842208691906306e-05, + "loss": 0.199, + "step": 4240 + }, + { + "epoch": 0.4317789291882556, + "grad_norm": 3.5625, + "learning_rate": 3.076379893200904e-05, + "loss": 0.1987, + "step": 4250 + }, + { + "epoch": 0.43279487960987506, + "grad_norm": 3.65625, + "learning_rate": 3.068532924507739e-05, + "loss": 0.1945, + "step": 4260 + }, + { + "epoch": 0.43381083003149445, + "grad_norm": 5.875, + "learning_rate": 3.060680044697183e-05, + "loss": 0.1937, + "step": 4270 + }, + { + "epoch": 0.4348267804531139, + "grad_norm": 2.859375, + "learning_rate": 3.052821335416739e-05, + "loss": 0.1643, + "step": 4280 + }, + { + "epoch": 0.4358427308747333, + "grad_norm": 3.296875, + "learning_rate": 3.0449568783745203e-05, + "loss": 0.1455, + "step": 4290 + }, + { + "epoch": 0.43685868129635275, + "grad_norm": 0.427734375, + "learning_rate": 3.0370867553384023e-05, + "loss": 0.1891, + "step": 4300 + }, + { + "epoch": 0.43787463171797214, + "grad_norm": 0.361328125, + "learning_rate": 3.029211048135171e-05, + "loss": 0.1377, + "step": 4310 + }, + { + "epoch": 0.4388905821395916, + "grad_norm": 1.8203125, + "learning_rate": 3.021329838649668e-05, + "loss": 0.2194, + "step": 4320 + }, + { + "epoch": 0.43990653256121104, + "grad_norm": 1.8828125, + "learning_rate": 3.0134432088239462e-05, + "loss": 0.1915, + "step": 4330 + }, + { + "epoch": 0.44092248298283043, + "grad_norm": 2.015625, + "learning_rate": 3.0055512406564146e-05, + "loss": 0.1794, + "step": 4340 + }, + { + "epoch": 0.4419384334044499, + "grad_norm": 2.546875, + "learning_rate": 2.9976540162009836e-05, + "loss": 0.2154, + "step": 4350 + }, + { + "epoch": 0.4429543838260693, + "grad_norm": 4.09375, + "learning_rate": 2.9897516175662155e-05, + "loss": 0.1861, + "step": 4360 + }, + { + "epoch": 0.4439703342476887, + "grad_norm": 3.953125, + "learning_rate": 2.9818441269144693e-05, + "loss": 0.1857, + "step": 4370 + }, + { + "epoch": 0.4449862846693081, + "grad_norm": 2.234375, + "learning_rate": 2.9739316264610452e-05, + "loss": 0.1493, + "step": 4380 + }, + { + "epoch": 0.44600223509092757, + "grad_norm": 1.109375, + "learning_rate": 2.966014198473332e-05, + "loss": 0.186, + "step": 4390 + }, + { + "epoch": 0.44701818551254696, + "grad_norm": 4.5625, + "learning_rate": 2.9580919252699502e-05, + "loss": 0.1963, + "step": 4400 + }, + { + "epoch": 0.4480341359341664, + "grad_norm": 7.3125, + "learning_rate": 2.9501648892198984e-05, + "loss": 0.2882, + "step": 4410 + }, + { + "epoch": 0.44905008635578586, + "grad_norm": 3.03125, + "learning_rate": 2.942233172741693e-05, + "loss": 0.2154, + "step": 4420 + }, + { + "epoch": 0.45006603677740525, + "grad_norm": 2.421875, + "learning_rate": 2.934296858302515e-05, + "loss": 0.2228, + "step": 4430 + }, + { + "epoch": 0.4510819871990247, + "grad_norm": 1.6015625, + "learning_rate": 2.9263560284173485e-05, + "loss": 0.1637, + "step": 4440 + }, + { + "epoch": 0.4520979376206441, + "grad_norm": 4.5, + "learning_rate": 2.91841076564813e-05, + "loss": 0.1396, + "step": 4450 + }, + { + "epoch": 0.45311388804226355, + "grad_norm": 1.9609375, + "learning_rate": 2.9104611526028808e-05, + "loss": 0.186, + "step": 4460 + }, + { + "epoch": 0.45412983846388294, + "grad_norm": 2.046875, + "learning_rate": 2.902507271934855e-05, + "loss": 0.1706, + "step": 4470 + }, + { + "epoch": 0.4551457888855024, + "grad_norm": 2.390625, + "learning_rate": 2.8945492063416768e-05, + "loss": 0.2191, + "step": 4480 + }, + { + "epoch": 0.45616173930712184, + "grad_norm": 2.734375, + "learning_rate": 2.8865870385644823e-05, + "loss": 0.1651, + "step": 4490 + }, + { + "epoch": 0.45717768972874123, + "grad_norm": 4.4375, + "learning_rate": 2.8786208513870583e-05, + "loss": 0.1907, + "step": 4500 + }, + { + "epoch": 0.4581936401503607, + "grad_norm": 1.9609375, + "learning_rate": 2.8706507276349815e-05, + "loss": 0.2256, + "step": 4510 + }, + { + "epoch": 0.4592095905719801, + "grad_norm": 3.375, + "learning_rate": 2.8626767501747588e-05, + "loss": 0.215, + "step": 4520 + }, + { + "epoch": 0.4602255409935995, + "grad_norm": 2.296875, + "learning_rate": 2.854699001912964e-05, + "loss": 0.2241, + "step": 4530 + }, + { + "epoch": 0.4612414914152189, + "grad_norm": 2.078125, + "learning_rate": 2.846717565795376e-05, + "loss": 0.1541, + "step": 4540 + }, + { + "epoch": 0.46225744183683837, + "grad_norm": 0.81640625, + "learning_rate": 2.8387325248061164e-05, + "loss": 0.1718, + "step": 4550 + }, + { + "epoch": 0.46327339225845776, + "grad_norm": 5.6875, + "learning_rate": 2.8307439619667897e-05, + "loss": 0.259, + "step": 4560 + }, + { + "epoch": 0.4642893426800772, + "grad_norm": 1.78125, + "learning_rate": 2.8227519603356157e-05, + "loss": 0.2205, + "step": 4570 + }, + { + "epoch": 0.46530529310169666, + "grad_norm": 4.78125, + "learning_rate": 2.8147566030065677e-05, + "loss": 0.2256, + "step": 4580 + }, + { + "epoch": 0.46632124352331605, + "grad_norm": 3.296875, + "learning_rate": 2.8067579731085085e-05, + "loss": 0.1671, + "step": 4590 + }, + { + "epoch": 0.4673371939449355, + "grad_norm": 3.265625, + "learning_rate": 2.7987561538043273e-05, + "loss": 0.2471, + "step": 4600 + }, + { + "epoch": 0.4683531443665549, + "grad_norm": 3.390625, + "learning_rate": 2.7907512282900727e-05, + "loss": 0.1749, + "step": 4610 + }, + { + "epoch": 0.46936909478817435, + "grad_norm": 3.140625, + "learning_rate": 2.782743279794091e-05, + "loss": 0.2276, + "step": 4620 + }, + { + "epoch": 0.47038504520979374, + "grad_norm": 2.921875, + "learning_rate": 2.7747323915761574e-05, + "loss": 0.1971, + "step": 4630 + }, + { + "epoch": 0.4714009956314132, + "grad_norm": 4.15625, + "learning_rate": 2.7667186469266122e-05, + "loss": 0.1951, + "step": 4640 + }, + { + "epoch": 0.47241694605303264, + "grad_norm": 2.953125, + "learning_rate": 2.7587021291654924e-05, + "loss": 0.2045, + "step": 4650 + }, + { + "epoch": 0.47343289647465203, + "grad_norm": 1.6640625, + "learning_rate": 2.750682921641672e-05, + "loss": 0.155, + "step": 4660 + }, + { + "epoch": 0.4744488468962715, + "grad_norm": 4.375, + "learning_rate": 2.7426611077319864e-05, + "loss": 0.2038, + "step": 4670 + }, + { + "epoch": 0.4754647973178909, + "grad_norm": 5.5, + "learning_rate": 2.734636770840372e-05, + "loss": 0.159, + "step": 4680 + }, + { + "epoch": 0.4764807477395103, + "grad_norm": 1.703125, + "learning_rate": 2.7266099943969976e-05, + "loss": 0.1566, + "step": 4690 + }, + { + "epoch": 0.4774966981611297, + "grad_norm": 0.81640625, + "learning_rate": 2.7185808618573943e-05, + "loss": 0.1927, + "step": 4700 + }, + { + "epoch": 0.47851264858274917, + "grad_norm": 0.81640625, + "learning_rate": 2.710549456701592e-05, + "loss": 0.1873, + "step": 4710 + }, + { + "epoch": 0.47952859900436856, + "grad_norm": 3.828125, + "learning_rate": 2.702515862433247e-05, + "loss": 0.2474, + "step": 4720 + }, + { + "epoch": 0.480544549425988, + "grad_norm": 1.1640625, + "learning_rate": 2.6944801625787795e-05, + "loss": 0.204, + "step": 4730 + }, + { + "epoch": 0.48156049984760746, + "grad_norm": 2.953125, + "learning_rate": 2.6864424406864984e-05, + "loss": 0.1758, + "step": 4740 + }, + { + "epoch": 0.48257645026922685, + "grad_norm": 3.265625, + "learning_rate": 2.6784027803257377e-05, + "loss": 0.161, + "step": 4750 + }, + { + "epoch": 0.4835924006908463, + "grad_norm": 2.046875, + "learning_rate": 2.6703612650859848e-05, + "loss": 0.1469, + "step": 4760 + }, + { + "epoch": 0.4846083511124657, + "grad_norm": 4.03125, + "learning_rate": 2.6623179785760148e-05, + "loss": 0.1858, + "step": 4770 + }, + { + "epoch": 0.48562430153408515, + "grad_norm": 2.65625, + "learning_rate": 2.6542730044230175e-05, + "loss": 0.176, + "step": 4780 + }, + { + "epoch": 0.48664025195570454, + "grad_norm": 2.59375, + "learning_rate": 2.6462264262717278e-05, + "loss": 0.1657, + "step": 4790 + }, + { + "epoch": 0.487656202377324, + "grad_norm": 4.78125, + "learning_rate": 2.6381783277835605e-05, + "loss": 0.2705, + "step": 4800 + }, + { + "epoch": 0.48867215279894344, + "grad_norm": 3.65625, + "learning_rate": 2.6301287926357355e-05, + "loss": 0.2252, + "step": 4810 + }, + { + "epoch": 0.48968810322056283, + "grad_norm": 0.734375, + "learning_rate": 2.622077904520411e-05, + "loss": 0.2141, + "step": 4820 + }, + { + "epoch": 0.4907040536421823, + "grad_norm": 5.15625, + "learning_rate": 2.6140257471438108e-05, + "loss": 0.1935, + "step": 4830 + }, + { + "epoch": 0.4917200040638017, + "grad_norm": 3.625, + "learning_rate": 2.6059724042253574e-05, + "loss": 0.2121, + "step": 4840 + }, + { + "epoch": 0.4927359544854211, + "grad_norm": 1.2890625, + "learning_rate": 2.5979179594967983e-05, + "loss": 0.1221, + "step": 4850 + }, + { + "epoch": 0.4937519049070405, + "grad_norm": 3.4375, + "learning_rate": 2.5898624967013367e-05, + "loss": 0.2208, + "step": 4860 + }, + { + "epoch": 0.49476785532865997, + "grad_norm": 2.40625, + "learning_rate": 2.5818060995927607e-05, + "loss": 0.1904, + "step": 4870 + }, + { + "epoch": 0.49578380575027936, + "grad_norm": 2.921875, + "learning_rate": 2.573748851934574e-05, + "loss": 0.1658, + "step": 4880 + }, + { + "epoch": 0.4967997561718988, + "grad_norm": 1.6640625, + "learning_rate": 2.5656908374991213e-05, + "loss": 0.1626, + "step": 4890 + }, + { + "epoch": 0.49781570659351826, + "grad_norm": 1.8046875, + "learning_rate": 2.557632140066721e-05, + "loss": 0.1905, + "step": 4900 + }, + { + "epoch": 0.49883165701513765, + "grad_norm": 4.875, + "learning_rate": 2.5495728434247917e-05, + "loss": 0.2591, + "step": 4910 + }, + { + "epoch": 0.4998476074367571, + "grad_norm": 1.4453125, + "learning_rate": 2.5415130313669845e-05, + "loss": 0.1359, + "step": 4920 + }, + { + "epoch": 0.5008635578583766, + "grad_norm": 2.109375, + "learning_rate": 2.5334527876923063e-05, + "loss": 0.2353, + "step": 4930 + }, + { + "epoch": 0.501879508279996, + "grad_norm": 3.546875, + "learning_rate": 2.5253921962042525e-05, + "loss": 0.2173, + "step": 4940 + }, + { + "epoch": 0.5028954587016153, + "grad_norm": 1.8125, + "learning_rate": 2.5173313407099373e-05, + "loss": 0.1631, + "step": 4950 + }, + { + "epoch": 0.5039114091232347, + "grad_norm": 2.671875, + "learning_rate": 2.5092703050192163e-05, + "loss": 0.1884, + "step": 4960 + }, + { + "epoch": 0.5049273595448542, + "grad_norm": 2.5625, + "learning_rate": 2.501209172943819e-05, + "loss": 0.217, + "step": 4970 + }, + { + "epoch": 0.5059433099664736, + "grad_norm": 4.375, + "learning_rate": 2.49314802829648e-05, + "loss": 0.1854, + "step": 4980 + }, + { + "epoch": 0.506959260388093, + "grad_norm": 2.3125, + "learning_rate": 2.4850869548900628e-05, + "loss": 0.2049, + "step": 4990 + }, + { + "epoch": 0.5079752108097125, + "grad_norm": 3.859375, + "learning_rate": 2.477026036536688e-05, + "loss": 0.2093, + "step": 5000 + } + ], + "logging_steps": 10, + "max_steps": 9843, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoints/checkpoint-5000/training_args.bin b/checkpoints/checkpoint-5000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bb0a4ec10ab29ca1942aecdfa4212d352e373f1 --- /dev/null +++ b/checkpoints/checkpoint-5000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 +size 4984 diff --git a/checkpoints/checkpoint-5500/adapter_config.json b/checkpoints/checkpoint-5500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70 --- /dev/null +++ b/checkpoints/checkpoint-5500/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "gaussian", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoints/checkpoint-5500/adapter_model.safetensors b/checkpoints/checkpoint-5500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..21091e5fb953f81db93b5a0a1b0e35419d285ff5 --- /dev/null +++ b/checkpoints/checkpoint-5500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb810b92dc1e55cf94b723b80404b69163c821876b612daca7c18abd50e5cc29 +size 5919456 diff --git a/checkpoints/checkpoint-5500/optimizer.pt b/checkpoints/checkpoint-5500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..283b36631e755fd00b60aa6cd5641a403ac76671 --- /dev/null +++ b/checkpoints/checkpoint-5500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf105da9fd77aa95ac72217635110bc055276ac42b13c61c0ea8afe029df437c +size 11930938 diff --git a/checkpoints/checkpoint-5500/rng_state_0.pth b/checkpoints/checkpoint-5500/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..7b48e0fe6cf94e78b55f9c79e0ea7aa19d768484 --- /dev/null +++ b/checkpoints/checkpoint-5500/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70343f9cb54717109b4b086674f4317c77a6d5236e01a3f60ae87d47f6637943 +size 15024 diff --git a/checkpoints/checkpoint-5500/rng_state_1.pth b/checkpoints/checkpoint-5500/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..f870e3cd3d0baad3b6137f60d4d1eed197e6b688 --- /dev/null +++ b/checkpoints/checkpoint-5500/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17e6a7bceffa2b144c23b4b2c0efacf64ed5a2f0808ff897d8e65e0e129c709d +size 15024 diff --git a/checkpoints/checkpoint-5500/rng_state_2.pth b/checkpoints/checkpoint-5500/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..a3f7b4465900221f00761bcc0d874e660643e438 --- /dev/null +++ b/checkpoints/checkpoint-5500/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:109af0ffa2fb7f677448e11bba29f896c314794923e50f5ab77002d63db44682 +size 15024 diff --git a/checkpoints/checkpoint-5500/rng_state_3.pth b/checkpoints/checkpoint-5500/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..3912593f43081836761363a4b700970c467e59f3 --- /dev/null +++ b/checkpoints/checkpoint-5500/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:043288038a590b0ede4ede98760e44fe949c1db3d9630240030a93cb18c91259 +size 15024 diff --git a/checkpoints/checkpoint-5500/scheduler.pt b/checkpoints/checkpoint-5500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b047cded6249de59455f00473cd79106488c266 --- /dev/null +++ b/checkpoints/checkpoint-5500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5608c457217d0ace38955805f4007ad8ac45872c2d7cde753cbd1a93ae8304bd +size 1064 diff --git a/checkpoints/checkpoint-5500/trainer_state.json b/checkpoints/checkpoint-5500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d651783d2a32611ed806f44e48a3ee9af386d8df --- /dev/null +++ b/checkpoints/checkpoint-5500/trainer_state.json @@ -0,0 +1,3871 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5587727318906838, + "eval_steps": 500, + "global_step": 5500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001015950421619425, + "grad_norm": 17.625, + "learning_rate": 5e-06, + "loss": 3.4264, + "step": 10 + }, + { + "epoch": 0.00203190084323885, + "grad_norm": 12.5625, + "learning_rate": 1e-05, + "loss": 3.432, + "step": 20 + }, + { + "epoch": 0.003047851264858275, + "grad_norm": 14.0625, + "learning_rate": 1.5e-05, + "loss": 3.23, + "step": 30 + }, + { + "epoch": 0.0040638016864777, + "grad_norm": 12.4375, + "learning_rate": 2e-05, + "loss": 2.9762, + "step": 40 + }, + { + "epoch": 0.005079752108097125, + "grad_norm": 10.0625, + "learning_rate": 2.5e-05, + "loss": 2.6173, + "step": 50 + }, + { + "epoch": 0.00609570252971655, + "grad_norm": 10.1875, + "learning_rate": 3e-05, + "loss": 2.2004, + "step": 60 + }, + { + "epoch": 0.007111652951335975, + "grad_norm": 7.03125, + "learning_rate": 3.5e-05, + "loss": 1.4176, + "step": 70 + }, + { + "epoch": 0.0081276033729554, + "grad_norm": 4.375, + "learning_rate": 4e-05, + "loss": 1.0122, + "step": 80 + }, + { + "epoch": 0.009143553794574825, + "grad_norm": 6.5625, + "learning_rate": 4.5e-05, + "loss": 0.9116, + "step": 90 + }, + { + "epoch": 0.01015950421619425, + "grad_norm": 5.28125, + "learning_rate": 5e-05, + "loss": 0.6832, + "step": 100 + }, + { + "epoch": 0.011175454637813675, + "grad_norm": 5.5, + "learning_rate": 4.9999870035728426e-05, + "loss": 0.7355, + "step": 110 + }, + { + "epoch": 0.0121914050594331, + "grad_norm": 5.1875, + "learning_rate": 4.9999480144264944e-05, + "loss": 0.6673, + "step": 120 + }, + { + "epoch": 0.013207355481052525, + "grad_norm": 4.5, + "learning_rate": 4.9998830329663314e-05, + "loss": 0.6792, + "step": 130 + }, + { + "epoch": 0.01422330590267195, + "grad_norm": 3.9375, + "learning_rate": 4.9997920598679756e-05, + "loss": 0.6207, + "step": 140 + }, + { + "epoch": 0.015239256324291375, + "grad_norm": 3.15625, + "learning_rate": 4.999675096077286e-05, + "loss": 0.483, + "step": 150 + }, + { + "epoch": 0.0162552067459108, + "grad_norm": 5.28125, + "learning_rate": 4.999532142810354e-05, + "loss": 0.5319, + "step": 160 + }, + { + "epoch": 0.017271157167530225, + "grad_norm": 4.59375, + "learning_rate": 4.999363201553483e-05, + "loss": 0.6052, + "step": 170 + }, + { + "epoch": 0.01828710758914965, + "grad_norm": 5.03125, + "learning_rate": 4.9991682740631794e-05, + "loss": 0.4258, + "step": 180 + }, + { + "epoch": 0.019303058010769075, + "grad_norm": 3.859375, + "learning_rate": 4.998947362366133e-05, + "loss": 0.4309, + "step": 190 + }, + { + "epoch": 0.0203190084323885, + "grad_norm": 3.328125, + "learning_rate": 4.998700468759193e-05, + "loss": 0.3957, + "step": 200 + }, + { + "epoch": 0.021334958854007924, + "grad_norm": 4.9375, + "learning_rate": 4.9984275958093475e-05, + "loss": 0.4777, + "step": 210 + }, + { + "epoch": 0.02235090927562735, + "grad_norm": 4.78125, + "learning_rate": 4.998128746353695e-05, + "loss": 0.3549, + "step": 220 + }, + { + "epoch": 0.023366859697246774, + "grad_norm": 4.0625, + "learning_rate": 4.997803923499417e-05, + "loss": 0.4447, + "step": 230 + }, + { + "epoch": 0.0243828101188662, + "grad_norm": 6.375, + "learning_rate": 4.99745313062374e-05, + "loss": 0.3808, + "step": 240 + }, + { + "epoch": 0.025398760540485624, + "grad_norm": 3.59375, + "learning_rate": 4.99707637137391e-05, + "loss": 0.3827, + "step": 250 + }, + { + "epoch": 0.02641471096210505, + "grad_norm": 3.015625, + "learning_rate": 4.996673649667145e-05, + "loss": 0.3694, + "step": 260 + }, + { + "epoch": 0.027430661383724474, + "grad_norm": 2.296875, + "learning_rate": 4.9962449696906e-05, + "loss": 0.3586, + "step": 270 + }, + { + "epoch": 0.0284466118053439, + "grad_norm": 4.125, + "learning_rate": 4.9957903359013214e-05, + "loss": 0.3832, + "step": 280 + }, + { + "epoch": 0.029462562226963324, + "grad_norm": 3.296875, + "learning_rate": 4.995309753026201e-05, + "loss": 0.328, + "step": 290 + }, + { + "epoch": 0.03047851264858275, + "grad_norm": 4.5, + "learning_rate": 4.994803226061927e-05, + "loss": 0.3667, + "step": 300 + }, + { + "epoch": 0.03149446307020217, + "grad_norm": 4.3125, + "learning_rate": 4.994270760274933e-05, + "loss": 0.3811, + "step": 310 + }, + { + "epoch": 0.0325104134918216, + "grad_norm": 3.421875, + "learning_rate": 4.99371236120134e-05, + "loss": 0.3065, + "step": 320 + }, + { + "epoch": 0.03352636391344102, + "grad_norm": 4.6875, + "learning_rate": 4.993128034646902e-05, + "loss": 0.4177, + "step": 330 + }, + { + "epoch": 0.03454231433506045, + "grad_norm": 3.046875, + "learning_rate": 4.992517786686947e-05, + "loss": 0.33, + "step": 340 + }, + { + "epoch": 0.03555826475667987, + "grad_norm": 1.8828125, + "learning_rate": 4.9918816236663077e-05, + "loss": 0.3287, + "step": 350 + }, + { + "epoch": 0.0365742151782993, + "grad_norm": 3.8125, + "learning_rate": 4.991219552199262e-05, + "loss": 0.2934, + "step": 360 + }, + { + "epoch": 0.03759016559991872, + "grad_norm": 4.28125, + "learning_rate": 4.99053157916946e-05, + "loss": 0.3176, + "step": 370 + }, + { + "epoch": 0.03860611602153815, + "grad_norm": 2.609375, + "learning_rate": 4.989817711729856e-05, + "loss": 0.3318, + "step": 380 + }, + { + "epoch": 0.03962206644315757, + "grad_norm": 2.375, + "learning_rate": 4.98907795730263e-05, + "loss": 0.3234, + "step": 390 + }, + { + "epoch": 0.040638016864777, + "grad_norm": 4.46875, + "learning_rate": 4.988312323579114e-05, + "loss": 0.267, + "step": 400 + }, + { + "epoch": 0.04165396728639642, + "grad_norm": 3.75, + "learning_rate": 4.98752081851971e-05, + "loss": 0.3081, + "step": 410 + }, + { + "epoch": 0.04266991770801585, + "grad_norm": 2.203125, + "learning_rate": 4.986703450353809e-05, + "loss": 0.2917, + "step": 420 + }, + { + "epoch": 0.04368586812963527, + "grad_norm": 1.6015625, + "learning_rate": 4.985860227579703e-05, + "loss": 0.2805, + "step": 430 + }, + { + "epoch": 0.0447018185512547, + "grad_norm": 3.140625, + "learning_rate": 4.984991158964499e-05, + "loss": 0.3534, + "step": 440 + }, + { + "epoch": 0.04571776897287412, + "grad_norm": 3.296875, + "learning_rate": 4.9840962535440265e-05, + "loss": 0.335, + "step": 450 + }, + { + "epoch": 0.04673371939449355, + "grad_norm": 3.25, + "learning_rate": 4.983175520622744e-05, + "loss": 0.2544, + "step": 460 + }, + { + "epoch": 0.04774966981611297, + "grad_norm": 2.25, + "learning_rate": 4.982228969773642e-05, + "loss": 0.3449, + "step": 470 + }, + { + "epoch": 0.0487656202377324, + "grad_norm": 4.9375, + "learning_rate": 4.9812566108381435e-05, + "loss": 0.2964, + "step": 480 + }, + { + "epoch": 0.04978157065935182, + "grad_norm": 1.5703125, + "learning_rate": 4.9802584539260035e-05, + "loss": 0.2799, + "step": 490 + }, + { + "epoch": 0.05079752108097125, + "grad_norm": 2.828125, + "learning_rate": 4.979234509415199e-05, + "loss": 0.3231, + "step": 500 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 2.9375, + "learning_rate": 4.978184787951828e-05, + "loss": 0.2943, + "step": 510 + }, + { + "epoch": 0.0528294219242101, + "grad_norm": 2.34375, + "learning_rate": 4.977109300449992e-05, + "loss": 0.2705, + "step": 520 + }, + { + "epoch": 0.05384537234582952, + "grad_norm": 3.140625, + "learning_rate": 4.9760080580916876e-05, + "loss": 0.2998, + "step": 530 + }, + { + "epoch": 0.05486132276744895, + "grad_norm": 3.5625, + "learning_rate": 4.974881072326688e-05, + "loss": 0.2595, + "step": 540 + }, + { + "epoch": 0.05587727318906837, + "grad_norm": 4.25, + "learning_rate": 4.9737283548724236e-05, + "loss": 0.2803, + "step": 550 + }, + { + "epoch": 0.0568932236106878, + "grad_norm": 4.0625, + "learning_rate": 4.97254991771386e-05, + "loss": 0.3511, + "step": 560 + }, + { + "epoch": 0.05790917403230722, + "grad_norm": 2.515625, + "learning_rate": 4.971345773103377e-05, + "loss": 0.312, + "step": 570 + }, + { + "epoch": 0.05892512445392665, + "grad_norm": 3.21875, + "learning_rate": 4.9701159335606365e-05, + "loss": 0.2482, + "step": 580 + }, + { + "epoch": 0.05994107487554607, + "grad_norm": 5.5, + "learning_rate": 4.968860411872454e-05, + "loss": 0.2537, + "step": 590 + }, + { + "epoch": 0.0609570252971655, + "grad_norm": 3.546875, + "learning_rate": 4.967579221092666e-05, + "loss": 0.3125, + "step": 600 + }, + { + "epoch": 0.06197297571878492, + "grad_norm": 2.984375, + "learning_rate": 4.966272374541996e-05, + "loss": 0.2354, + "step": 610 + }, + { + "epoch": 0.06298892614040434, + "grad_norm": 3.6875, + "learning_rate": 4.964939885807912e-05, + "loss": 0.3213, + "step": 620 + }, + { + "epoch": 0.06400487656202378, + "grad_norm": 2.140625, + "learning_rate": 4.9635817687444876e-05, + "loss": 0.3003, + "step": 630 + }, + { + "epoch": 0.0650208269836432, + "grad_norm": 3.484375, + "learning_rate": 4.962198037472259e-05, + "loss": 0.2996, + "step": 640 + }, + { + "epoch": 0.06603677740526262, + "grad_norm": 3.21875, + "learning_rate": 4.9607887063780776e-05, + "loss": 0.2257, + "step": 650 + }, + { + "epoch": 0.06705272782688204, + "grad_norm": 5.375, + "learning_rate": 4.9593537901149564e-05, + "loss": 0.223, + "step": 660 + }, + { + "epoch": 0.06806867824850148, + "grad_norm": 4.1875, + "learning_rate": 4.957893303601924e-05, + "loss": 0.3407, + "step": 670 + }, + { + "epoch": 0.0690846286701209, + "grad_norm": 3.328125, + "learning_rate": 4.956407262023866e-05, + "loss": 0.2589, + "step": 680 + }, + { + "epoch": 0.07010057909174032, + "grad_norm": 2.953125, + "learning_rate": 4.954895680831367e-05, + "loss": 0.2949, + "step": 690 + }, + { + "epoch": 0.07111652951335974, + "grad_norm": 4.0625, + "learning_rate": 4.9533585757405506e-05, + "loss": 0.2995, + "step": 700 + }, + { + "epoch": 0.07213247993497918, + "grad_norm": 4.625, + "learning_rate": 4.951795962732917e-05, + "loss": 0.2894, + "step": 710 + }, + { + "epoch": 0.0731484303565986, + "grad_norm": 3.0, + "learning_rate": 4.9502078580551755e-05, + "loss": 0.3082, + "step": 720 + }, + { + "epoch": 0.07416438077821802, + "grad_norm": 3.65625, + "learning_rate": 4.9485942782190734e-05, + "loss": 0.2308, + "step": 730 + }, + { + "epoch": 0.07518033119983744, + "grad_norm": 4.78125, + "learning_rate": 4.9469552400012306e-05, + "loss": 0.2272, + "step": 740 + }, + { + "epoch": 0.07619628162145688, + "grad_norm": 4.25, + "learning_rate": 4.94529076044296e-05, + "loss": 0.2701, + "step": 750 + }, + { + "epoch": 0.0772122320430763, + "grad_norm": 3.140625, + "learning_rate": 4.94360085685009e-05, + "loss": 0.2686, + "step": 760 + }, + { + "epoch": 0.07822818246469572, + "grad_norm": 0.765625, + "learning_rate": 4.9418855467927894e-05, + "loss": 0.2051, + "step": 770 + }, + { + "epoch": 0.07924413288631514, + "grad_norm": 1.796875, + "learning_rate": 4.940144848105379e-05, + "loss": 0.2267, + "step": 780 + }, + { + "epoch": 0.08026008330793458, + "grad_norm": 4.5625, + "learning_rate": 4.93837877888615e-05, + "loss": 0.2597, + "step": 790 + }, + { + "epoch": 0.081276033729554, + "grad_norm": 3.03125, + "learning_rate": 4.9365873574971745e-05, + "loss": 0.3701, + "step": 800 + }, + { + "epoch": 0.08229198415117342, + "grad_norm": 4.5625, + "learning_rate": 4.9347706025641136e-05, + "loss": 0.2559, + "step": 810 + }, + { + "epoch": 0.08330793457279284, + "grad_norm": 3.90625, + "learning_rate": 4.9329285329760275e-05, + "loss": 0.2799, + "step": 820 + }, + { + "epoch": 0.08432388499441228, + "grad_norm": 3.140625, + "learning_rate": 4.9310611678851735e-05, + "loss": 0.2866, + "step": 830 + }, + { + "epoch": 0.0853398354160317, + "grad_norm": 2.46875, + "learning_rate": 4.929168526706811e-05, + "loss": 0.3105, + "step": 840 + }, + { + "epoch": 0.08635578583765112, + "grad_norm": 13.625, + "learning_rate": 4.927250629119e-05, + "loss": 0.2454, + "step": 850 + }, + { + "epoch": 0.08737173625927054, + "grad_norm": 3.921875, + "learning_rate": 4.9253074950623925e-05, + "loss": 0.2424, + "step": 860 + }, + { + "epoch": 0.08838768668088998, + "grad_norm": 2.90625, + "learning_rate": 4.9233391447400286e-05, + "loss": 0.2481, + "step": 870 + }, + { + "epoch": 0.0894036371025094, + "grad_norm": 2.96875, + "learning_rate": 4.921345598617125e-05, + "loss": 0.2231, + "step": 880 + }, + { + "epoch": 0.09041958752412882, + "grad_norm": 5.375, + "learning_rate": 4.9193268774208654e-05, + "loss": 0.3447, + "step": 890 + }, + { + "epoch": 0.09143553794574824, + "grad_norm": 2.0, + "learning_rate": 4.9172830021401785e-05, + "loss": 0.229, + "step": 900 + }, + { + "epoch": 0.09245148836736768, + "grad_norm": 3.1875, + "learning_rate": 4.9152139940255245e-05, + "loss": 0.2122, + "step": 910 + }, + { + "epoch": 0.0934674387889871, + "grad_norm": 3.40625, + "learning_rate": 4.913119874588677e-05, + "loss": 0.2386, + "step": 920 + }, + { + "epoch": 0.09448338921060652, + "grad_norm": 1.4609375, + "learning_rate": 4.911000665602489e-05, + "loss": 0.1944, + "step": 930 + }, + { + "epoch": 0.09549933963222594, + "grad_norm": 5.0625, + "learning_rate": 4.9088563891006786e-05, + "loss": 0.2038, + "step": 940 + }, + { + "epoch": 0.09651529005384538, + "grad_norm": 4.53125, + "learning_rate": 4.906687067377592e-05, + "loss": 0.3122, + "step": 950 + }, + { + "epoch": 0.0975312404754648, + "grad_norm": 2.84375, + "learning_rate": 4.904492722987976e-05, + "loss": 0.3157, + "step": 960 + }, + { + "epoch": 0.09854719089708422, + "grad_norm": 2.171875, + "learning_rate": 4.902273378746738e-05, + "loss": 0.3077, + "step": 970 + }, + { + "epoch": 0.09956314131870364, + "grad_norm": 2.84375, + "learning_rate": 4.9000290577287165e-05, + "loss": 0.2756, + "step": 980 + }, + { + "epoch": 0.10057909174032308, + "grad_norm": 0.99609375, + "learning_rate": 4.897759783268434e-05, + "loss": 0.2915, + "step": 990 + }, + { + "epoch": 0.1015950421619425, + "grad_norm": 3.53125, + "learning_rate": 4.895465578959859e-05, + "loss": 0.2052, + "step": 1000 + }, + { + "epoch": 0.10261099258356192, + "grad_norm": 4.0, + "learning_rate": 4.893146468656159e-05, + "loss": 0.2499, + "step": 1010 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 1.65625, + "learning_rate": 4.890802476469452e-05, + "loss": 0.278, + "step": 1020 + }, + { + "epoch": 0.10464289342680078, + "grad_norm": 3.625, + "learning_rate": 4.888433626770558e-05, + "loss": 0.2143, + "step": 1030 + }, + { + "epoch": 0.1056588438484202, + "grad_norm": 5.0625, + "learning_rate": 4.886039944188741e-05, + "loss": 0.2878, + "step": 1040 + }, + { + "epoch": 0.10667479427003962, + "grad_norm": 4.5, + "learning_rate": 4.883621453611461e-05, + "loss": 0.2744, + "step": 1050 + }, + { + "epoch": 0.10769074469165904, + "grad_norm": 4.5625, + "learning_rate": 4.881178180184106e-05, + "loss": 0.2734, + "step": 1060 + }, + { + "epoch": 0.10870669511327848, + "grad_norm": 3.125, + "learning_rate": 4.878710149309735e-05, + "loss": 0.3574, + "step": 1070 + }, + { + "epoch": 0.1097226455348979, + "grad_norm": 3.0625, + "learning_rate": 4.876217386648816e-05, + "loss": 0.2625, + "step": 1080 + }, + { + "epoch": 0.11073859595651732, + "grad_norm": 4.0625, + "learning_rate": 4.873699918118955e-05, + "loss": 0.2437, + "step": 1090 + }, + { + "epoch": 0.11175454637813674, + "grad_norm": 1.59375, + "learning_rate": 4.87115776989463e-05, + "loss": 0.2051, + "step": 1100 + }, + { + "epoch": 0.11277049679975618, + "grad_norm": 4.375, + "learning_rate": 4.8685909684069153e-05, + "loss": 0.1727, + "step": 1110 + }, + { + "epoch": 0.1137864472213756, + "grad_norm": 2.28125, + "learning_rate": 4.865999540343211e-05, + "loss": 0.2256, + "step": 1120 + }, + { + "epoch": 0.11480239764299502, + "grad_norm": 2.265625, + "learning_rate": 4.86338351264696e-05, + "loss": 0.3529, + "step": 1130 + }, + { + "epoch": 0.11581834806461444, + "grad_norm": 2.34375, + "learning_rate": 4.8607429125173754e-05, + "loss": 0.2113, + "step": 1140 + }, + { + "epoch": 0.11683429848623388, + "grad_norm": 0.7578125, + "learning_rate": 4.858077767409149e-05, + "loss": 0.2759, + "step": 1150 + }, + { + "epoch": 0.1178502489078533, + "grad_norm": 3.640625, + "learning_rate": 4.855388105032174e-05, + "loss": 0.2482, + "step": 1160 + }, + { + "epoch": 0.11886619932947272, + "grad_norm": 3.5, + "learning_rate": 4.852673953351249e-05, + "loss": 0.1865, + "step": 1170 + }, + { + "epoch": 0.11988214975109214, + "grad_norm": 3.75, + "learning_rate": 4.849935340585796e-05, + "loss": 0.2659, + "step": 1180 + }, + { + "epoch": 0.12089810017271158, + "grad_norm": 3.375, + "learning_rate": 4.8471722952095586e-05, + "loss": 0.1506, + "step": 1190 + }, + { + "epoch": 0.121914050594331, + "grad_norm": 3.34375, + "learning_rate": 4.844384845950312e-05, + "loss": 0.307, + "step": 1200 + }, + { + "epoch": 0.12293000101595042, + "grad_norm": 1.578125, + "learning_rate": 4.841573021789561e-05, + "loss": 0.1952, + "step": 1210 + }, + { + "epoch": 0.12394595143756984, + "grad_norm": 1.2890625, + "learning_rate": 4.838736851962239e-05, + "loss": 0.1779, + "step": 1220 + }, + { + "epoch": 0.12496190185918928, + "grad_norm": 1.265625, + "learning_rate": 4.835876365956408e-05, + "loss": 0.1235, + "step": 1230 + }, + { + "epoch": 0.12597785228080868, + "grad_norm": 1.9609375, + "learning_rate": 4.8329915935129436e-05, + "loss": 0.1876, + "step": 1240 + }, + { + "epoch": 0.12699380270242813, + "grad_norm": 1.6328125, + "learning_rate": 4.830082564625235e-05, + "loss": 0.2188, + "step": 1250 + }, + { + "epoch": 0.12800975312404755, + "grad_norm": 3.96875, + "learning_rate": 4.8271493095388684e-05, + "loss": 0.2622, + "step": 1260 + }, + { + "epoch": 0.12902570354566698, + "grad_norm": 3.765625, + "learning_rate": 4.824191858751312e-05, + "loss": 0.2724, + "step": 1270 + }, + { + "epoch": 0.1300416539672864, + "grad_norm": 5.59375, + "learning_rate": 4.821210243011601e-05, + "loss": 0.2413, + "step": 1280 + }, + { + "epoch": 0.13105760438890582, + "grad_norm": 3.34375, + "learning_rate": 4.818204493320016e-05, + "loss": 0.2618, + "step": 1290 + }, + { + "epoch": 0.13207355481052524, + "grad_norm": 2.78125, + "learning_rate": 4.8151746409277634e-05, + "loss": 0.2295, + "step": 1300 + }, + { + "epoch": 0.13308950523214466, + "grad_norm": 3.1875, + "learning_rate": 4.8121207173366484e-05, + "loss": 0.2733, + "step": 1310 + }, + { + "epoch": 0.13410545565376408, + "grad_norm": 2.28125, + "learning_rate": 4.809042754298746e-05, + "loss": 0.2311, + "step": 1320 + }, + { + "epoch": 0.13512140607538353, + "grad_norm": 2.171875, + "learning_rate": 4.805940783816075e-05, + "loss": 0.2059, + "step": 1330 + }, + { + "epoch": 0.13613735649700295, + "grad_norm": 2.796875, + "learning_rate": 4.8028148381402625e-05, + "loss": 0.2102, + "step": 1340 + }, + { + "epoch": 0.13715330691862238, + "grad_norm": 2.96875, + "learning_rate": 4.7996649497722084e-05, + "loss": 0.2708, + "step": 1350 + }, + { + "epoch": 0.1381692573402418, + "grad_norm": 2.4375, + "learning_rate": 4.7964911514617485e-05, + "loss": 0.2429, + "step": 1360 + }, + { + "epoch": 0.13918520776186122, + "grad_norm": 5.8125, + "learning_rate": 4.793293476207312e-05, + "loss": 0.2725, + "step": 1370 + }, + { + "epoch": 0.14020115818348064, + "grad_norm": 2.40625, + "learning_rate": 4.790071957255585e-05, + "loss": 0.2098, + "step": 1380 + }, + { + "epoch": 0.14121710860510006, + "grad_norm": 4.25, + "learning_rate": 4.786826628101154e-05, + "loss": 0.2101, + "step": 1390 + }, + { + "epoch": 0.14223305902671948, + "grad_norm": 2.578125, + "learning_rate": 4.783557522486167e-05, + "loss": 0.2624, + "step": 1400 + }, + { + "epoch": 0.14324900944833893, + "grad_norm": 3.125, + "learning_rate": 4.780264674399978e-05, + "loss": 0.2518, + "step": 1410 + }, + { + "epoch": 0.14426495986995835, + "grad_norm": 3.671875, + "learning_rate": 4.7769481180787966e-05, + "loss": 0.3112, + "step": 1420 + }, + { + "epoch": 0.14528091029157778, + "grad_norm": 3.984375, + "learning_rate": 4.773607888005327e-05, + "loss": 0.2747, + "step": 1430 + }, + { + "epoch": 0.1462968607131972, + "grad_norm": 3.234375, + "learning_rate": 4.770244018908416e-05, + "loss": 0.1572, + "step": 1440 + }, + { + "epoch": 0.14731281113481662, + "grad_norm": 4.09375, + "learning_rate": 4.766856545762687e-05, + "loss": 0.2148, + "step": 1450 + }, + { + "epoch": 0.14832876155643604, + "grad_norm": 1.6875, + "learning_rate": 4.763445503788178e-05, + "loss": 0.2531, + "step": 1460 + }, + { + "epoch": 0.14934471197805546, + "grad_norm": 2.375, + "learning_rate": 4.760010928449976e-05, + "loss": 0.199, + "step": 1470 + }, + { + "epoch": 0.15036066239967488, + "grad_norm": 4.6875, + "learning_rate": 4.7565528554578485e-05, + "loss": 0.2366, + "step": 1480 + }, + { + "epoch": 0.15137661282129433, + "grad_norm": 5.4375, + "learning_rate": 4.75307132076587e-05, + "loss": 0.1862, + "step": 1490 + }, + { + "epoch": 0.15239256324291375, + "grad_norm": 2.484375, + "learning_rate": 4.749566360572049e-05, + "loss": 0.2143, + "step": 1500 + }, + { + "epoch": 0.15340851366453317, + "grad_norm": 2.1875, + "learning_rate": 4.746038011317955e-05, + "loss": 0.1877, + "step": 1510 + }, + { + "epoch": 0.1544244640861526, + "grad_norm": 2.84375, + "learning_rate": 4.742486309688333e-05, + "loss": 0.2831, + "step": 1520 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 2.015625, + "learning_rate": 4.738911292610732e-05, + "loss": 0.1708, + "step": 1530 + }, + { + "epoch": 0.15645636492939144, + "grad_norm": 3.953125, + "learning_rate": 4.735312997255107e-05, + "loss": 0.192, + "step": 1540 + }, + { + "epoch": 0.15747231535101086, + "grad_norm": 2.09375, + "learning_rate": 4.7316914610334475e-05, + "loss": 0.2586, + "step": 1550 + }, + { + "epoch": 0.15848826577263028, + "grad_norm": 3.6875, + "learning_rate": 4.728046721599378e-05, + "loss": 0.2141, + "step": 1560 + }, + { + "epoch": 0.15950421619424973, + "grad_norm": 2.9375, + "learning_rate": 4.724378816847771e-05, + "loss": 0.193, + "step": 1570 + }, + { + "epoch": 0.16052016661586915, + "grad_norm": 1.5625, + "learning_rate": 4.720687784914352e-05, + "loss": 0.191, + "step": 1580 + }, + { + "epoch": 0.16153611703748857, + "grad_norm": 3.75, + "learning_rate": 4.716973664175304e-05, + "loss": 0.2172, + "step": 1590 + }, + { + "epoch": 0.162552067459108, + "grad_norm": 3.125, + "learning_rate": 4.7132364932468645e-05, + "loss": 0.2134, + "step": 1600 + }, + { + "epoch": 0.16356801788072742, + "grad_norm": 4.09375, + "learning_rate": 4.709476310984932e-05, + "loss": 0.2055, + "step": 1610 + }, + { + "epoch": 0.16458396830234684, + "grad_norm": 3.875, + "learning_rate": 4.705693156484652e-05, + "loss": 0.2136, + "step": 1620 + }, + { + "epoch": 0.16559991872396626, + "grad_norm": 1.1796875, + "learning_rate": 4.7018870690800196e-05, + "loss": 0.1471, + "step": 1630 + }, + { + "epoch": 0.16661586914558568, + "grad_norm": 2.5, + "learning_rate": 4.698058088343465e-05, + "loss": 0.2308, + "step": 1640 + }, + { + "epoch": 0.16763181956720513, + "grad_norm": 1.390625, + "learning_rate": 4.6942062540854425e-05, + "loss": 0.2456, + "step": 1650 + }, + { + "epoch": 0.16864776998882455, + "grad_norm": 3.125, + "learning_rate": 4.69033160635402e-05, + "loss": 0.2654, + "step": 1660 + }, + { + "epoch": 0.16966372041044397, + "grad_norm": 3.984375, + "learning_rate": 4.6864341854344587e-05, + "loss": 0.2226, + "step": 1670 + }, + { + "epoch": 0.1706796708320634, + "grad_norm": 2.328125, + "learning_rate": 4.682514031848795e-05, + "loss": 0.2438, + "step": 1680 + }, + { + "epoch": 0.17169562125368282, + "grad_norm": 3.078125, + "learning_rate": 4.678571186355423e-05, + "loss": 0.1889, + "step": 1690 + }, + { + "epoch": 0.17271157167530224, + "grad_norm": 3.328125, + "learning_rate": 4.6746056899486644e-05, + "loss": 0.2117, + "step": 1700 + }, + { + "epoch": 0.17372752209692166, + "grad_norm": 2.78125, + "learning_rate": 4.67061758385835e-05, + "loss": 0.1953, + "step": 1710 + }, + { + "epoch": 0.17474347251854108, + "grad_norm": 3.09375, + "learning_rate": 4.6666069095493816e-05, + "loss": 0.1844, + "step": 1720 + }, + { + "epoch": 0.17575942294016053, + "grad_norm": 3.234375, + "learning_rate": 4.662573708721309e-05, + "loss": 0.2774, + "step": 1730 + }, + { + "epoch": 0.17677537336177995, + "grad_norm": 4.03125, + "learning_rate": 4.658518023307894e-05, + "loss": 0.2527, + "step": 1740 + }, + { + "epoch": 0.17779132378339937, + "grad_norm": 3.21875, + "learning_rate": 4.654439895476671e-05, + "loss": 0.2164, + "step": 1750 + }, + { + "epoch": 0.1788072742050188, + "grad_norm": 2.390625, + "learning_rate": 4.6503393676285146e-05, + "loss": 0.2424, + "step": 1760 + }, + { + "epoch": 0.17982322462663822, + "grad_norm": 1.8359375, + "learning_rate": 4.646216482397192e-05, + "loss": 0.2428, + "step": 1770 + }, + { + "epoch": 0.18083917504825764, + "grad_norm": 2.796875, + "learning_rate": 4.6420712826489275e-05, + "loss": 0.2155, + "step": 1780 + }, + { + "epoch": 0.18185512546987706, + "grad_norm": 0.69921875, + "learning_rate": 4.6379038114819485e-05, + "loss": 0.1544, + "step": 1790 + }, + { + "epoch": 0.18287107589149648, + "grad_norm": 3.40625, + "learning_rate": 4.6337141122260444e-05, + "loss": 0.2029, + "step": 1800 + }, + { + "epoch": 0.18388702631311593, + "grad_norm": 2.359375, + "learning_rate": 4.629502228442112e-05, + "loss": 0.1489, + "step": 1810 + }, + { + "epoch": 0.18490297673473535, + "grad_norm": 1.4453125, + "learning_rate": 4.6252682039217045e-05, + "loss": 0.2101, + "step": 1820 + }, + { + "epoch": 0.18591892715635477, + "grad_norm": 2.71875, + "learning_rate": 4.621012082686573e-05, + "loss": 0.2076, + "step": 1830 + }, + { + "epoch": 0.1869348775779742, + "grad_norm": 3.0625, + "learning_rate": 4.616733908988216e-05, + "loss": 0.2719, + "step": 1840 + }, + { + "epoch": 0.18795082799959362, + "grad_norm": 1.953125, + "learning_rate": 4.612433727307409e-05, + "loss": 0.2105, + "step": 1850 + }, + { + "epoch": 0.18896677842121304, + "grad_norm": 3.46875, + "learning_rate": 4.608111582353751e-05, + "loss": 0.1877, + "step": 1860 + }, + { + "epoch": 0.18998272884283246, + "grad_norm": 2.546875, + "learning_rate": 4.603767519065197e-05, + "loss": 0.2238, + "step": 1870 + }, + { + "epoch": 0.19099867926445188, + "grad_norm": 1.5703125, + "learning_rate": 4.599401582607589e-05, + "loss": 0.243, + "step": 1880 + }, + { + "epoch": 0.19201462968607133, + "grad_norm": 2.4375, + "learning_rate": 4.595013818374185e-05, + "loss": 0.1867, + "step": 1890 + }, + { + "epoch": 0.19303058010769075, + "grad_norm": 2.203125, + "learning_rate": 4.5906042719851925e-05, + "loss": 0.1994, + "step": 1900 + }, + { + "epoch": 0.19404653052931017, + "grad_norm": 3.984375, + "learning_rate": 4.586172989287291e-05, + "loss": 0.1899, + "step": 1910 + }, + { + "epoch": 0.1950624809509296, + "grad_norm": 2.6875, + "learning_rate": 4.5817200163531534e-05, + "loss": 0.2528, + "step": 1920 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 2.71875, + "learning_rate": 4.577245399480972e-05, + "loss": 0.2336, + "step": 1930 + }, + { + "epoch": 0.19709438179416844, + "grad_norm": 2.640625, + "learning_rate": 4.5727491851939715e-05, + "loss": 0.2204, + "step": 1940 + }, + { + "epoch": 0.19811033221578786, + "grad_norm": 1.78125, + "learning_rate": 4.568231420239929e-05, + "loss": 0.1656, + "step": 1950 + }, + { + "epoch": 0.19912628263740728, + "grad_norm": 3.15625, + "learning_rate": 4.563692151590687e-05, + "loss": 0.2105, + "step": 1960 + }, + { + "epoch": 0.20014223305902673, + "grad_norm": 1.3671875, + "learning_rate": 4.5591314264416666e-05, + "loss": 0.1464, + "step": 1970 + }, + { + "epoch": 0.20115818348064615, + "grad_norm": 4.25, + "learning_rate": 4.554549292211371e-05, + "loss": 0.2103, + "step": 1980 + }, + { + "epoch": 0.20217413390226557, + "grad_norm": 2.984375, + "learning_rate": 4.549945796540901e-05, + "loss": 0.144, + "step": 1990 + }, + { + "epoch": 0.203190084323885, + "grad_norm": 1.859375, + "learning_rate": 4.545320987293453e-05, + "loss": 0.1963, + "step": 2000 + }, + { + "epoch": 0.20420603474550442, + "grad_norm": 1.078125, + "learning_rate": 4.540674912553824e-05, + "loss": 0.2115, + "step": 2010 + }, + { + "epoch": 0.20522198516712384, + "grad_norm": 4.25, + "learning_rate": 4.536007620627911e-05, + "loss": 0.1682, + "step": 2020 + }, + { + "epoch": 0.20623793558874326, + "grad_norm": 2.71875, + "learning_rate": 4.531319160042212e-05, + "loss": 0.1992, + "step": 2030 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 1.2890625, + "learning_rate": 4.5266095795433126e-05, + "loss": 0.1134, + "step": 2040 + }, + { + "epoch": 0.20826983643198213, + "grad_norm": 3.296875, + "learning_rate": 4.5218789280973925e-05, + "loss": 0.1474, + "step": 2050 + }, + { + "epoch": 0.20928578685360155, + "grad_norm": 1.9375, + "learning_rate": 4.5171272548897024e-05, + "loss": 0.1955, + "step": 2060 + }, + { + "epoch": 0.21030173727522097, + "grad_norm": 2.734375, + "learning_rate": 4.512354609324063e-05, + "loss": 0.2042, + "step": 2070 + }, + { + "epoch": 0.2113176876968404, + "grad_norm": 2.921875, + "learning_rate": 4.507561041022347e-05, + "loss": 0.2174, + "step": 2080 + }, + { + "epoch": 0.21233363811845982, + "grad_norm": 2.40625, + "learning_rate": 4.502746599823963e-05, + "loss": 0.2634, + "step": 2090 + }, + { + "epoch": 0.21334958854007924, + "grad_norm": 1.71875, + "learning_rate": 4.497911335785339e-05, + "loss": 0.1884, + "step": 2100 + }, + { + "epoch": 0.21436553896169866, + "grad_norm": 0.79296875, + "learning_rate": 4.4930552991794e-05, + "loss": 0.1872, + "step": 2110 + }, + { + "epoch": 0.21538148938331808, + "grad_norm": 3.171875, + "learning_rate": 4.4881785404950474e-05, + "loss": 0.2233, + "step": 2120 + }, + { + "epoch": 0.21639743980493753, + "grad_norm": 2.59375, + "learning_rate": 4.483281110436631e-05, + "loss": 0.2374, + "step": 2130 + }, + { + "epoch": 0.21741339022655695, + "grad_norm": 3.328125, + "learning_rate": 4.478363059923426e-05, + "loss": 0.2545, + "step": 2140 + }, + { + "epoch": 0.21842934064817637, + "grad_norm": 2.3125, + "learning_rate": 4.4734244400891014e-05, + "loss": 0.2063, + "step": 2150 + }, + { + "epoch": 0.2194452910697958, + "grad_norm": 3.40625, + "learning_rate": 4.4684653022811865e-05, + "loss": 0.1219, + "step": 2160 + }, + { + "epoch": 0.22046124149141522, + "grad_norm": 4.1875, + "learning_rate": 4.463485698060541e-05, + "loss": 0.2805, + "step": 2170 + }, + { + "epoch": 0.22147719191303464, + "grad_norm": 2.3125, + "learning_rate": 4.458485679200814e-05, + "loss": 0.1998, + "step": 2180 + }, + { + "epoch": 0.22249314233465406, + "grad_norm": 3.578125, + "learning_rate": 4.453465297687912e-05, + "loss": 0.2489, + "step": 2190 + }, + { + "epoch": 0.22350909275627348, + "grad_norm": 2.59375, + "learning_rate": 4.448424605719452e-05, + "loss": 0.2731, + "step": 2200 + }, + { + "epoch": 0.22452504317789293, + "grad_norm": 3.28125, + "learning_rate": 4.443363655704224e-05, + "loss": 0.2425, + "step": 2210 + }, + { + "epoch": 0.22554099359951235, + "grad_norm": 2.78125, + "learning_rate": 4.438282500261641e-05, + "loss": 0.2938, + "step": 2220 + }, + { + "epoch": 0.22655694402113177, + "grad_norm": 1.1953125, + "learning_rate": 4.433181192221197e-05, + "loss": 0.1728, + "step": 2230 + }, + { + "epoch": 0.2275728944427512, + "grad_norm": 1.34375, + "learning_rate": 4.4280597846219155e-05, + "loss": 0.216, + "step": 2240 + }, + { + "epoch": 0.22858884486437062, + "grad_norm": 1.8515625, + "learning_rate": 4.422918330711796e-05, + "loss": 0.1612, + "step": 2250 + }, + { + "epoch": 0.22960479528599004, + "grad_norm": 1.90625, + "learning_rate": 4.417756883947263e-05, + "loss": 0.107, + "step": 2260 + }, + { + "epoch": 0.23062074570760946, + "grad_norm": 3.375, + "learning_rate": 4.412575497992611e-05, + "loss": 0.1756, + "step": 2270 + }, + { + "epoch": 0.23163669612922888, + "grad_norm": 4.375, + "learning_rate": 4.407374226719445e-05, + "loss": 0.234, + "step": 2280 + }, + { + "epoch": 0.23265264655084833, + "grad_norm": 3.25, + "learning_rate": 4.402153124206119e-05, + "loss": 0.2144, + "step": 2290 + }, + { + "epoch": 0.23366859697246775, + "grad_norm": 1.703125, + "learning_rate": 4.396912244737173e-05, + "loss": 0.1696, + "step": 2300 + }, + { + "epoch": 0.23468454739408717, + "grad_norm": 2.84375, + "learning_rate": 4.391651642802778e-05, + "loss": 0.2506, + "step": 2310 + }, + { + "epoch": 0.2357004978157066, + "grad_norm": 4.5, + "learning_rate": 4.386371373098155e-05, + "loss": 0.1686, + "step": 2320 + }, + { + "epoch": 0.23671644823732602, + "grad_norm": 2.515625, + "learning_rate": 4.381071490523018e-05, + "loss": 0.2403, + "step": 2330 + }, + { + "epoch": 0.23773239865894544, + "grad_norm": 4.4375, + "learning_rate": 4.3757520501809955e-05, + "loss": 0.1611, + "step": 2340 + }, + { + "epoch": 0.23874834908056486, + "grad_norm": 1.609375, + "learning_rate": 4.370413107379065e-05, + "loss": 0.1698, + "step": 2350 + }, + { + "epoch": 0.23976429950218428, + "grad_norm": 4.96875, + "learning_rate": 4.36505471762697e-05, + "loss": 0.1928, + "step": 2360 + }, + { + "epoch": 0.24078024992380373, + "grad_norm": 0.8984375, + "learning_rate": 4.3596769366366474e-05, + "loss": 0.2035, + "step": 2370 + }, + { + "epoch": 0.24179620034542315, + "grad_norm": 5.75, + "learning_rate": 4.354279820321649e-05, + "loss": 0.16, + "step": 2380 + }, + { + "epoch": 0.24281215076704257, + "grad_norm": 1.9453125, + "learning_rate": 4.34886342479656e-05, + "loss": 0.1851, + "step": 2390 + }, + { + "epoch": 0.243828101188662, + "grad_norm": 1.015625, + "learning_rate": 4.34342780637641e-05, + "loss": 0.1726, + "step": 2400 + }, + { + "epoch": 0.24484405161028142, + "grad_norm": 4.59375, + "learning_rate": 4.337973021576095e-05, + "loss": 0.2847, + "step": 2410 + }, + { + "epoch": 0.24586000203190084, + "grad_norm": 1.03125, + "learning_rate": 4.3324991271097846e-05, + "loss": 0.2528, + "step": 2420 + }, + { + "epoch": 0.24687595245352026, + "grad_norm": 2.1875, + "learning_rate": 4.3270061798903374e-05, + "loss": 0.1573, + "step": 2430 + }, + { + "epoch": 0.24789190287513968, + "grad_norm": 0.98046875, + "learning_rate": 4.321494237028701e-05, + "loss": 0.1703, + "step": 2440 + }, + { + "epoch": 0.24890785329675913, + "grad_norm": 3.8125, + "learning_rate": 4.31596335583333e-05, + "loss": 0.2613, + "step": 2450 + }, + { + "epoch": 0.24992380371837855, + "grad_norm": 4.0625, + "learning_rate": 4.310413593809579e-05, + "loss": 0.22, + "step": 2460 + }, + { + "epoch": 0.250939754139998, + "grad_norm": 3.15625, + "learning_rate": 4.304845008659108e-05, + "loss": 0.1263, + "step": 2470 + }, + { + "epoch": 0.25195570456161737, + "grad_norm": 3.046875, + "learning_rate": 4.2992576582792895e-05, + "loss": 0.1639, + "step": 2480 + }, + { + "epoch": 0.2529716549832368, + "grad_norm": 9.8125, + "learning_rate": 4.293651600762595e-05, + "loss": 0.2681, + "step": 2490 + }, + { + "epoch": 0.25398760540485626, + "grad_norm": 3.734375, + "learning_rate": 4.288026894395999e-05, + "loss": 0.2292, + "step": 2500 + }, + { + "epoch": 0.25500355582647566, + "grad_norm": 0.455078125, + "learning_rate": 4.2823835976603723e-05, + "loss": 0.2324, + "step": 2510 + }, + { + "epoch": 0.2560195062480951, + "grad_norm": 5.625, + "learning_rate": 4.276721769229869e-05, + "loss": 0.1834, + "step": 2520 + }, + { + "epoch": 0.2570354566697145, + "grad_norm": 1.3671875, + "learning_rate": 4.271041467971323e-05, + "loss": 0.1826, + "step": 2530 + }, + { + "epoch": 0.25805140709133395, + "grad_norm": 5.0625, + "learning_rate": 4.265342752943632e-05, + "loss": 0.2463, + "step": 2540 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 2.859375, + "learning_rate": 4.2596256833971425e-05, + "loss": 0.2598, + "step": 2550 + }, + { + "epoch": 0.2600833079345728, + "grad_norm": 1.8515625, + "learning_rate": 4.2538903187730374e-05, + "loss": 0.1148, + "step": 2560 + }, + { + "epoch": 0.26109925835619224, + "grad_norm": 2.71875, + "learning_rate": 4.248136718702716e-05, + "loss": 0.2123, + "step": 2570 + }, + { + "epoch": 0.26211520877781164, + "grad_norm": 4.5625, + "learning_rate": 4.242364943007172e-05, + "loss": 0.2369, + "step": 2580 + }, + { + "epoch": 0.2631311591994311, + "grad_norm": 2.296875, + "learning_rate": 4.236575051696377e-05, + "loss": 0.261, + "step": 2590 + }, + { + "epoch": 0.2641471096210505, + "grad_norm": 2.75, + "learning_rate": 4.2307671049686514e-05, + "loss": 0.1564, + "step": 2600 + }, + { + "epoch": 0.26516306004266993, + "grad_norm": 3.5, + "learning_rate": 4.2249411632100396e-05, + "loss": 0.1563, + "step": 2610 + }, + { + "epoch": 0.2661790104642893, + "grad_norm": 2.84375, + "learning_rate": 4.219097286993684e-05, + "loss": 0.1697, + "step": 2620 + }, + { + "epoch": 0.26719496088590877, + "grad_norm": 2.125, + "learning_rate": 4.2132355370791946e-05, + "loss": 0.1844, + "step": 2630 + }, + { + "epoch": 0.26821091130752817, + "grad_norm": 4.03125, + "learning_rate": 4.2073559744120156e-05, + "loss": 0.2144, + "step": 2640 + }, + { + "epoch": 0.2692268617291476, + "grad_norm": 2.375, + "learning_rate": 4.201458660122793e-05, + "loss": 0.2013, + "step": 2650 + }, + { + "epoch": 0.27024281215076706, + "grad_norm": 3.625, + "learning_rate": 4.1955436555267393e-05, + "loss": 0.2166, + "step": 2660 + }, + { + "epoch": 0.27125876257238646, + "grad_norm": 0.328125, + "learning_rate": 4.189611022122997e-05, + "loss": 0.1934, + "step": 2670 + }, + { + "epoch": 0.2722747129940059, + "grad_norm": 2.75, + "learning_rate": 4.1836608215939944e-05, + "loss": 0.2157, + "step": 2680 + }, + { + "epoch": 0.2732906634156253, + "grad_norm": 3.5, + "learning_rate": 4.17769311580481e-05, + "loss": 0.18, + "step": 2690 + }, + { + "epoch": 0.27430661383724475, + "grad_norm": 2.109375, + "learning_rate": 4.171707966802528e-05, + "loss": 0.2178, + "step": 2700 + }, + { + "epoch": 0.27532256425886414, + "grad_norm": 4.65625, + "learning_rate": 4.16570543681559e-05, + "loss": 0.1896, + "step": 2710 + }, + { + "epoch": 0.2763385146804836, + "grad_norm": 4.8125, + "learning_rate": 4.159685588253151e-05, + "loss": 0.1322, + "step": 2720 + }, + { + "epoch": 0.27735446510210304, + "grad_norm": 3.9375, + "learning_rate": 4.153648483704429e-05, + "loss": 0.184, + "step": 2730 + }, + { + "epoch": 0.27837041552372244, + "grad_norm": 4.53125, + "learning_rate": 4.147594185938057e-05, + "loss": 0.2451, + "step": 2740 + }, + { + "epoch": 0.2793863659453419, + "grad_norm": 1.0390625, + "learning_rate": 4.141522757901426e-05, + "loss": 0.2367, + "step": 2750 + }, + { + "epoch": 0.2804023163669613, + "grad_norm": 3.375, + "learning_rate": 4.1354342627200345e-05, + "loss": 0.179, + "step": 2760 + }, + { + "epoch": 0.28141826678858073, + "grad_norm": 2.953125, + "learning_rate": 4.1293287636968286e-05, + "loss": 0.1396, + "step": 2770 + }, + { + "epoch": 0.2824342172102001, + "grad_norm": 2.546875, + "learning_rate": 4.1232063243115485e-05, + "loss": 0.1963, + "step": 2780 + }, + { + "epoch": 0.28345016763181957, + "grad_norm": 5.09375, + "learning_rate": 4.117067008220063e-05, + "loss": 0.2457, + "step": 2790 + }, + { + "epoch": 0.28446611805343897, + "grad_norm": 2.046875, + "learning_rate": 4.110910879253712e-05, + "loss": 0.2262, + "step": 2800 + }, + { + "epoch": 0.2854820684750584, + "grad_norm": 2.1875, + "learning_rate": 4.104738001418641e-05, + "loss": 0.2499, + "step": 2810 + }, + { + "epoch": 0.28649801889667786, + "grad_norm": 2.59375, + "learning_rate": 4.098548438895135e-05, + "loss": 0.1667, + "step": 2820 + }, + { + "epoch": 0.28751396931829726, + "grad_norm": 2.875, + "learning_rate": 4.092342256036954e-05, + "loss": 0.2288, + "step": 2830 + }, + { + "epoch": 0.2885299197399167, + "grad_norm": 3.015625, + "learning_rate": 4.086119517370659e-05, + "loss": 0.2038, + "step": 2840 + }, + { + "epoch": 0.2895458701615361, + "grad_norm": 3.53125, + "learning_rate": 4.0798802875949485e-05, + "loss": 0.181, + "step": 2850 + }, + { + "epoch": 0.29056182058315555, + "grad_norm": 2.296875, + "learning_rate": 4.073624631579975e-05, + "loss": 0.1886, + "step": 2860 + }, + { + "epoch": 0.29157777100477494, + "grad_norm": 3.609375, + "learning_rate": 4.067352614366685e-05, + "loss": 0.2053, + "step": 2870 + }, + { + "epoch": 0.2925937214263944, + "grad_norm": 2.328125, + "learning_rate": 4.061064301166128e-05, + "loss": 0.1409, + "step": 2880 + }, + { + "epoch": 0.29360967184801384, + "grad_norm": 4.9375, + "learning_rate": 4.054759757358787e-05, + "loss": 0.184, + "step": 2890 + }, + { + "epoch": 0.29462562226963324, + "grad_norm": 4.6875, + "learning_rate": 4.048439048493898e-05, + "loss": 0.2306, + "step": 2900 + }, + { + "epoch": 0.2956415726912527, + "grad_norm": 4.09375, + "learning_rate": 4.0421022402887676e-05, + "loss": 0.1914, + "step": 2910 + }, + { + "epoch": 0.2966575231128721, + "grad_norm": 2.3125, + "learning_rate": 4.035749398628088e-05, + "loss": 0.1653, + "step": 2920 + }, + { + "epoch": 0.29767347353449153, + "grad_norm": 2.515625, + "learning_rate": 4.029380589563256e-05, + "loss": 0.1941, + "step": 2930 + }, + { + "epoch": 0.2986894239561109, + "grad_norm": 1.78125, + "learning_rate": 4.02299587931168e-05, + "loss": 0.1117, + "step": 2940 + }, + { + "epoch": 0.29970537437773037, + "grad_norm": 0.8359375, + "learning_rate": 4.0165953342560974e-05, + "loss": 0.1605, + "step": 2950 + }, + { + "epoch": 0.30072132479934977, + "grad_norm": 3.046875, + "learning_rate": 4.010179020943884e-05, + "loss": 0.1726, + "step": 2960 + }, + { + "epoch": 0.3017372752209692, + "grad_norm": 3.453125, + "learning_rate": 4.003747006086357e-05, + "loss": 0.2208, + "step": 2970 + }, + { + "epoch": 0.30275322564258866, + "grad_norm": 2.515625, + "learning_rate": 3.9972993565580866e-05, + "loss": 0.1325, + "step": 2980 + }, + { + "epoch": 0.30376917606420806, + "grad_norm": 3.046875, + "learning_rate": 3.9908361393962e-05, + "loss": 0.2014, + "step": 2990 + }, + { + "epoch": 0.3047851264858275, + "grad_norm": 2.28125, + "learning_rate": 3.984357421799681e-05, + "loss": 0.165, + "step": 3000 + }, + { + "epoch": 0.3058010769074469, + "grad_norm": 5.09375, + "learning_rate": 3.9778632711286756e-05, + "loss": 0.212, + "step": 3010 + }, + { + "epoch": 0.30681702732906635, + "grad_norm": 4.25, + "learning_rate": 3.971353754903788e-05, + "loss": 0.2388, + "step": 3020 + }, + { + "epoch": 0.30783297775068574, + "grad_norm": 2.34375, + "learning_rate": 3.964828940805381e-05, + "loss": 0.2175, + "step": 3030 + }, + { + "epoch": 0.3088489281723052, + "grad_norm": 4.09375, + "learning_rate": 3.95828889667287e-05, + "loss": 0.2088, + "step": 3040 + }, + { + "epoch": 0.30986487859392464, + "grad_norm": 2.359375, + "learning_rate": 3.9517336905040244e-05, + "loss": 0.1913, + "step": 3050 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 1.1640625, + "learning_rate": 3.9451633904542483e-05, + "loss": 0.2185, + "step": 3060 + }, + { + "epoch": 0.3118967794371635, + "grad_norm": 2.59375, + "learning_rate": 3.9385780648358846e-05, + "loss": 0.2072, + "step": 3070 + }, + { + "epoch": 0.3129127298587829, + "grad_norm": 3.015625, + "learning_rate": 3.9319777821174955e-05, + "loss": 0.1902, + "step": 3080 + }, + { + "epoch": 0.31392868028040233, + "grad_norm": 2.375, + "learning_rate": 3.925362610923158e-05, + "loss": 0.259, + "step": 3090 + }, + { + "epoch": 0.3149446307020217, + "grad_norm": 4.65625, + "learning_rate": 3.918732620031742e-05, + "loss": 0.2026, + "step": 3100 + }, + { + "epoch": 0.31596058112364117, + "grad_norm": 2.1875, + "learning_rate": 3.912087878376205e-05, + "loss": 0.1478, + "step": 3110 + }, + { + "epoch": 0.31697653154526056, + "grad_norm": 2.34375, + "learning_rate": 3.905428455042865e-05, + "loss": 0.167, + "step": 3120 + }, + { + "epoch": 0.31799248196688, + "grad_norm": 2.390625, + "learning_rate": 3.898754419270693e-05, + "loss": 0.1629, + "step": 3130 + }, + { + "epoch": 0.31900843238849946, + "grad_norm": 1.546875, + "learning_rate": 3.892065840450583e-05, + "loss": 0.1308, + "step": 3140 + }, + { + "epoch": 0.32002438281011886, + "grad_norm": 4.625, + "learning_rate": 3.885362788124637e-05, + "loss": 0.2008, + "step": 3150 + }, + { + "epoch": 0.3210403332317383, + "grad_norm": 3.8125, + "learning_rate": 3.8786453319854396e-05, + "loss": 0.2225, + "step": 3160 + }, + { + "epoch": 0.3220562836533577, + "grad_norm": 3.015625, + "learning_rate": 3.8719135418753366e-05, + "loss": 0.2243, + "step": 3170 + }, + { + "epoch": 0.32307223407497715, + "grad_norm": 5.6875, + "learning_rate": 3.865167487785702e-05, + "loss": 0.1981, + "step": 3180 + }, + { + "epoch": 0.32408818449659654, + "grad_norm": 4.84375, + "learning_rate": 3.8584072398562164e-05, + "loss": 0.2031, + "step": 3190 + }, + { + "epoch": 0.325104134918216, + "grad_norm": 4.0625, + "learning_rate": 3.851632868374136e-05, + "loss": 0.1621, + "step": 3200 + }, + { + "epoch": 0.32612008533983544, + "grad_norm": 3.421875, + "learning_rate": 3.844844443773562e-05, + "loss": 0.1674, + "step": 3210 + }, + { + "epoch": 0.32713603576145484, + "grad_norm": 1.3671875, + "learning_rate": 3.8380420366347046e-05, + "loss": 0.1502, + "step": 3220 + }, + { + "epoch": 0.3281519861830743, + "grad_norm": 3.734375, + "learning_rate": 3.831225717683157e-05, + "loss": 0.1868, + "step": 3230 + }, + { + "epoch": 0.3291679366046937, + "grad_norm": 2.703125, + "learning_rate": 3.8243955577891534e-05, + "loss": 0.1818, + "step": 3240 + }, + { + "epoch": 0.3301838870263131, + "grad_norm": 3.796875, + "learning_rate": 3.8175516279668335e-05, + "loss": 0.2215, + "step": 3250 + }, + { + "epoch": 0.3311998374479325, + "grad_norm": 3.203125, + "learning_rate": 3.810693999373505e-05, + "loss": 0.2544, + "step": 3260 + }, + { + "epoch": 0.33221578786955197, + "grad_norm": 4.0, + "learning_rate": 3.8038227433089056e-05, + "loss": 0.1175, + "step": 3270 + }, + { + "epoch": 0.33323173829117136, + "grad_norm": 3.625, + "learning_rate": 3.796937931214458e-05, + "loss": 0.2213, + "step": 3280 + }, + { + "epoch": 0.3342476887127908, + "grad_norm": 1.7265625, + "learning_rate": 3.7900396346725296e-05, + "loss": 0.1711, + "step": 3290 + }, + { + "epoch": 0.33526363913441026, + "grad_norm": 3.140625, + "learning_rate": 3.783127925405686e-05, + "loss": 0.2628, + "step": 3300 + }, + { + "epoch": 0.33627958955602966, + "grad_norm": 2.1875, + "learning_rate": 3.77620287527595e-05, + "loss": 0.1671, + "step": 3310 + }, + { + "epoch": 0.3372955399776491, + "grad_norm": 5.28125, + "learning_rate": 3.769264556284048e-05, + "loss": 0.2109, + "step": 3320 + }, + { + "epoch": 0.3383114903992685, + "grad_norm": 2.875, + "learning_rate": 3.762313040568665e-05, + "loss": 0.1978, + "step": 3330 + }, + { + "epoch": 0.33932744082088795, + "grad_norm": 2.234375, + "learning_rate": 3.755348400405697e-05, + "loss": 0.1275, + "step": 3340 + }, + { + "epoch": 0.34034339124250734, + "grad_norm": 1.9453125, + "learning_rate": 3.7483707082074945e-05, + "loss": 0.1482, + "step": 3350 + }, + { + "epoch": 0.3413593416641268, + "grad_norm": 5.40625, + "learning_rate": 3.741380036522111e-05, + "loss": 0.1933, + "step": 3360 + }, + { + "epoch": 0.34237529208574624, + "grad_norm": 4.53125, + "learning_rate": 3.734376458032551e-05, + "loss": 0.1925, + "step": 3370 + }, + { + "epoch": 0.34339124250736563, + "grad_norm": 4.0625, + "learning_rate": 3.727360045556014e-05, + "loss": 0.2297, + "step": 3380 + }, + { + "epoch": 0.3444071929289851, + "grad_norm": 2.53125, + "learning_rate": 3.7203308720431336e-05, + "loss": 0.1704, + "step": 3390 + }, + { + "epoch": 0.3454231433506045, + "grad_norm": 1.859375, + "learning_rate": 3.7132890105772234e-05, + "loss": 0.258, + "step": 3400 + }, + { + "epoch": 0.3464390937722239, + "grad_norm": 3.90625, + "learning_rate": 3.706234534373515e-05, + "loss": 0.2376, + "step": 3410 + }, + { + "epoch": 0.3474550441938433, + "grad_norm": 1.1015625, + "learning_rate": 3.6991675167783985e-05, + "loss": 0.2403, + "step": 3420 + }, + { + "epoch": 0.34847099461546277, + "grad_norm": 1.1640625, + "learning_rate": 3.6920880312686556e-05, + "loss": 0.1642, + "step": 3430 + }, + { + "epoch": 0.34948694503708216, + "grad_norm": 2.875, + "learning_rate": 3.684996151450702e-05, + "loss": 0.1455, + "step": 3440 + }, + { + "epoch": 0.3505028954587016, + "grad_norm": 0.59765625, + "learning_rate": 3.6778919510598155e-05, + "loss": 0.2175, + "step": 3450 + }, + { + "epoch": 0.35151884588032106, + "grad_norm": 0.93359375, + "learning_rate": 3.670775503959376e-05, + "loss": 0.1858, + "step": 3460 + }, + { + "epoch": 0.35253479630194046, + "grad_norm": 4.1875, + "learning_rate": 3.6636468841400917e-05, + "loss": 0.1911, + "step": 3470 + }, + { + "epoch": 0.3535507467235599, + "grad_norm": 3.734375, + "learning_rate": 3.656506165719233e-05, + "loss": 0.2114, + "step": 3480 + }, + { + "epoch": 0.3545666971451793, + "grad_norm": 1.171875, + "learning_rate": 3.649353422939863e-05, + "loss": 0.1841, + "step": 3490 + }, + { + "epoch": 0.35558264756679875, + "grad_norm": 2.53125, + "learning_rate": 3.6421887301700615e-05, + "loss": 0.1505, + "step": 3500 + }, + { + "epoch": 0.35659859798841814, + "grad_norm": 4.9375, + "learning_rate": 3.6350121619021524e-05, + "loss": 0.2625, + "step": 3510 + }, + { + "epoch": 0.3576145484100376, + "grad_norm": 5.25, + "learning_rate": 3.627823792751936e-05, + "loss": 0.1676, + "step": 3520 + }, + { + "epoch": 0.35863049883165704, + "grad_norm": 1.09375, + "learning_rate": 3.620623697457905e-05, + "loss": 0.1963, + "step": 3530 + }, + { + "epoch": 0.35964644925327643, + "grad_norm": 4.03125, + "learning_rate": 3.613411950880468e-05, + "loss": 0.2048, + "step": 3540 + }, + { + "epoch": 0.3606623996748959, + "grad_norm": 4.40625, + "learning_rate": 3.606188628001178e-05, + "loss": 0.226, + "step": 3550 + }, + { + "epoch": 0.3616783500965153, + "grad_norm": 2.375, + "learning_rate": 3.598953803921947e-05, + "loss": 0.1884, + "step": 3560 + }, + { + "epoch": 0.3626943005181347, + "grad_norm": 3.21875, + "learning_rate": 3.591707553864266e-05, + "loss": 0.224, + "step": 3570 + }, + { + "epoch": 0.3637102509397541, + "grad_norm": 3.5625, + "learning_rate": 3.584449953168423e-05, + "loss": 0.1866, + "step": 3580 + }, + { + "epoch": 0.36472620136137357, + "grad_norm": 2.359375, + "learning_rate": 3.577181077292722e-05, + "loss": 0.1663, + "step": 3590 + }, + { + "epoch": 0.36574215178299296, + "grad_norm": 5.0, + "learning_rate": 3.569901001812696e-05, + "loss": 0.2032, + "step": 3600 + }, + { + "epoch": 0.3667581022046124, + "grad_norm": 1.953125, + "learning_rate": 3.562609802420321e-05, + "loss": 0.2395, + "step": 3610 + }, + { + "epoch": 0.36777405262623186, + "grad_norm": 3.796875, + "learning_rate": 3.555307554923229e-05, + "loss": 0.1799, + "step": 3620 + }, + { + "epoch": 0.36879000304785126, + "grad_norm": 4.4375, + "learning_rate": 3.547994335243925e-05, + "loss": 0.1771, + "step": 3630 + }, + { + "epoch": 0.3698059534694707, + "grad_norm": 1.890625, + "learning_rate": 3.540670219418989e-05, + "loss": 0.2123, + "step": 3640 + }, + { + "epoch": 0.3708219038910901, + "grad_norm": 4.03125, + "learning_rate": 3.53333528359829e-05, + "loss": 0.2159, + "step": 3650 + }, + { + "epoch": 0.37183785431270955, + "grad_norm": 3.265625, + "learning_rate": 3.525989604044198e-05, + "loss": 0.2749, + "step": 3660 + }, + { + "epoch": 0.37285380473432894, + "grad_norm": 1.4375, + "learning_rate": 3.5186332571307826e-05, + "loss": 0.1613, + "step": 3670 + }, + { + "epoch": 0.3738697551559484, + "grad_norm": 3.984375, + "learning_rate": 3.511266319343025e-05, + "loss": 0.1877, + "step": 3680 + }, + { + "epoch": 0.37488570557756784, + "grad_norm": 2.203125, + "learning_rate": 3.503888867276022e-05, + "loss": 0.2185, + "step": 3690 + }, + { + "epoch": 0.37590165599918723, + "grad_norm": 1.5078125, + "learning_rate": 3.4965009776341894e-05, + "loss": 0.2195, + "step": 3700 + }, + { + "epoch": 0.3769176064208067, + "grad_norm": 4.375, + "learning_rate": 3.489102727230461e-05, + "loss": 0.2344, + "step": 3710 + }, + { + "epoch": 0.3779335568424261, + "grad_norm": 2.984375, + "learning_rate": 3.481694192985496e-05, + "loss": 0.1863, + "step": 3720 + }, + { + "epoch": 0.3789495072640455, + "grad_norm": 1.1328125, + "learning_rate": 3.474275451926875e-05, + "loss": 0.1894, + "step": 3730 + }, + { + "epoch": 0.3799654576856649, + "grad_norm": 2.265625, + "learning_rate": 3.4668465811883e-05, + "loss": 0.2127, + "step": 3740 + }, + { + "epoch": 0.38098140810728437, + "grad_norm": 2.921875, + "learning_rate": 3.4594076580087914e-05, + "loss": 0.2125, + "step": 3750 + }, + { + "epoch": 0.38199735852890376, + "grad_norm": 2.390625, + "learning_rate": 3.451958759731889e-05, + "loss": 0.1801, + "step": 3760 + }, + { + "epoch": 0.3830133089505232, + "grad_norm": 3.046875, + "learning_rate": 3.4444999638048456e-05, + "loss": 0.1949, + "step": 3770 + }, + { + "epoch": 0.38402925937214266, + "grad_norm": 2.890625, + "learning_rate": 3.437031347777817e-05, + "loss": 0.2719, + "step": 3780 + }, + { + "epoch": 0.38504520979376206, + "grad_norm": 3.9375, + "learning_rate": 3.4295529893030634e-05, + "loss": 0.1697, + "step": 3790 + }, + { + "epoch": 0.3860611602153815, + "grad_norm": 2.0625, + "learning_rate": 3.422064966134138e-05, + "loss": 0.1557, + "step": 3800 + }, + { + "epoch": 0.3870771106370009, + "grad_norm": 2.234375, + "learning_rate": 3.4145673561250794e-05, + "loss": 0.2129, + "step": 3810 + }, + { + "epoch": 0.38809306105862035, + "grad_norm": 4.96875, + "learning_rate": 3.4070602372296e-05, + "loss": 0.2068, + "step": 3820 + }, + { + "epoch": 0.38910901148023974, + "grad_norm": 2.234375, + "learning_rate": 3.39954368750028e-05, + "loss": 0.1634, + "step": 3830 + }, + { + "epoch": 0.3901249619018592, + "grad_norm": 1.75, + "learning_rate": 3.392017785087752e-05, + "loss": 0.2299, + "step": 3840 + }, + { + "epoch": 0.39114091232347864, + "grad_norm": 3.90625, + "learning_rate": 3.38448260823989e-05, + "loss": 0.1585, + "step": 3850 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 2.8125, + "learning_rate": 3.376938235300996e-05, + "loss": 0.2382, + "step": 3860 + }, + { + "epoch": 0.3931728131667175, + "grad_norm": 5.375, + "learning_rate": 3.369384744710984e-05, + "loss": 0.1987, + "step": 3870 + }, + { + "epoch": 0.3941887635883369, + "grad_norm": 2.578125, + "learning_rate": 3.361822215004566e-05, + "loss": 0.2316, + "step": 3880 + }, + { + "epoch": 0.3952047140099563, + "grad_norm": 2.0, + "learning_rate": 3.354250724810436e-05, + "loss": 0.2019, + "step": 3890 + }, + { + "epoch": 0.3962206644315757, + "grad_norm": 2.3125, + "learning_rate": 3.34667035285045e-05, + "loss": 0.187, + "step": 3900 + }, + { + "epoch": 0.39723661485319517, + "grad_norm": 3.53125, + "learning_rate": 3.339081177938811e-05, + "loss": 0.2353, + "step": 3910 + }, + { + "epoch": 0.39825256527481456, + "grad_norm": 1.9609375, + "learning_rate": 3.331483278981244e-05, + "loss": 0.2078, + "step": 3920 + }, + { + "epoch": 0.399268515696434, + "grad_norm": 1.2109375, + "learning_rate": 3.323876734974183e-05, + "loss": 0.1761, + "step": 3930 + }, + { + "epoch": 0.40028446611805346, + "grad_norm": 4.0625, + "learning_rate": 3.316261625003943e-05, + "loss": 0.2081, + "step": 3940 + }, + { + "epoch": 0.40130041653967286, + "grad_norm": 1.953125, + "learning_rate": 3.308638028245902e-05, + "loss": 0.2087, + "step": 3950 + }, + { + "epoch": 0.4023163669612923, + "grad_norm": 2.390625, + "learning_rate": 3.301006023963676e-05, + "loss": 0.1579, + "step": 3960 + }, + { + "epoch": 0.4033323173829117, + "grad_norm": 3.53125, + "learning_rate": 3.293365691508295e-05, + "loss": 0.1904, + "step": 3970 + }, + { + "epoch": 0.40434826780453115, + "grad_norm": 3.0, + "learning_rate": 3.285717110317379e-05, + "loss": 0.1991, + "step": 3980 + }, + { + "epoch": 0.40536421822615054, + "grad_norm": 7.21875, + "learning_rate": 3.27806035991431e-05, + "loss": 0.1445, + "step": 3990 + }, + { + "epoch": 0.40638016864777, + "grad_norm": 1.0859375, + "learning_rate": 3.2703955199074075e-05, + "loss": 0.2393, + "step": 4000 + }, + { + "epoch": 0.40739611906938944, + "grad_norm": 4.5625, + "learning_rate": 3.262722669989098e-05, + "loss": 0.1789, + "step": 4010 + }, + { + "epoch": 0.40841206949100883, + "grad_norm": 3.09375, + "learning_rate": 3.255041889935092e-05, + "loss": 0.1511, + "step": 4020 + }, + { + "epoch": 0.4094280199126283, + "grad_norm": 1.90625, + "learning_rate": 3.247353259603547e-05, + "loss": 0.2066, + "step": 4030 + }, + { + "epoch": 0.4104439703342477, + "grad_norm": 2.28125, + "learning_rate": 3.239656858934242e-05, + "loss": 0.1564, + "step": 4040 + }, + { + "epoch": 0.4114599207558671, + "grad_norm": 2.609375, + "learning_rate": 3.231952767947746e-05, + "loss": 0.1503, + "step": 4050 + }, + { + "epoch": 0.4124758711774865, + "grad_norm": 1.4453125, + "learning_rate": 3.2242410667445844e-05, + "loss": 0.1633, + "step": 4060 + }, + { + "epoch": 0.41349182159910597, + "grad_norm": 3.015625, + "learning_rate": 3.2165218355044076e-05, + "loss": 0.1492, + "step": 4070 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 3.234375, + "learning_rate": 3.2087951544851566e-05, + "loss": 0.3051, + "step": 4080 + }, + { + "epoch": 0.4155237224423448, + "grad_norm": 2.9375, + "learning_rate": 3.20106110402223e-05, + "loss": 0.2229, + "step": 4090 + }, + { + "epoch": 0.41653967286396426, + "grad_norm": 3.171875, + "learning_rate": 3.1933197645276455e-05, + "loss": 0.2224, + "step": 4100 + }, + { + "epoch": 0.41755562328558365, + "grad_norm": 2.09375, + "learning_rate": 3.185571216489209e-05, + "loss": 0.1297, + "step": 4110 + }, + { + "epoch": 0.4185715737072031, + "grad_norm": 3.625, + "learning_rate": 3.177815540469669e-05, + "loss": 0.2074, + "step": 4120 + }, + { + "epoch": 0.4195875241288225, + "grad_norm": 2.296875, + "learning_rate": 3.1700528171058916e-05, + "loss": 0.1949, + "step": 4130 + }, + { + "epoch": 0.42060347455044195, + "grad_norm": 3.8125, + "learning_rate": 3.162283127108011e-05, + "loss": 0.1661, + "step": 4140 + }, + { + "epoch": 0.42161942497206134, + "grad_norm": 2.5, + "learning_rate": 3.154506551258594e-05, + "loss": 0.2275, + "step": 4150 + }, + { + "epoch": 0.4226353753936808, + "grad_norm": 2.96875, + "learning_rate": 3.146723170411804e-05, + "loss": 0.2242, + "step": 4160 + }, + { + "epoch": 0.42365132581530024, + "grad_norm": 6.625, + "learning_rate": 3.138933065492552e-05, + "loss": 0.1897, + "step": 4170 + }, + { + "epoch": 0.42466727623691963, + "grad_norm": 0.8515625, + "learning_rate": 3.131136317495665e-05, + "loss": 0.1629, + "step": 4180 + }, + { + "epoch": 0.4256832266585391, + "grad_norm": 0.94140625, + "learning_rate": 3.1233330074850364e-05, + "loss": 0.1535, + "step": 4190 + }, + { + "epoch": 0.4266991770801585, + "grad_norm": 2.6875, + "learning_rate": 3.115523216592786e-05, + "loss": 0.2494, + "step": 4200 + }, + { + "epoch": 0.4277151275017779, + "grad_norm": 2.578125, + "learning_rate": 3.107707026018417e-05, + "loss": 0.1705, + "step": 4210 + }, + { + "epoch": 0.4287310779233973, + "grad_norm": 3.0625, + "learning_rate": 3.09988451702797e-05, + "loss": 0.1507, + "step": 4220 + }, + { + "epoch": 0.42974702834501677, + "grad_norm": 2.421875, + "learning_rate": 3.0920557709531804e-05, + "loss": 0.3071, + "step": 4230 + }, + { + "epoch": 0.43076297876663616, + "grad_norm": 3.640625, + "learning_rate": 3.0842208691906306e-05, + "loss": 0.199, + "step": 4240 + }, + { + "epoch": 0.4317789291882556, + "grad_norm": 3.5625, + "learning_rate": 3.076379893200904e-05, + "loss": 0.1987, + "step": 4250 + }, + { + "epoch": 0.43279487960987506, + "grad_norm": 3.65625, + "learning_rate": 3.068532924507739e-05, + "loss": 0.1945, + "step": 4260 + }, + { + "epoch": 0.43381083003149445, + "grad_norm": 5.875, + "learning_rate": 3.060680044697183e-05, + "loss": 0.1937, + "step": 4270 + }, + { + "epoch": 0.4348267804531139, + "grad_norm": 2.859375, + "learning_rate": 3.052821335416739e-05, + "loss": 0.1643, + "step": 4280 + }, + { + "epoch": 0.4358427308747333, + "grad_norm": 3.296875, + "learning_rate": 3.0449568783745203e-05, + "loss": 0.1455, + "step": 4290 + }, + { + "epoch": 0.43685868129635275, + "grad_norm": 0.427734375, + "learning_rate": 3.0370867553384023e-05, + "loss": 0.1891, + "step": 4300 + }, + { + "epoch": 0.43787463171797214, + "grad_norm": 0.361328125, + "learning_rate": 3.029211048135171e-05, + "loss": 0.1377, + "step": 4310 + }, + { + "epoch": 0.4388905821395916, + "grad_norm": 1.8203125, + "learning_rate": 3.021329838649668e-05, + "loss": 0.2194, + "step": 4320 + }, + { + "epoch": 0.43990653256121104, + "grad_norm": 1.8828125, + "learning_rate": 3.0134432088239462e-05, + "loss": 0.1915, + "step": 4330 + }, + { + "epoch": 0.44092248298283043, + "grad_norm": 2.015625, + "learning_rate": 3.0055512406564146e-05, + "loss": 0.1794, + "step": 4340 + }, + { + "epoch": 0.4419384334044499, + "grad_norm": 2.546875, + "learning_rate": 2.9976540162009836e-05, + "loss": 0.2154, + "step": 4350 + }, + { + "epoch": 0.4429543838260693, + "grad_norm": 4.09375, + "learning_rate": 2.9897516175662155e-05, + "loss": 0.1861, + "step": 4360 + }, + { + "epoch": 0.4439703342476887, + "grad_norm": 3.953125, + "learning_rate": 2.9818441269144693e-05, + "loss": 0.1857, + "step": 4370 + }, + { + "epoch": 0.4449862846693081, + "grad_norm": 2.234375, + "learning_rate": 2.9739316264610452e-05, + "loss": 0.1493, + "step": 4380 + }, + { + "epoch": 0.44600223509092757, + "grad_norm": 1.109375, + "learning_rate": 2.966014198473332e-05, + "loss": 0.186, + "step": 4390 + }, + { + "epoch": 0.44701818551254696, + "grad_norm": 4.5625, + "learning_rate": 2.9580919252699502e-05, + "loss": 0.1963, + "step": 4400 + }, + { + "epoch": 0.4480341359341664, + "grad_norm": 7.3125, + "learning_rate": 2.9501648892198984e-05, + "loss": 0.2882, + "step": 4410 + }, + { + "epoch": 0.44905008635578586, + "grad_norm": 3.03125, + "learning_rate": 2.942233172741693e-05, + "loss": 0.2154, + "step": 4420 + }, + { + "epoch": 0.45006603677740525, + "grad_norm": 2.421875, + "learning_rate": 2.934296858302515e-05, + "loss": 0.2228, + "step": 4430 + }, + { + "epoch": 0.4510819871990247, + "grad_norm": 1.6015625, + "learning_rate": 2.9263560284173485e-05, + "loss": 0.1637, + "step": 4440 + }, + { + "epoch": 0.4520979376206441, + "grad_norm": 4.5, + "learning_rate": 2.91841076564813e-05, + "loss": 0.1396, + "step": 4450 + }, + { + "epoch": 0.45311388804226355, + "grad_norm": 1.9609375, + "learning_rate": 2.9104611526028808e-05, + "loss": 0.186, + "step": 4460 + }, + { + "epoch": 0.45412983846388294, + "grad_norm": 2.046875, + "learning_rate": 2.902507271934855e-05, + "loss": 0.1706, + "step": 4470 + }, + { + "epoch": 0.4551457888855024, + "grad_norm": 2.390625, + "learning_rate": 2.8945492063416768e-05, + "loss": 0.2191, + "step": 4480 + }, + { + "epoch": 0.45616173930712184, + "grad_norm": 2.734375, + "learning_rate": 2.8865870385644823e-05, + "loss": 0.1651, + "step": 4490 + }, + { + "epoch": 0.45717768972874123, + "grad_norm": 4.4375, + "learning_rate": 2.8786208513870583e-05, + "loss": 0.1907, + "step": 4500 + }, + { + "epoch": 0.4581936401503607, + "grad_norm": 1.9609375, + "learning_rate": 2.8706507276349815e-05, + "loss": 0.2256, + "step": 4510 + }, + { + "epoch": 0.4592095905719801, + "grad_norm": 3.375, + "learning_rate": 2.8626767501747588e-05, + "loss": 0.215, + "step": 4520 + }, + { + "epoch": 0.4602255409935995, + "grad_norm": 2.296875, + "learning_rate": 2.854699001912964e-05, + "loss": 0.2241, + "step": 4530 + }, + { + "epoch": 0.4612414914152189, + "grad_norm": 2.078125, + "learning_rate": 2.846717565795376e-05, + "loss": 0.1541, + "step": 4540 + }, + { + "epoch": 0.46225744183683837, + "grad_norm": 0.81640625, + "learning_rate": 2.8387325248061164e-05, + "loss": 0.1718, + "step": 4550 + }, + { + "epoch": 0.46327339225845776, + "grad_norm": 5.6875, + "learning_rate": 2.8307439619667897e-05, + "loss": 0.259, + "step": 4560 + }, + { + "epoch": 0.4642893426800772, + "grad_norm": 1.78125, + "learning_rate": 2.8227519603356157e-05, + "loss": 0.2205, + "step": 4570 + }, + { + "epoch": 0.46530529310169666, + "grad_norm": 4.78125, + "learning_rate": 2.8147566030065677e-05, + "loss": 0.2256, + "step": 4580 + }, + { + "epoch": 0.46632124352331605, + "grad_norm": 3.296875, + "learning_rate": 2.8067579731085085e-05, + "loss": 0.1671, + "step": 4590 + }, + { + "epoch": 0.4673371939449355, + "grad_norm": 3.265625, + "learning_rate": 2.7987561538043273e-05, + "loss": 0.2471, + "step": 4600 + }, + { + "epoch": 0.4683531443665549, + "grad_norm": 3.390625, + "learning_rate": 2.7907512282900727e-05, + "loss": 0.1749, + "step": 4610 + }, + { + "epoch": 0.46936909478817435, + "grad_norm": 3.140625, + "learning_rate": 2.782743279794091e-05, + "loss": 0.2276, + "step": 4620 + }, + { + "epoch": 0.47038504520979374, + "grad_norm": 2.921875, + "learning_rate": 2.7747323915761574e-05, + "loss": 0.1971, + "step": 4630 + }, + { + "epoch": 0.4714009956314132, + "grad_norm": 4.15625, + "learning_rate": 2.7667186469266122e-05, + "loss": 0.1951, + "step": 4640 + }, + { + "epoch": 0.47241694605303264, + "grad_norm": 2.953125, + "learning_rate": 2.7587021291654924e-05, + "loss": 0.2045, + "step": 4650 + }, + { + "epoch": 0.47343289647465203, + "grad_norm": 1.6640625, + "learning_rate": 2.750682921641672e-05, + "loss": 0.155, + "step": 4660 + }, + { + "epoch": 0.4744488468962715, + "grad_norm": 4.375, + "learning_rate": 2.7426611077319864e-05, + "loss": 0.2038, + "step": 4670 + }, + { + "epoch": 0.4754647973178909, + "grad_norm": 5.5, + "learning_rate": 2.734636770840372e-05, + "loss": 0.159, + "step": 4680 + }, + { + "epoch": 0.4764807477395103, + "grad_norm": 1.703125, + "learning_rate": 2.7266099943969976e-05, + "loss": 0.1566, + "step": 4690 + }, + { + "epoch": 0.4774966981611297, + "grad_norm": 0.81640625, + "learning_rate": 2.7185808618573943e-05, + "loss": 0.1927, + "step": 4700 + }, + { + "epoch": 0.47851264858274917, + "grad_norm": 0.81640625, + "learning_rate": 2.710549456701592e-05, + "loss": 0.1873, + "step": 4710 + }, + { + "epoch": 0.47952859900436856, + "grad_norm": 3.828125, + "learning_rate": 2.702515862433247e-05, + "loss": 0.2474, + "step": 4720 + }, + { + "epoch": 0.480544549425988, + "grad_norm": 1.1640625, + "learning_rate": 2.6944801625787795e-05, + "loss": 0.204, + "step": 4730 + }, + { + "epoch": 0.48156049984760746, + "grad_norm": 2.953125, + "learning_rate": 2.6864424406864984e-05, + "loss": 0.1758, + "step": 4740 + }, + { + "epoch": 0.48257645026922685, + "grad_norm": 3.265625, + "learning_rate": 2.6784027803257377e-05, + "loss": 0.161, + "step": 4750 + }, + { + "epoch": 0.4835924006908463, + "grad_norm": 2.046875, + "learning_rate": 2.6703612650859848e-05, + "loss": 0.1469, + "step": 4760 + }, + { + "epoch": 0.4846083511124657, + "grad_norm": 4.03125, + "learning_rate": 2.6623179785760148e-05, + "loss": 0.1858, + "step": 4770 + }, + { + "epoch": 0.48562430153408515, + "grad_norm": 2.65625, + "learning_rate": 2.6542730044230175e-05, + "loss": 0.176, + "step": 4780 + }, + { + "epoch": 0.48664025195570454, + "grad_norm": 2.59375, + "learning_rate": 2.6462264262717278e-05, + "loss": 0.1657, + "step": 4790 + }, + { + "epoch": 0.487656202377324, + "grad_norm": 4.78125, + "learning_rate": 2.6381783277835605e-05, + "loss": 0.2705, + "step": 4800 + }, + { + "epoch": 0.48867215279894344, + "grad_norm": 3.65625, + "learning_rate": 2.6301287926357355e-05, + "loss": 0.2252, + "step": 4810 + }, + { + "epoch": 0.48968810322056283, + "grad_norm": 0.734375, + "learning_rate": 2.622077904520411e-05, + "loss": 0.2141, + "step": 4820 + }, + { + "epoch": 0.4907040536421823, + "grad_norm": 5.15625, + "learning_rate": 2.6140257471438108e-05, + "loss": 0.1935, + "step": 4830 + }, + { + "epoch": 0.4917200040638017, + "grad_norm": 3.625, + "learning_rate": 2.6059724042253574e-05, + "loss": 0.2121, + "step": 4840 + }, + { + "epoch": 0.4927359544854211, + "grad_norm": 1.2890625, + "learning_rate": 2.5979179594967983e-05, + "loss": 0.1221, + "step": 4850 + }, + { + "epoch": 0.4937519049070405, + "grad_norm": 3.4375, + "learning_rate": 2.5898624967013367e-05, + "loss": 0.2208, + "step": 4860 + }, + { + "epoch": 0.49476785532865997, + "grad_norm": 2.40625, + "learning_rate": 2.5818060995927607e-05, + "loss": 0.1904, + "step": 4870 + }, + { + "epoch": 0.49578380575027936, + "grad_norm": 2.921875, + "learning_rate": 2.573748851934574e-05, + "loss": 0.1658, + "step": 4880 + }, + { + "epoch": 0.4967997561718988, + "grad_norm": 1.6640625, + "learning_rate": 2.5656908374991213e-05, + "loss": 0.1626, + "step": 4890 + }, + { + "epoch": 0.49781570659351826, + "grad_norm": 1.8046875, + "learning_rate": 2.557632140066721e-05, + "loss": 0.1905, + "step": 4900 + }, + { + "epoch": 0.49883165701513765, + "grad_norm": 4.875, + "learning_rate": 2.5495728434247917e-05, + "loss": 0.2591, + "step": 4910 + }, + { + "epoch": 0.4998476074367571, + "grad_norm": 1.4453125, + "learning_rate": 2.5415130313669845e-05, + "loss": 0.1359, + "step": 4920 + }, + { + "epoch": 0.5008635578583766, + "grad_norm": 2.109375, + "learning_rate": 2.5334527876923063e-05, + "loss": 0.2353, + "step": 4930 + }, + { + "epoch": 0.501879508279996, + "grad_norm": 3.546875, + "learning_rate": 2.5253921962042525e-05, + "loss": 0.2173, + "step": 4940 + }, + { + "epoch": 0.5028954587016153, + "grad_norm": 1.8125, + "learning_rate": 2.5173313407099373e-05, + "loss": 0.1631, + "step": 4950 + }, + { + "epoch": 0.5039114091232347, + "grad_norm": 2.671875, + "learning_rate": 2.5092703050192163e-05, + "loss": 0.1884, + "step": 4960 + }, + { + "epoch": 0.5049273595448542, + "grad_norm": 2.5625, + "learning_rate": 2.501209172943819e-05, + "loss": 0.217, + "step": 4970 + }, + { + "epoch": 0.5059433099664736, + "grad_norm": 4.375, + "learning_rate": 2.49314802829648e-05, + "loss": 0.1854, + "step": 4980 + }, + { + "epoch": 0.506959260388093, + "grad_norm": 2.3125, + "learning_rate": 2.4850869548900628e-05, + "loss": 0.2049, + "step": 4990 + }, + { + "epoch": 0.5079752108097125, + "grad_norm": 3.859375, + "learning_rate": 2.477026036536688e-05, + "loss": 0.2093, + "step": 5000 + }, + { + "epoch": 0.5089911612313319, + "grad_norm": 1.09375, + "learning_rate": 2.4689653570468677e-05, + "loss": 0.164, + "step": 5010 + }, + { + "epoch": 0.5100071116529513, + "grad_norm": 3.40625, + "learning_rate": 2.460905000228628e-05, + "loss": 0.1649, + "step": 5020 + }, + { + "epoch": 0.5110230620745707, + "grad_norm": 3.546875, + "learning_rate": 2.4528450498866428e-05, + "loss": 0.1777, + "step": 5030 + }, + { + "epoch": 0.5120390124961902, + "grad_norm": 3.0, + "learning_rate": 2.444785589821356e-05, + "loss": 0.1505, + "step": 5040 + }, + { + "epoch": 0.5130549629178096, + "grad_norm": 1.6484375, + "learning_rate": 2.436726703828118e-05, + "loss": 0.2672, + "step": 5050 + }, + { + "epoch": 0.514070913339429, + "grad_norm": 4.34375, + "learning_rate": 2.428668475696308e-05, + "loss": 0.1756, + "step": 5060 + }, + { + "epoch": 0.5150868637610485, + "grad_norm": 2.78125, + "learning_rate": 2.420610989208465e-05, + "loss": 0.1655, + "step": 5070 + }, + { + "epoch": 0.5161028141826679, + "grad_norm": 1.4609375, + "learning_rate": 2.412554328139419e-05, + "loss": 0.1579, + "step": 5080 + }, + { + "epoch": 0.5171187646042873, + "grad_norm": 2.28125, + "learning_rate": 2.404498576255416e-05, + "loss": 0.1599, + "step": 5090 + }, + { + "epoch": 0.5181347150259067, + "grad_norm": 0.6484375, + "learning_rate": 2.3964438173132522e-05, + "loss": 0.1508, + "step": 5100 + }, + { + "epoch": 0.5191506654475262, + "grad_norm": 3.390625, + "learning_rate": 2.388390135059395e-05, + "loss": 0.1578, + "step": 5110 + }, + { + "epoch": 0.5201666158691456, + "grad_norm": 1.21875, + "learning_rate": 2.3803376132291226e-05, + "loss": 0.1374, + "step": 5120 + }, + { + "epoch": 0.521182566290765, + "grad_norm": 4.0625, + "learning_rate": 2.3722863355456436e-05, + "loss": 0.1854, + "step": 5130 + }, + { + "epoch": 0.5221985167123845, + "grad_norm": 4.71875, + "learning_rate": 2.364236385719236e-05, + "loss": 0.1391, + "step": 5140 + }, + { + "epoch": 0.5232144671340039, + "grad_norm": 3.296875, + "learning_rate": 2.356187847446366e-05, + "loss": 0.2106, + "step": 5150 + }, + { + "epoch": 0.5242304175556233, + "grad_norm": 3.296875, + "learning_rate": 2.348140804408829e-05, + "loss": 0.2383, + "step": 5160 + }, + { + "epoch": 0.5252463679772427, + "grad_norm": 3.359375, + "learning_rate": 2.3400953402728713e-05, + "loss": 0.1537, + "step": 5170 + }, + { + "epoch": 0.5262623183988622, + "grad_norm": 1.4921875, + "learning_rate": 2.332051538688322e-05, + "loss": 0.1841, + "step": 5180 + }, + { + "epoch": 0.5272782688204816, + "grad_norm": 3.25, + "learning_rate": 2.3240094832877287e-05, + "loss": 0.1855, + "step": 5190 + }, + { + "epoch": 0.528294219242101, + "grad_norm": 3.34375, + "learning_rate": 2.3159692576854793e-05, + "loss": 0.2625, + "step": 5200 + }, + { + "epoch": 0.5293101696637205, + "grad_norm": 3.6875, + "learning_rate": 2.3079309454769413e-05, + "loss": 0.1292, + "step": 5210 + }, + { + "epoch": 0.5303261200853399, + "grad_norm": 1.1171875, + "learning_rate": 2.2998946302375827e-05, + "loss": 0.1263, + "step": 5220 + }, + { + "epoch": 0.5313420705069593, + "grad_norm": 2.71875, + "learning_rate": 2.2918603955221148e-05, + "loss": 0.2296, + "step": 5230 + }, + { + "epoch": 0.5323580209285786, + "grad_norm": 2.015625, + "learning_rate": 2.283828324863613e-05, + "loss": 0.1231, + "step": 5240 + }, + { + "epoch": 0.5333739713501982, + "grad_norm": 3.671875, + "learning_rate": 2.2757985017726557e-05, + "loss": 0.1939, + "step": 5250 + }, + { + "epoch": 0.5343899217718175, + "grad_norm": 1.9765625, + "learning_rate": 2.2677710097364495e-05, + "loss": 0.168, + "step": 5260 + }, + { + "epoch": 0.5354058721934369, + "grad_norm": 2.609375, + "learning_rate": 2.259745932217969e-05, + "loss": 0.1883, + "step": 5270 + }, + { + "epoch": 0.5364218226150563, + "grad_norm": 2.8125, + "learning_rate": 2.2517233526550817e-05, + "loss": 0.1898, + "step": 5280 + }, + { + "epoch": 0.5374377730366758, + "grad_norm": 3.125, + "learning_rate": 2.2437033544596837e-05, + "loss": 0.1838, + "step": 5290 + }, + { + "epoch": 0.5384537234582952, + "grad_norm": 4.90625, + "learning_rate": 2.2356860210168336e-05, + "loss": 0.1553, + "step": 5300 + }, + { + "epoch": 0.5394696738799146, + "grad_norm": 3.171875, + "learning_rate": 2.2276714356838824e-05, + "loss": 0.2248, + "step": 5310 + }, + { + "epoch": 0.5404856243015341, + "grad_norm": 1.34375, + "learning_rate": 2.2196596817896118e-05, + "loss": 0.1421, + "step": 5320 + }, + { + "epoch": 0.5415015747231535, + "grad_norm": 3.28125, + "learning_rate": 2.2116508426333596e-05, + "loss": 0.1947, + "step": 5330 + }, + { + "epoch": 0.5425175251447729, + "grad_norm": 1.9296875, + "learning_rate": 2.2036450014841652e-05, + "loss": 0.2207, + "step": 5340 + }, + { + "epoch": 0.5435334755663923, + "grad_norm": 0.5703125, + "learning_rate": 2.19564224157989e-05, + "loss": 0.2208, + "step": 5350 + }, + { + "epoch": 0.5445494259880118, + "grad_norm": 7.5625, + "learning_rate": 2.1876426461263654e-05, + "loss": 0.1739, + "step": 5360 + }, + { + "epoch": 0.5455653764096312, + "grad_norm": 2.15625, + "learning_rate": 2.179646298296519e-05, + "loss": 0.1938, + "step": 5370 + }, + { + "epoch": 0.5465813268312506, + "grad_norm": 4.1875, + "learning_rate": 2.171653281229511e-05, + "loss": 0.1736, + "step": 5380 + }, + { + "epoch": 0.5475972772528701, + "grad_norm": 4.65625, + "learning_rate": 2.1636636780298732e-05, + "loss": 0.2167, + "step": 5390 + }, + { + "epoch": 0.5486132276744895, + "grad_norm": 1.84375, + "learning_rate": 2.1556775717666427e-05, + "loss": 0.1711, + "step": 5400 + }, + { + "epoch": 0.5496291780961089, + "grad_norm": 5.125, + "learning_rate": 2.147695045472499e-05, + "loss": 0.1789, + "step": 5410 + }, + { + "epoch": 0.5506451285177283, + "grad_norm": 3.859375, + "learning_rate": 2.1397161821428973e-05, + "loss": 0.2187, + "step": 5420 + }, + { + "epoch": 0.5516610789393478, + "grad_norm": 2.25, + "learning_rate": 2.131741064735212e-05, + "loss": 0.1367, + "step": 5430 + }, + { + "epoch": 0.5526770293609672, + "grad_norm": 4.65625, + "learning_rate": 2.1237697761678684e-05, + "loss": 0.1574, + "step": 5440 + }, + { + "epoch": 0.5536929797825866, + "grad_norm": 1.2265625, + "learning_rate": 2.1158023993194848e-05, + "loss": 0.1301, + "step": 5450 + }, + { + "epoch": 0.5547089302042061, + "grad_norm": 4.21875, + "learning_rate": 2.107839017028005e-05, + "loss": 0.2782, + "step": 5460 + }, + { + "epoch": 0.5557248806258255, + "grad_norm": 0.52734375, + "learning_rate": 2.0998797120898457e-05, + "loss": 0.2024, + "step": 5470 + }, + { + "epoch": 0.5567408310474449, + "grad_norm": 1.46875, + "learning_rate": 2.0919245672590277e-05, + "loss": 0.1755, + "step": 5480 + }, + { + "epoch": 0.5577567814690643, + "grad_norm": 2.140625, + "learning_rate": 2.083973665246318e-05, + "loss": 0.2058, + "step": 5490 + }, + { + "epoch": 0.5587727318906838, + "grad_norm": 1.5390625, + "learning_rate": 2.076027088718373e-05, + "loss": 0.2159, + "step": 5500 + } + ], + "logging_steps": 10, + "max_steps": 9843, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoints/checkpoint-5500/training_args.bin b/checkpoints/checkpoint-5500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bb0a4ec10ab29ca1942aecdfa4212d352e373f1 --- /dev/null +++ b/checkpoints/checkpoint-5500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 +size 4984 diff --git a/checkpoints/checkpoint-6000/adapter_config.json b/checkpoints/checkpoint-6000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70 --- /dev/null +++ b/checkpoints/checkpoint-6000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "gaussian", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoints/checkpoint-6000/adapter_model.safetensors b/checkpoints/checkpoint-6000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4952b4876cdc8d06ed1f82e75e7acfb784d7c302 --- /dev/null +++ b/checkpoints/checkpoint-6000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69f4641ab691c0b74c8e8075b2ec23906859a6fe6d1279160c1d6f7281d2611e +size 5919456 diff --git a/checkpoints/checkpoint-6000/optimizer.pt b/checkpoints/checkpoint-6000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a6d428db6bdf0026167613f3f78e4cbb391cf2e --- /dev/null +++ b/checkpoints/checkpoint-6000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ccafd81d481629f8476e5ad9a763e445cfca7e7ec2aba5cffe2ed9ccacdb684 +size 11930938 diff --git a/checkpoints/checkpoint-6000/rng_state_0.pth b/checkpoints/checkpoint-6000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..856f6e848cfe511c4d3682477fc96768fa71e17b --- /dev/null +++ b/checkpoints/checkpoint-6000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf4e27664b653acfc21de70ae172ec0726ec640b898e117c9e038d403049764b +size 15024 diff --git a/checkpoints/checkpoint-6000/rng_state_1.pth b/checkpoints/checkpoint-6000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..91439d971c1906d490e43d5ab3edec52712a346c --- /dev/null +++ b/checkpoints/checkpoint-6000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee19749a64079934bb3c47b0f06798649c23ef5b2d28a94a6161759a9b11f5ae +size 15024 diff --git a/checkpoints/checkpoint-6000/rng_state_2.pth b/checkpoints/checkpoint-6000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..6426756daceb7e8531a5a3124f838933e0855709 --- /dev/null +++ b/checkpoints/checkpoint-6000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1eb0b946551fdb70140918364d81243b8abd5ef8ab1df8ba0a040cd91d240e5 +size 15024 diff --git a/checkpoints/checkpoint-6000/rng_state_3.pth b/checkpoints/checkpoint-6000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..8c11f16945bb9db73e4cb93bc2cc5c8ebd465457 --- /dev/null +++ b/checkpoints/checkpoint-6000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5684284bf0bd122be00905a7d1688bf38e87d37239ed219e780f5b8d8f5f3eb +size 15024 diff --git a/checkpoints/checkpoint-6000/scheduler.pt b/checkpoints/checkpoint-6000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e228e34327b45950a62a960d14420ebe108d897 --- /dev/null +++ b/checkpoints/checkpoint-6000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:202173950488d1b3f4ccd086089dbc7f02c295eb11ea39c10e13fac52ece8f3a +size 1064 diff --git a/checkpoints/checkpoint-6000/trainer_state.json b/checkpoints/checkpoint-6000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8bfa9cc79f70409d1c63f3a37aa77adcf038e581 --- /dev/null +++ b/checkpoints/checkpoint-6000/trainer_state.json @@ -0,0 +1,4221 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.609570252971655, + "eval_steps": 500, + "global_step": 6000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001015950421619425, + "grad_norm": 17.625, + "learning_rate": 5e-06, + "loss": 3.4264, + "step": 10 + }, + { + "epoch": 0.00203190084323885, + "grad_norm": 12.5625, + "learning_rate": 1e-05, + "loss": 3.432, + "step": 20 + }, + { + "epoch": 0.003047851264858275, + "grad_norm": 14.0625, + "learning_rate": 1.5e-05, + "loss": 3.23, + "step": 30 + }, + { + "epoch": 0.0040638016864777, + "grad_norm": 12.4375, + "learning_rate": 2e-05, + "loss": 2.9762, + "step": 40 + }, + { + "epoch": 0.005079752108097125, + "grad_norm": 10.0625, + "learning_rate": 2.5e-05, + "loss": 2.6173, + "step": 50 + }, + { + "epoch": 0.00609570252971655, + "grad_norm": 10.1875, + "learning_rate": 3e-05, + "loss": 2.2004, + "step": 60 + }, + { + "epoch": 0.007111652951335975, + "grad_norm": 7.03125, + "learning_rate": 3.5e-05, + "loss": 1.4176, + "step": 70 + }, + { + "epoch": 0.0081276033729554, + "grad_norm": 4.375, + "learning_rate": 4e-05, + "loss": 1.0122, + "step": 80 + }, + { + "epoch": 0.009143553794574825, + "grad_norm": 6.5625, + "learning_rate": 4.5e-05, + "loss": 0.9116, + "step": 90 + }, + { + "epoch": 0.01015950421619425, + "grad_norm": 5.28125, + "learning_rate": 5e-05, + "loss": 0.6832, + "step": 100 + }, + { + "epoch": 0.011175454637813675, + "grad_norm": 5.5, + "learning_rate": 4.9999870035728426e-05, + "loss": 0.7355, + "step": 110 + }, + { + "epoch": 0.0121914050594331, + "grad_norm": 5.1875, + "learning_rate": 4.9999480144264944e-05, + "loss": 0.6673, + "step": 120 + }, + { + "epoch": 0.013207355481052525, + "grad_norm": 4.5, + "learning_rate": 4.9998830329663314e-05, + "loss": 0.6792, + "step": 130 + }, + { + "epoch": 0.01422330590267195, + "grad_norm": 3.9375, + "learning_rate": 4.9997920598679756e-05, + "loss": 0.6207, + "step": 140 + }, + { + "epoch": 0.015239256324291375, + "grad_norm": 3.15625, + "learning_rate": 4.999675096077286e-05, + "loss": 0.483, + "step": 150 + }, + { + "epoch": 0.0162552067459108, + "grad_norm": 5.28125, + "learning_rate": 4.999532142810354e-05, + "loss": 0.5319, + "step": 160 + }, + { + "epoch": 0.017271157167530225, + "grad_norm": 4.59375, + "learning_rate": 4.999363201553483e-05, + "loss": 0.6052, + "step": 170 + }, + { + "epoch": 0.01828710758914965, + "grad_norm": 5.03125, + "learning_rate": 4.9991682740631794e-05, + "loss": 0.4258, + "step": 180 + }, + { + "epoch": 0.019303058010769075, + "grad_norm": 3.859375, + "learning_rate": 4.998947362366133e-05, + "loss": 0.4309, + "step": 190 + }, + { + "epoch": 0.0203190084323885, + "grad_norm": 3.328125, + "learning_rate": 4.998700468759193e-05, + "loss": 0.3957, + "step": 200 + }, + { + "epoch": 0.021334958854007924, + "grad_norm": 4.9375, + "learning_rate": 4.9984275958093475e-05, + "loss": 0.4777, + "step": 210 + }, + { + "epoch": 0.02235090927562735, + "grad_norm": 4.78125, + "learning_rate": 4.998128746353695e-05, + "loss": 0.3549, + "step": 220 + }, + { + "epoch": 0.023366859697246774, + "grad_norm": 4.0625, + "learning_rate": 4.997803923499417e-05, + "loss": 0.4447, + "step": 230 + }, + { + "epoch": 0.0243828101188662, + "grad_norm": 6.375, + "learning_rate": 4.99745313062374e-05, + "loss": 0.3808, + "step": 240 + }, + { + "epoch": 0.025398760540485624, + "grad_norm": 3.59375, + "learning_rate": 4.99707637137391e-05, + "loss": 0.3827, + "step": 250 + }, + { + "epoch": 0.02641471096210505, + "grad_norm": 3.015625, + "learning_rate": 4.996673649667145e-05, + "loss": 0.3694, + "step": 260 + }, + { + "epoch": 0.027430661383724474, + "grad_norm": 2.296875, + "learning_rate": 4.9962449696906e-05, + "loss": 0.3586, + "step": 270 + }, + { + "epoch": 0.0284466118053439, + "grad_norm": 4.125, + "learning_rate": 4.9957903359013214e-05, + "loss": 0.3832, + "step": 280 + }, + { + "epoch": 0.029462562226963324, + "grad_norm": 3.296875, + "learning_rate": 4.995309753026201e-05, + "loss": 0.328, + "step": 290 + }, + { + "epoch": 0.03047851264858275, + "grad_norm": 4.5, + "learning_rate": 4.994803226061927e-05, + "loss": 0.3667, + "step": 300 + }, + { + "epoch": 0.03149446307020217, + "grad_norm": 4.3125, + "learning_rate": 4.994270760274933e-05, + "loss": 0.3811, + "step": 310 + }, + { + "epoch": 0.0325104134918216, + "grad_norm": 3.421875, + "learning_rate": 4.99371236120134e-05, + "loss": 0.3065, + "step": 320 + }, + { + "epoch": 0.03352636391344102, + "grad_norm": 4.6875, + "learning_rate": 4.993128034646902e-05, + "loss": 0.4177, + "step": 330 + }, + { + "epoch": 0.03454231433506045, + "grad_norm": 3.046875, + "learning_rate": 4.992517786686947e-05, + "loss": 0.33, + "step": 340 + }, + { + "epoch": 0.03555826475667987, + "grad_norm": 1.8828125, + "learning_rate": 4.9918816236663077e-05, + "loss": 0.3287, + "step": 350 + }, + { + "epoch": 0.0365742151782993, + "grad_norm": 3.8125, + "learning_rate": 4.991219552199262e-05, + "loss": 0.2934, + "step": 360 + }, + { + "epoch": 0.03759016559991872, + "grad_norm": 4.28125, + "learning_rate": 4.99053157916946e-05, + "loss": 0.3176, + "step": 370 + }, + { + "epoch": 0.03860611602153815, + "grad_norm": 2.609375, + "learning_rate": 4.989817711729856e-05, + "loss": 0.3318, + "step": 380 + }, + { + "epoch": 0.03962206644315757, + "grad_norm": 2.375, + "learning_rate": 4.98907795730263e-05, + "loss": 0.3234, + "step": 390 + }, + { + "epoch": 0.040638016864777, + "grad_norm": 4.46875, + "learning_rate": 4.988312323579114e-05, + "loss": 0.267, + "step": 400 + }, + { + "epoch": 0.04165396728639642, + "grad_norm": 3.75, + "learning_rate": 4.98752081851971e-05, + "loss": 0.3081, + "step": 410 + }, + { + "epoch": 0.04266991770801585, + "grad_norm": 2.203125, + "learning_rate": 4.986703450353809e-05, + "loss": 0.2917, + "step": 420 + }, + { + "epoch": 0.04368586812963527, + "grad_norm": 1.6015625, + "learning_rate": 4.985860227579703e-05, + "loss": 0.2805, + "step": 430 + }, + { + "epoch": 0.0447018185512547, + "grad_norm": 3.140625, + "learning_rate": 4.984991158964499e-05, + "loss": 0.3534, + "step": 440 + }, + { + "epoch": 0.04571776897287412, + "grad_norm": 3.296875, + "learning_rate": 4.9840962535440265e-05, + "loss": 0.335, + "step": 450 + }, + { + "epoch": 0.04673371939449355, + "grad_norm": 3.25, + "learning_rate": 4.983175520622744e-05, + "loss": 0.2544, + "step": 460 + }, + { + "epoch": 0.04774966981611297, + "grad_norm": 2.25, + "learning_rate": 4.982228969773642e-05, + "loss": 0.3449, + "step": 470 + }, + { + "epoch": 0.0487656202377324, + "grad_norm": 4.9375, + "learning_rate": 4.9812566108381435e-05, + "loss": 0.2964, + "step": 480 + }, + { + "epoch": 0.04978157065935182, + "grad_norm": 1.5703125, + "learning_rate": 4.9802584539260035e-05, + "loss": 0.2799, + "step": 490 + }, + { + "epoch": 0.05079752108097125, + "grad_norm": 2.828125, + "learning_rate": 4.979234509415199e-05, + "loss": 0.3231, + "step": 500 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 2.9375, + "learning_rate": 4.978184787951828e-05, + "loss": 0.2943, + "step": 510 + }, + { + "epoch": 0.0528294219242101, + "grad_norm": 2.34375, + "learning_rate": 4.977109300449992e-05, + "loss": 0.2705, + "step": 520 + }, + { + "epoch": 0.05384537234582952, + "grad_norm": 3.140625, + "learning_rate": 4.9760080580916876e-05, + "loss": 0.2998, + "step": 530 + }, + { + "epoch": 0.05486132276744895, + "grad_norm": 3.5625, + "learning_rate": 4.974881072326688e-05, + "loss": 0.2595, + "step": 540 + }, + { + "epoch": 0.05587727318906837, + "grad_norm": 4.25, + "learning_rate": 4.9737283548724236e-05, + "loss": 0.2803, + "step": 550 + }, + { + "epoch": 0.0568932236106878, + "grad_norm": 4.0625, + "learning_rate": 4.97254991771386e-05, + "loss": 0.3511, + "step": 560 + }, + { + "epoch": 0.05790917403230722, + "grad_norm": 2.515625, + "learning_rate": 4.971345773103377e-05, + "loss": 0.312, + "step": 570 + }, + { + "epoch": 0.05892512445392665, + "grad_norm": 3.21875, + "learning_rate": 4.9701159335606365e-05, + "loss": 0.2482, + "step": 580 + }, + { + "epoch": 0.05994107487554607, + "grad_norm": 5.5, + "learning_rate": 4.968860411872454e-05, + "loss": 0.2537, + "step": 590 + }, + { + "epoch": 0.0609570252971655, + "grad_norm": 3.546875, + "learning_rate": 4.967579221092666e-05, + "loss": 0.3125, + "step": 600 + }, + { + "epoch": 0.06197297571878492, + "grad_norm": 2.984375, + "learning_rate": 4.966272374541996e-05, + "loss": 0.2354, + "step": 610 + }, + { + "epoch": 0.06298892614040434, + "grad_norm": 3.6875, + "learning_rate": 4.964939885807912e-05, + "loss": 0.3213, + "step": 620 + }, + { + "epoch": 0.06400487656202378, + "grad_norm": 2.140625, + "learning_rate": 4.9635817687444876e-05, + "loss": 0.3003, + "step": 630 + }, + { + "epoch": 0.0650208269836432, + "grad_norm": 3.484375, + "learning_rate": 4.962198037472259e-05, + "loss": 0.2996, + "step": 640 + }, + { + "epoch": 0.06603677740526262, + "grad_norm": 3.21875, + "learning_rate": 4.9607887063780776e-05, + "loss": 0.2257, + "step": 650 + }, + { + "epoch": 0.06705272782688204, + "grad_norm": 5.375, + "learning_rate": 4.9593537901149564e-05, + "loss": 0.223, + "step": 660 + }, + { + "epoch": 0.06806867824850148, + "grad_norm": 4.1875, + "learning_rate": 4.957893303601924e-05, + "loss": 0.3407, + "step": 670 + }, + { + "epoch": 0.0690846286701209, + "grad_norm": 3.328125, + "learning_rate": 4.956407262023866e-05, + "loss": 0.2589, + "step": 680 + }, + { + "epoch": 0.07010057909174032, + "grad_norm": 2.953125, + "learning_rate": 4.954895680831367e-05, + "loss": 0.2949, + "step": 690 + }, + { + "epoch": 0.07111652951335974, + "grad_norm": 4.0625, + "learning_rate": 4.9533585757405506e-05, + "loss": 0.2995, + "step": 700 + }, + { + "epoch": 0.07213247993497918, + "grad_norm": 4.625, + "learning_rate": 4.951795962732917e-05, + "loss": 0.2894, + "step": 710 + }, + { + "epoch": 0.0731484303565986, + "grad_norm": 3.0, + "learning_rate": 4.9502078580551755e-05, + "loss": 0.3082, + "step": 720 + }, + { + "epoch": 0.07416438077821802, + "grad_norm": 3.65625, + "learning_rate": 4.9485942782190734e-05, + "loss": 0.2308, + "step": 730 + }, + { + "epoch": 0.07518033119983744, + "grad_norm": 4.78125, + "learning_rate": 4.9469552400012306e-05, + "loss": 0.2272, + "step": 740 + }, + { + "epoch": 0.07619628162145688, + "grad_norm": 4.25, + "learning_rate": 4.94529076044296e-05, + "loss": 0.2701, + "step": 750 + }, + { + "epoch": 0.0772122320430763, + "grad_norm": 3.140625, + "learning_rate": 4.94360085685009e-05, + "loss": 0.2686, + "step": 760 + }, + { + "epoch": 0.07822818246469572, + "grad_norm": 0.765625, + "learning_rate": 4.9418855467927894e-05, + "loss": 0.2051, + "step": 770 + }, + { + "epoch": 0.07924413288631514, + "grad_norm": 1.796875, + "learning_rate": 4.940144848105379e-05, + "loss": 0.2267, + "step": 780 + }, + { + "epoch": 0.08026008330793458, + "grad_norm": 4.5625, + "learning_rate": 4.93837877888615e-05, + "loss": 0.2597, + "step": 790 + }, + { + "epoch": 0.081276033729554, + "grad_norm": 3.03125, + "learning_rate": 4.9365873574971745e-05, + "loss": 0.3701, + "step": 800 + }, + { + "epoch": 0.08229198415117342, + "grad_norm": 4.5625, + "learning_rate": 4.9347706025641136e-05, + "loss": 0.2559, + "step": 810 + }, + { + "epoch": 0.08330793457279284, + "grad_norm": 3.90625, + "learning_rate": 4.9329285329760275e-05, + "loss": 0.2799, + "step": 820 + }, + { + "epoch": 0.08432388499441228, + "grad_norm": 3.140625, + "learning_rate": 4.9310611678851735e-05, + "loss": 0.2866, + "step": 830 + }, + { + "epoch": 0.0853398354160317, + "grad_norm": 2.46875, + "learning_rate": 4.929168526706811e-05, + "loss": 0.3105, + "step": 840 + }, + { + "epoch": 0.08635578583765112, + "grad_norm": 13.625, + "learning_rate": 4.927250629119e-05, + "loss": 0.2454, + "step": 850 + }, + { + "epoch": 0.08737173625927054, + "grad_norm": 3.921875, + "learning_rate": 4.9253074950623925e-05, + "loss": 0.2424, + "step": 860 + }, + { + "epoch": 0.08838768668088998, + "grad_norm": 2.90625, + "learning_rate": 4.9233391447400286e-05, + "loss": 0.2481, + "step": 870 + }, + { + "epoch": 0.0894036371025094, + "grad_norm": 2.96875, + "learning_rate": 4.921345598617125e-05, + "loss": 0.2231, + "step": 880 + }, + { + "epoch": 0.09041958752412882, + "grad_norm": 5.375, + "learning_rate": 4.9193268774208654e-05, + "loss": 0.3447, + "step": 890 + }, + { + "epoch": 0.09143553794574824, + "grad_norm": 2.0, + "learning_rate": 4.9172830021401785e-05, + "loss": 0.229, + "step": 900 + }, + { + "epoch": 0.09245148836736768, + "grad_norm": 3.1875, + "learning_rate": 4.9152139940255245e-05, + "loss": 0.2122, + "step": 910 + }, + { + "epoch": 0.0934674387889871, + "grad_norm": 3.40625, + "learning_rate": 4.913119874588677e-05, + "loss": 0.2386, + "step": 920 + }, + { + "epoch": 0.09448338921060652, + "grad_norm": 1.4609375, + "learning_rate": 4.911000665602489e-05, + "loss": 0.1944, + "step": 930 + }, + { + "epoch": 0.09549933963222594, + "grad_norm": 5.0625, + "learning_rate": 4.9088563891006786e-05, + "loss": 0.2038, + "step": 940 + }, + { + "epoch": 0.09651529005384538, + "grad_norm": 4.53125, + "learning_rate": 4.906687067377592e-05, + "loss": 0.3122, + "step": 950 + }, + { + "epoch": 0.0975312404754648, + "grad_norm": 2.84375, + "learning_rate": 4.904492722987976e-05, + "loss": 0.3157, + "step": 960 + }, + { + "epoch": 0.09854719089708422, + "grad_norm": 2.171875, + "learning_rate": 4.902273378746738e-05, + "loss": 0.3077, + "step": 970 + }, + { + "epoch": 0.09956314131870364, + "grad_norm": 2.84375, + "learning_rate": 4.9000290577287165e-05, + "loss": 0.2756, + "step": 980 + }, + { + "epoch": 0.10057909174032308, + "grad_norm": 0.99609375, + "learning_rate": 4.897759783268434e-05, + "loss": 0.2915, + "step": 990 + }, + { + "epoch": 0.1015950421619425, + "grad_norm": 3.53125, + "learning_rate": 4.895465578959859e-05, + "loss": 0.2052, + "step": 1000 + }, + { + "epoch": 0.10261099258356192, + "grad_norm": 4.0, + "learning_rate": 4.893146468656159e-05, + "loss": 0.2499, + "step": 1010 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 1.65625, + "learning_rate": 4.890802476469452e-05, + "loss": 0.278, + "step": 1020 + }, + { + "epoch": 0.10464289342680078, + "grad_norm": 3.625, + "learning_rate": 4.888433626770558e-05, + "loss": 0.2143, + "step": 1030 + }, + { + "epoch": 0.1056588438484202, + "grad_norm": 5.0625, + "learning_rate": 4.886039944188741e-05, + "loss": 0.2878, + "step": 1040 + }, + { + "epoch": 0.10667479427003962, + "grad_norm": 4.5, + "learning_rate": 4.883621453611461e-05, + "loss": 0.2744, + "step": 1050 + }, + { + "epoch": 0.10769074469165904, + "grad_norm": 4.5625, + "learning_rate": 4.881178180184106e-05, + "loss": 0.2734, + "step": 1060 + }, + { + "epoch": 0.10870669511327848, + "grad_norm": 3.125, + "learning_rate": 4.878710149309735e-05, + "loss": 0.3574, + "step": 1070 + }, + { + "epoch": 0.1097226455348979, + "grad_norm": 3.0625, + "learning_rate": 4.876217386648816e-05, + "loss": 0.2625, + "step": 1080 + }, + { + "epoch": 0.11073859595651732, + "grad_norm": 4.0625, + "learning_rate": 4.873699918118955e-05, + "loss": 0.2437, + "step": 1090 + }, + { + "epoch": 0.11175454637813674, + "grad_norm": 1.59375, + "learning_rate": 4.87115776989463e-05, + "loss": 0.2051, + "step": 1100 + }, + { + "epoch": 0.11277049679975618, + "grad_norm": 4.375, + "learning_rate": 4.8685909684069153e-05, + "loss": 0.1727, + "step": 1110 + }, + { + "epoch": 0.1137864472213756, + "grad_norm": 2.28125, + "learning_rate": 4.865999540343211e-05, + "loss": 0.2256, + "step": 1120 + }, + { + "epoch": 0.11480239764299502, + "grad_norm": 2.265625, + "learning_rate": 4.86338351264696e-05, + "loss": 0.3529, + "step": 1130 + }, + { + "epoch": 0.11581834806461444, + "grad_norm": 2.34375, + "learning_rate": 4.8607429125173754e-05, + "loss": 0.2113, + "step": 1140 + }, + { + "epoch": 0.11683429848623388, + "grad_norm": 0.7578125, + "learning_rate": 4.858077767409149e-05, + "loss": 0.2759, + "step": 1150 + }, + { + "epoch": 0.1178502489078533, + "grad_norm": 3.640625, + "learning_rate": 4.855388105032174e-05, + "loss": 0.2482, + "step": 1160 + }, + { + "epoch": 0.11886619932947272, + "grad_norm": 3.5, + "learning_rate": 4.852673953351249e-05, + "loss": 0.1865, + "step": 1170 + }, + { + "epoch": 0.11988214975109214, + "grad_norm": 3.75, + "learning_rate": 4.849935340585796e-05, + "loss": 0.2659, + "step": 1180 + }, + { + "epoch": 0.12089810017271158, + "grad_norm": 3.375, + "learning_rate": 4.8471722952095586e-05, + "loss": 0.1506, + "step": 1190 + }, + { + "epoch": 0.121914050594331, + "grad_norm": 3.34375, + "learning_rate": 4.844384845950312e-05, + "loss": 0.307, + "step": 1200 + }, + { + "epoch": 0.12293000101595042, + "grad_norm": 1.578125, + "learning_rate": 4.841573021789561e-05, + "loss": 0.1952, + "step": 1210 + }, + { + "epoch": 0.12394595143756984, + "grad_norm": 1.2890625, + "learning_rate": 4.838736851962239e-05, + "loss": 0.1779, + "step": 1220 + }, + { + "epoch": 0.12496190185918928, + "grad_norm": 1.265625, + "learning_rate": 4.835876365956408e-05, + "loss": 0.1235, + "step": 1230 + }, + { + "epoch": 0.12597785228080868, + "grad_norm": 1.9609375, + "learning_rate": 4.8329915935129436e-05, + "loss": 0.1876, + "step": 1240 + }, + { + "epoch": 0.12699380270242813, + "grad_norm": 1.6328125, + "learning_rate": 4.830082564625235e-05, + "loss": 0.2188, + "step": 1250 + }, + { + "epoch": 0.12800975312404755, + "grad_norm": 3.96875, + "learning_rate": 4.8271493095388684e-05, + "loss": 0.2622, + "step": 1260 + }, + { + "epoch": 0.12902570354566698, + "grad_norm": 3.765625, + "learning_rate": 4.824191858751312e-05, + "loss": 0.2724, + "step": 1270 + }, + { + "epoch": 0.1300416539672864, + "grad_norm": 5.59375, + "learning_rate": 4.821210243011601e-05, + "loss": 0.2413, + "step": 1280 + }, + { + "epoch": 0.13105760438890582, + "grad_norm": 3.34375, + "learning_rate": 4.818204493320016e-05, + "loss": 0.2618, + "step": 1290 + }, + { + "epoch": 0.13207355481052524, + "grad_norm": 2.78125, + "learning_rate": 4.8151746409277634e-05, + "loss": 0.2295, + "step": 1300 + }, + { + "epoch": 0.13308950523214466, + "grad_norm": 3.1875, + "learning_rate": 4.8121207173366484e-05, + "loss": 0.2733, + "step": 1310 + }, + { + "epoch": 0.13410545565376408, + "grad_norm": 2.28125, + "learning_rate": 4.809042754298746e-05, + "loss": 0.2311, + "step": 1320 + }, + { + "epoch": 0.13512140607538353, + "grad_norm": 2.171875, + "learning_rate": 4.805940783816075e-05, + "loss": 0.2059, + "step": 1330 + }, + { + "epoch": 0.13613735649700295, + "grad_norm": 2.796875, + "learning_rate": 4.8028148381402625e-05, + "loss": 0.2102, + "step": 1340 + }, + { + "epoch": 0.13715330691862238, + "grad_norm": 2.96875, + "learning_rate": 4.7996649497722084e-05, + "loss": 0.2708, + "step": 1350 + }, + { + "epoch": 0.1381692573402418, + "grad_norm": 2.4375, + "learning_rate": 4.7964911514617485e-05, + "loss": 0.2429, + "step": 1360 + }, + { + "epoch": 0.13918520776186122, + "grad_norm": 5.8125, + "learning_rate": 4.793293476207312e-05, + "loss": 0.2725, + "step": 1370 + }, + { + "epoch": 0.14020115818348064, + "grad_norm": 2.40625, + "learning_rate": 4.790071957255585e-05, + "loss": 0.2098, + "step": 1380 + }, + { + "epoch": 0.14121710860510006, + "grad_norm": 4.25, + "learning_rate": 4.786826628101154e-05, + "loss": 0.2101, + "step": 1390 + }, + { + "epoch": 0.14223305902671948, + "grad_norm": 2.578125, + "learning_rate": 4.783557522486167e-05, + "loss": 0.2624, + "step": 1400 + }, + { + "epoch": 0.14324900944833893, + "grad_norm": 3.125, + "learning_rate": 4.780264674399978e-05, + "loss": 0.2518, + "step": 1410 + }, + { + "epoch": 0.14426495986995835, + "grad_norm": 3.671875, + "learning_rate": 4.7769481180787966e-05, + "loss": 0.3112, + "step": 1420 + }, + { + "epoch": 0.14528091029157778, + "grad_norm": 3.984375, + "learning_rate": 4.773607888005327e-05, + "loss": 0.2747, + "step": 1430 + }, + { + "epoch": 0.1462968607131972, + "grad_norm": 3.234375, + "learning_rate": 4.770244018908416e-05, + "loss": 0.1572, + "step": 1440 + }, + { + "epoch": 0.14731281113481662, + "grad_norm": 4.09375, + "learning_rate": 4.766856545762687e-05, + "loss": 0.2148, + "step": 1450 + }, + { + "epoch": 0.14832876155643604, + "grad_norm": 1.6875, + "learning_rate": 4.763445503788178e-05, + "loss": 0.2531, + "step": 1460 + }, + { + "epoch": 0.14934471197805546, + "grad_norm": 2.375, + "learning_rate": 4.760010928449976e-05, + "loss": 0.199, + "step": 1470 + }, + { + "epoch": 0.15036066239967488, + "grad_norm": 4.6875, + "learning_rate": 4.7565528554578485e-05, + "loss": 0.2366, + "step": 1480 + }, + { + "epoch": 0.15137661282129433, + "grad_norm": 5.4375, + "learning_rate": 4.75307132076587e-05, + "loss": 0.1862, + "step": 1490 + }, + { + "epoch": 0.15239256324291375, + "grad_norm": 2.484375, + "learning_rate": 4.749566360572049e-05, + "loss": 0.2143, + "step": 1500 + }, + { + "epoch": 0.15340851366453317, + "grad_norm": 2.1875, + "learning_rate": 4.746038011317955e-05, + "loss": 0.1877, + "step": 1510 + }, + { + "epoch": 0.1544244640861526, + "grad_norm": 2.84375, + "learning_rate": 4.742486309688333e-05, + "loss": 0.2831, + "step": 1520 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 2.015625, + "learning_rate": 4.738911292610732e-05, + "loss": 0.1708, + "step": 1530 + }, + { + "epoch": 0.15645636492939144, + "grad_norm": 3.953125, + "learning_rate": 4.735312997255107e-05, + "loss": 0.192, + "step": 1540 + }, + { + "epoch": 0.15747231535101086, + "grad_norm": 2.09375, + "learning_rate": 4.7316914610334475e-05, + "loss": 0.2586, + "step": 1550 + }, + { + "epoch": 0.15848826577263028, + "grad_norm": 3.6875, + "learning_rate": 4.728046721599378e-05, + "loss": 0.2141, + "step": 1560 + }, + { + "epoch": 0.15950421619424973, + "grad_norm": 2.9375, + "learning_rate": 4.724378816847771e-05, + "loss": 0.193, + "step": 1570 + }, + { + "epoch": 0.16052016661586915, + "grad_norm": 1.5625, + "learning_rate": 4.720687784914352e-05, + "loss": 0.191, + "step": 1580 + }, + { + "epoch": 0.16153611703748857, + "grad_norm": 3.75, + "learning_rate": 4.716973664175304e-05, + "loss": 0.2172, + "step": 1590 + }, + { + "epoch": 0.162552067459108, + "grad_norm": 3.125, + "learning_rate": 4.7132364932468645e-05, + "loss": 0.2134, + "step": 1600 + }, + { + "epoch": 0.16356801788072742, + "grad_norm": 4.09375, + "learning_rate": 4.709476310984932e-05, + "loss": 0.2055, + "step": 1610 + }, + { + "epoch": 0.16458396830234684, + "grad_norm": 3.875, + "learning_rate": 4.705693156484652e-05, + "loss": 0.2136, + "step": 1620 + }, + { + "epoch": 0.16559991872396626, + "grad_norm": 1.1796875, + "learning_rate": 4.7018870690800196e-05, + "loss": 0.1471, + "step": 1630 + }, + { + "epoch": 0.16661586914558568, + "grad_norm": 2.5, + "learning_rate": 4.698058088343465e-05, + "loss": 0.2308, + "step": 1640 + }, + { + "epoch": 0.16763181956720513, + "grad_norm": 1.390625, + "learning_rate": 4.6942062540854425e-05, + "loss": 0.2456, + "step": 1650 + }, + { + "epoch": 0.16864776998882455, + "grad_norm": 3.125, + "learning_rate": 4.69033160635402e-05, + "loss": 0.2654, + "step": 1660 + }, + { + "epoch": 0.16966372041044397, + "grad_norm": 3.984375, + "learning_rate": 4.6864341854344587e-05, + "loss": 0.2226, + "step": 1670 + }, + { + "epoch": 0.1706796708320634, + "grad_norm": 2.328125, + "learning_rate": 4.682514031848795e-05, + "loss": 0.2438, + "step": 1680 + }, + { + "epoch": 0.17169562125368282, + "grad_norm": 3.078125, + "learning_rate": 4.678571186355423e-05, + "loss": 0.1889, + "step": 1690 + }, + { + "epoch": 0.17271157167530224, + "grad_norm": 3.328125, + "learning_rate": 4.6746056899486644e-05, + "loss": 0.2117, + "step": 1700 + }, + { + "epoch": 0.17372752209692166, + "grad_norm": 2.78125, + "learning_rate": 4.67061758385835e-05, + "loss": 0.1953, + "step": 1710 + }, + { + "epoch": 0.17474347251854108, + "grad_norm": 3.09375, + "learning_rate": 4.6666069095493816e-05, + "loss": 0.1844, + "step": 1720 + }, + { + "epoch": 0.17575942294016053, + "grad_norm": 3.234375, + "learning_rate": 4.662573708721309e-05, + "loss": 0.2774, + "step": 1730 + }, + { + "epoch": 0.17677537336177995, + "grad_norm": 4.03125, + "learning_rate": 4.658518023307894e-05, + "loss": 0.2527, + "step": 1740 + }, + { + "epoch": 0.17779132378339937, + "grad_norm": 3.21875, + "learning_rate": 4.654439895476671e-05, + "loss": 0.2164, + "step": 1750 + }, + { + "epoch": 0.1788072742050188, + "grad_norm": 2.390625, + "learning_rate": 4.6503393676285146e-05, + "loss": 0.2424, + "step": 1760 + }, + { + "epoch": 0.17982322462663822, + "grad_norm": 1.8359375, + "learning_rate": 4.646216482397192e-05, + "loss": 0.2428, + "step": 1770 + }, + { + "epoch": 0.18083917504825764, + "grad_norm": 2.796875, + "learning_rate": 4.6420712826489275e-05, + "loss": 0.2155, + "step": 1780 + }, + { + "epoch": 0.18185512546987706, + "grad_norm": 0.69921875, + "learning_rate": 4.6379038114819485e-05, + "loss": 0.1544, + "step": 1790 + }, + { + "epoch": 0.18287107589149648, + "grad_norm": 3.40625, + "learning_rate": 4.6337141122260444e-05, + "loss": 0.2029, + "step": 1800 + }, + { + "epoch": 0.18388702631311593, + "grad_norm": 2.359375, + "learning_rate": 4.629502228442112e-05, + "loss": 0.1489, + "step": 1810 + }, + { + "epoch": 0.18490297673473535, + "grad_norm": 1.4453125, + "learning_rate": 4.6252682039217045e-05, + "loss": 0.2101, + "step": 1820 + }, + { + "epoch": 0.18591892715635477, + "grad_norm": 2.71875, + "learning_rate": 4.621012082686573e-05, + "loss": 0.2076, + "step": 1830 + }, + { + "epoch": 0.1869348775779742, + "grad_norm": 3.0625, + "learning_rate": 4.616733908988216e-05, + "loss": 0.2719, + "step": 1840 + }, + { + "epoch": 0.18795082799959362, + "grad_norm": 1.953125, + "learning_rate": 4.612433727307409e-05, + "loss": 0.2105, + "step": 1850 + }, + { + "epoch": 0.18896677842121304, + "grad_norm": 3.46875, + "learning_rate": 4.608111582353751e-05, + "loss": 0.1877, + "step": 1860 + }, + { + "epoch": 0.18998272884283246, + "grad_norm": 2.546875, + "learning_rate": 4.603767519065197e-05, + "loss": 0.2238, + "step": 1870 + }, + { + "epoch": 0.19099867926445188, + "grad_norm": 1.5703125, + "learning_rate": 4.599401582607589e-05, + "loss": 0.243, + "step": 1880 + }, + { + "epoch": 0.19201462968607133, + "grad_norm": 2.4375, + "learning_rate": 4.595013818374185e-05, + "loss": 0.1867, + "step": 1890 + }, + { + "epoch": 0.19303058010769075, + "grad_norm": 2.203125, + "learning_rate": 4.5906042719851925e-05, + "loss": 0.1994, + "step": 1900 + }, + { + "epoch": 0.19404653052931017, + "grad_norm": 3.984375, + "learning_rate": 4.586172989287291e-05, + "loss": 0.1899, + "step": 1910 + }, + { + "epoch": 0.1950624809509296, + "grad_norm": 2.6875, + "learning_rate": 4.5817200163531534e-05, + "loss": 0.2528, + "step": 1920 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 2.71875, + "learning_rate": 4.577245399480972e-05, + "loss": 0.2336, + "step": 1930 + }, + { + "epoch": 0.19709438179416844, + "grad_norm": 2.640625, + "learning_rate": 4.5727491851939715e-05, + "loss": 0.2204, + "step": 1940 + }, + { + "epoch": 0.19811033221578786, + "grad_norm": 1.78125, + "learning_rate": 4.568231420239929e-05, + "loss": 0.1656, + "step": 1950 + }, + { + "epoch": 0.19912628263740728, + "grad_norm": 3.15625, + "learning_rate": 4.563692151590687e-05, + "loss": 0.2105, + "step": 1960 + }, + { + "epoch": 0.20014223305902673, + "grad_norm": 1.3671875, + "learning_rate": 4.5591314264416666e-05, + "loss": 0.1464, + "step": 1970 + }, + { + "epoch": 0.20115818348064615, + "grad_norm": 4.25, + "learning_rate": 4.554549292211371e-05, + "loss": 0.2103, + "step": 1980 + }, + { + "epoch": 0.20217413390226557, + "grad_norm": 2.984375, + "learning_rate": 4.549945796540901e-05, + "loss": 0.144, + "step": 1990 + }, + { + "epoch": 0.203190084323885, + "grad_norm": 1.859375, + "learning_rate": 4.545320987293453e-05, + "loss": 0.1963, + "step": 2000 + }, + { + "epoch": 0.20420603474550442, + "grad_norm": 1.078125, + "learning_rate": 4.540674912553824e-05, + "loss": 0.2115, + "step": 2010 + }, + { + "epoch": 0.20522198516712384, + "grad_norm": 4.25, + "learning_rate": 4.536007620627911e-05, + "loss": 0.1682, + "step": 2020 + }, + { + "epoch": 0.20623793558874326, + "grad_norm": 2.71875, + "learning_rate": 4.531319160042212e-05, + "loss": 0.1992, + "step": 2030 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 1.2890625, + "learning_rate": 4.5266095795433126e-05, + "loss": 0.1134, + "step": 2040 + }, + { + "epoch": 0.20826983643198213, + "grad_norm": 3.296875, + "learning_rate": 4.5218789280973925e-05, + "loss": 0.1474, + "step": 2050 + }, + { + "epoch": 0.20928578685360155, + "grad_norm": 1.9375, + "learning_rate": 4.5171272548897024e-05, + "loss": 0.1955, + "step": 2060 + }, + { + "epoch": 0.21030173727522097, + "grad_norm": 2.734375, + "learning_rate": 4.512354609324063e-05, + "loss": 0.2042, + "step": 2070 + }, + { + "epoch": 0.2113176876968404, + "grad_norm": 2.921875, + "learning_rate": 4.507561041022347e-05, + "loss": 0.2174, + "step": 2080 + }, + { + "epoch": 0.21233363811845982, + "grad_norm": 2.40625, + "learning_rate": 4.502746599823963e-05, + "loss": 0.2634, + "step": 2090 + }, + { + "epoch": 0.21334958854007924, + "grad_norm": 1.71875, + "learning_rate": 4.497911335785339e-05, + "loss": 0.1884, + "step": 2100 + }, + { + "epoch": 0.21436553896169866, + "grad_norm": 0.79296875, + "learning_rate": 4.4930552991794e-05, + "loss": 0.1872, + "step": 2110 + }, + { + "epoch": 0.21538148938331808, + "grad_norm": 3.171875, + "learning_rate": 4.4881785404950474e-05, + "loss": 0.2233, + "step": 2120 + }, + { + "epoch": 0.21639743980493753, + "grad_norm": 2.59375, + "learning_rate": 4.483281110436631e-05, + "loss": 0.2374, + "step": 2130 + }, + { + "epoch": 0.21741339022655695, + "grad_norm": 3.328125, + "learning_rate": 4.478363059923426e-05, + "loss": 0.2545, + "step": 2140 + }, + { + "epoch": 0.21842934064817637, + "grad_norm": 2.3125, + "learning_rate": 4.4734244400891014e-05, + "loss": 0.2063, + "step": 2150 + }, + { + "epoch": 0.2194452910697958, + "grad_norm": 3.40625, + "learning_rate": 4.4684653022811865e-05, + "loss": 0.1219, + "step": 2160 + }, + { + "epoch": 0.22046124149141522, + "grad_norm": 4.1875, + "learning_rate": 4.463485698060541e-05, + "loss": 0.2805, + "step": 2170 + }, + { + "epoch": 0.22147719191303464, + "grad_norm": 2.3125, + "learning_rate": 4.458485679200814e-05, + "loss": 0.1998, + "step": 2180 + }, + { + "epoch": 0.22249314233465406, + "grad_norm": 3.578125, + "learning_rate": 4.453465297687912e-05, + "loss": 0.2489, + "step": 2190 + }, + { + "epoch": 0.22350909275627348, + "grad_norm": 2.59375, + "learning_rate": 4.448424605719452e-05, + "loss": 0.2731, + "step": 2200 + }, + { + "epoch": 0.22452504317789293, + "grad_norm": 3.28125, + "learning_rate": 4.443363655704224e-05, + "loss": 0.2425, + "step": 2210 + }, + { + "epoch": 0.22554099359951235, + "grad_norm": 2.78125, + "learning_rate": 4.438282500261641e-05, + "loss": 0.2938, + "step": 2220 + }, + { + "epoch": 0.22655694402113177, + "grad_norm": 1.1953125, + "learning_rate": 4.433181192221197e-05, + "loss": 0.1728, + "step": 2230 + }, + { + "epoch": 0.2275728944427512, + "grad_norm": 1.34375, + "learning_rate": 4.4280597846219155e-05, + "loss": 0.216, + "step": 2240 + }, + { + "epoch": 0.22858884486437062, + "grad_norm": 1.8515625, + "learning_rate": 4.422918330711796e-05, + "loss": 0.1612, + "step": 2250 + }, + { + "epoch": 0.22960479528599004, + "grad_norm": 1.90625, + "learning_rate": 4.417756883947263e-05, + "loss": 0.107, + "step": 2260 + }, + { + "epoch": 0.23062074570760946, + "grad_norm": 3.375, + "learning_rate": 4.412575497992611e-05, + "loss": 0.1756, + "step": 2270 + }, + { + "epoch": 0.23163669612922888, + "grad_norm": 4.375, + "learning_rate": 4.407374226719445e-05, + "loss": 0.234, + "step": 2280 + }, + { + "epoch": 0.23265264655084833, + "grad_norm": 3.25, + "learning_rate": 4.402153124206119e-05, + "loss": 0.2144, + "step": 2290 + }, + { + "epoch": 0.23366859697246775, + "grad_norm": 1.703125, + "learning_rate": 4.396912244737173e-05, + "loss": 0.1696, + "step": 2300 + }, + { + "epoch": 0.23468454739408717, + "grad_norm": 2.84375, + "learning_rate": 4.391651642802778e-05, + "loss": 0.2506, + "step": 2310 + }, + { + "epoch": 0.2357004978157066, + "grad_norm": 4.5, + "learning_rate": 4.386371373098155e-05, + "loss": 0.1686, + "step": 2320 + }, + { + "epoch": 0.23671644823732602, + "grad_norm": 2.515625, + "learning_rate": 4.381071490523018e-05, + "loss": 0.2403, + "step": 2330 + }, + { + "epoch": 0.23773239865894544, + "grad_norm": 4.4375, + "learning_rate": 4.3757520501809955e-05, + "loss": 0.1611, + "step": 2340 + }, + { + "epoch": 0.23874834908056486, + "grad_norm": 1.609375, + "learning_rate": 4.370413107379065e-05, + "loss": 0.1698, + "step": 2350 + }, + { + "epoch": 0.23976429950218428, + "grad_norm": 4.96875, + "learning_rate": 4.36505471762697e-05, + "loss": 0.1928, + "step": 2360 + }, + { + "epoch": 0.24078024992380373, + "grad_norm": 0.8984375, + "learning_rate": 4.3596769366366474e-05, + "loss": 0.2035, + "step": 2370 + }, + { + "epoch": 0.24179620034542315, + "grad_norm": 5.75, + "learning_rate": 4.354279820321649e-05, + "loss": 0.16, + "step": 2380 + }, + { + "epoch": 0.24281215076704257, + "grad_norm": 1.9453125, + "learning_rate": 4.34886342479656e-05, + "loss": 0.1851, + "step": 2390 + }, + { + "epoch": 0.243828101188662, + "grad_norm": 1.015625, + "learning_rate": 4.34342780637641e-05, + "loss": 0.1726, + "step": 2400 + }, + { + "epoch": 0.24484405161028142, + "grad_norm": 4.59375, + "learning_rate": 4.337973021576095e-05, + "loss": 0.2847, + "step": 2410 + }, + { + "epoch": 0.24586000203190084, + "grad_norm": 1.03125, + "learning_rate": 4.3324991271097846e-05, + "loss": 0.2528, + "step": 2420 + }, + { + "epoch": 0.24687595245352026, + "grad_norm": 2.1875, + "learning_rate": 4.3270061798903374e-05, + "loss": 0.1573, + "step": 2430 + }, + { + "epoch": 0.24789190287513968, + "grad_norm": 0.98046875, + "learning_rate": 4.321494237028701e-05, + "loss": 0.1703, + "step": 2440 + }, + { + "epoch": 0.24890785329675913, + "grad_norm": 3.8125, + "learning_rate": 4.31596335583333e-05, + "loss": 0.2613, + "step": 2450 + }, + { + "epoch": 0.24992380371837855, + "grad_norm": 4.0625, + "learning_rate": 4.310413593809579e-05, + "loss": 0.22, + "step": 2460 + }, + { + "epoch": 0.250939754139998, + "grad_norm": 3.15625, + "learning_rate": 4.304845008659108e-05, + "loss": 0.1263, + "step": 2470 + }, + { + "epoch": 0.25195570456161737, + "grad_norm": 3.046875, + "learning_rate": 4.2992576582792895e-05, + "loss": 0.1639, + "step": 2480 + }, + { + "epoch": 0.2529716549832368, + "grad_norm": 9.8125, + "learning_rate": 4.293651600762595e-05, + "loss": 0.2681, + "step": 2490 + }, + { + "epoch": 0.25398760540485626, + "grad_norm": 3.734375, + "learning_rate": 4.288026894395999e-05, + "loss": 0.2292, + "step": 2500 + }, + { + "epoch": 0.25500355582647566, + "grad_norm": 0.455078125, + "learning_rate": 4.2823835976603723e-05, + "loss": 0.2324, + "step": 2510 + }, + { + "epoch": 0.2560195062480951, + "grad_norm": 5.625, + "learning_rate": 4.276721769229869e-05, + "loss": 0.1834, + "step": 2520 + }, + { + "epoch": 0.2570354566697145, + "grad_norm": 1.3671875, + "learning_rate": 4.271041467971323e-05, + "loss": 0.1826, + "step": 2530 + }, + { + "epoch": 0.25805140709133395, + "grad_norm": 5.0625, + "learning_rate": 4.265342752943632e-05, + "loss": 0.2463, + "step": 2540 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 2.859375, + "learning_rate": 4.2596256833971425e-05, + "loss": 0.2598, + "step": 2550 + }, + { + "epoch": 0.2600833079345728, + "grad_norm": 1.8515625, + "learning_rate": 4.2538903187730374e-05, + "loss": 0.1148, + "step": 2560 + }, + { + "epoch": 0.26109925835619224, + "grad_norm": 2.71875, + "learning_rate": 4.248136718702716e-05, + "loss": 0.2123, + "step": 2570 + }, + { + "epoch": 0.26211520877781164, + "grad_norm": 4.5625, + "learning_rate": 4.242364943007172e-05, + "loss": 0.2369, + "step": 2580 + }, + { + "epoch": 0.2631311591994311, + "grad_norm": 2.296875, + "learning_rate": 4.236575051696377e-05, + "loss": 0.261, + "step": 2590 + }, + { + "epoch": 0.2641471096210505, + "grad_norm": 2.75, + "learning_rate": 4.2307671049686514e-05, + "loss": 0.1564, + "step": 2600 + }, + { + "epoch": 0.26516306004266993, + "grad_norm": 3.5, + "learning_rate": 4.2249411632100396e-05, + "loss": 0.1563, + "step": 2610 + }, + { + "epoch": 0.2661790104642893, + "grad_norm": 2.84375, + "learning_rate": 4.219097286993684e-05, + "loss": 0.1697, + "step": 2620 + }, + { + "epoch": 0.26719496088590877, + "grad_norm": 2.125, + "learning_rate": 4.2132355370791946e-05, + "loss": 0.1844, + "step": 2630 + }, + { + "epoch": 0.26821091130752817, + "grad_norm": 4.03125, + "learning_rate": 4.2073559744120156e-05, + "loss": 0.2144, + "step": 2640 + }, + { + "epoch": 0.2692268617291476, + "grad_norm": 2.375, + "learning_rate": 4.201458660122793e-05, + "loss": 0.2013, + "step": 2650 + }, + { + "epoch": 0.27024281215076706, + "grad_norm": 3.625, + "learning_rate": 4.1955436555267393e-05, + "loss": 0.2166, + "step": 2660 + }, + { + "epoch": 0.27125876257238646, + "grad_norm": 0.328125, + "learning_rate": 4.189611022122997e-05, + "loss": 0.1934, + "step": 2670 + }, + { + "epoch": 0.2722747129940059, + "grad_norm": 2.75, + "learning_rate": 4.1836608215939944e-05, + "loss": 0.2157, + "step": 2680 + }, + { + "epoch": 0.2732906634156253, + "grad_norm": 3.5, + "learning_rate": 4.17769311580481e-05, + "loss": 0.18, + "step": 2690 + }, + { + "epoch": 0.27430661383724475, + "grad_norm": 2.109375, + "learning_rate": 4.171707966802528e-05, + "loss": 0.2178, + "step": 2700 + }, + { + "epoch": 0.27532256425886414, + "grad_norm": 4.65625, + "learning_rate": 4.16570543681559e-05, + "loss": 0.1896, + "step": 2710 + }, + { + "epoch": 0.2763385146804836, + "grad_norm": 4.8125, + "learning_rate": 4.159685588253151e-05, + "loss": 0.1322, + "step": 2720 + }, + { + "epoch": 0.27735446510210304, + "grad_norm": 3.9375, + "learning_rate": 4.153648483704429e-05, + "loss": 0.184, + "step": 2730 + }, + { + "epoch": 0.27837041552372244, + "grad_norm": 4.53125, + "learning_rate": 4.147594185938057e-05, + "loss": 0.2451, + "step": 2740 + }, + { + "epoch": 0.2793863659453419, + "grad_norm": 1.0390625, + "learning_rate": 4.141522757901426e-05, + "loss": 0.2367, + "step": 2750 + }, + { + "epoch": 0.2804023163669613, + "grad_norm": 3.375, + "learning_rate": 4.1354342627200345e-05, + "loss": 0.179, + "step": 2760 + }, + { + "epoch": 0.28141826678858073, + "grad_norm": 2.953125, + "learning_rate": 4.1293287636968286e-05, + "loss": 0.1396, + "step": 2770 + }, + { + "epoch": 0.2824342172102001, + "grad_norm": 2.546875, + "learning_rate": 4.1232063243115485e-05, + "loss": 0.1963, + "step": 2780 + }, + { + "epoch": 0.28345016763181957, + "grad_norm": 5.09375, + "learning_rate": 4.117067008220063e-05, + "loss": 0.2457, + "step": 2790 + }, + { + "epoch": 0.28446611805343897, + "grad_norm": 2.046875, + "learning_rate": 4.110910879253712e-05, + "loss": 0.2262, + "step": 2800 + }, + { + "epoch": 0.2854820684750584, + "grad_norm": 2.1875, + "learning_rate": 4.104738001418641e-05, + "loss": 0.2499, + "step": 2810 + }, + { + "epoch": 0.28649801889667786, + "grad_norm": 2.59375, + "learning_rate": 4.098548438895135e-05, + "loss": 0.1667, + "step": 2820 + }, + { + "epoch": 0.28751396931829726, + "grad_norm": 2.875, + "learning_rate": 4.092342256036954e-05, + "loss": 0.2288, + "step": 2830 + }, + { + "epoch": 0.2885299197399167, + "grad_norm": 3.015625, + "learning_rate": 4.086119517370659e-05, + "loss": 0.2038, + "step": 2840 + }, + { + "epoch": 0.2895458701615361, + "grad_norm": 3.53125, + "learning_rate": 4.0798802875949485e-05, + "loss": 0.181, + "step": 2850 + }, + { + "epoch": 0.29056182058315555, + "grad_norm": 2.296875, + "learning_rate": 4.073624631579975e-05, + "loss": 0.1886, + "step": 2860 + }, + { + "epoch": 0.29157777100477494, + "grad_norm": 3.609375, + "learning_rate": 4.067352614366685e-05, + "loss": 0.2053, + "step": 2870 + }, + { + "epoch": 0.2925937214263944, + "grad_norm": 2.328125, + "learning_rate": 4.061064301166128e-05, + "loss": 0.1409, + "step": 2880 + }, + { + "epoch": 0.29360967184801384, + "grad_norm": 4.9375, + "learning_rate": 4.054759757358787e-05, + "loss": 0.184, + "step": 2890 + }, + { + "epoch": 0.29462562226963324, + "grad_norm": 4.6875, + "learning_rate": 4.048439048493898e-05, + "loss": 0.2306, + "step": 2900 + }, + { + "epoch": 0.2956415726912527, + "grad_norm": 4.09375, + "learning_rate": 4.0421022402887676e-05, + "loss": 0.1914, + "step": 2910 + }, + { + "epoch": 0.2966575231128721, + "grad_norm": 2.3125, + "learning_rate": 4.035749398628088e-05, + "loss": 0.1653, + "step": 2920 + }, + { + "epoch": 0.29767347353449153, + "grad_norm": 2.515625, + "learning_rate": 4.029380589563256e-05, + "loss": 0.1941, + "step": 2930 + }, + { + "epoch": 0.2986894239561109, + "grad_norm": 1.78125, + "learning_rate": 4.02299587931168e-05, + "loss": 0.1117, + "step": 2940 + }, + { + "epoch": 0.29970537437773037, + "grad_norm": 0.8359375, + "learning_rate": 4.0165953342560974e-05, + "loss": 0.1605, + "step": 2950 + }, + { + "epoch": 0.30072132479934977, + "grad_norm": 3.046875, + "learning_rate": 4.010179020943884e-05, + "loss": 0.1726, + "step": 2960 + }, + { + "epoch": 0.3017372752209692, + "grad_norm": 3.453125, + "learning_rate": 4.003747006086357e-05, + "loss": 0.2208, + "step": 2970 + }, + { + "epoch": 0.30275322564258866, + "grad_norm": 2.515625, + "learning_rate": 3.9972993565580866e-05, + "loss": 0.1325, + "step": 2980 + }, + { + "epoch": 0.30376917606420806, + "grad_norm": 3.046875, + "learning_rate": 3.9908361393962e-05, + "loss": 0.2014, + "step": 2990 + }, + { + "epoch": 0.3047851264858275, + "grad_norm": 2.28125, + "learning_rate": 3.984357421799681e-05, + "loss": 0.165, + "step": 3000 + }, + { + "epoch": 0.3058010769074469, + "grad_norm": 5.09375, + "learning_rate": 3.9778632711286756e-05, + "loss": 0.212, + "step": 3010 + }, + { + "epoch": 0.30681702732906635, + "grad_norm": 4.25, + "learning_rate": 3.971353754903788e-05, + "loss": 0.2388, + "step": 3020 + }, + { + "epoch": 0.30783297775068574, + "grad_norm": 2.34375, + "learning_rate": 3.964828940805381e-05, + "loss": 0.2175, + "step": 3030 + }, + { + "epoch": 0.3088489281723052, + "grad_norm": 4.09375, + "learning_rate": 3.95828889667287e-05, + "loss": 0.2088, + "step": 3040 + }, + { + "epoch": 0.30986487859392464, + "grad_norm": 2.359375, + "learning_rate": 3.9517336905040244e-05, + "loss": 0.1913, + "step": 3050 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 1.1640625, + "learning_rate": 3.9451633904542483e-05, + "loss": 0.2185, + "step": 3060 + }, + { + "epoch": 0.3118967794371635, + "grad_norm": 2.59375, + "learning_rate": 3.9385780648358846e-05, + "loss": 0.2072, + "step": 3070 + }, + { + "epoch": 0.3129127298587829, + "grad_norm": 3.015625, + "learning_rate": 3.9319777821174955e-05, + "loss": 0.1902, + "step": 3080 + }, + { + "epoch": 0.31392868028040233, + "grad_norm": 2.375, + "learning_rate": 3.925362610923158e-05, + "loss": 0.259, + "step": 3090 + }, + { + "epoch": 0.3149446307020217, + "grad_norm": 4.65625, + "learning_rate": 3.918732620031742e-05, + "loss": 0.2026, + "step": 3100 + }, + { + "epoch": 0.31596058112364117, + "grad_norm": 2.1875, + "learning_rate": 3.912087878376205e-05, + "loss": 0.1478, + "step": 3110 + }, + { + "epoch": 0.31697653154526056, + "grad_norm": 2.34375, + "learning_rate": 3.905428455042865e-05, + "loss": 0.167, + "step": 3120 + }, + { + "epoch": 0.31799248196688, + "grad_norm": 2.390625, + "learning_rate": 3.898754419270693e-05, + "loss": 0.1629, + "step": 3130 + }, + { + "epoch": 0.31900843238849946, + "grad_norm": 1.546875, + "learning_rate": 3.892065840450583e-05, + "loss": 0.1308, + "step": 3140 + }, + { + "epoch": 0.32002438281011886, + "grad_norm": 4.625, + "learning_rate": 3.885362788124637e-05, + "loss": 0.2008, + "step": 3150 + }, + { + "epoch": 0.3210403332317383, + "grad_norm": 3.8125, + "learning_rate": 3.8786453319854396e-05, + "loss": 0.2225, + "step": 3160 + }, + { + "epoch": 0.3220562836533577, + "grad_norm": 3.015625, + "learning_rate": 3.8719135418753366e-05, + "loss": 0.2243, + "step": 3170 + }, + { + "epoch": 0.32307223407497715, + "grad_norm": 5.6875, + "learning_rate": 3.865167487785702e-05, + "loss": 0.1981, + "step": 3180 + }, + { + "epoch": 0.32408818449659654, + "grad_norm": 4.84375, + "learning_rate": 3.8584072398562164e-05, + "loss": 0.2031, + "step": 3190 + }, + { + "epoch": 0.325104134918216, + "grad_norm": 4.0625, + "learning_rate": 3.851632868374136e-05, + "loss": 0.1621, + "step": 3200 + }, + { + "epoch": 0.32612008533983544, + "grad_norm": 3.421875, + "learning_rate": 3.844844443773562e-05, + "loss": 0.1674, + "step": 3210 + }, + { + "epoch": 0.32713603576145484, + "grad_norm": 1.3671875, + "learning_rate": 3.8380420366347046e-05, + "loss": 0.1502, + "step": 3220 + }, + { + "epoch": 0.3281519861830743, + "grad_norm": 3.734375, + "learning_rate": 3.831225717683157e-05, + "loss": 0.1868, + "step": 3230 + }, + { + "epoch": 0.3291679366046937, + "grad_norm": 2.703125, + "learning_rate": 3.8243955577891534e-05, + "loss": 0.1818, + "step": 3240 + }, + { + "epoch": 0.3301838870263131, + "grad_norm": 3.796875, + "learning_rate": 3.8175516279668335e-05, + "loss": 0.2215, + "step": 3250 + }, + { + "epoch": 0.3311998374479325, + "grad_norm": 3.203125, + "learning_rate": 3.810693999373505e-05, + "loss": 0.2544, + "step": 3260 + }, + { + "epoch": 0.33221578786955197, + "grad_norm": 4.0, + "learning_rate": 3.8038227433089056e-05, + "loss": 0.1175, + "step": 3270 + }, + { + "epoch": 0.33323173829117136, + "grad_norm": 3.625, + "learning_rate": 3.796937931214458e-05, + "loss": 0.2213, + "step": 3280 + }, + { + "epoch": 0.3342476887127908, + "grad_norm": 1.7265625, + "learning_rate": 3.7900396346725296e-05, + "loss": 0.1711, + "step": 3290 + }, + { + "epoch": 0.33526363913441026, + "grad_norm": 3.140625, + "learning_rate": 3.783127925405686e-05, + "loss": 0.2628, + "step": 3300 + }, + { + "epoch": 0.33627958955602966, + "grad_norm": 2.1875, + "learning_rate": 3.77620287527595e-05, + "loss": 0.1671, + "step": 3310 + }, + { + "epoch": 0.3372955399776491, + "grad_norm": 5.28125, + "learning_rate": 3.769264556284048e-05, + "loss": 0.2109, + "step": 3320 + }, + { + "epoch": 0.3383114903992685, + "grad_norm": 2.875, + "learning_rate": 3.762313040568665e-05, + "loss": 0.1978, + "step": 3330 + }, + { + "epoch": 0.33932744082088795, + "grad_norm": 2.234375, + "learning_rate": 3.755348400405697e-05, + "loss": 0.1275, + "step": 3340 + }, + { + "epoch": 0.34034339124250734, + "grad_norm": 1.9453125, + "learning_rate": 3.7483707082074945e-05, + "loss": 0.1482, + "step": 3350 + }, + { + "epoch": 0.3413593416641268, + "grad_norm": 5.40625, + "learning_rate": 3.741380036522111e-05, + "loss": 0.1933, + "step": 3360 + }, + { + "epoch": 0.34237529208574624, + "grad_norm": 4.53125, + "learning_rate": 3.734376458032551e-05, + "loss": 0.1925, + "step": 3370 + }, + { + "epoch": 0.34339124250736563, + "grad_norm": 4.0625, + "learning_rate": 3.727360045556014e-05, + "loss": 0.2297, + "step": 3380 + }, + { + "epoch": 0.3444071929289851, + "grad_norm": 2.53125, + "learning_rate": 3.7203308720431336e-05, + "loss": 0.1704, + "step": 3390 + }, + { + "epoch": 0.3454231433506045, + "grad_norm": 1.859375, + "learning_rate": 3.7132890105772234e-05, + "loss": 0.258, + "step": 3400 + }, + { + "epoch": 0.3464390937722239, + "grad_norm": 3.90625, + "learning_rate": 3.706234534373515e-05, + "loss": 0.2376, + "step": 3410 + }, + { + "epoch": 0.3474550441938433, + "grad_norm": 1.1015625, + "learning_rate": 3.6991675167783985e-05, + "loss": 0.2403, + "step": 3420 + }, + { + "epoch": 0.34847099461546277, + "grad_norm": 1.1640625, + "learning_rate": 3.6920880312686556e-05, + "loss": 0.1642, + "step": 3430 + }, + { + "epoch": 0.34948694503708216, + "grad_norm": 2.875, + "learning_rate": 3.684996151450702e-05, + "loss": 0.1455, + "step": 3440 + }, + { + "epoch": 0.3505028954587016, + "grad_norm": 0.59765625, + "learning_rate": 3.6778919510598155e-05, + "loss": 0.2175, + "step": 3450 + }, + { + "epoch": 0.35151884588032106, + "grad_norm": 0.93359375, + "learning_rate": 3.670775503959376e-05, + "loss": 0.1858, + "step": 3460 + }, + { + "epoch": 0.35253479630194046, + "grad_norm": 4.1875, + "learning_rate": 3.6636468841400917e-05, + "loss": 0.1911, + "step": 3470 + }, + { + "epoch": 0.3535507467235599, + "grad_norm": 3.734375, + "learning_rate": 3.656506165719233e-05, + "loss": 0.2114, + "step": 3480 + }, + { + "epoch": 0.3545666971451793, + "grad_norm": 1.171875, + "learning_rate": 3.649353422939863e-05, + "loss": 0.1841, + "step": 3490 + }, + { + "epoch": 0.35558264756679875, + "grad_norm": 2.53125, + "learning_rate": 3.6421887301700615e-05, + "loss": 0.1505, + "step": 3500 + }, + { + "epoch": 0.35659859798841814, + "grad_norm": 4.9375, + "learning_rate": 3.6350121619021524e-05, + "loss": 0.2625, + "step": 3510 + }, + { + "epoch": 0.3576145484100376, + "grad_norm": 5.25, + "learning_rate": 3.627823792751936e-05, + "loss": 0.1676, + "step": 3520 + }, + { + "epoch": 0.35863049883165704, + "grad_norm": 1.09375, + "learning_rate": 3.620623697457905e-05, + "loss": 0.1963, + "step": 3530 + }, + { + "epoch": 0.35964644925327643, + "grad_norm": 4.03125, + "learning_rate": 3.613411950880468e-05, + "loss": 0.2048, + "step": 3540 + }, + { + "epoch": 0.3606623996748959, + "grad_norm": 4.40625, + "learning_rate": 3.606188628001178e-05, + "loss": 0.226, + "step": 3550 + }, + { + "epoch": 0.3616783500965153, + "grad_norm": 2.375, + "learning_rate": 3.598953803921947e-05, + "loss": 0.1884, + "step": 3560 + }, + { + "epoch": 0.3626943005181347, + "grad_norm": 3.21875, + "learning_rate": 3.591707553864266e-05, + "loss": 0.224, + "step": 3570 + }, + { + "epoch": 0.3637102509397541, + "grad_norm": 3.5625, + "learning_rate": 3.584449953168423e-05, + "loss": 0.1866, + "step": 3580 + }, + { + "epoch": 0.36472620136137357, + "grad_norm": 2.359375, + "learning_rate": 3.577181077292722e-05, + "loss": 0.1663, + "step": 3590 + }, + { + "epoch": 0.36574215178299296, + "grad_norm": 5.0, + "learning_rate": 3.569901001812696e-05, + "loss": 0.2032, + "step": 3600 + }, + { + "epoch": 0.3667581022046124, + "grad_norm": 1.953125, + "learning_rate": 3.562609802420321e-05, + "loss": 0.2395, + "step": 3610 + }, + { + "epoch": 0.36777405262623186, + "grad_norm": 3.796875, + "learning_rate": 3.555307554923229e-05, + "loss": 0.1799, + "step": 3620 + }, + { + "epoch": 0.36879000304785126, + "grad_norm": 4.4375, + "learning_rate": 3.547994335243925e-05, + "loss": 0.1771, + "step": 3630 + }, + { + "epoch": 0.3698059534694707, + "grad_norm": 1.890625, + "learning_rate": 3.540670219418989e-05, + "loss": 0.2123, + "step": 3640 + }, + { + "epoch": 0.3708219038910901, + "grad_norm": 4.03125, + "learning_rate": 3.53333528359829e-05, + "loss": 0.2159, + "step": 3650 + }, + { + "epoch": 0.37183785431270955, + "grad_norm": 3.265625, + "learning_rate": 3.525989604044198e-05, + "loss": 0.2749, + "step": 3660 + }, + { + "epoch": 0.37285380473432894, + "grad_norm": 1.4375, + "learning_rate": 3.5186332571307826e-05, + "loss": 0.1613, + "step": 3670 + }, + { + "epoch": 0.3738697551559484, + "grad_norm": 3.984375, + "learning_rate": 3.511266319343025e-05, + "loss": 0.1877, + "step": 3680 + }, + { + "epoch": 0.37488570557756784, + "grad_norm": 2.203125, + "learning_rate": 3.503888867276022e-05, + "loss": 0.2185, + "step": 3690 + }, + { + "epoch": 0.37590165599918723, + "grad_norm": 1.5078125, + "learning_rate": 3.4965009776341894e-05, + "loss": 0.2195, + "step": 3700 + }, + { + "epoch": 0.3769176064208067, + "grad_norm": 4.375, + "learning_rate": 3.489102727230461e-05, + "loss": 0.2344, + "step": 3710 + }, + { + "epoch": 0.3779335568424261, + "grad_norm": 2.984375, + "learning_rate": 3.481694192985496e-05, + "loss": 0.1863, + "step": 3720 + }, + { + "epoch": 0.3789495072640455, + "grad_norm": 1.1328125, + "learning_rate": 3.474275451926875e-05, + "loss": 0.1894, + "step": 3730 + }, + { + "epoch": 0.3799654576856649, + "grad_norm": 2.265625, + "learning_rate": 3.4668465811883e-05, + "loss": 0.2127, + "step": 3740 + }, + { + "epoch": 0.38098140810728437, + "grad_norm": 2.921875, + "learning_rate": 3.4594076580087914e-05, + "loss": 0.2125, + "step": 3750 + }, + { + "epoch": 0.38199735852890376, + "grad_norm": 2.390625, + "learning_rate": 3.451958759731889e-05, + "loss": 0.1801, + "step": 3760 + }, + { + "epoch": 0.3830133089505232, + "grad_norm": 3.046875, + "learning_rate": 3.4444999638048456e-05, + "loss": 0.1949, + "step": 3770 + }, + { + "epoch": 0.38402925937214266, + "grad_norm": 2.890625, + "learning_rate": 3.437031347777817e-05, + "loss": 0.2719, + "step": 3780 + }, + { + "epoch": 0.38504520979376206, + "grad_norm": 3.9375, + "learning_rate": 3.4295529893030634e-05, + "loss": 0.1697, + "step": 3790 + }, + { + "epoch": 0.3860611602153815, + "grad_norm": 2.0625, + "learning_rate": 3.422064966134138e-05, + "loss": 0.1557, + "step": 3800 + }, + { + "epoch": 0.3870771106370009, + "grad_norm": 2.234375, + "learning_rate": 3.4145673561250794e-05, + "loss": 0.2129, + "step": 3810 + }, + { + "epoch": 0.38809306105862035, + "grad_norm": 4.96875, + "learning_rate": 3.4070602372296e-05, + "loss": 0.2068, + "step": 3820 + }, + { + "epoch": 0.38910901148023974, + "grad_norm": 2.234375, + "learning_rate": 3.39954368750028e-05, + "loss": 0.1634, + "step": 3830 + }, + { + "epoch": 0.3901249619018592, + "grad_norm": 1.75, + "learning_rate": 3.392017785087752e-05, + "loss": 0.2299, + "step": 3840 + }, + { + "epoch": 0.39114091232347864, + "grad_norm": 3.90625, + "learning_rate": 3.38448260823989e-05, + "loss": 0.1585, + "step": 3850 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 2.8125, + "learning_rate": 3.376938235300996e-05, + "loss": 0.2382, + "step": 3860 + }, + { + "epoch": 0.3931728131667175, + "grad_norm": 5.375, + "learning_rate": 3.369384744710984e-05, + "loss": 0.1987, + "step": 3870 + }, + { + "epoch": 0.3941887635883369, + "grad_norm": 2.578125, + "learning_rate": 3.361822215004566e-05, + "loss": 0.2316, + "step": 3880 + }, + { + "epoch": 0.3952047140099563, + "grad_norm": 2.0, + "learning_rate": 3.354250724810436e-05, + "loss": 0.2019, + "step": 3890 + }, + { + "epoch": 0.3962206644315757, + "grad_norm": 2.3125, + "learning_rate": 3.34667035285045e-05, + "loss": 0.187, + "step": 3900 + }, + { + "epoch": 0.39723661485319517, + "grad_norm": 3.53125, + "learning_rate": 3.339081177938811e-05, + "loss": 0.2353, + "step": 3910 + }, + { + "epoch": 0.39825256527481456, + "grad_norm": 1.9609375, + "learning_rate": 3.331483278981244e-05, + "loss": 0.2078, + "step": 3920 + }, + { + "epoch": 0.399268515696434, + "grad_norm": 1.2109375, + "learning_rate": 3.323876734974183e-05, + "loss": 0.1761, + "step": 3930 + }, + { + "epoch": 0.40028446611805346, + "grad_norm": 4.0625, + "learning_rate": 3.316261625003943e-05, + "loss": 0.2081, + "step": 3940 + }, + { + "epoch": 0.40130041653967286, + "grad_norm": 1.953125, + "learning_rate": 3.308638028245902e-05, + "loss": 0.2087, + "step": 3950 + }, + { + "epoch": 0.4023163669612923, + "grad_norm": 2.390625, + "learning_rate": 3.301006023963676e-05, + "loss": 0.1579, + "step": 3960 + }, + { + "epoch": 0.4033323173829117, + "grad_norm": 3.53125, + "learning_rate": 3.293365691508295e-05, + "loss": 0.1904, + "step": 3970 + }, + { + "epoch": 0.40434826780453115, + "grad_norm": 3.0, + "learning_rate": 3.285717110317379e-05, + "loss": 0.1991, + "step": 3980 + }, + { + "epoch": 0.40536421822615054, + "grad_norm": 7.21875, + "learning_rate": 3.27806035991431e-05, + "loss": 0.1445, + "step": 3990 + }, + { + "epoch": 0.40638016864777, + "grad_norm": 1.0859375, + "learning_rate": 3.2703955199074075e-05, + "loss": 0.2393, + "step": 4000 + }, + { + "epoch": 0.40739611906938944, + "grad_norm": 4.5625, + "learning_rate": 3.262722669989098e-05, + "loss": 0.1789, + "step": 4010 + }, + { + "epoch": 0.40841206949100883, + "grad_norm": 3.09375, + "learning_rate": 3.255041889935092e-05, + "loss": 0.1511, + "step": 4020 + }, + { + "epoch": 0.4094280199126283, + "grad_norm": 1.90625, + "learning_rate": 3.247353259603547e-05, + "loss": 0.2066, + "step": 4030 + }, + { + "epoch": 0.4104439703342477, + "grad_norm": 2.28125, + "learning_rate": 3.239656858934242e-05, + "loss": 0.1564, + "step": 4040 + }, + { + "epoch": 0.4114599207558671, + "grad_norm": 2.609375, + "learning_rate": 3.231952767947746e-05, + "loss": 0.1503, + "step": 4050 + }, + { + "epoch": 0.4124758711774865, + "grad_norm": 1.4453125, + "learning_rate": 3.2242410667445844e-05, + "loss": 0.1633, + "step": 4060 + }, + { + "epoch": 0.41349182159910597, + "grad_norm": 3.015625, + "learning_rate": 3.2165218355044076e-05, + "loss": 0.1492, + "step": 4070 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 3.234375, + "learning_rate": 3.2087951544851566e-05, + "loss": 0.3051, + "step": 4080 + }, + { + "epoch": 0.4155237224423448, + "grad_norm": 2.9375, + "learning_rate": 3.20106110402223e-05, + "loss": 0.2229, + "step": 4090 + }, + { + "epoch": 0.41653967286396426, + "grad_norm": 3.171875, + "learning_rate": 3.1933197645276455e-05, + "loss": 0.2224, + "step": 4100 + }, + { + "epoch": 0.41755562328558365, + "grad_norm": 2.09375, + "learning_rate": 3.185571216489209e-05, + "loss": 0.1297, + "step": 4110 + }, + { + "epoch": 0.4185715737072031, + "grad_norm": 3.625, + "learning_rate": 3.177815540469669e-05, + "loss": 0.2074, + "step": 4120 + }, + { + "epoch": 0.4195875241288225, + "grad_norm": 2.296875, + "learning_rate": 3.1700528171058916e-05, + "loss": 0.1949, + "step": 4130 + }, + { + "epoch": 0.42060347455044195, + "grad_norm": 3.8125, + "learning_rate": 3.162283127108011e-05, + "loss": 0.1661, + "step": 4140 + }, + { + "epoch": 0.42161942497206134, + "grad_norm": 2.5, + "learning_rate": 3.154506551258594e-05, + "loss": 0.2275, + "step": 4150 + }, + { + "epoch": 0.4226353753936808, + "grad_norm": 2.96875, + "learning_rate": 3.146723170411804e-05, + "loss": 0.2242, + "step": 4160 + }, + { + "epoch": 0.42365132581530024, + "grad_norm": 6.625, + "learning_rate": 3.138933065492552e-05, + "loss": 0.1897, + "step": 4170 + }, + { + "epoch": 0.42466727623691963, + "grad_norm": 0.8515625, + "learning_rate": 3.131136317495665e-05, + "loss": 0.1629, + "step": 4180 + }, + { + "epoch": 0.4256832266585391, + "grad_norm": 0.94140625, + "learning_rate": 3.1233330074850364e-05, + "loss": 0.1535, + "step": 4190 + }, + { + "epoch": 0.4266991770801585, + "grad_norm": 2.6875, + "learning_rate": 3.115523216592786e-05, + "loss": 0.2494, + "step": 4200 + }, + { + "epoch": 0.4277151275017779, + "grad_norm": 2.578125, + "learning_rate": 3.107707026018417e-05, + "loss": 0.1705, + "step": 4210 + }, + { + "epoch": 0.4287310779233973, + "grad_norm": 3.0625, + "learning_rate": 3.09988451702797e-05, + "loss": 0.1507, + "step": 4220 + }, + { + "epoch": 0.42974702834501677, + "grad_norm": 2.421875, + "learning_rate": 3.0920557709531804e-05, + "loss": 0.3071, + "step": 4230 + }, + { + "epoch": 0.43076297876663616, + "grad_norm": 3.640625, + "learning_rate": 3.0842208691906306e-05, + "loss": 0.199, + "step": 4240 + }, + { + "epoch": 0.4317789291882556, + "grad_norm": 3.5625, + "learning_rate": 3.076379893200904e-05, + "loss": 0.1987, + "step": 4250 + }, + { + "epoch": 0.43279487960987506, + "grad_norm": 3.65625, + "learning_rate": 3.068532924507739e-05, + "loss": 0.1945, + "step": 4260 + }, + { + "epoch": 0.43381083003149445, + "grad_norm": 5.875, + "learning_rate": 3.060680044697183e-05, + "loss": 0.1937, + "step": 4270 + }, + { + "epoch": 0.4348267804531139, + "grad_norm": 2.859375, + "learning_rate": 3.052821335416739e-05, + "loss": 0.1643, + "step": 4280 + }, + { + "epoch": 0.4358427308747333, + "grad_norm": 3.296875, + "learning_rate": 3.0449568783745203e-05, + "loss": 0.1455, + "step": 4290 + }, + { + "epoch": 0.43685868129635275, + "grad_norm": 0.427734375, + "learning_rate": 3.0370867553384023e-05, + "loss": 0.1891, + "step": 4300 + }, + { + "epoch": 0.43787463171797214, + "grad_norm": 0.361328125, + "learning_rate": 3.029211048135171e-05, + "loss": 0.1377, + "step": 4310 + }, + { + "epoch": 0.4388905821395916, + "grad_norm": 1.8203125, + "learning_rate": 3.021329838649668e-05, + "loss": 0.2194, + "step": 4320 + }, + { + "epoch": 0.43990653256121104, + "grad_norm": 1.8828125, + "learning_rate": 3.0134432088239462e-05, + "loss": 0.1915, + "step": 4330 + }, + { + "epoch": 0.44092248298283043, + "grad_norm": 2.015625, + "learning_rate": 3.0055512406564146e-05, + "loss": 0.1794, + "step": 4340 + }, + { + "epoch": 0.4419384334044499, + "grad_norm": 2.546875, + "learning_rate": 2.9976540162009836e-05, + "loss": 0.2154, + "step": 4350 + }, + { + "epoch": 0.4429543838260693, + "grad_norm": 4.09375, + "learning_rate": 2.9897516175662155e-05, + "loss": 0.1861, + "step": 4360 + }, + { + "epoch": 0.4439703342476887, + "grad_norm": 3.953125, + "learning_rate": 2.9818441269144693e-05, + "loss": 0.1857, + "step": 4370 + }, + { + "epoch": 0.4449862846693081, + "grad_norm": 2.234375, + "learning_rate": 2.9739316264610452e-05, + "loss": 0.1493, + "step": 4380 + }, + { + "epoch": 0.44600223509092757, + "grad_norm": 1.109375, + "learning_rate": 2.966014198473332e-05, + "loss": 0.186, + "step": 4390 + }, + { + "epoch": 0.44701818551254696, + "grad_norm": 4.5625, + "learning_rate": 2.9580919252699502e-05, + "loss": 0.1963, + "step": 4400 + }, + { + "epoch": 0.4480341359341664, + "grad_norm": 7.3125, + "learning_rate": 2.9501648892198984e-05, + "loss": 0.2882, + "step": 4410 + }, + { + "epoch": 0.44905008635578586, + "grad_norm": 3.03125, + "learning_rate": 2.942233172741693e-05, + "loss": 0.2154, + "step": 4420 + }, + { + "epoch": 0.45006603677740525, + "grad_norm": 2.421875, + "learning_rate": 2.934296858302515e-05, + "loss": 0.2228, + "step": 4430 + }, + { + "epoch": 0.4510819871990247, + "grad_norm": 1.6015625, + "learning_rate": 2.9263560284173485e-05, + "loss": 0.1637, + "step": 4440 + }, + { + "epoch": 0.4520979376206441, + "grad_norm": 4.5, + "learning_rate": 2.91841076564813e-05, + "loss": 0.1396, + "step": 4450 + }, + { + "epoch": 0.45311388804226355, + "grad_norm": 1.9609375, + "learning_rate": 2.9104611526028808e-05, + "loss": 0.186, + "step": 4460 + }, + { + "epoch": 0.45412983846388294, + "grad_norm": 2.046875, + "learning_rate": 2.902507271934855e-05, + "loss": 0.1706, + "step": 4470 + }, + { + "epoch": 0.4551457888855024, + "grad_norm": 2.390625, + "learning_rate": 2.8945492063416768e-05, + "loss": 0.2191, + "step": 4480 + }, + { + "epoch": 0.45616173930712184, + "grad_norm": 2.734375, + "learning_rate": 2.8865870385644823e-05, + "loss": 0.1651, + "step": 4490 + }, + { + "epoch": 0.45717768972874123, + "grad_norm": 4.4375, + "learning_rate": 2.8786208513870583e-05, + "loss": 0.1907, + "step": 4500 + }, + { + "epoch": 0.4581936401503607, + "grad_norm": 1.9609375, + "learning_rate": 2.8706507276349815e-05, + "loss": 0.2256, + "step": 4510 + }, + { + "epoch": 0.4592095905719801, + "grad_norm": 3.375, + "learning_rate": 2.8626767501747588e-05, + "loss": 0.215, + "step": 4520 + }, + { + "epoch": 0.4602255409935995, + "grad_norm": 2.296875, + "learning_rate": 2.854699001912964e-05, + "loss": 0.2241, + "step": 4530 + }, + { + "epoch": 0.4612414914152189, + "grad_norm": 2.078125, + "learning_rate": 2.846717565795376e-05, + "loss": 0.1541, + "step": 4540 + }, + { + "epoch": 0.46225744183683837, + "grad_norm": 0.81640625, + "learning_rate": 2.8387325248061164e-05, + "loss": 0.1718, + "step": 4550 + }, + { + "epoch": 0.46327339225845776, + "grad_norm": 5.6875, + "learning_rate": 2.8307439619667897e-05, + "loss": 0.259, + "step": 4560 + }, + { + "epoch": 0.4642893426800772, + "grad_norm": 1.78125, + "learning_rate": 2.8227519603356157e-05, + "loss": 0.2205, + "step": 4570 + }, + { + "epoch": 0.46530529310169666, + "grad_norm": 4.78125, + "learning_rate": 2.8147566030065677e-05, + "loss": 0.2256, + "step": 4580 + }, + { + "epoch": 0.46632124352331605, + "grad_norm": 3.296875, + "learning_rate": 2.8067579731085085e-05, + "loss": 0.1671, + "step": 4590 + }, + { + "epoch": 0.4673371939449355, + "grad_norm": 3.265625, + "learning_rate": 2.7987561538043273e-05, + "loss": 0.2471, + "step": 4600 + }, + { + "epoch": 0.4683531443665549, + "grad_norm": 3.390625, + "learning_rate": 2.7907512282900727e-05, + "loss": 0.1749, + "step": 4610 + }, + { + "epoch": 0.46936909478817435, + "grad_norm": 3.140625, + "learning_rate": 2.782743279794091e-05, + "loss": 0.2276, + "step": 4620 + }, + { + "epoch": 0.47038504520979374, + "grad_norm": 2.921875, + "learning_rate": 2.7747323915761574e-05, + "loss": 0.1971, + "step": 4630 + }, + { + "epoch": 0.4714009956314132, + "grad_norm": 4.15625, + "learning_rate": 2.7667186469266122e-05, + "loss": 0.1951, + "step": 4640 + }, + { + "epoch": 0.47241694605303264, + "grad_norm": 2.953125, + "learning_rate": 2.7587021291654924e-05, + "loss": 0.2045, + "step": 4650 + }, + { + "epoch": 0.47343289647465203, + "grad_norm": 1.6640625, + "learning_rate": 2.750682921641672e-05, + "loss": 0.155, + "step": 4660 + }, + { + "epoch": 0.4744488468962715, + "grad_norm": 4.375, + "learning_rate": 2.7426611077319864e-05, + "loss": 0.2038, + "step": 4670 + }, + { + "epoch": 0.4754647973178909, + "grad_norm": 5.5, + "learning_rate": 2.734636770840372e-05, + "loss": 0.159, + "step": 4680 + }, + { + "epoch": 0.4764807477395103, + "grad_norm": 1.703125, + "learning_rate": 2.7266099943969976e-05, + "loss": 0.1566, + "step": 4690 + }, + { + "epoch": 0.4774966981611297, + "grad_norm": 0.81640625, + "learning_rate": 2.7185808618573943e-05, + "loss": 0.1927, + "step": 4700 + }, + { + "epoch": 0.47851264858274917, + "grad_norm": 0.81640625, + "learning_rate": 2.710549456701592e-05, + "loss": 0.1873, + "step": 4710 + }, + { + "epoch": 0.47952859900436856, + "grad_norm": 3.828125, + "learning_rate": 2.702515862433247e-05, + "loss": 0.2474, + "step": 4720 + }, + { + "epoch": 0.480544549425988, + "grad_norm": 1.1640625, + "learning_rate": 2.6944801625787795e-05, + "loss": 0.204, + "step": 4730 + }, + { + "epoch": 0.48156049984760746, + "grad_norm": 2.953125, + "learning_rate": 2.6864424406864984e-05, + "loss": 0.1758, + "step": 4740 + }, + { + "epoch": 0.48257645026922685, + "grad_norm": 3.265625, + "learning_rate": 2.6784027803257377e-05, + "loss": 0.161, + "step": 4750 + }, + { + "epoch": 0.4835924006908463, + "grad_norm": 2.046875, + "learning_rate": 2.6703612650859848e-05, + "loss": 0.1469, + "step": 4760 + }, + { + "epoch": 0.4846083511124657, + "grad_norm": 4.03125, + "learning_rate": 2.6623179785760148e-05, + "loss": 0.1858, + "step": 4770 + }, + { + "epoch": 0.48562430153408515, + "grad_norm": 2.65625, + "learning_rate": 2.6542730044230175e-05, + "loss": 0.176, + "step": 4780 + }, + { + "epoch": 0.48664025195570454, + "grad_norm": 2.59375, + "learning_rate": 2.6462264262717278e-05, + "loss": 0.1657, + "step": 4790 + }, + { + "epoch": 0.487656202377324, + "grad_norm": 4.78125, + "learning_rate": 2.6381783277835605e-05, + "loss": 0.2705, + "step": 4800 + }, + { + "epoch": 0.48867215279894344, + "grad_norm": 3.65625, + "learning_rate": 2.6301287926357355e-05, + "loss": 0.2252, + "step": 4810 + }, + { + "epoch": 0.48968810322056283, + "grad_norm": 0.734375, + "learning_rate": 2.622077904520411e-05, + "loss": 0.2141, + "step": 4820 + }, + { + "epoch": 0.4907040536421823, + "grad_norm": 5.15625, + "learning_rate": 2.6140257471438108e-05, + "loss": 0.1935, + "step": 4830 + }, + { + "epoch": 0.4917200040638017, + "grad_norm": 3.625, + "learning_rate": 2.6059724042253574e-05, + "loss": 0.2121, + "step": 4840 + }, + { + "epoch": 0.4927359544854211, + "grad_norm": 1.2890625, + "learning_rate": 2.5979179594967983e-05, + "loss": 0.1221, + "step": 4850 + }, + { + "epoch": 0.4937519049070405, + "grad_norm": 3.4375, + "learning_rate": 2.5898624967013367e-05, + "loss": 0.2208, + "step": 4860 + }, + { + "epoch": 0.49476785532865997, + "grad_norm": 2.40625, + "learning_rate": 2.5818060995927607e-05, + "loss": 0.1904, + "step": 4870 + }, + { + "epoch": 0.49578380575027936, + "grad_norm": 2.921875, + "learning_rate": 2.573748851934574e-05, + "loss": 0.1658, + "step": 4880 + }, + { + "epoch": 0.4967997561718988, + "grad_norm": 1.6640625, + "learning_rate": 2.5656908374991213e-05, + "loss": 0.1626, + "step": 4890 + }, + { + "epoch": 0.49781570659351826, + "grad_norm": 1.8046875, + "learning_rate": 2.557632140066721e-05, + "loss": 0.1905, + "step": 4900 + }, + { + "epoch": 0.49883165701513765, + "grad_norm": 4.875, + "learning_rate": 2.5495728434247917e-05, + "loss": 0.2591, + "step": 4910 + }, + { + "epoch": 0.4998476074367571, + "grad_norm": 1.4453125, + "learning_rate": 2.5415130313669845e-05, + "loss": 0.1359, + "step": 4920 + }, + { + "epoch": 0.5008635578583766, + "grad_norm": 2.109375, + "learning_rate": 2.5334527876923063e-05, + "loss": 0.2353, + "step": 4930 + }, + { + "epoch": 0.501879508279996, + "grad_norm": 3.546875, + "learning_rate": 2.5253921962042525e-05, + "loss": 0.2173, + "step": 4940 + }, + { + "epoch": 0.5028954587016153, + "grad_norm": 1.8125, + "learning_rate": 2.5173313407099373e-05, + "loss": 0.1631, + "step": 4950 + }, + { + "epoch": 0.5039114091232347, + "grad_norm": 2.671875, + "learning_rate": 2.5092703050192163e-05, + "loss": 0.1884, + "step": 4960 + }, + { + "epoch": 0.5049273595448542, + "grad_norm": 2.5625, + "learning_rate": 2.501209172943819e-05, + "loss": 0.217, + "step": 4970 + }, + { + "epoch": 0.5059433099664736, + "grad_norm": 4.375, + "learning_rate": 2.49314802829648e-05, + "loss": 0.1854, + "step": 4980 + }, + { + "epoch": 0.506959260388093, + "grad_norm": 2.3125, + "learning_rate": 2.4850869548900628e-05, + "loss": 0.2049, + "step": 4990 + }, + { + "epoch": 0.5079752108097125, + "grad_norm": 3.859375, + "learning_rate": 2.477026036536688e-05, + "loss": 0.2093, + "step": 5000 + }, + { + "epoch": 0.5089911612313319, + "grad_norm": 1.09375, + "learning_rate": 2.4689653570468677e-05, + "loss": 0.164, + "step": 5010 + }, + { + "epoch": 0.5100071116529513, + "grad_norm": 3.40625, + "learning_rate": 2.460905000228628e-05, + "loss": 0.1649, + "step": 5020 + }, + { + "epoch": 0.5110230620745707, + "grad_norm": 3.546875, + "learning_rate": 2.4528450498866428e-05, + "loss": 0.1777, + "step": 5030 + }, + { + "epoch": 0.5120390124961902, + "grad_norm": 3.0, + "learning_rate": 2.444785589821356e-05, + "loss": 0.1505, + "step": 5040 + }, + { + "epoch": 0.5130549629178096, + "grad_norm": 1.6484375, + "learning_rate": 2.436726703828118e-05, + "loss": 0.2672, + "step": 5050 + }, + { + "epoch": 0.514070913339429, + "grad_norm": 4.34375, + "learning_rate": 2.428668475696308e-05, + "loss": 0.1756, + "step": 5060 + }, + { + "epoch": 0.5150868637610485, + "grad_norm": 2.78125, + "learning_rate": 2.420610989208465e-05, + "loss": 0.1655, + "step": 5070 + }, + { + "epoch": 0.5161028141826679, + "grad_norm": 1.4609375, + "learning_rate": 2.412554328139419e-05, + "loss": 0.1579, + "step": 5080 + }, + { + "epoch": 0.5171187646042873, + "grad_norm": 2.28125, + "learning_rate": 2.404498576255416e-05, + "loss": 0.1599, + "step": 5090 + }, + { + "epoch": 0.5181347150259067, + "grad_norm": 0.6484375, + "learning_rate": 2.3964438173132522e-05, + "loss": 0.1508, + "step": 5100 + }, + { + "epoch": 0.5191506654475262, + "grad_norm": 3.390625, + "learning_rate": 2.388390135059395e-05, + "loss": 0.1578, + "step": 5110 + }, + { + "epoch": 0.5201666158691456, + "grad_norm": 1.21875, + "learning_rate": 2.3803376132291226e-05, + "loss": 0.1374, + "step": 5120 + }, + { + "epoch": 0.521182566290765, + "grad_norm": 4.0625, + "learning_rate": 2.3722863355456436e-05, + "loss": 0.1854, + "step": 5130 + }, + { + "epoch": 0.5221985167123845, + "grad_norm": 4.71875, + "learning_rate": 2.364236385719236e-05, + "loss": 0.1391, + "step": 5140 + }, + { + "epoch": 0.5232144671340039, + "grad_norm": 3.296875, + "learning_rate": 2.356187847446366e-05, + "loss": 0.2106, + "step": 5150 + }, + { + "epoch": 0.5242304175556233, + "grad_norm": 3.296875, + "learning_rate": 2.348140804408829e-05, + "loss": 0.2383, + "step": 5160 + }, + { + "epoch": 0.5252463679772427, + "grad_norm": 3.359375, + "learning_rate": 2.3400953402728713e-05, + "loss": 0.1537, + "step": 5170 + }, + { + "epoch": 0.5262623183988622, + "grad_norm": 1.4921875, + "learning_rate": 2.332051538688322e-05, + "loss": 0.1841, + "step": 5180 + }, + { + "epoch": 0.5272782688204816, + "grad_norm": 3.25, + "learning_rate": 2.3240094832877287e-05, + "loss": 0.1855, + "step": 5190 + }, + { + "epoch": 0.528294219242101, + "grad_norm": 3.34375, + "learning_rate": 2.3159692576854793e-05, + "loss": 0.2625, + "step": 5200 + }, + { + "epoch": 0.5293101696637205, + "grad_norm": 3.6875, + "learning_rate": 2.3079309454769413e-05, + "loss": 0.1292, + "step": 5210 + }, + { + "epoch": 0.5303261200853399, + "grad_norm": 1.1171875, + "learning_rate": 2.2998946302375827e-05, + "loss": 0.1263, + "step": 5220 + }, + { + "epoch": 0.5313420705069593, + "grad_norm": 2.71875, + "learning_rate": 2.2918603955221148e-05, + "loss": 0.2296, + "step": 5230 + }, + { + "epoch": 0.5323580209285786, + "grad_norm": 2.015625, + "learning_rate": 2.283828324863613e-05, + "loss": 0.1231, + "step": 5240 + }, + { + "epoch": 0.5333739713501982, + "grad_norm": 3.671875, + "learning_rate": 2.2757985017726557e-05, + "loss": 0.1939, + "step": 5250 + }, + { + "epoch": 0.5343899217718175, + "grad_norm": 1.9765625, + "learning_rate": 2.2677710097364495e-05, + "loss": 0.168, + "step": 5260 + }, + { + "epoch": 0.5354058721934369, + "grad_norm": 2.609375, + "learning_rate": 2.259745932217969e-05, + "loss": 0.1883, + "step": 5270 + }, + { + "epoch": 0.5364218226150563, + "grad_norm": 2.8125, + "learning_rate": 2.2517233526550817e-05, + "loss": 0.1898, + "step": 5280 + }, + { + "epoch": 0.5374377730366758, + "grad_norm": 3.125, + "learning_rate": 2.2437033544596837e-05, + "loss": 0.1838, + "step": 5290 + }, + { + "epoch": 0.5384537234582952, + "grad_norm": 4.90625, + "learning_rate": 2.2356860210168336e-05, + "loss": 0.1553, + "step": 5300 + }, + { + "epoch": 0.5394696738799146, + "grad_norm": 3.171875, + "learning_rate": 2.2276714356838824e-05, + "loss": 0.2248, + "step": 5310 + }, + { + "epoch": 0.5404856243015341, + "grad_norm": 1.34375, + "learning_rate": 2.2196596817896118e-05, + "loss": 0.1421, + "step": 5320 + }, + { + "epoch": 0.5415015747231535, + "grad_norm": 3.28125, + "learning_rate": 2.2116508426333596e-05, + "loss": 0.1947, + "step": 5330 + }, + { + "epoch": 0.5425175251447729, + "grad_norm": 1.9296875, + "learning_rate": 2.2036450014841652e-05, + "loss": 0.2207, + "step": 5340 + }, + { + "epoch": 0.5435334755663923, + "grad_norm": 0.5703125, + "learning_rate": 2.19564224157989e-05, + "loss": 0.2208, + "step": 5350 + }, + { + "epoch": 0.5445494259880118, + "grad_norm": 7.5625, + "learning_rate": 2.1876426461263654e-05, + "loss": 0.1739, + "step": 5360 + }, + { + "epoch": 0.5455653764096312, + "grad_norm": 2.15625, + "learning_rate": 2.179646298296519e-05, + "loss": 0.1938, + "step": 5370 + }, + { + "epoch": 0.5465813268312506, + "grad_norm": 4.1875, + "learning_rate": 2.171653281229511e-05, + "loss": 0.1736, + "step": 5380 + }, + { + "epoch": 0.5475972772528701, + "grad_norm": 4.65625, + "learning_rate": 2.1636636780298732e-05, + "loss": 0.2167, + "step": 5390 + }, + { + "epoch": 0.5486132276744895, + "grad_norm": 1.84375, + "learning_rate": 2.1556775717666427e-05, + "loss": 0.1711, + "step": 5400 + }, + { + "epoch": 0.5496291780961089, + "grad_norm": 5.125, + "learning_rate": 2.147695045472499e-05, + "loss": 0.1789, + "step": 5410 + }, + { + "epoch": 0.5506451285177283, + "grad_norm": 3.859375, + "learning_rate": 2.1397161821428973e-05, + "loss": 0.2187, + "step": 5420 + }, + { + "epoch": 0.5516610789393478, + "grad_norm": 2.25, + "learning_rate": 2.131741064735212e-05, + "loss": 0.1367, + "step": 5430 + }, + { + "epoch": 0.5526770293609672, + "grad_norm": 4.65625, + "learning_rate": 2.1237697761678684e-05, + "loss": 0.1574, + "step": 5440 + }, + { + "epoch": 0.5536929797825866, + "grad_norm": 1.2265625, + "learning_rate": 2.1158023993194848e-05, + "loss": 0.1301, + "step": 5450 + }, + { + "epoch": 0.5547089302042061, + "grad_norm": 4.21875, + "learning_rate": 2.107839017028005e-05, + "loss": 0.2782, + "step": 5460 + }, + { + "epoch": 0.5557248806258255, + "grad_norm": 0.52734375, + "learning_rate": 2.0998797120898457e-05, + "loss": 0.2024, + "step": 5470 + }, + { + "epoch": 0.5567408310474449, + "grad_norm": 1.46875, + "learning_rate": 2.0919245672590277e-05, + "loss": 0.1755, + "step": 5480 + }, + { + "epoch": 0.5577567814690643, + "grad_norm": 2.140625, + "learning_rate": 2.083973665246318e-05, + "loss": 0.2058, + "step": 5490 + }, + { + "epoch": 0.5587727318906838, + "grad_norm": 1.5390625, + "learning_rate": 2.076027088718373e-05, + "loss": 0.2159, + "step": 5500 + }, + { + "epoch": 0.5597886823123032, + "grad_norm": 1.9921875, + "learning_rate": 2.0680849202968743e-05, + "loss": 0.2139, + "step": 5510 + }, + { + "epoch": 0.5608046327339226, + "grad_norm": 2.4375, + "learning_rate": 2.060147242557674e-05, + "loss": 0.183, + "step": 5520 + }, + { + "epoch": 0.5618205831555421, + "grad_norm": 5.5, + "learning_rate": 2.0522141380299308e-05, + "loss": 0.1673, + "step": 5530 + }, + { + "epoch": 0.5628365335771615, + "grad_norm": 4.25, + "learning_rate": 2.044285689195258e-05, + "loss": 0.1674, + "step": 5540 + }, + { + "epoch": 0.5638524839987809, + "grad_norm": 2.109375, + "learning_rate": 2.0363619784868604e-05, + "loss": 0.1531, + "step": 5550 + }, + { + "epoch": 0.5648684344204002, + "grad_norm": 2.59375, + "learning_rate": 2.0284430882886836e-05, + "loss": 0.1665, + "step": 5560 + }, + { + "epoch": 0.5658843848420197, + "grad_norm": 3.984375, + "learning_rate": 2.020529100934549e-05, + "loss": 0.1717, + "step": 5570 + }, + { + "epoch": 0.5669003352636391, + "grad_norm": 1.6015625, + "learning_rate": 2.012620098707306e-05, + "loss": 0.1167, + "step": 5580 + }, + { + "epoch": 0.5679162856852585, + "grad_norm": 6.0625, + "learning_rate": 2.004716163837972e-05, + "loss": 0.2084, + "step": 5590 + }, + { + "epoch": 0.5689322361068779, + "grad_norm": 2.5625, + "learning_rate": 1.996817378504876e-05, + "loss": 0.1939, + "step": 5600 + }, + { + "epoch": 0.5699481865284974, + "grad_norm": 3.109375, + "learning_rate": 1.9889238248328108e-05, + "loss": 0.1241, + "step": 5610 + }, + { + "epoch": 0.5709641369501168, + "grad_norm": 4.875, + "learning_rate": 1.981035584892171e-05, + "loss": 0.1865, + "step": 5620 + }, + { + "epoch": 0.5719800873717362, + "grad_norm": 2.984375, + "learning_rate": 1.9731527406981072e-05, + "loss": 0.1639, + "step": 5630 + }, + { + "epoch": 0.5729960377933557, + "grad_norm": 4.4375, + "learning_rate": 1.9652753742096655e-05, + "loss": 0.2019, + "step": 5640 + }, + { + "epoch": 0.5740119882149751, + "grad_norm": 4.3125, + "learning_rate": 1.9574035673289432e-05, + "loss": 0.1829, + "step": 5650 + }, + { + "epoch": 0.5750279386365945, + "grad_norm": 3.203125, + "learning_rate": 1.9495374019002312e-05, + "loss": 0.2267, + "step": 5660 + }, + { + "epoch": 0.5760438890582139, + "grad_norm": 1.765625, + "learning_rate": 1.9416769597091673e-05, + "loss": 0.1411, + "step": 5670 + }, + { + "epoch": 0.5770598394798334, + "grad_norm": 2.640625, + "learning_rate": 1.9338223224818818e-05, + "loss": 0.1476, + "step": 5680 + }, + { + "epoch": 0.5780757899014528, + "grad_norm": 4.84375, + "learning_rate": 1.9259735718841524e-05, + "loss": 0.1417, + "step": 5690 + }, + { + "epoch": 0.5790917403230722, + "grad_norm": 2.421875, + "learning_rate": 1.918130789520551e-05, + "loss": 0.1592, + "step": 5700 + }, + { + "epoch": 0.5801076907446917, + "grad_norm": 2.984375, + "learning_rate": 1.9102940569335963e-05, + "loss": 0.161, + "step": 5710 + }, + { + "epoch": 0.5811236411663111, + "grad_norm": 1.0234375, + "learning_rate": 1.9024634556029093e-05, + "loss": 0.1614, + "step": 5720 + }, + { + "epoch": 0.5821395915879305, + "grad_norm": 2.90625, + "learning_rate": 1.89463906694436e-05, + "loss": 0.1505, + "step": 5730 + }, + { + "epoch": 0.5831555420095499, + "grad_norm": 2.875, + "learning_rate": 1.8868209723092286e-05, + "loss": 0.1674, + "step": 5740 + }, + { + "epoch": 0.5841714924311694, + "grad_norm": 0.408203125, + "learning_rate": 1.8790092529833508e-05, + "loss": 0.1468, + "step": 5750 + }, + { + "epoch": 0.5851874428527888, + "grad_norm": 5.1875, + "learning_rate": 1.871203990186281e-05, + "loss": 0.1903, + "step": 5760 + }, + { + "epoch": 0.5862033932744082, + "grad_norm": 0.5546875, + "learning_rate": 1.8634052650704415e-05, + "loss": 0.2644, + "step": 5770 + }, + { + "epoch": 0.5872193436960277, + "grad_norm": 3.203125, + "learning_rate": 1.8556131587202848e-05, + "loss": 0.1968, + "step": 5780 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 2.484375, + "learning_rate": 1.8478277521514424e-05, + "loss": 0.2249, + "step": 5790 + }, + { + "epoch": 0.5892512445392665, + "grad_norm": 4.0, + "learning_rate": 1.8400491263098906e-05, + "loss": 0.1881, + "step": 5800 + }, + { + "epoch": 0.5902671949608859, + "grad_norm": 1.90625, + "learning_rate": 1.832277362071106e-05, + "loss": 0.1352, + "step": 5810 + }, + { + "epoch": 0.5912831453825054, + "grad_norm": 2.765625, + "learning_rate": 1.824512540239221e-05, + "loss": 0.2737, + "step": 5820 + }, + { + "epoch": 0.5922990958041248, + "grad_norm": 2.609375, + "learning_rate": 1.81675474154619e-05, + "loss": 0.1566, + "step": 5830 + }, + { + "epoch": 0.5933150462257442, + "grad_norm": 2.6875, + "learning_rate": 1.8090040466509444e-05, + "loss": 0.1999, + "step": 5840 + }, + { + "epoch": 0.5943309966473637, + "grad_norm": 2.609375, + "learning_rate": 1.8012605361385592e-05, + "loss": 0.2372, + "step": 5850 + }, + { + "epoch": 0.5953469470689831, + "grad_norm": 8.125, + "learning_rate": 1.7935242905194087e-05, + "loss": 0.2411, + "step": 5860 + }, + { + "epoch": 0.5963628974906025, + "grad_norm": 3.46875, + "learning_rate": 1.785795390228336e-05, + "loss": 0.138, + "step": 5870 + }, + { + "epoch": 0.5973788479122218, + "grad_norm": 2.3125, + "learning_rate": 1.7780739156238125e-05, + "loss": 0.1867, + "step": 5880 + }, + { + "epoch": 0.5983947983338413, + "grad_norm": 4.0625, + "learning_rate": 1.770359946987105e-05, + "loss": 0.2091, + "step": 5890 + }, + { + "epoch": 0.5994107487554607, + "grad_norm": 5.21875, + "learning_rate": 1.7626535645214378e-05, + "loss": 0.2091, + "step": 5900 + }, + { + "epoch": 0.6004266991770801, + "grad_norm": 3.15625, + "learning_rate": 1.7549548483511614e-05, + "loss": 0.1927, + "step": 5910 + }, + { + "epoch": 0.6014426495986995, + "grad_norm": 4.71875, + "learning_rate": 1.7472638785209198e-05, + "loss": 0.1893, + "step": 5920 + }, + { + "epoch": 0.602458600020319, + "grad_norm": 3.015625, + "learning_rate": 1.7395807349948145e-05, + "loss": 0.1557, + "step": 5930 + }, + { + "epoch": 0.6034745504419384, + "grad_norm": 2.9375, + "learning_rate": 1.73190549765558e-05, + "loss": 0.1717, + "step": 5940 + }, + { + "epoch": 0.6044905008635578, + "grad_norm": 3.109375, + "learning_rate": 1.724238246303745e-05, + "loss": 0.1879, + "step": 5950 + }, + { + "epoch": 0.6055064512851773, + "grad_norm": 3.875, + "learning_rate": 1.71657906065681e-05, + "loss": 0.1908, + "step": 5960 + }, + { + "epoch": 0.6065224017067967, + "grad_norm": 5.09375, + "learning_rate": 1.7089280203484115e-05, + "loss": 0.1712, + "step": 5970 + }, + { + "epoch": 0.6075383521284161, + "grad_norm": 3.015625, + "learning_rate": 1.701285204927502e-05, + "loss": 0.1454, + "step": 5980 + }, + { + "epoch": 0.6085543025500355, + "grad_norm": 3.265625, + "learning_rate": 1.693650693857515e-05, + "loss": 0.2283, + "step": 5990 + }, + { + "epoch": 0.609570252971655, + "grad_norm": 3.40625, + "learning_rate": 1.6860245665155466e-05, + "loss": 0.2188, + "step": 6000 + } + ], + "logging_steps": 10, + "max_steps": 9843, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoints/checkpoint-6000/training_args.bin b/checkpoints/checkpoint-6000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bb0a4ec10ab29ca1942aecdfa4212d352e373f1 --- /dev/null +++ b/checkpoints/checkpoint-6000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 +size 4984 diff --git a/checkpoints/checkpoint-6500/adapter_config.json b/checkpoints/checkpoint-6500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70 --- /dev/null +++ b/checkpoints/checkpoint-6500/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "gaussian", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoints/checkpoint-6500/adapter_model.safetensors b/checkpoints/checkpoint-6500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ea7be9ee67cb55626a3d8d3ef209fc4f1bbf148f --- /dev/null +++ b/checkpoints/checkpoint-6500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14ab2e0853dd44a4b03a9d15251deba57b15ed3261fc5e75ea0c78d8f9481486 +size 5919456 diff --git a/checkpoints/checkpoint-6500/optimizer.pt b/checkpoints/checkpoint-6500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5ce7862baad54ec5623f4b4c5b4996f11f14e8c1 --- /dev/null +++ b/checkpoints/checkpoint-6500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7047b3d5af563c786060cdd116349c114053ee2307bebbbf5f5214200bcde1ce +size 11930938 diff --git a/checkpoints/checkpoint-6500/rng_state_0.pth b/checkpoints/checkpoint-6500/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..0e8e09c2f4a00e6486fc03f4d8a2e8a9eb1caac7 --- /dev/null +++ b/checkpoints/checkpoint-6500/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a591de0feafd38f3dfb34a5639b5f07fc69364d8918cb41bee6dd6766d4ef1d2 +size 15024 diff --git a/checkpoints/checkpoint-6500/rng_state_1.pth b/checkpoints/checkpoint-6500/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0b9a1717d7d349f476f13e5192f06dc234ca82ea --- /dev/null +++ b/checkpoints/checkpoint-6500/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da06f3a5ac82888a33cdffca90c924defae45ba6fd9ff5004219e2c9f4170e79 +size 15024 diff --git a/checkpoints/checkpoint-6500/rng_state_2.pth b/checkpoints/checkpoint-6500/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..504d07665b63dfacbdb864145313ac5c3fee5a57 --- /dev/null +++ b/checkpoints/checkpoint-6500/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d391446f8c14431e9c25329555ff1b3c6971fcfbfabf701b23c665e748c381f +size 15024 diff --git a/checkpoints/checkpoint-6500/rng_state_3.pth b/checkpoints/checkpoint-6500/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..fcf06f5ee914106315184abc1f4e43a3e0dc6a3e --- /dev/null +++ b/checkpoints/checkpoint-6500/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34b3a168af7b46a8a4885487b4c51f6f868f8ac39e0a20191646e4671ae739ea +size 15024 diff --git a/checkpoints/checkpoint-6500/scheduler.pt b/checkpoints/checkpoint-6500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..bed16b1cc961349a026d0a7300f4c19a4a4cb3b3 --- /dev/null +++ b/checkpoints/checkpoint-6500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bdea953796bedc1929c6f94c7498a5dbf5c6f421163832ce66f06c7e038f707 +size 1064 diff --git a/checkpoints/checkpoint-6500/trainer_state.json b/checkpoints/checkpoint-6500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1e3fe36bbb17bb1b607af62a6682ab9d45672e47 --- /dev/null +++ b/checkpoints/checkpoint-6500/trainer_state.json @@ -0,0 +1,4571 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6603677740526263, + "eval_steps": 500, + "global_step": 6500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001015950421619425, + "grad_norm": 17.625, + "learning_rate": 5e-06, + "loss": 3.4264, + "step": 10 + }, + { + "epoch": 0.00203190084323885, + "grad_norm": 12.5625, + "learning_rate": 1e-05, + "loss": 3.432, + "step": 20 + }, + { + "epoch": 0.003047851264858275, + "grad_norm": 14.0625, + "learning_rate": 1.5e-05, + "loss": 3.23, + "step": 30 + }, + { + "epoch": 0.0040638016864777, + "grad_norm": 12.4375, + "learning_rate": 2e-05, + "loss": 2.9762, + "step": 40 + }, + { + "epoch": 0.005079752108097125, + "grad_norm": 10.0625, + "learning_rate": 2.5e-05, + "loss": 2.6173, + "step": 50 + }, + { + "epoch": 0.00609570252971655, + "grad_norm": 10.1875, + "learning_rate": 3e-05, + "loss": 2.2004, + "step": 60 + }, + { + "epoch": 0.007111652951335975, + "grad_norm": 7.03125, + "learning_rate": 3.5e-05, + "loss": 1.4176, + "step": 70 + }, + { + "epoch": 0.0081276033729554, + "grad_norm": 4.375, + "learning_rate": 4e-05, + "loss": 1.0122, + "step": 80 + }, + { + "epoch": 0.009143553794574825, + "grad_norm": 6.5625, + "learning_rate": 4.5e-05, + "loss": 0.9116, + "step": 90 + }, + { + "epoch": 0.01015950421619425, + "grad_norm": 5.28125, + "learning_rate": 5e-05, + "loss": 0.6832, + "step": 100 + }, + { + "epoch": 0.011175454637813675, + "grad_norm": 5.5, + "learning_rate": 4.9999870035728426e-05, + "loss": 0.7355, + "step": 110 + }, + { + "epoch": 0.0121914050594331, + "grad_norm": 5.1875, + "learning_rate": 4.9999480144264944e-05, + "loss": 0.6673, + "step": 120 + }, + { + "epoch": 0.013207355481052525, + "grad_norm": 4.5, + "learning_rate": 4.9998830329663314e-05, + "loss": 0.6792, + "step": 130 + }, + { + "epoch": 0.01422330590267195, + "grad_norm": 3.9375, + "learning_rate": 4.9997920598679756e-05, + "loss": 0.6207, + "step": 140 + }, + { + "epoch": 0.015239256324291375, + "grad_norm": 3.15625, + "learning_rate": 4.999675096077286e-05, + "loss": 0.483, + "step": 150 + }, + { + "epoch": 0.0162552067459108, + "grad_norm": 5.28125, + "learning_rate": 4.999532142810354e-05, + "loss": 0.5319, + "step": 160 + }, + { + "epoch": 0.017271157167530225, + "grad_norm": 4.59375, + "learning_rate": 4.999363201553483e-05, + "loss": 0.6052, + "step": 170 + }, + { + "epoch": 0.01828710758914965, + "grad_norm": 5.03125, + "learning_rate": 4.9991682740631794e-05, + "loss": 0.4258, + "step": 180 + }, + { + "epoch": 0.019303058010769075, + "grad_norm": 3.859375, + "learning_rate": 4.998947362366133e-05, + "loss": 0.4309, + "step": 190 + }, + { + "epoch": 0.0203190084323885, + "grad_norm": 3.328125, + "learning_rate": 4.998700468759193e-05, + "loss": 0.3957, + "step": 200 + }, + { + "epoch": 0.021334958854007924, + "grad_norm": 4.9375, + "learning_rate": 4.9984275958093475e-05, + "loss": 0.4777, + "step": 210 + }, + { + "epoch": 0.02235090927562735, + "grad_norm": 4.78125, + "learning_rate": 4.998128746353695e-05, + "loss": 0.3549, + "step": 220 + }, + { + "epoch": 0.023366859697246774, + "grad_norm": 4.0625, + "learning_rate": 4.997803923499417e-05, + "loss": 0.4447, + "step": 230 + }, + { + "epoch": 0.0243828101188662, + "grad_norm": 6.375, + "learning_rate": 4.99745313062374e-05, + "loss": 0.3808, + "step": 240 + }, + { + "epoch": 0.025398760540485624, + "grad_norm": 3.59375, + "learning_rate": 4.99707637137391e-05, + "loss": 0.3827, + "step": 250 + }, + { + "epoch": 0.02641471096210505, + "grad_norm": 3.015625, + "learning_rate": 4.996673649667145e-05, + "loss": 0.3694, + "step": 260 + }, + { + "epoch": 0.027430661383724474, + "grad_norm": 2.296875, + "learning_rate": 4.9962449696906e-05, + "loss": 0.3586, + "step": 270 + }, + { + "epoch": 0.0284466118053439, + "grad_norm": 4.125, + "learning_rate": 4.9957903359013214e-05, + "loss": 0.3832, + "step": 280 + }, + { + "epoch": 0.029462562226963324, + "grad_norm": 3.296875, + "learning_rate": 4.995309753026201e-05, + "loss": 0.328, + "step": 290 + }, + { + "epoch": 0.03047851264858275, + "grad_norm": 4.5, + "learning_rate": 4.994803226061927e-05, + "loss": 0.3667, + "step": 300 + }, + { + "epoch": 0.03149446307020217, + "grad_norm": 4.3125, + "learning_rate": 4.994270760274933e-05, + "loss": 0.3811, + "step": 310 + }, + { + "epoch": 0.0325104134918216, + "grad_norm": 3.421875, + "learning_rate": 4.99371236120134e-05, + "loss": 0.3065, + "step": 320 + }, + { + "epoch": 0.03352636391344102, + "grad_norm": 4.6875, + "learning_rate": 4.993128034646902e-05, + "loss": 0.4177, + "step": 330 + }, + { + "epoch": 0.03454231433506045, + "grad_norm": 3.046875, + "learning_rate": 4.992517786686947e-05, + "loss": 0.33, + "step": 340 + }, + { + "epoch": 0.03555826475667987, + "grad_norm": 1.8828125, + "learning_rate": 4.9918816236663077e-05, + "loss": 0.3287, + "step": 350 + }, + { + "epoch": 0.0365742151782993, + "grad_norm": 3.8125, + "learning_rate": 4.991219552199262e-05, + "loss": 0.2934, + "step": 360 + }, + { + "epoch": 0.03759016559991872, + "grad_norm": 4.28125, + "learning_rate": 4.99053157916946e-05, + "loss": 0.3176, + "step": 370 + }, + { + "epoch": 0.03860611602153815, + "grad_norm": 2.609375, + "learning_rate": 4.989817711729856e-05, + "loss": 0.3318, + "step": 380 + }, + { + "epoch": 0.03962206644315757, + "grad_norm": 2.375, + "learning_rate": 4.98907795730263e-05, + "loss": 0.3234, + "step": 390 + }, + { + "epoch": 0.040638016864777, + "grad_norm": 4.46875, + "learning_rate": 4.988312323579114e-05, + "loss": 0.267, + "step": 400 + }, + { + "epoch": 0.04165396728639642, + "grad_norm": 3.75, + "learning_rate": 4.98752081851971e-05, + "loss": 0.3081, + "step": 410 + }, + { + "epoch": 0.04266991770801585, + "grad_norm": 2.203125, + "learning_rate": 4.986703450353809e-05, + "loss": 0.2917, + "step": 420 + }, + { + "epoch": 0.04368586812963527, + "grad_norm": 1.6015625, + "learning_rate": 4.985860227579703e-05, + "loss": 0.2805, + "step": 430 + }, + { + "epoch": 0.0447018185512547, + "grad_norm": 3.140625, + "learning_rate": 4.984991158964499e-05, + "loss": 0.3534, + "step": 440 + }, + { + "epoch": 0.04571776897287412, + "grad_norm": 3.296875, + "learning_rate": 4.9840962535440265e-05, + "loss": 0.335, + "step": 450 + }, + { + "epoch": 0.04673371939449355, + "grad_norm": 3.25, + "learning_rate": 4.983175520622744e-05, + "loss": 0.2544, + "step": 460 + }, + { + "epoch": 0.04774966981611297, + "grad_norm": 2.25, + "learning_rate": 4.982228969773642e-05, + "loss": 0.3449, + "step": 470 + }, + { + "epoch": 0.0487656202377324, + "grad_norm": 4.9375, + "learning_rate": 4.9812566108381435e-05, + "loss": 0.2964, + "step": 480 + }, + { + "epoch": 0.04978157065935182, + "grad_norm": 1.5703125, + "learning_rate": 4.9802584539260035e-05, + "loss": 0.2799, + "step": 490 + }, + { + "epoch": 0.05079752108097125, + "grad_norm": 2.828125, + "learning_rate": 4.979234509415199e-05, + "loss": 0.3231, + "step": 500 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 2.9375, + "learning_rate": 4.978184787951828e-05, + "loss": 0.2943, + "step": 510 + }, + { + "epoch": 0.0528294219242101, + "grad_norm": 2.34375, + "learning_rate": 4.977109300449992e-05, + "loss": 0.2705, + "step": 520 + }, + { + "epoch": 0.05384537234582952, + "grad_norm": 3.140625, + "learning_rate": 4.9760080580916876e-05, + "loss": 0.2998, + "step": 530 + }, + { + "epoch": 0.05486132276744895, + "grad_norm": 3.5625, + "learning_rate": 4.974881072326688e-05, + "loss": 0.2595, + "step": 540 + }, + { + "epoch": 0.05587727318906837, + "grad_norm": 4.25, + "learning_rate": 4.9737283548724236e-05, + "loss": 0.2803, + "step": 550 + }, + { + "epoch": 0.0568932236106878, + "grad_norm": 4.0625, + "learning_rate": 4.97254991771386e-05, + "loss": 0.3511, + "step": 560 + }, + { + "epoch": 0.05790917403230722, + "grad_norm": 2.515625, + "learning_rate": 4.971345773103377e-05, + "loss": 0.312, + "step": 570 + }, + { + "epoch": 0.05892512445392665, + "grad_norm": 3.21875, + "learning_rate": 4.9701159335606365e-05, + "loss": 0.2482, + "step": 580 + }, + { + "epoch": 0.05994107487554607, + "grad_norm": 5.5, + "learning_rate": 4.968860411872454e-05, + "loss": 0.2537, + "step": 590 + }, + { + "epoch": 0.0609570252971655, + "grad_norm": 3.546875, + "learning_rate": 4.967579221092666e-05, + "loss": 0.3125, + "step": 600 + }, + { + "epoch": 0.06197297571878492, + "grad_norm": 2.984375, + "learning_rate": 4.966272374541996e-05, + "loss": 0.2354, + "step": 610 + }, + { + "epoch": 0.06298892614040434, + "grad_norm": 3.6875, + "learning_rate": 4.964939885807912e-05, + "loss": 0.3213, + "step": 620 + }, + { + "epoch": 0.06400487656202378, + "grad_norm": 2.140625, + "learning_rate": 4.9635817687444876e-05, + "loss": 0.3003, + "step": 630 + }, + { + "epoch": 0.0650208269836432, + "grad_norm": 3.484375, + "learning_rate": 4.962198037472259e-05, + "loss": 0.2996, + "step": 640 + }, + { + "epoch": 0.06603677740526262, + "grad_norm": 3.21875, + "learning_rate": 4.9607887063780776e-05, + "loss": 0.2257, + "step": 650 + }, + { + "epoch": 0.06705272782688204, + "grad_norm": 5.375, + "learning_rate": 4.9593537901149564e-05, + "loss": 0.223, + "step": 660 + }, + { + "epoch": 0.06806867824850148, + "grad_norm": 4.1875, + "learning_rate": 4.957893303601924e-05, + "loss": 0.3407, + "step": 670 + }, + { + "epoch": 0.0690846286701209, + "grad_norm": 3.328125, + "learning_rate": 4.956407262023866e-05, + "loss": 0.2589, + "step": 680 + }, + { + "epoch": 0.07010057909174032, + "grad_norm": 2.953125, + "learning_rate": 4.954895680831367e-05, + "loss": 0.2949, + "step": 690 + }, + { + "epoch": 0.07111652951335974, + "grad_norm": 4.0625, + "learning_rate": 4.9533585757405506e-05, + "loss": 0.2995, + "step": 700 + }, + { + "epoch": 0.07213247993497918, + "grad_norm": 4.625, + "learning_rate": 4.951795962732917e-05, + "loss": 0.2894, + "step": 710 + }, + { + "epoch": 0.0731484303565986, + "grad_norm": 3.0, + "learning_rate": 4.9502078580551755e-05, + "loss": 0.3082, + "step": 720 + }, + { + "epoch": 0.07416438077821802, + "grad_norm": 3.65625, + "learning_rate": 4.9485942782190734e-05, + "loss": 0.2308, + "step": 730 + }, + { + "epoch": 0.07518033119983744, + "grad_norm": 4.78125, + "learning_rate": 4.9469552400012306e-05, + "loss": 0.2272, + "step": 740 + }, + { + "epoch": 0.07619628162145688, + "grad_norm": 4.25, + "learning_rate": 4.94529076044296e-05, + "loss": 0.2701, + "step": 750 + }, + { + "epoch": 0.0772122320430763, + "grad_norm": 3.140625, + "learning_rate": 4.94360085685009e-05, + "loss": 0.2686, + "step": 760 + }, + { + "epoch": 0.07822818246469572, + "grad_norm": 0.765625, + "learning_rate": 4.9418855467927894e-05, + "loss": 0.2051, + "step": 770 + }, + { + "epoch": 0.07924413288631514, + "grad_norm": 1.796875, + "learning_rate": 4.940144848105379e-05, + "loss": 0.2267, + "step": 780 + }, + { + "epoch": 0.08026008330793458, + "grad_norm": 4.5625, + "learning_rate": 4.93837877888615e-05, + "loss": 0.2597, + "step": 790 + }, + { + "epoch": 0.081276033729554, + "grad_norm": 3.03125, + "learning_rate": 4.9365873574971745e-05, + "loss": 0.3701, + "step": 800 + }, + { + "epoch": 0.08229198415117342, + "grad_norm": 4.5625, + "learning_rate": 4.9347706025641136e-05, + "loss": 0.2559, + "step": 810 + }, + { + "epoch": 0.08330793457279284, + "grad_norm": 3.90625, + "learning_rate": 4.9329285329760275e-05, + "loss": 0.2799, + "step": 820 + }, + { + "epoch": 0.08432388499441228, + "grad_norm": 3.140625, + "learning_rate": 4.9310611678851735e-05, + "loss": 0.2866, + "step": 830 + }, + { + "epoch": 0.0853398354160317, + "grad_norm": 2.46875, + "learning_rate": 4.929168526706811e-05, + "loss": 0.3105, + "step": 840 + }, + { + "epoch": 0.08635578583765112, + "grad_norm": 13.625, + "learning_rate": 4.927250629119e-05, + "loss": 0.2454, + "step": 850 + }, + { + "epoch": 0.08737173625927054, + "grad_norm": 3.921875, + "learning_rate": 4.9253074950623925e-05, + "loss": 0.2424, + "step": 860 + }, + { + "epoch": 0.08838768668088998, + "grad_norm": 2.90625, + "learning_rate": 4.9233391447400286e-05, + "loss": 0.2481, + "step": 870 + }, + { + "epoch": 0.0894036371025094, + "grad_norm": 2.96875, + "learning_rate": 4.921345598617125e-05, + "loss": 0.2231, + "step": 880 + }, + { + "epoch": 0.09041958752412882, + "grad_norm": 5.375, + "learning_rate": 4.9193268774208654e-05, + "loss": 0.3447, + "step": 890 + }, + { + "epoch": 0.09143553794574824, + "grad_norm": 2.0, + "learning_rate": 4.9172830021401785e-05, + "loss": 0.229, + "step": 900 + }, + { + "epoch": 0.09245148836736768, + "grad_norm": 3.1875, + "learning_rate": 4.9152139940255245e-05, + "loss": 0.2122, + "step": 910 + }, + { + "epoch": 0.0934674387889871, + "grad_norm": 3.40625, + "learning_rate": 4.913119874588677e-05, + "loss": 0.2386, + "step": 920 + }, + { + "epoch": 0.09448338921060652, + "grad_norm": 1.4609375, + "learning_rate": 4.911000665602489e-05, + "loss": 0.1944, + "step": 930 + }, + { + "epoch": 0.09549933963222594, + "grad_norm": 5.0625, + "learning_rate": 4.9088563891006786e-05, + "loss": 0.2038, + "step": 940 + }, + { + "epoch": 0.09651529005384538, + "grad_norm": 4.53125, + "learning_rate": 4.906687067377592e-05, + "loss": 0.3122, + "step": 950 + }, + { + "epoch": 0.0975312404754648, + "grad_norm": 2.84375, + "learning_rate": 4.904492722987976e-05, + "loss": 0.3157, + "step": 960 + }, + { + "epoch": 0.09854719089708422, + "grad_norm": 2.171875, + "learning_rate": 4.902273378746738e-05, + "loss": 0.3077, + "step": 970 + }, + { + "epoch": 0.09956314131870364, + "grad_norm": 2.84375, + "learning_rate": 4.9000290577287165e-05, + "loss": 0.2756, + "step": 980 + }, + { + "epoch": 0.10057909174032308, + "grad_norm": 0.99609375, + "learning_rate": 4.897759783268434e-05, + "loss": 0.2915, + "step": 990 + }, + { + "epoch": 0.1015950421619425, + "grad_norm": 3.53125, + "learning_rate": 4.895465578959859e-05, + "loss": 0.2052, + "step": 1000 + }, + { + "epoch": 0.10261099258356192, + "grad_norm": 4.0, + "learning_rate": 4.893146468656159e-05, + "loss": 0.2499, + "step": 1010 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 1.65625, + "learning_rate": 4.890802476469452e-05, + "loss": 0.278, + "step": 1020 + }, + { + "epoch": 0.10464289342680078, + "grad_norm": 3.625, + "learning_rate": 4.888433626770558e-05, + "loss": 0.2143, + "step": 1030 + }, + { + "epoch": 0.1056588438484202, + "grad_norm": 5.0625, + "learning_rate": 4.886039944188741e-05, + "loss": 0.2878, + "step": 1040 + }, + { + "epoch": 0.10667479427003962, + "grad_norm": 4.5, + "learning_rate": 4.883621453611461e-05, + "loss": 0.2744, + "step": 1050 + }, + { + "epoch": 0.10769074469165904, + "grad_norm": 4.5625, + "learning_rate": 4.881178180184106e-05, + "loss": 0.2734, + "step": 1060 + }, + { + "epoch": 0.10870669511327848, + "grad_norm": 3.125, + "learning_rate": 4.878710149309735e-05, + "loss": 0.3574, + "step": 1070 + }, + { + "epoch": 0.1097226455348979, + "grad_norm": 3.0625, + "learning_rate": 4.876217386648816e-05, + "loss": 0.2625, + "step": 1080 + }, + { + "epoch": 0.11073859595651732, + "grad_norm": 4.0625, + "learning_rate": 4.873699918118955e-05, + "loss": 0.2437, + "step": 1090 + }, + { + "epoch": 0.11175454637813674, + "grad_norm": 1.59375, + "learning_rate": 4.87115776989463e-05, + "loss": 0.2051, + "step": 1100 + }, + { + "epoch": 0.11277049679975618, + "grad_norm": 4.375, + "learning_rate": 4.8685909684069153e-05, + "loss": 0.1727, + "step": 1110 + }, + { + "epoch": 0.1137864472213756, + "grad_norm": 2.28125, + "learning_rate": 4.865999540343211e-05, + "loss": 0.2256, + "step": 1120 + }, + { + "epoch": 0.11480239764299502, + "grad_norm": 2.265625, + "learning_rate": 4.86338351264696e-05, + "loss": 0.3529, + "step": 1130 + }, + { + "epoch": 0.11581834806461444, + "grad_norm": 2.34375, + "learning_rate": 4.8607429125173754e-05, + "loss": 0.2113, + "step": 1140 + }, + { + "epoch": 0.11683429848623388, + "grad_norm": 0.7578125, + "learning_rate": 4.858077767409149e-05, + "loss": 0.2759, + "step": 1150 + }, + { + "epoch": 0.1178502489078533, + "grad_norm": 3.640625, + "learning_rate": 4.855388105032174e-05, + "loss": 0.2482, + "step": 1160 + }, + { + "epoch": 0.11886619932947272, + "grad_norm": 3.5, + "learning_rate": 4.852673953351249e-05, + "loss": 0.1865, + "step": 1170 + }, + { + "epoch": 0.11988214975109214, + "grad_norm": 3.75, + "learning_rate": 4.849935340585796e-05, + "loss": 0.2659, + "step": 1180 + }, + { + "epoch": 0.12089810017271158, + "grad_norm": 3.375, + "learning_rate": 4.8471722952095586e-05, + "loss": 0.1506, + "step": 1190 + }, + { + "epoch": 0.121914050594331, + "grad_norm": 3.34375, + "learning_rate": 4.844384845950312e-05, + "loss": 0.307, + "step": 1200 + }, + { + "epoch": 0.12293000101595042, + "grad_norm": 1.578125, + "learning_rate": 4.841573021789561e-05, + "loss": 0.1952, + "step": 1210 + }, + { + "epoch": 0.12394595143756984, + "grad_norm": 1.2890625, + "learning_rate": 4.838736851962239e-05, + "loss": 0.1779, + "step": 1220 + }, + { + "epoch": 0.12496190185918928, + "grad_norm": 1.265625, + "learning_rate": 4.835876365956408e-05, + "loss": 0.1235, + "step": 1230 + }, + { + "epoch": 0.12597785228080868, + "grad_norm": 1.9609375, + "learning_rate": 4.8329915935129436e-05, + "loss": 0.1876, + "step": 1240 + }, + { + "epoch": 0.12699380270242813, + "grad_norm": 1.6328125, + "learning_rate": 4.830082564625235e-05, + "loss": 0.2188, + "step": 1250 + }, + { + "epoch": 0.12800975312404755, + "grad_norm": 3.96875, + "learning_rate": 4.8271493095388684e-05, + "loss": 0.2622, + "step": 1260 + }, + { + "epoch": 0.12902570354566698, + "grad_norm": 3.765625, + "learning_rate": 4.824191858751312e-05, + "loss": 0.2724, + "step": 1270 + }, + { + "epoch": 0.1300416539672864, + "grad_norm": 5.59375, + "learning_rate": 4.821210243011601e-05, + "loss": 0.2413, + "step": 1280 + }, + { + "epoch": 0.13105760438890582, + "grad_norm": 3.34375, + "learning_rate": 4.818204493320016e-05, + "loss": 0.2618, + "step": 1290 + }, + { + "epoch": 0.13207355481052524, + "grad_norm": 2.78125, + "learning_rate": 4.8151746409277634e-05, + "loss": 0.2295, + "step": 1300 + }, + { + "epoch": 0.13308950523214466, + "grad_norm": 3.1875, + "learning_rate": 4.8121207173366484e-05, + "loss": 0.2733, + "step": 1310 + }, + { + "epoch": 0.13410545565376408, + "grad_norm": 2.28125, + "learning_rate": 4.809042754298746e-05, + "loss": 0.2311, + "step": 1320 + }, + { + "epoch": 0.13512140607538353, + "grad_norm": 2.171875, + "learning_rate": 4.805940783816075e-05, + "loss": 0.2059, + "step": 1330 + }, + { + "epoch": 0.13613735649700295, + "grad_norm": 2.796875, + "learning_rate": 4.8028148381402625e-05, + "loss": 0.2102, + "step": 1340 + }, + { + "epoch": 0.13715330691862238, + "grad_norm": 2.96875, + "learning_rate": 4.7996649497722084e-05, + "loss": 0.2708, + "step": 1350 + }, + { + "epoch": 0.1381692573402418, + "grad_norm": 2.4375, + "learning_rate": 4.7964911514617485e-05, + "loss": 0.2429, + "step": 1360 + }, + { + "epoch": 0.13918520776186122, + "grad_norm": 5.8125, + "learning_rate": 4.793293476207312e-05, + "loss": 0.2725, + "step": 1370 + }, + { + "epoch": 0.14020115818348064, + "grad_norm": 2.40625, + "learning_rate": 4.790071957255585e-05, + "loss": 0.2098, + "step": 1380 + }, + { + "epoch": 0.14121710860510006, + "grad_norm": 4.25, + "learning_rate": 4.786826628101154e-05, + "loss": 0.2101, + "step": 1390 + }, + { + "epoch": 0.14223305902671948, + "grad_norm": 2.578125, + "learning_rate": 4.783557522486167e-05, + "loss": 0.2624, + "step": 1400 + }, + { + "epoch": 0.14324900944833893, + "grad_norm": 3.125, + "learning_rate": 4.780264674399978e-05, + "loss": 0.2518, + "step": 1410 + }, + { + "epoch": 0.14426495986995835, + "grad_norm": 3.671875, + "learning_rate": 4.7769481180787966e-05, + "loss": 0.3112, + "step": 1420 + }, + { + "epoch": 0.14528091029157778, + "grad_norm": 3.984375, + "learning_rate": 4.773607888005327e-05, + "loss": 0.2747, + "step": 1430 + }, + { + "epoch": 0.1462968607131972, + "grad_norm": 3.234375, + "learning_rate": 4.770244018908416e-05, + "loss": 0.1572, + "step": 1440 + }, + { + "epoch": 0.14731281113481662, + "grad_norm": 4.09375, + "learning_rate": 4.766856545762687e-05, + "loss": 0.2148, + "step": 1450 + }, + { + "epoch": 0.14832876155643604, + "grad_norm": 1.6875, + "learning_rate": 4.763445503788178e-05, + "loss": 0.2531, + "step": 1460 + }, + { + "epoch": 0.14934471197805546, + "grad_norm": 2.375, + "learning_rate": 4.760010928449976e-05, + "loss": 0.199, + "step": 1470 + }, + { + "epoch": 0.15036066239967488, + "grad_norm": 4.6875, + "learning_rate": 4.7565528554578485e-05, + "loss": 0.2366, + "step": 1480 + }, + { + "epoch": 0.15137661282129433, + "grad_norm": 5.4375, + "learning_rate": 4.75307132076587e-05, + "loss": 0.1862, + "step": 1490 + }, + { + "epoch": 0.15239256324291375, + "grad_norm": 2.484375, + "learning_rate": 4.749566360572049e-05, + "loss": 0.2143, + "step": 1500 + }, + { + "epoch": 0.15340851366453317, + "grad_norm": 2.1875, + "learning_rate": 4.746038011317955e-05, + "loss": 0.1877, + "step": 1510 + }, + { + "epoch": 0.1544244640861526, + "grad_norm": 2.84375, + "learning_rate": 4.742486309688333e-05, + "loss": 0.2831, + "step": 1520 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 2.015625, + "learning_rate": 4.738911292610732e-05, + "loss": 0.1708, + "step": 1530 + }, + { + "epoch": 0.15645636492939144, + "grad_norm": 3.953125, + "learning_rate": 4.735312997255107e-05, + "loss": 0.192, + "step": 1540 + }, + { + "epoch": 0.15747231535101086, + "grad_norm": 2.09375, + "learning_rate": 4.7316914610334475e-05, + "loss": 0.2586, + "step": 1550 + }, + { + "epoch": 0.15848826577263028, + "grad_norm": 3.6875, + "learning_rate": 4.728046721599378e-05, + "loss": 0.2141, + "step": 1560 + }, + { + "epoch": 0.15950421619424973, + "grad_norm": 2.9375, + "learning_rate": 4.724378816847771e-05, + "loss": 0.193, + "step": 1570 + }, + { + "epoch": 0.16052016661586915, + "grad_norm": 1.5625, + "learning_rate": 4.720687784914352e-05, + "loss": 0.191, + "step": 1580 + }, + { + "epoch": 0.16153611703748857, + "grad_norm": 3.75, + "learning_rate": 4.716973664175304e-05, + "loss": 0.2172, + "step": 1590 + }, + { + "epoch": 0.162552067459108, + "grad_norm": 3.125, + "learning_rate": 4.7132364932468645e-05, + "loss": 0.2134, + "step": 1600 + }, + { + "epoch": 0.16356801788072742, + "grad_norm": 4.09375, + "learning_rate": 4.709476310984932e-05, + "loss": 0.2055, + "step": 1610 + }, + { + "epoch": 0.16458396830234684, + "grad_norm": 3.875, + "learning_rate": 4.705693156484652e-05, + "loss": 0.2136, + "step": 1620 + }, + { + "epoch": 0.16559991872396626, + "grad_norm": 1.1796875, + "learning_rate": 4.7018870690800196e-05, + "loss": 0.1471, + "step": 1630 + }, + { + "epoch": 0.16661586914558568, + "grad_norm": 2.5, + "learning_rate": 4.698058088343465e-05, + "loss": 0.2308, + "step": 1640 + }, + { + "epoch": 0.16763181956720513, + "grad_norm": 1.390625, + "learning_rate": 4.6942062540854425e-05, + "loss": 0.2456, + "step": 1650 + }, + { + "epoch": 0.16864776998882455, + "grad_norm": 3.125, + "learning_rate": 4.69033160635402e-05, + "loss": 0.2654, + "step": 1660 + }, + { + "epoch": 0.16966372041044397, + "grad_norm": 3.984375, + "learning_rate": 4.6864341854344587e-05, + "loss": 0.2226, + "step": 1670 + }, + { + "epoch": 0.1706796708320634, + "grad_norm": 2.328125, + "learning_rate": 4.682514031848795e-05, + "loss": 0.2438, + "step": 1680 + }, + { + "epoch": 0.17169562125368282, + "grad_norm": 3.078125, + "learning_rate": 4.678571186355423e-05, + "loss": 0.1889, + "step": 1690 + }, + { + "epoch": 0.17271157167530224, + "grad_norm": 3.328125, + "learning_rate": 4.6746056899486644e-05, + "loss": 0.2117, + "step": 1700 + }, + { + "epoch": 0.17372752209692166, + "grad_norm": 2.78125, + "learning_rate": 4.67061758385835e-05, + "loss": 0.1953, + "step": 1710 + }, + { + "epoch": 0.17474347251854108, + "grad_norm": 3.09375, + "learning_rate": 4.6666069095493816e-05, + "loss": 0.1844, + "step": 1720 + }, + { + "epoch": 0.17575942294016053, + "grad_norm": 3.234375, + "learning_rate": 4.662573708721309e-05, + "loss": 0.2774, + "step": 1730 + }, + { + "epoch": 0.17677537336177995, + "grad_norm": 4.03125, + "learning_rate": 4.658518023307894e-05, + "loss": 0.2527, + "step": 1740 + }, + { + "epoch": 0.17779132378339937, + "grad_norm": 3.21875, + "learning_rate": 4.654439895476671e-05, + "loss": 0.2164, + "step": 1750 + }, + { + "epoch": 0.1788072742050188, + "grad_norm": 2.390625, + "learning_rate": 4.6503393676285146e-05, + "loss": 0.2424, + "step": 1760 + }, + { + "epoch": 0.17982322462663822, + "grad_norm": 1.8359375, + "learning_rate": 4.646216482397192e-05, + "loss": 0.2428, + "step": 1770 + }, + { + "epoch": 0.18083917504825764, + "grad_norm": 2.796875, + "learning_rate": 4.6420712826489275e-05, + "loss": 0.2155, + "step": 1780 + }, + { + "epoch": 0.18185512546987706, + "grad_norm": 0.69921875, + "learning_rate": 4.6379038114819485e-05, + "loss": 0.1544, + "step": 1790 + }, + { + "epoch": 0.18287107589149648, + "grad_norm": 3.40625, + "learning_rate": 4.6337141122260444e-05, + "loss": 0.2029, + "step": 1800 + }, + { + "epoch": 0.18388702631311593, + "grad_norm": 2.359375, + "learning_rate": 4.629502228442112e-05, + "loss": 0.1489, + "step": 1810 + }, + { + "epoch": 0.18490297673473535, + "grad_norm": 1.4453125, + "learning_rate": 4.6252682039217045e-05, + "loss": 0.2101, + "step": 1820 + }, + { + "epoch": 0.18591892715635477, + "grad_norm": 2.71875, + "learning_rate": 4.621012082686573e-05, + "loss": 0.2076, + "step": 1830 + }, + { + "epoch": 0.1869348775779742, + "grad_norm": 3.0625, + "learning_rate": 4.616733908988216e-05, + "loss": 0.2719, + "step": 1840 + }, + { + "epoch": 0.18795082799959362, + "grad_norm": 1.953125, + "learning_rate": 4.612433727307409e-05, + "loss": 0.2105, + "step": 1850 + }, + { + "epoch": 0.18896677842121304, + "grad_norm": 3.46875, + "learning_rate": 4.608111582353751e-05, + "loss": 0.1877, + "step": 1860 + }, + { + "epoch": 0.18998272884283246, + "grad_norm": 2.546875, + "learning_rate": 4.603767519065197e-05, + "loss": 0.2238, + "step": 1870 + }, + { + "epoch": 0.19099867926445188, + "grad_norm": 1.5703125, + "learning_rate": 4.599401582607589e-05, + "loss": 0.243, + "step": 1880 + }, + { + "epoch": 0.19201462968607133, + "grad_norm": 2.4375, + "learning_rate": 4.595013818374185e-05, + "loss": 0.1867, + "step": 1890 + }, + { + "epoch": 0.19303058010769075, + "grad_norm": 2.203125, + "learning_rate": 4.5906042719851925e-05, + "loss": 0.1994, + "step": 1900 + }, + { + "epoch": 0.19404653052931017, + "grad_norm": 3.984375, + "learning_rate": 4.586172989287291e-05, + "loss": 0.1899, + "step": 1910 + }, + { + "epoch": 0.1950624809509296, + "grad_norm": 2.6875, + "learning_rate": 4.5817200163531534e-05, + "loss": 0.2528, + "step": 1920 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 2.71875, + "learning_rate": 4.577245399480972e-05, + "loss": 0.2336, + "step": 1930 + }, + { + "epoch": 0.19709438179416844, + "grad_norm": 2.640625, + "learning_rate": 4.5727491851939715e-05, + "loss": 0.2204, + "step": 1940 + }, + { + "epoch": 0.19811033221578786, + "grad_norm": 1.78125, + "learning_rate": 4.568231420239929e-05, + "loss": 0.1656, + "step": 1950 + }, + { + "epoch": 0.19912628263740728, + "grad_norm": 3.15625, + "learning_rate": 4.563692151590687e-05, + "loss": 0.2105, + "step": 1960 + }, + { + "epoch": 0.20014223305902673, + "grad_norm": 1.3671875, + "learning_rate": 4.5591314264416666e-05, + "loss": 0.1464, + "step": 1970 + }, + { + "epoch": 0.20115818348064615, + "grad_norm": 4.25, + "learning_rate": 4.554549292211371e-05, + "loss": 0.2103, + "step": 1980 + }, + { + "epoch": 0.20217413390226557, + "grad_norm": 2.984375, + "learning_rate": 4.549945796540901e-05, + "loss": 0.144, + "step": 1990 + }, + { + "epoch": 0.203190084323885, + "grad_norm": 1.859375, + "learning_rate": 4.545320987293453e-05, + "loss": 0.1963, + "step": 2000 + }, + { + "epoch": 0.20420603474550442, + "grad_norm": 1.078125, + "learning_rate": 4.540674912553824e-05, + "loss": 0.2115, + "step": 2010 + }, + { + "epoch": 0.20522198516712384, + "grad_norm": 4.25, + "learning_rate": 4.536007620627911e-05, + "loss": 0.1682, + "step": 2020 + }, + { + "epoch": 0.20623793558874326, + "grad_norm": 2.71875, + "learning_rate": 4.531319160042212e-05, + "loss": 0.1992, + "step": 2030 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 1.2890625, + "learning_rate": 4.5266095795433126e-05, + "loss": 0.1134, + "step": 2040 + }, + { + "epoch": 0.20826983643198213, + "grad_norm": 3.296875, + "learning_rate": 4.5218789280973925e-05, + "loss": 0.1474, + "step": 2050 + }, + { + "epoch": 0.20928578685360155, + "grad_norm": 1.9375, + "learning_rate": 4.5171272548897024e-05, + "loss": 0.1955, + "step": 2060 + }, + { + "epoch": 0.21030173727522097, + "grad_norm": 2.734375, + "learning_rate": 4.512354609324063e-05, + "loss": 0.2042, + "step": 2070 + }, + { + "epoch": 0.2113176876968404, + "grad_norm": 2.921875, + "learning_rate": 4.507561041022347e-05, + "loss": 0.2174, + "step": 2080 + }, + { + "epoch": 0.21233363811845982, + "grad_norm": 2.40625, + "learning_rate": 4.502746599823963e-05, + "loss": 0.2634, + "step": 2090 + }, + { + "epoch": 0.21334958854007924, + "grad_norm": 1.71875, + "learning_rate": 4.497911335785339e-05, + "loss": 0.1884, + "step": 2100 + }, + { + "epoch": 0.21436553896169866, + "grad_norm": 0.79296875, + "learning_rate": 4.4930552991794e-05, + "loss": 0.1872, + "step": 2110 + }, + { + "epoch": 0.21538148938331808, + "grad_norm": 3.171875, + "learning_rate": 4.4881785404950474e-05, + "loss": 0.2233, + "step": 2120 + }, + { + "epoch": 0.21639743980493753, + "grad_norm": 2.59375, + "learning_rate": 4.483281110436631e-05, + "loss": 0.2374, + "step": 2130 + }, + { + "epoch": 0.21741339022655695, + "grad_norm": 3.328125, + "learning_rate": 4.478363059923426e-05, + "loss": 0.2545, + "step": 2140 + }, + { + "epoch": 0.21842934064817637, + "grad_norm": 2.3125, + "learning_rate": 4.4734244400891014e-05, + "loss": 0.2063, + "step": 2150 + }, + { + "epoch": 0.2194452910697958, + "grad_norm": 3.40625, + "learning_rate": 4.4684653022811865e-05, + "loss": 0.1219, + "step": 2160 + }, + { + "epoch": 0.22046124149141522, + "grad_norm": 4.1875, + "learning_rate": 4.463485698060541e-05, + "loss": 0.2805, + "step": 2170 + }, + { + "epoch": 0.22147719191303464, + "grad_norm": 2.3125, + "learning_rate": 4.458485679200814e-05, + "loss": 0.1998, + "step": 2180 + }, + { + "epoch": 0.22249314233465406, + "grad_norm": 3.578125, + "learning_rate": 4.453465297687912e-05, + "loss": 0.2489, + "step": 2190 + }, + { + "epoch": 0.22350909275627348, + "grad_norm": 2.59375, + "learning_rate": 4.448424605719452e-05, + "loss": 0.2731, + "step": 2200 + }, + { + "epoch": 0.22452504317789293, + "grad_norm": 3.28125, + "learning_rate": 4.443363655704224e-05, + "loss": 0.2425, + "step": 2210 + }, + { + "epoch": 0.22554099359951235, + "grad_norm": 2.78125, + "learning_rate": 4.438282500261641e-05, + "loss": 0.2938, + "step": 2220 + }, + { + "epoch": 0.22655694402113177, + "grad_norm": 1.1953125, + "learning_rate": 4.433181192221197e-05, + "loss": 0.1728, + "step": 2230 + }, + { + "epoch": 0.2275728944427512, + "grad_norm": 1.34375, + "learning_rate": 4.4280597846219155e-05, + "loss": 0.216, + "step": 2240 + }, + { + "epoch": 0.22858884486437062, + "grad_norm": 1.8515625, + "learning_rate": 4.422918330711796e-05, + "loss": 0.1612, + "step": 2250 + }, + { + "epoch": 0.22960479528599004, + "grad_norm": 1.90625, + "learning_rate": 4.417756883947263e-05, + "loss": 0.107, + "step": 2260 + }, + { + "epoch": 0.23062074570760946, + "grad_norm": 3.375, + "learning_rate": 4.412575497992611e-05, + "loss": 0.1756, + "step": 2270 + }, + { + "epoch": 0.23163669612922888, + "grad_norm": 4.375, + "learning_rate": 4.407374226719445e-05, + "loss": 0.234, + "step": 2280 + }, + { + "epoch": 0.23265264655084833, + "grad_norm": 3.25, + "learning_rate": 4.402153124206119e-05, + "loss": 0.2144, + "step": 2290 + }, + { + "epoch": 0.23366859697246775, + "grad_norm": 1.703125, + "learning_rate": 4.396912244737173e-05, + "loss": 0.1696, + "step": 2300 + }, + { + "epoch": 0.23468454739408717, + "grad_norm": 2.84375, + "learning_rate": 4.391651642802778e-05, + "loss": 0.2506, + "step": 2310 + }, + { + "epoch": 0.2357004978157066, + "grad_norm": 4.5, + "learning_rate": 4.386371373098155e-05, + "loss": 0.1686, + "step": 2320 + }, + { + "epoch": 0.23671644823732602, + "grad_norm": 2.515625, + "learning_rate": 4.381071490523018e-05, + "loss": 0.2403, + "step": 2330 + }, + { + "epoch": 0.23773239865894544, + "grad_norm": 4.4375, + "learning_rate": 4.3757520501809955e-05, + "loss": 0.1611, + "step": 2340 + }, + { + "epoch": 0.23874834908056486, + "grad_norm": 1.609375, + "learning_rate": 4.370413107379065e-05, + "loss": 0.1698, + "step": 2350 + }, + { + "epoch": 0.23976429950218428, + "grad_norm": 4.96875, + "learning_rate": 4.36505471762697e-05, + "loss": 0.1928, + "step": 2360 + }, + { + "epoch": 0.24078024992380373, + "grad_norm": 0.8984375, + "learning_rate": 4.3596769366366474e-05, + "loss": 0.2035, + "step": 2370 + }, + { + "epoch": 0.24179620034542315, + "grad_norm": 5.75, + "learning_rate": 4.354279820321649e-05, + "loss": 0.16, + "step": 2380 + }, + { + "epoch": 0.24281215076704257, + "grad_norm": 1.9453125, + "learning_rate": 4.34886342479656e-05, + "loss": 0.1851, + "step": 2390 + }, + { + "epoch": 0.243828101188662, + "grad_norm": 1.015625, + "learning_rate": 4.34342780637641e-05, + "loss": 0.1726, + "step": 2400 + }, + { + "epoch": 0.24484405161028142, + "grad_norm": 4.59375, + "learning_rate": 4.337973021576095e-05, + "loss": 0.2847, + "step": 2410 + }, + { + "epoch": 0.24586000203190084, + "grad_norm": 1.03125, + "learning_rate": 4.3324991271097846e-05, + "loss": 0.2528, + "step": 2420 + }, + { + "epoch": 0.24687595245352026, + "grad_norm": 2.1875, + "learning_rate": 4.3270061798903374e-05, + "loss": 0.1573, + "step": 2430 + }, + { + "epoch": 0.24789190287513968, + "grad_norm": 0.98046875, + "learning_rate": 4.321494237028701e-05, + "loss": 0.1703, + "step": 2440 + }, + { + "epoch": 0.24890785329675913, + "grad_norm": 3.8125, + "learning_rate": 4.31596335583333e-05, + "loss": 0.2613, + "step": 2450 + }, + { + "epoch": 0.24992380371837855, + "grad_norm": 4.0625, + "learning_rate": 4.310413593809579e-05, + "loss": 0.22, + "step": 2460 + }, + { + "epoch": 0.250939754139998, + "grad_norm": 3.15625, + "learning_rate": 4.304845008659108e-05, + "loss": 0.1263, + "step": 2470 + }, + { + "epoch": 0.25195570456161737, + "grad_norm": 3.046875, + "learning_rate": 4.2992576582792895e-05, + "loss": 0.1639, + "step": 2480 + }, + { + "epoch": 0.2529716549832368, + "grad_norm": 9.8125, + "learning_rate": 4.293651600762595e-05, + "loss": 0.2681, + "step": 2490 + }, + { + "epoch": 0.25398760540485626, + "grad_norm": 3.734375, + "learning_rate": 4.288026894395999e-05, + "loss": 0.2292, + "step": 2500 + }, + { + "epoch": 0.25500355582647566, + "grad_norm": 0.455078125, + "learning_rate": 4.2823835976603723e-05, + "loss": 0.2324, + "step": 2510 + }, + { + "epoch": 0.2560195062480951, + "grad_norm": 5.625, + "learning_rate": 4.276721769229869e-05, + "loss": 0.1834, + "step": 2520 + }, + { + "epoch": 0.2570354566697145, + "grad_norm": 1.3671875, + "learning_rate": 4.271041467971323e-05, + "loss": 0.1826, + "step": 2530 + }, + { + "epoch": 0.25805140709133395, + "grad_norm": 5.0625, + "learning_rate": 4.265342752943632e-05, + "loss": 0.2463, + "step": 2540 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 2.859375, + "learning_rate": 4.2596256833971425e-05, + "loss": 0.2598, + "step": 2550 + }, + { + "epoch": 0.2600833079345728, + "grad_norm": 1.8515625, + "learning_rate": 4.2538903187730374e-05, + "loss": 0.1148, + "step": 2560 + }, + { + "epoch": 0.26109925835619224, + "grad_norm": 2.71875, + "learning_rate": 4.248136718702716e-05, + "loss": 0.2123, + "step": 2570 + }, + { + "epoch": 0.26211520877781164, + "grad_norm": 4.5625, + "learning_rate": 4.242364943007172e-05, + "loss": 0.2369, + "step": 2580 + }, + { + "epoch": 0.2631311591994311, + "grad_norm": 2.296875, + "learning_rate": 4.236575051696377e-05, + "loss": 0.261, + "step": 2590 + }, + { + "epoch": 0.2641471096210505, + "grad_norm": 2.75, + "learning_rate": 4.2307671049686514e-05, + "loss": 0.1564, + "step": 2600 + }, + { + "epoch": 0.26516306004266993, + "grad_norm": 3.5, + "learning_rate": 4.2249411632100396e-05, + "loss": 0.1563, + "step": 2610 + }, + { + "epoch": 0.2661790104642893, + "grad_norm": 2.84375, + "learning_rate": 4.219097286993684e-05, + "loss": 0.1697, + "step": 2620 + }, + { + "epoch": 0.26719496088590877, + "grad_norm": 2.125, + "learning_rate": 4.2132355370791946e-05, + "loss": 0.1844, + "step": 2630 + }, + { + "epoch": 0.26821091130752817, + "grad_norm": 4.03125, + "learning_rate": 4.2073559744120156e-05, + "loss": 0.2144, + "step": 2640 + }, + { + "epoch": 0.2692268617291476, + "grad_norm": 2.375, + "learning_rate": 4.201458660122793e-05, + "loss": 0.2013, + "step": 2650 + }, + { + "epoch": 0.27024281215076706, + "grad_norm": 3.625, + "learning_rate": 4.1955436555267393e-05, + "loss": 0.2166, + "step": 2660 + }, + { + "epoch": 0.27125876257238646, + "grad_norm": 0.328125, + "learning_rate": 4.189611022122997e-05, + "loss": 0.1934, + "step": 2670 + }, + { + "epoch": 0.2722747129940059, + "grad_norm": 2.75, + "learning_rate": 4.1836608215939944e-05, + "loss": 0.2157, + "step": 2680 + }, + { + "epoch": 0.2732906634156253, + "grad_norm": 3.5, + "learning_rate": 4.17769311580481e-05, + "loss": 0.18, + "step": 2690 + }, + { + "epoch": 0.27430661383724475, + "grad_norm": 2.109375, + "learning_rate": 4.171707966802528e-05, + "loss": 0.2178, + "step": 2700 + }, + { + "epoch": 0.27532256425886414, + "grad_norm": 4.65625, + "learning_rate": 4.16570543681559e-05, + "loss": 0.1896, + "step": 2710 + }, + { + "epoch": 0.2763385146804836, + "grad_norm": 4.8125, + "learning_rate": 4.159685588253151e-05, + "loss": 0.1322, + "step": 2720 + }, + { + "epoch": 0.27735446510210304, + "grad_norm": 3.9375, + "learning_rate": 4.153648483704429e-05, + "loss": 0.184, + "step": 2730 + }, + { + "epoch": 0.27837041552372244, + "grad_norm": 4.53125, + "learning_rate": 4.147594185938057e-05, + "loss": 0.2451, + "step": 2740 + }, + { + "epoch": 0.2793863659453419, + "grad_norm": 1.0390625, + "learning_rate": 4.141522757901426e-05, + "loss": 0.2367, + "step": 2750 + }, + { + "epoch": 0.2804023163669613, + "grad_norm": 3.375, + "learning_rate": 4.1354342627200345e-05, + "loss": 0.179, + "step": 2760 + }, + { + "epoch": 0.28141826678858073, + "grad_norm": 2.953125, + "learning_rate": 4.1293287636968286e-05, + "loss": 0.1396, + "step": 2770 + }, + { + "epoch": 0.2824342172102001, + "grad_norm": 2.546875, + "learning_rate": 4.1232063243115485e-05, + "loss": 0.1963, + "step": 2780 + }, + { + "epoch": 0.28345016763181957, + "grad_norm": 5.09375, + "learning_rate": 4.117067008220063e-05, + "loss": 0.2457, + "step": 2790 + }, + { + "epoch": 0.28446611805343897, + "grad_norm": 2.046875, + "learning_rate": 4.110910879253712e-05, + "loss": 0.2262, + "step": 2800 + }, + { + "epoch": 0.2854820684750584, + "grad_norm": 2.1875, + "learning_rate": 4.104738001418641e-05, + "loss": 0.2499, + "step": 2810 + }, + { + "epoch": 0.28649801889667786, + "grad_norm": 2.59375, + "learning_rate": 4.098548438895135e-05, + "loss": 0.1667, + "step": 2820 + }, + { + "epoch": 0.28751396931829726, + "grad_norm": 2.875, + "learning_rate": 4.092342256036954e-05, + "loss": 0.2288, + "step": 2830 + }, + { + "epoch": 0.2885299197399167, + "grad_norm": 3.015625, + "learning_rate": 4.086119517370659e-05, + "loss": 0.2038, + "step": 2840 + }, + { + "epoch": 0.2895458701615361, + "grad_norm": 3.53125, + "learning_rate": 4.0798802875949485e-05, + "loss": 0.181, + "step": 2850 + }, + { + "epoch": 0.29056182058315555, + "grad_norm": 2.296875, + "learning_rate": 4.073624631579975e-05, + "loss": 0.1886, + "step": 2860 + }, + { + "epoch": 0.29157777100477494, + "grad_norm": 3.609375, + "learning_rate": 4.067352614366685e-05, + "loss": 0.2053, + "step": 2870 + }, + { + "epoch": 0.2925937214263944, + "grad_norm": 2.328125, + "learning_rate": 4.061064301166128e-05, + "loss": 0.1409, + "step": 2880 + }, + { + "epoch": 0.29360967184801384, + "grad_norm": 4.9375, + "learning_rate": 4.054759757358787e-05, + "loss": 0.184, + "step": 2890 + }, + { + "epoch": 0.29462562226963324, + "grad_norm": 4.6875, + "learning_rate": 4.048439048493898e-05, + "loss": 0.2306, + "step": 2900 + }, + { + "epoch": 0.2956415726912527, + "grad_norm": 4.09375, + "learning_rate": 4.0421022402887676e-05, + "loss": 0.1914, + "step": 2910 + }, + { + "epoch": 0.2966575231128721, + "grad_norm": 2.3125, + "learning_rate": 4.035749398628088e-05, + "loss": 0.1653, + "step": 2920 + }, + { + "epoch": 0.29767347353449153, + "grad_norm": 2.515625, + "learning_rate": 4.029380589563256e-05, + "loss": 0.1941, + "step": 2930 + }, + { + "epoch": 0.2986894239561109, + "grad_norm": 1.78125, + "learning_rate": 4.02299587931168e-05, + "loss": 0.1117, + "step": 2940 + }, + { + "epoch": 0.29970537437773037, + "grad_norm": 0.8359375, + "learning_rate": 4.0165953342560974e-05, + "loss": 0.1605, + "step": 2950 + }, + { + "epoch": 0.30072132479934977, + "grad_norm": 3.046875, + "learning_rate": 4.010179020943884e-05, + "loss": 0.1726, + "step": 2960 + }, + { + "epoch": 0.3017372752209692, + "grad_norm": 3.453125, + "learning_rate": 4.003747006086357e-05, + "loss": 0.2208, + "step": 2970 + }, + { + "epoch": 0.30275322564258866, + "grad_norm": 2.515625, + "learning_rate": 3.9972993565580866e-05, + "loss": 0.1325, + "step": 2980 + }, + { + "epoch": 0.30376917606420806, + "grad_norm": 3.046875, + "learning_rate": 3.9908361393962e-05, + "loss": 0.2014, + "step": 2990 + }, + { + "epoch": 0.3047851264858275, + "grad_norm": 2.28125, + "learning_rate": 3.984357421799681e-05, + "loss": 0.165, + "step": 3000 + }, + { + "epoch": 0.3058010769074469, + "grad_norm": 5.09375, + "learning_rate": 3.9778632711286756e-05, + "loss": 0.212, + "step": 3010 + }, + { + "epoch": 0.30681702732906635, + "grad_norm": 4.25, + "learning_rate": 3.971353754903788e-05, + "loss": 0.2388, + "step": 3020 + }, + { + "epoch": 0.30783297775068574, + "grad_norm": 2.34375, + "learning_rate": 3.964828940805381e-05, + "loss": 0.2175, + "step": 3030 + }, + { + "epoch": 0.3088489281723052, + "grad_norm": 4.09375, + "learning_rate": 3.95828889667287e-05, + "loss": 0.2088, + "step": 3040 + }, + { + "epoch": 0.30986487859392464, + "grad_norm": 2.359375, + "learning_rate": 3.9517336905040244e-05, + "loss": 0.1913, + "step": 3050 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 1.1640625, + "learning_rate": 3.9451633904542483e-05, + "loss": 0.2185, + "step": 3060 + }, + { + "epoch": 0.3118967794371635, + "grad_norm": 2.59375, + "learning_rate": 3.9385780648358846e-05, + "loss": 0.2072, + "step": 3070 + }, + { + "epoch": 0.3129127298587829, + "grad_norm": 3.015625, + "learning_rate": 3.9319777821174955e-05, + "loss": 0.1902, + "step": 3080 + }, + { + "epoch": 0.31392868028040233, + "grad_norm": 2.375, + "learning_rate": 3.925362610923158e-05, + "loss": 0.259, + "step": 3090 + }, + { + "epoch": 0.3149446307020217, + "grad_norm": 4.65625, + "learning_rate": 3.918732620031742e-05, + "loss": 0.2026, + "step": 3100 + }, + { + "epoch": 0.31596058112364117, + "grad_norm": 2.1875, + "learning_rate": 3.912087878376205e-05, + "loss": 0.1478, + "step": 3110 + }, + { + "epoch": 0.31697653154526056, + "grad_norm": 2.34375, + "learning_rate": 3.905428455042865e-05, + "loss": 0.167, + "step": 3120 + }, + { + "epoch": 0.31799248196688, + "grad_norm": 2.390625, + "learning_rate": 3.898754419270693e-05, + "loss": 0.1629, + "step": 3130 + }, + { + "epoch": 0.31900843238849946, + "grad_norm": 1.546875, + "learning_rate": 3.892065840450583e-05, + "loss": 0.1308, + "step": 3140 + }, + { + "epoch": 0.32002438281011886, + "grad_norm": 4.625, + "learning_rate": 3.885362788124637e-05, + "loss": 0.2008, + "step": 3150 + }, + { + "epoch": 0.3210403332317383, + "grad_norm": 3.8125, + "learning_rate": 3.8786453319854396e-05, + "loss": 0.2225, + "step": 3160 + }, + { + "epoch": 0.3220562836533577, + "grad_norm": 3.015625, + "learning_rate": 3.8719135418753366e-05, + "loss": 0.2243, + "step": 3170 + }, + { + "epoch": 0.32307223407497715, + "grad_norm": 5.6875, + "learning_rate": 3.865167487785702e-05, + "loss": 0.1981, + "step": 3180 + }, + { + "epoch": 0.32408818449659654, + "grad_norm": 4.84375, + "learning_rate": 3.8584072398562164e-05, + "loss": 0.2031, + "step": 3190 + }, + { + "epoch": 0.325104134918216, + "grad_norm": 4.0625, + "learning_rate": 3.851632868374136e-05, + "loss": 0.1621, + "step": 3200 + }, + { + "epoch": 0.32612008533983544, + "grad_norm": 3.421875, + "learning_rate": 3.844844443773562e-05, + "loss": 0.1674, + "step": 3210 + }, + { + "epoch": 0.32713603576145484, + "grad_norm": 1.3671875, + "learning_rate": 3.8380420366347046e-05, + "loss": 0.1502, + "step": 3220 + }, + { + "epoch": 0.3281519861830743, + "grad_norm": 3.734375, + "learning_rate": 3.831225717683157e-05, + "loss": 0.1868, + "step": 3230 + }, + { + "epoch": 0.3291679366046937, + "grad_norm": 2.703125, + "learning_rate": 3.8243955577891534e-05, + "loss": 0.1818, + "step": 3240 + }, + { + "epoch": 0.3301838870263131, + "grad_norm": 3.796875, + "learning_rate": 3.8175516279668335e-05, + "loss": 0.2215, + "step": 3250 + }, + { + "epoch": 0.3311998374479325, + "grad_norm": 3.203125, + "learning_rate": 3.810693999373505e-05, + "loss": 0.2544, + "step": 3260 + }, + { + "epoch": 0.33221578786955197, + "grad_norm": 4.0, + "learning_rate": 3.8038227433089056e-05, + "loss": 0.1175, + "step": 3270 + }, + { + "epoch": 0.33323173829117136, + "grad_norm": 3.625, + "learning_rate": 3.796937931214458e-05, + "loss": 0.2213, + "step": 3280 + }, + { + "epoch": 0.3342476887127908, + "grad_norm": 1.7265625, + "learning_rate": 3.7900396346725296e-05, + "loss": 0.1711, + "step": 3290 + }, + { + "epoch": 0.33526363913441026, + "grad_norm": 3.140625, + "learning_rate": 3.783127925405686e-05, + "loss": 0.2628, + "step": 3300 + }, + { + "epoch": 0.33627958955602966, + "grad_norm": 2.1875, + "learning_rate": 3.77620287527595e-05, + "loss": 0.1671, + "step": 3310 + }, + { + "epoch": 0.3372955399776491, + "grad_norm": 5.28125, + "learning_rate": 3.769264556284048e-05, + "loss": 0.2109, + "step": 3320 + }, + { + "epoch": 0.3383114903992685, + "grad_norm": 2.875, + "learning_rate": 3.762313040568665e-05, + "loss": 0.1978, + "step": 3330 + }, + { + "epoch": 0.33932744082088795, + "grad_norm": 2.234375, + "learning_rate": 3.755348400405697e-05, + "loss": 0.1275, + "step": 3340 + }, + { + "epoch": 0.34034339124250734, + "grad_norm": 1.9453125, + "learning_rate": 3.7483707082074945e-05, + "loss": 0.1482, + "step": 3350 + }, + { + "epoch": 0.3413593416641268, + "grad_norm": 5.40625, + "learning_rate": 3.741380036522111e-05, + "loss": 0.1933, + "step": 3360 + }, + { + "epoch": 0.34237529208574624, + "grad_norm": 4.53125, + "learning_rate": 3.734376458032551e-05, + "loss": 0.1925, + "step": 3370 + }, + { + "epoch": 0.34339124250736563, + "grad_norm": 4.0625, + "learning_rate": 3.727360045556014e-05, + "loss": 0.2297, + "step": 3380 + }, + { + "epoch": 0.3444071929289851, + "grad_norm": 2.53125, + "learning_rate": 3.7203308720431336e-05, + "loss": 0.1704, + "step": 3390 + }, + { + "epoch": 0.3454231433506045, + "grad_norm": 1.859375, + "learning_rate": 3.7132890105772234e-05, + "loss": 0.258, + "step": 3400 + }, + { + "epoch": 0.3464390937722239, + "grad_norm": 3.90625, + "learning_rate": 3.706234534373515e-05, + "loss": 0.2376, + "step": 3410 + }, + { + "epoch": 0.3474550441938433, + "grad_norm": 1.1015625, + "learning_rate": 3.6991675167783985e-05, + "loss": 0.2403, + "step": 3420 + }, + { + "epoch": 0.34847099461546277, + "grad_norm": 1.1640625, + "learning_rate": 3.6920880312686556e-05, + "loss": 0.1642, + "step": 3430 + }, + { + "epoch": 0.34948694503708216, + "grad_norm": 2.875, + "learning_rate": 3.684996151450702e-05, + "loss": 0.1455, + "step": 3440 + }, + { + "epoch": 0.3505028954587016, + "grad_norm": 0.59765625, + "learning_rate": 3.6778919510598155e-05, + "loss": 0.2175, + "step": 3450 + }, + { + "epoch": 0.35151884588032106, + "grad_norm": 0.93359375, + "learning_rate": 3.670775503959376e-05, + "loss": 0.1858, + "step": 3460 + }, + { + "epoch": 0.35253479630194046, + "grad_norm": 4.1875, + "learning_rate": 3.6636468841400917e-05, + "loss": 0.1911, + "step": 3470 + }, + { + "epoch": 0.3535507467235599, + "grad_norm": 3.734375, + "learning_rate": 3.656506165719233e-05, + "loss": 0.2114, + "step": 3480 + }, + { + "epoch": 0.3545666971451793, + "grad_norm": 1.171875, + "learning_rate": 3.649353422939863e-05, + "loss": 0.1841, + "step": 3490 + }, + { + "epoch": 0.35558264756679875, + "grad_norm": 2.53125, + "learning_rate": 3.6421887301700615e-05, + "loss": 0.1505, + "step": 3500 + }, + { + "epoch": 0.35659859798841814, + "grad_norm": 4.9375, + "learning_rate": 3.6350121619021524e-05, + "loss": 0.2625, + "step": 3510 + }, + { + "epoch": 0.3576145484100376, + "grad_norm": 5.25, + "learning_rate": 3.627823792751936e-05, + "loss": 0.1676, + "step": 3520 + }, + { + "epoch": 0.35863049883165704, + "grad_norm": 1.09375, + "learning_rate": 3.620623697457905e-05, + "loss": 0.1963, + "step": 3530 + }, + { + "epoch": 0.35964644925327643, + "grad_norm": 4.03125, + "learning_rate": 3.613411950880468e-05, + "loss": 0.2048, + "step": 3540 + }, + { + "epoch": 0.3606623996748959, + "grad_norm": 4.40625, + "learning_rate": 3.606188628001178e-05, + "loss": 0.226, + "step": 3550 + }, + { + "epoch": 0.3616783500965153, + "grad_norm": 2.375, + "learning_rate": 3.598953803921947e-05, + "loss": 0.1884, + "step": 3560 + }, + { + "epoch": 0.3626943005181347, + "grad_norm": 3.21875, + "learning_rate": 3.591707553864266e-05, + "loss": 0.224, + "step": 3570 + }, + { + "epoch": 0.3637102509397541, + "grad_norm": 3.5625, + "learning_rate": 3.584449953168423e-05, + "loss": 0.1866, + "step": 3580 + }, + { + "epoch": 0.36472620136137357, + "grad_norm": 2.359375, + "learning_rate": 3.577181077292722e-05, + "loss": 0.1663, + "step": 3590 + }, + { + "epoch": 0.36574215178299296, + "grad_norm": 5.0, + "learning_rate": 3.569901001812696e-05, + "loss": 0.2032, + "step": 3600 + }, + { + "epoch": 0.3667581022046124, + "grad_norm": 1.953125, + "learning_rate": 3.562609802420321e-05, + "loss": 0.2395, + "step": 3610 + }, + { + "epoch": 0.36777405262623186, + "grad_norm": 3.796875, + "learning_rate": 3.555307554923229e-05, + "loss": 0.1799, + "step": 3620 + }, + { + "epoch": 0.36879000304785126, + "grad_norm": 4.4375, + "learning_rate": 3.547994335243925e-05, + "loss": 0.1771, + "step": 3630 + }, + { + "epoch": 0.3698059534694707, + "grad_norm": 1.890625, + "learning_rate": 3.540670219418989e-05, + "loss": 0.2123, + "step": 3640 + }, + { + "epoch": 0.3708219038910901, + "grad_norm": 4.03125, + "learning_rate": 3.53333528359829e-05, + "loss": 0.2159, + "step": 3650 + }, + { + "epoch": 0.37183785431270955, + "grad_norm": 3.265625, + "learning_rate": 3.525989604044198e-05, + "loss": 0.2749, + "step": 3660 + }, + { + "epoch": 0.37285380473432894, + "grad_norm": 1.4375, + "learning_rate": 3.5186332571307826e-05, + "loss": 0.1613, + "step": 3670 + }, + { + "epoch": 0.3738697551559484, + "grad_norm": 3.984375, + "learning_rate": 3.511266319343025e-05, + "loss": 0.1877, + "step": 3680 + }, + { + "epoch": 0.37488570557756784, + "grad_norm": 2.203125, + "learning_rate": 3.503888867276022e-05, + "loss": 0.2185, + "step": 3690 + }, + { + "epoch": 0.37590165599918723, + "grad_norm": 1.5078125, + "learning_rate": 3.4965009776341894e-05, + "loss": 0.2195, + "step": 3700 + }, + { + "epoch": 0.3769176064208067, + "grad_norm": 4.375, + "learning_rate": 3.489102727230461e-05, + "loss": 0.2344, + "step": 3710 + }, + { + "epoch": 0.3779335568424261, + "grad_norm": 2.984375, + "learning_rate": 3.481694192985496e-05, + "loss": 0.1863, + "step": 3720 + }, + { + "epoch": 0.3789495072640455, + "grad_norm": 1.1328125, + "learning_rate": 3.474275451926875e-05, + "loss": 0.1894, + "step": 3730 + }, + { + "epoch": 0.3799654576856649, + "grad_norm": 2.265625, + "learning_rate": 3.4668465811883e-05, + "loss": 0.2127, + "step": 3740 + }, + { + "epoch": 0.38098140810728437, + "grad_norm": 2.921875, + "learning_rate": 3.4594076580087914e-05, + "loss": 0.2125, + "step": 3750 + }, + { + "epoch": 0.38199735852890376, + "grad_norm": 2.390625, + "learning_rate": 3.451958759731889e-05, + "loss": 0.1801, + "step": 3760 + }, + { + "epoch": 0.3830133089505232, + "grad_norm": 3.046875, + "learning_rate": 3.4444999638048456e-05, + "loss": 0.1949, + "step": 3770 + }, + { + "epoch": 0.38402925937214266, + "grad_norm": 2.890625, + "learning_rate": 3.437031347777817e-05, + "loss": 0.2719, + "step": 3780 + }, + { + "epoch": 0.38504520979376206, + "grad_norm": 3.9375, + "learning_rate": 3.4295529893030634e-05, + "loss": 0.1697, + "step": 3790 + }, + { + "epoch": 0.3860611602153815, + "grad_norm": 2.0625, + "learning_rate": 3.422064966134138e-05, + "loss": 0.1557, + "step": 3800 + }, + { + "epoch": 0.3870771106370009, + "grad_norm": 2.234375, + "learning_rate": 3.4145673561250794e-05, + "loss": 0.2129, + "step": 3810 + }, + { + "epoch": 0.38809306105862035, + "grad_norm": 4.96875, + "learning_rate": 3.4070602372296e-05, + "loss": 0.2068, + "step": 3820 + }, + { + "epoch": 0.38910901148023974, + "grad_norm": 2.234375, + "learning_rate": 3.39954368750028e-05, + "loss": 0.1634, + "step": 3830 + }, + { + "epoch": 0.3901249619018592, + "grad_norm": 1.75, + "learning_rate": 3.392017785087752e-05, + "loss": 0.2299, + "step": 3840 + }, + { + "epoch": 0.39114091232347864, + "grad_norm": 3.90625, + "learning_rate": 3.38448260823989e-05, + "loss": 0.1585, + "step": 3850 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 2.8125, + "learning_rate": 3.376938235300996e-05, + "loss": 0.2382, + "step": 3860 + }, + { + "epoch": 0.3931728131667175, + "grad_norm": 5.375, + "learning_rate": 3.369384744710984e-05, + "loss": 0.1987, + "step": 3870 + }, + { + "epoch": 0.3941887635883369, + "grad_norm": 2.578125, + "learning_rate": 3.361822215004566e-05, + "loss": 0.2316, + "step": 3880 + }, + { + "epoch": 0.3952047140099563, + "grad_norm": 2.0, + "learning_rate": 3.354250724810436e-05, + "loss": 0.2019, + "step": 3890 + }, + { + "epoch": 0.3962206644315757, + "grad_norm": 2.3125, + "learning_rate": 3.34667035285045e-05, + "loss": 0.187, + "step": 3900 + }, + { + "epoch": 0.39723661485319517, + "grad_norm": 3.53125, + "learning_rate": 3.339081177938811e-05, + "loss": 0.2353, + "step": 3910 + }, + { + "epoch": 0.39825256527481456, + "grad_norm": 1.9609375, + "learning_rate": 3.331483278981244e-05, + "loss": 0.2078, + "step": 3920 + }, + { + "epoch": 0.399268515696434, + "grad_norm": 1.2109375, + "learning_rate": 3.323876734974183e-05, + "loss": 0.1761, + "step": 3930 + }, + { + "epoch": 0.40028446611805346, + "grad_norm": 4.0625, + "learning_rate": 3.316261625003943e-05, + "loss": 0.2081, + "step": 3940 + }, + { + "epoch": 0.40130041653967286, + "grad_norm": 1.953125, + "learning_rate": 3.308638028245902e-05, + "loss": 0.2087, + "step": 3950 + }, + { + "epoch": 0.4023163669612923, + "grad_norm": 2.390625, + "learning_rate": 3.301006023963676e-05, + "loss": 0.1579, + "step": 3960 + }, + { + "epoch": 0.4033323173829117, + "grad_norm": 3.53125, + "learning_rate": 3.293365691508295e-05, + "loss": 0.1904, + "step": 3970 + }, + { + "epoch": 0.40434826780453115, + "grad_norm": 3.0, + "learning_rate": 3.285717110317379e-05, + "loss": 0.1991, + "step": 3980 + }, + { + "epoch": 0.40536421822615054, + "grad_norm": 7.21875, + "learning_rate": 3.27806035991431e-05, + "loss": 0.1445, + "step": 3990 + }, + { + "epoch": 0.40638016864777, + "grad_norm": 1.0859375, + "learning_rate": 3.2703955199074075e-05, + "loss": 0.2393, + "step": 4000 + }, + { + "epoch": 0.40739611906938944, + "grad_norm": 4.5625, + "learning_rate": 3.262722669989098e-05, + "loss": 0.1789, + "step": 4010 + }, + { + "epoch": 0.40841206949100883, + "grad_norm": 3.09375, + "learning_rate": 3.255041889935092e-05, + "loss": 0.1511, + "step": 4020 + }, + { + "epoch": 0.4094280199126283, + "grad_norm": 1.90625, + "learning_rate": 3.247353259603547e-05, + "loss": 0.2066, + "step": 4030 + }, + { + "epoch": 0.4104439703342477, + "grad_norm": 2.28125, + "learning_rate": 3.239656858934242e-05, + "loss": 0.1564, + "step": 4040 + }, + { + "epoch": 0.4114599207558671, + "grad_norm": 2.609375, + "learning_rate": 3.231952767947746e-05, + "loss": 0.1503, + "step": 4050 + }, + { + "epoch": 0.4124758711774865, + "grad_norm": 1.4453125, + "learning_rate": 3.2242410667445844e-05, + "loss": 0.1633, + "step": 4060 + }, + { + "epoch": 0.41349182159910597, + "grad_norm": 3.015625, + "learning_rate": 3.2165218355044076e-05, + "loss": 0.1492, + "step": 4070 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 3.234375, + "learning_rate": 3.2087951544851566e-05, + "loss": 0.3051, + "step": 4080 + }, + { + "epoch": 0.4155237224423448, + "grad_norm": 2.9375, + "learning_rate": 3.20106110402223e-05, + "loss": 0.2229, + "step": 4090 + }, + { + "epoch": 0.41653967286396426, + "grad_norm": 3.171875, + "learning_rate": 3.1933197645276455e-05, + "loss": 0.2224, + "step": 4100 + }, + { + "epoch": 0.41755562328558365, + "grad_norm": 2.09375, + "learning_rate": 3.185571216489209e-05, + "loss": 0.1297, + "step": 4110 + }, + { + "epoch": 0.4185715737072031, + "grad_norm": 3.625, + "learning_rate": 3.177815540469669e-05, + "loss": 0.2074, + "step": 4120 + }, + { + "epoch": 0.4195875241288225, + "grad_norm": 2.296875, + "learning_rate": 3.1700528171058916e-05, + "loss": 0.1949, + "step": 4130 + }, + { + "epoch": 0.42060347455044195, + "grad_norm": 3.8125, + "learning_rate": 3.162283127108011e-05, + "loss": 0.1661, + "step": 4140 + }, + { + "epoch": 0.42161942497206134, + "grad_norm": 2.5, + "learning_rate": 3.154506551258594e-05, + "loss": 0.2275, + "step": 4150 + }, + { + "epoch": 0.4226353753936808, + "grad_norm": 2.96875, + "learning_rate": 3.146723170411804e-05, + "loss": 0.2242, + "step": 4160 + }, + { + "epoch": 0.42365132581530024, + "grad_norm": 6.625, + "learning_rate": 3.138933065492552e-05, + "loss": 0.1897, + "step": 4170 + }, + { + "epoch": 0.42466727623691963, + "grad_norm": 0.8515625, + "learning_rate": 3.131136317495665e-05, + "loss": 0.1629, + "step": 4180 + }, + { + "epoch": 0.4256832266585391, + "grad_norm": 0.94140625, + "learning_rate": 3.1233330074850364e-05, + "loss": 0.1535, + "step": 4190 + }, + { + "epoch": 0.4266991770801585, + "grad_norm": 2.6875, + "learning_rate": 3.115523216592786e-05, + "loss": 0.2494, + "step": 4200 + }, + { + "epoch": 0.4277151275017779, + "grad_norm": 2.578125, + "learning_rate": 3.107707026018417e-05, + "loss": 0.1705, + "step": 4210 + }, + { + "epoch": 0.4287310779233973, + "grad_norm": 3.0625, + "learning_rate": 3.09988451702797e-05, + "loss": 0.1507, + "step": 4220 + }, + { + "epoch": 0.42974702834501677, + "grad_norm": 2.421875, + "learning_rate": 3.0920557709531804e-05, + "loss": 0.3071, + "step": 4230 + }, + { + "epoch": 0.43076297876663616, + "grad_norm": 3.640625, + "learning_rate": 3.0842208691906306e-05, + "loss": 0.199, + "step": 4240 + }, + { + "epoch": 0.4317789291882556, + "grad_norm": 3.5625, + "learning_rate": 3.076379893200904e-05, + "loss": 0.1987, + "step": 4250 + }, + { + "epoch": 0.43279487960987506, + "grad_norm": 3.65625, + "learning_rate": 3.068532924507739e-05, + "loss": 0.1945, + "step": 4260 + }, + { + "epoch": 0.43381083003149445, + "grad_norm": 5.875, + "learning_rate": 3.060680044697183e-05, + "loss": 0.1937, + "step": 4270 + }, + { + "epoch": 0.4348267804531139, + "grad_norm": 2.859375, + "learning_rate": 3.052821335416739e-05, + "loss": 0.1643, + "step": 4280 + }, + { + "epoch": 0.4358427308747333, + "grad_norm": 3.296875, + "learning_rate": 3.0449568783745203e-05, + "loss": 0.1455, + "step": 4290 + }, + { + "epoch": 0.43685868129635275, + "grad_norm": 0.427734375, + "learning_rate": 3.0370867553384023e-05, + "loss": 0.1891, + "step": 4300 + }, + { + "epoch": 0.43787463171797214, + "grad_norm": 0.361328125, + "learning_rate": 3.029211048135171e-05, + "loss": 0.1377, + "step": 4310 + }, + { + "epoch": 0.4388905821395916, + "grad_norm": 1.8203125, + "learning_rate": 3.021329838649668e-05, + "loss": 0.2194, + "step": 4320 + }, + { + "epoch": 0.43990653256121104, + "grad_norm": 1.8828125, + "learning_rate": 3.0134432088239462e-05, + "loss": 0.1915, + "step": 4330 + }, + { + "epoch": 0.44092248298283043, + "grad_norm": 2.015625, + "learning_rate": 3.0055512406564146e-05, + "loss": 0.1794, + "step": 4340 + }, + { + "epoch": 0.4419384334044499, + "grad_norm": 2.546875, + "learning_rate": 2.9976540162009836e-05, + "loss": 0.2154, + "step": 4350 + }, + { + "epoch": 0.4429543838260693, + "grad_norm": 4.09375, + "learning_rate": 2.9897516175662155e-05, + "loss": 0.1861, + "step": 4360 + }, + { + "epoch": 0.4439703342476887, + "grad_norm": 3.953125, + "learning_rate": 2.9818441269144693e-05, + "loss": 0.1857, + "step": 4370 + }, + { + "epoch": 0.4449862846693081, + "grad_norm": 2.234375, + "learning_rate": 2.9739316264610452e-05, + "loss": 0.1493, + "step": 4380 + }, + { + "epoch": 0.44600223509092757, + "grad_norm": 1.109375, + "learning_rate": 2.966014198473332e-05, + "loss": 0.186, + "step": 4390 + }, + { + "epoch": 0.44701818551254696, + "grad_norm": 4.5625, + "learning_rate": 2.9580919252699502e-05, + "loss": 0.1963, + "step": 4400 + }, + { + "epoch": 0.4480341359341664, + "grad_norm": 7.3125, + "learning_rate": 2.9501648892198984e-05, + "loss": 0.2882, + "step": 4410 + }, + { + "epoch": 0.44905008635578586, + "grad_norm": 3.03125, + "learning_rate": 2.942233172741693e-05, + "loss": 0.2154, + "step": 4420 + }, + { + "epoch": 0.45006603677740525, + "grad_norm": 2.421875, + "learning_rate": 2.934296858302515e-05, + "loss": 0.2228, + "step": 4430 + }, + { + "epoch": 0.4510819871990247, + "grad_norm": 1.6015625, + "learning_rate": 2.9263560284173485e-05, + "loss": 0.1637, + "step": 4440 + }, + { + "epoch": 0.4520979376206441, + "grad_norm": 4.5, + "learning_rate": 2.91841076564813e-05, + "loss": 0.1396, + "step": 4450 + }, + { + "epoch": 0.45311388804226355, + "grad_norm": 1.9609375, + "learning_rate": 2.9104611526028808e-05, + "loss": 0.186, + "step": 4460 + }, + { + "epoch": 0.45412983846388294, + "grad_norm": 2.046875, + "learning_rate": 2.902507271934855e-05, + "loss": 0.1706, + "step": 4470 + }, + { + "epoch": 0.4551457888855024, + "grad_norm": 2.390625, + "learning_rate": 2.8945492063416768e-05, + "loss": 0.2191, + "step": 4480 + }, + { + "epoch": 0.45616173930712184, + "grad_norm": 2.734375, + "learning_rate": 2.8865870385644823e-05, + "loss": 0.1651, + "step": 4490 + }, + { + "epoch": 0.45717768972874123, + "grad_norm": 4.4375, + "learning_rate": 2.8786208513870583e-05, + "loss": 0.1907, + "step": 4500 + }, + { + "epoch": 0.4581936401503607, + "grad_norm": 1.9609375, + "learning_rate": 2.8706507276349815e-05, + "loss": 0.2256, + "step": 4510 + }, + { + "epoch": 0.4592095905719801, + "grad_norm": 3.375, + "learning_rate": 2.8626767501747588e-05, + "loss": 0.215, + "step": 4520 + }, + { + "epoch": 0.4602255409935995, + "grad_norm": 2.296875, + "learning_rate": 2.854699001912964e-05, + "loss": 0.2241, + "step": 4530 + }, + { + "epoch": 0.4612414914152189, + "grad_norm": 2.078125, + "learning_rate": 2.846717565795376e-05, + "loss": 0.1541, + "step": 4540 + }, + { + "epoch": 0.46225744183683837, + "grad_norm": 0.81640625, + "learning_rate": 2.8387325248061164e-05, + "loss": 0.1718, + "step": 4550 + }, + { + "epoch": 0.46327339225845776, + "grad_norm": 5.6875, + "learning_rate": 2.8307439619667897e-05, + "loss": 0.259, + "step": 4560 + }, + { + "epoch": 0.4642893426800772, + "grad_norm": 1.78125, + "learning_rate": 2.8227519603356157e-05, + "loss": 0.2205, + "step": 4570 + }, + { + "epoch": 0.46530529310169666, + "grad_norm": 4.78125, + "learning_rate": 2.8147566030065677e-05, + "loss": 0.2256, + "step": 4580 + }, + { + "epoch": 0.46632124352331605, + "grad_norm": 3.296875, + "learning_rate": 2.8067579731085085e-05, + "loss": 0.1671, + "step": 4590 + }, + { + "epoch": 0.4673371939449355, + "grad_norm": 3.265625, + "learning_rate": 2.7987561538043273e-05, + "loss": 0.2471, + "step": 4600 + }, + { + "epoch": 0.4683531443665549, + "grad_norm": 3.390625, + "learning_rate": 2.7907512282900727e-05, + "loss": 0.1749, + "step": 4610 + }, + { + "epoch": 0.46936909478817435, + "grad_norm": 3.140625, + "learning_rate": 2.782743279794091e-05, + "loss": 0.2276, + "step": 4620 + }, + { + "epoch": 0.47038504520979374, + "grad_norm": 2.921875, + "learning_rate": 2.7747323915761574e-05, + "loss": 0.1971, + "step": 4630 + }, + { + "epoch": 0.4714009956314132, + "grad_norm": 4.15625, + "learning_rate": 2.7667186469266122e-05, + "loss": 0.1951, + "step": 4640 + }, + { + "epoch": 0.47241694605303264, + "grad_norm": 2.953125, + "learning_rate": 2.7587021291654924e-05, + "loss": 0.2045, + "step": 4650 + }, + { + "epoch": 0.47343289647465203, + "grad_norm": 1.6640625, + "learning_rate": 2.750682921641672e-05, + "loss": 0.155, + "step": 4660 + }, + { + "epoch": 0.4744488468962715, + "grad_norm": 4.375, + "learning_rate": 2.7426611077319864e-05, + "loss": 0.2038, + "step": 4670 + }, + { + "epoch": 0.4754647973178909, + "grad_norm": 5.5, + "learning_rate": 2.734636770840372e-05, + "loss": 0.159, + "step": 4680 + }, + { + "epoch": 0.4764807477395103, + "grad_norm": 1.703125, + "learning_rate": 2.7266099943969976e-05, + "loss": 0.1566, + "step": 4690 + }, + { + "epoch": 0.4774966981611297, + "grad_norm": 0.81640625, + "learning_rate": 2.7185808618573943e-05, + "loss": 0.1927, + "step": 4700 + }, + { + "epoch": 0.47851264858274917, + "grad_norm": 0.81640625, + "learning_rate": 2.710549456701592e-05, + "loss": 0.1873, + "step": 4710 + }, + { + "epoch": 0.47952859900436856, + "grad_norm": 3.828125, + "learning_rate": 2.702515862433247e-05, + "loss": 0.2474, + "step": 4720 + }, + { + "epoch": 0.480544549425988, + "grad_norm": 1.1640625, + "learning_rate": 2.6944801625787795e-05, + "loss": 0.204, + "step": 4730 + }, + { + "epoch": 0.48156049984760746, + "grad_norm": 2.953125, + "learning_rate": 2.6864424406864984e-05, + "loss": 0.1758, + "step": 4740 + }, + { + "epoch": 0.48257645026922685, + "grad_norm": 3.265625, + "learning_rate": 2.6784027803257377e-05, + "loss": 0.161, + "step": 4750 + }, + { + "epoch": 0.4835924006908463, + "grad_norm": 2.046875, + "learning_rate": 2.6703612650859848e-05, + "loss": 0.1469, + "step": 4760 + }, + { + "epoch": 0.4846083511124657, + "grad_norm": 4.03125, + "learning_rate": 2.6623179785760148e-05, + "loss": 0.1858, + "step": 4770 + }, + { + "epoch": 0.48562430153408515, + "grad_norm": 2.65625, + "learning_rate": 2.6542730044230175e-05, + "loss": 0.176, + "step": 4780 + }, + { + "epoch": 0.48664025195570454, + "grad_norm": 2.59375, + "learning_rate": 2.6462264262717278e-05, + "loss": 0.1657, + "step": 4790 + }, + { + "epoch": 0.487656202377324, + "grad_norm": 4.78125, + "learning_rate": 2.6381783277835605e-05, + "loss": 0.2705, + "step": 4800 + }, + { + "epoch": 0.48867215279894344, + "grad_norm": 3.65625, + "learning_rate": 2.6301287926357355e-05, + "loss": 0.2252, + "step": 4810 + }, + { + "epoch": 0.48968810322056283, + "grad_norm": 0.734375, + "learning_rate": 2.622077904520411e-05, + "loss": 0.2141, + "step": 4820 + }, + { + "epoch": 0.4907040536421823, + "grad_norm": 5.15625, + "learning_rate": 2.6140257471438108e-05, + "loss": 0.1935, + "step": 4830 + }, + { + "epoch": 0.4917200040638017, + "grad_norm": 3.625, + "learning_rate": 2.6059724042253574e-05, + "loss": 0.2121, + "step": 4840 + }, + { + "epoch": 0.4927359544854211, + "grad_norm": 1.2890625, + "learning_rate": 2.5979179594967983e-05, + "loss": 0.1221, + "step": 4850 + }, + { + "epoch": 0.4937519049070405, + "grad_norm": 3.4375, + "learning_rate": 2.5898624967013367e-05, + "loss": 0.2208, + "step": 4860 + }, + { + "epoch": 0.49476785532865997, + "grad_norm": 2.40625, + "learning_rate": 2.5818060995927607e-05, + "loss": 0.1904, + "step": 4870 + }, + { + "epoch": 0.49578380575027936, + "grad_norm": 2.921875, + "learning_rate": 2.573748851934574e-05, + "loss": 0.1658, + "step": 4880 + }, + { + "epoch": 0.4967997561718988, + "grad_norm": 1.6640625, + "learning_rate": 2.5656908374991213e-05, + "loss": 0.1626, + "step": 4890 + }, + { + "epoch": 0.49781570659351826, + "grad_norm": 1.8046875, + "learning_rate": 2.557632140066721e-05, + "loss": 0.1905, + "step": 4900 + }, + { + "epoch": 0.49883165701513765, + "grad_norm": 4.875, + "learning_rate": 2.5495728434247917e-05, + "loss": 0.2591, + "step": 4910 + }, + { + "epoch": 0.4998476074367571, + "grad_norm": 1.4453125, + "learning_rate": 2.5415130313669845e-05, + "loss": 0.1359, + "step": 4920 + }, + { + "epoch": 0.5008635578583766, + "grad_norm": 2.109375, + "learning_rate": 2.5334527876923063e-05, + "loss": 0.2353, + "step": 4930 + }, + { + "epoch": 0.501879508279996, + "grad_norm": 3.546875, + "learning_rate": 2.5253921962042525e-05, + "loss": 0.2173, + "step": 4940 + }, + { + "epoch": 0.5028954587016153, + "grad_norm": 1.8125, + "learning_rate": 2.5173313407099373e-05, + "loss": 0.1631, + "step": 4950 + }, + { + "epoch": 0.5039114091232347, + "grad_norm": 2.671875, + "learning_rate": 2.5092703050192163e-05, + "loss": 0.1884, + "step": 4960 + }, + { + "epoch": 0.5049273595448542, + "grad_norm": 2.5625, + "learning_rate": 2.501209172943819e-05, + "loss": 0.217, + "step": 4970 + }, + { + "epoch": 0.5059433099664736, + "grad_norm": 4.375, + "learning_rate": 2.49314802829648e-05, + "loss": 0.1854, + "step": 4980 + }, + { + "epoch": 0.506959260388093, + "grad_norm": 2.3125, + "learning_rate": 2.4850869548900628e-05, + "loss": 0.2049, + "step": 4990 + }, + { + "epoch": 0.5079752108097125, + "grad_norm": 3.859375, + "learning_rate": 2.477026036536688e-05, + "loss": 0.2093, + "step": 5000 + }, + { + "epoch": 0.5089911612313319, + "grad_norm": 1.09375, + "learning_rate": 2.4689653570468677e-05, + "loss": 0.164, + "step": 5010 + }, + { + "epoch": 0.5100071116529513, + "grad_norm": 3.40625, + "learning_rate": 2.460905000228628e-05, + "loss": 0.1649, + "step": 5020 + }, + { + "epoch": 0.5110230620745707, + "grad_norm": 3.546875, + "learning_rate": 2.4528450498866428e-05, + "loss": 0.1777, + "step": 5030 + }, + { + "epoch": 0.5120390124961902, + "grad_norm": 3.0, + "learning_rate": 2.444785589821356e-05, + "loss": 0.1505, + "step": 5040 + }, + { + "epoch": 0.5130549629178096, + "grad_norm": 1.6484375, + "learning_rate": 2.436726703828118e-05, + "loss": 0.2672, + "step": 5050 + }, + { + "epoch": 0.514070913339429, + "grad_norm": 4.34375, + "learning_rate": 2.428668475696308e-05, + "loss": 0.1756, + "step": 5060 + }, + { + "epoch": 0.5150868637610485, + "grad_norm": 2.78125, + "learning_rate": 2.420610989208465e-05, + "loss": 0.1655, + "step": 5070 + }, + { + "epoch": 0.5161028141826679, + "grad_norm": 1.4609375, + "learning_rate": 2.412554328139419e-05, + "loss": 0.1579, + "step": 5080 + }, + { + "epoch": 0.5171187646042873, + "grad_norm": 2.28125, + "learning_rate": 2.404498576255416e-05, + "loss": 0.1599, + "step": 5090 + }, + { + "epoch": 0.5181347150259067, + "grad_norm": 0.6484375, + "learning_rate": 2.3964438173132522e-05, + "loss": 0.1508, + "step": 5100 + }, + { + "epoch": 0.5191506654475262, + "grad_norm": 3.390625, + "learning_rate": 2.388390135059395e-05, + "loss": 0.1578, + "step": 5110 + }, + { + "epoch": 0.5201666158691456, + "grad_norm": 1.21875, + "learning_rate": 2.3803376132291226e-05, + "loss": 0.1374, + "step": 5120 + }, + { + "epoch": 0.521182566290765, + "grad_norm": 4.0625, + "learning_rate": 2.3722863355456436e-05, + "loss": 0.1854, + "step": 5130 + }, + { + "epoch": 0.5221985167123845, + "grad_norm": 4.71875, + "learning_rate": 2.364236385719236e-05, + "loss": 0.1391, + "step": 5140 + }, + { + "epoch": 0.5232144671340039, + "grad_norm": 3.296875, + "learning_rate": 2.356187847446366e-05, + "loss": 0.2106, + "step": 5150 + }, + { + "epoch": 0.5242304175556233, + "grad_norm": 3.296875, + "learning_rate": 2.348140804408829e-05, + "loss": 0.2383, + "step": 5160 + }, + { + "epoch": 0.5252463679772427, + "grad_norm": 3.359375, + "learning_rate": 2.3400953402728713e-05, + "loss": 0.1537, + "step": 5170 + }, + { + "epoch": 0.5262623183988622, + "grad_norm": 1.4921875, + "learning_rate": 2.332051538688322e-05, + "loss": 0.1841, + "step": 5180 + }, + { + "epoch": 0.5272782688204816, + "grad_norm": 3.25, + "learning_rate": 2.3240094832877287e-05, + "loss": 0.1855, + "step": 5190 + }, + { + "epoch": 0.528294219242101, + "grad_norm": 3.34375, + "learning_rate": 2.3159692576854793e-05, + "loss": 0.2625, + "step": 5200 + }, + { + "epoch": 0.5293101696637205, + "grad_norm": 3.6875, + "learning_rate": 2.3079309454769413e-05, + "loss": 0.1292, + "step": 5210 + }, + { + "epoch": 0.5303261200853399, + "grad_norm": 1.1171875, + "learning_rate": 2.2998946302375827e-05, + "loss": 0.1263, + "step": 5220 + }, + { + "epoch": 0.5313420705069593, + "grad_norm": 2.71875, + "learning_rate": 2.2918603955221148e-05, + "loss": 0.2296, + "step": 5230 + }, + { + "epoch": 0.5323580209285786, + "grad_norm": 2.015625, + "learning_rate": 2.283828324863613e-05, + "loss": 0.1231, + "step": 5240 + }, + { + "epoch": 0.5333739713501982, + "grad_norm": 3.671875, + "learning_rate": 2.2757985017726557e-05, + "loss": 0.1939, + "step": 5250 + }, + { + "epoch": 0.5343899217718175, + "grad_norm": 1.9765625, + "learning_rate": 2.2677710097364495e-05, + "loss": 0.168, + "step": 5260 + }, + { + "epoch": 0.5354058721934369, + "grad_norm": 2.609375, + "learning_rate": 2.259745932217969e-05, + "loss": 0.1883, + "step": 5270 + }, + { + "epoch": 0.5364218226150563, + "grad_norm": 2.8125, + "learning_rate": 2.2517233526550817e-05, + "loss": 0.1898, + "step": 5280 + }, + { + "epoch": 0.5374377730366758, + "grad_norm": 3.125, + "learning_rate": 2.2437033544596837e-05, + "loss": 0.1838, + "step": 5290 + }, + { + "epoch": 0.5384537234582952, + "grad_norm": 4.90625, + "learning_rate": 2.2356860210168336e-05, + "loss": 0.1553, + "step": 5300 + }, + { + "epoch": 0.5394696738799146, + "grad_norm": 3.171875, + "learning_rate": 2.2276714356838824e-05, + "loss": 0.2248, + "step": 5310 + }, + { + "epoch": 0.5404856243015341, + "grad_norm": 1.34375, + "learning_rate": 2.2196596817896118e-05, + "loss": 0.1421, + "step": 5320 + }, + { + "epoch": 0.5415015747231535, + "grad_norm": 3.28125, + "learning_rate": 2.2116508426333596e-05, + "loss": 0.1947, + "step": 5330 + }, + { + "epoch": 0.5425175251447729, + "grad_norm": 1.9296875, + "learning_rate": 2.2036450014841652e-05, + "loss": 0.2207, + "step": 5340 + }, + { + "epoch": 0.5435334755663923, + "grad_norm": 0.5703125, + "learning_rate": 2.19564224157989e-05, + "loss": 0.2208, + "step": 5350 + }, + { + "epoch": 0.5445494259880118, + "grad_norm": 7.5625, + "learning_rate": 2.1876426461263654e-05, + "loss": 0.1739, + "step": 5360 + }, + { + "epoch": 0.5455653764096312, + "grad_norm": 2.15625, + "learning_rate": 2.179646298296519e-05, + "loss": 0.1938, + "step": 5370 + }, + { + "epoch": 0.5465813268312506, + "grad_norm": 4.1875, + "learning_rate": 2.171653281229511e-05, + "loss": 0.1736, + "step": 5380 + }, + { + "epoch": 0.5475972772528701, + "grad_norm": 4.65625, + "learning_rate": 2.1636636780298732e-05, + "loss": 0.2167, + "step": 5390 + }, + { + "epoch": 0.5486132276744895, + "grad_norm": 1.84375, + "learning_rate": 2.1556775717666427e-05, + "loss": 0.1711, + "step": 5400 + }, + { + "epoch": 0.5496291780961089, + "grad_norm": 5.125, + "learning_rate": 2.147695045472499e-05, + "loss": 0.1789, + "step": 5410 + }, + { + "epoch": 0.5506451285177283, + "grad_norm": 3.859375, + "learning_rate": 2.1397161821428973e-05, + "loss": 0.2187, + "step": 5420 + }, + { + "epoch": 0.5516610789393478, + "grad_norm": 2.25, + "learning_rate": 2.131741064735212e-05, + "loss": 0.1367, + "step": 5430 + }, + { + "epoch": 0.5526770293609672, + "grad_norm": 4.65625, + "learning_rate": 2.1237697761678684e-05, + "loss": 0.1574, + "step": 5440 + }, + { + "epoch": 0.5536929797825866, + "grad_norm": 1.2265625, + "learning_rate": 2.1158023993194848e-05, + "loss": 0.1301, + "step": 5450 + }, + { + "epoch": 0.5547089302042061, + "grad_norm": 4.21875, + "learning_rate": 2.107839017028005e-05, + "loss": 0.2782, + "step": 5460 + }, + { + "epoch": 0.5557248806258255, + "grad_norm": 0.52734375, + "learning_rate": 2.0998797120898457e-05, + "loss": 0.2024, + "step": 5470 + }, + { + "epoch": 0.5567408310474449, + "grad_norm": 1.46875, + "learning_rate": 2.0919245672590277e-05, + "loss": 0.1755, + "step": 5480 + }, + { + "epoch": 0.5577567814690643, + "grad_norm": 2.140625, + "learning_rate": 2.083973665246318e-05, + "loss": 0.2058, + "step": 5490 + }, + { + "epoch": 0.5587727318906838, + "grad_norm": 1.5390625, + "learning_rate": 2.076027088718373e-05, + "loss": 0.2159, + "step": 5500 + }, + { + "epoch": 0.5597886823123032, + "grad_norm": 1.9921875, + "learning_rate": 2.0680849202968743e-05, + "loss": 0.2139, + "step": 5510 + }, + { + "epoch": 0.5608046327339226, + "grad_norm": 2.4375, + "learning_rate": 2.060147242557674e-05, + "loss": 0.183, + "step": 5520 + }, + { + "epoch": 0.5618205831555421, + "grad_norm": 5.5, + "learning_rate": 2.0522141380299308e-05, + "loss": 0.1673, + "step": 5530 + }, + { + "epoch": 0.5628365335771615, + "grad_norm": 4.25, + "learning_rate": 2.044285689195258e-05, + "loss": 0.1674, + "step": 5540 + }, + { + "epoch": 0.5638524839987809, + "grad_norm": 2.109375, + "learning_rate": 2.0363619784868604e-05, + "loss": 0.1531, + "step": 5550 + }, + { + "epoch": 0.5648684344204002, + "grad_norm": 2.59375, + "learning_rate": 2.0284430882886836e-05, + "loss": 0.1665, + "step": 5560 + }, + { + "epoch": 0.5658843848420197, + "grad_norm": 3.984375, + "learning_rate": 2.020529100934549e-05, + "loss": 0.1717, + "step": 5570 + }, + { + "epoch": 0.5669003352636391, + "grad_norm": 1.6015625, + "learning_rate": 2.012620098707306e-05, + "loss": 0.1167, + "step": 5580 + }, + { + "epoch": 0.5679162856852585, + "grad_norm": 6.0625, + "learning_rate": 2.004716163837972e-05, + "loss": 0.2084, + "step": 5590 + }, + { + "epoch": 0.5689322361068779, + "grad_norm": 2.5625, + "learning_rate": 1.996817378504876e-05, + "loss": 0.1939, + "step": 5600 + }, + { + "epoch": 0.5699481865284974, + "grad_norm": 3.109375, + "learning_rate": 1.9889238248328108e-05, + "loss": 0.1241, + "step": 5610 + }, + { + "epoch": 0.5709641369501168, + "grad_norm": 4.875, + "learning_rate": 1.981035584892171e-05, + "loss": 0.1865, + "step": 5620 + }, + { + "epoch": 0.5719800873717362, + "grad_norm": 2.984375, + "learning_rate": 1.9731527406981072e-05, + "loss": 0.1639, + "step": 5630 + }, + { + "epoch": 0.5729960377933557, + "grad_norm": 4.4375, + "learning_rate": 1.9652753742096655e-05, + "loss": 0.2019, + "step": 5640 + }, + { + "epoch": 0.5740119882149751, + "grad_norm": 4.3125, + "learning_rate": 1.9574035673289432e-05, + "loss": 0.1829, + "step": 5650 + }, + { + "epoch": 0.5750279386365945, + "grad_norm": 3.203125, + "learning_rate": 1.9495374019002312e-05, + "loss": 0.2267, + "step": 5660 + }, + { + "epoch": 0.5760438890582139, + "grad_norm": 1.765625, + "learning_rate": 1.9416769597091673e-05, + "loss": 0.1411, + "step": 5670 + }, + { + "epoch": 0.5770598394798334, + "grad_norm": 2.640625, + "learning_rate": 1.9338223224818818e-05, + "loss": 0.1476, + "step": 5680 + }, + { + "epoch": 0.5780757899014528, + "grad_norm": 4.84375, + "learning_rate": 1.9259735718841524e-05, + "loss": 0.1417, + "step": 5690 + }, + { + "epoch": 0.5790917403230722, + "grad_norm": 2.421875, + "learning_rate": 1.918130789520551e-05, + "loss": 0.1592, + "step": 5700 + }, + { + "epoch": 0.5801076907446917, + "grad_norm": 2.984375, + "learning_rate": 1.9102940569335963e-05, + "loss": 0.161, + "step": 5710 + }, + { + "epoch": 0.5811236411663111, + "grad_norm": 1.0234375, + "learning_rate": 1.9024634556029093e-05, + "loss": 0.1614, + "step": 5720 + }, + { + "epoch": 0.5821395915879305, + "grad_norm": 2.90625, + "learning_rate": 1.89463906694436e-05, + "loss": 0.1505, + "step": 5730 + }, + { + "epoch": 0.5831555420095499, + "grad_norm": 2.875, + "learning_rate": 1.8868209723092286e-05, + "loss": 0.1674, + "step": 5740 + }, + { + "epoch": 0.5841714924311694, + "grad_norm": 0.408203125, + "learning_rate": 1.8790092529833508e-05, + "loss": 0.1468, + "step": 5750 + }, + { + "epoch": 0.5851874428527888, + "grad_norm": 5.1875, + "learning_rate": 1.871203990186281e-05, + "loss": 0.1903, + "step": 5760 + }, + { + "epoch": 0.5862033932744082, + "grad_norm": 0.5546875, + "learning_rate": 1.8634052650704415e-05, + "loss": 0.2644, + "step": 5770 + }, + { + "epoch": 0.5872193436960277, + "grad_norm": 3.203125, + "learning_rate": 1.8556131587202848e-05, + "loss": 0.1968, + "step": 5780 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 2.484375, + "learning_rate": 1.8478277521514424e-05, + "loss": 0.2249, + "step": 5790 + }, + { + "epoch": 0.5892512445392665, + "grad_norm": 4.0, + "learning_rate": 1.8400491263098906e-05, + "loss": 0.1881, + "step": 5800 + }, + { + "epoch": 0.5902671949608859, + "grad_norm": 1.90625, + "learning_rate": 1.832277362071106e-05, + "loss": 0.1352, + "step": 5810 + }, + { + "epoch": 0.5912831453825054, + "grad_norm": 2.765625, + "learning_rate": 1.824512540239221e-05, + "loss": 0.2737, + "step": 5820 + }, + { + "epoch": 0.5922990958041248, + "grad_norm": 2.609375, + "learning_rate": 1.81675474154619e-05, + "loss": 0.1566, + "step": 5830 + }, + { + "epoch": 0.5933150462257442, + "grad_norm": 2.6875, + "learning_rate": 1.8090040466509444e-05, + "loss": 0.1999, + "step": 5840 + }, + { + "epoch": 0.5943309966473637, + "grad_norm": 2.609375, + "learning_rate": 1.8012605361385592e-05, + "loss": 0.2372, + "step": 5850 + }, + { + "epoch": 0.5953469470689831, + "grad_norm": 8.125, + "learning_rate": 1.7935242905194087e-05, + "loss": 0.2411, + "step": 5860 + }, + { + "epoch": 0.5963628974906025, + "grad_norm": 3.46875, + "learning_rate": 1.785795390228336e-05, + "loss": 0.138, + "step": 5870 + }, + { + "epoch": 0.5973788479122218, + "grad_norm": 2.3125, + "learning_rate": 1.7780739156238125e-05, + "loss": 0.1867, + "step": 5880 + }, + { + "epoch": 0.5983947983338413, + "grad_norm": 4.0625, + "learning_rate": 1.770359946987105e-05, + "loss": 0.2091, + "step": 5890 + }, + { + "epoch": 0.5994107487554607, + "grad_norm": 5.21875, + "learning_rate": 1.7626535645214378e-05, + "loss": 0.2091, + "step": 5900 + }, + { + "epoch": 0.6004266991770801, + "grad_norm": 3.15625, + "learning_rate": 1.7549548483511614e-05, + "loss": 0.1927, + "step": 5910 + }, + { + "epoch": 0.6014426495986995, + "grad_norm": 4.71875, + "learning_rate": 1.7472638785209198e-05, + "loss": 0.1893, + "step": 5920 + }, + { + "epoch": 0.602458600020319, + "grad_norm": 3.015625, + "learning_rate": 1.7395807349948145e-05, + "loss": 0.1557, + "step": 5930 + }, + { + "epoch": 0.6034745504419384, + "grad_norm": 2.9375, + "learning_rate": 1.73190549765558e-05, + "loss": 0.1717, + "step": 5940 + }, + { + "epoch": 0.6044905008635578, + "grad_norm": 3.109375, + "learning_rate": 1.724238246303745e-05, + "loss": 0.1879, + "step": 5950 + }, + { + "epoch": 0.6055064512851773, + "grad_norm": 3.875, + "learning_rate": 1.71657906065681e-05, + "loss": 0.1908, + "step": 5960 + }, + { + "epoch": 0.6065224017067967, + "grad_norm": 5.09375, + "learning_rate": 1.7089280203484115e-05, + "loss": 0.1712, + "step": 5970 + }, + { + "epoch": 0.6075383521284161, + "grad_norm": 3.015625, + "learning_rate": 1.701285204927502e-05, + "loss": 0.1454, + "step": 5980 + }, + { + "epoch": 0.6085543025500355, + "grad_norm": 3.265625, + "learning_rate": 1.693650693857515e-05, + "loss": 0.2283, + "step": 5990 + }, + { + "epoch": 0.609570252971655, + "grad_norm": 3.40625, + "learning_rate": 1.6860245665155466e-05, + "loss": 0.2188, + "step": 6000 + }, + { + "epoch": 0.6105862033932744, + "grad_norm": 2.5625, + "learning_rate": 1.678406902191521e-05, + "loss": 0.1605, + "step": 6010 + }, + { + "epoch": 0.6116021538148938, + "grad_norm": 0.6796875, + "learning_rate": 1.670797780087374e-05, + "loss": 0.1472, + "step": 6020 + }, + { + "epoch": 0.6126181042365133, + "grad_norm": 2.234375, + "learning_rate": 1.6631972793162288e-05, + "loss": 0.1676, + "step": 6030 + }, + { + "epoch": 0.6136340546581327, + "grad_norm": 1.25, + "learning_rate": 1.6556054789015662e-05, + "loss": 0.1508, + "step": 6040 + }, + { + "epoch": 0.6146500050797521, + "grad_norm": 4.78125, + "learning_rate": 1.6480224577764132e-05, + "loss": 0.1981, + "step": 6050 + }, + { + "epoch": 0.6156659555013715, + "grad_norm": 3.46875, + "learning_rate": 1.6404482947825137e-05, + "loss": 0.2514, + "step": 6060 + }, + { + "epoch": 0.616681905922991, + "grad_norm": 1.265625, + "learning_rate": 1.6328830686695154e-05, + "loss": 0.2397, + "step": 6070 + }, + { + "epoch": 0.6176978563446104, + "grad_norm": 1.953125, + "learning_rate": 1.625326858094144e-05, + "loss": 0.1523, + "step": 6080 + }, + { + "epoch": 0.6187138067662298, + "grad_norm": 3.484375, + "learning_rate": 1.6177797416193953e-05, + "loss": 0.218, + "step": 6090 + }, + { + "epoch": 0.6197297571878493, + "grad_norm": 3.484375, + "learning_rate": 1.6102417977137052e-05, + "loss": 0.1476, + "step": 6100 + }, + { + "epoch": 0.6207457076094687, + "grad_norm": 4.90625, + "learning_rate": 1.602713104750147e-05, + "loss": 0.1818, + "step": 6110 + }, + { + "epoch": 0.6217616580310881, + "grad_norm": 4.375, + "learning_rate": 1.5951937410056087e-05, + "loss": 0.2061, + "step": 6120 + }, + { + "epoch": 0.6227776084527075, + "grad_norm": 6.3125, + "learning_rate": 1.587683784659979e-05, + "loss": 0.1566, + "step": 6130 + }, + { + "epoch": 0.623793558874327, + "grad_norm": 2.828125, + "learning_rate": 1.58018331379534e-05, + "loss": 0.1376, + "step": 6140 + }, + { + "epoch": 0.6248095092959464, + "grad_norm": 2.40625, + "learning_rate": 1.572692406395149e-05, + "loss": 0.1655, + "step": 6150 + }, + { + "epoch": 0.6258254597175658, + "grad_norm": 4.34375, + "learning_rate": 1.5652111403434338e-05, + "loss": 0.2363, + "step": 6160 + }, + { + "epoch": 0.6268414101391853, + "grad_norm": 2.453125, + "learning_rate": 1.5577395934239757e-05, + "loss": 0.2464, + "step": 6170 + }, + { + "epoch": 0.6278573605608047, + "grad_norm": 2.53125, + "learning_rate": 1.5502778433195085e-05, + "loss": 0.1898, + "step": 6180 + }, + { + "epoch": 0.628873310982424, + "grad_norm": 2.28125, + "learning_rate": 1.5428259676109048e-05, + "loss": 0.1804, + "step": 6190 + }, + { + "epoch": 0.6298892614040434, + "grad_norm": 4.3125, + "learning_rate": 1.5353840437763732e-05, + "loss": 0.1409, + "step": 6200 + }, + { + "epoch": 0.630905211825663, + "grad_norm": 2.5625, + "learning_rate": 1.5279521491906496e-05, + "loss": 0.2449, + "step": 6210 + }, + { + "epoch": 0.6319211622472823, + "grad_norm": 3.0625, + "learning_rate": 1.520530361124195e-05, + "loss": 0.2103, + "step": 6220 + }, + { + "epoch": 0.6329371126689017, + "grad_norm": 2.609375, + "learning_rate": 1.5131187567423937e-05, + "loss": 0.2156, + "step": 6230 + }, + { + "epoch": 0.6339530630905211, + "grad_norm": 2.703125, + "learning_rate": 1.5057174131047446e-05, + "loss": 0.161, + "step": 6240 + }, + { + "epoch": 0.6349690135121406, + "grad_norm": 3.265625, + "learning_rate": 1.4983264071640679e-05, + "loss": 0.1757, + "step": 6250 + }, + { + "epoch": 0.63598496393376, + "grad_norm": 3.15625, + "learning_rate": 1.490945815765699e-05, + "loss": 0.2011, + "step": 6260 + }, + { + "epoch": 0.6370009143553794, + "grad_norm": 5.375, + "learning_rate": 1.4835757156466945e-05, + "loss": 0.1658, + "step": 6270 + }, + { + "epoch": 0.6380168647769989, + "grad_norm": 2.984375, + "learning_rate": 1.4762161834350271e-05, + "loss": 0.1754, + "step": 6280 + }, + { + "epoch": 0.6390328151986183, + "grad_norm": 2.015625, + "learning_rate": 1.4688672956487987e-05, + "loss": 0.1427, + "step": 6290 + }, + { + "epoch": 0.6400487656202377, + "grad_norm": 3.78125, + "learning_rate": 1.4615291286954352e-05, + "loss": 0.1517, + "step": 6300 + }, + { + "epoch": 0.6410647160418571, + "grad_norm": 2.859375, + "learning_rate": 1.4542017588709005e-05, + "loss": 0.2348, + "step": 6310 + }, + { + "epoch": 0.6420806664634766, + "grad_norm": 2.421875, + "learning_rate": 1.4468852623588961e-05, + "loss": 0.2089, + "step": 6320 + }, + { + "epoch": 0.643096616885096, + "grad_norm": 2.15625, + "learning_rate": 1.4395797152300719e-05, + "loss": 0.1702, + "step": 6330 + }, + { + "epoch": 0.6441125673067154, + "grad_norm": 1.53125, + "learning_rate": 1.4322851934412382e-05, + "loss": 0.1017, + "step": 6340 + }, + { + "epoch": 0.6451285177283349, + "grad_norm": 1.90625, + "learning_rate": 1.4250017728345716e-05, + "loss": 0.1813, + "step": 6350 + }, + { + "epoch": 0.6461444681499543, + "grad_norm": 2.015625, + "learning_rate": 1.4177295291368292e-05, + "loss": 0.1095, + "step": 6360 + }, + { + "epoch": 0.6471604185715737, + "grad_norm": 2.625, + "learning_rate": 1.410468537958558e-05, + "loss": 0.2259, + "step": 6370 + }, + { + "epoch": 0.6481763689931931, + "grad_norm": 3.5, + "learning_rate": 1.4032188747933136e-05, + "loss": 0.1595, + "step": 6380 + }, + { + "epoch": 0.6491923194148126, + "grad_norm": 5.21875, + "learning_rate": 1.39598061501687e-05, + "loss": 0.2226, + "step": 6390 + }, + { + "epoch": 0.650208269836432, + "grad_norm": 5.34375, + "learning_rate": 1.388753833886442e-05, + "loss": 0.2132, + "step": 6400 + }, + { + "epoch": 0.6512242202580514, + "grad_norm": 3.640625, + "learning_rate": 1.3815386065398945e-05, + "loss": 0.1227, + "step": 6410 + }, + { + "epoch": 0.6522401706796709, + "grad_norm": 1.0, + "learning_rate": 1.3743350079949705e-05, + "loss": 0.1755, + "step": 6420 + }, + { + "epoch": 0.6532561211012903, + "grad_norm": 2.359375, + "learning_rate": 1.3671431131485057e-05, + "loss": 0.1552, + "step": 6430 + }, + { + "epoch": 0.6542720715229097, + "grad_norm": 5.3125, + "learning_rate": 1.3599629967756483e-05, + "loss": 0.1917, + "step": 6440 + }, + { + "epoch": 0.6552880219445291, + "grad_norm": 4.625, + "learning_rate": 1.3527947335290877e-05, + "loss": 0.1812, + "step": 6450 + }, + { + "epoch": 0.6563039723661486, + "grad_norm": 1.234375, + "learning_rate": 1.3456383979382708e-05, + "loss": 0.1896, + "step": 6460 + }, + { + "epoch": 0.657319922787768, + "grad_norm": 3.984375, + "learning_rate": 1.3384940644086352e-05, + "loss": 0.1484, + "step": 6470 + }, + { + "epoch": 0.6583358732093874, + "grad_norm": 2.40625, + "learning_rate": 1.3313618072208268e-05, + "loss": 0.1334, + "step": 6480 + }, + { + "epoch": 0.6593518236310069, + "grad_norm": 4.375, + "learning_rate": 1.3242417005299357e-05, + "loss": 0.1351, + "step": 6490 + }, + { + "epoch": 0.6603677740526263, + "grad_norm": 2.640625, + "learning_rate": 1.31713381836472e-05, + "loss": 0.1717, + "step": 6500 + } + ], + "logging_steps": 10, + "max_steps": 9843, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoints/checkpoint-6500/training_args.bin b/checkpoints/checkpoint-6500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bb0a4ec10ab29ca1942aecdfa4212d352e373f1 --- /dev/null +++ b/checkpoints/checkpoint-6500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 +size 4984 diff --git a/checkpoints/checkpoint-7000/adapter_config.json b/checkpoints/checkpoint-7000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70 --- /dev/null +++ b/checkpoints/checkpoint-7000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "gaussian", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoints/checkpoint-7000/adapter_model.safetensors b/checkpoints/checkpoint-7000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f035c3b02bb414b55a51e0686d93c6e41268b717 --- /dev/null +++ b/checkpoints/checkpoint-7000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcfbb907ba164ea8f4886b499272e552c75b66b5f57c2758a433e516afc4d4ce +size 5919456 diff --git a/checkpoints/checkpoint-7000/optimizer.pt b/checkpoints/checkpoint-7000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e2b419628c564de6327192effa738d6dcc841ee6 --- /dev/null +++ b/checkpoints/checkpoint-7000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18252d03a633e94bef652fd14050f4f4c005fa3e1408f904a8582bde8cf6615c +size 11930938 diff --git a/checkpoints/checkpoint-7000/rng_state_0.pth b/checkpoints/checkpoint-7000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8a64a4f50d135d9aab30a3efe7dd0f870f273099 --- /dev/null +++ b/checkpoints/checkpoint-7000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:189e349a3350f8ede7259e8b9c3805a87a95924562b5e53a021c94a808a1c148 +size 15024 diff --git a/checkpoints/checkpoint-7000/rng_state_1.pth b/checkpoints/checkpoint-7000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3669897a1580ca7339f57897968f1cd06fa56704 --- /dev/null +++ b/checkpoints/checkpoint-7000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb75d1d5d631f8db237b129d4d42263dc618eca39919111a64bdee685ea49d66 +size 15024 diff --git a/checkpoints/checkpoint-7000/rng_state_2.pth b/checkpoints/checkpoint-7000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..26649e78cb314240d04745c505919161570ff823 --- /dev/null +++ b/checkpoints/checkpoint-7000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35edf2986023cc71305d54b71c695198c4e470f12b08318d2bdd928b6f0040bb +size 15024 diff --git a/checkpoints/checkpoint-7000/rng_state_3.pth b/checkpoints/checkpoint-7000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..18f1294c99a3714188a7fd11fee212fba2a544af --- /dev/null +++ b/checkpoints/checkpoint-7000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6aeb7ee84fba724996f3c432c15837e87dd1f98cb887991d0b5902eadb092ea +size 15024 diff --git a/checkpoints/checkpoint-7000/scheduler.pt b/checkpoints/checkpoint-7000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..38eb3798e21f4a386d944a1c7b7f72bbd3680866 --- /dev/null +++ b/checkpoints/checkpoint-7000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c38f0818bc728352543f333573634bed4254978229e01cce08ff84863c12dc13 +size 1064 diff --git a/checkpoints/checkpoint-7000/trainer_state.json b/checkpoints/checkpoint-7000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2be0b7f759106ae9ab905f59d43ffd30cde0054b --- /dev/null +++ b/checkpoints/checkpoint-7000/trainer_state.json @@ -0,0 +1,4921 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7111652951335975, + "eval_steps": 500, + "global_step": 7000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001015950421619425, + "grad_norm": 17.625, + "learning_rate": 5e-06, + "loss": 3.4264, + "step": 10 + }, + { + "epoch": 0.00203190084323885, + "grad_norm": 12.5625, + "learning_rate": 1e-05, + "loss": 3.432, + "step": 20 + }, + { + "epoch": 0.003047851264858275, + "grad_norm": 14.0625, + "learning_rate": 1.5e-05, + "loss": 3.23, + "step": 30 + }, + { + "epoch": 0.0040638016864777, + "grad_norm": 12.4375, + "learning_rate": 2e-05, + "loss": 2.9762, + "step": 40 + }, + { + "epoch": 0.005079752108097125, + "grad_norm": 10.0625, + "learning_rate": 2.5e-05, + "loss": 2.6173, + "step": 50 + }, + { + "epoch": 0.00609570252971655, + "grad_norm": 10.1875, + "learning_rate": 3e-05, + "loss": 2.2004, + "step": 60 + }, + { + "epoch": 0.007111652951335975, + "grad_norm": 7.03125, + "learning_rate": 3.5e-05, + "loss": 1.4176, + "step": 70 + }, + { + "epoch": 0.0081276033729554, + "grad_norm": 4.375, + "learning_rate": 4e-05, + "loss": 1.0122, + "step": 80 + }, + { + "epoch": 0.009143553794574825, + "grad_norm": 6.5625, + "learning_rate": 4.5e-05, + "loss": 0.9116, + "step": 90 + }, + { + "epoch": 0.01015950421619425, + "grad_norm": 5.28125, + "learning_rate": 5e-05, + "loss": 0.6832, + "step": 100 + }, + { + "epoch": 0.011175454637813675, + "grad_norm": 5.5, + "learning_rate": 4.9999870035728426e-05, + "loss": 0.7355, + "step": 110 + }, + { + "epoch": 0.0121914050594331, + "grad_norm": 5.1875, + "learning_rate": 4.9999480144264944e-05, + "loss": 0.6673, + "step": 120 + }, + { + "epoch": 0.013207355481052525, + "grad_norm": 4.5, + "learning_rate": 4.9998830329663314e-05, + "loss": 0.6792, + "step": 130 + }, + { + "epoch": 0.01422330590267195, + "grad_norm": 3.9375, + "learning_rate": 4.9997920598679756e-05, + "loss": 0.6207, + "step": 140 + }, + { + "epoch": 0.015239256324291375, + "grad_norm": 3.15625, + "learning_rate": 4.999675096077286e-05, + "loss": 0.483, + "step": 150 + }, + { + "epoch": 0.0162552067459108, + "grad_norm": 5.28125, + "learning_rate": 4.999532142810354e-05, + "loss": 0.5319, + "step": 160 + }, + { + "epoch": 0.017271157167530225, + "grad_norm": 4.59375, + "learning_rate": 4.999363201553483e-05, + "loss": 0.6052, + "step": 170 + }, + { + "epoch": 0.01828710758914965, + "grad_norm": 5.03125, + "learning_rate": 4.9991682740631794e-05, + "loss": 0.4258, + "step": 180 + }, + { + "epoch": 0.019303058010769075, + "grad_norm": 3.859375, + "learning_rate": 4.998947362366133e-05, + "loss": 0.4309, + "step": 190 + }, + { + "epoch": 0.0203190084323885, + "grad_norm": 3.328125, + "learning_rate": 4.998700468759193e-05, + "loss": 0.3957, + "step": 200 + }, + { + "epoch": 0.021334958854007924, + "grad_norm": 4.9375, + "learning_rate": 4.9984275958093475e-05, + "loss": 0.4777, + "step": 210 + }, + { + "epoch": 0.02235090927562735, + "grad_norm": 4.78125, + "learning_rate": 4.998128746353695e-05, + "loss": 0.3549, + "step": 220 + }, + { + "epoch": 0.023366859697246774, + "grad_norm": 4.0625, + "learning_rate": 4.997803923499417e-05, + "loss": 0.4447, + "step": 230 + }, + { + "epoch": 0.0243828101188662, + "grad_norm": 6.375, + "learning_rate": 4.99745313062374e-05, + "loss": 0.3808, + "step": 240 + }, + { + "epoch": 0.025398760540485624, + "grad_norm": 3.59375, + "learning_rate": 4.99707637137391e-05, + "loss": 0.3827, + "step": 250 + }, + { + "epoch": 0.02641471096210505, + "grad_norm": 3.015625, + "learning_rate": 4.996673649667145e-05, + "loss": 0.3694, + "step": 260 + }, + { + "epoch": 0.027430661383724474, + "grad_norm": 2.296875, + "learning_rate": 4.9962449696906e-05, + "loss": 0.3586, + "step": 270 + }, + { + "epoch": 0.0284466118053439, + "grad_norm": 4.125, + "learning_rate": 4.9957903359013214e-05, + "loss": 0.3832, + "step": 280 + }, + { + "epoch": 0.029462562226963324, + "grad_norm": 3.296875, + "learning_rate": 4.995309753026201e-05, + "loss": 0.328, + "step": 290 + }, + { + "epoch": 0.03047851264858275, + "grad_norm": 4.5, + "learning_rate": 4.994803226061927e-05, + "loss": 0.3667, + "step": 300 + }, + { + "epoch": 0.03149446307020217, + "grad_norm": 4.3125, + "learning_rate": 4.994270760274933e-05, + "loss": 0.3811, + "step": 310 + }, + { + "epoch": 0.0325104134918216, + "grad_norm": 3.421875, + "learning_rate": 4.99371236120134e-05, + "loss": 0.3065, + "step": 320 + }, + { + "epoch": 0.03352636391344102, + "grad_norm": 4.6875, + "learning_rate": 4.993128034646902e-05, + "loss": 0.4177, + "step": 330 + }, + { + "epoch": 0.03454231433506045, + "grad_norm": 3.046875, + "learning_rate": 4.992517786686947e-05, + "loss": 0.33, + "step": 340 + }, + { + "epoch": 0.03555826475667987, + "grad_norm": 1.8828125, + "learning_rate": 4.9918816236663077e-05, + "loss": 0.3287, + "step": 350 + }, + { + "epoch": 0.0365742151782993, + "grad_norm": 3.8125, + "learning_rate": 4.991219552199262e-05, + "loss": 0.2934, + "step": 360 + }, + { + "epoch": 0.03759016559991872, + "grad_norm": 4.28125, + "learning_rate": 4.99053157916946e-05, + "loss": 0.3176, + "step": 370 + }, + { + "epoch": 0.03860611602153815, + "grad_norm": 2.609375, + "learning_rate": 4.989817711729856e-05, + "loss": 0.3318, + "step": 380 + }, + { + "epoch": 0.03962206644315757, + "grad_norm": 2.375, + "learning_rate": 4.98907795730263e-05, + "loss": 0.3234, + "step": 390 + }, + { + "epoch": 0.040638016864777, + "grad_norm": 4.46875, + "learning_rate": 4.988312323579114e-05, + "loss": 0.267, + "step": 400 + }, + { + "epoch": 0.04165396728639642, + "grad_norm": 3.75, + "learning_rate": 4.98752081851971e-05, + "loss": 0.3081, + "step": 410 + }, + { + "epoch": 0.04266991770801585, + "grad_norm": 2.203125, + "learning_rate": 4.986703450353809e-05, + "loss": 0.2917, + "step": 420 + }, + { + "epoch": 0.04368586812963527, + "grad_norm": 1.6015625, + "learning_rate": 4.985860227579703e-05, + "loss": 0.2805, + "step": 430 + }, + { + "epoch": 0.0447018185512547, + "grad_norm": 3.140625, + "learning_rate": 4.984991158964499e-05, + "loss": 0.3534, + "step": 440 + }, + { + "epoch": 0.04571776897287412, + "grad_norm": 3.296875, + "learning_rate": 4.9840962535440265e-05, + "loss": 0.335, + "step": 450 + }, + { + "epoch": 0.04673371939449355, + "grad_norm": 3.25, + "learning_rate": 4.983175520622744e-05, + "loss": 0.2544, + "step": 460 + }, + { + "epoch": 0.04774966981611297, + "grad_norm": 2.25, + "learning_rate": 4.982228969773642e-05, + "loss": 0.3449, + "step": 470 + }, + { + "epoch": 0.0487656202377324, + "grad_norm": 4.9375, + "learning_rate": 4.9812566108381435e-05, + "loss": 0.2964, + "step": 480 + }, + { + "epoch": 0.04978157065935182, + "grad_norm": 1.5703125, + "learning_rate": 4.9802584539260035e-05, + "loss": 0.2799, + "step": 490 + }, + { + "epoch": 0.05079752108097125, + "grad_norm": 2.828125, + "learning_rate": 4.979234509415199e-05, + "loss": 0.3231, + "step": 500 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 2.9375, + "learning_rate": 4.978184787951828e-05, + "loss": 0.2943, + "step": 510 + }, + { + "epoch": 0.0528294219242101, + "grad_norm": 2.34375, + "learning_rate": 4.977109300449992e-05, + "loss": 0.2705, + "step": 520 + }, + { + "epoch": 0.05384537234582952, + "grad_norm": 3.140625, + "learning_rate": 4.9760080580916876e-05, + "loss": 0.2998, + "step": 530 + }, + { + "epoch": 0.05486132276744895, + "grad_norm": 3.5625, + "learning_rate": 4.974881072326688e-05, + "loss": 0.2595, + "step": 540 + }, + { + "epoch": 0.05587727318906837, + "grad_norm": 4.25, + "learning_rate": 4.9737283548724236e-05, + "loss": 0.2803, + "step": 550 + }, + { + "epoch": 0.0568932236106878, + "grad_norm": 4.0625, + "learning_rate": 4.97254991771386e-05, + "loss": 0.3511, + "step": 560 + }, + { + "epoch": 0.05790917403230722, + "grad_norm": 2.515625, + "learning_rate": 4.971345773103377e-05, + "loss": 0.312, + "step": 570 + }, + { + "epoch": 0.05892512445392665, + "grad_norm": 3.21875, + "learning_rate": 4.9701159335606365e-05, + "loss": 0.2482, + "step": 580 + }, + { + "epoch": 0.05994107487554607, + "grad_norm": 5.5, + "learning_rate": 4.968860411872454e-05, + "loss": 0.2537, + "step": 590 + }, + { + "epoch": 0.0609570252971655, + "grad_norm": 3.546875, + "learning_rate": 4.967579221092666e-05, + "loss": 0.3125, + "step": 600 + }, + { + "epoch": 0.06197297571878492, + "grad_norm": 2.984375, + "learning_rate": 4.966272374541996e-05, + "loss": 0.2354, + "step": 610 + }, + { + "epoch": 0.06298892614040434, + "grad_norm": 3.6875, + "learning_rate": 4.964939885807912e-05, + "loss": 0.3213, + "step": 620 + }, + { + "epoch": 0.06400487656202378, + "grad_norm": 2.140625, + "learning_rate": 4.9635817687444876e-05, + "loss": 0.3003, + "step": 630 + }, + { + "epoch": 0.0650208269836432, + "grad_norm": 3.484375, + "learning_rate": 4.962198037472259e-05, + "loss": 0.2996, + "step": 640 + }, + { + "epoch": 0.06603677740526262, + "grad_norm": 3.21875, + "learning_rate": 4.9607887063780776e-05, + "loss": 0.2257, + "step": 650 + }, + { + "epoch": 0.06705272782688204, + "grad_norm": 5.375, + "learning_rate": 4.9593537901149564e-05, + "loss": 0.223, + "step": 660 + }, + { + "epoch": 0.06806867824850148, + "grad_norm": 4.1875, + "learning_rate": 4.957893303601924e-05, + "loss": 0.3407, + "step": 670 + }, + { + "epoch": 0.0690846286701209, + "grad_norm": 3.328125, + "learning_rate": 4.956407262023866e-05, + "loss": 0.2589, + "step": 680 + }, + { + "epoch": 0.07010057909174032, + "grad_norm": 2.953125, + "learning_rate": 4.954895680831367e-05, + "loss": 0.2949, + "step": 690 + }, + { + "epoch": 0.07111652951335974, + "grad_norm": 4.0625, + "learning_rate": 4.9533585757405506e-05, + "loss": 0.2995, + "step": 700 + }, + { + "epoch": 0.07213247993497918, + "grad_norm": 4.625, + "learning_rate": 4.951795962732917e-05, + "loss": 0.2894, + "step": 710 + }, + { + "epoch": 0.0731484303565986, + "grad_norm": 3.0, + "learning_rate": 4.9502078580551755e-05, + "loss": 0.3082, + "step": 720 + }, + { + "epoch": 0.07416438077821802, + "grad_norm": 3.65625, + "learning_rate": 4.9485942782190734e-05, + "loss": 0.2308, + "step": 730 + }, + { + "epoch": 0.07518033119983744, + "grad_norm": 4.78125, + "learning_rate": 4.9469552400012306e-05, + "loss": 0.2272, + "step": 740 + }, + { + "epoch": 0.07619628162145688, + "grad_norm": 4.25, + "learning_rate": 4.94529076044296e-05, + "loss": 0.2701, + "step": 750 + }, + { + "epoch": 0.0772122320430763, + "grad_norm": 3.140625, + "learning_rate": 4.94360085685009e-05, + "loss": 0.2686, + "step": 760 + }, + { + "epoch": 0.07822818246469572, + "grad_norm": 0.765625, + "learning_rate": 4.9418855467927894e-05, + "loss": 0.2051, + "step": 770 + }, + { + "epoch": 0.07924413288631514, + "grad_norm": 1.796875, + "learning_rate": 4.940144848105379e-05, + "loss": 0.2267, + "step": 780 + }, + { + "epoch": 0.08026008330793458, + "grad_norm": 4.5625, + "learning_rate": 4.93837877888615e-05, + "loss": 0.2597, + "step": 790 + }, + { + "epoch": 0.081276033729554, + "grad_norm": 3.03125, + "learning_rate": 4.9365873574971745e-05, + "loss": 0.3701, + "step": 800 + }, + { + "epoch": 0.08229198415117342, + "grad_norm": 4.5625, + "learning_rate": 4.9347706025641136e-05, + "loss": 0.2559, + "step": 810 + }, + { + "epoch": 0.08330793457279284, + "grad_norm": 3.90625, + "learning_rate": 4.9329285329760275e-05, + "loss": 0.2799, + "step": 820 + }, + { + "epoch": 0.08432388499441228, + "grad_norm": 3.140625, + "learning_rate": 4.9310611678851735e-05, + "loss": 0.2866, + "step": 830 + }, + { + "epoch": 0.0853398354160317, + "grad_norm": 2.46875, + "learning_rate": 4.929168526706811e-05, + "loss": 0.3105, + "step": 840 + }, + { + "epoch": 0.08635578583765112, + "grad_norm": 13.625, + "learning_rate": 4.927250629119e-05, + "loss": 0.2454, + "step": 850 + }, + { + "epoch": 0.08737173625927054, + "grad_norm": 3.921875, + "learning_rate": 4.9253074950623925e-05, + "loss": 0.2424, + "step": 860 + }, + { + "epoch": 0.08838768668088998, + "grad_norm": 2.90625, + "learning_rate": 4.9233391447400286e-05, + "loss": 0.2481, + "step": 870 + }, + { + "epoch": 0.0894036371025094, + "grad_norm": 2.96875, + "learning_rate": 4.921345598617125e-05, + "loss": 0.2231, + "step": 880 + }, + { + "epoch": 0.09041958752412882, + "grad_norm": 5.375, + "learning_rate": 4.9193268774208654e-05, + "loss": 0.3447, + "step": 890 + }, + { + "epoch": 0.09143553794574824, + "grad_norm": 2.0, + "learning_rate": 4.9172830021401785e-05, + "loss": 0.229, + "step": 900 + }, + { + "epoch": 0.09245148836736768, + "grad_norm": 3.1875, + "learning_rate": 4.9152139940255245e-05, + "loss": 0.2122, + "step": 910 + }, + { + "epoch": 0.0934674387889871, + "grad_norm": 3.40625, + "learning_rate": 4.913119874588677e-05, + "loss": 0.2386, + "step": 920 + }, + { + "epoch": 0.09448338921060652, + "grad_norm": 1.4609375, + "learning_rate": 4.911000665602489e-05, + "loss": 0.1944, + "step": 930 + }, + { + "epoch": 0.09549933963222594, + "grad_norm": 5.0625, + "learning_rate": 4.9088563891006786e-05, + "loss": 0.2038, + "step": 940 + }, + { + "epoch": 0.09651529005384538, + "grad_norm": 4.53125, + "learning_rate": 4.906687067377592e-05, + "loss": 0.3122, + "step": 950 + }, + { + "epoch": 0.0975312404754648, + "grad_norm": 2.84375, + "learning_rate": 4.904492722987976e-05, + "loss": 0.3157, + "step": 960 + }, + { + "epoch": 0.09854719089708422, + "grad_norm": 2.171875, + "learning_rate": 4.902273378746738e-05, + "loss": 0.3077, + "step": 970 + }, + { + "epoch": 0.09956314131870364, + "grad_norm": 2.84375, + "learning_rate": 4.9000290577287165e-05, + "loss": 0.2756, + "step": 980 + }, + { + "epoch": 0.10057909174032308, + "grad_norm": 0.99609375, + "learning_rate": 4.897759783268434e-05, + "loss": 0.2915, + "step": 990 + }, + { + "epoch": 0.1015950421619425, + "grad_norm": 3.53125, + "learning_rate": 4.895465578959859e-05, + "loss": 0.2052, + "step": 1000 + }, + { + "epoch": 0.10261099258356192, + "grad_norm": 4.0, + "learning_rate": 4.893146468656159e-05, + "loss": 0.2499, + "step": 1010 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 1.65625, + "learning_rate": 4.890802476469452e-05, + "loss": 0.278, + "step": 1020 + }, + { + "epoch": 0.10464289342680078, + "grad_norm": 3.625, + "learning_rate": 4.888433626770558e-05, + "loss": 0.2143, + "step": 1030 + }, + { + "epoch": 0.1056588438484202, + "grad_norm": 5.0625, + "learning_rate": 4.886039944188741e-05, + "loss": 0.2878, + "step": 1040 + }, + { + "epoch": 0.10667479427003962, + "grad_norm": 4.5, + "learning_rate": 4.883621453611461e-05, + "loss": 0.2744, + "step": 1050 + }, + { + "epoch": 0.10769074469165904, + "grad_norm": 4.5625, + "learning_rate": 4.881178180184106e-05, + "loss": 0.2734, + "step": 1060 + }, + { + "epoch": 0.10870669511327848, + "grad_norm": 3.125, + "learning_rate": 4.878710149309735e-05, + "loss": 0.3574, + "step": 1070 + }, + { + "epoch": 0.1097226455348979, + "grad_norm": 3.0625, + "learning_rate": 4.876217386648816e-05, + "loss": 0.2625, + "step": 1080 + }, + { + "epoch": 0.11073859595651732, + "grad_norm": 4.0625, + "learning_rate": 4.873699918118955e-05, + "loss": 0.2437, + "step": 1090 + }, + { + "epoch": 0.11175454637813674, + "grad_norm": 1.59375, + "learning_rate": 4.87115776989463e-05, + "loss": 0.2051, + "step": 1100 + }, + { + "epoch": 0.11277049679975618, + "grad_norm": 4.375, + "learning_rate": 4.8685909684069153e-05, + "loss": 0.1727, + "step": 1110 + }, + { + "epoch": 0.1137864472213756, + "grad_norm": 2.28125, + "learning_rate": 4.865999540343211e-05, + "loss": 0.2256, + "step": 1120 + }, + { + "epoch": 0.11480239764299502, + "grad_norm": 2.265625, + "learning_rate": 4.86338351264696e-05, + "loss": 0.3529, + "step": 1130 + }, + { + "epoch": 0.11581834806461444, + "grad_norm": 2.34375, + "learning_rate": 4.8607429125173754e-05, + "loss": 0.2113, + "step": 1140 + }, + { + "epoch": 0.11683429848623388, + "grad_norm": 0.7578125, + "learning_rate": 4.858077767409149e-05, + "loss": 0.2759, + "step": 1150 + }, + { + "epoch": 0.1178502489078533, + "grad_norm": 3.640625, + "learning_rate": 4.855388105032174e-05, + "loss": 0.2482, + "step": 1160 + }, + { + "epoch": 0.11886619932947272, + "grad_norm": 3.5, + "learning_rate": 4.852673953351249e-05, + "loss": 0.1865, + "step": 1170 + }, + { + "epoch": 0.11988214975109214, + "grad_norm": 3.75, + "learning_rate": 4.849935340585796e-05, + "loss": 0.2659, + "step": 1180 + }, + { + "epoch": 0.12089810017271158, + "grad_norm": 3.375, + "learning_rate": 4.8471722952095586e-05, + "loss": 0.1506, + "step": 1190 + }, + { + "epoch": 0.121914050594331, + "grad_norm": 3.34375, + "learning_rate": 4.844384845950312e-05, + "loss": 0.307, + "step": 1200 + }, + { + "epoch": 0.12293000101595042, + "grad_norm": 1.578125, + "learning_rate": 4.841573021789561e-05, + "loss": 0.1952, + "step": 1210 + }, + { + "epoch": 0.12394595143756984, + "grad_norm": 1.2890625, + "learning_rate": 4.838736851962239e-05, + "loss": 0.1779, + "step": 1220 + }, + { + "epoch": 0.12496190185918928, + "grad_norm": 1.265625, + "learning_rate": 4.835876365956408e-05, + "loss": 0.1235, + "step": 1230 + }, + { + "epoch": 0.12597785228080868, + "grad_norm": 1.9609375, + "learning_rate": 4.8329915935129436e-05, + "loss": 0.1876, + "step": 1240 + }, + { + "epoch": 0.12699380270242813, + "grad_norm": 1.6328125, + "learning_rate": 4.830082564625235e-05, + "loss": 0.2188, + "step": 1250 + }, + { + "epoch": 0.12800975312404755, + "grad_norm": 3.96875, + "learning_rate": 4.8271493095388684e-05, + "loss": 0.2622, + "step": 1260 + }, + { + "epoch": 0.12902570354566698, + "grad_norm": 3.765625, + "learning_rate": 4.824191858751312e-05, + "loss": 0.2724, + "step": 1270 + }, + { + "epoch": 0.1300416539672864, + "grad_norm": 5.59375, + "learning_rate": 4.821210243011601e-05, + "loss": 0.2413, + "step": 1280 + }, + { + "epoch": 0.13105760438890582, + "grad_norm": 3.34375, + "learning_rate": 4.818204493320016e-05, + "loss": 0.2618, + "step": 1290 + }, + { + "epoch": 0.13207355481052524, + "grad_norm": 2.78125, + "learning_rate": 4.8151746409277634e-05, + "loss": 0.2295, + "step": 1300 + }, + { + "epoch": 0.13308950523214466, + "grad_norm": 3.1875, + "learning_rate": 4.8121207173366484e-05, + "loss": 0.2733, + "step": 1310 + }, + { + "epoch": 0.13410545565376408, + "grad_norm": 2.28125, + "learning_rate": 4.809042754298746e-05, + "loss": 0.2311, + "step": 1320 + }, + { + "epoch": 0.13512140607538353, + "grad_norm": 2.171875, + "learning_rate": 4.805940783816075e-05, + "loss": 0.2059, + "step": 1330 + }, + { + "epoch": 0.13613735649700295, + "grad_norm": 2.796875, + "learning_rate": 4.8028148381402625e-05, + "loss": 0.2102, + "step": 1340 + }, + { + "epoch": 0.13715330691862238, + "grad_norm": 2.96875, + "learning_rate": 4.7996649497722084e-05, + "loss": 0.2708, + "step": 1350 + }, + { + "epoch": 0.1381692573402418, + "grad_norm": 2.4375, + "learning_rate": 4.7964911514617485e-05, + "loss": 0.2429, + "step": 1360 + }, + { + "epoch": 0.13918520776186122, + "grad_norm": 5.8125, + "learning_rate": 4.793293476207312e-05, + "loss": 0.2725, + "step": 1370 + }, + { + "epoch": 0.14020115818348064, + "grad_norm": 2.40625, + "learning_rate": 4.790071957255585e-05, + "loss": 0.2098, + "step": 1380 + }, + { + "epoch": 0.14121710860510006, + "grad_norm": 4.25, + "learning_rate": 4.786826628101154e-05, + "loss": 0.2101, + "step": 1390 + }, + { + "epoch": 0.14223305902671948, + "grad_norm": 2.578125, + "learning_rate": 4.783557522486167e-05, + "loss": 0.2624, + "step": 1400 + }, + { + "epoch": 0.14324900944833893, + "grad_norm": 3.125, + "learning_rate": 4.780264674399978e-05, + "loss": 0.2518, + "step": 1410 + }, + { + "epoch": 0.14426495986995835, + "grad_norm": 3.671875, + "learning_rate": 4.7769481180787966e-05, + "loss": 0.3112, + "step": 1420 + }, + { + "epoch": 0.14528091029157778, + "grad_norm": 3.984375, + "learning_rate": 4.773607888005327e-05, + "loss": 0.2747, + "step": 1430 + }, + { + "epoch": 0.1462968607131972, + "grad_norm": 3.234375, + "learning_rate": 4.770244018908416e-05, + "loss": 0.1572, + "step": 1440 + }, + { + "epoch": 0.14731281113481662, + "grad_norm": 4.09375, + "learning_rate": 4.766856545762687e-05, + "loss": 0.2148, + "step": 1450 + }, + { + "epoch": 0.14832876155643604, + "grad_norm": 1.6875, + "learning_rate": 4.763445503788178e-05, + "loss": 0.2531, + "step": 1460 + }, + { + "epoch": 0.14934471197805546, + "grad_norm": 2.375, + "learning_rate": 4.760010928449976e-05, + "loss": 0.199, + "step": 1470 + }, + { + "epoch": 0.15036066239967488, + "grad_norm": 4.6875, + "learning_rate": 4.7565528554578485e-05, + "loss": 0.2366, + "step": 1480 + }, + { + "epoch": 0.15137661282129433, + "grad_norm": 5.4375, + "learning_rate": 4.75307132076587e-05, + "loss": 0.1862, + "step": 1490 + }, + { + "epoch": 0.15239256324291375, + "grad_norm": 2.484375, + "learning_rate": 4.749566360572049e-05, + "loss": 0.2143, + "step": 1500 + }, + { + "epoch": 0.15340851366453317, + "grad_norm": 2.1875, + "learning_rate": 4.746038011317955e-05, + "loss": 0.1877, + "step": 1510 + }, + { + "epoch": 0.1544244640861526, + "grad_norm": 2.84375, + "learning_rate": 4.742486309688333e-05, + "loss": 0.2831, + "step": 1520 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 2.015625, + "learning_rate": 4.738911292610732e-05, + "loss": 0.1708, + "step": 1530 + }, + { + "epoch": 0.15645636492939144, + "grad_norm": 3.953125, + "learning_rate": 4.735312997255107e-05, + "loss": 0.192, + "step": 1540 + }, + { + "epoch": 0.15747231535101086, + "grad_norm": 2.09375, + "learning_rate": 4.7316914610334475e-05, + "loss": 0.2586, + "step": 1550 + }, + { + "epoch": 0.15848826577263028, + "grad_norm": 3.6875, + "learning_rate": 4.728046721599378e-05, + "loss": 0.2141, + "step": 1560 + }, + { + "epoch": 0.15950421619424973, + "grad_norm": 2.9375, + "learning_rate": 4.724378816847771e-05, + "loss": 0.193, + "step": 1570 + }, + { + "epoch": 0.16052016661586915, + "grad_norm": 1.5625, + "learning_rate": 4.720687784914352e-05, + "loss": 0.191, + "step": 1580 + }, + { + "epoch": 0.16153611703748857, + "grad_norm": 3.75, + "learning_rate": 4.716973664175304e-05, + "loss": 0.2172, + "step": 1590 + }, + { + "epoch": 0.162552067459108, + "grad_norm": 3.125, + "learning_rate": 4.7132364932468645e-05, + "loss": 0.2134, + "step": 1600 + }, + { + "epoch": 0.16356801788072742, + "grad_norm": 4.09375, + "learning_rate": 4.709476310984932e-05, + "loss": 0.2055, + "step": 1610 + }, + { + "epoch": 0.16458396830234684, + "grad_norm": 3.875, + "learning_rate": 4.705693156484652e-05, + "loss": 0.2136, + "step": 1620 + }, + { + "epoch": 0.16559991872396626, + "grad_norm": 1.1796875, + "learning_rate": 4.7018870690800196e-05, + "loss": 0.1471, + "step": 1630 + }, + { + "epoch": 0.16661586914558568, + "grad_norm": 2.5, + "learning_rate": 4.698058088343465e-05, + "loss": 0.2308, + "step": 1640 + }, + { + "epoch": 0.16763181956720513, + "grad_norm": 1.390625, + "learning_rate": 4.6942062540854425e-05, + "loss": 0.2456, + "step": 1650 + }, + { + "epoch": 0.16864776998882455, + "grad_norm": 3.125, + "learning_rate": 4.69033160635402e-05, + "loss": 0.2654, + "step": 1660 + }, + { + "epoch": 0.16966372041044397, + "grad_norm": 3.984375, + "learning_rate": 4.6864341854344587e-05, + "loss": 0.2226, + "step": 1670 + }, + { + "epoch": 0.1706796708320634, + "grad_norm": 2.328125, + "learning_rate": 4.682514031848795e-05, + "loss": 0.2438, + "step": 1680 + }, + { + "epoch": 0.17169562125368282, + "grad_norm": 3.078125, + "learning_rate": 4.678571186355423e-05, + "loss": 0.1889, + "step": 1690 + }, + { + "epoch": 0.17271157167530224, + "grad_norm": 3.328125, + "learning_rate": 4.6746056899486644e-05, + "loss": 0.2117, + "step": 1700 + }, + { + "epoch": 0.17372752209692166, + "grad_norm": 2.78125, + "learning_rate": 4.67061758385835e-05, + "loss": 0.1953, + "step": 1710 + }, + { + "epoch": 0.17474347251854108, + "grad_norm": 3.09375, + "learning_rate": 4.6666069095493816e-05, + "loss": 0.1844, + "step": 1720 + }, + { + "epoch": 0.17575942294016053, + "grad_norm": 3.234375, + "learning_rate": 4.662573708721309e-05, + "loss": 0.2774, + "step": 1730 + }, + { + "epoch": 0.17677537336177995, + "grad_norm": 4.03125, + "learning_rate": 4.658518023307894e-05, + "loss": 0.2527, + "step": 1740 + }, + { + "epoch": 0.17779132378339937, + "grad_norm": 3.21875, + "learning_rate": 4.654439895476671e-05, + "loss": 0.2164, + "step": 1750 + }, + { + "epoch": 0.1788072742050188, + "grad_norm": 2.390625, + "learning_rate": 4.6503393676285146e-05, + "loss": 0.2424, + "step": 1760 + }, + { + "epoch": 0.17982322462663822, + "grad_norm": 1.8359375, + "learning_rate": 4.646216482397192e-05, + "loss": 0.2428, + "step": 1770 + }, + { + "epoch": 0.18083917504825764, + "grad_norm": 2.796875, + "learning_rate": 4.6420712826489275e-05, + "loss": 0.2155, + "step": 1780 + }, + { + "epoch": 0.18185512546987706, + "grad_norm": 0.69921875, + "learning_rate": 4.6379038114819485e-05, + "loss": 0.1544, + "step": 1790 + }, + { + "epoch": 0.18287107589149648, + "grad_norm": 3.40625, + "learning_rate": 4.6337141122260444e-05, + "loss": 0.2029, + "step": 1800 + }, + { + "epoch": 0.18388702631311593, + "grad_norm": 2.359375, + "learning_rate": 4.629502228442112e-05, + "loss": 0.1489, + "step": 1810 + }, + { + "epoch": 0.18490297673473535, + "grad_norm": 1.4453125, + "learning_rate": 4.6252682039217045e-05, + "loss": 0.2101, + "step": 1820 + }, + { + "epoch": 0.18591892715635477, + "grad_norm": 2.71875, + "learning_rate": 4.621012082686573e-05, + "loss": 0.2076, + "step": 1830 + }, + { + "epoch": 0.1869348775779742, + "grad_norm": 3.0625, + "learning_rate": 4.616733908988216e-05, + "loss": 0.2719, + "step": 1840 + }, + { + "epoch": 0.18795082799959362, + "grad_norm": 1.953125, + "learning_rate": 4.612433727307409e-05, + "loss": 0.2105, + "step": 1850 + }, + { + "epoch": 0.18896677842121304, + "grad_norm": 3.46875, + "learning_rate": 4.608111582353751e-05, + "loss": 0.1877, + "step": 1860 + }, + { + "epoch": 0.18998272884283246, + "grad_norm": 2.546875, + "learning_rate": 4.603767519065197e-05, + "loss": 0.2238, + "step": 1870 + }, + { + "epoch": 0.19099867926445188, + "grad_norm": 1.5703125, + "learning_rate": 4.599401582607589e-05, + "loss": 0.243, + "step": 1880 + }, + { + "epoch": 0.19201462968607133, + "grad_norm": 2.4375, + "learning_rate": 4.595013818374185e-05, + "loss": 0.1867, + "step": 1890 + }, + { + "epoch": 0.19303058010769075, + "grad_norm": 2.203125, + "learning_rate": 4.5906042719851925e-05, + "loss": 0.1994, + "step": 1900 + }, + { + "epoch": 0.19404653052931017, + "grad_norm": 3.984375, + "learning_rate": 4.586172989287291e-05, + "loss": 0.1899, + "step": 1910 + }, + { + "epoch": 0.1950624809509296, + "grad_norm": 2.6875, + "learning_rate": 4.5817200163531534e-05, + "loss": 0.2528, + "step": 1920 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 2.71875, + "learning_rate": 4.577245399480972e-05, + "loss": 0.2336, + "step": 1930 + }, + { + "epoch": 0.19709438179416844, + "grad_norm": 2.640625, + "learning_rate": 4.5727491851939715e-05, + "loss": 0.2204, + "step": 1940 + }, + { + "epoch": 0.19811033221578786, + "grad_norm": 1.78125, + "learning_rate": 4.568231420239929e-05, + "loss": 0.1656, + "step": 1950 + }, + { + "epoch": 0.19912628263740728, + "grad_norm": 3.15625, + "learning_rate": 4.563692151590687e-05, + "loss": 0.2105, + "step": 1960 + }, + { + "epoch": 0.20014223305902673, + "grad_norm": 1.3671875, + "learning_rate": 4.5591314264416666e-05, + "loss": 0.1464, + "step": 1970 + }, + { + "epoch": 0.20115818348064615, + "grad_norm": 4.25, + "learning_rate": 4.554549292211371e-05, + "loss": 0.2103, + "step": 1980 + }, + { + "epoch": 0.20217413390226557, + "grad_norm": 2.984375, + "learning_rate": 4.549945796540901e-05, + "loss": 0.144, + "step": 1990 + }, + { + "epoch": 0.203190084323885, + "grad_norm": 1.859375, + "learning_rate": 4.545320987293453e-05, + "loss": 0.1963, + "step": 2000 + }, + { + "epoch": 0.20420603474550442, + "grad_norm": 1.078125, + "learning_rate": 4.540674912553824e-05, + "loss": 0.2115, + "step": 2010 + }, + { + "epoch": 0.20522198516712384, + "grad_norm": 4.25, + "learning_rate": 4.536007620627911e-05, + "loss": 0.1682, + "step": 2020 + }, + { + "epoch": 0.20623793558874326, + "grad_norm": 2.71875, + "learning_rate": 4.531319160042212e-05, + "loss": 0.1992, + "step": 2030 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 1.2890625, + "learning_rate": 4.5266095795433126e-05, + "loss": 0.1134, + "step": 2040 + }, + { + "epoch": 0.20826983643198213, + "grad_norm": 3.296875, + "learning_rate": 4.5218789280973925e-05, + "loss": 0.1474, + "step": 2050 + }, + { + "epoch": 0.20928578685360155, + "grad_norm": 1.9375, + "learning_rate": 4.5171272548897024e-05, + "loss": 0.1955, + "step": 2060 + }, + { + "epoch": 0.21030173727522097, + "grad_norm": 2.734375, + "learning_rate": 4.512354609324063e-05, + "loss": 0.2042, + "step": 2070 + }, + { + "epoch": 0.2113176876968404, + "grad_norm": 2.921875, + "learning_rate": 4.507561041022347e-05, + "loss": 0.2174, + "step": 2080 + }, + { + "epoch": 0.21233363811845982, + "grad_norm": 2.40625, + "learning_rate": 4.502746599823963e-05, + "loss": 0.2634, + "step": 2090 + }, + { + "epoch": 0.21334958854007924, + "grad_norm": 1.71875, + "learning_rate": 4.497911335785339e-05, + "loss": 0.1884, + "step": 2100 + }, + { + "epoch": 0.21436553896169866, + "grad_norm": 0.79296875, + "learning_rate": 4.4930552991794e-05, + "loss": 0.1872, + "step": 2110 + }, + { + "epoch": 0.21538148938331808, + "grad_norm": 3.171875, + "learning_rate": 4.4881785404950474e-05, + "loss": 0.2233, + "step": 2120 + }, + { + "epoch": 0.21639743980493753, + "grad_norm": 2.59375, + "learning_rate": 4.483281110436631e-05, + "loss": 0.2374, + "step": 2130 + }, + { + "epoch": 0.21741339022655695, + "grad_norm": 3.328125, + "learning_rate": 4.478363059923426e-05, + "loss": 0.2545, + "step": 2140 + }, + { + "epoch": 0.21842934064817637, + "grad_norm": 2.3125, + "learning_rate": 4.4734244400891014e-05, + "loss": 0.2063, + "step": 2150 + }, + { + "epoch": 0.2194452910697958, + "grad_norm": 3.40625, + "learning_rate": 4.4684653022811865e-05, + "loss": 0.1219, + "step": 2160 + }, + { + "epoch": 0.22046124149141522, + "grad_norm": 4.1875, + "learning_rate": 4.463485698060541e-05, + "loss": 0.2805, + "step": 2170 + }, + { + "epoch": 0.22147719191303464, + "grad_norm": 2.3125, + "learning_rate": 4.458485679200814e-05, + "loss": 0.1998, + "step": 2180 + }, + { + "epoch": 0.22249314233465406, + "grad_norm": 3.578125, + "learning_rate": 4.453465297687912e-05, + "loss": 0.2489, + "step": 2190 + }, + { + "epoch": 0.22350909275627348, + "grad_norm": 2.59375, + "learning_rate": 4.448424605719452e-05, + "loss": 0.2731, + "step": 2200 + }, + { + "epoch": 0.22452504317789293, + "grad_norm": 3.28125, + "learning_rate": 4.443363655704224e-05, + "loss": 0.2425, + "step": 2210 + }, + { + "epoch": 0.22554099359951235, + "grad_norm": 2.78125, + "learning_rate": 4.438282500261641e-05, + "loss": 0.2938, + "step": 2220 + }, + { + "epoch": 0.22655694402113177, + "grad_norm": 1.1953125, + "learning_rate": 4.433181192221197e-05, + "loss": 0.1728, + "step": 2230 + }, + { + "epoch": 0.2275728944427512, + "grad_norm": 1.34375, + "learning_rate": 4.4280597846219155e-05, + "loss": 0.216, + "step": 2240 + }, + { + "epoch": 0.22858884486437062, + "grad_norm": 1.8515625, + "learning_rate": 4.422918330711796e-05, + "loss": 0.1612, + "step": 2250 + }, + { + "epoch": 0.22960479528599004, + "grad_norm": 1.90625, + "learning_rate": 4.417756883947263e-05, + "loss": 0.107, + "step": 2260 + }, + { + "epoch": 0.23062074570760946, + "grad_norm": 3.375, + "learning_rate": 4.412575497992611e-05, + "loss": 0.1756, + "step": 2270 + }, + { + "epoch": 0.23163669612922888, + "grad_norm": 4.375, + "learning_rate": 4.407374226719445e-05, + "loss": 0.234, + "step": 2280 + }, + { + "epoch": 0.23265264655084833, + "grad_norm": 3.25, + "learning_rate": 4.402153124206119e-05, + "loss": 0.2144, + "step": 2290 + }, + { + "epoch": 0.23366859697246775, + "grad_norm": 1.703125, + "learning_rate": 4.396912244737173e-05, + "loss": 0.1696, + "step": 2300 + }, + { + "epoch": 0.23468454739408717, + "grad_norm": 2.84375, + "learning_rate": 4.391651642802778e-05, + "loss": 0.2506, + "step": 2310 + }, + { + "epoch": 0.2357004978157066, + "grad_norm": 4.5, + "learning_rate": 4.386371373098155e-05, + "loss": 0.1686, + "step": 2320 + }, + { + "epoch": 0.23671644823732602, + "grad_norm": 2.515625, + "learning_rate": 4.381071490523018e-05, + "loss": 0.2403, + "step": 2330 + }, + { + "epoch": 0.23773239865894544, + "grad_norm": 4.4375, + "learning_rate": 4.3757520501809955e-05, + "loss": 0.1611, + "step": 2340 + }, + { + "epoch": 0.23874834908056486, + "grad_norm": 1.609375, + "learning_rate": 4.370413107379065e-05, + "loss": 0.1698, + "step": 2350 + }, + { + "epoch": 0.23976429950218428, + "grad_norm": 4.96875, + "learning_rate": 4.36505471762697e-05, + "loss": 0.1928, + "step": 2360 + }, + { + "epoch": 0.24078024992380373, + "grad_norm": 0.8984375, + "learning_rate": 4.3596769366366474e-05, + "loss": 0.2035, + "step": 2370 + }, + { + "epoch": 0.24179620034542315, + "grad_norm": 5.75, + "learning_rate": 4.354279820321649e-05, + "loss": 0.16, + "step": 2380 + }, + { + "epoch": 0.24281215076704257, + "grad_norm": 1.9453125, + "learning_rate": 4.34886342479656e-05, + "loss": 0.1851, + "step": 2390 + }, + { + "epoch": 0.243828101188662, + "grad_norm": 1.015625, + "learning_rate": 4.34342780637641e-05, + "loss": 0.1726, + "step": 2400 + }, + { + "epoch": 0.24484405161028142, + "grad_norm": 4.59375, + "learning_rate": 4.337973021576095e-05, + "loss": 0.2847, + "step": 2410 + }, + { + "epoch": 0.24586000203190084, + "grad_norm": 1.03125, + "learning_rate": 4.3324991271097846e-05, + "loss": 0.2528, + "step": 2420 + }, + { + "epoch": 0.24687595245352026, + "grad_norm": 2.1875, + "learning_rate": 4.3270061798903374e-05, + "loss": 0.1573, + "step": 2430 + }, + { + "epoch": 0.24789190287513968, + "grad_norm": 0.98046875, + "learning_rate": 4.321494237028701e-05, + "loss": 0.1703, + "step": 2440 + }, + { + "epoch": 0.24890785329675913, + "grad_norm": 3.8125, + "learning_rate": 4.31596335583333e-05, + "loss": 0.2613, + "step": 2450 + }, + { + "epoch": 0.24992380371837855, + "grad_norm": 4.0625, + "learning_rate": 4.310413593809579e-05, + "loss": 0.22, + "step": 2460 + }, + { + "epoch": 0.250939754139998, + "grad_norm": 3.15625, + "learning_rate": 4.304845008659108e-05, + "loss": 0.1263, + "step": 2470 + }, + { + "epoch": 0.25195570456161737, + "grad_norm": 3.046875, + "learning_rate": 4.2992576582792895e-05, + "loss": 0.1639, + "step": 2480 + }, + { + "epoch": 0.2529716549832368, + "grad_norm": 9.8125, + "learning_rate": 4.293651600762595e-05, + "loss": 0.2681, + "step": 2490 + }, + { + "epoch": 0.25398760540485626, + "grad_norm": 3.734375, + "learning_rate": 4.288026894395999e-05, + "loss": 0.2292, + "step": 2500 + }, + { + "epoch": 0.25500355582647566, + "grad_norm": 0.455078125, + "learning_rate": 4.2823835976603723e-05, + "loss": 0.2324, + "step": 2510 + }, + { + "epoch": 0.2560195062480951, + "grad_norm": 5.625, + "learning_rate": 4.276721769229869e-05, + "loss": 0.1834, + "step": 2520 + }, + { + "epoch": 0.2570354566697145, + "grad_norm": 1.3671875, + "learning_rate": 4.271041467971323e-05, + "loss": 0.1826, + "step": 2530 + }, + { + "epoch": 0.25805140709133395, + "grad_norm": 5.0625, + "learning_rate": 4.265342752943632e-05, + "loss": 0.2463, + "step": 2540 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 2.859375, + "learning_rate": 4.2596256833971425e-05, + "loss": 0.2598, + "step": 2550 + }, + { + "epoch": 0.2600833079345728, + "grad_norm": 1.8515625, + "learning_rate": 4.2538903187730374e-05, + "loss": 0.1148, + "step": 2560 + }, + { + "epoch": 0.26109925835619224, + "grad_norm": 2.71875, + "learning_rate": 4.248136718702716e-05, + "loss": 0.2123, + "step": 2570 + }, + { + "epoch": 0.26211520877781164, + "grad_norm": 4.5625, + "learning_rate": 4.242364943007172e-05, + "loss": 0.2369, + "step": 2580 + }, + { + "epoch": 0.2631311591994311, + "grad_norm": 2.296875, + "learning_rate": 4.236575051696377e-05, + "loss": 0.261, + "step": 2590 + }, + { + "epoch": 0.2641471096210505, + "grad_norm": 2.75, + "learning_rate": 4.2307671049686514e-05, + "loss": 0.1564, + "step": 2600 + }, + { + "epoch": 0.26516306004266993, + "grad_norm": 3.5, + "learning_rate": 4.2249411632100396e-05, + "loss": 0.1563, + "step": 2610 + }, + { + "epoch": 0.2661790104642893, + "grad_norm": 2.84375, + "learning_rate": 4.219097286993684e-05, + "loss": 0.1697, + "step": 2620 + }, + { + "epoch": 0.26719496088590877, + "grad_norm": 2.125, + "learning_rate": 4.2132355370791946e-05, + "loss": 0.1844, + "step": 2630 + }, + { + "epoch": 0.26821091130752817, + "grad_norm": 4.03125, + "learning_rate": 4.2073559744120156e-05, + "loss": 0.2144, + "step": 2640 + }, + { + "epoch": 0.2692268617291476, + "grad_norm": 2.375, + "learning_rate": 4.201458660122793e-05, + "loss": 0.2013, + "step": 2650 + }, + { + "epoch": 0.27024281215076706, + "grad_norm": 3.625, + "learning_rate": 4.1955436555267393e-05, + "loss": 0.2166, + "step": 2660 + }, + { + "epoch": 0.27125876257238646, + "grad_norm": 0.328125, + "learning_rate": 4.189611022122997e-05, + "loss": 0.1934, + "step": 2670 + }, + { + "epoch": 0.2722747129940059, + "grad_norm": 2.75, + "learning_rate": 4.1836608215939944e-05, + "loss": 0.2157, + "step": 2680 + }, + { + "epoch": 0.2732906634156253, + "grad_norm": 3.5, + "learning_rate": 4.17769311580481e-05, + "loss": 0.18, + "step": 2690 + }, + { + "epoch": 0.27430661383724475, + "grad_norm": 2.109375, + "learning_rate": 4.171707966802528e-05, + "loss": 0.2178, + "step": 2700 + }, + { + "epoch": 0.27532256425886414, + "grad_norm": 4.65625, + "learning_rate": 4.16570543681559e-05, + "loss": 0.1896, + "step": 2710 + }, + { + "epoch": 0.2763385146804836, + "grad_norm": 4.8125, + "learning_rate": 4.159685588253151e-05, + "loss": 0.1322, + "step": 2720 + }, + { + "epoch": 0.27735446510210304, + "grad_norm": 3.9375, + "learning_rate": 4.153648483704429e-05, + "loss": 0.184, + "step": 2730 + }, + { + "epoch": 0.27837041552372244, + "grad_norm": 4.53125, + "learning_rate": 4.147594185938057e-05, + "loss": 0.2451, + "step": 2740 + }, + { + "epoch": 0.2793863659453419, + "grad_norm": 1.0390625, + "learning_rate": 4.141522757901426e-05, + "loss": 0.2367, + "step": 2750 + }, + { + "epoch": 0.2804023163669613, + "grad_norm": 3.375, + "learning_rate": 4.1354342627200345e-05, + "loss": 0.179, + "step": 2760 + }, + { + "epoch": 0.28141826678858073, + "grad_norm": 2.953125, + "learning_rate": 4.1293287636968286e-05, + "loss": 0.1396, + "step": 2770 + }, + { + "epoch": 0.2824342172102001, + "grad_norm": 2.546875, + "learning_rate": 4.1232063243115485e-05, + "loss": 0.1963, + "step": 2780 + }, + { + "epoch": 0.28345016763181957, + "grad_norm": 5.09375, + "learning_rate": 4.117067008220063e-05, + "loss": 0.2457, + "step": 2790 + }, + { + "epoch": 0.28446611805343897, + "grad_norm": 2.046875, + "learning_rate": 4.110910879253712e-05, + "loss": 0.2262, + "step": 2800 + }, + { + "epoch": 0.2854820684750584, + "grad_norm": 2.1875, + "learning_rate": 4.104738001418641e-05, + "loss": 0.2499, + "step": 2810 + }, + { + "epoch": 0.28649801889667786, + "grad_norm": 2.59375, + "learning_rate": 4.098548438895135e-05, + "loss": 0.1667, + "step": 2820 + }, + { + "epoch": 0.28751396931829726, + "grad_norm": 2.875, + "learning_rate": 4.092342256036954e-05, + "loss": 0.2288, + "step": 2830 + }, + { + "epoch": 0.2885299197399167, + "grad_norm": 3.015625, + "learning_rate": 4.086119517370659e-05, + "loss": 0.2038, + "step": 2840 + }, + { + "epoch": 0.2895458701615361, + "grad_norm": 3.53125, + "learning_rate": 4.0798802875949485e-05, + "loss": 0.181, + "step": 2850 + }, + { + "epoch": 0.29056182058315555, + "grad_norm": 2.296875, + "learning_rate": 4.073624631579975e-05, + "loss": 0.1886, + "step": 2860 + }, + { + "epoch": 0.29157777100477494, + "grad_norm": 3.609375, + "learning_rate": 4.067352614366685e-05, + "loss": 0.2053, + "step": 2870 + }, + { + "epoch": 0.2925937214263944, + "grad_norm": 2.328125, + "learning_rate": 4.061064301166128e-05, + "loss": 0.1409, + "step": 2880 + }, + { + "epoch": 0.29360967184801384, + "grad_norm": 4.9375, + "learning_rate": 4.054759757358787e-05, + "loss": 0.184, + "step": 2890 + }, + { + "epoch": 0.29462562226963324, + "grad_norm": 4.6875, + "learning_rate": 4.048439048493898e-05, + "loss": 0.2306, + "step": 2900 + }, + { + "epoch": 0.2956415726912527, + "grad_norm": 4.09375, + "learning_rate": 4.0421022402887676e-05, + "loss": 0.1914, + "step": 2910 + }, + { + "epoch": 0.2966575231128721, + "grad_norm": 2.3125, + "learning_rate": 4.035749398628088e-05, + "loss": 0.1653, + "step": 2920 + }, + { + "epoch": 0.29767347353449153, + "grad_norm": 2.515625, + "learning_rate": 4.029380589563256e-05, + "loss": 0.1941, + "step": 2930 + }, + { + "epoch": 0.2986894239561109, + "grad_norm": 1.78125, + "learning_rate": 4.02299587931168e-05, + "loss": 0.1117, + "step": 2940 + }, + { + "epoch": 0.29970537437773037, + "grad_norm": 0.8359375, + "learning_rate": 4.0165953342560974e-05, + "loss": 0.1605, + "step": 2950 + }, + { + "epoch": 0.30072132479934977, + "grad_norm": 3.046875, + "learning_rate": 4.010179020943884e-05, + "loss": 0.1726, + "step": 2960 + }, + { + "epoch": 0.3017372752209692, + "grad_norm": 3.453125, + "learning_rate": 4.003747006086357e-05, + "loss": 0.2208, + "step": 2970 + }, + { + "epoch": 0.30275322564258866, + "grad_norm": 2.515625, + "learning_rate": 3.9972993565580866e-05, + "loss": 0.1325, + "step": 2980 + }, + { + "epoch": 0.30376917606420806, + "grad_norm": 3.046875, + "learning_rate": 3.9908361393962e-05, + "loss": 0.2014, + "step": 2990 + }, + { + "epoch": 0.3047851264858275, + "grad_norm": 2.28125, + "learning_rate": 3.984357421799681e-05, + "loss": 0.165, + "step": 3000 + }, + { + "epoch": 0.3058010769074469, + "grad_norm": 5.09375, + "learning_rate": 3.9778632711286756e-05, + "loss": 0.212, + "step": 3010 + }, + { + "epoch": 0.30681702732906635, + "grad_norm": 4.25, + "learning_rate": 3.971353754903788e-05, + "loss": 0.2388, + "step": 3020 + }, + { + "epoch": 0.30783297775068574, + "grad_norm": 2.34375, + "learning_rate": 3.964828940805381e-05, + "loss": 0.2175, + "step": 3030 + }, + { + "epoch": 0.3088489281723052, + "grad_norm": 4.09375, + "learning_rate": 3.95828889667287e-05, + "loss": 0.2088, + "step": 3040 + }, + { + "epoch": 0.30986487859392464, + "grad_norm": 2.359375, + "learning_rate": 3.9517336905040244e-05, + "loss": 0.1913, + "step": 3050 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 1.1640625, + "learning_rate": 3.9451633904542483e-05, + "loss": 0.2185, + "step": 3060 + }, + { + "epoch": 0.3118967794371635, + "grad_norm": 2.59375, + "learning_rate": 3.9385780648358846e-05, + "loss": 0.2072, + "step": 3070 + }, + { + "epoch": 0.3129127298587829, + "grad_norm": 3.015625, + "learning_rate": 3.9319777821174955e-05, + "loss": 0.1902, + "step": 3080 + }, + { + "epoch": 0.31392868028040233, + "grad_norm": 2.375, + "learning_rate": 3.925362610923158e-05, + "loss": 0.259, + "step": 3090 + }, + { + "epoch": 0.3149446307020217, + "grad_norm": 4.65625, + "learning_rate": 3.918732620031742e-05, + "loss": 0.2026, + "step": 3100 + }, + { + "epoch": 0.31596058112364117, + "grad_norm": 2.1875, + "learning_rate": 3.912087878376205e-05, + "loss": 0.1478, + "step": 3110 + }, + { + "epoch": 0.31697653154526056, + "grad_norm": 2.34375, + "learning_rate": 3.905428455042865e-05, + "loss": 0.167, + "step": 3120 + }, + { + "epoch": 0.31799248196688, + "grad_norm": 2.390625, + "learning_rate": 3.898754419270693e-05, + "loss": 0.1629, + "step": 3130 + }, + { + "epoch": 0.31900843238849946, + "grad_norm": 1.546875, + "learning_rate": 3.892065840450583e-05, + "loss": 0.1308, + "step": 3140 + }, + { + "epoch": 0.32002438281011886, + "grad_norm": 4.625, + "learning_rate": 3.885362788124637e-05, + "loss": 0.2008, + "step": 3150 + }, + { + "epoch": 0.3210403332317383, + "grad_norm": 3.8125, + "learning_rate": 3.8786453319854396e-05, + "loss": 0.2225, + "step": 3160 + }, + { + "epoch": 0.3220562836533577, + "grad_norm": 3.015625, + "learning_rate": 3.8719135418753366e-05, + "loss": 0.2243, + "step": 3170 + }, + { + "epoch": 0.32307223407497715, + "grad_norm": 5.6875, + "learning_rate": 3.865167487785702e-05, + "loss": 0.1981, + "step": 3180 + }, + { + "epoch": 0.32408818449659654, + "grad_norm": 4.84375, + "learning_rate": 3.8584072398562164e-05, + "loss": 0.2031, + "step": 3190 + }, + { + "epoch": 0.325104134918216, + "grad_norm": 4.0625, + "learning_rate": 3.851632868374136e-05, + "loss": 0.1621, + "step": 3200 + }, + { + "epoch": 0.32612008533983544, + "grad_norm": 3.421875, + "learning_rate": 3.844844443773562e-05, + "loss": 0.1674, + "step": 3210 + }, + { + "epoch": 0.32713603576145484, + "grad_norm": 1.3671875, + "learning_rate": 3.8380420366347046e-05, + "loss": 0.1502, + "step": 3220 + }, + { + "epoch": 0.3281519861830743, + "grad_norm": 3.734375, + "learning_rate": 3.831225717683157e-05, + "loss": 0.1868, + "step": 3230 + }, + { + "epoch": 0.3291679366046937, + "grad_norm": 2.703125, + "learning_rate": 3.8243955577891534e-05, + "loss": 0.1818, + "step": 3240 + }, + { + "epoch": 0.3301838870263131, + "grad_norm": 3.796875, + "learning_rate": 3.8175516279668335e-05, + "loss": 0.2215, + "step": 3250 + }, + { + "epoch": 0.3311998374479325, + "grad_norm": 3.203125, + "learning_rate": 3.810693999373505e-05, + "loss": 0.2544, + "step": 3260 + }, + { + "epoch": 0.33221578786955197, + "grad_norm": 4.0, + "learning_rate": 3.8038227433089056e-05, + "loss": 0.1175, + "step": 3270 + }, + { + "epoch": 0.33323173829117136, + "grad_norm": 3.625, + "learning_rate": 3.796937931214458e-05, + "loss": 0.2213, + "step": 3280 + }, + { + "epoch": 0.3342476887127908, + "grad_norm": 1.7265625, + "learning_rate": 3.7900396346725296e-05, + "loss": 0.1711, + "step": 3290 + }, + { + "epoch": 0.33526363913441026, + "grad_norm": 3.140625, + "learning_rate": 3.783127925405686e-05, + "loss": 0.2628, + "step": 3300 + }, + { + "epoch": 0.33627958955602966, + "grad_norm": 2.1875, + "learning_rate": 3.77620287527595e-05, + "loss": 0.1671, + "step": 3310 + }, + { + "epoch": 0.3372955399776491, + "grad_norm": 5.28125, + "learning_rate": 3.769264556284048e-05, + "loss": 0.2109, + "step": 3320 + }, + { + "epoch": 0.3383114903992685, + "grad_norm": 2.875, + "learning_rate": 3.762313040568665e-05, + "loss": 0.1978, + "step": 3330 + }, + { + "epoch": 0.33932744082088795, + "grad_norm": 2.234375, + "learning_rate": 3.755348400405697e-05, + "loss": 0.1275, + "step": 3340 + }, + { + "epoch": 0.34034339124250734, + "grad_norm": 1.9453125, + "learning_rate": 3.7483707082074945e-05, + "loss": 0.1482, + "step": 3350 + }, + { + "epoch": 0.3413593416641268, + "grad_norm": 5.40625, + "learning_rate": 3.741380036522111e-05, + "loss": 0.1933, + "step": 3360 + }, + { + "epoch": 0.34237529208574624, + "grad_norm": 4.53125, + "learning_rate": 3.734376458032551e-05, + "loss": 0.1925, + "step": 3370 + }, + { + "epoch": 0.34339124250736563, + "grad_norm": 4.0625, + "learning_rate": 3.727360045556014e-05, + "loss": 0.2297, + "step": 3380 + }, + { + "epoch": 0.3444071929289851, + "grad_norm": 2.53125, + "learning_rate": 3.7203308720431336e-05, + "loss": 0.1704, + "step": 3390 + }, + { + "epoch": 0.3454231433506045, + "grad_norm": 1.859375, + "learning_rate": 3.7132890105772234e-05, + "loss": 0.258, + "step": 3400 + }, + { + "epoch": 0.3464390937722239, + "grad_norm": 3.90625, + "learning_rate": 3.706234534373515e-05, + "loss": 0.2376, + "step": 3410 + }, + { + "epoch": 0.3474550441938433, + "grad_norm": 1.1015625, + "learning_rate": 3.6991675167783985e-05, + "loss": 0.2403, + "step": 3420 + }, + { + "epoch": 0.34847099461546277, + "grad_norm": 1.1640625, + "learning_rate": 3.6920880312686556e-05, + "loss": 0.1642, + "step": 3430 + }, + { + "epoch": 0.34948694503708216, + "grad_norm": 2.875, + "learning_rate": 3.684996151450702e-05, + "loss": 0.1455, + "step": 3440 + }, + { + "epoch": 0.3505028954587016, + "grad_norm": 0.59765625, + "learning_rate": 3.6778919510598155e-05, + "loss": 0.2175, + "step": 3450 + }, + { + "epoch": 0.35151884588032106, + "grad_norm": 0.93359375, + "learning_rate": 3.670775503959376e-05, + "loss": 0.1858, + "step": 3460 + }, + { + "epoch": 0.35253479630194046, + "grad_norm": 4.1875, + "learning_rate": 3.6636468841400917e-05, + "loss": 0.1911, + "step": 3470 + }, + { + "epoch": 0.3535507467235599, + "grad_norm": 3.734375, + "learning_rate": 3.656506165719233e-05, + "loss": 0.2114, + "step": 3480 + }, + { + "epoch": 0.3545666971451793, + "grad_norm": 1.171875, + "learning_rate": 3.649353422939863e-05, + "loss": 0.1841, + "step": 3490 + }, + { + "epoch": 0.35558264756679875, + "grad_norm": 2.53125, + "learning_rate": 3.6421887301700615e-05, + "loss": 0.1505, + "step": 3500 + }, + { + "epoch": 0.35659859798841814, + "grad_norm": 4.9375, + "learning_rate": 3.6350121619021524e-05, + "loss": 0.2625, + "step": 3510 + }, + { + "epoch": 0.3576145484100376, + "grad_norm": 5.25, + "learning_rate": 3.627823792751936e-05, + "loss": 0.1676, + "step": 3520 + }, + { + "epoch": 0.35863049883165704, + "grad_norm": 1.09375, + "learning_rate": 3.620623697457905e-05, + "loss": 0.1963, + "step": 3530 + }, + { + "epoch": 0.35964644925327643, + "grad_norm": 4.03125, + "learning_rate": 3.613411950880468e-05, + "loss": 0.2048, + "step": 3540 + }, + { + "epoch": 0.3606623996748959, + "grad_norm": 4.40625, + "learning_rate": 3.606188628001178e-05, + "loss": 0.226, + "step": 3550 + }, + { + "epoch": 0.3616783500965153, + "grad_norm": 2.375, + "learning_rate": 3.598953803921947e-05, + "loss": 0.1884, + "step": 3560 + }, + { + "epoch": 0.3626943005181347, + "grad_norm": 3.21875, + "learning_rate": 3.591707553864266e-05, + "loss": 0.224, + "step": 3570 + }, + { + "epoch": 0.3637102509397541, + "grad_norm": 3.5625, + "learning_rate": 3.584449953168423e-05, + "loss": 0.1866, + "step": 3580 + }, + { + "epoch": 0.36472620136137357, + "grad_norm": 2.359375, + "learning_rate": 3.577181077292722e-05, + "loss": 0.1663, + "step": 3590 + }, + { + "epoch": 0.36574215178299296, + "grad_norm": 5.0, + "learning_rate": 3.569901001812696e-05, + "loss": 0.2032, + "step": 3600 + }, + { + "epoch": 0.3667581022046124, + "grad_norm": 1.953125, + "learning_rate": 3.562609802420321e-05, + "loss": 0.2395, + "step": 3610 + }, + { + "epoch": 0.36777405262623186, + "grad_norm": 3.796875, + "learning_rate": 3.555307554923229e-05, + "loss": 0.1799, + "step": 3620 + }, + { + "epoch": 0.36879000304785126, + "grad_norm": 4.4375, + "learning_rate": 3.547994335243925e-05, + "loss": 0.1771, + "step": 3630 + }, + { + "epoch": 0.3698059534694707, + "grad_norm": 1.890625, + "learning_rate": 3.540670219418989e-05, + "loss": 0.2123, + "step": 3640 + }, + { + "epoch": 0.3708219038910901, + "grad_norm": 4.03125, + "learning_rate": 3.53333528359829e-05, + "loss": 0.2159, + "step": 3650 + }, + { + "epoch": 0.37183785431270955, + "grad_norm": 3.265625, + "learning_rate": 3.525989604044198e-05, + "loss": 0.2749, + "step": 3660 + }, + { + "epoch": 0.37285380473432894, + "grad_norm": 1.4375, + "learning_rate": 3.5186332571307826e-05, + "loss": 0.1613, + "step": 3670 + }, + { + "epoch": 0.3738697551559484, + "grad_norm": 3.984375, + "learning_rate": 3.511266319343025e-05, + "loss": 0.1877, + "step": 3680 + }, + { + "epoch": 0.37488570557756784, + "grad_norm": 2.203125, + "learning_rate": 3.503888867276022e-05, + "loss": 0.2185, + "step": 3690 + }, + { + "epoch": 0.37590165599918723, + "grad_norm": 1.5078125, + "learning_rate": 3.4965009776341894e-05, + "loss": 0.2195, + "step": 3700 + }, + { + "epoch": 0.3769176064208067, + "grad_norm": 4.375, + "learning_rate": 3.489102727230461e-05, + "loss": 0.2344, + "step": 3710 + }, + { + "epoch": 0.3779335568424261, + "grad_norm": 2.984375, + "learning_rate": 3.481694192985496e-05, + "loss": 0.1863, + "step": 3720 + }, + { + "epoch": 0.3789495072640455, + "grad_norm": 1.1328125, + "learning_rate": 3.474275451926875e-05, + "loss": 0.1894, + "step": 3730 + }, + { + "epoch": 0.3799654576856649, + "grad_norm": 2.265625, + "learning_rate": 3.4668465811883e-05, + "loss": 0.2127, + "step": 3740 + }, + { + "epoch": 0.38098140810728437, + "grad_norm": 2.921875, + "learning_rate": 3.4594076580087914e-05, + "loss": 0.2125, + "step": 3750 + }, + { + "epoch": 0.38199735852890376, + "grad_norm": 2.390625, + "learning_rate": 3.451958759731889e-05, + "loss": 0.1801, + "step": 3760 + }, + { + "epoch": 0.3830133089505232, + "grad_norm": 3.046875, + "learning_rate": 3.4444999638048456e-05, + "loss": 0.1949, + "step": 3770 + }, + { + "epoch": 0.38402925937214266, + "grad_norm": 2.890625, + "learning_rate": 3.437031347777817e-05, + "loss": 0.2719, + "step": 3780 + }, + { + "epoch": 0.38504520979376206, + "grad_norm": 3.9375, + "learning_rate": 3.4295529893030634e-05, + "loss": 0.1697, + "step": 3790 + }, + { + "epoch": 0.3860611602153815, + "grad_norm": 2.0625, + "learning_rate": 3.422064966134138e-05, + "loss": 0.1557, + "step": 3800 + }, + { + "epoch": 0.3870771106370009, + "grad_norm": 2.234375, + "learning_rate": 3.4145673561250794e-05, + "loss": 0.2129, + "step": 3810 + }, + { + "epoch": 0.38809306105862035, + "grad_norm": 4.96875, + "learning_rate": 3.4070602372296e-05, + "loss": 0.2068, + "step": 3820 + }, + { + "epoch": 0.38910901148023974, + "grad_norm": 2.234375, + "learning_rate": 3.39954368750028e-05, + "loss": 0.1634, + "step": 3830 + }, + { + "epoch": 0.3901249619018592, + "grad_norm": 1.75, + "learning_rate": 3.392017785087752e-05, + "loss": 0.2299, + "step": 3840 + }, + { + "epoch": 0.39114091232347864, + "grad_norm": 3.90625, + "learning_rate": 3.38448260823989e-05, + "loss": 0.1585, + "step": 3850 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 2.8125, + "learning_rate": 3.376938235300996e-05, + "loss": 0.2382, + "step": 3860 + }, + { + "epoch": 0.3931728131667175, + "grad_norm": 5.375, + "learning_rate": 3.369384744710984e-05, + "loss": 0.1987, + "step": 3870 + }, + { + "epoch": 0.3941887635883369, + "grad_norm": 2.578125, + "learning_rate": 3.361822215004566e-05, + "loss": 0.2316, + "step": 3880 + }, + { + "epoch": 0.3952047140099563, + "grad_norm": 2.0, + "learning_rate": 3.354250724810436e-05, + "loss": 0.2019, + "step": 3890 + }, + { + "epoch": 0.3962206644315757, + "grad_norm": 2.3125, + "learning_rate": 3.34667035285045e-05, + "loss": 0.187, + "step": 3900 + }, + { + "epoch": 0.39723661485319517, + "grad_norm": 3.53125, + "learning_rate": 3.339081177938811e-05, + "loss": 0.2353, + "step": 3910 + }, + { + "epoch": 0.39825256527481456, + "grad_norm": 1.9609375, + "learning_rate": 3.331483278981244e-05, + "loss": 0.2078, + "step": 3920 + }, + { + "epoch": 0.399268515696434, + "grad_norm": 1.2109375, + "learning_rate": 3.323876734974183e-05, + "loss": 0.1761, + "step": 3930 + }, + { + "epoch": 0.40028446611805346, + "grad_norm": 4.0625, + "learning_rate": 3.316261625003943e-05, + "loss": 0.2081, + "step": 3940 + }, + { + "epoch": 0.40130041653967286, + "grad_norm": 1.953125, + "learning_rate": 3.308638028245902e-05, + "loss": 0.2087, + "step": 3950 + }, + { + "epoch": 0.4023163669612923, + "grad_norm": 2.390625, + "learning_rate": 3.301006023963676e-05, + "loss": 0.1579, + "step": 3960 + }, + { + "epoch": 0.4033323173829117, + "grad_norm": 3.53125, + "learning_rate": 3.293365691508295e-05, + "loss": 0.1904, + "step": 3970 + }, + { + "epoch": 0.40434826780453115, + "grad_norm": 3.0, + "learning_rate": 3.285717110317379e-05, + "loss": 0.1991, + "step": 3980 + }, + { + "epoch": 0.40536421822615054, + "grad_norm": 7.21875, + "learning_rate": 3.27806035991431e-05, + "loss": 0.1445, + "step": 3990 + }, + { + "epoch": 0.40638016864777, + "grad_norm": 1.0859375, + "learning_rate": 3.2703955199074075e-05, + "loss": 0.2393, + "step": 4000 + }, + { + "epoch": 0.40739611906938944, + "grad_norm": 4.5625, + "learning_rate": 3.262722669989098e-05, + "loss": 0.1789, + "step": 4010 + }, + { + "epoch": 0.40841206949100883, + "grad_norm": 3.09375, + "learning_rate": 3.255041889935092e-05, + "loss": 0.1511, + "step": 4020 + }, + { + "epoch": 0.4094280199126283, + "grad_norm": 1.90625, + "learning_rate": 3.247353259603547e-05, + "loss": 0.2066, + "step": 4030 + }, + { + "epoch": 0.4104439703342477, + "grad_norm": 2.28125, + "learning_rate": 3.239656858934242e-05, + "loss": 0.1564, + "step": 4040 + }, + { + "epoch": 0.4114599207558671, + "grad_norm": 2.609375, + "learning_rate": 3.231952767947746e-05, + "loss": 0.1503, + "step": 4050 + }, + { + "epoch": 0.4124758711774865, + "grad_norm": 1.4453125, + "learning_rate": 3.2242410667445844e-05, + "loss": 0.1633, + "step": 4060 + }, + { + "epoch": 0.41349182159910597, + "grad_norm": 3.015625, + "learning_rate": 3.2165218355044076e-05, + "loss": 0.1492, + "step": 4070 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 3.234375, + "learning_rate": 3.2087951544851566e-05, + "loss": 0.3051, + "step": 4080 + }, + { + "epoch": 0.4155237224423448, + "grad_norm": 2.9375, + "learning_rate": 3.20106110402223e-05, + "loss": 0.2229, + "step": 4090 + }, + { + "epoch": 0.41653967286396426, + "grad_norm": 3.171875, + "learning_rate": 3.1933197645276455e-05, + "loss": 0.2224, + "step": 4100 + }, + { + "epoch": 0.41755562328558365, + "grad_norm": 2.09375, + "learning_rate": 3.185571216489209e-05, + "loss": 0.1297, + "step": 4110 + }, + { + "epoch": 0.4185715737072031, + "grad_norm": 3.625, + "learning_rate": 3.177815540469669e-05, + "loss": 0.2074, + "step": 4120 + }, + { + "epoch": 0.4195875241288225, + "grad_norm": 2.296875, + "learning_rate": 3.1700528171058916e-05, + "loss": 0.1949, + "step": 4130 + }, + { + "epoch": 0.42060347455044195, + "grad_norm": 3.8125, + "learning_rate": 3.162283127108011e-05, + "loss": 0.1661, + "step": 4140 + }, + { + "epoch": 0.42161942497206134, + "grad_norm": 2.5, + "learning_rate": 3.154506551258594e-05, + "loss": 0.2275, + "step": 4150 + }, + { + "epoch": 0.4226353753936808, + "grad_norm": 2.96875, + "learning_rate": 3.146723170411804e-05, + "loss": 0.2242, + "step": 4160 + }, + { + "epoch": 0.42365132581530024, + "grad_norm": 6.625, + "learning_rate": 3.138933065492552e-05, + "loss": 0.1897, + "step": 4170 + }, + { + "epoch": 0.42466727623691963, + "grad_norm": 0.8515625, + "learning_rate": 3.131136317495665e-05, + "loss": 0.1629, + "step": 4180 + }, + { + "epoch": 0.4256832266585391, + "grad_norm": 0.94140625, + "learning_rate": 3.1233330074850364e-05, + "loss": 0.1535, + "step": 4190 + }, + { + "epoch": 0.4266991770801585, + "grad_norm": 2.6875, + "learning_rate": 3.115523216592786e-05, + "loss": 0.2494, + "step": 4200 + }, + { + "epoch": 0.4277151275017779, + "grad_norm": 2.578125, + "learning_rate": 3.107707026018417e-05, + "loss": 0.1705, + "step": 4210 + }, + { + "epoch": 0.4287310779233973, + "grad_norm": 3.0625, + "learning_rate": 3.09988451702797e-05, + "loss": 0.1507, + "step": 4220 + }, + { + "epoch": 0.42974702834501677, + "grad_norm": 2.421875, + "learning_rate": 3.0920557709531804e-05, + "loss": 0.3071, + "step": 4230 + }, + { + "epoch": 0.43076297876663616, + "grad_norm": 3.640625, + "learning_rate": 3.0842208691906306e-05, + "loss": 0.199, + "step": 4240 + }, + { + "epoch": 0.4317789291882556, + "grad_norm": 3.5625, + "learning_rate": 3.076379893200904e-05, + "loss": 0.1987, + "step": 4250 + }, + { + "epoch": 0.43279487960987506, + "grad_norm": 3.65625, + "learning_rate": 3.068532924507739e-05, + "loss": 0.1945, + "step": 4260 + }, + { + "epoch": 0.43381083003149445, + "grad_norm": 5.875, + "learning_rate": 3.060680044697183e-05, + "loss": 0.1937, + "step": 4270 + }, + { + "epoch": 0.4348267804531139, + "grad_norm": 2.859375, + "learning_rate": 3.052821335416739e-05, + "loss": 0.1643, + "step": 4280 + }, + { + "epoch": 0.4358427308747333, + "grad_norm": 3.296875, + "learning_rate": 3.0449568783745203e-05, + "loss": 0.1455, + "step": 4290 + }, + { + "epoch": 0.43685868129635275, + "grad_norm": 0.427734375, + "learning_rate": 3.0370867553384023e-05, + "loss": 0.1891, + "step": 4300 + }, + { + "epoch": 0.43787463171797214, + "grad_norm": 0.361328125, + "learning_rate": 3.029211048135171e-05, + "loss": 0.1377, + "step": 4310 + }, + { + "epoch": 0.4388905821395916, + "grad_norm": 1.8203125, + "learning_rate": 3.021329838649668e-05, + "loss": 0.2194, + "step": 4320 + }, + { + "epoch": 0.43990653256121104, + "grad_norm": 1.8828125, + "learning_rate": 3.0134432088239462e-05, + "loss": 0.1915, + "step": 4330 + }, + { + "epoch": 0.44092248298283043, + "grad_norm": 2.015625, + "learning_rate": 3.0055512406564146e-05, + "loss": 0.1794, + "step": 4340 + }, + { + "epoch": 0.4419384334044499, + "grad_norm": 2.546875, + "learning_rate": 2.9976540162009836e-05, + "loss": 0.2154, + "step": 4350 + }, + { + "epoch": 0.4429543838260693, + "grad_norm": 4.09375, + "learning_rate": 2.9897516175662155e-05, + "loss": 0.1861, + "step": 4360 + }, + { + "epoch": 0.4439703342476887, + "grad_norm": 3.953125, + "learning_rate": 2.9818441269144693e-05, + "loss": 0.1857, + "step": 4370 + }, + { + "epoch": 0.4449862846693081, + "grad_norm": 2.234375, + "learning_rate": 2.9739316264610452e-05, + "loss": 0.1493, + "step": 4380 + }, + { + "epoch": 0.44600223509092757, + "grad_norm": 1.109375, + "learning_rate": 2.966014198473332e-05, + "loss": 0.186, + "step": 4390 + }, + { + "epoch": 0.44701818551254696, + "grad_norm": 4.5625, + "learning_rate": 2.9580919252699502e-05, + "loss": 0.1963, + "step": 4400 + }, + { + "epoch": 0.4480341359341664, + "grad_norm": 7.3125, + "learning_rate": 2.9501648892198984e-05, + "loss": 0.2882, + "step": 4410 + }, + { + "epoch": 0.44905008635578586, + "grad_norm": 3.03125, + "learning_rate": 2.942233172741693e-05, + "loss": 0.2154, + "step": 4420 + }, + { + "epoch": 0.45006603677740525, + "grad_norm": 2.421875, + "learning_rate": 2.934296858302515e-05, + "loss": 0.2228, + "step": 4430 + }, + { + "epoch": 0.4510819871990247, + "grad_norm": 1.6015625, + "learning_rate": 2.9263560284173485e-05, + "loss": 0.1637, + "step": 4440 + }, + { + "epoch": 0.4520979376206441, + "grad_norm": 4.5, + "learning_rate": 2.91841076564813e-05, + "loss": 0.1396, + "step": 4450 + }, + { + "epoch": 0.45311388804226355, + "grad_norm": 1.9609375, + "learning_rate": 2.9104611526028808e-05, + "loss": 0.186, + "step": 4460 + }, + { + "epoch": 0.45412983846388294, + "grad_norm": 2.046875, + "learning_rate": 2.902507271934855e-05, + "loss": 0.1706, + "step": 4470 + }, + { + "epoch": 0.4551457888855024, + "grad_norm": 2.390625, + "learning_rate": 2.8945492063416768e-05, + "loss": 0.2191, + "step": 4480 + }, + { + "epoch": 0.45616173930712184, + "grad_norm": 2.734375, + "learning_rate": 2.8865870385644823e-05, + "loss": 0.1651, + "step": 4490 + }, + { + "epoch": 0.45717768972874123, + "grad_norm": 4.4375, + "learning_rate": 2.8786208513870583e-05, + "loss": 0.1907, + "step": 4500 + }, + { + "epoch": 0.4581936401503607, + "grad_norm": 1.9609375, + "learning_rate": 2.8706507276349815e-05, + "loss": 0.2256, + "step": 4510 + }, + { + "epoch": 0.4592095905719801, + "grad_norm": 3.375, + "learning_rate": 2.8626767501747588e-05, + "loss": 0.215, + "step": 4520 + }, + { + "epoch": 0.4602255409935995, + "grad_norm": 2.296875, + "learning_rate": 2.854699001912964e-05, + "loss": 0.2241, + "step": 4530 + }, + { + "epoch": 0.4612414914152189, + "grad_norm": 2.078125, + "learning_rate": 2.846717565795376e-05, + "loss": 0.1541, + "step": 4540 + }, + { + "epoch": 0.46225744183683837, + "grad_norm": 0.81640625, + "learning_rate": 2.8387325248061164e-05, + "loss": 0.1718, + "step": 4550 + }, + { + "epoch": 0.46327339225845776, + "grad_norm": 5.6875, + "learning_rate": 2.8307439619667897e-05, + "loss": 0.259, + "step": 4560 + }, + { + "epoch": 0.4642893426800772, + "grad_norm": 1.78125, + "learning_rate": 2.8227519603356157e-05, + "loss": 0.2205, + "step": 4570 + }, + { + "epoch": 0.46530529310169666, + "grad_norm": 4.78125, + "learning_rate": 2.8147566030065677e-05, + "loss": 0.2256, + "step": 4580 + }, + { + "epoch": 0.46632124352331605, + "grad_norm": 3.296875, + "learning_rate": 2.8067579731085085e-05, + "loss": 0.1671, + "step": 4590 + }, + { + "epoch": 0.4673371939449355, + "grad_norm": 3.265625, + "learning_rate": 2.7987561538043273e-05, + "loss": 0.2471, + "step": 4600 + }, + { + "epoch": 0.4683531443665549, + "grad_norm": 3.390625, + "learning_rate": 2.7907512282900727e-05, + "loss": 0.1749, + "step": 4610 + }, + { + "epoch": 0.46936909478817435, + "grad_norm": 3.140625, + "learning_rate": 2.782743279794091e-05, + "loss": 0.2276, + "step": 4620 + }, + { + "epoch": 0.47038504520979374, + "grad_norm": 2.921875, + "learning_rate": 2.7747323915761574e-05, + "loss": 0.1971, + "step": 4630 + }, + { + "epoch": 0.4714009956314132, + "grad_norm": 4.15625, + "learning_rate": 2.7667186469266122e-05, + "loss": 0.1951, + "step": 4640 + }, + { + "epoch": 0.47241694605303264, + "grad_norm": 2.953125, + "learning_rate": 2.7587021291654924e-05, + "loss": 0.2045, + "step": 4650 + }, + { + "epoch": 0.47343289647465203, + "grad_norm": 1.6640625, + "learning_rate": 2.750682921641672e-05, + "loss": 0.155, + "step": 4660 + }, + { + "epoch": 0.4744488468962715, + "grad_norm": 4.375, + "learning_rate": 2.7426611077319864e-05, + "loss": 0.2038, + "step": 4670 + }, + { + "epoch": 0.4754647973178909, + "grad_norm": 5.5, + "learning_rate": 2.734636770840372e-05, + "loss": 0.159, + "step": 4680 + }, + { + "epoch": 0.4764807477395103, + "grad_norm": 1.703125, + "learning_rate": 2.7266099943969976e-05, + "loss": 0.1566, + "step": 4690 + }, + { + "epoch": 0.4774966981611297, + "grad_norm": 0.81640625, + "learning_rate": 2.7185808618573943e-05, + "loss": 0.1927, + "step": 4700 + }, + { + "epoch": 0.47851264858274917, + "grad_norm": 0.81640625, + "learning_rate": 2.710549456701592e-05, + "loss": 0.1873, + "step": 4710 + }, + { + "epoch": 0.47952859900436856, + "grad_norm": 3.828125, + "learning_rate": 2.702515862433247e-05, + "loss": 0.2474, + "step": 4720 + }, + { + "epoch": 0.480544549425988, + "grad_norm": 1.1640625, + "learning_rate": 2.6944801625787795e-05, + "loss": 0.204, + "step": 4730 + }, + { + "epoch": 0.48156049984760746, + "grad_norm": 2.953125, + "learning_rate": 2.6864424406864984e-05, + "loss": 0.1758, + "step": 4740 + }, + { + "epoch": 0.48257645026922685, + "grad_norm": 3.265625, + "learning_rate": 2.6784027803257377e-05, + "loss": 0.161, + "step": 4750 + }, + { + "epoch": 0.4835924006908463, + "grad_norm": 2.046875, + "learning_rate": 2.6703612650859848e-05, + "loss": 0.1469, + "step": 4760 + }, + { + "epoch": 0.4846083511124657, + "grad_norm": 4.03125, + "learning_rate": 2.6623179785760148e-05, + "loss": 0.1858, + "step": 4770 + }, + { + "epoch": 0.48562430153408515, + "grad_norm": 2.65625, + "learning_rate": 2.6542730044230175e-05, + "loss": 0.176, + "step": 4780 + }, + { + "epoch": 0.48664025195570454, + "grad_norm": 2.59375, + "learning_rate": 2.6462264262717278e-05, + "loss": 0.1657, + "step": 4790 + }, + { + "epoch": 0.487656202377324, + "grad_norm": 4.78125, + "learning_rate": 2.6381783277835605e-05, + "loss": 0.2705, + "step": 4800 + }, + { + "epoch": 0.48867215279894344, + "grad_norm": 3.65625, + "learning_rate": 2.6301287926357355e-05, + "loss": 0.2252, + "step": 4810 + }, + { + "epoch": 0.48968810322056283, + "grad_norm": 0.734375, + "learning_rate": 2.622077904520411e-05, + "loss": 0.2141, + "step": 4820 + }, + { + "epoch": 0.4907040536421823, + "grad_norm": 5.15625, + "learning_rate": 2.6140257471438108e-05, + "loss": 0.1935, + "step": 4830 + }, + { + "epoch": 0.4917200040638017, + "grad_norm": 3.625, + "learning_rate": 2.6059724042253574e-05, + "loss": 0.2121, + "step": 4840 + }, + { + "epoch": 0.4927359544854211, + "grad_norm": 1.2890625, + "learning_rate": 2.5979179594967983e-05, + "loss": 0.1221, + "step": 4850 + }, + { + "epoch": 0.4937519049070405, + "grad_norm": 3.4375, + "learning_rate": 2.5898624967013367e-05, + "loss": 0.2208, + "step": 4860 + }, + { + "epoch": 0.49476785532865997, + "grad_norm": 2.40625, + "learning_rate": 2.5818060995927607e-05, + "loss": 0.1904, + "step": 4870 + }, + { + "epoch": 0.49578380575027936, + "grad_norm": 2.921875, + "learning_rate": 2.573748851934574e-05, + "loss": 0.1658, + "step": 4880 + }, + { + "epoch": 0.4967997561718988, + "grad_norm": 1.6640625, + "learning_rate": 2.5656908374991213e-05, + "loss": 0.1626, + "step": 4890 + }, + { + "epoch": 0.49781570659351826, + "grad_norm": 1.8046875, + "learning_rate": 2.557632140066721e-05, + "loss": 0.1905, + "step": 4900 + }, + { + "epoch": 0.49883165701513765, + "grad_norm": 4.875, + "learning_rate": 2.5495728434247917e-05, + "loss": 0.2591, + "step": 4910 + }, + { + "epoch": 0.4998476074367571, + "grad_norm": 1.4453125, + "learning_rate": 2.5415130313669845e-05, + "loss": 0.1359, + "step": 4920 + }, + { + "epoch": 0.5008635578583766, + "grad_norm": 2.109375, + "learning_rate": 2.5334527876923063e-05, + "loss": 0.2353, + "step": 4930 + }, + { + "epoch": 0.501879508279996, + "grad_norm": 3.546875, + "learning_rate": 2.5253921962042525e-05, + "loss": 0.2173, + "step": 4940 + }, + { + "epoch": 0.5028954587016153, + "grad_norm": 1.8125, + "learning_rate": 2.5173313407099373e-05, + "loss": 0.1631, + "step": 4950 + }, + { + "epoch": 0.5039114091232347, + "grad_norm": 2.671875, + "learning_rate": 2.5092703050192163e-05, + "loss": 0.1884, + "step": 4960 + }, + { + "epoch": 0.5049273595448542, + "grad_norm": 2.5625, + "learning_rate": 2.501209172943819e-05, + "loss": 0.217, + "step": 4970 + }, + { + "epoch": 0.5059433099664736, + "grad_norm": 4.375, + "learning_rate": 2.49314802829648e-05, + "loss": 0.1854, + "step": 4980 + }, + { + "epoch": 0.506959260388093, + "grad_norm": 2.3125, + "learning_rate": 2.4850869548900628e-05, + "loss": 0.2049, + "step": 4990 + }, + { + "epoch": 0.5079752108097125, + "grad_norm": 3.859375, + "learning_rate": 2.477026036536688e-05, + "loss": 0.2093, + "step": 5000 + }, + { + "epoch": 0.5089911612313319, + "grad_norm": 1.09375, + "learning_rate": 2.4689653570468677e-05, + "loss": 0.164, + "step": 5010 + }, + { + "epoch": 0.5100071116529513, + "grad_norm": 3.40625, + "learning_rate": 2.460905000228628e-05, + "loss": 0.1649, + "step": 5020 + }, + { + "epoch": 0.5110230620745707, + "grad_norm": 3.546875, + "learning_rate": 2.4528450498866428e-05, + "loss": 0.1777, + "step": 5030 + }, + { + "epoch": 0.5120390124961902, + "grad_norm": 3.0, + "learning_rate": 2.444785589821356e-05, + "loss": 0.1505, + "step": 5040 + }, + { + "epoch": 0.5130549629178096, + "grad_norm": 1.6484375, + "learning_rate": 2.436726703828118e-05, + "loss": 0.2672, + "step": 5050 + }, + { + "epoch": 0.514070913339429, + "grad_norm": 4.34375, + "learning_rate": 2.428668475696308e-05, + "loss": 0.1756, + "step": 5060 + }, + { + "epoch": 0.5150868637610485, + "grad_norm": 2.78125, + "learning_rate": 2.420610989208465e-05, + "loss": 0.1655, + "step": 5070 + }, + { + "epoch": 0.5161028141826679, + "grad_norm": 1.4609375, + "learning_rate": 2.412554328139419e-05, + "loss": 0.1579, + "step": 5080 + }, + { + "epoch": 0.5171187646042873, + "grad_norm": 2.28125, + "learning_rate": 2.404498576255416e-05, + "loss": 0.1599, + "step": 5090 + }, + { + "epoch": 0.5181347150259067, + "grad_norm": 0.6484375, + "learning_rate": 2.3964438173132522e-05, + "loss": 0.1508, + "step": 5100 + }, + { + "epoch": 0.5191506654475262, + "grad_norm": 3.390625, + "learning_rate": 2.388390135059395e-05, + "loss": 0.1578, + "step": 5110 + }, + { + "epoch": 0.5201666158691456, + "grad_norm": 1.21875, + "learning_rate": 2.3803376132291226e-05, + "loss": 0.1374, + "step": 5120 + }, + { + "epoch": 0.521182566290765, + "grad_norm": 4.0625, + "learning_rate": 2.3722863355456436e-05, + "loss": 0.1854, + "step": 5130 + }, + { + "epoch": 0.5221985167123845, + "grad_norm": 4.71875, + "learning_rate": 2.364236385719236e-05, + "loss": 0.1391, + "step": 5140 + }, + { + "epoch": 0.5232144671340039, + "grad_norm": 3.296875, + "learning_rate": 2.356187847446366e-05, + "loss": 0.2106, + "step": 5150 + }, + { + "epoch": 0.5242304175556233, + "grad_norm": 3.296875, + "learning_rate": 2.348140804408829e-05, + "loss": 0.2383, + "step": 5160 + }, + { + "epoch": 0.5252463679772427, + "grad_norm": 3.359375, + "learning_rate": 2.3400953402728713e-05, + "loss": 0.1537, + "step": 5170 + }, + { + "epoch": 0.5262623183988622, + "grad_norm": 1.4921875, + "learning_rate": 2.332051538688322e-05, + "loss": 0.1841, + "step": 5180 + }, + { + "epoch": 0.5272782688204816, + "grad_norm": 3.25, + "learning_rate": 2.3240094832877287e-05, + "loss": 0.1855, + "step": 5190 + }, + { + "epoch": 0.528294219242101, + "grad_norm": 3.34375, + "learning_rate": 2.3159692576854793e-05, + "loss": 0.2625, + "step": 5200 + }, + { + "epoch": 0.5293101696637205, + "grad_norm": 3.6875, + "learning_rate": 2.3079309454769413e-05, + "loss": 0.1292, + "step": 5210 + }, + { + "epoch": 0.5303261200853399, + "grad_norm": 1.1171875, + "learning_rate": 2.2998946302375827e-05, + "loss": 0.1263, + "step": 5220 + }, + { + "epoch": 0.5313420705069593, + "grad_norm": 2.71875, + "learning_rate": 2.2918603955221148e-05, + "loss": 0.2296, + "step": 5230 + }, + { + "epoch": 0.5323580209285786, + "grad_norm": 2.015625, + "learning_rate": 2.283828324863613e-05, + "loss": 0.1231, + "step": 5240 + }, + { + "epoch": 0.5333739713501982, + "grad_norm": 3.671875, + "learning_rate": 2.2757985017726557e-05, + "loss": 0.1939, + "step": 5250 + }, + { + "epoch": 0.5343899217718175, + "grad_norm": 1.9765625, + "learning_rate": 2.2677710097364495e-05, + "loss": 0.168, + "step": 5260 + }, + { + "epoch": 0.5354058721934369, + "grad_norm": 2.609375, + "learning_rate": 2.259745932217969e-05, + "loss": 0.1883, + "step": 5270 + }, + { + "epoch": 0.5364218226150563, + "grad_norm": 2.8125, + "learning_rate": 2.2517233526550817e-05, + "loss": 0.1898, + "step": 5280 + }, + { + "epoch": 0.5374377730366758, + "grad_norm": 3.125, + "learning_rate": 2.2437033544596837e-05, + "loss": 0.1838, + "step": 5290 + }, + { + "epoch": 0.5384537234582952, + "grad_norm": 4.90625, + "learning_rate": 2.2356860210168336e-05, + "loss": 0.1553, + "step": 5300 + }, + { + "epoch": 0.5394696738799146, + "grad_norm": 3.171875, + "learning_rate": 2.2276714356838824e-05, + "loss": 0.2248, + "step": 5310 + }, + { + "epoch": 0.5404856243015341, + "grad_norm": 1.34375, + "learning_rate": 2.2196596817896118e-05, + "loss": 0.1421, + "step": 5320 + }, + { + "epoch": 0.5415015747231535, + "grad_norm": 3.28125, + "learning_rate": 2.2116508426333596e-05, + "loss": 0.1947, + "step": 5330 + }, + { + "epoch": 0.5425175251447729, + "grad_norm": 1.9296875, + "learning_rate": 2.2036450014841652e-05, + "loss": 0.2207, + "step": 5340 + }, + { + "epoch": 0.5435334755663923, + "grad_norm": 0.5703125, + "learning_rate": 2.19564224157989e-05, + "loss": 0.2208, + "step": 5350 + }, + { + "epoch": 0.5445494259880118, + "grad_norm": 7.5625, + "learning_rate": 2.1876426461263654e-05, + "loss": 0.1739, + "step": 5360 + }, + { + "epoch": 0.5455653764096312, + "grad_norm": 2.15625, + "learning_rate": 2.179646298296519e-05, + "loss": 0.1938, + "step": 5370 + }, + { + "epoch": 0.5465813268312506, + "grad_norm": 4.1875, + "learning_rate": 2.171653281229511e-05, + "loss": 0.1736, + "step": 5380 + }, + { + "epoch": 0.5475972772528701, + "grad_norm": 4.65625, + "learning_rate": 2.1636636780298732e-05, + "loss": 0.2167, + "step": 5390 + }, + { + "epoch": 0.5486132276744895, + "grad_norm": 1.84375, + "learning_rate": 2.1556775717666427e-05, + "loss": 0.1711, + "step": 5400 + }, + { + "epoch": 0.5496291780961089, + "grad_norm": 5.125, + "learning_rate": 2.147695045472499e-05, + "loss": 0.1789, + "step": 5410 + }, + { + "epoch": 0.5506451285177283, + "grad_norm": 3.859375, + "learning_rate": 2.1397161821428973e-05, + "loss": 0.2187, + "step": 5420 + }, + { + "epoch": 0.5516610789393478, + "grad_norm": 2.25, + "learning_rate": 2.131741064735212e-05, + "loss": 0.1367, + "step": 5430 + }, + { + "epoch": 0.5526770293609672, + "grad_norm": 4.65625, + "learning_rate": 2.1237697761678684e-05, + "loss": 0.1574, + "step": 5440 + }, + { + "epoch": 0.5536929797825866, + "grad_norm": 1.2265625, + "learning_rate": 2.1158023993194848e-05, + "loss": 0.1301, + "step": 5450 + }, + { + "epoch": 0.5547089302042061, + "grad_norm": 4.21875, + "learning_rate": 2.107839017028005e-05, + "loss": 0.2782, + "step": 5460 + }, + { + "epoch": 0.5557248806258255, + "grad_norm": 0.52734375, + "learning_rate": 2.0998797120898457e-05, + "loss": 0.2024, + "step": 5470 + }, + { + "epoch": 0.5567408310474449, + "grad_norm": 1.46875, + "learning_rate": 2.0919245672590277e-05, + "loss": 0.1755, + "step": 5480 + }, + { + "epoch": 0.5577567814690643, + "grad_norm": 2.140625, + "learning_rate": 2.083973665246318e-05, + "loss": 0.2058, + "step": 5490 + }, + { + "epoch": 0.5587727318906838, + "grad_norm": 1.5390625, + "learning_rate": 2.076027088718373e-05, + "loss": 0.2159, + "step": 5500 + }, + { + "epoch": 0.5597886823123032, + "grad_norm": 1.9921875, + "learning_rate": 2.0680849202968743e-05, + "loss": 0.2139, + "step": 5510 + }, + { + "epoch": 0.5608046327339226, + "grad_norm": 2.4375, + "learning_rate": 2.060147242557674e-05, + "loss": 0.183, + "step": 5520 + }, + { + "epoch": 0.5618205831555421, + "grad_norm": 5.5, + "learning_rate": 2.0522141380299308e-05, + "loss": 0.1673, + "step": 5530 + }, + { + "epoch": 0.5628365335771615, + "grad_norm": 4.25, + "learning_rate": 2.044285689195258e-05, + "loss": 0.1674, + "step": 5540 + }, + { + "epoch": 0.5638524839987809, + "grad_norm": 2.109375, + "learning_rate": 2.0363619784868604e-05, + "loss": 0.1531, + "step": 5550 + }, + { + "epoch": 0.5648684344204002, + "grad_norm": 2.59375, + "learning_rate": 2.0284430882886836e-05, + "loss": 0.1665, + "step": 5560 + }, + { + "epoch": 0.5658843848420197, + "grad_norm": 3.984375, + "learning_rate": 2.020529100934549e-05, + "loss": 0.1717, + "step": 5570 + }, + { + "epoch": 0.5669003352636391, + "grad_norm": 1.6015625, + "learning_rate": 2.012620098707306e-05, + "loss": 0.1167, + "step": 5580 + }, + { + "epoch": 0.5679162856852585, + "grad_norm": 6.0625, + "learning_rate": 2.004716163837972e-05, + "loss": 0.2084, + "step": 5590 + }, + { + "epoch": 0.5689322361068779, + "grad_norm": 2.5625, + "learning_rate": 1.996817378504876e-05, + "loss": 0.1939, + "step": 5600 + }, + { + "epoch": 0.5699481865284974, + "grad_norm": 3.109375, + "learning_rate": 1.9889238248328108e-05, + "loss": 0.1241, + "step": 5610 + }, + { + "epoch": 0.5709641369501168, + "grad_norm": 4.875, + "learning_rate": 1.981035584892171e-05, + "loss": 0.1865, + "step": 5620 + }, + { + "epoch": 0.5719800873717362, + "grad_norm": 2.984375, + "learning_rate": 1.9731527406981072e-05, + "loss": 0.1639, + "step": 5630 + }, + { + "epoch": 0.5729960377933557, + "grad_norm": 4.4375, + "learning_rate": 1.9652753742096655e-05, + "loss": 0.2019, + "step": 5640 + }, + { + "epoch": 0.5740119882149751, + "grad_norm": 4.3125, + "learning_rate": 1.9574035673289432e-05, + "loss": 0.1829, + "step": 5650 + }, + { + "epoch": 0.5750279386365945, + "grad_norm": 3.203125, + "learning_rate": 1.9495374019002312e-05, + "loss": 0.2267, + "step": 5660 + }, + { + "epoch": 0.5760438890582139, + "grad_norm": 1.765625, + "learning_rate": 1.9416769597091673e-05, + "loss": 0.1411, + "step": 5670 + }, + { + "epoch": 0.5770598394798334, + "grad_norm": 2.640625, + "learning_rate": 1.9338223224818818e-05, + "loss": 0.1476, + "step": 5680 + }, + { + "epoch": 0.5780757899014528, + "grad_norm": 4.84375, + "learning_rate": 1.9259735718841524e-05, + "loss": 0.1417, + "step": 5690 + }, + { + "epoch": 0.5790917403230722, + "grad_norm": 2.421875, + "learning_rate": 1.918130789520551e-05, + "loss": 0.1592, + "step": 5700 + }, + { + "epoch": 0.5801076907446917, + "grad_norm": 2.984375, + "learning_rate": 1.9102940569335963e-05, + "loss": 0.161, + "step": 5710 + }, + { + "epoch": 0.5811236411663111, + "grad_norm": 1.0234375, + "learning_rate": 1.9024634556029093e-05, + "loss": 0.1614, + "step": 5720 + }, + { + "epoch": 0.5821395915879305, + "grad_norm": 2.90625, + "learning_rate": 1.89463906694436e-05, + "loss": 0.1505, + "step": 5730 + }, + { + "epoch": 0.5831555420095499, + "grad_norm": 2.875, + "learning_rate": 1.8868209723092286e-05, + "loss": 0.1674, + "step": 5740 + }, + { + "epoch": 0.5841714924311694, + "grad_norm": 0.408203125, + "learning_rate": 1.8790092529833508e-05, + "loss": 0.1468, + "step": 5750 + }, + { + "epoch": 0.5851874428527888, + "grad_norm": 5.1875, + "learning_rate": 1.871203990186281e-05, + "loss": 0.1903, + "step": 5760 + }, + { + "epoch": 0.5862033932744082, + "grad_norm": 0.5546875, + "learning_rate": 1.8634052650704415e-05, + "loss": 0.2644, + "step": 5770 + }, + { + "epoch": 0.5872193436960277, + "grad_norm": 3.203125, + "learning_rate": 1.8556131587202848e-05, + "loss": 0.1968, + "step": 5780 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 2.484375, + "learning_rate": 1.8478277521514424e-05, + "loss": 0.2249, + "step": 5790 + }, + { + "epoch": 0.5892512445392665, + "grad_norm": 4.0, + "learning_rate": 1.8400491263098906e-05, + "loss": 0.1881, + "step": 5800 + }, + { + "epoch": 0.5902671949608859, + "grad_norm": 1.90625, + "learning_rate": 1.832277362071106e-05, + "loss": 0.1352, + "step": 5810 + }, + { + "epoch": 0.5912831453825054, + "grad_norm": 2.765625, + "learning_rate": 1.824512540239221e-05, + "loss": 0.2737, + "step": 5820 + }, + { + "epoch": 0.5922990958041248, + "grad_norm": 2.609375, + "learning_rate": 1.81675474154619e-05, + "loss": 0.1566, + "step": 5830 + }, + { + "epoch": 0.5933150462257442, + "grad_norm": 2.6875, + "learning_rate": 1.8090040466509444e-05, + "loss": 0.1999, + "step": 5840 + }, + { + "epoch": 0.5943309966473637, + "grad_norm": 2.609375, + "learning_rate": 1.8012605361385592e-05, + "loss": 0.2372, + "step": 5850 + }, + { + "epoch": 0.5953469470689831, + "grad_norm": 8.125, + "learning_rate": 1.7935242905194087e-05, + "loss": 0.2411, + "step": 5860 + }, + { + "epoch": 0.5963628974906025, + "grad_norm": 3.46875, + "learning_rate": 1.785795390228336e-05, + "loss": 0.138, + "step": 5870 + }, + { + "epoch": 0.5973788479122218, + "grad_norm": 2.3125, + "learning_rate": 1.7780739156238125e-05, + "loss": 0.1867, + "step": 5880 + }, + { + "epoch": 0.5983947983338413, + "grad_norm": 4.0625, + "learning_rate": 1.770359946987105e-05, + "loss": 0.2091, + "step": 5890 + }, + { + "epoch": 0.5994107487554607, + "grad_norm": 5.21875, + "learning_rate": 1.7626535645214378e-05, + "loss": 0.2091, + "step": 5900 + }, + { + "epoch": 0.6004266991770801, + "grad_norm": 3.15625, + "learning_rate": 1.7549548483511614e-05, + "loss": 0.1927, + "step": 5910 + }, + { + "epoch": 0.6014426495986995, + "grad_norm": 4.71875, + "learning_rate": 1.7472638785209198e-05, + "loss": 0.1893, + "step": 5920 + }, + { + "epoch": 0.602458600020319, + "grad_norm": 3.015625, + "learning_rate": 1.7395807349948145e-05, + "loss": 0.1557, + "step": 5930 + }, + { + "epoch": 0.6034745504419384, + "grad_norm": 2.9375, + "learning_rate": 1.73190549765558e-05, + "loss": 0.1717, + "step": 5940 + }, + { + "epoch": 0.6044905008635578, + "grad_norm": 3.109375, + "learning_rate": 1.724238246303745e-05, + "loss": 0.1879, + "step": 5950 + }, + { + "epoch": 0.6055064512851773, + "grad_norm": 3.875, + "learning_rate": 1.71657906065681e-05, + "loss": 0.1908, + "step": 5960 + }, + { + "epoch": 0.6065224017067967, + "grad_norm": 5.09375, + "learning_rate": 1.7089280203484115e-05, + "loss": 0.1712, + "step": 5970 + }, + { + "epoch": 0.6075383521284161, + "grad_norm": 3.015625, + "learning_rate": 1.701285204927502e-05, + "loss": 0.1454, + "step": 5980 + }, + { + "epoch": 0.6085543025500355, + "grad_norm": 3.265625, + "learning_rate": 1.693650693857515e-05, + "loss": 0.2283, + "step": 5990 + }, + { + "epoch": 0.609570252971655, + "grad_norm": 3.40625, + "learning_rate": 1.6860245665155466e-05, + "loss": 0.2188, + "step": 6000 + }, + { + "epoch": 0.6105862033932744, + "grad_norm": 2.5625, + "learning_rate": 1.678406902191521e-05, + "loss": 0.1605, + "step": 6010 + }, + { + "epoch": 0.6116021538148938, + "grad_norm": 0.6796875, + "learning_rate": 1.670797780087374e-05, + "loss": 0.1472, + "step": 6020 + }, + { + "epoch": 0.6126181042365133, + "grad_norm": 2.234375, + "learning_rate": 1.6631972793162288e-05, + "loss": 0.1676, + "step": 6030 + }, + { + "epoch": 0.6136340546581327, + "grad_norm": 1.25, + "learning_rate": 1.6556054789015662e-05, + "loss": 0.1508, + "step": 6040 + }, + { + "epoch": 0.6146500050797521, + "grad_norm": 4.78125, + "learning_rate": 1.6480224577764132e-05, + "loss": 0.1981, + "step": 6050 + }, + { + "epoch": 0.6156659555013715, + "grad_norm": 3.46875, + "learning_rate": 1.6404482947825137e-05, + "loss": 0.2514, + "step": 6060 + }, + { + "epoch": 0.616681905922991, + "grad_norm": 1.265625, + "learning_rate": 1.6328830686695154e-05, + "loss": 0.2397, + "step": 6070 + }, + { + "epoch": 0.6176978563446104, + "grad_norm": 1.953125, + "learning_rate": 1.625326858094144e-05, + "loss": 0.1523, + "step": 6080 + }, + { + "epoch": 0.6187138067662298, + "grad_norm": 3.484375, + "learning_rate": 1.6177797416193953e-05, + "loss": 0.218, + "step": 6090 + }, + { + "epoch": 0.6197297571878493, + "grad_norm": 3.484375, + "learning_rate": 1.6102417977137052e-05, + "loss": 0.1476, + "step": 6100 + }, + { + "epoch": 0.6207457076094687, + "grad_norm": 4.90625, + "learning_rate": 1.602713104750147e-05, + "loss": 0.1818, + "step": 6110 + }, + { + "epoch": 0.6217616580310881, + "grad_norm": 4.375, + "learning_rate": 1.5951937410056087e-05, + "loss": 0.2061, + "step": 6120 + }, + { + "epoch": 0.6227776084527075, + "grad_norm": 6.3125, + "learning_rate": 1.587683784659979e-05, + "loss": 0.1566, + "step": 6130 + }, + { + "epoch": 0.623793558874327, + "grad_norm": 2.828125, + "learning_rate": 1.58018331379534e-05, + "loss": 0.1376, + "step": 6140 + }, + { + "epoch": 0.6248095092959464, + "grad_norm": 2.40625, + "learning_rate": 1.572692406395149e-05, + "loss": 0.1655, + "step": 6150 + }, + { + "epoch": 0.6258254597175658, + "grad_norm": 4.34375, + "learning_rate": 1.5652111403434338e-05, + "loss": 0.2363, + "step": 6160 + }, + { + "epoch": 0.6268414101391853, + "grad_norm": 2.453125, + "learning_rate": 1.5577395934239757e-05, + "loss": 0.2464, + "step": 6170 + }, + { + "epoch": 0.6278573605608047, + "grad_norm": 2.53125, + "learning_rate": 1.5502778433195085e-05, + "loss": 0.1898, + "step": 6180 + }, + { + "epoch": 0.628873310982424, + "grad_norm": 2.28125, + "learning_rate": 1.5428259676109048e-05, + "loss": 0.1804, + "step": 6190 + }, + { + "epoch": 0.6298892614040434, + "grad_norm": 4.3125, + "learning_rate": 1.5353840437763732e-05, + "loss": 0.1409, + "step": 6200 + }, + { + "epoch": 0.630905211825663, + "grad_norm": 2.5625, + "learning_rate": 1.5279521491906496e-05, + "loss": 0.2449, + "step": 6210 + }, + { + "epoch": 0.6319211622472823, + "grad_norm": 3.0625, + "learning_rate": 1.520530361124195e-05, + "loss": 0.2103, + "step": 6220 + }, + { + "epoch": 0.6329371126689017, + "grad_norm": 2.609375, + "learning_rate": 1.5131187567423937e-05, + "loss": 0.2156, + "step": 6230 + }, + { + "epoch": 0.6339530630905211, + "grad_norm": 2.703125, + "learning_rate": 1.5057174131047446e-05, + "loss": 0.161, + "step": 6240 + }, + { + "epoch": 0.6349690135121406, + "grad_norm": 3.265625, + "learning_rate": 1.4983264071640679e-05, + "loss": 0.1757, + "step": 6250 + }, + { + "epoch": 0.63598496393376, + "grad_norm": 3.15625, + "learning_rate": 1.490945815765699e-05, + "loss": 0.2011, + "step": 6260 + }, + { + "epoch": 0.6370009143553794, + "grad_norm": 5.375, + "learning_rate": 1.4835757156466945e-05, + "loss": 0.1658, + "step": 6270 + }, + { + "epoch": 0.6380168647769989, + "grad_norm": 2.984375, + "learning_rate": 1.4762161834350271e-05, + "loss": 0.1754, + "step": 6280 + }, + { + "epoch": 0.6390328151986183, + "grad_norm": 2.015625, + "learning_rate": 1.4688672956487987e-05, + "loss": 0.1427, + "step": 6290 + }, + { + "epoch": 0.6400487656202377, + "grad_norm": 3.78125, + "learning_rate": 1.4615291286954352e-05, + "loss": 0.1517, + "step": 6300 + }, + { + "epoch": 0.6410647160418571, + "grad_norm": 2.859375, + "learning_rate": 1.4542017588709005e-05, + "loss": 0.2348, + "step": 6310 + }, + { + "epoch": 0.6420806664634766, + "grad_norm": 2.421875, + "learning_rate": 1.4468852623588961e-05, + "loss": 0.2089, + "step": 6320 + }, + { + "epoch": 0.643096616885096, + "grad_norm": 2.15625, + "learning_rate": 1.4395797152300719e-05, + "loss": 0.1702, + "step": 6330 + }, + { + "epoch": 0.6441125673067154, + "grad_norm": 1.53125, + "learning_rate": 1.4322851934412382e-05, + "loss": 0.1017, + "step": 6340 + }, + { + "epoch": 0.6451285177283349, + "grad_norm": 1.90625, + "learning_rate": 1.4250017728345716e-05, + "loss": 0.1813, + "step": 6350 + }, + { + "epoch": 0.6461444681499543, + "grad_norm": 2.015625, + "learning_rate": 1.4177295291368292e-05, + "loss": 0.1095, + "step": 6360 + }, + { + "epoch": 0.6471604185715737, + "grad_norm": 2.625, + "learning_rate": 1.410468537958558e-05, + "loss": 0.2259, + "step": 6370 + }, + { + "epoch": 0.6481763689931931, + "grad_norm": 3.5, + "learning_rate": 1.4032188747933136e-05, + "loss": 0.1595, + "step": 6380 + }, + { + "epoch": 0.6491923194148126, + "grad_norm": 5.21875, + "learning_rate": 1.39598061501687e-05, + "loss": 0.2226, + "step": 6390 + }, + { + "epoch": 0.650208269836432, + "grad_norm": 5.34375, + "learning_rate": 1.388753833886442e-05, + "loss": 0.2132, + "step": 6400 + }, + { + "epoch": 0.6512242202580514, + "grad_norm": 3.640625, + "learning_rate": 1.3815386065398945e-05, + "loss": 0.1227, + "step": 6410 + }, + { + "epoch": 0.6522401706796709, + "grad_norm": 1.0, + "learning_rate": 1.3743350079949705e-05, + "loss": 0.1755, + "step": 6420 + }, + { + "epoch": 0.6532561211012903, + "grad_norm": 2.359375, + "learning_rate": 1.3671431131485057e-05, + "loss": 0.1552, + "step": 6430 + }, + { + "epoch": 0.6542720715229097, + "grad_norm": 5.3125, + "learning_rate": 1.3599629967756483e-05, + "loss": 0.1917, + "step": 6440 + }, + { + "epoch": 0.6552880219445291, + "grad_norm": 4.625, + "learning_rate": 1.3527947335290877e-05, + "loss": 0.1812, + "step": 6450 + }, + { + "epoch": 0.6563039723661486, + "grad_norm": 1.234375, + "learning_rate": 1.3456383979382708e-05, + "loss": 0.1896, + "step": 6460 + }, + { + "epoch": 0.657319922787768, + "grad_norm": 3.984375, + "learning_rate": 1.3384940644086352e-05, + "loss": 0.1484, + "step": 6470 + }, + { + "epoch": 0.6583358732093874, + "grad_norm": 2.40625, + "learning_rate": 1.3313618072208268e-05, + "loss": 0.1334, + "step": 6480 + }, + { + "epoch": 0.6593518236310069, + "grad_norm": 4.375, + "learning_rate": 1.3242417005299357e-05, + "loss": 0.1351, + "step": 6490 + }, + { + "epoch": 0.6603677740526263, + "grad_norm": 2.640625, + "learning_rate": 1.31713381836472e-05, + "loss": 0.1717, + "step": 6500 + }, + { + "epoch": 0.6613837244742456, + "grad_norm": 2.640625, + "learning_rate": 1.3100382346268392e-05, + "loss": 0.1867, + "step": 6510 + }, + { + "epoch": 0.662399674895865, + "grad_norm": 1.734375, + "learning_rate": 1.3029550230900812e-05, + "loss": 0.1997, + "step": 6520 + }, + { + "epoch": 0.6634156253174845, + "grad_norm": 3.609375, + "learning_rate": 1.2958842573996016e-05, + "loss": 0.1969, + "step": 6530 + }, + { + "epoch": 0.6644315757391039, + "grad_norm": 3.578125, + "learning_rate": 1.2888260110711525e-05, + "loss": 0.1469, + "step": 6540 + }, + { + "epoch": 0.6654475261607233, + "grad_norm": 1.3515625, + "learning_rate": 1.2817803574903212e-05, + "loss": 0.1524, + "step": 6550 + }, + { + "epoch": 0.6664634765823427, + "grad_norm": 2.109375, + "learning_rate": 1.2747473699117668e-05, + "loss": 0.159, + "step": 6560 + }, + { + "epoch": 0.6674794270039622, + "grad_norm": 1.53125, + "learning_rate": 1.267727121458458e-05, + "loss": 0.1999, + "step": 6570 + }, + { + "epoch": 0.6684953774255816, + "grad_norm": 1.7265625, + "learning_rate": 1.2607196851209137e-05, + "loss": 0.2216, + "step": 6580 + }, + { + "epoch": 0.669511327847201, + "grad_norm": 3.125, + "learning_rate": 1.2537251337564412e-05, + "loss": 0.1607, + "step": 6590 + }, + { + "epoch": 0.6705272782688205, + "grad_norm": 2.421875, + "learning_rate": 1.2467435400883839e-05, + "loss": 0.2187, + "step": 6600 + }, + { + "epoch": 0.6715432286904399, + "grad_norm": 1.5078125, + "learning_rate": 1.239774976705359e-05, + "loss": 0.1753, + "step": 6610 + }, + { + "epoch": 0.6725591791120593, + "grad_norm": 1.140625, + "learning_rate": 1.2328195160605092e-05, + "loss": 0.194, + "step": 6620 + }, + { + "epoch": 0.6735751295336787, + "grad_norm": 4.9375, + "learning_rate": 1.225877230470743e-05, + "loss": 0.1485, + "step": 6630 + }, + { + "epoch": 0.6745910799552982, + "grad_norm": 3.65625, + "learning_rate": 1.218948192115988e-05, + "loss": 0.1847, + "step": 6640 + }, + { + "epoch": 0.6756070303769176, + "grad_norm": 3.875, + "learning_rate": 1.21203247303844e-05, + "loss": 0.1874, + "step": 6650 + }, + { + "epoch": 0.676622980798537, + "grad_norm": 2.65625, + "learning_rate": 1.2051301451418073e-05, + "loss": 0.2377, + "step": 6660 + }, + { + "epoch": 0.6776389312201565, + "grad_norm": 2.09375, + "learning_rate": 1.198241280190574e-05, + "loss": 0.1508, + "step": 6670 + }, + { + "epoch": 0.6786548816417759, + "grad_norm": 2.203125, + "learning_rate": 1.1913659498092431e-05, + "loss": 0.1537, + "step": 6680 + }, + { + "epoch": 0.6796708320633953, + "grad_norm": 2.484375, + "learning_rate": 1.184504225481601e-05, + "loss": 0.2339, + "step": 6690 + }, + { + "epoch": 0.6806867824850147, + "grad_norm": 5.625, + "learning_rate": 1.177656178549966e-05, + "loss": 0.2102, + "step": 6700 + }, + { + "epoch": 0.6817027329066342, + "grad_norm": 2.5, + "learning_rate": 1.1708218802144536e-05, + "loss": 0.1435, + "step": 6710 + }, + { + "epoch": 0.6827186833282536, + "grad_norm": 3.84375, + "learning_rate": 1.1640014015322323e-05, + "loss": 0.1823, + "step": 6720 + }, + { + "epoch": 0.683734633749873, + "grad_norm": 2.359375, + "learning_rate": 1.1571948134167862e-05, + "loss": 0.1154, + "step": 6730 + }, + { + "epoch": 0.6847505841714925, + "grad_norm": 2.90625, + "learning_rate": 1.1504021866371761e-05, + "loss": 0.2105, + "step": 6740 + }, + { + "epoch": 0.6857665345931119, + "grad_norm": 5.46875, + "learning_rate": 1.143623591817304e-05, + "loss": 0.1317, + "step": 6750 + }, + { + "epoch": 0.6867824850147313, + "grad_norm": 3.34375, + "learning_rate": 1.1368590994351835e-05, + "loss": 0.1406, + "step": 6760 + }, + { + "epoch": 0.6877984354363507, + "grad_norm": 3.78125, + "learning_rate": 1.130108779822198e-05, + "loss": 0.1425, + "step": 6770 + }, + { + "epoch": 0.6888143858579702, + "grad_norm": 0.77734375, + "learning_rate": 1.1233727031623783e-05, + "loss": 0.1623, + "step": 6780 + }, + { + "epoch": 0.6898303362795896, + "grad_norm": 4.625, + "learning_rate": 1.1166509394916682e-05, + "loss": 0.1591, + "step": 6790 + }, + { + "epoch": 0.690846286701209, + "grad_norm": 3.84375, + "learning_rate": 1.1099435586971982e-05, + "loss": 0.1758, + "step": 6800 + }, + { + "epoch": 0.6918622371228285, + "grad_norm": 2.4375, + "learning_rate": 1.1032506305165555e-05, + "loss": 0.1018, + "step": 6810 + }, + { + "epoch": 0.6928781875444479, + "grad_norm": 3.203125, + "learning_rate": 1.0965722245370641e-05, + "loss": 0.1485, + "step": 6820 + }, + { + "epoch": 0.6938941379660672, + "grad_norm": 0.7109375, + "learning_rate": 1.0899084101950561e-05, + "loss": 0.1762, + "step": 6830 + }, + { + "epoch": 0.6949100883876866, + "grad_norm": 1.9765625, + "learning_rate": 1.0832592567751555e-05, + "loss": 0.1402, + "step": 6840 + }, + { + "epoch": 0.6959260388093061, + "grad_norm": 1.4609375, + "learning_rate": 1.0766248334095505e-05, + "loss": 0.2278, + "step": 6850 + }, + { + "epoch": 0.6969419892309255, + "grad_norm": 3.953125, + "learning_rate": 1.0700052090772828e-05, + "loss": 0.1969, + "step": 6860 + }, + { + "epoch": 0.6979579396525449, + "grad_norm": 2.453125, + "learning_rate": 1.0634004526035249e-05, + "loss": 0.2073, + "step": 6870 + }, + { + "epoch": 0.6989738900741643, + "grad_norm": 1.6171875, + "learning_rate": 1.0568106326588645e-05, + "loss": 0.1902, + "step": 6880 + }, + { + "epoch": 0.6999898404957838, + "grad_norm": 1.2734375, + "learning_rate": 1.0502358177585953e-05, + "loss": 0.2165, + "step": 6890 + }, + { + "epoch": 0.7010057909174032, + "grad_norm": 1.671875, + "learning_rate": 1.0436760762619977e-05, + "loss": 0.1952, + "step": 6900 + }, + { + "epoch": 0.7020217413390226, + "grad_norm": 2.8125, + "learning_rate": 1.0371314763716347e-05, + "loss": 0.1422, + "step": 6910 + }, + { + "epoch": 0.7030376917606421, + "grad_norm": 2.53125, + "learning_rate": 1.0306020861326388e-05, + "loss": 0.0961, + "step": 6920 + }, + { + "epoch": 0.7040536421822615, + "grad_norm": 3.046875, + "learning_rate": 1.0240879734320068e-05, + "loss": 0.1542, + "step": 6930 + }, + { + "epoch": 0.7050695926038809, + "grad_norm": 2.859375, + "learning_rate": 1.0175892059978901e-05, + "loss": 0.1748, + "step": 6940 + }, + { + "epoch": 0.7060855430255003, + "grad_norm": 2.671875, + "learning_rate": 1.0111058513988958e-05, + "loss": 0.0819, + "step": 6950 + }, + { + "epoch": 0.7071014934471198, + "grad_norm": 3.5625, + "learning_rate": 1.0046379770433803e-05, + "loss": 0.1933, + "step": 6960 + }, + { + "epoch": 0.7081174438687392, + "grad_norm": 2.859375, + "learning_rate": 9.98185650178749e-06, + "loss": 0.1891, + "step": 6970 + }, + { + "epoch": 0.7091333942903586, + "grad_norm": 3.15625, + "learning_rate": 9.917489378907591e-06, + "loss": 0.2102, + "step": 6980 + }, + { + "epoch": 0.7101493447119781, + "grad_norm": 6.40625, + "learning_rate": 9.853279071028212e-06, + "loss": 0.1714, + "step": 6990 + }, + { + "epoch": 0.7111652951335975, + "grad_norm": 2.375, + "learning_rate": 9.78922624575303e-06, + "loss": 0.1299, + "step": 7000 + } + ], + "logging_steps": 10, + "max_steps": 9843, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoints/checkpoint-7000/training_args.bin b/checkpoints/checkpoint-7000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bb0a4ec10ab29ca1942aecdfa4212d352e373f1 --- /dev/null +++ b/checkpoints/checkpoint-7000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 +size 4984 diff --git a/checkpoints/checkpoint-7500/adapter_config.json b/checkpoints/checkpoint-7500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70 --- /dev/null +++ b/checkpoints/checkpoint-7500/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "gaussian", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoints/checkpoint-7500/adapter_model.safetensors b/checkpoints/checkpoint-7500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8b566b27331cac4819b9e695fbf784bff8e79cd5 --- /dev/null +++ b/checkpoints/checkpoint-7500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eea7fbe808952b90c42a26bb6a156e294c6b01b74f4561e32845a867894829fc +size 5919456 diff --git a/checkpoints/checkpoint-7500/optimizer.pt b/checkpoints/checkpoint-7500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd01feb285dc960cdf690295976b4050319066c1 --- /dev/null +++ b/checkpoints/checkpoint-7500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dd7bb44df6f3a93ddba0c17a7309fbe69b5d9611641b66779f565c2707e80dd +size 11930938 diff --git a/checkpoints/checkpoint-7500/rng_state_0.pth b/checkpoints/checkpoint-7500/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..0b99b3affc0b088b35c4713d5dbf363b6fb09e01 --- /dev/null +++ b/checkpoints/checkpoint-7500/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81a176e4f417232286c66488b5554a046fc1af84e1b5eff446ad37a4dc31c907 +size 15024 diff --git a/checkpoints/checkpoint-7500/rng_state_1.pth b/checkpoints/checkpoint-7500/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..9e56badfa0d1b294971878e842fe14a8595016f1 --- /dev/null +++ b/checkpoints/checkpoint-7500/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3899c82e743501e2db6cd0e409a1b646a2eae511ad1e21aa77bc3f897edbf07a +size 15024 diff --git a/checkpoints/checkpoint-7500/rng_state_2.pth b/checkpoints/checkpoint-7500/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..d5c75385d7ded955107873c0bccd2a63bd21f7ce --- /dev/null +++ b/checkpoints/checkpoint-7500/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e12f41676d53322341924bbe6951161bec80e7d0e4adb780e12732cfa714e98d +size 15024 diff --git a/checkpoints/checkpoint-7500/rng_state_3.pth b/checkpoints/checkpoint-7500/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..0f0ce3c448fcfd7f4444003295f128c63d9e46cd --- /dev/null +++ b/checkpoints/checkpoint-7500/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b87c6fdf947fac25cf632f98a25e1c5f87369325d38ab0c3dfe29dea4f62eb75 +size 15024 diff --git a/checkpoints/checkpoint-7500/scheduler.pt b/checkpoints/checkpoint-7500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a8d5e25619fe365bd3104d2c93dae8be1ab1f87 --- /dev/null +++ b/checkpoints/checkpoint-7500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01aa4970faf173b76d7b990defe083ac27fc2fbe404a2ecca54f2098c3a5e177 +size 1064 diff --git a/checkpoints/checkpoint-7500/trainer_state.json b/checkpoints/checkpoint-7500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ebe79f3a10531feb2f6b9e25d7e9001aa29a39fc --- /dev/null +++ b/checkpoints/checkpoint-7500/trainer_state.json @@ -0,0 +1,5271 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7619628162145687, + "eval_steps": 500, + "global_step": 7500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001015950421619425, + "grad_norm": 17.625, + "learning_rate": 5e-06, + "loss": 3.4264, + "step": 10 + }, + { + "epoch": 0.00203190084323885, + "grad_norm": 12.5625, + "learning_rate": 1e-05, + "loss": 3.432, + "step": 20 + }, + { + "epoch": 0.003047851264858275, + "grad_norm": 14.0625, + "learning_rate": 1.5e-05, + "loss": 3.23, + "step": 30 + }, + { + "epoch": 0.0040638016864777, + "grad_norm": 12.4375, + "learning_rate": 2e-05, + "loss": 2.9762, + "step": 40 + }, + { + "epoch": 0.005079752108097125, + "grad_norm": 10.0625, + "learning_rate": 2.5e-05, + "loss": 2.6173, + "step": 50 + }, + { + "epoch": 0.00609570252971655, + "grad_norm": 10.1875, + "learning_rate": 3e-05, + "loss": 2.2004, + "step": 60 + }, + { + "epoch": 0.007111652951335975, + "grad_norm": 7.03125, + "learning_rate": 3.5e-05, + "loss": 1.4176, + "step": 70 + }, + { + "epoch": 0.0081276033729554, + "grad_norm": 4.375, + "learning_rate": 4e-05, + "loss": 1.0122, + "step": 80 + }, + { + "epoch": 0.009143553794574825, + "grad_norm": 6.5625, + "learning_rate": 4.5e-05, + "loss": 0.9116, + "step": 90 + }, + { + "epoch": 0.01015950421619425, + "grad_norm": 5.28125, + "learning_rate": 5e-05, + "loss": 0.6832, + "step": 100 + }, + { + "epoch": 0.011175454637813675, + "grad_norm": 5.5, + "learning_rate": 4.9999870035728426e-05, + "loss": 0.7355, + "step": 110 + }, + { + "epoch": 0.0121914050594331, + "grad_norm": 5.1875, + "learning_rate": 4.9999480144264944e-05, + "loss": 0.6673, + "step": 120 + }, + { + "epoch": 0.013207355481052525, + "grad_norm": 4.5, + "learning_rate": 4.9998830329663314e-05, + "loss": 0.6792, + "step": 130 + }, + { + "epoch": 0.01422330590267195, + "grad_norm": 3.9375, + "learning_rate": 4.9997920598679756e-05, + "loss": 0.6207, + "step": 140 + }, + { + "epoch": 0.015239256324291375, + "grad_norm": 3.15625, + "learning_rate": 4.999675096077286e-05, + "loss": 0.483, + "step": 150 + }, + { + "epoch": 0.0162552067459108, + "grad_norm": 5.28125, + "learning_rate": 4.999532142810354e-05, + "loss": 0.5319, + "step": 160 + }, + { + "epoch": 0.017271157167530225, + "grad_norm": 4.59375, + "learning_rate": 4.999363201553483e-05, + "loss": 0.6052, + "step": 170 + }, + { + "epoch": 0.01828710758914965, + "grad_norm": 5.03125, + "learning_rate": 4.9991682740631794e-05, + "loss": 0.4258, + "step": 180 + }, + { + "epoch": 0.019303058010769075, + "grad_norm": 3.859375, + "learning_rate": 4.998947362366133e-05, + "loss": 0.4309, + "step": 190 + }, + { + "epoch": 0.0203190084323885, + "grad_norm": 3.328125, + "learning_rate": 4.998700468759193e-05, + "loss": 0.3957, + "step": 200 + }, + { + "epoch": 0.021334958854007924, + "grad_norm": 4.9375, + "learning_rate": 4.9984275958093475e-05, + "loss": 0.4777, + "step": 210 + }, + { + "epoch": 0.02235090927562735, + "grad_norm": 4.78125, + "learning_rate": 4.998128746353695e-05, + "loss": 0.3549, + "step": 220 + }, + { + "epoch": 0.023366859697246774, + "grad_norm": 4.0625, + "learning_rate": 4.997803923499417e-05, + "loss": 0.4447, + "step": 230 + }, + { + "epoch": 0.0243828101188662, + "grad_norm": 6.375, + "learning_rate": 4.99745313062374e-05, + "loss": 0.3808, + "step": 240 + }, + { + "epoch": 0.025398760540485624, + "grad_norm": 3.59375, + "learning_rate": 4.99707637137391e-05, + "loss": 0.3827, + "step": 250 + }, + { + "epoch": 0.02641471096210505, + "grad_norm": 3.015625, + "learning_rate": 4.996673649667145e-05, + "loss": 0.3694, + "step": 260 + }, + { + "epoch": 0.027430661383724474, + "grad_norm": 2.296875, + "learning_rate": 4.9962449696906e-05, + "loss": 0.3586, + "step": 270 + }, + { + "epoch": 0.0284466118053439, + "grad_norm": 4.125, + "learning_rate": 4.9957903359013214e-05, + "loss": 0.3832, + "step": 280 + }, + { + "epoch": 0.029462562226963324, + "grad_norm": 3.296875, + "learning_rate": 4.995309753026201e-05, + "loss": 0.328, + "step": 290 + }, + { + "epoch": 0.03047851264858275, + "grad_norm": 4.5, + "learning_rate": 4.994803226061927e-05, + "loss": 0.3667, + "step": 300 + }, + { + "epoch": 0.03149446307020217, + "grad_norm": 4.3125, + "learning_rate": 4.994270760274933e-05, + "loss": 0.3811, + "step": 310 + }, + { + "epoch": 0.0325104134918216, + "grad_norm": 3.421875, + "learning_rate": 4.99371236120134e-05, + "loss": 0.3065, + "step": 320 + }, + { + "epoch": 0.03352636391344102, + "grad_norm": 4.6875, + "learning_rate": 4.993128034646902e-05, + "loss": 0.4177, + "step": 330 + }, + { + "epoch": 0.03454231433506045, + "grad_norm": 3.046875, + "learning_rate": 4.992517786686947e-05, + "loss": 0.33, + "step": 340 + }, + { + "epoch": 0.03555826475667987, + "grad_norm": 1.8828125, + "learning_rate": 4.9918816236663077e-05, + "loss": 0.3287, + "step": 350 + }, + { + "epoch": 0.0365742151782993, + "grad_norm": 3.8125, + "learning_rate": 4.991219552199262e-05, + "loss": 0.2934, + "step": 360 + }, + { + "epoch": 0.03759016559991872, + "grad_norm": 4.28125, + "learning_rate": 4.99053157916946e-05, + "loss": 0.3176, + "step": 370 + }, + { + "epoch": 0.03860611602153815, + "grad_norm": 2.609375, + "learning_rate": 4.989817711729856e-05, + "loss": 0.3318, + "step": 380 + }, + { + "epoch": 0.03962206644315757, + "grad_norm": 2.375, + "learning_rate": 4.98907795730263e-05, + "loss": 0.3234, + "step": 390 + }, + { + "epoch": 0.040638016864777, + "grad_norm": 4.46875, + "learning_rate": 4.988312323579114e-05, + "loss": 0.267, + "step": 400 + }, + { + "epoch": 0.04165396728639642, + "grad_norm": 3.75, + "learning_rate": 4.98752081851971e-05, + "loss": 0.3081, + "step": 410 + }, + { + "epoch": 0.04266991770801585, + "grad_norm": 2.203125, + "learning_rate": 4.986703450353809e-05, + "loss": 0.2917, + "step": 420 + }, + { + "epoch": 0.04368586812963527, + "grad_norm": 1.6015625, + "learning_rate": 4.985860227579703e-05, + "loss": 0.2805, + "step": 430 + }, + { + "epoch": 0.0447018185512547, + "grad_norm": 3.140625, + "learning_rate": 4.984991158964499e-05, + "loss": 0.3534, + "step": 440 + }, + { + "epoch": 0.04571776897287412, + "grad_norm": 3.296875, + "learning_rate": 4.9840962535440265e-05, + "loss": 0.335, + "step": 450 + }, + { + "epoch": 0.04673371939449355, + "grad_norm": 3.25, + "learning_rate": 4.983175520622744e-05, + "loss": 0.2544, + "step": 460 + }, + { + "epoch": 0.04774966981611297, + "grad_norm": 2.25, + "learning_rate": 4.982228969773642e-05, + "loss": 0.3449, + "step": 470 + }, + { + "epoch": 0.0487656202377324, + "grad_norm": 4.9375, + "learning_rate": 4.9812566108381435e-05, + "loss": 0.2964, + "step": 480 + }, + { + "epoch": 0.04978157065935182, + "grad_norm": 1.5703125, + "learning_rate": 4.9802584539260035e-05, + "loss": 0.2799, + "step": 490 + }, + { + "epoch": 0.05079752108097125, + "grad_norm": 2.828125, + "learning_rate": 4.979234509415199e-05, + "loss": 0.3231, + "step": 500 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 2.9375, + "learning_rate": 4.978184787951828e-05, + "loss": 0.2943, + "step": 510 + }, + { + "epoch": 0.0528294219242101, + "grad_norm": 2.34375, + "learning_rate": 4.977109300449992e-05, + "loss": 0.2705, + "step": 520 + }, + { + "epoch": 0.05384537234582952, + "grad_norm": 3.140625, + "learning_rate": 4.9760080580916876e-05, + "loss": 0.2998, + "step": 530 + }, + { + "epoch": 0.05486132276744895, + "grad_norm": 3.5625, + "learning_rate": 4.974881072326688e-05, + "loss": 0.2595, + "step": 540 + }, + { + "epoch": 0.05587727318906837, + "grad_norm": 4.25, + "learning_rate": 4.9737283548724236e-05, + "loss": 0.2803, + "step": 550 + }, + { + "epoch": 0.0568932236106878, + "grad_norm": 4.0625, + "learning_rate": 4.97254991771386e-05, + "loss": 0.3511, + "step": 560 + }, + { + "epoch": 0.05790917403230722, + "grad_norm": 2.515625, + "learning_rate": 4.971345773103377e-05, + "loss": 0.312, + "step": 570 + }, + { + "epoch": 0.05892512445392665, + "grad_norm": 3.21875, + "learning_rate": 4.9701159335606365e-05, + "loss": 0.2482, + "step": 580 + }, + { + "epoch": 0.05994107487554607, + "grad_norm": 5.5, + "learning_rate": 4.968860411872454e-05, + "loss": 0.2537, + "step": 590 + }, + { + "epoch": 0.0609570252971655, + "grad_norm": 3.546875, + "learning_rate": 4.967579221092666e-05, + "loss": 0.3125, + "step": 600 + }, + { + "epoch": 0.06197297571878492, + "grad_norm": 2.984375, + "learning_rate": 4.966272374541996e-05, + "loss": 0.2354, + "step": 610 + }, + { + "epoch": 0.06298892614040434, + "grad_norm": 3.6875, + "learning_rate": 4.964939885807912e-05, + "loss": 0.3213, + "step": 620 + }, + { + "epoch": 0.06400487656202378, + "grad_norm": 2.140625, + "learning_rate": 4.9635817687444876e-05, + "loss": 0.3003, + "step": 630 + }, + { + "epoch": 0.0650208269836432, + "grad_norm": 3.484375, + "learning_rate": 4.962198037472259e-05, + "loss": 0.2996, + "step": 640 + }, + { + "epoch": 0.06603677740526262, + "grad_norm": 3.21875, + "learning_rate": 4.9607887063780776e-05, + "loss": 0.2257, + "step": 650 + }, + { + "epoch": 0.06705272782688204, + "grad_norm": 5.375, + "learning_rate": 4.9593537901149564e-05, + "loss": 0.223, + "step": 660 + }, + { + "epoch": 0.06806867824850148, + "grad_norm": 4.1875, + "learning_rate": 4.957893303601924e-05, + "loss": 0.3407, + "step": 670 + }, + { + "epoch": 0.0690846286701209, + "grad_norm": 3.328125, + "learning_rate": 4.956407262023866e-05, + "loss": 0.2589, + "step": 680 + }, + { + "epoch": 0.07010057909174032, + "grad_norm": 2.953125, + "learning_rate": 4.954895680831367e-05, + "loss": 0.2949, + "step": 690 + }, + { + "epoch": 0.07111652951335974, + "grad_norm": 4.0625, + "learning_rate": 4.9533585757405506e-05, + "loss": 0.2995, + "step": 700 + }, + { + "epoch": 0.07213247993497918, + "grad_norm": 4.625, + "learning_rate": 4.951795962732917e-05, + "loss": 0.2894, + "step": 710 + }, + { + "epoch": 0.0731484303565986, + "grad_norm": 3.0, + "learning_rate": 4.9502078580551755e-05, + "loss": 0.3082, + "step": 720 + }, + { + "epoch": 0.07416438077821802, + "grad_norm": 3.65625, + "learning_rate": 4.9485942782190734e-05, + "loss": 0.2308, + "step": 730 + }, + { + "epoch": 0.07518033119983744, + "grad_norm": 4.78125, + "learning_rate": 4.9469552400012306e-05, + "loss": 0.2272, + "step": 740 + }, + { + "epoch": 0.07619628162145688, + "grad_norm": 4.25, + "learning_rate": 4.94529076044296e-05, + "loss": 0.2701, + "step": 750 + }, + { + "epoch": 0.0772122320430763, + "grad_norm": 3.140625, + "learning_rate": 4.94360085685009e-05, + "loss": 0.2686, + "step": 760 + }, + { + "epoch": 0.07822818246469572, + "grad_norm": 0.765625, + "learning_rate": 4.9418855467927894e-05, + "loss": 0.2051, + "step": 770 + }, + { + "epoch": 0.07924413288631514, + "grad_norm": 1.796875, + "learning_rate": 4.940144848105379e-05, + "loss": 0.2267, + "step": 780 + }, + { + "epoch": 0.08026008330793458, + "grad_norm": 4.5625, + "learning_rate": 4.93837877888615e-05, + "loss": 0.2597, + "step": 790 + }, + { + "epoch": 0.081276033729554, + "grad_norm": 3.03125, + "learning_rate": 4.9365873574971745e-05, + "loss": 0.3701, + "step": 800 + }, + { + "epoch": 0.08229198415117342, + "grad_norm": 4.5625, + "learning_rate": 4.9347706025641136e-05, + "loss": 0.2559, + "step": 810 + }, + { + "epoch": 0.08330793457279284, + "grad_norm": 3.90625, + "learning_rate": 4.9329285329760275e-05, + "loss": 0.2799, + "step": 820 + }, + { + "epoch": 0.08432388499441228, + "grad_norm": 3.140625, + "learning_rate": 4.9310611678851735e-05, + "loss": 0.2866, + "step": 830 + }, + { + "epoch": 0.0853398354160317, + "grad_norm": 2.46875, + "learning_rate": 4.929168526706811e-05, + "loss": 0.3105, + "step": 840 + }, + { + "epoch": 0.08635578583765112, + "grad_norm": 13.625, + "learning_rate": 4.927250629119e-05, + "loss": 0.2454, + "step": 850 + }, + { + "epoch": 0.08737173625927054, + "grad_norm": 3.921875, + "learning_rate": 4.9253074950623925e-05, + "loss": 0.2424, + "step": 860 + }, + { + "epoch": 0.08838768668088998, + "grad_norm": 2.90625, + "learning_rate": 4.9233391447400286e-05, + "loss": 0.2481, + "step": 870 + }, + { + "epoch": 0.0894036371025094, + "grad_norm": 2.96875, + "learning_rate": 4.921345598617125e-05, + "loss": 0.2231, + "step": 880 + }, + { + "epoch": 0.09041958752412882, + "grad_norm": 5.375, + "learning_rate": 4.9193268774208654e-05, + "loss": 0.3447, + "step": 890 + }, + { + "epoch": 0.09143553794574824, + "grad_norm": 2.0, + "learning_rate": 4.9172830021401785e-05, + "loss": 0.229, + "step": 900 + }, + { + "epoch": 0.09245148836736768, + "grad_norm": 3.1875, + "learning_rate": 4.9152139940255245e-05, + "loss": 0.2122, + "step": 910 + }, + { + "epoch": 0.0934674387889871, + "grad_norm": 3.40625, + "learning_rate": 4.913119874588677e-05, + "loss": 0.2386, + "step": 920 + }, + { + "epoch": 0.09448338921060652, + "grad_norm": 1.4609375, + "learning_rate": 4.911000665602489e-05, + "loss": 0.1944, + "step": 930 + }, + { + "epoch": 0.09549933963222594, + "grad_norm": 5.0625, + "learning_rate": 4.9088563891006786e-05, + "loss": 0.2038, + "step": 940 + }, + { + "epoch": 0.09651529005384538, + "grad_norm": 4.53125, + "learning_rate": 4.906687067377592e-05, + "loss": 0.3122, + "step": 950 + }, + { + "epoch": 0.0975312404754648, + "grad_norm": 2.84375, + "learning_rate": 4.904492722987976e-05, + "loss": 0.3157, + "step": 960 + }, + { + "epoch": 0.09854719089708422, + "grad_norm": 2.171875, + "learning_rate": 4.902273378746738e-05, + "loss": 0.3077, + "step": 970 + }, + { + "epoch": 0.09956314131870364, + "grad_norm": 2.84375, + "learning_rate": 4.9000290577287165e-05, + "loss": 0.2756, + "step": 980 + }, + { + "epoch": 0.10057909174032308, + "grad_norm": 0.99609375, + "learning_rate": 4.897759783268434e-05, + "loss": 0.2915, + "step": 990 + }, + { + "epoch": 0.1015950421619425, + "grad_norm": 3.53125, + "learning_rate": 4.895465578959859e-05, + "loss": 0.2052, + "step": 1000 + }, + { + "epoch": 0.10261099258356192, + "grad_norm": 4.0, + "learning_rate": 4.893146468656159e-05, + "loss": 0.2499, + "step": 1010 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 1.65625, + "learning_rate": 4.890802476469452e-05, + "loss": 0.278, + "step": 1020 + }, + { + "epoch": 0.10464289342680078, + "grad_norm": 3.625, + "learning_rate": 4.888433626770558e-05, + "loss": 0.2143, + "step": 1030 + }, + { + "epoch": 0.1056588438484202, + "grad_norm": 5.0625, + "learning_rate": 4.886039944188741e-05, + "loss": 0.2878, + "step": 1040 + }, + { + "epoch": 0.10667479427003962, + "grad_norm": 4.5, + "learning_rate": 4.883621453611461e-05, + "loss": 0.2744, + "step": 1050 + }, + { + "epoch": 0.10769074469165904, + "grad_norm": 4.5625, + "learning_rate": 4.881178180184106e-05, + "loss": 0.2734, + "step": 1060 + }, + { + "epoch": 0.10870669511327848, + "grad_norm": 3.125, + "learning_rate": 4.878710149309735e-05, + "loss": 0.3574, + "step": 1070 + }, + { + "epoch": 0.1097226455348979, + "grad_norm": 3.0625, + "learning_rate": 4.876217386648816e-05, + "loss": 0.2625, + "step": 1080 + }, + { + "epoch": 0.11073859595651732, + "grad_norm": 4.0625, + "learning_rate": 4.873699918118955e-05, + "loss": 0.2437, + "step": 1090 + }, + { + "epoch": 0.11175454637813674, + "grad_norm": 1.59375, + "learning_rate": 4.87115776989463e-05, + "loss": 0.2051, + "step": 1100 + }, + { + "epoch": 0.11277049679975618, + "grad_norm": 4.375, + "learning_rate": 4.8685909684069153e-05, + "loss": 0.1727, + "step": 1110 + }, + { + "epoch": 0.1137864472213756, + "grad_norm": 2.28125, + "learning_rate": 4.865999540343211e-05, + "loss": 0.2256, + "step": 1120 + }, + { + "epoch": 0.11480239764299502, + "grad_norm": 2.265625, + "learning_rate": 4.86338351264696e-05, + "loss": 0.3529, + "step": 1130 + }, + { + "epoch": 0.11581834806461444, + "grad_norm": 2.34375, + "learning_rate": 4.8607429125173754e-05, + "loss": 0.2113, + "step": 1140 + }, + { + "epoch": 0.11683429848623388, + "grad_norm": 0.7578125, + "learning_rate": 4.858077767409149e-05, + "loss": 0.2759, + "step": 1150 + }, + { + "epoch": 0.1178502489078533, + "grad_norm": 3.640625, + "learning_rate": 4.855388105032174e-05, + "loss": 0.2482, + "step": 1160 + }, + { + "epoch": 0.11886619932947272, + "grad_norm": 3.5, + "learning_rate": 4.852673953351249e-05, + "loss": 0.1865, + "step": 1170 + }, + { + "epoch": 0.11988214975109214, + "grad_norm": 3.75, + "learning_rate": 4.849935340585796e-05, + "loss": 0.2659, + "step": 1180 + }, + { + "epoch": 0.12089810017271158, + "grad_norm": 3.375, + "learning_rate": 4.8471722952095586e-05, + "loss": 0.1506, + "step": 1190 + }, + { + "epoch": 0.121914050594331, + "grad_norm": 3.34375, + "learning_rate": 4.844384845950312e-05, + "loss": 0.307, + "step": 1200 + }, + { + "epoch": 0.12293000101595042, + "grad_norm": 1.578125, + "learning_rate": 4.841573021789561e-05, + "loss": 0.1952, + "step": 1210 + }, + { + "epoch": 0.12394595143756984, + "grad_norm": 1.2890625, + "learning_rate": 4.838736851962239e-05, + "loss": 0.1779, + "step": 1220 + }, + { + "epoch": 0.12496190185918928, + "grad_norm": 1.265625, + "learning_rate": 4.835876365956408e-05, + "loss": 0.1235, + "step": 1230 + }, + { + "epoch": 0.12597785228080868, + "grad_norm": 1.9609375, + "learning_rate": 4.8329915935129436e-05, + "loss": 0.1876, + "step": 1240 + }, + { + "epoch": 0.12699380270242813, + "grad_norm": 1.6328125, + "learning_rate": 4.830082564625235e-05, + "loss": 0.2188, + "step": 1250 + }, + { + "epoch": 0.12800975312404755, + "grad_norm": 3.96875, + "learning_rate": 4.8271493095388684e-05, + "loss": 0.2622, + "step": 1260 + }, + { + "epoch": 0.12902570354566698, + "grad_norm": 3.765625, + "learning_rate": 4.824191858751312e-05, + "loss": 0.2724, + "step": 1270 + }, + { + "epoch": 0.1300416539672864, + "grad_norm": 5.59375, + "learning_rate": 4.821210243011601e-05, + "loss": 0.2413, + "step": 1280 + }, + { + "epoch": 0.13105760438890582, + "grad_norm": 3.34375, + "learning_rate": 4.818204493320016e-05, + "loss": 0.2618, + "step": 1290 + }, + { + "epoch": 0.13207355481052524, + "grad_norm": 2.78125, + "learning_rate": 4.8151746409277634e-05, + "loss": 0.2295, + "step": 1300 + }, + { + "epoch": 0.13308950523214466, + "grad_norm": 3.1875, + "learning_rate": 4.8121207173366484e-05, + "loss": 0.2733, + "step": 1310 + }, + { + "epoch": 0.13410545565376408, + "grad_norm": 2.28125, + "learning_rate": 4.809042754298746e-05, + "loss": 0.2311, + "step": 1320 + }, + { + "epoch": 0.13512140607538353, + "grad_norm": 2.171875, + "learning_rate": 4.805940783816075e-05, + "loss": 0.2059, + "step": 1330 + }, + { + "epoch": 0.13613735649700295, + "grad_norm": 2.796875, + "learning_rate": 4.8028148381402625e-05, + "loss": 0.2102, + "step": 1340 + }, + { + "epoch": 0.13715330691862238, + "grad_norm": 2.96875, + "learning_rate": 4.7996649497722084e-05, + "loss": 0.2708, + "step": 1350 + }, + { + "epoch": 0.1381692573402418, + "grad_norm": 2.4375, + "learning_rate": 4.7964911514617485e-05, + "loss": 0.2429, + "step": 1360 + }, + { + "epoch": 0.13918520776186122, + "grad_norm": 5.8125, + "learning_rate": 4.793293476207312e-05, + "loss": 0.2725, + "step": 1370 + }, + { + "epoch": 0.14020115818348064, + "grad_norm": 2.40625, + "learning_rate": 4.790071957255585e-05, + "loss": 0.2098, + "step": 1380 + }, + { + "epoch": 0.14121710860510006, + "grad_norm": 4.25, + "learning_rate": 4.786826628101154e-05, + "loss": 0.2101, + "step": 1390 + }, + { + "epoch": 0.14223305902671948, + "grad_norm": 2.578125, + "learning_rate": 4.783557522486167e-05, + "loss": 0.2624, + "step": 1400 + }, + { + "epoch": 0.14324900944833893, + "grad_norm": 3.125, + "learning_rate": 4.780264674399978e-05, + "loss": 0.2518, + "step": 1410 + }, + { + "epoch": 0.14426495986995835, + "grad_norm": 3.671875, + "learning_rate": 4.7769481180787966e-05, + "loss": 0.3112, + "step": 1420 + }, + { + "epoch": 0.14528091029157778, + "grad_norm": 3.984375, + "learning_rate": 4.773607888005327e-05, + "loss": 0.2747, + "step": 1430 + }, + { + "epoch": 0.1462968607131972, + "grad_norm": 3.234375, + "learning_rate": 4.770244018908416e-05, + "loss": 0.1572, + "step": 1440 + }, + { + "epoch": 0.14731281113481662, + "grad_norm": 4.09375, + "learning_rate": 4.766856545762687e-05, + "loss": 0.2148, + "step": 1450 + }, + { + "epoch": 0.14832876155643604, + "grad_norm": 1.6875, + "learning_rate": 4.763445503788178e-05, + "loss": 0.2531, + "step": 1460 + }, + { + "epoch": 0.14934471197805546, + "grad_norm": 2.375, + "learning_rate": 4.760010928449976e-05, + "loss": 0.199, + "step": 1470 + }, + { + "epoch": 0.15036066239967488, + "grad_norm": 4.6875, + "learning_rate": 4.7565528554578485e-05, + "loss": 0.2366, + "step": 1480 + }, + { + "epoch": 0.15137661282129433, + "grad_norm": 5.4375, + "learning_rate": 4.75307132076587e-05, + "loss": 0.1862, + "step": 1490 + }, + { + "epoch": 0.15239256324291375, + "grad_norm": 2.484375, + "learning_rate": 4.749566360572049e-05, + "loss": 0.2143, + "step": 1500 + }, + { + "epoch": 0.15340851366453317, + "grad_norm": 2.1875, + "learning_rate": 4.746038011317955e-05, + "loss": 0.1877, + "step": 1510 + }, + { + "epoch": 0.1544244640861526, + "grad_norm": 2.84375, + "learning_rate": 4.742486309688333e-05, + "loss": 0.2831, + "step": 1520 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 2.015625, + "learning_rate": 4.738911292610732e-05, + "loss": 0.1708, + "step": 1530 + }, + { + "epoch": 0.15645636492939144, + "grad_norm": 3.953125, + "learning_rate": 4.735312997255107e-05, + "loss": 0.192, + "step": 1540 + }, + { + "epoch": 0.15747231535101086, + "grad_norm": 2.09375, + "learning_rate": 4.7316914610334475e-05, + "loss": 0.2586, + "step": 1550 + }, + { + "epoch": 0.15848826577263028, + "grad_norm": 3.6875, + "learning_rate": 4.728046721599378e-05, + "loss": 0.2141, + "step": 1560 + }, + { + "epoch": 0.15950421619424973, + "grad_norm": 2.9375, + "learning_rate": 4.724378816847771e-05, + "loss": 0.193, + "step": 1570 + }, + { + "epoch": 0.16052016661586915, + "grad_norm": 1.5625, + "learning_rate": 4.720687784914352e-05, + "loss": 0.191, + "step": 1580 + }, + { + "epoch": 0.16153611703748857, + "grad_norm": 3.75, + "learning_rate": 4.716973664175304e-05, + "loss": 0.2172, + "step": 1590 + }, + { + "epoch": 0.162552067459108, + "grad_norm": 3.125, + "learning_rate": 4.7132364932468645e-05, + "loss": 0.2134, + "step": 1600 + }, + { + "epoch": 0.16356801788072742, + "grad_norm": 4.09375, + "learning_rate": 4.709476310984932e-05, + "loss": 0.2055, + "step": 1610 + }, + { + "epoch": 0.16458396830234684, + "grad_norm": 3.875, + "learning_rate": 4.705693156484652e-05, + "loss": 0.2136, + "step": 1620 + }, + { + "epoch": 0.16559991872396626, + "grad_norm": 1.1796875, + "learning_rate": 4.7018870690800196e-05, + "loss": 0.1471, + "step": 1630 + }, + { + "epoch": 0.16661586914558568, + "grad_norm": 2.5, + "learning_rate": 4.698058088343465e-05, + "loss": 0.2308, + "step": 1640 + }, + { + "epoch": 0.16763181956720513, + "grad_norm": 1.390625, + "learning_rate": 4.6942062540854425e-05, + "loss": 0.2456, + "step": 1650 + }, + { + "epoch": 0.16864776998882455, + "grad_norm": 3.125, + "learning_rate": 4.69033160635402e-05, + "loss": 0.2654, + "step": 1660 + }, + { + "epoch": 0.16966372041044397, + "grad_norm": 3.984375, + "learning_rate": 4.6864341854344587e-05, + "loss": 0.2226, + "step": 1670 + }, + { + "epoch": 0.1706796708320634, + "grad_norm": 2.328125, + "learning_rate": 4.682514031848795e-05, + "loss": 0.2438, + "step": 1680 + }, + { + "epoch": 0.17169562125368282, + "grad_norm": 3.078125, + "learning_rate": 4.678571186355423e-05, + "loss": 0.1889, + "step": 1690 + }, + { + "epoch": 0.17271157167530224, + "grad_norm": 3.328125, + "learning_rate": 4.6746056899486644e-05, + "loss": 0.2117, + "step": 1700 + }, + { + "epoch": 0.17372752209692166, + "grad_norm": 2.78125, + "learning_rate": 4.67061758385835e-05, + "loss": 0.1953, + "step": 1710 + }, + { + "epoch": 0.17474347251854108, + "grad_norm": 3.09375, + "learning_rate": 4.6666069095493816e-05, + "loss": 0.1844, + "step": 1720 + }, + { + "epoch": 0.17575942294016053, + "grad_norm": 3.234375, + "learning_rate": 4.662573708721309e-05, + "loss": 0.2774, + "step": 1730 + }, + { + "epoch": 0.17677537336177995, + "grad_norm": 4.03125, + "learning_rate": 4.658518023307894e-05, + "loss": 0.2527, + "step": 1740 + }, + { + "epoch": 0.17779132378339937, + "grad_norm": 3.21875, + "learning_rate": 4.654439895476671e-05, + "loss": 0.2164, + "step": 1750 + }, + { + "epoch": 0.1788072742050188, + "grad_norm": 2.390625, + "learning_rate": 4.6503393676285146e-05, + "loss": 0.2424, + "step": 1760 + }, + { + "epoch": 0.17982322462663822, + "grad_norm": 1.8359375, + "learning_rate": 4.646216482397192e-05, + "loss": 0.2428, + "step": 1770 + }, + { + "epoch": 0.18083917504825764, + "grad_norm": 2.796875, + "learning_rate": 4.6420712826489275e-05, + "loss": 0.2155, + "step": 1780 + }, + { + "epoch": 0.18185512546987706, + "grad_norm": 0.69921875, + "learning_rate": 4.6379038114819485e-05, + "loss": 0.1544, + "step": 1790 + }, + { + "epoch": 0.18287107589149648, + "grad_norm": 3.40625, + "learning_rate": 4.6337141122260444e-05, + "loss": 0.2029, + "step": 1800 + }, + { + "epoch": 0.18388702631311593, + "grad_norm": 2.359375, + "learning_rate": 4.629502228442112e-05, + "loss": 0.1489, + "step": 1810 + }, + { + "epoch": 0.18490297673473535, + "grad_norm": 1.4453125, + "learning_rate": 4.6252682039217045e-05, + "loss": 0.2101, + "step": 1820 + }, + { + "epoch": 0.18591892715635477, + "grad_norm": 2.71875, + "learning_rate": 4.621012082686573e-05, + "loss": 0.2076, + "step": 1830 + }, + { + "epoch": 0.1869348775779742, + "grad_norm": 3.0625, + "learning_rate": 4.616733908988216e-05, + "loss": 0.2719, + "step": 1840 + }, + { + "epoch": 0.18795082799959362, + "grad_norm": 1.953125, + "learning_rate": 4.612433727307409e-05, + "loss": 0.2105, + "step": 1850 + }, + { + "epoch": 0.18896677842121304, + "grad_norm": 3.46875, + "learning_rate": 4.608111582353751e-05, + "loss": 0.1877, + "step": 1860 + }, + { + "epoch": 0.18998272884283246, + "grad_norm": 2.546875, + "learning_rate": 4.603767519065197e-05, + "loss": 0.2238, + "step": 1870 + }, + { + "epoch": 0.19099867926445188, + "grad_norm": 1.5703125, + "learning_rate": 4.599401582607589e-05, + "loss": 0.243, + "step": 1880 + }, + { + "epoch": 0.19201462968607133, + "grad_norm": 2.4375, + "learning_rate": 4.595013818374185e-05, + "loss": 0.1867, + "step": 1890 + }, + { + "epoch": 0.19303058010769075, + "grad_norm": 2.203125, + "learning_rate": 4.5906042719851925e-05, + "loss": 0.1994, + "step": 1900 + }, + { + "epoch": 0.19404653052931017, + "grad_norm": 3.984375, + "learning_rate": 4.586172989287291e-05, + "loss": 0.1899, + "step": 1910 + }, + { + "epoch": 0.1950624809509296, + "grad_norm": 2.6875, + "learning_rate": 4.5817200163531534e-05, + "loss": 0.2528, + "step": 1920 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 2.71875, + "learning_rate": 4.577245399480972e-05, + "loss": 0.2336, + "step": 1930 + }, + { + "epoch": 0.19709438179416844, + "grad_norm": 2.640625, + "learning_rate": 4.5727491851939715e-05, + "loss": 0.2204, + "step": 1940 + }, + { + "epoch": 0.19811033221578786, + "grad_norm": 1.78125, + "learning_rate": 4.568231420239929e-05, + "loss": 0.1656, + "step": 1950 + }, + { + "epoch": 0.19912628263740728, + "grad_norm": 3.15625, + "learning_rate": 4.563692151590687e-05, + "loss": 0.2105, + "step": 1960 + }, + { + "epoch": 0.20014223305902673, + "grad_norm": 1.3671875, + "learning_rate": 4.5591314264416666e-05, + "loss": 0.1464, + "step": 1970 + }, + { + "epoch": 0.20115818348064615, + "grad_norm": 4.25, + "learning_rate": 4.554549292211371e-05, + "loss": 0.2103, + "step": 1980 + }, + { + "epoch": 0.20217413390226557, + "grad_norm": 2.984375, + "learning_rate": 4.549945796540901e-05, + "loss": 0.144, + "step": 1990 + }, + { + "epoch": 0.203190084323885, + "grad_norm": 1.859375, + "learning_rate": 4.545320987293453e-05, + "loss": 0.1963, + "step": 2000 + }, + { + "epoch": 0.20420603474550442, + "grad_norm": 1.078125, + "learning_rate": 4.540674912553824e-05, + "loss": 0.2115, + "step": 2010 + }, + { + "epoch": 0.20522198516712384, + "grad_norm": 4.25, + "learning_rate": 4.536007620627911e-05, + "loss": 0.1682, + "step": 2020 + }, + { + "epoch": 0.20623793558874326, + "grad_norm": 2.71875, + "learning_rate": 4.531319160042212e-05, + "loss": 0.1992, + "step": 2030 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 1.2890625, + "learning_rate": 4.5266095795433126e-05, + "loss": 0.1134, + "step": 2040 + }, + { + "epoch": 0.20826983643198213, + "grad_norm": 3.296875, + "learning_rate": 4.5218789280973925e-05, + "loss": 0.1474, + "step": 2050 + }, + { + "epoch": 0.20928578685360155, + "grad_norm": 1.9375, + "learning_rate": 4.5171272548897024e-05, + "loss": 0.1955, + "step": 2060 + }, + { + "epoch": 0.21030173727522097, + "grad_norm": 2.734375, + "learning_rate": 4.512354609324063e-05, + "loss": 0.2042, + "step": 2070 + }, + { + "epoch": 0.2113176876968404, + "grad_norm": 2.921875, + "learning_rate": 4.507561041022347e-05, + "loss": 0.2174, + "step": 2080 + }, + { + "epoch": 0.21233363811845982, + "grad_norm": 2.40625, + "learning_rate": 4.502746599823963e-05, + "loss": 0.2634, + "step": 2090 + }, + { + "epoch": 0.21334958854007924, + "grad_norm": 1.71875, + "learning_rate": 4.497911335785339e-05, + "loss": 0.1884, + "step": 2100 + }, + { + "epoch": 0.21436553896169866, + "grad_norm": 0.79296875, + "learning_rate": 4.4930552991794e-05, + "loss": 0.1872, + "step": 2110 + }, + { + "epoch": 0.21538148938331808, + "grad_norm": 3.171875, + "learning_rate": 4.4881785404950474e-05, + "loss": 0.2233, + "step": 2120 + }, + { + "epoch": 0.21639743980493753, + "grad_norm": 2.59375, + "learning_rate": 4.483281110436631e-05, + "loss": 0.2374, + "step": 2130 + }, + { + "epoch": 0.21741339022655695, + "grad_norm": 3.328125, + "learning_rate": 4.478363059923426e-05, + "loss": 0.2545, + "step": 2140 + }, + { + "epoch": 0.21842934064817637, + "grad_norm": 2.3125, + "learning_rate": 4.4734244400891014e-05, + "loss": 0.2063, + "step": 2150 + }, + { + "epoch": 0.2194452910697958, + "grad_norm": 3.40625, + "learning_rate": 4.4684653022811865e-05, + "loss": 0.1219, + "step": 2160 + }, + { + "epoch": 0.22046124149141522, + "grad_norm": 4.1875, + "learning_rate": 4.463485698060541e-05, + "loss": 0.2805, + "step": 2170 + }, + { + "epoch": 0.22147719191303464, + "grad_norm": 2.3125, + "learning_rate": 4.458485679200814e-05, + "loss": 0.1998, + "step": 2180 + }, + { + "epoch": 0.22249314233465406, + "grad_norm": 3.578125, + "learning_rate": 4.453465297687912e-05, + "loss": 0.2489, + "step": 2190 + }, + { + "epoch": 0.22350909275627348, + "grad_norm": 2.59375, + "learning_rate": 4.448424605719452e-05, + "loss": 0.2731, + "step": 2200 + }, + { + "epoch": 0.22452504317789293, + "grad_norm": 3.28125, + "learning_rate": 4.443363655704224e-05, + "loss": 0.2425, + "step": 2210 + }, + { + "epoch": 0.22554099359951235, + "grad_norm": 2.78125, + "learning_rate": 4.438282500261641e-05, + "loss": 0.2938, + "step": 2220 + }, + { + "epoch": 0.22655694402113177, + "grad_norm": 1.1953125, + "learning_rate": 4.433181192221197e-05, + "loss": 0.1728, + "step": 2230 + }, + { + "epoch": 0.2275728944427512, + "grad_norm": 1.34375, + "learning_rate": 4.4280597846219155e-05, + "loss": 0.216, + "step": 2240 + }, + { + "epoch": 0.22858884486437062, + "grad_norm": 1.8515625, + "learning_rate": 4.422918330711796e-05, + "loss": 0.1612, + "step": 2250 + }, + { + "epoch": 0.22960479528599004, + "grad_norm": 1.90625, + "learning_rate": 4.417756883947263e-05, + "loss": 0.107, + "step": 2260 + }, + { + "epoch": 0.23062074570760946, + "grad_norm": 3.375, + "learning_rate": 4.412575497992611e-05, + "loss": 0.1756, + "step": 2270 + }, + { + "epoch": 0.23163669612922888, + "grad_norm": 4.375, + "learning_rate": 4.407374226719445e-05, + "loss": 0.234, + "step": 2280 + }, + { + "epoch": 0.23265264655084833, + "grad_norm": 3.25, + "learning_rate": 4.402153124206119e-05, + "loss": 0.2144, + "step": 2290 + }, + { + "epoch": 0.23366859697246775, + "grad_norm": 1.703125, + "learning_rate": 4.396912244737173e-05, + "loss": 0.1696, + "step": 2300 + }, + { + "epoch": 0.23468454739408717, + "grad_norm": 2.84375, + "learning_rate": 4.391651642802778e-05, + "loss": 0.2506, + "step": 2310 + }, + { + "epoch": 0.2357004978157066, + "grad_norm": 4.5, + "learning_rate": 4.386371373098155e-05, + "loss": 0.1686, + "step": 2320 + }, + { + "epoch": 0.23671644823732602, + "grad_norm": 2.515625, + "learning_rate": 4.381071490523018e-05, + "loss": 0.2403, + "step": 2330 + }, + { + "epoch": 0.23773239865894544, + "grad_norm": 4.4375, + "learning_rate": 4.3757520501809955e-05, + "loss": 0.1611, + "step": 2340 + }, + { + "epoch": 0.23874834908056486, + "grad_norm": 1.609375, + "learning_rate": 4.370413107379065e-05, + "loss": 0.1698, + "step": 2350 + }, + { + "epoch": 0.23976429950218428, + "grad_norm": 4.96875, + "learning_rate": 4.36505471762697e-05, + "loss": 0.1928, + "step": 2360 + }, + { + "epoch": 0.24078024992380373, + "grad_norm": 0.8984375, + "learning_rate": 4.3596769366366474e-05, + "loss": 0.2035, + "step": 2370 + }, + { + "epoch": 0.24179620034542315, + "grad_norm": 5.75, + "learning_rate": 4.354279820321649e-05, + "loss": 0.16, + "step": 2380 + }, + { + "epoch": 0.24281215076704257, + "grad_norm": 1.9453125, + "learning_rate": 4.34886342479656e-05, + "loss": 0.1851, + "step": 2390 + }, + { + "epoch": 0.243828101188662, + "grad_norm": 1.015625, + "learning_rate": 4.34342780637641e-05, + "loss": 0.1726, + "step": 2400 + }, + { + "epoch": 0.24484405161028142, + "grad_norm": 4.59375, + "learning_rate": 4.337973021576095e-05, + "loss": 0.2847, + "step": 2410 + }, + { + "epoch": 0.24586000203190084, + "grad_norm": 1.03125, + "learning_rate": 4.3324991271097846e-05, + "loss": 0.2528, + "step": 2420 + }, + { + "epoch": 0.24687595245352026, + "grad_norm": 2.1875, + "learning_rate": 4.3270061798903374e-05, + "loss": 0.1573, + "step": 2430 + }, + { + "epoch": 0.24789190287513968, + "grad_norm": 0.98046875, + "learning_rate": 4.321494237028701e-05, + "loss": 0.1703, + "step": 2440 + }, + { + "epoch": 0.24890785329675913, + "grad_norm": 3.8125, + "learning_rate": 4.31596335583333e-05, + "loss": 0.2613, + "step": 2450 + }, + { + "epoch": 0.24992380371837855, + "grad_norm": 4.0625, + "learning_rate": 4.310413593809579e-05, + "loss": 0.22, + "step": 2460 + }, + { + "epoch": 0.250939754139998, + "grad_norm": 3.15625, + "learning_rate": 4.304845008659108e-05, + "loss": 0.1263, + "step": 2470 + }, + { + "epoch": 0.25195570456161737, + "grad_norm": 3.046875, + "learning_rate": 4.2992576582792895e-05, + "loss": 0.1639, + "step": 2480 + }, + { + "epoch": 0.2529716549832368, + "grad_norm": 9.8125, + "learning_rate": 4.293651600762595e-05, + "loss": 0.2681, + "step": 2490 + }, + { + "epoch": 0.25398760540485626, + "grad_norm": 3.734375, + "learning_rate": 4.288026894395999e-05, + "loss": 0.2292, + "step": 2500 + }, + { + "epoch": 0.25500355582647566, + "grad_norm": 0.455078125, + "learning_rate": 4.2823835976603723e-05, + "loss": 0.2324, + "step": 2510 + }, + { + "epoch": 0.2560195062480951, + "grad_norm": 5.625, + "learning_rate": 4.276721769229869e-05, + "loss": 0.1834, + "step": 2520 + }, + { + "epoch": 0.2570354566697145, + "grad_norm": 1.3671875, + "learning_rate": 4.271041467971323e-05, + "loss": 0.1826, + "step": 2530 + }, + { + "epoch": 0.25805140709133395, + "grad_norm": 5.0625, + "learning_rate": 4.265342752943632e-05, + "loss": 0.2463, + "step": 2540 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 2.859375, + "learning_rate": 4.2596256833971425e-05, + "loss": 0.2598, + "step": 2550 + }, + { + "epoch": 0.2600833079345728, + "grad_norm": 1.8515625, + "learning_rate": 4.2538903187730374e-05, + "loss": 0.1148, + "step": 2560 + }, + { + "epoch": 0.26109925835619224, + "grad_norm": 2.71875, + "learning_rate": 4.248136718702716e-05, + "loss": 0.2123, + "step": 2570 + }, + { + "epoch": 0.26211520877781164, + "grad_norm": 4.5625, + "learning_rate": 4.242364943007172e-05, + "loss": 0.2369, + "step": 2580 + }, + { + "epoch": 0.2631311591994311, + "grad_norm": 2.296875, + "learning_rate": 4.236575051696377e-05, + "loss": 0.261, + "step": 2590 + }, + { + "epoch": 0.2641471096210505, + "grad_norm": 2.75, + "learning_rate": 4.2307671049686514e-05, + "loss": 0.1564, + "step": 2600 + }, + { + "epoch": 0.26516306004266993, + "grad_norm": 3.5, + "learning_rate": 4.2249411632100396e-05, + "loss": 0.1563, + "step": 2610 + }, + { + "epoch": 0.2661790104642893, + "grad_norm": 2.84375, + "learning_rate": 4.219097286993684e-05, + "loss": 0.1697, + "step": 2620 + }, + { + "epoch": 0.26719496088590877, + "grad_norm": 2.125, + "learning_rate": 4.2132355370791946e-05, + "loss": 0.1844, + "step": 2630 + }, + { + "epoch": 0.26821091130752817, + "grad_norm": 4.03125, + "learning_rate": 4.2073559744120156e-05, + "loss": 0.2144, + "step": 2640 + }, + { + "epoch": 0.2692268617291476, + "grad_norm": 2.375, + "learning_rate": 4.201458660122793e-05, + "loss": 0.2013, + "step": 2650 + }, + { + "epoch": 0.27024281215076706, + "grad_norm": 3.625, + "learning_rate": 4.1955436555267393e-05, + "loss": 0.2166, + "step": 2660 + }, + { + "epoch": 0.27125876257238646, + "grad_norm": 0.328125, + "learning_rate": 4.189611022122997e-05, + "loss": 0.1934, + "step": 2670 + }, + { + "epoch": 0.2722747129940059, + "grad_norm": 2.75, + "learning_rate": 4.1836608215939944e-05, + "loss": 0.2157, + "step": 2680 + }, + { + "epoch": 0.2732906634156253, + "grad_norm": 3.5, + "learning_rate": 4.17769311580481e-05, + "loss": 0.18, + "step": 2690 + }, + { + "epoch": 0.27430661383724475, + "grad_norm": 2.109375, + "learning_rate": 4.171707966802528e-05, + "loss": 0.2178, + "step": 2700 + }, + { + "epoch": 0.27532256425886414, + "grad_norm": 4.65625, + "learning_rate": 4.16570543681559e-05, + "loss": 0.1896, + "step": 2710 + }, + { + "epoch": 0.2763385146804836, + "grad_norm": 4.8125, + "learning_rate": 4.159685588253151e-05, + "loss": 0.1322, + "step": 2720 + }, + { + "epoch": 0.27735446510210304, + "grad_norm": 3.9375, + "learning_rate": 4.153648483704429e-05, + "loss": 0.184, + "step": 2730 + }, + { + "epoch": 0.27837041552372244, + "grad_norm": 4.53125, + "learning_rate": 4.147594185938057e-05, + "loss": 0.2451, + "step": 2740 + }, + { + "epoch": 0.2793863659453419, + "grad_norm": 1.0390625, + "learning_rate": 4.141522757901426e-05, + "loss": 0.2367, + "step": 2750 + }, + { + "epoch": 0.2804023163669613, + "grad_norm": 3.375, + "learning_rate": 4.1354342627200345e-05, + "loss": 0.179, + "step": 2760 + }, + { + "epoch": 0.28141826678858073, + "grad_norm": 2.953125, + "learning_rate": 4.1293287636968286e-05, + "loss": 0.1396, + "step": 2770 + }, + { + "epoch": 0.2824342172102001, + "grad_norm": 2.546875, + "learning_rate": 4.1232063243115485e-05, + "loss": 0.1963, + "step": 2780 + }, + { + "epoch": 0.28345016763181957, + "grad_norm": 5.09375, + "learning_rate": 4.117067008220063e-05, + "loss": 0.2457, + "step": 2790 + }, + { + "epoch": 0.28446611805343897, + "grad_norm": 2.046875, + "learning_rate": 4.110910879253712e-05, + "loss": 0.2262, + "step": 2800 + }, + { + "epoch": 0.2854820684750584, + "grad_norm": 2.1875, + "learning_rate": 4.104738001418641e-05, + "loss": 0.2499, + "step": 2810 + }, + { + "epoch": 0.28649801889667786, + "grad_norm": 2.59375, + "learning_rate": 4.098548438895135e-05, + "loss": 0.1667, + "step": 2820 + }, + { + "epoch": 0.28751396931829726, + "grad_norm": 2.875, + "learning_rate": 4.092342256036954e-05, + "loss": 0.2288, + "step": 2830 + }, + { + "epoch": 0.2885299197399167, + "grad_norm": 3.015625, + "learning_rate": 4.086119517370659e-05, + "loss": 0.2038, + "step": 2840 + }, + { + "epoch": 0.2895458701615361, + "grad_norm": 3.53125, + "learning_rate": 4.0798802875949485e-05, + "loss": 0.181, + "step": 2850 + }, + { + "epoch": 0.29056182058315555, + "grad_norm": 2.296875, + "learning_rate": 4.073624631579975e-05, + "loss": 0.1886, + "step": 2860 + }, + { + "epoch": 0.29157777100477494, + "grad_norm": 3.609375, + "learning_rate": 4.067352614366685e-05, + "loss": 0.2053, + "step": 2870 + }, + { + "epoch": 0.2925937214263944, + "grad_norm": 2.328125, + "learning_rate": 4.061064301166128e-05, + "loss": 0.1409, + "step": 2880 + }, + { + "epoch": 0.29360967184801384, + "grad_norm": 4.9375, + "learning_rate": 4.054759757358787e-05, + "loss": 0.184, + "step": 2890 + }, + { + "epoch": 0.29462562226963324, + "grad_norm": 4.6875, + "learning_rate": 4.048439048493898e-05, + "loss": 0.2306, + "step": 2900 + }, + { + "epoch": 0.2956415726912527, + "grad_norm": 4.09375, + "learning_rate": 4.0421022402887676e-05, + "loss": 0.1914, + "step": 2910 + }, + { + "epoch": 0.2966575231128721, + "grad_norm": 2.3125, + "learning_rate": 4.035749398628088e-05, + "loss": 0.1653, + "step": 2920 + }, + { + "epoch": 0.29767347353449153, + "grad_norm": 2.515625, + "learning_rate": 4.029380589563256e-05, + "loss": 0.1941, + "step": 2930 + }, + { + "epoch": 0.2986894239561109, + "grad_norm": 1.78125, + "learning_rate": 4.02299587931168e-05, + "loss": 0.1117, + "step": 2940 + }, + { + "epoch": 0.29970537437773037, + "grad_norm": 0.8359375, + "learning_rate": 4.0165953342560974e-05, + "loss": 0.1605, + "step": 2950 + }, + { + "epoch": 0.30072132479934977, + "grad_norm": 3.046875, + "learning_rate": 4.010179020943884e-05, + "loss": 0.1726, + "step": 2960 + }, + { + "epoch": 0.3017372752209692, + "grad_norm": 3.453125, + "learning_rate": 4.003747006086357e-05, + "loss": 0.2208, + "step": 2970 + }, + { + "epoch": 0.30275322564258866, + "grad_norm": 2.515625, + "learning_rate": 3.9972993565580866e-05, + "loss": 0.1325, + "step": 2980 + }, + { + "epoch": 0.30376917606420806, + "grad_norm": 3.046875, + "learning_rate": 3.9908361393962e-05, + "loss": 0.2014, + "step": 2990 + }, + { + "epoch": 0.3047851264858275, + "grad_norm": 2.28125, + "learning_rate": 3.984357421799681e-05, + "loss": 0.165, + "step": 3000 + }, + { + "epoch": 0.3058010769074469, + "grad_norm": 5.09375, + "learning_rate": 3.9778632711286756e-05, + "loss": 0.212, + "step": 3010 + }, + { + "epoch": 0.30681702732906635, + "grad_norm": 4.25, + "learning_rate": 3.971353754903788e-05, + "loss": 0.2388, + "step": 3020 + }, + { + "epoch": 0.30783297775068574, + "grad_norm": 2.34375, + "learning_rate": 3.964828940805381e-05, + "loss": 0.2175, + "step": 3030 + }, + { + "epoch": 0.3088489281723052, + "grad_norm": 4.09375, + "learning_rate": 3.95828889667287e-05, + "loss": 0.2088, + "step": 3040 + }, + { + "epoch": 0.30986487859392464, + "grad_norm": 2.359375, + "learning_rate": 3.9517336905040244e-05, + "loss": 0.1913, + "step": 3050 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 1.1640625, + "learning_rate": 3.9451633904542483e-05, + "loss": 0.2185, + "step": 3060 + }, + { + "epoch": 0.3118967794371635, + "grad_norm": 2.59375, + "learning_rate": 3.9385780648358846e-05, + "loss": 0.2072, + "step": 3070 + }, + { + "epoch": 0.3129127298587829, + "grad_norm": 3.015625, + "learning_rate": 3.9319777821174955e-05, + "loss": 0.1902, + "step": 3080 + }, + { + "epoch": 0.31392868028040233, + "grad_norm": 2.375, + "learning_rate": 3.925362610923158e-05, + "loss": 0.259, + "step": 3090 + }, + { + "epoch": 0.3149446307020217, + "grad_norm": 4.65625, + "learning_rate": 3.918732620031742e-05, + "loss": 0.2026, + "step": 3100 + }, + { + "epoch": 0.31596058112364117, + "grad_norm": 2.1875, + "learning_rate": 3.912087878376205e-05, + "loss": 0.1478, + "step": 3110 + }, + { + "epoch": 0.31697653154526056, + "grad_norm": 2.34375, + "learning_rate": 3.905428455042865e-05, + "loss": 0.167, + "step": 3120 + }, + { + "epoch": 0.31799248196688, + "grad_norm": 2.390625, + "learning_rate": 3.898754419270693e-05, + "loss": 0.1629, + "step": 3130 + }, + { + "epoch": 0.31900843238849946, + "grad_norm": 1.546875, + "learning_rate": 3.892065840450583e-05, + "loss": 0.1308, + "step": 3140 + }, + { + "epoch": 0.32002438281011886, + "grad_norm": 4.625, + "learning_rate": 3.885362788124637e-05, + "loss": 0.2008, + "step": 3150 + }, + { + "epoch": 0.3210403332317383, + "grad_norm": 3.8125, + "learning_rate": 3.8786453319854396e-05, + "loss": 0.2225, + "step": 3160 + }, + { + "epoch": 0.3220562836533577, + "grad_norm": 3.015625, + "learning_rate": 3.8719135418753366e-05, + "loss": 0.2243, + "step": 3170 + }, + { + "epoch": 0.32307223407497715, + "grad_norm": 5.6875, + "learning_rate": 3.865167487785702e-05, + "loss": 0.1981, + "step": 3180 + }, + { + "epoch": 0.32408818449659654, + "grad_norm": 4.84375, + "learning_rate": 3.8584072398562164e-05, + "loss": 0.2031, + "step": 3190 + }, + { + "epoch": 0.325104134918216, + "grad_norm": 4.0625, + "learning_rate": 3.851632868374136e-05, + "loss": 0.1621, + "step": 3200 + }, + { + "epoch": 0.32612008533983544, + "grad_norm": 3.421875, + "learning_rate": 3.844844443773562e-05, + "loss": 0.1674, + "step": 3210 + }, + { + "epoch": 0.32713603576145484, + "grad_norm": 1.3671875, + "learning_rate": 3.8380420366347046e-05, + "loss": 0.1502, + "step": 3220 + }, + { + "epoch": 0.3281519861830743, + "grad_norm": 3.734375, + "learning_rate": 3.831225717683157e-05, + "loss": 0.1868, + "step": 3230 + }, + { + "epoch": 0.3291679366046937, + "grad_norm": 2.703125, + "learning_rate": 3.8243955577891534e-05, + "loss": 0.1818, + "step": 3240 + }, + { + "epoch": 0.3301838870263131, + "grad_norm": 3.796875, + "learning_rate": 3.8175516279668335e-05, + "loss": 0.2215, + "step": 3250 + }, + { + "epoch": 0.3311998374479325, + "grad_norm": 3.203125, + "learning_rate": 3.810693999373505e-05, + "loss": 0.2544, + "step": 3260 + }, + { + "epoch": 0.33221578786955197, + "grad_norm": 4.0, + "learning_rate": 3.8038227433089056e-05, + "loss": 0.1175, + "step": 3270 + }, + { + "epoch": 0.33323173829117136, + "grad_norm": 3.625, + "learning_rate": 3.796937931214458e-05, + "loss": 0.2213, + "step": 3280 + }, + { + "epoch": 0.3342476887127908, + "grad_norm": 1.7265625, + "learning_rate": 3.7900396346725296e-05, + "loss": 0.1711, + "step": 3290 + }, + { + "epoch": 0.33526363913441026, + "grad_norm": 3.140625, + "learning_rate": 3.783127925405686e-05, + "loss": 0.2628, + "step": 3300 + }, + { + "epoch": 0.33627958955602966, + "grad_norm": 2.1875, + "learning_rate": 3.77620287527595e-05, + "loss": 0.1671, + "step": 3310 + }, + { + "epoch": 0.3372955399776491, + "grad_norm": 5.28125, + "learning_rate": 3.769264556284048e-05, + "loss": 0.2109, + "step": 3320 + }, + { + "epoch": 0.3383114903992685, + "grad_norm": 2.875, + "learning_rate": 3.762313040568665e-05, + "loss": 0.1978, + "step": 3330 + }, + { + "epoch": 0.33932744082088795, + "grad_norm": 2.234375, + "learning_rate": 3.755348400405697e-05, + "loss": 0.1275, + "step": 3340 + }, + { + "epoch": 0.34034339124250734, + "grad_norm": 1.9453125, + "learning_rate": 3.7483707082074945e-05, + "loss": 0.1482, + "step": 3350 + }, + { + "epoch": 0.3413593416641268, + "grad_norm": 5.40625, + "learning_rate": 3.741380036522111e-05, + "loss": 0.1933, + "step": 3360 + }, + { + "epoch": 0.34237529208574624, + "grad_norm": 4.53125, + "learning_rate": 3.734376458032551e-05, + "loss": 0.1925, + "step": 3370 + }, + { + "epoch": 0.34339124250736563, + "grad_norm": 4.0625, + "learning_rate": 3.727360045556014e-05, + "loss": 0.2297, + "step": 3380 + }, + { + "epoch": 0.3444071929289851, + "grad_norm": 2.53125, + "learning_rate": 3.7203308720431336e-05, + "loss": 0.1704, + "step": 3390 + }, + { + "epoch": 0.3454231433506045, + "grad_norm": 1.859375, + "learning_rate": 3.7132890105772234e-05, + "loss": 0.258, + "step": 3400 + }, + { + "epoch": 0.3464390937722239, + "grad_norm": 3.90625, + "learning_rate": 3.706234534373515e-05, + "loss": 0.2376, + "step": 3410 + }, + { + "epoch": 0.3474550441938433, + "grad_norm": 1.1015625, + "learning_rate": 3.6991675167783985e-05, + "loss": 0.2403, + "step": 3420 + }, + { + "epoch": 0.34847099461546277, + "grad_norm": 1.1640625, + "learning_rate": 3.6920880312686556e-05, + "loss": 0.1642, + "step": 3430 + }, + { + "epoch": 0.34948694503708216, + "grad_norm": 2.875, + "learning_rate": 3.684996151450702e-05, + "loss": 0.1455, + "step": 3440 + }, + { + "epoch": 0.3505028954587016, + "grad_norm": 0.59765625, + "learning_rate": 3.6778919510598155e-05, + "loss": 0.2175, + "step": 3450 + }, + { + "epoch": 0.35151884588032106, + "grad_norm": 0.93359375, + "learning_rate": 3.670775503959376e-05, + "loss": 0.1858, + "step": 3460 + }, + { + "epoch": 0.35253479630194046, + "grad_norm": 4.1875, + "learning_rate": 3.6636468841400917e-05, + "loss": 0.1911, + "step": 3470 + }, + { + "epoch": 0.3535507467235599, + "grad_norm": 3.734375, + "learning_rate": 3.656506165719233e-05, + "loss": 0.2114, + "step": 3480 + }, + { + "epoch": 0.3545666971451793, + "grad_norm": 1.171875, + "learning_rate": 3.649353422939863e-05, + "loss": 0.1841, + "step": 3490 + }, + { + "epoch": 0.35558264756679875, + "grad_norm": 2.53125, + "learning_rate": 3.6421887301700615e-05, + "loss": 0.1505, + "step": 3500 + }, + { + "epoch": 0.35659859798841814, + "grad_norm": 4.9375, + "learning_rate": 3.6350121619021524e-05, + "loss": 0.2625, + "step": 3510 + }, + { + "epoch": 0.3576145484100376, + "grad_norm": 5.25, + "learning_rate": 3.627823792751936e-05, + "loss": 0.1676, + "step": 3520 + }, + { + "epoch": 0.35863049883165704, + "grad_norm": 1.09375, + "learning_rate": 3.620623697457905e-05, + "loss": 0.1963, + "step": 3530 + }, + { + "epoch": 0.35964644925327643, + "grad_norm": 4.03125, + "learning_rate": 3.613411950880468e-05, + "loss": 0.2048, + "step": 3540 + }, + { + "epoch": 0.3606623996748959, + "grad_norm": 4.40625, + "learning_rate": 3.606188628001178e-05, + "loss": 0.226, + "step": 3550 + }, + { + "epoch": 0.3616783500965153, + "grad_norm": 2.375, + "learning_rate": 3.598953803921947e-05, + "loss": 0.1884, + "step": 3560 + }, + { + "epoch": 0.3626943005181347, + "grad_norm": 3.21875, + "learning_rate": 3.591707553864266e-05, + "loss": 0.224, + "step": 3570 + }, + { + "epoch": 0.3637102509397541, + "grad_norm": 3.5625, + "learning_rate": 3.584449953168423e-05, + "loss": 0.1866, + "step": 3580 + }, + { + "epoch": 0.36472620136137357, + "grad_norm": 2.359375, + "learning_rate": 3.577181077292722e-05, + "loss": 0.1663, + "step": 3590 + }, + { + "epoch": 0.36574215178299296, + "grad_norm": 5.0, + "learning_rate": 3.569901001812696e-05, + "loss": 0.2032, + "step": 3600 + }, + { + "epoch": 0.3667581022046124, + "grad_norm": 1.953125, + "learning_rate": 3.562609802420321e-05, + "loss": 0.2395, + "step": 3610 + }, + { + "epoch": 0.36777405262623186, + "grad_norm": 3.796875, + "learning_rate": 3.555307554923229e-05, + "loss": 0.1799, + "step": 3620 + }, + { + "epoch": 0.36879000304785126, + "grad_norm": 4.4375, + "learning_rate": 3.547994335243925e-05, + "loss": 0.1771, + "step": 3630 + }, + { + "epoch": 0.3698059534694707, + "grad_norm": 1.890625, + "learning_rate": 3.540670219418989e-05, + "loss": 0.2123, + "step": 3640 + }, + { + "epoch": 0.3708219038910901, + "grad_norm": 4.03125, + "learning_rate": 3.53333528359829e-05, + "loss": 0.2159, + "step": 3650 + }, + { + "epoch": 0.37183785431270955, + "grad_norm": 3.265625, + "learning_rate": 3.525989604044198e-05, + "loss": 0.2749, + "step": 3660 + }, + { + "epoch": 0.37285380473432894, + "grad_norm": 1.4375, + "learning_rate": 3.5186332571307826e-05, + "loss": 0.1613, + "step": 3670 + }, + { + "epoch": 0.3738697551559484, + "grad_norm": 3.984375, + "learning_rate": 3.511266319343025e-05, + "loss": 0.1877, + "step": 3680 + }, + { + "epoch": 0.37488570557756784, + "grad_norm": 2.203125, + "learning_rate": 3.503888867276022e-05, + "loss": 0.2185, + "step": 3690 + }, + { + "epoch": 0.37590165599918723, + "grad_norm": 1.5078125, + "learning_rate": 3.4965009776341894e-05, + "loss": 0.2195, + "step": 3700 + }, + { + "epoch": 0.3769176064208067, + "grad_norm": 4.375, + "learning_rate": 3.489102727230461e-05, + "loss": 0.2344, + "step": 3710 + }, + { + "epoch": 0.3779335568424261, + "grad_norm": 2.984375, + "learning_rate": 3.481694192985496e-05, + "loss": 0.1863, + "step": 3720 + }, + { + "epoch": 0.3789495072640455, + "grad_norm": 1.1328125, + "learning_rate": 3.474275451926875e-05, + "loss": 0.1894, + "step": 3730 + }, + { + "epoch": 0.3799654576856649, + "grad_norm": 2.265625, + "learning_rate": 3.4668465811883e-05, + "loss": 0.2127, + "step": 3740 + }, + { + "epoch": 0.38098140810728437, + "grad_norm": 2.921875, + "learning_rate": 3.4594076580087914e-05, + "loss": 0.2125, + "step": 3750 + }, + { + "epoch": 0.38199735852890376, + "grad_norm": 2.390625, + "learning_rate": 3.451958759731889e-05, + "loss": 0.1801, + "step": 3760 + }, + { + "epoch": 0.3830133089505232, + "grad_norm": 3.046875, + "learning_rate": 3.4444999638048456e-05, + "loss": 0.1949, + "step": 3770 + }, + { + "epoch": 0.38402925937214266, + "grad_norm": 2.890625, + "learning_rate": 3.437031347777817e-05, + "loss": 0.2719, + "step": 3780 + }, + { + "epoch": 0.38504520979376206, + "grad_norm": 3.9375, + "learning_rate": 3.4295529893030634e-05, + "loss": 0.1697, + "step": 3790 + }, + { + "epoch": 0.3860611602153815, + "grad_norm": 2.0625, + "learning_rate": 3.422064966134138e-05, + "loss": 0.1557, + "step": 3800 + }, + { + "epoch": 0.3870771106370009, + "grad_norm": 2.234375, + "learning_rate": 3.4145673561250794e-05, + "loss": 0.2129, + "step": 3810 + }, + { + "epoch": 0.38809306105862035, + "grad_norm": 4.96875, + "learning_rate": 3.4070602372296e-05, + "loss": 0.2068, + "step": 3820 + }, + { + "epoch": 0.38910901148023974, + "grad_norm": 2.234375, + "learning_rate": 3.39954368750028e-05, + "loss": 0.1634, + "step": 3830 + }, + { + "epoch": 0.3901249619018592, + "grad_norm": 1.75, + "learning_rate": 3.392017785087752e-05, + "loss": 0.2299, + "step": 3840 + }, + { + "epoch": 0.39114091232347864, + "grad_norm": 3.90625, + "learning_rate": 3.38448260823989e-05, + "loss": 0.1585, + "step": 3850 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 2.8125, + "learning_rate": 3.376938235300996e-05, + "loss": 0.2382, + "step": 3860 + }, + { + "epoch": 0.3931728131667175, + "grad_norm": 5.375, + "learning_rate": 3.369384744710984e-05, + "loss": 0.1987, + "step": 3870 + }, + { + "epoch": 0.3941887635883369, + "grad_norm": 2.578125, + "learning_rate": 3.361822215004566e-05, + "loss": 0.2316, + "step": 3880 + }, + { + "epoch": 0.3952047140099563, + "grad_norm": 2.0, + "learning_rate": 3.354250724810436e-05, + "loss": 0.2019, + "step": 3890 + }, + { + "epoch": 0.3962206644315757, + "grad_norm": 2.3125, + "learning_rate": 3.34667035285045e-05, + "loss": 0.187, + "step": 3900 + }, + { + "epoch": 0.39723661485319517, + "grad_norm": 3.53125, + "learning_rate": 3.339081177938811e-05, + "loss": 0.2353, + "step": 3910 + }, + { + "epoch": 0.39825256527481456, + "grad_norm": 1.9609375, + "learning_rate": 3.331483278981244e-05, + "loss": 0.2078, + "step": 3920 + }, + { + "epoch": 0.399268515696434, + "grad_norm": 1.2109375, + "learning_rate": 3.323876734974183e-05, + "loss": 0.1761, + "step": 3930 + }, + { + "epoch": 0.40028446611805346, + "grad_norm": 4.0625, + "learning_rate": 3.316261625003943e-05, + "loss": 0.2081, + "step": 3940 + }, + { + "epoch": 0.40130041653967286, + "grad_norm": 1.953125, + "learning_rate": 3.308638028245902e-05, + "loss": 0.2087, + "step": 3950 + }, + { + "epoch": 0.4023163669612923, + "grad_norm": 2.390625, + "learning_rate": 3.301006023963676e-05, + "loss": 0.1579, + "step": 3960 + }, + { + "epoch": 0.4033323173829117, + "grad_norm": 3.53125, + "learning_rate": 3.293365691508295e-05, + "loss": 0.1904, + "step": 3970 + }, + { + "epoch": 0.40434826780453115, + "grad_norm": 3.0, + "learning_rate": 3.285717110317379e-05, + "loss": 0.1991, + "step": 3980 + }, + { + "epoch": 0.40536421822615054, + "grad_norm": 7.21875, + "learning_rate": 3.27806035991431e-05, + "loss": 0.1445, + "step": 3990 + }, + { + "epoch": 0.40638016864777, + "grad_norm": 1.0859375, + "learning_rate": 3.2703955199074075e-05, + "loss": 0.2393, + "step": 4000 + }, + { + "epoch": 0.40739611906938944, + "grad_norm": 4.5625, + "learning_rate": 3.262722669989098e-05, + "loss": 0.1789, + "step": 4010 + }, + { + "epoch": 0.40841206949100883, + "grad_norm": 3.09375, + "learning_rate": 3.255041889935092e-05, + "loss": 0.1511, + "step": 4020 + }, + { + "epoch": 0.4094280199126283, + "grad_norm": 1.90625, + "learning_rate": 3.247353259603547e-05, + "loss": 0.2066, + "step": 4030 + }, + { + "epoch": 0.4104439703342477, + "grad_norm": 2.28125, + "learning_rate": 3.239656858934242e-05, + "loss": 0.1564, + "step": 4040 + }, + { + "epoch": 0.4114599207558671, + "grad_norm": 2.609375, + "learning_rate": 3.231952767947746e-05, + "loss": 0.1503, + "step": 4050 + }, + { + "epoch": 0.4124758711774865, + "grad_norm": 1.4453125, + "learning_rate": 3.2242410667445844e-05, + "loss": 0.1633, + "step": 4060 + }, + { + "epoch": 0.41349182159910597, + "grad_norm": 3.015625, + "learning_rate": 3.2165218355044076e-05, + "loss": 0.1492, + "step": 4070 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 3.234375, + "learning_rate": 3.2087951544851566e-05, + "loss": 0.3051, + "step": 4080 + }, + { + "epoch": 0.4155237224423448, + "grad_norm": 2.9375, + "learning_rate": 3.20106110402223e-05, + "loss": 0.2229, + "step": 4090 + }, + { + "epoch": 0.41653967286396426, + "grad_norm": 3.171875, + "learning_rate": 3.1933197645276455e-05, + "loss": 0.2224, + "step": 4100 + }, + { + "epoch": 0.41755562328558365, + "grad_norm": 2.09375, + "learning_rate": 3.185571216489209e-05, + "loss": 0.1297, + "step": 4110 + }, + { + "epoch": 0.4185715737072031, + "grad_norm": 3.625, + "learning_rate": 3.177815540469669e-05, + "loss": 0.2074, + "step": 4120 + }, + { + "epoch": 0.4195875241288225, + "grad_norm": 2.296875, + "learning_rate": 3.1700528171058916e-05, + "loss": 0.1949, + "step": 4130 + }, + { + "epoch": 0.42060347455044195, + "grad_norm": 3.8125, + "learning_rate": 3.162283127108011e-05, + "loss": 0.1661, + "step": 4140 + }, + { + "epoch": 0.42161942497206134, + "grad_norm": 2.5, + "learning_rate": 3.154506551258594e-05, + "loss": 0.2275, + "step": 4150 + }, + { + "epoch": 0.4226353753936808, + "grad_norm": 2.96875, + "learning_rate": 3.146723170411804e-05, + "loss": 0.2242, + "step": 4160 + }, + { + "epoch": 0.42365132581530024, + "grad_norm": 6.625, + "learning_rate": 3.138933065492552e-05, + "loss": 0.1897, + "step": 4170 + }, + { + "epoch": 0.42466727623691963, + "grad_norm": 0.8515625, + "learning_rate": 3.131136317495665e-05, + "loss": 0.1629, + "step": 4180 + }, + { + "epoch": 0.4256832266585391, + "grad_norm": 0.94140625, + "learning_rate": 3.1233330074850364e-05, + "loss": 0.1535, + "step": 4190 + }, + { + "epoch": 0.4266991770801585, + "grad_norm": 2.6875, + "learning_rate": 3.115523216592786e-05, + "loss": 0.2494, + "step": 4200 + }, + { + "epoch": 0.4277151275017779, + "grad_norm": 2.578125, + "learning_rate": 3.107707026018417e-05, + "loss": 0.1705, + "step": 4210 + }, + { + "epoch": 0.4287310779233973, + "grad_norm": 3.0625, + "learning_rate": 3.09988451702797e-05, + "loss": 0.1507, + "step": 4220 + }, + { + "epoch": 0.42974702834501677, + "grad_norm": 2.421875, + "learning_rate": 3.0920557709531804e-05, + "loss": 0.3071, + "step": 4230 + }, + { + "epoch": 0.43076297876663616, + "grad_norm": 3.640625, + "learning_rate": 3.0842208691906306e-05, + "loss": 0.199, + "step": 4240 + }, + { + "epoch": 0.4317789291882556, + "grad_norm": 3.5625, + "learning_rate": 3.076379893200904e-05, + "loss": 0.1987, + "step": 4250 + }, + { + "epoch": 0.43279487960987506, + "grad_norm": 3.65625, + "learning_rate": 3.068532924507739e-05, + "loss": 0.1945, + "step": 4260 + }, + { + "epoch": 0.43381083003149445, + "grad_norm": 5.875, + "learning_rate": 3.060680044697183e-05, + "loss": 0.1937, + "step": 4270 + }, + { + "epoch": 0.4348267804531139, + "grad_norm": 2.859375, + "learning_rate": 3.052821335416739e-05, + "loss": 0.1643, + "step": 4280 + }, + { + "epoch": 0.4358427308747333, + "grad_norm": 3.296875, + "learning_rate": 3.0449568783745203e-05, + "loss": 0.1455, + "step": 4290 + }, + { + "epoch": 0.43685868129635275, + "grad_norm": 0.427734375, + "learning_rate": 3.0370867553384023e-05, + "loss": 0.1891, + "step": 4300 + }, + { + "epoch": 0.43787463171797214, + "grad_norm": 0.361328125, + "learning_rate": 3.029211048135171e-05, + "loss": 0.1377, + "step": 4310 + }, + { + "epoch": 0.4388905821395916, + "grad_norm": 1.8203125, + "learning_rate": 3.021329838649668e-05, + "loss": 0.2194, + "step": 4320 + }, + { + "epoch": 0.43990653256121104, + "grad_norm": 1.8828125, + "learning_rate": 3.0134432088239462e-05, + "loss": 0.1915, + "step": 4330 + }, + { + "epoch": 0.44092248298283043, + "grad_norm": 2.015625, + "learning_rate": 3.0055512406564146e-05, + "loss": 0.1794, + "step": 4340 + }, + { + "epoch": 0.4419384334044499, + "grad_norm": 2.546875, + "learning_rate": 2.9976540162009836e-05, + "loss": 0.2154, + "step": 4350 + }, + { + "epoch": 0.4429543838260693, + "grad_norm": 4.09375, + "learning_rate": 2.9897516175662155e-05, + "loss": 0.1861, + "step": 4360 + }, + { + "epoch": 0.4439703342476887, + "grad_norm": 3.953125, + "learning_rate": 2.9818441269144693e-05, + "loss": 0.1857, + "step": 4370 + }, + { + "epoch": 0.4449862846693081, + "grad_norm": 2.234375, + "learning_rate": 2.9739316264610452e-05, + "loss": 0.1493, + "step": 4380 + }, + { + "epoch": 0.44600223509092757, + "grad_norm": 1.109375, + "learning_rate": 2.966014198473332e-05, + "loss": 0.186, + "step": 4390 + }, + { + "epoch": 0.44701818551254696, + "grad_norm": 4.5625, + "learning_rate": 2.9580919252699502e-05, + "loss": 0.1963, + "step": 4400 + }, + { + "epoch": 0.4480341359341664, + "grad_norm": 7.3125, + "learning_rate": 2.9501648892198984e-05, + "loss": 0.2882, + "step": 4410 + }, + { + "epoch": 0.44905008635578586, + "grad_norm": 3.03125, + "learning_rate": 2.942233172741693e-05, + "loss": 0.2154, + "step": 4420 + }, + { + "epoch": 0.45006603677740525, + "grad_norm": 2.421875, + "learning_rate": 2.934296858302515e-05, + "loss": 0.2228, + "step": 4430 + }, + { + "epoch": 0.4510819871990247, + "grad_norm": 1.6015625, + "learning_rate": 2.9263560284173485e-05, + "loss": 0.1637, + "step": 4440 + }, + { + "epoch": 0.4520979376206441, + "grad_norm": 4.5, + "learning_rate": 2.91841076564813e-05, + "loss": 0.1396, + "step": 4450 + }, + { + "epoch": 0.45311388804226355, + "grad_norm": 1.9609375, + "learning_rate": 2.9104611526028808e-05, + "loss": 0.186, + "step": 4460 + }, + { + "epoch": 0.45412983846388294, + "grad_norm": 2.046875, + "learning_rate": 2.902507271934855e-05, + "loss": 0.1706, + "step": 4470 + }, + { + "epoch": 0.4551457888855024, + "grad_norm": 2.390625, + "learning_rate": 2.8945492063416768e-05, + "loss": 0.2191, + "step": 4480 + }, + { + "epoch": 0.45616173930712184, + "grad_norm": 2.734375, + "learning_rate": 2.8865870385644823e-05, + "loss": 0.1651, + "step": 4490 + }, + { + "epoch": 0.45717768972874123, + "grad_norm": 4.4375, + "learning_rate": 2.8786208513870583e-05, + "loss": 0.1907, + "step": 4500 + }, + { + "epoch": 0.4581936401503607, + "grad_norm": 1.9609375, + "learning_rate": 2.8706507276349815e-05, + "loss": 0.2256, + "step": 4510 + }, + { + "epoch": 0.4592095905719801, + "grad_norm": 3.375, + "learning_rate": 2.8626767501747588e-05, + "loss": 0.215, + "step": 4520 + }, + { + "epoch": 0.4602255409935995, + "grad_norm": 2.296875, + "learning_rate": 2.854699001912964e-05, + "loss": 0.2241, + "step": 4530 + }, + { + "epoch": 0.4612414914152189, + "grad_norm": 2.078125, + "learning_rate": 2.846717565795376e-05, + "loss": 0.1541, + "step": 4540 + }, + { + "epoch": 0.46225744183683837, + "grad_norm": 0.81640625, + "learning_rate": 2.8387325248061164e-05, + "loss": 0.1718, + "step": 4550 + }, + { + "epoch": 0.46327339225845776, + "grad_norm": 5.6875, + "learning_rate": 2.8307439619667897e-05, + "loss": 0.259, + "step": 4560 + }, + { + "epoch": 0.4642893426800772, + "grad_norm": 1.78125, + "learning_rate": 2.8227519603356157e-05, + "loss": 0.2205, + "step": 4570 + }, + { + "epoch": 0.46530529310169666, + "grad_norm": 4.78125, + "learning_rate": 2.8147566030065677e-05, + "loss": 0.2256, + "step": 4580 + }, + { + "epoch": 0.46632124352331605, + "grad_norm": 3.296875, + "learning_rate": 2.8067579731085085e-05, + "loss": 0.1671, + "step": 4590 + }, + { + "epoch": 0.4673371939449355, + "grad_norm": 3.265625, + "learning_rate": 2.7987561538043273e-05, + "loss": 0.2471, + "step": 4600 + }, + { + "epoch": 0.4683531443665549, + "grad_norm": 3.390625, + "learning_rate": 2.7907512282900727e-05, + "loss": 0.1749, + "step": 4610 + }, + { + "epoch": 0.46936909478817435, + "grad_norm": 3.140625, + "learning_rate": 2.782743279794091e-05, + "loss": 0.2276, + "step": 4620 + }, + { + "epoch": 0.47038504520979374, + "grad_norm": 2.921875, + "learning_rate": 2.7747323915761574e-05, + "loss": 0.1971, + "step": 4630 + }, + { + "epoch": 0.4714009956314132, + "grad_norm": 4.15625, + "learning_rate": 2.7667186469266122e-05, + "loss": 0.1951, + "step": 4640 + }, + { + "epoch": 0.47241694605303264, + "grad_norm": 2.953125, + "learning_rate": 2.7587021291654924e-05, + "loss": 0.2045, + "step": 4650 + }, + { + "epoch": 0.47343289647465203, + "grad_norm": 1.6640625, + "learning_rate": 2.750682921641672e-05, + "loss": 0.155, + "step": 4660 + }, + { + "epoch": 0.4744488468962715, + "grad_norm": 4.375, + "learning_rate": 2.7426611077319864e-05, + "loss": 0.2038, + "step": 4670 + }, + { + "epoch": 0.4754647973178909, + "grad_norm": 5.5, + "learning_rate": 2.734636770840372e-05, + "loss": 0.159, + "step": 4680 + }, + { + "epoch": 0.4764807477395103, + "grad_norm": 1.703125, + "learning_rate": 2.7266099943969976e-05, + "loss": 0.1566, + "step": 4690 + }, + { + "epoch": 0.4774966981611297, + "grad_norm": 0.81640625, + "learning_rate": 2.7185808618573943e-05, + "loss": 0.1927, + "step": 4700 + }, + { + "epoch": 0.47851264858274917, + "grad_norm": 0.81640625, + "learning_rate": 2.710549456701592e-05, + "loss": 0.1873, + "step": 4710 + }, + { + "epoch": 0.47952859900436856, + "grad_norm": 3.828125, + "learning_rate": 2.702515862433247e-05, + "loss": 0.2474, + "step": 4720 + }, + { + "epoch": 0.480544549425988, + "grad_norm": 1.1640625, + "learning_rate": 2.6944801625787795e-05, + "loss": 0.204, + "step": 4730 + }, + { + "epoch": 0.48156049984760746, + "grad_norm": 2.953125, + "learning_rate": 2.6864424406864984e-05, + "loss": 0.1758, + "step": 4740 + }, + { + "epoch": 0.48257645026922685, + "grad_norm": 3.265625, + "learning_rate": 2.6784027803257377e-05, + "loss": 0.161, + "step": 4750 + }, + { + "epoch": 0.4835924006908463, + "grad_norm": 2.046875, + "learning_rate": 2.6703612650859848e-05, + "loss": 0.1469, + "step": 4760 + }, + { + "epoch": 0.4846083511124657, + "grad_norm": 4.03125, + "learning_rate": 2.6623179785760148e-05, + "loss": 0.1858, + "step": 4770 + }, + { + "epoch": 0.48562430153408515, + "grad_norm": 2.65625, + "learning_rate": 2.6542730044230175e-05, + "loss": 0.176, + "step": 4780 + }, + { + "epoch": 0.48664025195570454, + "grad_norm": 2.59375, + "learning_rate": 2.6462264262717278e-05, + "loss": 0.1657, + "step": 4790 + }, + { + "epoch": 0.487656202377324, + "grad_norm": 4.78125, + "learning_rate": 2.6381783277835605e-05, + "loss": 0.2705, + "step": 4800 + }, + { + "epoch": 0.48867215279894344, + "grad_norm": 3.65625, + "learning_rate": 2.6301287926357355e-05, + "loss": 0.2252, + "step": 4810 + }, + { + "epoch": 0.48968810322056283, + "grad_norm": 0.734375, + "learning_rate": 2.622077904520411e-05, + "loss": 0.2141, + "step": 4820 + }, + { + "epoch": 0.4907040536421823, + "grad_norm": 5.15625, + "learning_rate": 2.6140257471438108e-05, + "loss": 0.1935, + "step": 4830 + }, + { + "epoch": 0.4917200040638017, + "grad_norm": 3.625, + "learning_rate": 2.6059724042253574e-05, + "loss": 0.2121, + "step": 4840 + }, + { + "epoch": 0.4927359544854211, + "grad_norm": 1.2890625, + "learning_rate": 2.5979179594967983e-05, + "loss": 0.1221, + "step": 4850 + }, + { + "epoch": 0.4937519049070405, + "grad_norm": 3.4375, + "learning_rate": 2.5898624967013367e-05, + "loss": 0.2208, + "step": 4860 + }, + { + "epoch": 0.49476785532865997, + "grad_norm": 2.40625, + "learning_rate": 2.5818060995927607e-05, + "loss": 0.1904, + "step": 4870 + }, + { + "epoch": 0.49578380575027936, + "grad_norm": 2.921875, + "learning_rate": 2.573748851934574e-05, + "loss": 0.1658, + "step": 4880 + }, + { + "epoch": 0.4967997561718988, + "grad_norm": 1.6640625, + "learning_rate": 2.5656908374991213e-05, + "loss": 0.1626, + "step": 4890 + }, + { + "epoch": 0.49781570659351826, + "grad_norm": 1.8046875, + "learning_rate": 2.557632140066721e-05, + "loss": 0.1905, + "step": 4900 + }, + { + "epoch": 0.49883165701513765, + "grad_norm": 4.875, + "learning_rate": 2.5495728434247917e-05, + "loss": 0.2591, + "step": 4910 + }, + { + "epoch": 0.4998476074367571, + "grad_norm": 1.4453125, + "learning_rate": 2.5415130313669845e-05, + "loss": 0.1359, + "step": 4920 + }, + { + "epoch": 0.5008635578583766, + "grad_norm": 2.109375, + "learning_rate": 2.5334527876923063e-05, + "loss": 0.2353, + "step": 4930 + }, + { + "epoch": 0.501879508279996, + "grad_norm": 3.546875, + "learning_rate": 2.5253921962042525e-05, + "loss": 0.2173, + "step": 4940 + }, + { + "epoch": 0.5028954587016153, + "grad_norm": 1.8125, + "learning_rate": 2.5173313407099373e-05, + "loss": 0.1631, + "step": 4950 + }, + { + "epoch": 0.5039114091232347, + "grad_norm": 2.671875, + "learning_rate": 2.5092703050192163e-05, + "loss": 0.1884, + "step": 4960 + }, + { + "epoch": 0.5049273595448542, + "grad_norm": 2.5625, + "learning_rate": 2.501209172943819e-05, + "loss": 0.217, + "step": 4970 + }, + { + "epoch": 0.5059433099664736, + "grad_norm": 4.375, + "learning_rate": 2.49314802829648e-05, + "loss": 0.1854, + "step": 4980 + }, + { + "epoch": 0.506959260388093, + "grad_norm": 2.3125, + "learning_rate": 2.4850869548900628e-05, + "loss": 0.2049, + "step": 4990 + }, + { + "epoch": 0.5079752108097125, + "grad_norm": 3.859375, + "learning_rate": 2.477026036536688e-05, + "loss": 0.2093, + "step": 5000 + }, + { + "epoch": 0.5089911612313319, + "grad_norm": 1.09375, + "learning_rate": 2.4689653570468677e-05, + "loss": 0.164, + "step": 5010 + }, + { + "epoch": 0.5100071116529513, + "grad_norm": 3.40625, + "learning_rate": 2.460905000228628e-05, + "loss": 0.1649, + "step": 5020 + }, + { + "epoch": 0.5110230620745707, + "grad_norm": 3.546875, + "learning_rate": 2.4528450498866428e-05, + "loss": 0.1777, + "step": 5030 + }, + { + "epoch": 0.5120390124961902, + "grad_norm": 3.0, + "learning_rate": 2.444785589821356e-05, + "loss": 0.1505, + "step": 5040 + }, + { + "epoch": 0.5130549629178096, + "grad_norm": 1.6484375, + "learning_rate": 2.436726703828118e-05, + "loss": 0.2672, + "step": 5050 + }, + { + "epoch": 0.514070913339429, + "grad_norm": 4.34375, + "learning_rate": 2.428668475696308e-05, + "loss": 0.1756, + "step": 5060 + }, + { + "epoch": 0.5150868637610485, + "grad_norm": 2.78125, + "learning_rate": 2.420610989208465e-05, + "loss": 0.1655, + "step": 5070 + }, + { + "epoch": 0.5161028141826679, + "grad_norm": 1.4609375, + "learning_rate": 2.412554328139419e-05, + "loss": 0.1579, + "step": 5080 + }, + { + "epoch": 0.5171187646042873, + "grad_norm": 2.28125, + "learning_rate": 2.404498576255416e-05, + "loss": 0.1599, + "step": 5090 + }, + { + "epoch": 0.5181347150259067, + "grad_norm": 0.6484375, + "learning_rate": 2.3964438173132522e-05, + "loss": 0.1508, + "step": 5100 + }, + { + "epoch": 0.5191506654475262, + "grad_norm": 3.390625, + "learning_rate": 2.388390135059395e-05, + "loss": 0.1578, + "step": 5110 + }, + { + "epoch": 0.5201666158691456, + "grad_norm": 1.21875, + "learning_rate": 2.3803376132291226e-05, + "loss": 0.1374, + "step": 5120 + }, + { + "epoch": 0.521182566290765, + "grad_norm": 4.0625, + "learning_rate": 2.3722863355456436e-05, + "loss": 0.1854, + "step": 5130 + }, + { + "epoch": 0.5221985167123845, + "grad_norm": 4.71875, + "learning_rate": 2.364236385719236e-05, + "loss": 0.1391, + "step": 5140 + }, + { + "epoch": 0.5232144671340039, + "grad_norm": 3.296875, + "learning_rate": 2.356187847446366e-05, + "loss": 0.2106, + "step": 5150 + }, + { + "epoch": 0.5242304175556233, + "grad_norm": 3.296875, + "learning_rate": 2.348140804408829e-05, + "loss": 0.2383, + "step": 5160 + }, + { + "epoch": 0.5252463679772427, + "grad_norm": 3.359375, + "learning_rate": 2.3400953402728713e-05, + "loss": 0.1537, + "step": 5170 + }, + { + "epoch": 0.5262623183988622, + "grad_norm": 1.4921875, + "learning_rate": 2.332051538688322e-05, + "loss": 0.1841, + "step": 5180 + }, + { + "epoch": 0.5272782688204816, + "grad_norm": 3.25, + "learning_rate": 2.3240094832877287e-05, + "loss": 0.1855, + "step": 5190 + }, + { + "epoch": 0.528294219242101, + "grad_norm": 3.34375, + "learning_rate": 2.3159692576854793e-05, + "loss": 0.2625, + "step": 5200 + }, + { + "epoch": 0.5293101696637205, + "grad_norm": 3.6875, + "learning_rate": 2.3079309454769413e-05, + "loss": 0.1292, + "step": 5210 + }, + { + "epoch": 0.5303261200853399, + "grad_norm": 1.1171875, + "learning_rate": 2.2998946302375827e-05, + "loss": 0.1263, + "step": 5220 + }, + { + "epoch": 0.5313420705069593, + "grad_norm": 2.71875, + "learning_rate": 2.2918603955221148e-05, + "loss": 0.2296, + "step": 5230 + }, + { + "epoch": 0.5323580209285786, + "grad_norm": 2.015625, + "learning_rate": 2.283828324863613e-05, + "loss": 0.1231, + "step": 5240 + }, + { + "epoch": 0.5333739713501982, + "grad_norm": 3.671875, + "learning_rate": 2.2757985017726557e-05, + "loss": 0.1939, + "step": 5250 + }, + { + "epoch": 0.5343899217718175, + "grad_norm": 1.9765625, + "learning_rate": 2.2677710097364495e-05, + "loss": 0.168, + "step": 5260 + }, + { + "epoch": 0.5354058721934369, + "grad_norm": 2.609375, + "learning_rate": 2.259745932217969e-05, + "loss": 0.1883, + "step": 5270 + }, + { + "epoch": 0.5364218226150563, + "grad_norm": 2.8125, + "learning_rate": 2.2517233526550817e-05, + "loss": 0.1898, + "step": 5280 + }, + { + "epoch": 0.5374377730366758, + "grad_norm": 3.125, + "learning_rate": 2.2437033544596837e-05, + "loss": 0.1838, + "step": 5290 + }, + { + "epoch": 0.5384537234582952, + "grad_norm": 4.90625, + "learning_rate": 2.2356860210168336e-05, + "loss": 0.1553, + "step": 5300 + }, + { + "epoch": 0.5394696738799146, + "grad_norm": 3.171875, + "learning_rate": 2.2276714356838824e-05, + "loss": 0.2248, + "step": 5310 + }, + { + "epoch": 0.5404856243015341, + "grad_norm": 1.34375, + "learning_rate": 2.2196596817896118e-05, + "loss": 0.1421, + "step": 5320 + }, + { + "epoch": 0.5415015747231535, + "grad_norm": 3.28125, + "learning_rate": 2.2116508426333596e-05, + "loss": 0.1947, + "step": 5330 + }, + { + "epoch": 0.5425175251447729, + "grad_norm": 1.9296875, + "learning_rate": 2.2036450014841652e-05, + "loss": 0.2207, + "step": 5340 + }, + { + "epoch": 0.5435334755663923, + "grad_norm": 0.5703125, + "learning_rate": 2.19564224157989e-05, + "loss": 0.2208, + "step": 5350 + }, + { + "epoch": 0.5445494259880118, + "grad_norm": 7.5625, + "learning_rate": 2.1876426461263654e-05, + "loss": 0.1739, + "step": 5360 + }, + { + "epoch": 0.5455653764096312, + "grad_norm": 2.15625, + "learning_rate": 2.179646298296519e-05, + "loss": 0.1938, + "step": 5370 + }, + { + "epoch": 0.5465813268312506, + "grad_norm": 4.1875, + "learning_rate": 2.171653281229511e-05, + "loss": 0.1736, + "step": 5380 + }, + { + "epoch": 0.5475972772528701, + "grad_norm": 4.65625, + "learning_rate": 2.1636636780298732e-05, + "loss": 0.2167, + "step": 5390 + }, + { + "epoch": 0.5486132276744895, + "grad_norm": 1.84375, + "learning_rate": 2.1556775717666427e-05, + "loss": 0.1711, + "step": 5400 + }, + { + "epoch": 0.5496291780961089, + "grad_norm": 5.125, + "learning_rate": 2.147695045472499e-05, + "loss": 0.1789, + "step": 5410 + }, + { + "epoch": 0.5506451285177283, + "grad_norm": 3.859375, + "learning_rate": 2.1397161821428973e-05, + "loss": 0.2187, + "step": 5420 + }, + { + "epoch": 0.5516610789393478, + "grad_norm": 2.25, + "learning_rate": 2.131741064735212e-05, + "loss": 0.1367, + "step": 5430 + }, + { + "epoch": 0.5526770293609672, + "grad_norm": 4.65625, + "learning_rate": 2.1237697761678684e-05, + "loss": 0.1574, + "step": 5440 + }, + { + "epoch": 0.5536929797825866, + "grad_norm": 1.2265625, + "learning_rate": 2.1158023993194848e-05, + "loss": 0.1301, + "step": 5450 + }, + { + "epoch": 0.5547089302042061, + "grad_norm": 4.21875, + "learning_rate": 2.107839017028005e-05, + "loss": 0.2782, + "step": 5460 + }, + { + "epoch": 0.5557248806258255, + "grad_norm": 0.52734375, + "learning_rate": 2.0998797120898457e-05, + "loss": 0.2024, + "step": 5470 + }, + { + "epoch": 0.5567408310474449, + "grad_norm": 1.46875, + "learning_rate": 2.0919245672590277e-05, + "loss": 0.1755, + "step": 5480 + }, + { + "epoch": 0.5577567814690643, + "grad_norm": 2.140625, + "learning_rate": 2.083973665246318e-05, + "loss": 0.2058, + "step": 5490 + }, + { + "epoch": 0.5587727318906838, + "grad_norm": 1.5390625, + "learning_rate": 2.076027088718373e-05, + "loss": 0.2159, + "step": 5500 + }, + { + "epoch": 0.5597886823123032, + "grad_norm": 1.9921875, + "learning_rate": 2.0680849202968743e-05, + "loss": 0.2139, + "step": 5510 + }, + { + "epoch": 0.5608046327339226, + "grad_norm": 2.4375, + "learning_rate": 2.060147242557674e-05, + "loss": 0.183, + "step": 5520 + }, + { + "epoch": 0.5618205831555421, + "grad_norm": 5.5, + "learning_rate": 2.0522141380299308e-05, + "loss": 0.1673, + "step": 5530 + }, + { + "epoch": 0.5628365335771615, + "grad_norm": 4.25, + "learning_rate": 2.044285689195258e-05, + "loss": 0.1674, + "step": 5540 + }, + { + "epoch": 0.5638524839987809, + "grad_norm": 2.109375, + "learning_rate": 2.0363619784868604e-05, + "loss": 0.1531, + "step": 5550 + }, + { + "epoch": 0.5648684344204002, + "grad_norm": 2.59375, + "learning_rate": 2.0284430882886836e-05, + "loss": 0.1665, + "step": 5560 + }, + { + "epoch": 0.5658843848420197, + "grad_norm": 3.984375, + "learning_rate": 2.020529100934549e-05, + "loss": 0.1717, + "step": 5570 + }, + { + "epoch": 0.5669003352636391, + "grad_norm": 1.6015625, + "learning_rate": 2.012620098707306e-05, + "loss": 0.1167, + "step": 5580 + }, + { + "epoch": 0.5679162856852585, + "grad_norm": 6.0625, + "learning_rate": 2.004716163837972e-05, + "loss": 0.2084, + "step": 5590 + }, + { + "epoch": 0.5689322361068779, + "grad_norm": 2.5625, + "learning_rate": 1.996817378504876e-05, + "loss": 0.1939, + "step": 5600 + }, + { + "epoch": 0.5699481865284974, + "grad_norm": 3.109375, + "learning_rate": 1.9889238248328108e-05, + "loss": 0.1241, + "step": 5610 + }, + { + "epoch": 0.5709641369501168, + "grad_norm": 4.875, + "learning_rate": 1.981035584892171e-05, + "loss": 0.1865, + "step": 5620 + }, + { + "epoch": 0.5719800873717362, + "grad_norm": 2.984375, + "learning_rate": 1.9731527406981072e-05, + "loss": 0.1639, + "step": 5630 + }, + { + "epoch": 0.5729960377933557, + "grad_norm": 4.4375, + "learning_rate": 1.9652753742096655e-05, + "loss": 0.2019, + "step": 5640 + }, + { + "epoch": 0.5740119882149751, + "grad_norm": 4.3125, + "learning_rate": 1.9574035673289432e-05, + "loss": 0.1829, + "step": 5650 + }, + { + "epoch": 0.5750279386365945, + "grad_norm": 3.203125, + "learning_rate": 1.9495374019002312e-05, + "loss": 0.2267, + "step": 5660 + }, + { + "epoch": 0.5760438890582139, + "grad_norm": 1.765625, + "learning_rate": 1.9416769597091673e-05, + "loss": 0.1411, + "step": 5670 + }, + { + "epoch": 0.5770598394798334, + "grad_norm": 2.640625, + "learning_rate": 1.9338223224818818e-05, + "loss": 0.1476, + "step": 5680 + }, + { + "epoch": 0.5780757899014528, + "grad_norm": 4.84375, + "learning_rate": 1.9259735718841524e-05, + "loss": 0.1417, + "step": 5690 + }, + { + "epoch": 0.5790917403230722, + "grad_norm": 2.421875, + "learning_rate": 1.918130789520551e-05, + "loss": 0.1592, + "step": 5700 + }, + { + "epoch": 0.5801076907446917, + "grad_norm": 2.984375, + "learning_rate": 1.9102940569335963e-05, + "loss": 0.161, + "step": 5710 + }, + { + "epoch": 0.5811236411663111, + "grad_norm": 1.0234375, + "learning_rate": 1.9024634556029093e-05, + "loss": 0.1614, + "step": 5720 + }, + { + "epoch": 0.5821395915879305, + "grad_norm": 2.90625, + "learning_rate": 1.89463906694436e-05, + "loss": 0.1505, + "step": 5730 + }, + { + "epoch": 0.5831555420095499, + "grad_norm": 2.875, + "learning_rate": 1.8868209723092286e-05, + "loss": 0.1674, + "step": 5740 + }, + { + "epoch": 0.5841714924311694, + "grad_norm": 0.408203125, + "learning_rate": 1.8790092529833508e-05, + "loss": 0.1468, + "step": 5750 + }, + { + "epoch": 0.5851874428527888, + "grad_norm": 5.1875, + "learning_rate": 1.871203990186281e-05, + "loss": 0.1903, + "step": 5760 + }, + { + "epoch": 0.5862033932744082, + "grad_norm": 0.5546875, + "learning_rate": 1.8634052650704415e-05, + "loss": 0.2644, + "step": 5770 + }, + { + "epoch": 0.5872193436960277, + "grad_norm": 3.203125, + "learning_rate": 1.8556131587202848e-05, + "loss": 0.1968, + "step": 5780 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 2.484375, + "learning_rate": 1.8478277521514424e-05, + "loss": 0.2249, + "step": 5790 + }, + { + "epoch": 0.5892512445392665, + "grad_norm": 4.0, + "learning_rate": 1.8400491263098906e-05, + "loss": 0.1881, + "step": 5800 + }, + { + "epoch": 0.5902671949608859, + "grad_norm": 1.90625, + "learning_rate": 1.832277362071106e-05, + "loss": 0.1352, + "step": 5810 + }, + { + "epoch": 0.5912831453825054, + "grad_norm": 2.765625, + "learning_rate": 1.824512540239221e-05, + "loss": 0.2737, + "step": 5820 + }, + { + "epoch": 0.5922990958041248, + "grad_norm": 2.609375, + "learning_rate": 1.81675474154619e-05, + "loss": 0.1566, + "step": 5830 + }, + { + "epoch": 0.5933150462257442, + "grad_norm": 2.6875, + "learning_rate": 1.8090040466509444e-05, + "loss": 0.1999, + "step": 5840 + }, + { + "epoch": 0.5943309966473637, + "grad_norm": 2.609375, + "learning_rate": 1.8012605361385592e-05, + "loss": 0.2372, + "step": 5850 + }, + { + "epoch": 0.5953469470689831, + "grad_norm": 8.125, + "learning_rate": 1.7935242905194087e-05, + "loss": 0.2411, + "step": 5860 + }, + { + "epoch": 0.5963628974906025, + "grad_norm": 3.46875, + "learning_rate": 1.785795390228336e-05, + "loss": 0.138, + "step": 5870 + }, + { + "epoch": 0.5973788479122218, + "grad_norm": 2.3125, + "learning_rate": 1.7780739156238125e-05, + "loss": 0.1867, + "step": 5880 + }, + { + "epoch": 0.5983947983338413, + "grad_norm": 4.0625, + "learning_rate": 1.770359946987105e-05, + "loss": 0.2091, + "step": 5890 + }, + { + "epoch": 0.5994107487554607, + "grad_norm": 5.21875, + "learning_rate": 1.7626535645214378e-05, + "loss": 0.2091, + "step": 5900 + }, + { + "epoch": 0.6004266991770801, + "grad_norm": 3.15625, + "learning_rate": 1.7549548483511614e-05, + "loss": 0.1927, + "step": 5910 + }, + { + "epoch": 0.6014426495986995, + "grad_norm": 4.71875, + "learning_rate": 1.7472638785209198e-05, + "loss": 0.1893, + "step": 5920 + }, + { + "epoch": 0.602458600020319, + "grad_norm": 3.015625, + "learning_rate": 1.7395807349948145e-05, + "loss": 0.1557, + "step": 5930 + }, + { + "epoch": 0.6034745504419384, + "grad_norm": 2.9375, + "learning_rate": 1.73190549765558e-05, + "loss": 0.1717, + "step": 5940 + }, + { + "epoch": 0.6044905008635578, + "grad_norm": 3.109375, + "learning_rate": 1.724238246303745e-05, + "loss": 0.1879, + "step": 5950 + }, + { + "epoch": 0.6055064512851773, + "grad_norm": 3.875, + "learning_rate": 1.71657906065681e-05, + "loss": 0.1908, + "step": 5960 + }, + { + "epoch": 0.6065224017067967, + "grad_norm": 5.09375, + "learning_rate": 1.7089280203484115e-05, + "loss": 0.1712, + "step": 5970 + }, + { + "epoch": 0.6075383521284161, + "grad_norm": 3.015625, + "learning_rate": 1.701285204927502e-05, + "loss": 0.1454, + "step": 5980 + }, + { + "epoch": 0.6085543025500355, + "grad_norm": 3.265625, + "learning_rate": 1.693650693857515e-05, + "loss": 0.2283, + "step": 5990 + }, + { + "epoch": 0.609570252971655, + "grad_norm": 3.40625, + "learning_rate": 1.6860245665155466e-05, + "loss": 0.2188, + "step": 6000 + }, + { + "epoch": 0.6105862033932744, + "grad_norm": 2.5625, + "learning_rate": 1.678406902191521e-05, + "loss": 0.1605, + "step": 6010 + }, + { + "epoch": 0.6116021538148938, + "grad_norm": 0.6796875, + "learning_rate": 1.670797780087374e-05, + "loss": 0.1472, + "step": 6020 + }, + { + "epoch": 0.6126181042365133, + "grad_norm": 2.234375, + "learning_rate": 1.6631972793162288e-05, + "loss": 0.1676, + "step": 6030 + }, + { + "epoch": 0.6136340546581327, + "grad_norm": 1.25, + "learning_rate": 1.6556054789015662e-05, + "loss": 0.1508, + "step": 6040 + }, + { + "epoch": 0.6146500050797521, + "grad_norm": 4.78125, + "learning_rate": 1.6480224577764132e-05, + "loss": 0.1981, + "step": 6050 + }, + { + "epoch": 0.6156659555013715, + "grad_norm": 3.46875, + "learning_rate": 1.6404482947825137e-05, + "loss": 0.2514, + "step": 6060 + }, + { + "epoch": 0.616681905922991, + "grad_norm": 1.265625, + "learning_rate": 1.6328830686695154e-05, + "loss": 0.2397, + "step": 6070 + }, + { + "epoch": 0.6176978563446104, + "grad_norm": 1.953125, + "learning_rate": 1.625326858094144e-05, + "loss": 0.1523, + "step": 6080 + }, + { + "epoch": 0.6187138067662298, + "grad_norm": 3.484375, + "learning_rate": 1.6177797416193953e-05, + "loss": 0.218, + "step": 6090 + }, + { + "epoch": 0.6197297571878493, + "grad_norm": 3.484375, + "learning_rate": 1.6102417977137052e-05, + "loss": 0.1476, + "step": 6100 + }, + { + "epoch": 0.6207457076094687, + "grad_norm": 4.90625, + "learning_rate": 1.602713104750147e-05, + "loss": 0.1818, + "step": 6110 + }, + { + "epoch": 0.6217616580310881, + "grad_norm": 4.375, + "learning_rate": 1.5951937410056087e-05, + "loss": 0.2061, + "step": 6120 + }, + { + "epoch": 0.6227776084527075, + "grad_norm": 6.3125, + "learning_rate": 1.587683784659979e-05, + "loss": 0.1566, + "step": 6130 + }, + { + "epoch": 0.623793558874327, + "grad_norm": 2.828125, + "learning_rate": 1.58018331379534e-05, + "loss": 0.1376, + "step": 6140 + }, + { + "epoch": 0.6248095092959464, + "grad_norm": 2.40625, + "learning_rate": 1.572692406395149e-05, + "loss": 0.1655, + "step": 6150 + }, + { + "epoch": 0.6258254597175658, + "grad_norm": 4.34375, + "learning_rate": 1.5652111403434338e-05, + "loss": 0.2363, + "step": 6160 + }, + { + "epoch": 0.6268414101391853, + "grad_norm": 2.453125, + "learning_rate": 1.5577395934239757e-05, + "loss": 0.2464, + "step": 6170 + }, + { + "epoch": 0.6278573605608047, + "grad_norm": 2.53125, + "learning_rate": 1.5502778433195085e-05, + "loss": 0.1898, + "step": 6180 + }, + { + "epoch": 0.628873310982424, + "grad_norm": 2.28125, + "learning_rate": 1.5428259676109048e-05, + "loss": 0.1804, + "step": 6190 + }, + { + "epoch": 0.6298892614040434, + "grad_norm": 4.3125, + "learning_rate": 1.5353840437763732e-05, + "loss": 0.1409, + "step": 6200 + }, + { + "epoch": 0.630905211825663, + "grad_norm": 2.5625, + "learning_rate": 1.5279521491906496e-05, + "loss": 0.2449, + "step": 6210 + }, + { + "epoch": 0.6319211622472823, + "grad_norm": 3.0625, + "learning_rate": 1.520530361124195e-05, + "loss": 0.2103, + "step": 6220 + }, + { + "epoch": 0.6329371126689017, + "grad_norm": 2.609375, + "learning_rate": 1.5131187567423937e-05, + "loss": 0.2156, + "step": 6230 + }, + { + "epoch": 0.6339530630905211, + "grad_norm": 2.703125, + "learning_rate": 1.5057174131047446e-05, + "loss": 0.161, + "step": 6240 + }, + { + "epoch": 0.6349690135121406, + "grad_norm": 3.265625, + "learning_rate": 1.4983264071640679e-05, + "loss": 0.1757, + "step": 6250 + }, + { + "epoch": 0.63598496393376, + "grad_norm": 3.15625, + "learning_rate": 1.490945815765699e-05, + "loss": 0.2011, + "step": 6260 + }, + { + "epoch": 0.6370009143553794, + "grad_norm": 5.375, + "learning_rate": 1.4835757156466945e-05, + "loss": 0.1658, + "step": 6270 + }, + { + "epoch": 0.6380168647769989, + "grad_norm": 2.984375, + "learning_rate": 1.4762161834350271e-05, + "loss": 0.1754, + "step": 6280 + }, + { + "epoch": 0.6390328151986183, + "grad_norm": 2.015625, + "learning_rate": 1.4688672956487987e-05, + "loss": 0.1427, + "step": 6290 + }, + { + "epoch": 0.6400487656202377, + "grad_norm": 3.78125, + "learning_rate": 1.4615291286954352e-05, + "loss": 0.1517, + "step": 6300 + }, + { + "epoch": 0.6410647160418571, + "grad_norm": 2.859375, + "learning_rate": 1.4542017588709005e-05, + "loss": 0.2348, + "step": 6310 + }, + { + "epoch": 0.6420806664634766, + "grad_norm": 2.421875, + "learning_rate": 1.4468852623588961e-05, + "loss": 0.2089, + "step": 6320 + }, + { + "epoch": 0.643096616885096, + "grad_norm": 2.15625, + "learning_rate": 1.4395797152300719e-05, + "loss": 0.1702, + "step": 6330 + }, + { + "epoch": 0.6441125673067154, + "grad_norm": 1.53125, + "learning_rate": 1.4322851934412382e-05, + "loss": 0.1017, + "step": 6340 + }, + { + "epoch": 0.6451285177283349, + "grad_norm": 1.90625, + "learning_rate": 1.4250017728345716e-05, + "loss": 0.1813, + "step": 6350 + }, + { + "epoch": 0.6461444681499543, + "grad_norm": 2.015625, + "learning_rate": 1.4177295291368292e-05, + "loss": 0.1095, + "step": 6360 + }, + { + "epoch": 0.6471604185715737, + "grad_norm": 2.625, + "learning_rate": 1.410468537958558e-05, + "loss": 0.2259, + "step": 6370 + }, + { + "epoch": 0.6481763689931931, + "grad_norm": 3.5, + "learning_rate": 1.4032188747933136e-05, + "loss": 0.1595, + "step": 6380 + }, + { + "epoch": 0.6491923194148126, + "grad_norm": 5.21875, + "learning_rate": 1.39598061501687e-05, + "loss": 0.2226, + "step": 6390 + }, + { + "epoch": 0.650208269836432, + "grad_norm": 5.34375, + "learning_rate": 1.388753833886442e-05, + "loss": 0.2132, + "step": 6400 + }, + { + "epoch": 0.6512242202580514, + "grad_norm": 3.640625, + "learning_rate": 1.3815386065398945e-05, + "loss": 0.1227, + "step": 6410 + }, + { + "epoch": 0.6522401706796709, + "grad_norm": 1.0, + "learning_rate": 1.3743350079949705e-05, + "loss": 0.1755, + "step": 6420 + }, + { + "epoch": 0.6532561211012903, + "grad_norm": 2.359375, + "learning_rate": 1.3671431131485057e-05, + "loss": 0.1552, + "step": 6430 + }, + { + "epoch": 0.6542720715229097, + "grad_norm": 5.3125, + "learning_rate": 1.3599629967756483e-05, + "loss": 0.1917, + "step": 6440 + }, + { + "epoch": 0.6552880219445291, + "grad_norm": 4.625, + "learning_rate": 1.3527947335290877e-05, + "loss": 0.1812, + "step": 6450 + }, + { + "epoch": 0.6563039723661486, + "grad_norm": 1.234375, + "learning_rate": 1.3456383979382708e-05, + "loss": 0.1896, + "step": 6460 + }, + { + "epoch": 0.657319922787768, + "grad_norm": 3.984375, + "learning_rate": 1.3384940644086352e-05, + "loss": 0.1484, + "step": 6470 + }, + { + "epoch": 0.6583358732093874, + "grad_norm": 2.40625, + "learning_rate": 1.3313618072208268e-05, + "loss": 0.1334, + "step": 6480 + }, + { + "epoch": 0.6593518236310069, + "grad_norm": 4.375, + "learning_rate": 1.3242417005299357e-05, + "loss": 0.1351, + "step": 6490 + }, + { + "epoch": 0.6603677740526263, + "grad_norm": 2.640625, + "learning_rate": 1.31713381836472e-05, + "loss": 0.1717, + "step": 6500 + }, + { + "epoch": 0.6613837244742456, + "grad_norm": 2.640625, + "learning_rate": 1.3100382346268392e-05, + "loss": 0.1867, + "step": 6510 + }, + { + "epoch": 0.662399674895865, + "grad_norm": 1.734375, + "learning_rate": 1.3029550230900812e-05, + "loss": 0.1997, + "step": 6520 + }, + { + "epoch": 0.6634156253174845, + "grad_norm": 3.609375, + "learning_rate": 1.2958842573996016e-05, + "loss": 0.1969, + "step": 6530 + }, + { + "epoch": 0.6644315757391039, + "grad_norm": 3.578125, + "learning_rate": 1.2888260110711525e-05, + "loss": 0.1469, + "step": 6540 + }, + { + "epoch": 0.6654475261607233, + "grad_norm": 1.3515625, + "learning_rate": 1.2817803574903212e-05, + "loss": 0.1524, + "step": 6550 + }, + { + "epoch": 0.6664634765823427, + "grad_norm": 2.109375, + "learning_rate": 1.2747473699117668e-05, + "loss": 0.159, + "step": 6560 + }, + { + "epoch": 0.6674794270039622, + "grad_norm": 1.53125, + "learning_rate": 1.267727121458458e-05, + "loss": 0.1999, + "step": 6570 + }, + { + "epoch": 0.6684953774255816, + "grad_norm": 1.7265625, + "learning_rate": 1.2607196851209137e-05, + "loss": 0.2216, + "step": 6580 + }, + { + "epoch": 0.669511327847201, + "grad_norm": 3.125, + "learning_rate": 1.2537251337564412e-05, + "loss": 0.1607, + "step": 6590 + }, + { + "epoch": 0.6705272782688205, + "grad_norm": 2.421875, + "learning_rate": 1.2467435400883839e-05, + "loss": 0.2187, + "step": 6600 + }, + { + "epoch": 0.6715432286904399, + "grad_norm": 1.5078125, + "learning_rate": 1.239774976705359e-05, + "loss": 0.1753, + "step": 6610 + }, + { + "epoch": 0.6725591791120593, + "grad_norm": 1.140625, + "learning_rate": 1.2328195160605092e-05, + "loss": 0.194, + "step": 6620 + }, + { + "epoch": 0.6735751295336787, + "grad_norm": 4.9375, + "learning_rate": 1.225877230470743e-05, + "loss": 0.1485, + "step": 6630 + }, + { + "epoch": 0.6745910799552982, + "grad_norm": 3.65625, + "learning_rate": 1.218948192115988e-05, + "loss": 0.1847, + "step": 6640 + }, + { + "epoch": 0.6756070303769176, + "grad_norm": 3.875, + "learning_rate": 1.21203247303844e-05, + "loss": 0.1874, + "step": 6650 + }, + { + "epoch": 0.676622980798537, + "grad_norm": 2.65625, + "learning_rate": 1.2051301451418073e-05, + "loss": 0.2377, + "step": 6660 + }, + { + "epoch": 0.6776389312201565, + "grad_norm": 2.09375, + "learning_rate": 1.198241280190574e-05, + "loss": 0.1508, + "step": 6670 + }, + { + "epoch": 0.6786548816417759, + "grad_norm": 2.203125, + "learning_rate": 1.1913659498092431e-05, + "loss": 0.1537, + "step": 6680 + }, + { + "epoch": 0.6796708320633953, + "grad_norm": 2.484375, + "learning_rate": 1.184504225481601e-05, + "loss": 0.2339, + "step": 6690 + }, + { + "epoch": 0.6806867824850147, + "grad_norm": 5.625, + "learning_rate": 1.177656178549966e-05, + "loss": 0.2102, + "step": 6700 + }, + { + "epoch": 0.6817027329066342, + "grad_norm": 2.5, + "learning_rate": 1.1708218802144536e-05, + "loss": 0.1435, + "step": 6710 + }, + { + "epoch": 0.6827186833282536, + "grad_norm": 3.84375, + "learning_rate": 1.1640014015322323e-05, + "loss": 0.1823, + "step": 6720 + }, + { + "epoch": 0.683734633749873, + "grad_norm": 2.359375, + "learning_rate": 1.1571948134167862e-05, + "loss": 0.1154, + "step": 6730 + }, + { + "epoch": 0.6847505841714925, + "grad_norm": 2.90625, + "learning_rate": 1.1504021866371761e-05, + "loss": 0.2105, + "step": 6740 + }, + { + "epoch": 0.6857665345931119, + "grad_norm": 5.46875, + "learning_rate": 1.143623591817304e-05, + "loss": 0.1317, + "step": 6750 + }, + { + "epoch": 0.6867824850147313, + "grad_norm": 3.34375, + "learning_rate": 1.1368590994351835e-05, + "loss": 0.1406, + "step": 6760 + }, + { + "epoch": 0.6877984354363507, + "grad_norm": 3.78125, + "learning_rate": 1.130108779822198e-05, + "loss": 0.1425, + "step": 6770 + }, + { + "epoch": 0.6888143858579702, + "grad_norm": 0.77734375, + "learning_rate": 1.1233727031623783e-05, + "loss": 0.1623, + "step": 6780 + }, + { + "epoch": 0.6898303362795896, + "grad_norm": 4.625, + "learning_rate": 1.1166509394916682e-05, + "loss": 0.1591, + "step": 6790 + }, + { + "epoch": 0.690846286701209, + "grad_norm": 3.84375, + "learning_rate": 1.1099435586971982e-05, + "loss": 0.1758, + "step": 6800 + }, + { + "epoch": 0.6918622371228285, + "grad_norm": 2.4375, + "learning_rate": 1.1032506305165555e-05, + "loss": 0.1018, + "step": 6810 + }, + { + "epoch": 0.6928781875444479, + "grad_norm": 3.203125, + "learning_rate": 1.0965722245370641e-05, + "loss": 0.1485, + "step": 6820 + }, + { + "epoch": 0.6938941379660672, + "grad_norm": 0.7109375, + "learning_rate": 1.0899084101950561e-05, + "loss": 0.1762, + "step": 6830 + }, + { + "epoch": 0.6949100883876866, + "grad_norm": 1.9765625, + "learning_rate": 1.0832592567751555e-05, + "loss": 0.1402, + "step": 6840 + }, + { + "epoch": 0.6959260388093061, + "grad_norm": 1.4609375, + "learning_rate": 1.0766248334095505e-05, + "loss": 0.2278, + "step": 6850 + }, + { + "epoch": 0.6969419892309255, + "grad_norm": 3.953125, + "learning_rate": 1.0700052090772828e-05, + "loss": 0.1969, + "step": 6860 + }, + { + "epoch": 0.6979579396525449, + "grad_norm": 2.453125, + "learning_rate": 1.0634004526035249e-05, + "loss": 0.2073, + "step": 6870 + }, + { + "epoch": 0.6989738900741643, + "grad_norm": 1.6171875, + "learning_rate": 1.0568106326588645e-05, + "loss": 0.1902, + "step": 6880 + }, + { + "epoch": 0.6999898404957838, + "grad_norm": 1.2734375, + "learning_rate": 1.0502358177585953e-05, + "loss": 0.2165, + "step": 6890 + }, + { + "epoch": 0.7010057909174032, + "grad_norm": 1.671875, + "learning_rate": 1.0436760762619977e-05, + "loss": 0.1952, + "step": 6900 + }, + { + "epoch": 0.7020217413390226, + "grad_norm": 2.8125, + "learning_rate": 1.0371314763716347e-05, + "loss": 0.1422, + "step": 6910 + }, + { + "epoch": 0.7030376917606421, + "grad_norm": 2.53125, + "learning_rate": 1.0306020861326388e-05, + "loss": 0.0961, + "step": 6920 + }, + { + "epoch": 0.7040536421822615, + "grad_norm": 3.046875, + "learning_rate": 1.0240879734320068e-05, + "loss": 0.1542, + "step": 6930 + }, + { + "epoch": 0.7050695926038809, + "grad_norm": 2.859375, + "learning_rate": 1.0175892059978901e-05, + "loss": 0.1748, + "step": 6940 + }, + { + "epoch": 0.7060855430255003, + "grad_norm": 2.671875, + "learning_rate": 1.0111058513988958e-05, + "loss": 0.0819, + "step": 6950 + }, + { + "epoch": 0.7071014934471198, + "grad_norm": 3.5625, + "learning_rate": 1.0046379770433803e-05, + "loss": 0.1933, + "step": 6960 + }, + { + "epoch": 0.7081174438687392, + "grad_norm": 2.859375, + "learning_rate": 9.98185650178749e-06, + "loss": 0.1891, + "step": 6970 + }, + { + "epoch": 0.7091333942903586, + "grad_norm": 3.15625, + "learning_rate": 9.917489378907591e-06, + "loss": 0.2102, + "step": 6980 + }, + { + "epoch": 0.7101493447119781, + "grad_norm": 6.40625, + "learning_rate": 9.853279071028212e-06, + "loss": 0.1714, + "step": 6990 + }, + { + "epoch": 0.7111652951335975, + "grad_norm": 2.375, + "learning_rate": 9.78922624575303e-06, + "loss": 0.1299, + "step": 7000 + }, + { + "epoch": 0.7121812455552169, + "grad_norm": 2.078125, + "learning_rate": 9.72533156904833e-06, + "loss": 0.1914, + "step": 7010 + }, + { + "epoch": 0.7131971959768363, + "grad_norm": 3.859375, + "learning_rate": 9.661595705236137e-06, + "loss": 0.2377, + "step": 7020 + }, + { + "epoch": 0.7142131463984558, + "grad_norm": 1.171875, + "learning_rate": 9.598019316987244e-06, + "loss": 0.1851, + "step": 7030 + }, + { + "epoch": 0.7152290968200752, + "grad_norm": 1.078125, + "learning_rate": 9.53460306531439e-06, + "loss": 0.2661, + "step": 7040 + }, + { + "epoch": 0.7162450472416946, + "grad_norm": 1.6484375, + "learning_rate": 9.471347609565311e-06, + "loss": 0.1669, + "step": 7050 + }, + { + "epoch": 0.7172609976633141, + "grad_norm": 4.59375, + "learning_rate": 9.408253607415957e-06, + "loss": 0.2487, + "step": 7060 + }, + { + "epoch": 0.7182769480849335, + "grad_norm": 3.09375, + "learning_rate": 9.345321714863614e-06, + "loss": 0.186, + "step": 7070 + }, + { + "epoch": 0.7192928985065529, + "grad_norm": 6.0625, + "learning_rate": 9.282552586220075e-06, + "loss": 0.2249, + "step": 7080 + }, + { + "epoch": 0.7203088489281723, + "grad_norm": 1.5703125, + "learning_rate": 9.219946874104885e-06, + "loss": 0.1255, + "step": 7090 + }, + { + "epoch": 0.7213247993497918, + "grad_norm": 1.9453125, + "learning_rate": 9.157505229438481e-06, + "loss": 0.1999, + "step": 7100 + }, + { + "epoch": 0.7223407497714112, + "grad_norm": 5.1875, + "learning_rate": 9.095228301435518e-06, + "loss": 0.199, + "step": 7110 + }, + { + "epoch": 0.7233567001930306, + "grad_norm": 2.078125, + "learning_rate": 9.03311673759802e-06, + "loss": 0.2182, + "step": 7120 + }, + { + "epoch": 0.7243726506146501, + "grad_norm": 6.46875, + "learning_rate": 8.971171183708733e-06, + "loss": 0.1573, + "step": 7130 + }, + { + "epoch": 0.7253886010362695, + "grad_norm": 3.015625, + "learning_rate": 8.909392283824353e-06, + "loss": 0.2044, + "step": 7140 + }, + { + "epoch": 0.7264045514578888, + "grad_norm": 2.921875, + "learning_rate": 8.847780680268872e-06, + "loss": 0.11, + "step": 7150 + }, + { + "epoch": 0.7274205018795082, + "grad_norm": 2.96875, + "learning_rate": 8.786337013626853e-06, + "loss": 0.1897, + "step": 7160 + }, + { + "epoch": 0.7284364523011277, + "grad_norm": 1.7578125, + "learning_rate": 8.725061922736799e-06, + "loss": 0.153, + "step": 7170 + }, + { + "epoch": 0.7294524027227471, + "grad_norm": 1.609375, + "learning_rate": 8.663956044684532e-06, + "loss": 0.1746, + "step": 7180 + }, + { + "epoch": 0.7304683531443665, + "grad_norm": 1.9375, + "learning_rate": 8.603020014796507e-06, + "loss": 0.2284, + "step": 7190 + }, + { + "epoch": 0.7314843035659859, + "grad_norm": 1.515625, + "learning_rate": 8.542254466633273e-06, + "loss": 0.1186, + "step": 7200 + }, + { + "epoch": 0.7325002539876054, + "grad_norm": 1.671875, + "learning_rate": 8.481660031982844e-06, + "loss": 0.1971, + "step": 7210 + }, + { + "epoch": 0.7335162044092248, + "grad_norm": 1.453125, + "learning_rate": 8.421237340854157e-06, + "loss": 0.196, + "step": 7220 + }, + { + "epoch": 0.7345321548308442, + "grad_norm": 0.65234375, + "learning_rate": 8.360987021470479e-06, + "loss": 0.1724, + "step": 7230 + }, + { + "epoch": 0.7355481052524637, + "grad_norm": 2.84375, + "learning_rate": 8.300909700262929e-06, + "loss": 0.175, + "step": 7240 + }, + { + "epoch": 0.7365640556740831, + "grad_norm": 3.109375, + "learning_rate": 8.241006001863924e-06, + "loss": 0.2276, + "step": 7250 + }, + { + "epoch": 0.7375800060957025, + "grad_norm": 4.8125, + "learning_rate": 8.181276549100714e-06, + "loss": 0.2029, + "step": 7260 + }, + { + "epoch": 0.7385959565173219, + "grad_norm": 4.03125, + "learning_rate": 8.12172196298887e-06, + "loss": 0.175, + "step": 7270 + }, + { + "epoch": 0.7396119069389414, + "grad_norm": 3.046875, + "learning_rate": 8.062342862725878e-06, + "loss": 0.1662, + "step": 7280 + }, + { + "epoch": 0.7406278573605608, + "grad_norm": 3.375, + "learning_rate": 8.003139865684662e-06, + "loss": 0.1616, + "step": 7290 + }, + { + "epoch": 0.7416438077821802, + "grad_norm": 2.5625, + "learning_rate": 7.944113587407157e-06, + "loss": 0.2448, + "step": 7300 + }, + { + "epoch": 0.7426597582037997, + "grad_norm": 4.125, + "learning_rate": 7.885264641597961e-06, + "loss": 0.1618, + "step": 7310 + }, + { + "epoch": 0.7436757086254191, + "grad_norm": 3.5, + "learning_rate": 7.826593640117889e-06, + "loss": 0.1134, + "step": 7320 + }, + { + "epoch": 0.7446916590470385, + "grad_norm": 2.6875, + "learning_rate": 7.76810119297767e-06, + "loss": 0.1795, + "step": 7330 + }, + { + "epoch": 0.7457076094686579, + "grad_norm": 4.34375, + "learning_rate": 7.709787908331556e-06, + "loss": 0.2736, + "step": 7340 + }, + { + "epoch": 0.7467235598902774, + "grad_norm": 1.21875, + "learning_rate": 7.651654392471038e-06, + "loss": 0.139, + "step": 7350 + }, + { + "epoch": 0.7477395103118968, + "grad_norm": 3.578125, + "learning_rate": 7.593701249818521e-06, + "loss": 0.2023, + "step": 7360 + }, + { + "epoch": 0.7487554607335162, + "grad_norm": 2.15625, + "learning_rate": 7.535929082921048e-06, + "loss": 0.1702, + "step": 7370 + }, + { + "epoch": 0.7497714111551357, + "grad_norm": 1.96875, + "learning_rate": 7.47833849244402e-06, + "loss": 0.1835, + "step": 7380 + }, + { + "epoch": 0.7507873615767551, + "grad_norm": 2.796875, + "learning_rate": 7.420930077164959e-06, + "loss": 0.1713, + "step": 7390 + }, + { + "epoch": 0.7518033119983745, + "grad_norm": 4.46875, + "learning_rate": 7.363704433967311e-06, + "loss": 0.1906, + "step": 7400 + }, + { + "epoch": 0.7528192624199939, + "grad_norm": 1.75, + "learning_rate": 7.306662157834185e-06, + "loss": 0.1421, + "step": 7410 + }, + { + "epoch": 0.7538352128416134, + "grad_norm": 1.140625, + "learning_rate": 7.2498038418422145e-06, + "loss": 0.1793, + "step": 7420 + }, + { + "epoch": 0.7548511632632328, + "grad_norm": 2.578125, + "learning_rate": 7.193130077155374e-06, + "loss": 0.1603, + "step": 7430 + }, + { + "epoch": 0.7558671136848522, + "grad_norm": 4.3125, + "learning_rate": 7.13664145301883e-06, + "loss": 0.2169, + "step": 7440 + }, + { + "epoch": 0.7568830641064717, + "grad_norm": 3.078125, + "learning_rate": 7.0803385567528025e-06, + "loss": 0.1685, + "step": 7450 + }, + { + "epoch": 0.757899014528091, + "grad_norm": 3.5625, + "learning_rate": 7.024221973746495e-06, + "loss": 0.2282, + "step": 7460 + }, + { + "epoch": 0.7589149649497104, + "grad_norm": 2.265625, + "learning_rate": 6.968292287451961e-06, + "loss": 0.1786, + "step": 7470 + }, + { + "epoch": 0.7599309153713298, + "grad_norm": 4.71875, + "learning_rate": 6.912550079378091e-06, + "loss": 0.1811, + "step": 7480 + }, + { + "epoch": 0.7609468657929493, + "grad_norm": 2.328125, + "learning_rate": 6.856995929084506e-06, + "loss": 0.1747, + "step": 7490 + }, + { + "epoch": 0.7619628162145687, + "grad_norm": 5.21875, + "learning_rate": 6.801630414175589e-06, + "loss": 0.2028, + "step": 7500 + } + ], + "logging_steps": 10, + "max_steps": 9843, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoints/checkpoint-7500/training_args.bin b/checkpoints/checkpoint-7500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bb0a4ec10ab29ca1942aecdfa4212d352e373f1 --- /dev/null +++ b/checkpoints/checkpoint-7500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 +size 4984 diff --git a/checkpoints/checkpoint-8000/adapter_config.json b/checkpoints/checkpoint-8000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70 --- /dev/null +++ b/checkpoints/checkpoint-8000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "gaussian", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoints/checkpoint-8000/adapter_model.safetensors b/checkpoints/checkpoint-8000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a3e74da16eac9d652cb024c1c96567e92d1eb6b2 --- /dev/null +++ b/checkpoints/checkpoint-8000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34815b44876635d03a12d793111d30ddb5f1ba50b8380dcfca1bf6c33938f840 +size 5919456 diff --git a/checkpoints/checkpoint-8000/optimizer.pt b/checkpoints/checkpoint-8000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..756d0512f4d3fe7c92a0b851ef0cade6d0d5d29c --- /dev/null +++ b/checkpoints/checkpoint-8000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f76507605c949d2190ab2abb8d665fe079ec56e9d7d3261ffaa91dcc3e884b3 +size 11930938 diff --git a/checkpoints/checkpoint-8000/rng_state_0.pth b/checkpoints/checkpoint-8000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..820bae5762cf93cad79c7772fa2a022119a232d9 --- /dev/null +++ b/checkpoints/checkpoint-8000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f2044a5599a741319339b0e82e423a0b7bec9a103bb74fa855a2b26076b2bbe +size 15024 diff --git a/checkpoints/checkpoint-8000/rng_state_1.pth b/checkpoints/checkpoint-8000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..1cab1e079612b70518a476c16ab1414d88aff839 --- /dev/null +++ b/checkpoints/checkpoint-8000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:274ebe0b40c3a495361afcc0dde20f0438b9b75835963738c84e733228f7478e +size 15024 diff --git a/checkpoints/checkpoint-8000/rng_state_2.pth b/checkpoints/checkpoint-8000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..00aef9053f81e7da4386c3746977735f10ec06c9 --- /dev/null +++ b/checkpoints/checkpoint-8000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd18781ccda8ced5e8f822443d08381a84f46d37b63c3de7993f25d8a7b0dac0 +size 15024 diff --git a/checkpoints/checkpoint-8000/rng_state_3.pth b/checkpoints/checkpoint-8000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..928e4beb3f21210417e2df6c6c1dcd68a708f0ee --- /dev/null +++ b/checkpoints/checkpoint-8000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a1dc1515069dbf95fe772f9cfe866820d39bf12f126251b05b193492bf8026b +size 15024 diff --git a/checkpoints/checkpoint-8000/scheduler.pt b/checkpoints/checkpoint-8000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e3e70b0e2f0324728f6d70898373ef2914d119a --- /dev/null +++ b/checkpoints/checkpoint-8000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d390c33d5dd3d95d913d2faf47a8c88adcdfd442ecba2876f37ca1daeb1d2bf4 +size 1064 diff --git a/checkpoints/checkpoint-8000/trainer_state.json b/checkpoints/checkpoint-8000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9cd92bbfdbf0eb2401ba4a0d81d37b675b3a870f --- /dev/null +++ b/checkpoints/checkpoint-8000/trainer_state.json @@ -0,0 +1,5621 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.81276033729554, + "eval_steps": 500, + "global_step": 8000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001015950421619425, + "grad_norm": 17.625, + "learning_rate": 5e-06, + "loss": 3.4264, + "step": 10 + }, + { + "epoch": 0.00203190084323885, + "grad_norm": 12.5625, + "learning_rate": 1e-05, + "loss": 3.432, + "step": 20 + }, + { + "epoch": 0.003047851264858275, + "grad_norm": 14.0625, + "learning_rate": 1.5e-05, + "loss": 3.23, + "step": 30 + }, + { + "epoch": 0.0040638016864777, + "grad_norm": 12.4375, + "learning_rate": 2e-05, + "loss": 2.9762, + "step": 40 + }, + { + "epoch": 0.005079752108097125, + "grad_norm": 10.0625, + "learning_rate": 2.5e-05, + "loss": 2.6173, + "step": 50 + }, + { + "epoch": 0.00609570252971655, + "grad_norm": 10.1875, + "learning_rate": 3e-05, + "loss": 2.2004, + "step": 60 + }, + { + "epoch": 0.007111652951335975, + "grad_norm": 7.03125, + "learning_rate": 3.5e-05, + "loss": 1.4176, + "step": 70 + }, + { + "epoch": 0.0081276033729554, + "grad_norm": 4.375, + "learning_rate": 4e-05, + "loss": 1.0122, + "step": 80 + }, + { + "epoch": 0.009143553794574825, + "grad_norm": 6.5625, + "learning_rate": 4.5e-05, + "loss": 0.9116, + "step": 90 + }, + { + "epoch": 0.01015950421619425, + "grad_norm": 5.28125, + "learning_rate": 5e-05, + "loss": 0.6832, + "step": 100 + }, + { + "epoch": 0.011175454637813675, + "grad_norm": 5.5, + "learning_rate": 4.9999870035728426e-05, + "loss": 0.7355, + "step": 110 + }, + { + "epoch": 0.0121914050594331, + "grad_norm": 5.1875, + "learning_rate": 4.9999480144264944e-05, + "loss": 0.6673, + "step": 120 + }, + { + "epoch": 0.013207355481052525, + "grad_norm": 4.5, + "learning_rate": 4.9998830329663314e-05, + "loss": 0.6792, + "step": 130 + }, + { + "epoch": 0.01422330590267195, + "grad_norm": 3.9375, + "learning_rate": 4.9997920598679756e-05, + "loss": 0.6207, + "step": 140 + }, + { + "epoch": 0.015239256324291375, + "grad_norm": 3.15625, + "learning_rate": 4.999675096077286e-05, + "loss": 0.483, + "step": 150 + }, + { + "epoch": 0.0162552067459108, + "grad_norm": 5.28125, + "learning_rate": 4.999532142810354e-05, + "loss": 0.5319, + "step": 160 + }, + { + "epoch": 0.017271157167530225, + "grad_norm": 4.59375, + "learning_rate": 4.999363201553483e-05, + "loss": 0.6052, + "step": 170 + }, + { + "epoch": 0.01828710758914965, + "grad_norm": 5.03125, + "learning_rate": 4.9991682740631794e-05, + "loss": 0.4258, + "step": 180 + }, + { + "epoch": 0.019303058010769075, + "grad_norm": 3.859375, + "learning_rate": 4.998947362366133e-05, + "loss": 0.4309, + "step": 190 + }, + { + "epoch": 0.0203190084323885, + "grad_norm": 3.328125, + "learning_rate": 4.998700468759193e-05, + "loss": 0.3957, + "step": 200 + }, + { + "epoch": 0.021334958854007924, + "grad_norm": 4.9375, + "learning_rate": 4.9984275958093475e-05, + "loss": 0.4777, + "step": 210 + }, + { + "epoch": 0.02235090927562735, + "grad_norm": 4.78125, + "learning_rate": 4.998128746353695e-05, + "loss": 0.3549, + "step": 220 + }, + { + "epoch": 0.023366859697246774, + "grad_norm": 4.0625, + "learning_rate": 4.997803923499417e-05, + "loss": 0.4447, + "step": 230 + }, + { + "epoch": 0.0243828101188662, + "grad_norm": 6.375, + "learning_rate": 4.99745313062374e-05, + "loss": 0.3808, + "step": 240 + }, + { + "epoch": 0.025398760540485624, + "grad_norm": 3.59375, + "learning_rate": 4.99707637137391e-05, + "loss": 0.3827, + "step": 250 + }, + { + "epoch": 0.02641471096210505, + "grad_norm": 3.015625, + "learning_rate": 4.996673649667145e-05, + "loss": 0.3694, + "step": 260 + }, + { + "epoch": 0.027430661383724474, + "grad_norm": 2.296875, + "learning_rate": 4.9962449696906e-05, + "loss": 0.3586, + "step": 270 + }, + { + "epoch": 0.0284466118053439, + "grad_norm": 4.125, + "learning_rate": 4.9957903359013214e-05, + "loss": 0.3832, + "step": 280 + }, + { + "epoch": 0.029462562226963324, + "grad_norm": 3.296875, + "learning_rate": 4.995309753026201e-05, + "loss": 0.328, + "step": 290 + }, + { + "epoch": 0.03047851264858275, + "grad_norm": 4.5, + "learning_rate": 4.994803226061927e-05, + "loss": 0.3667, + "step": 300 + }, + { + "epoch": 0.03149446307020217, + "grad_norm": 4.3125, + "learning_rate": 4.994270760274933e-05, + "loss": 0.3811, + "step": 310 + }, + { + "epoch": 0.0325104134918216, + "grad_norm": 3.421875, + "learning_rate": 4.99371236120134e-05, + "loss": 0.3065, + "step": 320 + }, + { + "epoch": 0.03352636391344102, + "grad_norm": 4.6875, + "learning_rate": 4.993128034646902e-05, + "loss": 0.4177, + "step": 330 + }, + { + "epoch": 0.03454231433506045, + "grad_norm": 3.046875, + "learning_rate": 4.992517786686947e-05, + "loss": 0.33, + "step": 340 + }, + { + "epoch": 0.03555826475667987, + "grad_norm": 1.8828125, + "learning_rate": 4.9918816236663077e-05, + "loss": 0.3287, + "step": 350 + }, + { + "epoch": 0.0365742151782993, + "grad_norm": 3.8125, + "learning_rate": 4.991219552199262e-05, + "loss": 0.2934, + "step": 360 + }, + { + "epoch": 0.03759016559991872, + "grad_norm": 4.28125, + "learning_rate": 4.99053157916946e-05, + "loss": 0.3176, + "step": 370 + }, + { + "epoch": 0.03860611602153815, + "grad_norm": 2.609375, + "learning_rate": 4.989817711729856e-05, + "loss": 0.3318, + "step": 380 + }, + { + "epoch": 0.03962206644315757, + "grad_norm": 2.375, + "learning_rate": 4.98907795730263e-05, + "loss": 0.3234, + "step": 390 + }, + { + "epoch": 0.040638016864777, + "grad_norm": 4.46875, + "learning_rate": 4.988312323579114e-05, + "loss": 0.267, + "step": 400 + }, + { + "epoch": 0.04165396728639642, + "grad_norm": 3.75, + "learning_rate": 4.98752081851971e-05, + "loss": 0.3081, + "step": 410 + }, + { + "epoch": 0.04266991770801585, + "grad_norm": 2.203125, + "learning_rate": 4.986703450353809e-05, + "loss": 0.2917, + "step": 420 + }, + { + "epoch": 0.04368586812963527, + "grad_norm": 1.6015625, + "learning_rate": 4.985860227579703e-05, + "loss": 0.2805, + "step": 430 + }, + { + "epoch": 0.0447018185512547, + "grad_norm": 3.140625, + "learning_rate": 4.984991158964499e-05, + "loss": 0.3534, + "step": 440 + }, + { + "epoch": 0.04571776897287412, + "grad_norm": 3.296875, + "learning_rate": 4.9840962535440265e-05, + "loss": 0.335, + "step": 450 + }, + { + "epoch": 0.04673371939449355, + "grad_norm": 3.25, + "learning_rate": 4.983175520622744e-05, + "loss": 0.2544, + "step": 460 + }, + { + "epoch": 0.04774966981611297, + "grad_norm": 2.25, + "learning_rate": 4.982228969773642e-05, + "loss": 0.3449, + "step": 470 + }, + { + "epoch": 0.0487656202377324, + "grad_norm": 4.9375, + "learning_rate": 4.9812566108381435e-05, + "loss": 0.2964, + "step": 480 + }, + { + "epoch": 0.04978157065935182, + "grad_norm": 1.5703125, + "learning_rate": 4.9802584539260035e-05, + "loss": 0.2799, + "step": 490 + }, + { + "epoch": 0.05079752108097125, + "grad_norm": 2.828125, + "learning_rate": 4.979234509415199e-05, + "loss": 0.3231, + "step": 500 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 2.9375, + "learning_rate": 4.978184787951828e-05, + "loss": 0.2943, + "step": 510 + }, + { + "epoch": 0.0528294219242101, + "grad_norm": 2.34375, + "learning_rate": 4.977109300449992e-05, + "loss": 0.2705, + "step": 520 + }, + { + "epoch": 0.05384537234582952, + "grad_norm": 3.140625, + "learning_rate": 4.9760080580916876e-05, + "loss": 0.2998, + "step": 530 + }, + { + "epoch": 0.05486132276744895, + "grad_norm": 3.5625, + "learning_rate": 4.974881072326688e-05, + "loss": 0.2595, + "step": 540 + }, + { + "epoch": 0.05587727318906837, + "grad_norm": 4.25, + "learning_rate": 4.9737283548724236e-05, + "loss": 0.2803, + "step": 550 + }, + { + "epoch": 0.0568932236106878, + "grad_norm": 4.0625, + "learning_rate": 4.97254991771386e-05, + "loss": 0.3511, + "step": 560 + }, + { + "epoch": 0.05790917403230722, + "grad_norm": 2.515625, + "learning_rate": 4.971345773103377e-05, + "loss": 0.312, + "step": 570 + }, + { + "epoch": 0.05892512445392665, + "grad_norm": 3.21875, + "learning_rate": 4.9701159335606365e-05, + "loss": 0.2482, + "step": 580 + }, + { + "epoch": 0.05994107487554607, + "grad_norm": 5.5, + "learning_rate": 4.968860411872454e-05, + "loss": 0.2537, + "step": 590 + }, + { + "epoch": 0.0609570252971655, + "grad_norm": 3.546875, + "learning_rate": 4.967579221092666e-05, + "loss": 0.3125, + "step": 600 + }, + { + "epoch": 0.06197297571878492, + "grad_norm": 2.984375, + "learning_rate": 4.966272374541996e-05, + "loss": 0.2354, + "step": 610 + }, + { + "epoch": 0.06298892614040434, + "grad_norm": 3.6875, + "learning_rate": 4.964939885807912e-05, + "loss": 0.3213, + "step": 620 + }, + { + "epoch": 0.06400487656202378, + "grad_norm": 2.140625, + "learning_rate": 4.9635817687444876e-05, + "loss": 0.3003, + "step": 630 + }, + { + "epoch": 0.0650208269836432, + "grad_norm": 3.484375, + "learning_rate": 4.962198037472259e-05, + "loss": 0.2996, + "step": 640 + }, + { + "epoch": 0.06603677740526262, + "grad_norm": 3.21875, + "learning_rate": 4.9607887063780776e-05, + "loss": 0.2257, + "step": 650 + }, + { + "epoch": 0.06705272782688204, + "grad_norm": 5.375, + "learning_rate": 4.9593537901149564e-05, + "loss": 0.223, + "step": 660 + }, + { + "epoch": 0.06806867824850148, + "grad_norm": 4.1875, + "learning_rate": 4.957893303601924e-05, + "loss": 0.3407, + "step": 670 + }, + { + "epoch": 0.0690846286701209, + "grad_norm": 3.328125, + "learning_rate": 4.956407262023866e-05, + "loss": 0.2589, + "step": 680 + }, + { + "epoch": 0.07010057909174032, + "grad_norm": 2.953125, + "learning_rate": 4.954895680831367e-05, + "loss": 0.2949, + "step": 690 + }, + { + "epoch": 0.07111652951335974, + "grad_norm": 4.0625, + "learning_rate": 4.9533585757405506e-05, + "loss": 0.2995, + "step": 700 + }, + { + "epoch": 0.07213247993497918, + "grad_norm": 4.625, + "learning_rate": 4.951795962732917e-05, + "loss": 0.2894, + "step": 710 + }, + { + "epoch": 0.0731484303565986, + "grad_norm": 3.0, + "learning_rate": 4.9502078580551755e-05, + "loss": 0.3082, + "step": 720 + }, + { + "epoch": 0.07416438077821802, + "grad_norm": 3.65625, + "learning_rate": 4.9485942782190734e-05, + "loss": 0.2308, + "step": 730 + }, + { + "epoch": 0.07518033119983744, + "grad_norm": 4.78125, + "learning_rate": 4.9469552400012306e-05, + "loss": 0.2272, + "step": 740 + }, + { + "epoch": 0.07619628162145688, + "grad_norm": 4.25, + "learning_rate": 4.94529076044296e-05, + "loss": 0.2701, + "step": 750 + }, + { + "epoch": 0.0772122320430763, + "grad_norm": 3.140625, + "learning_rate": 4.94360085685009e-05, + "loss": 0.2686, + "step": 760 + }, + { + "epoch": 0.07822818246469572, + "grad_norm": 0.765625, + "learning_rate": 4.9418855467927894e-05, + "loss": 0.2051, + "step": 770 + }, + { + "epoch": 0.07924413288631514, + "grad_norm": 1.796875, + "learning_rate": 4.940144848105379e-05, + "loss": 0.2267, + "step": 780 + }, + { + "epoch": 0.08026008330793458, + "grad_norm": 4.5625, + "learning_rate": 4.93837877888615e-05, + "loss": 0.2597, + "step": 790 + }, + { + "epoch": 0.081276033729554, + "grad_norm": 3.03125, + "learning_rate": 4.9365873574971745e-05, + "loss": 0.3701, + "step": 800 + }, + { + "epoch": 0.08229198415117342, + "grad_norm": 4.5625, + "learning_rate": 4.9347706025641136e-05, + "loss": 0.2559, + "step": 810 + }, + { + "epoch": 0.08330793457279284, + "grad_norm": 3.90625, + "learning_rate": 4.9329285329760275e-05, + "loss": 0.2799, + "step": 820 + }, + { + "epoch": 0.08432388499441228, + "grad_norm": 3.140625, + "learning_rate": 4.9310611678851735e-05, + "loss": 0.2866, + "step": 830 + }, + { + "epoch": 0.0853398354160317, + "grad_norm": 2.46875, + "learning_rate": 4.929168526706811e-05, + "loss": 0.3105, + "step": 840 + }, + { + "epoch": 0.08635578583765112, + "grad_norm": 13.625, + "learning_rate": 4.927250629119e-05, + "loss": 0.2454, + "step": 850 + }, + { + "epoch": 0.08737173625927054, + "grad_norm": 3.921875, + "learning_rate": 4.9253074950623925e-05, + "loss": 0.2424, + "step": 860 + }, + { + "epoch": 0.08838768668088998, + "grad_norm": 2.90625, + "learning_rate": 4.9233391447400286e-05, + "loss": 0.2481, + "step": 870 + }, + { + "epoch": 0.0894036371025094, + "grad_norm": 2.96875, + "learning_rate": 4.921345598617125e-05, + "loss": 0.2231, + "step": 880 + }, + { + "epoch": 0.09041958752412882, + "grad_norm": 5.375, + "learning_rate": 4.9193268774208654e-05, + "loss": 0.3447, + "step": 890 + }, + { + "epoch": 0.09143553794574824, + "grad_norm": 2.0, + "learning_rate": 4.9172830021401785e-05, + "loss": 0.229, + "step": 900 + }, + { + "epoch": 0.09245148836736768, + "grad_norm": 3.1875, + "learning_rate": 4.9152139940255245e-05, + "loss": 0.2122, + "step": 910 + }, + { + "epoch": 0.0934674387889871, + "grad_norm": 3.40625, + "learning_rate": 4.913119874588677e-05, + "loss": 0.2386, + "step": 920 + }, + { + "epoch": 0.09448338921060652, + "grad_norm": 1.4609375, + "learning_rate": 4.911000665602489e-05, + "loss": 0.1944, + "step": 930 + }, + { + "epoch": 0.09549933963222594, + "grad_norm": 5.0625, + "learning_rate": 4.9088563891006786e-05, + "loss": 0.2038, + "step": 940 + }, + { + "epoch": 0.09651529005384538, + "grad_norm": 4.53125, + "learning_rate": 4.906687067377592e-05, + "loss": 0.3122, + "step": 950 + }, + { + "epoch": 0.0975312404754648, + "grad_norm": 2.84375, + "learning_rate": 4.904492722987976e-05, + "loss": 0.3157, + "step": 960 + }, + { + "epoch": 0.09854719089708422, + "grad_norm": 2.171875, + "learning_rate": 4.902273378746738e-05, + "loss": 0.3077, + "step": 970 + }, + { + "epoch": 0.09956314131870364, + "grad_norm": 2.84375, + "learning_rate": 4.9000290577287165e-05, + "loss": 0.2756, + "step": 980 + }, + { + "epoch": 0.10057909174032308, + "grad_norm": 0.99609375, + "learning_rate": 4.897759783268434e-05, + "loss": 0.2915, + "step": 990 + }, + { + "epoch": 0.1015950421619425, + "grad_norm": 3.53125, + "learning_rate": 4.895465578959859e-05, + "loss": 0.2052, + "step": 1000 + }, + { + "epoch": 0.10261099258356192, + "grad_norm": 4.0, + "learning_rate": 4.893146468656159e-05, + "loss": 0.2499, + "step": 1010 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 1.65625, + "learning_rate": 4.890802476469452e-05, + "loss": 0.278, + "step": 1020 + }, + { + "epoch": 0.10464289342680078, + "grad_norm": 3.625, + "learning_rate": 4.888433626770558e-05, + "loss": 0.2143, + "step": 1030 + }, + { + "epoch": 0.1056588438484202, + "grad_norm": 5.0625, + "learning_rate": 4.886039944188741e-05, + "loss": 0.2878, + "step": 1040 + }, + { + "epoch": 0.10667479427003962, + "grad_norm": 4.5, + "learning_rate": 4.883621453611461e-05, + "loss": 0.2744, + "step": 1050 + }, + { + "epoch": 0.10769074469165904, + "grad_norm": 4.5625, + "learning_rate": 4.881178180184106e-05, + "loss": 0.2734, + "step": 1060 + }, + { + "epoch": 0.10870669511327848, + "grad_norm": 3.125, + "learning_rate": 4.878710149309735e-05, + "loss": 0.3574, + "step": 1070 + }, + { + "epoch": 0.1097226455348979, + "grad_norm": 3.0625, + "learning_rate": 4.876217386648816e-05, + "loss": 0.2625, + "step": 1080 + }, + { + "epoch": 0.11073859595651732, + "grad_norm": 4.0625, + "learning_rate": 4.873699918118955e-05, + "loss": 0.2437, + "step": 1090 + }, + { + "epoch": 0.11175454637813674, + "grad_norm": 1.59375, + "learning_rate": 4.87115776989463e-05, + "loss": 0.2051, + "step": 1100 + }, + { + "epoch": 0.11277049679975618, + "grad_norm": 4.375, + "learning_rate": 4.8685909684069153e-05, + "loss": 0.1727, + "step": 1110 + }, + { + "epoch": 0.1137864472213756, + "grad_norm": 2.28125, + "learning_rate": 4.865999540343211e-05, + "loss": 0.2256, + "step": 1120 + }, + { + "epoch": 0.11480239764299502, + "grad_norm": 2.265625, + "learning_rate": 4.86338351264696e-05, + "loss": 0.3529, + "step": 1130 + }, + { + "epoch": 0.11581834806461444, + "grad_norm": 2.34375, + "learning_rate": 4.8607429125173754e-05, + "loss": 0.2113, + "step": 1140 + }, + { + "epoch": 0.11683429848623388, + "grad_norm": 0.7578125, + "learning_rate": 4.858077767409149e-05, + "loss": 0.2759, + "step": 1150 + }, + { + "epoch": 0.1178502489078533, + "grad_norm": 3.640625, + "learning_rate": 4.855388105032174e-05, + "loss": 0.2482, + "step": 1160 + }, + { + "epoch": 0.11886619932947272, + "grad_norm": 3.5, + "learning_rate": 4.852673953351249e-05, + "loss": 0.1865, + "step": 1170 + }, + { + "epoch": 0.11988214975109214, + "grad_norm": 3.75, + "learning_rate": 4.849935340585796e-05, + "loss": 0.2659, + "step": 1180 + }, + { + "epoch": 0.12089810017271158, + "grad_norm": 3.375, + "learning_rate": 4.8471722952095586e-05, + "loss": 0.1506, + "step": 1190 + }, + { + "epoch": 0.121914050594331, + "grad_norm": 3.34375, + "learning_rate": 4.844384845950312e-05, + "loss": 0.307, + "step": 1200 + }, + { + "epoch": 0.12293000101595042, + "grad_norm": 1.578125, + "learning_rate": 4.841573021789561e-05, + "loss": 0.1952, + "step": 1210 + }, + { + "epoch": 0.12394595143756984, + "grad_norm": 1.2890625, + "learning_rate": 4.838736851962239e-05, + "loss": 0.1779, + "step": 1220 + }, + { + "epoch": 0.12496190185918928, + "grad_norm": 1.265625, + "learning_rate": 4.835876365956408e-05, + "loss": 0.1235, + "step": 1230 + }, + { + "epoch": 0.12597785228080868, + "grad_norm": 1.9609375, + "learning_rate": 4.8329915935129436e-05, + "loss": 0.1876, + "step": 1240 + }, + { + "epoch": 0.12699380270242813, + "grad_norm": 1.6328125, + "learning_rate": 4.830082564625235e-05, + "loss": 0.2188, + "step": 1250 + }, + { + "epoch": 0.12800975312404755, + "grad_norm": 3.96875, + "learning_rate": 4.8271493095388684e-05, + "loss": 0.2622, + "step": 1260 + }, + { + "epoch": 0.12902570354566698, + "grad_norm": 3.765625, + "learning_rate": 4.824191858751312e-05, + "loss": 0.2724, + "step": 1270 + }, + { + "epoch": 0.1300416539672864, + "grad_norm": 5.59375, + "learning_rate": 4.821210243011601e-05, + "loss": 0.2413, + "step": 1280 + }, + { + "epoch": 0.13105760438890582, + "grad_norm": 3.34375, + "learning_rate": 4.818204493320016e-05, + "loss": 0.2618, + "step": 1290 + }, + { + "epoch": 0.13207355481052524, + "grad_norm": 2.78125, + "learning_rate": 4.8151746409277634e-05, + "loss": 0.2295, + "step": 1300 + }, + { + "epoch": 0.13308950523214466, + "grad_norm": 3.1875, + "learning_rate": 4.8121207173366484e-05, + "loss": 0.2733, + "step": 1310 + }, + { + "epoch": 0.13410545565376408, + "grad_norm": 2.28125, + "learning_rate": 4.809042754298746e-05, + "loss": 0.2311, + "step": 1320 + }, + { + "epoch": 0.13512140607538353, + "grad_norm": 2.171875, + "learning_rate": 4.805940783816075e-05, + "loss": 0.2059, + "step": 1330 + }, + { + "epoch": 0.13613735649700295, + "grad_norm": 2.796875, + "learning_rate": 4.8028148381402625e-05, + "loss": 0.2102, + "step": 1340 + }, + { + "epoch": 0.13715330691862238, + "grad_norm": 2.96875, + "learning_rate": 4.7996649497722084e-05, + "loss": 0.2708, + "step": 1350 + }, + { + "epoch": 0.1381692573402418, + "grad_norm": 2.4375, + "learning_rate": 4.7964911514617485e-05, + "loss": 0.2429, + "step": 1360 + }, + { + "epoch": 0.13918520776186122, + "grad_norm": 5.8125, + "learning_rate": 4.793293476207312e-05, + "loss": 0.2725, + "step": 1370 + }, + { + "epoch": 0.14020115818348064, + "grad_norm": 2.40625, + "learning_rate": 4.790071957255585e-05, + "loss": 0.2098, + "step": 1380 + }, + { + "epoch": 0.14121710860510006, + "grad_norm": 4.25, + "learning_rate": 4.786826628101154e-05, + "loss": 0.2101, + "step": 1390 + }, + { + "epoch": 0.14223305902671948, + "grad_norm": 2.578125, + "learning_rate": 4.783557522486167e-05, + "loss": 0.2624, + "step": 1400 + }, + { + "epoch": 0.14324900944833893, + "grad_norm": 3.125, + "learning_rate": 4.780264674399978e-05, + "loss": 0.2518, + "step": 1410 + }, + { + "epoch": 0.14426495986995835, + "grad_norm": 3.671875, + "learning_rate": 4.7769481180787966e-05, + "loss": 0.3112, + "step": 1420 + }, + { + "epoch": 0.14528091029157778, + "grad_norm": 3.984375, + "learning_rate": 4.773607888005327e-05, + "loss": 0.2747, + "step": 1430 + }, + { + "epoch": 0.1462968607131972, + "grad_norm": 3.234375, + "learning_rate": 4.770244018908416e-05, + "loss": 0.1572, + "step": 1440 + }, + { + "epoch": 0.14731281113481662, + "grad_norm": 4.09375, + "learning_rate": 4.766856545762687e-05, + "loss": 0.2148, + "step": 1450 + }, + { + "epoch": 0.14832876155643604, + "grad_norm": 1.6875, + "learning_rate": 4.763445503788178e-05, + "loss": 0.2531, + "step": 1460 + }, + { + "epoch": 0.14934471197805546, + "grad_norm": 2.375, + "learning_rate": 4.760010928449976e-05, + "loss": 0.199, + "step": 1470 + }, + { + "epoch": 0.15036066239967488, + "grad_norm": 4.6875, + "learning_rate": 4.7565528554578485e-05, + "loss": 0.2366, + "step": 1480 + }, + { + "epoch": 0.15137661282129433, + "grad_norm": 5.4375, + "learning_rate": 4.75307132076587e-05, + "loss": 0.1862, + "step": 1490 + }, + { + "epoch": 0.15239256324291375, + "grad_norm": 2.484375, + "learning_rate": 4.749566360572049e-05, + "loss": 0.2143, + "step": 1500 + }, + { + "epoch": 0.15340851366453317, + "grad_norm": 2.1875, + "learning_rate": 4.746038011317955e-05, + "loss": 0.1877, + "step": 1510 + }, + { + "epoch": 0.1544244640861526, + "grad_norm": 2.84375, + "learning_rate": 4.742486309688333e-05, + "loss": 0.2831, + "step": 1520 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 2.015625, + "learning_rate": 4.738911292610732e-05, + "loss": 0.1708, + "step": 1530 + }, + { + "epoch": 0.15645636492939144, + "grad_norm": 3.953125, + "learning_rate": 4.735312997255107e-05, + "loss": 0.192, + "step": 1540 + }, + { + "epoch": 0.15747231535101086, + "grad_norm": 2.09375, + "learning_rate": 4.7316914610334475e-05, + "loss": 0.2586, + "step": 1550 + }, + { + "epoch": 0.15848826577263028, + "grad_norm": 3.6875, + "learning_rate": 4.728046721599378e-05, + "loss": 0.2141, + "step": 1560 + }, + { + "epoch": 0.15950421619424973, + "grad_norm": 2.9375, + "learning_rate": 4.724378816847771e-05, + "loss": 0.193, + "step": 1570 + }, + { + "epoch": 0.16052016661586915, + "grad_norm": 1.5625, + "learning_rate": 4.720687784914352e-05, + "loss": 0.191, + "step": 1580 + }, + { + "epoch": 0.16153611703748857, + "grad_norm": 3.75, + "learning_rate": 4.716973664175304e-05, + "loss": 0.2172, + "step": 1590 + }, + { + "epoch": 0.162552067459108, + "grad_norm": 3.125, + "learning_rate": 4.7132364932468645e-05, + "loss": 0.2134, + "step": 1600 + }, + { + "epoch": 0.16356801788072742, + "grad_norm": 4.09375, + "learning_rate": 4.709476310984932e-05, + "loss": 0.2055, + "step": 1610 + }, + { + "epoch": 0.16458396830234684, + "grad_norm": 3.875, + "learning_rate": 4.705693156484652e-05, + "loss": 0.2136, + "step": 1620 + }, + { + "epoch": 0.16559991872396626, + "grad_norm": 1.1796875, + "learning_rate": 4.7018870690800196e-05, + "loss": 0.1471, + "step": 1630 + }, + { + "epoch": 0.16661586914558568, + "grad_norm": 2.5, + "learning_rate": 4.698058088343465e-05, + "loss": 0.2308, + "step": 1640 + }, + { + "epoch": 0.16763181956720513, + "grad_norm": 1.390625, + "learning_rate": 4.6942062540854425e-05, + "loss": 0.2456, + "step": 1650 + }, + { + "epoch": 0.16864776998882455, + "grad_norm": 3.125, + "learning_rate": 4.69033160635402e-05, + "loss": 0.2654, + "step": 1660 + }, + { + "epoch": 0.16966372041044397, + "grad_norm": 3.984375, + "learning_rate": 4.6864341854344587e-05, + "loss": 0.2226, + "step": 1670 + }, + { + "epoch": 0.1706796708320634, + "grad_norm": 2.328125, + "learning_rate": 4.682514031848795e-05, + "loss": 0.2438, + "step": 1680 + }, + { + "epoch": 0.17169562125368282, + "grad_norm": 3.078125, + "learning_rate": 4.678571186355423e-05, + "loss": 0.1889, + "step": 1690 + }, + { + "epoch": 0.17271157167530224, + "grad_norm": 3.328125, + "learning_rate": 4.6746056899486644e-05, + "loss": 0.2117, + "step": 1700 + }, + { + "epoch": 0.17372752209692166, + "grad_norm": 2.78125, + "learning_rate": 4.67061758385835e-05, + "loss": 0.1953, + "step": 1710 + }, + { + "epoch": 0.17474347251854108, + "grad_norm": 3.09375, + "learning_rate": 4.6666069095493816e-05, + "loss": 0.1844, + "step": 1720 + }, + { + "epoch": 0.17575942294016053, + "grad_norm": 3.234375, + "learning_rate": 4.662573708721309e-05, + "loss": 0.2774, + "step": 1730 + }, + { + "epoch": 0.17677537336177995, + "grad_norm": 4.03125, + "learning_rate": 4.658518023307894e-05, + "loss": 0.2527, + "step": 1740 + }, + { + "epoch": 0.17779132378339937, + "grad_norm": 3.21875, + "learning_rate": 4.654439895476671e-05, + "loss": 0.2164, + "step": 1750 + }, + { + "epoch": 0.1788072742050188, + "grad_norm": 2.390625, + "learning_rate": 4.6503393676285146e-05, + "loss": 0.2424, + "step": 1760 + }, + { + "epoch": 0.17982322462663822, + "grad_norm": 1.8359375, + "learning_rate": 4.646216482397192e-05, + "loss": 0.2428, + "step": 1770 + }, + { + "epoch": 0.18083917504825764, + "grad_norm": 2.796875, + "learning_rate": 4.6420712826489275e-05, + "loss": 0.2155, + "step": 1780 + }, + { + "epoch": 0.18185512546987706, + "grad_norm": 0.69921875, + "learning_rate": 4.6379038114819485e-05, + "loss": 0.1544, + "step": 1790 + }, + { + "epoch": 0.18287107589149648, + "grad_norm": 3.40625, + "learning_rate": 4.6337141122260444e-05, + "loss": 0.2029, + "step": 1800 + }, + { + "epoch": 0.18388702631311593, + "grad_norm": 2.359375, + "learning_rate": 4.629502228442112e-05, + "loss": 0.1489, + "step": 1810 + }, + { + "epoch": 0.18490297673473535, + "grad_norm": 1.4453125, + "learning_rate": 4.6252682039217045e-05, + "loss": 0.2101, + "step": 1820 + }, + { + "epoch": 0.18591892715635477, + "grad_norm": 2.71875, + "learning_rate": 4.621012082686573e-05, + "loss": 0.2076, + "step": 1830 + }, + { + "epoch": 0.1869348775779742, + "grad_norm": 3.0625, + "learning_rate": 4.616733908988216e-05, + "loss": 0.2719, + "step": 1840 + }, + { + "epoch": 0.18795082799959362, + "grad_norm": 1.953125, + "learning_rate": 4.612433727307409e-05, + "loss": 0.2105, + "step": 1850 + }, + { + "epoch": 0.18896677842121304, + "grad_norm": 3.46875, + "learning_rate": 4.608111582353751e-05, + "loss": 0.1877, + "step": 1860 + }, + { + "epoch": 0.18998272884283246, + "grad_norm": 2.546875, + "learning_rate": 4.603767519065197e-05, + "loss": 0.2238, + "step": 1870 + }, + { + "epoch": 0.19099867926445188, + "grad_norm": 1.5703125, + "learning_rate": 4.599401582607589e-05, + "loss": 0.243, + "step": 1880 + }, + { + "epoch": 0.19201462968607133, + "grad_norm": 2.4375, + "learning_rate": 4.595013818374185e-05, + "loss": 0.1867, + "step": 1890 + }, + { + "epoch": 0.19303058010769075, + "grad_norm": 2.203125, + "learning_rate": 4.5906042719851925e-05, + "loss": 0.1994, + "step": 1900 + }, + { + "epoch": 0.19404653052931017, + "grad_norm": 3.984375, + "learning_rate": 4.586172989287291e-05, + "loss": 0.1899, + "step": 1910 + }, + { + "epoch": 0.1950624809509296, + "grad_norm": 2.6875, + "learning_rate": 4.5817200163531534e-05, + "loss": 0.2528, + "step": 1920 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 2.71875, + "learning_rate": 4.577245399480972e-05, + "loss": 0.2336, + "step": 1930 + }, + { + "epoch": 0.19709438179416844, + "grad_norm": 2.640625, + "learning_rate": 4.5727491851939715e-05, + "loss": 0.2204, + "step": 1940 + }, + { + "epoch": 0.19811033221578786, + "grad_norm": 1.78125, + "learning_rate": 4.568231420239929e-05, + "loss": 0.1656, + "step": 1950 + }, + { + "epoch": 0.19912628263740728, + "grad_norm": 3.15625, + "learning_rate": 4.563692151590687e-05, + "loss": 0.2105, + "step": 1960 + }, + { + "epoch": 0.20014223305902673, + "grad_norm": 1.3671875, + "learning_rate": 4.5591314264416666e-05, + "loss": 0.1464, + "step": 1970 + }, + { + "epoch": 0.20115818348064615, + "grad_norm": 4.25, + "learning_rate": 4.554549292211371e-05, + "loss": 0.2103, + "step": 1980 + }, + { + "epoch": 0.20217413390226557, + "grad_norm": 2.984375, + "learning_rate": 4.549945796540901e-05, + "loss": 0.144, + "step": 1990 + }, + { + "epoch": 0.203190084323885, + "grad_norm": 1.859375, + "learning_rate": 4.545320987293453e-05, + "loss": 0.1963, + "step": 2000 + }, + { + "epoch": 0.20420603474550442, + "grad_norm": 1.078125, + "learning_rate": 4.540674912553824e-05, + "loss": 0.2115, + "step": 2010 + }, + { + "epoch": 0.20522198516712384, + "grad_norm": 4.25, + "learning_rate": 4.536007620627911e-05, + "loss": 0.1682, + "step": 2020 + }, + { + "epoch": 0.20623793558874326, + "grad_norm": 2.71875, + "learning_rate": 4.531319160042212e-05, + "loss": 0.1992, + "step": 2030 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 1.2890625, + "learning_rate": 4.5266095795433126e-05, + "loss": 0.1134, + "step": 2040 + }, + { + "epoch": 0.20826983643198213, + "grad_norm": 3.296875, + "learning_rate": 4.5218789280973925e-05, + "loss": 0.1474, + "step": 2050 + }, + { + "epoch": 0.20928578685360155, + "grad_norm": 1.9375, + "learning_rate": 4.5171272548897024e-05, + "loss": 0.1955, + "step": 2060 + }, + { + "epoch": 0.21030173727522097, + "grad_norm": 2.734375, + "learning_rate": 4.512354609324063e-05, + "loss": 0.2042, + "step": 2070 + }, + { + "epoch": 0.2113176876968404, + "grad_norm": 2.921875, + "learning_rate": 4.507561041022347e-05, + "loss": 0.2174, + "step": 2080 + }, + { + "epoch": 0.21233363811845982, + "grad_norm": 2.40625, + "learning_rate": 4.502746599823963e-05, + "loss": 0.2634, + "step": 2090 + }, + { + "epoch": 0.21334958854007924, + "grad_norm": 1.71875, + "learning_rate": 4.497911335785339e-05, + "loss": 0.1884, + "step": 2100 + }, + { + "epoch": 0.21436553896169866, + "grad_norm": 0.79296875, + "learning_rate": 4.4930552991794e-05, + "loss": 0.1872, + "step": 2110 + }, + { + "epoch": 0.21538148938331808, + "grad_norm": 3.171875, + "learning_rate": 4.4881785404950474e-05, + "loss": 0.2233, + "step": 2120 + }, + { + "epoch": 0.21639743980493753, + "grad_norm": 2.59375, + "learning_rate": 4.483281110436631e-05, + "loss": 0.2374, + "step": 2130 + }, + { + "epoch": 0.21741339022655695, + "grad_norm": 3.328125, + "learning_rate": 4.478363059923426e-05, + "loss": 0.2545, + "step": 2140 + }, + { + "epoch": 0.21842934064817637, + "grad_norm": 2.3125, + "learning_rate": 4.4734244400891014e-05, + "loss": 0.2063, + "step": 2150 + }, + { + "epoch": 0.2194452910697958, + "grad_norm": 3.40625, + "learning_rate": 4.4684653022811865e-05, + "loss": 0.1219, + "step": 2160 + }, + { + "epoch": 0.22046124149141522, + "grad_norm": 4.1875, + "learning_rate": 4.463485698060541e-05, + "loss": 0.2805, + "step": 2170 + }, + { + "epoch": 0.22147719191303464, + "grad_norm": 2.3125, + "learning_rate": 4.458485679200814e-05, + "loss": 0.1998, + "step": 2180 + }, + { + "epoch": 0.22249314233465406, + "grad_norm": 3.578125, + "learning_rate": 4.453465297687912e-05, + "loss": 0.2489, + "step": 2190 + }, + { + "epoch": 0.22350909275627348, + "grad_norm": 2.59375, + "learning_rate": 4.448424605719452e-05, + "loss": 0.2731, + "step": 2200 + }, + { + "epoch": 0.22452504317789293, + "grad_norm": 3.28125, + "learning_rate": 4.443363655704224e-05, + "loss": 0.2425, + "step": 2210 + }, + { + "epoch": 0.22554099359951235, + "grad_norm": 2.78125, + "learning_rate": 4.438282500261641e-05, + "loss": 0.2938, + "step": 2220 + }, + { + "epoch": 0.22655694402113177, + "grad_norm": 1.1953125, + "learning_rate": 4.433181192221197e-05, + "loss": 0.1728, + "step": 2230 + }, + { + "epoch": 0.2275728944427512, + "grad_norm": 1.34375, + "learning_rate": 4.4280597846219155e-05, + "loss": 0.216, + "step": 2240 + }, + { + "epoch": 0.22858884486437062, + "grad_norm": 1.8515625, + "learning_rate": 4.422918330711796e-05, + "loss": 0.1612, + "step": 2250 + }, + { + "epoch": 0.22960479528599004, + "grad_norm": 1.90625, + "learning_rate": 4.417756883947263e-05, + "loss": 0.107, + "step": 2260 + }, + { + "epoch": 0.23062074570760946, + "grad_norm": 3.375, + "learning_rate": 4.412575497992611e-05, + "loss": 0.1756, + "step": 2270 + }, + { + "epoch": 0.23163669612922888, + "grad_norm": 4.375, + "learning_rate": 4.407374226719445e-05, + "loss": 0.234, + "step": 2280 + }, + { + "epoch": 0.23265264655084833, + "grad_norm": 3.25, + "learning_rate": 4.402153124206119e-05, + "loss": 0.2144, + "step": 2290 + }, + { + "epoch": 0.23366859697246775, + "grad_norm": 1.703125, + "learning_rate": 4.396912244737173e-05, + "loss": 0.1696, + "step": 2300 + }, + { + "epoch": 0.23468454739408717, + "grad_norm": 2.84375, + "learning_rate": 4.391651642802778e-05, + "loss": 0.2506, + "step": 2310 + }, + { + "epoch": 0.2357004978157066, + "grad_norm": 4.5, + "learning_rate": 4.386371373098155e-05, + "loss": 0.1686, + "step": 2320 + }, + { + "epoch": 0.23671644823732602, + "grad_norm": 2.515625, + "learning_rate": 4.381071490523018e-05, + "loss": 0.2403, + "step": 2330 + }, + { + "epoch": 0.23773239865894544, + "grad_norm": 4.4375, + "learning_rate": 4.3757520501809955e-05, + "loss": 0.1611, + "step": 2340 + }, + { + "epoch": 0.23874834908056486, + "grad_norm": 1.609375, + "learning_rate": 4.370413107379065e-05, + "loss": 0.1698, + "step": 2350 + }, + { + "epoch": 0.23976429950218428, + "grad_norm": 4.96875, + "learning_rate": 4.36505471762697e-05, + "loss": 0.1928, + "step": 2360 + }, + { + "epoch": 0.24078024992380373, + "grad_norm": 0.8984375, + "learning_rate": 4.3596769366366474e-05, + "loss": 0.2035, + "step": 2370 + }, + { + "epoch": 0.24179620034542315, + "grad_norm": 5.75, + "learning_rate": 4.354279820321649e-05, + "loss": 0.16, + "step": 2380 + }, + { + "epoch": 0.24281215076704257, + "grad_norm": 1.9453125, + "learning_rate": 4.34886342479656e-05, + "loss": 0.1851, + "step": 2390 + }, + { + "epoch": 0.243828101188662, + "grad_norm": 1.015625, + "learning_rate": 4.34342780637641e-05, + "loss": 0.1726, + "step": 2400 + }, + { + "epoch": 0.24484405161028142, + "grad_norm": 4.59375, + "learning_rate": 4.337973021576095e-05, + "loss": 0.2847, + "step": 2410 + }, + { + "epoch": 0.24586000203190084, + "grad_norm": 1.03125, + "learning_rate": 4.3324991271097846e-05, + "loss": 0.2528, + "step": 2420 + }, + { + "epoch": 0.24687595245352026, + "grad_norm": 2.1875, + "learning_rate": 4.3270061798903374e-05, + "loss": 0.1573, + "step": 2430 + }, + { + "epoch": 0.24789190287513968, + "grad_norm": 0.98046875, + "learning_rate": 4.321494237028701e-05, + "loss": 0.1703, + "step": 2440 + }, + { + "epoch": 0.24890785329675913, + "grad_norm": 3.8125, + "learning_rate": 4.31596335583333e-05, + "loss": 0.2613, + "step": 2450 + }, + { + "epoch": 0.24992380371837855, + "grad_norm": 4.0625, + "learning_rate": 4.310413593809579e-05, + "loss": 0.22, + "step": 2460 + }, + { + "epoch": 0.250939754139998, + "grad_norm": 3.15625, + "learning_rate": 4.304845008659108e-05, + "loss": 0.1263, + "step": 2470 + }, + { + "epoch": 0.25195570456161737, + "grad_norm": 3.046875, + "learning_rate": 4.2992576582792895e-05, + "loss": 0.1639, + "step": 2480 + }, + { + "epoch": 0.2529716549832368, + "grad_norm": 9.8125, + "learning_rate": 4.293651600762595e-05, + "loss": 0.2681, + "step": 2490 + }, + { + "epoch": 0.25398760540485626, + "grad_norm": 3.734375, + "learning_rate": 4.288026894395999e-05, + "loss": 0.2292, + "step": 2500 + }, + { + "epoch": 0.25500355582647566, + "grad_norm": 0.455078125, + "learning_rate": 4.2823835976603723e-05, + "loss": 0.2324, + "step": 2510 + }, + { + "epoch": 0.2560195062480951, + "grad_norm": 5.625, + "learning_rate": 4.276721769229869e-05, + "loss": 0.1834, + "step": 2520 + }, + { + "epoch": 0.2570354566697145, + "grad_norm": 1.3671875, + "learning_rate": 4.271041467971323e-05, + "loss": 0.1826, + "step": 2530 + }, + { + "epoch": 0.25805140709133395, + "grad_norm": 5.0625, + "learning_rate": 4.265342752943632e-05, + "loss": 0.2463, + "step": 2540 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 2.859375, + "learning_rate": 4.2596256833971425e-05, + "loss": 0.2598, + "step": 2550 + }, + { + "epoch": 0.2600833079345728, + "grad_norm": 1.8515625, + "learning_rate": 4.2538903187730374e-05, + "loss": 0.1148, + "step": 2560 + }, + { + "epoch": 0.26109925835619224, + "grad_norm": 2.71875, + "learning_rate": 4.248136718702716e-05, + "loss": 0.2123, + "step": 2570 + }, + { + "epoch": 0.26211520877781164, + "grad_norm": 4.5625, + "learning_rate": 4.242364943007172e-05, + "loss": 0.2369, + "step": 2580 + }, + { + "epoch": 0.2631311591994311, + "grad_norm": 2.296875, + "learning_rate": 4.236575051696377e-05, + "loss": 0.261, + "step": 2590 + }, + { + "epoch": 0.2641471096210505, + "grad_norm": 2.75, + "learning_rate": 4.2307671049686514e-05, + "loss": 0.1564, + "step": 2600 + }, + { + "epoch": 0.26516306004266993, + "grad_norm": 3.5, + "learning_rate": 4.2249411632100396e-05, + "loss": 0.1563, + "step": 2610 + }, + { + "epoch": 0.2661790104642893, + "grad_norm": 2.84375, + "learning_rate": 4.219097286993684e-05, + "loss": 0.1697, + "step": 2620 + }, + { + "epoch": 0.26719496088590877, + "grad_norm": 2.125, + "learning_rate": 4.2132355370791946e-05, + "loss": 0.1844, + "step": 2630 + }, + { + "epoch": 0.26821091130752817, + "grad_norm": 4.03125, + "learning_rate": 4.2073559744120156e-05, + "loss": 0.2144, + "step": 2640 + }, + { + "epoch": 0.2692268617291476, + "grad_norm": 2.375, + "learning_rate": 4.201458660122793e-05, + "loss": 0.2013, + "step": 2650 + }, + { + "epoch": 0.27024281215076706, + "grad_norm": 3.625, + "learning_rate": 4.1955436555267393e-05, + "loss": 0.2166, + "step": 2660 + }, + { + "epoch": 0.27125876257238646, + "grad_norm": 0.328125, + "learning_rate": 4.189611022122997e-05, + "loss": 0.1934, + "step": 2670 + }, + { + "epoch": 0.2722747129940059, + "grad_norm": 2.75, + "learning_rate": 4.1836608215939944e-05, + "loss": 0.2157, + "step": 2680 + }, + { + "epoch": 0.2732906634156253, + "grad_norm": 3.5, + "learning_rate": 4.17769311580481e-05, + "loss": 0.18, + "step": 2690 + }, + { + "epoch": 0.27430661383724475, + "grad_norm": 2.109375, + "learning_rate": 4.171707966802528e-05, + "loss": 0.2178, + "step": 2700 + }, + { + "epoch": 0.27532256425886414, + "grad_norm": 4.65625, + "learning_rate": 4.16570543681559e-05, + "loss": 0.1896, + "step": 2710 + }, + { + "epoch": 0.2763385146804836, + "grad_norm": 4.8125, + "learning_rate": 4.159685588253151e-05, + "loss": 0.1322, + "step": 2720 + }, + { + "epoch": 0.27735446510210304, + "grad_norm": 3.9375, + "learning_rate": 4.153648483704429e-05, + "loss": 0.184, + "step": 2730 + }, + { + "epoch": 0.27837041552372244, + "grad_norm": 4.53125, + "learning_rate": 4.147594185938057e-05, + "loss": 0.2451, + "step": 2740 + }, + { + "epoch": 0.2793863659453419, + "grad_norm": 1.0390625, + "learning_rate": 4.141522757901426e-05, + "loss": 0.2367, + "step": 2750 + }, + { + "epoch": 0.2804023163669613, + "grad_norm": 3.375, + "learning_rate": 4.1354342627200345e-05, + "loss": 0.179, + "step": 2760 + }, + { + "epoch": 0.28141826678858073, + "grad_norm": 2.953125, + "learning_rate": 4.1293287636968286e-05, + "loss": 0.1396, + "step": 2770 + }, + { + "epoch": 0.2824342172102001, + "grad_norm": 2.546875, + "learning_rate": 4.1232063243115485e-05, + "loss": 0.1963, + "step": 2780 + }, + { + "epoch": 0.28345016763181957, + "grad_norm": 5.09375, + "learning_rate": 4.117067008220063e-05, + "loss": 0.2457, + "step": 2790 + }, + { + "epoch": 0.28446611805343897, + "grad_norm": 2.046875, + "learning_rate": 4.110910879253712e-05, + "loss": 0.2262, + "step": 2800 + }, + { + "epoch": 0.2854820684750584, + "grad_norm": 2.1875, + "learning_rate": 4.104738001418641e-05, + "loss": 0.2499, + "step": 2810 + }, + { + "epoch": 0.28649801889667786, + "grad_norm": 2.59375, + "learning_rate": 4.098548438895135e-05, + "loss": 0.1667, + "step": 2820 + }, + { + "epoch": 0.28751396931829726, + "grad_norm": 2.875, + "learning_rate": 4.092342256036954e-05, + "loss": 0.2288, + "step": 2830 + }, + { + "epoch": 0.2885299197399167, + "grad_norm": 3.015625, + "learning_rate": 4.086119517370659e-05, + "loss": 0.2038, + "step": 2840 + }, + { + "epoch": 0.2895458701615361, + "grad_norm": 3.53125, + "learning_rate": 4.0798802875949485e-05, + "loss": 0.181, + "step": 2850 + }, + { + "epoch": 0.29056182058315555, + "grad_norm": 2.296875, + "learning_rate": 4.073624631579975e-05, + "loss": 0.1886, + "step": 2860 + }, + { + "epoch": 0.29157777100477494, + "grad_norm": 3.609375, + "learning_rate": 4.067352614366685e-05, + "loss": 0.2053, + "step": 2870 + }, + { + "epoch": 0.2925937214263944, + "grad_norm": 2.328125, + "learning_rate": 4.061064301166128e-05, + "loss": 0.1409, + "step": 2880 + }, + { + "epoch": 0.29360967184801384, + "grad_norm": 4.9375, + "learning_rate": 4.054759757358787e-05, + "loss": 0.184, + "step": 2890 + }, + { + "epoch": 0.29462562226963324, + "grad_norm": 4.6875, + "learning_rate": 4.048439048493898e-05, + "loss": 0.2306, + "step": 2900 + }, + { + "epoch": 0.2956415726912527, + "grad_norm": 4.09375, + "learning_rate": 4.0421022402887676e-05, + "loss": 0.1914, + "step": 2910 + }, + { + "epoch": 0.2966575231128721, + "grad_norm": 2.3125, + "learning_rate": 4.035749398628088e-05, + "loss": 0.1653, + "step": 2920 + }, + { + "epoch": 0.29767347353449153, + "grad_norm": 2.515625, + "learning_rate": 4.029380589563256e-05, + "loss": 0.1941, + "step": 2930 + }, + { + "epoch": 0.2986894239561109, + "grad_norm": 1.78125, + "learning_rate": 4.02299587931168e-05, + "loss": 0.1117, + "step": 2940 + }, + { + "epoch": 0.29970537437773037, + "grad_norm": 0.8359375, + "learning_rate": 4.0165953342560974e-05, + "loss": 0.1605, + "step": 2950 + }, + { + "epoch": 0.30072132479934977, + "grad_norm": 3.046875, + "learning_rate": 4.010179020943884e-05, + "loss": 0.1726, + "step": 2960 + }, + { + "epoch": 0.3017372752209692, + "grad_norm": 3.453125, + "learning_rate": 4.003747006086357e-05, + "loss": 0.2208, + "step": 2970 + }, + { + "epoch": 0.30275322564258866, + "grad_norm": 2.515625, + "learning_rate": 3.9972993565580866e-05, + "loss": 0.1325, + "step": 2980 + }, + { + "epoch": 0.30376917606420806, + "grad_norm": 3.046875, + "learning_rate": 3.9908361393962e-05, + "loss": 0.2014, + "step": 2990 + }, + { + "epoch": 0.3047851264858275, + "grad_norm": 2.28125, + "learning_rate": 3.984357421799681e-05, + "loss": 0.165, + "step": 3000 + }, + { + "epoch": 0.3058010769074469, + "grad_norm": 5.09375, + "learning_rate": 3.9778632711286756e-05, + "loss": 0.212, + "step": 3010 + }, + { + "epoch": 0.30681702732906635, + "grad_norm": 4.25, + "learning_rate": 3.971353754903788e-05, + "loss": 0.2388, + "step": 3020 + }, + { + "epoch": 0.30783297775068574, + "grad_norm": 2.34375, + "learning_rate": 3.964828940805381e-05, + "loss": 0.2175, + "step": 3030 + }, + { + "epoch": 0.3088489281723052, + "grad_norm": 4.09375, + "learning_rate": 3.95828889667287e-05, + "loss": 0.2088, + "step": 3040 + }, + { + "epoch": 0.30986487859392464, + "grad_norm": 2.359375, + "learning_rate": 3.9517336905040244e-05, + "loss": 0.1913, + "step": 3050 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 1.1640625, + "learning_rate": 3.9451633904542483e-05, + "loss": 0.2185, + "step": 3060 + }, + { + "epoch": 0.3118967794371635, + "grad_norm": 2.59375, + "learning_rate": 3.9385780648358846e-05, + "loss": 0.2072, + "step": 3070 + }, + { + "epoch": 0.3129127298587829, + "grad_norm": 3.015625, + "learning_rate": 3.9319777821174955e-05, + "loss": 0.1902, + "step": 3080 + }, + { + "epoch": 0.31392868028040233, + "grad_norm": 2.375, + "learning_rate": 3.925362610923158e-05, + "loss": 0.259, + "step": 3090 + }, + { + "epoch": 0.3149446307020217, + "grad_norm": 4.65625, + "learning_rate": 3.918732620031742e-05, + "loss": 0.2026, + "step": 3100 + }, + { + "epoch": 0.31596058112364117, + "grad_norm": 2.1875, + "learning_rate": 3.912087878376205e-05, + "loss": 0.1478, + "step": 3110 + }, + { + "epoch": 0.31697653154526056, + "grad_norm": 2.34375, + "learning_rate": 3.905428455042865e-05, + "loss": 0.167, + "step": 3120 + }, + { + "epoch": 0.31799248196688, + "grad_norm": 2.390625, + "learning_rate": 3.898754419270693e-05, + "loss": 0.1629, + "step": 3130 + }, + { + "epoch": 0.31900843238849946, + "grad_norm": 1.546875, + "learning_rate": 3.892065840450583e-05, + "loss": 0.1308, + "step": 3140 + }, + { + "epoch": 0.32002438281011886, + "grad_norm": 4.625, + "learning_rate": 3.885362788124637e-05, + "loss": 0.2008, + "step": 3150 + }, + { + "epoch": 0.3210403332317383, + "grad_norm": 3.8125, + "learning_rate": 3.8786453319854396e-05, + "loss": 0.2225, + "step": 3160 + }, + { + "epoch": 0.3220562836533577, + "grad_norm": 3.015625, + "learning_rate": 3.8719135418753366e-05, + "loss": 0.2243, + "step": 3170 + }, + { + "epoch": 0.32307223407497715, + "grad_norm": 5.6875, + "learning_rate": 3.865167487785702e-05, + "loss": 0.1981, + "step": 3180 + }, + { + "epoch": 0.32408818449659654, + "grad_norm": 4.84375, + "learning_rate": 3.8584072398562164e-05, + "loss": 0.2031, + "step": 3190 + }, + { + "epoch": 0.325104134918216, + "grad_norm": 4.0625, + "learning_rate": 3.851632868374136e-05, + "loss": 0.1621, + "step": 3200 + }, + { + "epoch": 0.32612008533983544, + "grad_norm": 3.421875, + "learning_rate": 3.844844443773562e-05, + "loss": 0.1674, + "step": 3210 + }, + { + "epoch": 0.32713603576145484, + "grad_norm": 1.3671875, + "learning_rate": 3.8380420366347046e-05, + "loss": 0.1502, + "step": 3220 + }, + { + "epoch": 0.3281519861830743, + "grad_norm": 3.734375, + "learning_rate": 3.831225717683157e-05, + "loss": 0.1868, + "step": 3230 + }, + { + "epoch": 0.3291679366046937, + "grad_norm": 2.703125, + "learning_rate": 3.8243955577891534e-05, + "loss": 0.1818, + "step": 3240 + }, + { + "epoch": 0.3301838870263131, + "grad_norm": 3.796875, + "learning_rate": 3.8175516279668335e-05, + "loss": 0.2215, + "step": 3250 + }, + { + "epoch": 0.3311998374479325, + "grad_norm": 3.203125, + "learning_rate": 3.810693999373505e-05, + "loss": 0.2544, + "step": 3260 + }, + { + "epoch": 0.33221578786955197, + "grad_norm": 4.0, + "learning_rate": 3.8038227433089056e-05, + "loss": 0.1175, + "step": 3270 + }, + { + "epoch": 0.33323173829117136, + "grad_norm": 3.625, + "learning_rate": 3.796937931214458e-05, + "loss": 0.2213, + "step": 3280 + }, + { + "epoch": 0.3342476887127908, + "grad_norm": 1.7265625, + "learning_rate": 3.7900396346725296e-05, + "loss": 0.1711, + "step": 3290 + }, + { + "epoch": 0.33526363913441026, + "grad_norm": 3.140625, + "learning_rate": 3.783127925405686e-05, + "loss": 0.2628, + "step": 3300 + }, + { + "epoch": 0.33627958955602966, + "grad_norm": 2.1875, + "learning_rate": 3.77620287527595e-05, + "loss": 0.1671, + "step": 3310 + }, + { + "epoch": 0.3372955399776491, + "grad_norm": 5.28125, + "learning_rate": 3.769264556284048e-05, + "loss": 0.2109, + "step": 3320 + }, + { + "epoch": 0.3383114903992685, + "grad_norm": 2.875, + "learning_rate": 3.762313040568665e-05, + "loss": 0.1978, + "step": 3330 + }, + { + "epoch": 0.33932744082088795, + "grad_norm": 2.234375, + "learning_rate": 3.755348400405697e-05, + "loss": 0.1275, + "step": 3340 + }, + { + "epoch": 0.34034339124250734, + "grad_norm": 1.9453125, + "learning_rate": 3.7483707082074945e-05, + "loss": 0.1482, + "step": 3350 + }, + { + "epoch": 0.3413593416641268, + "grad_norm": 5.40625, + "learning_rate": 3.741380036522111e-05, + "loss": 0.1933, + "step": 3360 + }, + { + "epoch": 0.34237529208574624, + "grad_norm": 4.53125, + "learning_rate": 3.734376458032551e-05, + "loss": 0.1925, + "step": 3370 + }, + { + "epoch": 0.34339124250736563, + "grad_norm": 4.0625, + "learning_rate": 3.727360045556014e-05, + "loss": 0.2297, + "step": 3380 + }, + { + "epoch": 0.3444071929289851, + "grad_norm": 2.53125, + "learning_rate": 3.7203308720431336e-05, + "loss": 0.1704, + "step": 3390 + }, + { + "epoch": 0.3454231433506045, + "grad_norm": 1.859375, + "learning_rate": 3.7132890105772234e-05, + "loss": 0.258, + "step": 3400 + }, + { + "epoch": 0.3464390937722239, + "grad_norm": 3.90625, + "learning_rate": 3.706234534373515e-05, + "loss": 0.2376, + "step": 3410 + }, + { + "epoch": 0.3474550441938433, + "grad_norm": 1.1015625, + "learning_rate": 3.6991675167783985e-05, + "loss": 0.2403, + "step": 3420 + }, + { + "epoch": 0.34847099461546277, + "grad_norm": 1.1640625, + "learning_rate": 3.6920880312686556e-05, + "loss": 0.1642, + "step": 3430 + }, + { + "epoch": 0.34948694503708216, + "grad_norm": 2.875, + "learning_rate": 3.684996151450702e-05, + "loss": 0.1455, + "step": 3440 + }, + { + "epoch": 0.3505028954587016, + "grad_norm": 0.59765625, + "learning_rate": 3.6778919510598155e-05, + "loss": 0.2175, + "step": 3450 + }, + { + "epoch": 0.35151884588032106, + "grad_norm": 0.93359375, + "learning_rate": 3.670775503959376e-05, + "loss": 0.1858, + "step": 3460 + }, + { + "epoch": 0.35253479630194046, + "grad_norm": 4.1875, + "learning_rate": 3.6636468841400917e-05, + "loss": 0.1911, + "step": 3470 + }, + { + "epoch": 0.3535507467235599, + "grad_norm": 3.734375, + "learning_rate": 3.656506165719233e-05, + "loss": 0.2114, + "step": 3480 + }, + { + "epoch": 0.3545666971451793, + "grad_norm": 1.171875, + "learning_rate": 3.649353422939863e-05, + "loss": 0.1841, + "step": 3490 + }, + { + "epoch": 0.35558264756679875, + "grad_norm": 2.53125, + "learning_rate": 3.6421887301700615e-05, + "loss": 0.1505, + "step": 3500 + }, + { + "epoch": 0.35659859798841814, + "grad_norm": 4.9375, + "learning_rate": 3.6350121619021524e-05, + "loss": 0.2625, + "step": 3510 + }, + { + "epoch": 0.3576145484100376, + "grad_norm": 5.25, + "learning_rate": 3.627823792751936e-05, + "loss": 0.1676, + "step": 3520 + }, + { + "epoch": 0.35863049883165704, + "grad_norm": 1.09375, + "learning_rate": 3.620623697457905e-05, + "loss": 0.1963, + "step": 3530 + }, + { + "epoch": 0.35964644925327643, + "grad_norm": 4.03125, + "learning_rate": 3.613411950880468e-05, + "loss": 0.2048, + "step": 3540 + }, + { + "epoch": 0.3606623996748959, + "grad_norm": 4.40625, + "learning_rate": 3.606188628001178e-05, + "loss": 0.226, + "step": 3550 + }, + { + "epoch": 0.3616783500965153, + "grad_norm": 2.375, + "learning_rate": 3.598953803921947e-05, + "loss": 0.1884, + "step": 3560 + }, + { + "epoch": 0.3626943005181347, + "grad_norm": 3.21875, + "learning_rate": 3.591707553864266e-05, + "loss": 0.224, + "step": 3570 + }, + { + "epoch": 0.3637102509397541, + "grad_norm": 3.5625, + "learning_rate": 3.584449953168423e-05, + "loss": 0.1866, + "step": 3580 + }, + { + "epoch": 0.36472620136137357, + "grad_norm": 2.359375, + "learning_rate": 3.577181077292722e-05, + "loss": 0.1663, + "step": 3590 + }, + { + "epoch": 0.36574215178299296, + "grad_norm": 5.0, + "learning_rate": 3.569901001812696e-05, + "loss": 0.2032, + "step": 3600 + }, + { + "epoch": 0.3667581022046124, + "grad_norm": 1.953125, + "learning_rate": 3.562609802420321e-05, + "loss": 0.2395, + "step": 3610 + }, + { + "epoch": 0.36777405262623186, + "grad_norm": 3.796875, + "learning_rate": 3.555307554923229e-05, + "loss": 0.1799, + "step": 3620 + }, + { + "epoch": 0.36879000304785126, + "grad_norm": 4.4375, + "learning_rate": 3.547994335243925e-05, + "loss": 0.1771, + "step": 3630 + }, + { + "epoch": 0.3698059534694707, + "grad_norm": 1.890625, + "learning_rate": 3.540670219418989e-05, + "loss": 0.2123, + "step": 3640 + }, + { + "epoch": 0.3708219038910901, + "grad_norm": 4.03125, + "learning_rate": 3.53333528359829e-05, + "loss": 0.2159, + "step": 3650 + }, + { + "epoch": 0.37183785431270955, + "grad_norm": 3.265625, + "learning_rate": 3.525989604044198e-05, + "loss": 0.2749, + "step": 3660 + }, + { + "epoch": 0.37285380473432894, + "grad_norm": 1.4375, + "learning_rate": 3.5186332571307826e-05, + "loss": 0.1613, + "step": 3670 + }, + { + "epoch": 0.3738697551559484, + "grad_norm": 3.984375, + "learning_rate": 3.511266319343025e-05, + "loss": 0.1877, + "step": 3680 + }, + { + "epoch": 0.37488570557756784, + "grad_norm": 2.203125, + "learning_rate": 3.503888867276022e-05, + "loss": 0.2185, + "step": 3690 + }, + { + "epoch": 0.37590165599918723, + "grad_norm": 1.5078125, + "learning_rate": 3.4965009776341894e-05, + "loss": 0.2195, + "step": 3700 + }, + { + "epoch": 0.3769176064208067, + "grad_norm": 4.375, + "learning_rate": 3.489102727230461e-05, + "loss": 0.2344, + "step": 3710 + }, + { + "epoch": 0.3779335568424261, + "grad_norm": 2.984375, + "learning_rate": 3.481694192985496e-05, + "loss": 0.1863, + "step": 3720 + }, + { + "epoch": 0.3789495072640455, + "grad_norm": 1.1328125, + "learning_rate": 3.474275451926875e-05, + "loss": 0.1894, + "step": 3730 + }, + { + "epoch": 0.3799654576856649, + "grad_norm": 2.265625, + "learning_rate": 3.4668465811883e-05, + "loss": 0.2127, + "step": 3740 + }, + { + "epoch": 0.38098140810728437, + "grad_norm": 2.921875, + "learning_rate": 3.4594076580087914e-05, + "loss": 0.2125, + "step": 3750 + }, + { + "epoch": 0.38199735852890376, + "grad_norm": 2.390625, + "learning_rate": 3.451958759731889e-05, + "loss": 0.1801, + "step": 3760 + }, + { + "epoch": 0.3830133089505232, + "grad_norm": 3.046875, + "learning_rate": 3.4444999638048456e-05, + "loss": 0.1949, + "step": 3770 + }, + { + "epoch": 0.38402925937214266, + "grad_norm": 2.890625, + "learning_rate": 3.437031347777817e-05, + "loss": 0.2719, + "step": 3780 + }, + { + "epoch": 0.38504520979376206, + "grad_norm": 3.9375, + "learning_rate": 3.4295529893030634e-05, + "loss": 0.1697, + "step": 3790 + }, + { + "epoch": 0.3860611602153815, + "grad_norm": 2.0625, + "learning_rate": 3.422064966134138e-05, + "loss": 0.1557, + "step": 3800 + }, + { + "epoch": 0.3870771106370009, + "grad_norm": 2.234375, + "learning_rate": 3.4145673561250794e-05, + "loss": 0.2129, + "step": 3810 + }, + { + "epoch": 0.38809306105862035, + "grad_norm": 4.96875, + "learning_rate": 3.4070602372296e-05, + "loss": 0.2068, + "step": 3820 + }, + { + "epoch": 0.38910901148023974, + "grad_norm": 2.234375, + "learning_rate": 3.39954368750028e-05, + "loss": 0.1634, + "step": 3830 + }, + { + "epoch": 0.3901249619018592, + "grad_norm": 1.75, + "learning_rate": 3.392017785087752e-05, + "loss": 0.2299, + "step": 3840 + }, + { + "epoch": 0.39114091232347864, + "grad_norm": 3.90625, + "learning_rate": 3.38448260823989e-05, + "loss": 0.1585, + "step": 3850 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 2.8125, + "learning_rate": 3.376938235300996e-05, + "loss": 0.2382, + "step": 3860 + }, + { + "epoch": 0.3931728131667175, + "grad_norm": 5.375, + "learning_rate": 3.369384744710984e-05, + "loss": 0.1987, + "step": 3870 + }, + { + "epoch": 0.3941887635883369, + "grad_norm": 2.578125, + "learning_rate": 3.361822215004566e-05, + "loss": 0.2316, + "step": 3880 + }, + { + "epoch": 0.3952047140099563, + "grad_norm": 2.0, + "learning_rate": 3.354250724810436e-05, + "loss": 0.2019, + "step": 3890 + }, + { + "epoch": 0.3962206644315757, + "grad_norm": 2.3125, + "learning_rate": 3.34667035285045e-05, + "loss": 0.187, + "step": 3900 + }, + { + "epoch": 0.39723661485319517, + "grad_norm": 3.53125, + "learning_rate": 3.339081177938811e-05, + "loss": 0.2353, + "step": 3910 + }, + { + "epoch": 0.39825256527481456, + "grad_norm": 1.9609375, + "learning_rate": 3.331483278981244e-05, + "loss": 0.2078, + "step": 3920 + }, + { + "epoch": 0.399268515696434, + "grad_norm": 1.2109375, + "learning_rate": 3.323876734974183e-05, + "loss": 0.1761, + "step": 3930 + }, + { + "epoch": 0.40028446611805346, + "grad_norm": 4.0625, + "learning_rate": 3.316261625003943e-05, + "loss": 0.2081, + "step": 3940 + }, + { + "epoch": 0.40130041653967286, + "grad_norm": 1.953125, + "learning_rate": 3.308638028245902e-05, + "loss": 0.2087, + "step": 3950 + }, + { + "epoch": 0.4023163669612923, + "grad_norm": 2.390625, + "learning_rate": 3.301006023963676e-05, + "loss": 0.1579, + "step": 3960 + }, + { + "epoch": 0.4033323173829117, + "grad_norm": 3.53125, + "learning_rate": 3.293365691508295e-05, + "loss": 0.1904, + "step": 3970 + }, + { + "epoch": 0.40434826780453115, + "grad_norm": 3.0, + "learning_rate": 3.285717110317379e-05, + "loss": 0.1991, + "step": 3980 + }, + { + "epoch": 0.40536421822615054, + "grad_norm": 7.21875, + "learning_rate": 3.27806035991431e-05, + "loss": 0.1445, + "step": 3990 + }, + { + "epoch": 0.40638016864777, + "grad_norm": 1.0859375, + "learning_rate": 3.2703955199074075e-05, + "loss": 0.2393, + "step": 4000 + }, + { + "epoch": 0.40739611906938944, + "grad_norm": 4.5625, + "learning_rate": 3.262722669989098e-05, + "loss": 0.1789, + "step": 4010 + }, + { + "epoch": 0.40841206949100883, + "grad_norm": 3.09375, + "learning_rate": 3.255041889935092e-05, + "loss": 0.1511, + "step": 4020 + }, + { + "epoch": 0.4094280199126283, + "grad_norm": 1.90625, + "learning_rate": 3.247353259603547e-05, + "loss": 0.2066, + "step": 4030 + }, + { + "epoch": 0.4104439703342477, + "grad_norm": 2.28125, + "learning_rate": 3.239656858934242e-05, + "loss": 0.1564, + "step": 4040 + }, + { + "epoch": 0.4114599207558671, + "grad_norm": 2.609375, + "learning_rate": 3.231952767947746e-05, + "loss": 0.1503, + "step": 4050 + }, + { + "epoch": 0.4124758711774865, + "grad_norm": 1.4453125, + "learning_rate": 3.2242410667445844e-05, + "loss": 0.1633, + "step": 4060 + }, + { + "epoch": 0.41349182159910597, + "grad_norm": 3.015625, + "learning_rate": 3.2165218355044076e-05, + "loss": 0.1492, + "step": 4070 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 3.234375, + "learning_rate": 3.2087951544851566e-05, + "loss": 0.3051, + "step": 4080 + }, + { + "epoch": 0.4155237224423448, + "grad_norm": 2.9375, + "learning_rate": 3.20106110402223e-05, + "loss": 0.2229, + "step": 4090 + }, + { + "epoch": 0.41653967286396426, + "grad_norm": 3.171875, + "learning_rate": 3.1933197645276455e-05, + "loss": 0.2224, + "step": 4100 + }, + { + "epoch": 0.41755562328558365, + "grad_norm": 2.09375, + "learning_rate": 3.185571216489209e-05, + "loss": 0.1297, + "step": 4110 + }, + { + "epoch": 0.4185715737072031, + "grad_norm": 3.625, + "learning_rate": 3.177815540469669e-05, + "loss": 0.2074, + "step": 4120 + }, + { + "epoch": 0.4195875241288225, + "grad_norm": 2.296875, + "learning_rate": 3.1700528171058916e-05, + "loss": 0.1949, + "step": 4130 + }, + { + "epoch": 0.42060347455044195, + "grad_norm": 3.8125, + "learning_rate": 3.162283127108011e-05, + "loss": 0.1661, + "step": 4140 + }, + { + "epoch": 0.42161942497206134, + "grad_norm": 2.5, + "learning_rate": 3.154506551258594e-05, + "loss": 0.2275, + "step": 4150 + }, + { + "epoch": 0.4226353753936808, + "grad_norm": 2.96875, + "learning_rate": 3.146723170411804e-05, + "loss": 0.2242, + "step": 4160 + }, + { + "epoch": 0.42365132581530024, + "grad_norm": 6.625, + "learning_rate": 3.138933065492552e-05, + "loss": 0.1897, + "step": 4170 + }, + { + "epoch": 0.42466727623691963, + "grad_norm": 0.8515625, + "learning_rate": 3.131136317495665e-05, + "loss": 0.1629, + "step": 4180 + }, + { + "epoch": 0.4256832266585391, + "grad_norm": 0.94140625, + "learning_rate": 3.1233330074850364e-05, + "loss": 0.1535, + "step": 4190 + }, + { + "epoch": 0.4266991770801585, + "grad_norm": 2.6875, + "learning_rate": 3.115523216592786e-05, + "loss": 0.2494, + "step": 4200 + }, + { + "epoch": 0.4277151275017779, + "grad_norm": 2.578125, + "learning_rate": 3.107707026018417e-05, + "loss": 0.1705, + "step": 4210 + }, + { + "epoch": 0.4287310779233973, + "grad_norm": 3.0625, + "learning_rate": 3.09988451702797e-05, + "loss": 0.1507, + "step": 4220 + }, + { + "epoch": 0.42974702834501677, + "grad_norm": 2.421875, + "learning_rate": 3.0920557709531804e-05, + "loss": 0.3071, + "step": 4230 + }, + { + "epoch": 0.43076297876663616, + "grad_norm": 3.640625, + "learning_rate": 3.0842208691906306e-05, + "loss": 0.199, + "step": 4240 + }, + { + "epoch": 0.4317789291882556, + "grad_norm": 3.5625, + "learning_rate": 3.076379893200904e-05, + "loss": 0.1987, + "step": 4250 + }, + { + "epoch": 0.43279487960987506, + "grad_norm": 3.65625, + "learning_rate": 3.068532924507739e-05, + "loss": 0.1945, + "step": 4260 + }, + { + "epoch": 0.43381083003149445, + "grad_norm": 5.875, + "learning_rate": 3.060680044697183e-05, + "loss": 0.1937, + "step": 4270 + }, + { + "epoch": 0.4348267804531139, + "grad_norm": 2.859375, + "learning_rate": 3.052821335416739e-05, + "loss": 0.1643, + "step": 4280 + }, + { + "epoch": 0.4358427308747333, + "grad_norm": 3.296875, + "learning_rate": 3.0449568783745203e-05, + "loss": 0.1455, + "step": 4290 + }, + { + "epoch": 0.43685868129635275, + "grad_norm": 0.427734375, + "learning_rate": 3.0370867553384023e-05, + "loss": 0.1891, + "step": 4300 + }, + { + "epoch": 0.43787463171797214, + "grad_norm": 0.361328125, + "learning_rate": 3.029211048135171e-05, + "loss": 0.1377, + "step": 4310 + }, + { + "epoch": 0.4388905821395916, + "grad_norm": 1.8203125, + "learning_rate": 3.021329838649668e-05, + "loss": 0.2194, + "step": 4320 + }, + { + "epoch": 0.43990653256121104, + "grad_norm": 1.8828125, + "learning_rate": 3.0134432088239462e-05, + "loss": 0.1915, + "step": 4330 + }, + { + "epoch": 0.44092248298283043, + "grad_norm": 2.015625, + "learning_rate": 3.0055512406564146e-05, + "loss": 0.1794, + "step": 4340 + }, + { + "epoch": 0.4419384334044499, + "grad_norm": 2.546875, + "learning_rate": 2.9976540162009836e-05, + "loss": 0.2154, + "step": 4350 + }, + { + "epoch": 0.4429543838260693, + "grad_norm": 4.09375, + "learning_rate": 2.9897516175662155e-05, + "loss": 0.1861, + "step": 4360 + }, + { + "epoch": 0.4439703342476887, + "grad_norm": 3.953125, + "learning_rate": 2.9818441269144693e-05, + "loss": 0.1857, + "step": 4370 + }, + { + "epoch": 0.4449862846693081, + "grad_norm": 2.234375, + "learning_rate": 2.9739316264610452e-05, + "loss": 0.1493, + "step": 4380 + }, + { + "epoch": 0.44600223509092757, + "grad_norm": 1.109375, + "learning_rate": 2.966014198473332e-05, + "loss": 0.186, + "step": 4390 + }, + { + "epoch": 0.44701818551254696, + "grad_norm": 4.5625, + "learning_rate": 2.9580919252699502e-05, + "loss": 0.1963, + "step": 4400 + }, + { + "epoch": 0.4480341359341664, + "grad_norm": 7.3125, + "learning_rate": 2.9501648892198984e-05, + "loss": 0.2882, + "step": 4410 + }, + { + "epoch": 0.44905008635578586, + "grad_norm": 3.03125, + "learning_rate": 2.942233172741693e-05, + "loss": 0.2154, + "step": 4420 + }, + { + "epoch": 0.45006603677740525, + "grad_norm": 2.421875, + "learning_rate": 2.934296858302515e-05, + "loss": 0.2228, + "step": 4430 + }, + { + "epoch": 0.4510819871990247, + "grad_norm": 1.6015625, + "learning_rate": 2.9263560284173485e-05, + "loss": 0.1637, + "step": 4440 + }, + { + "epoch": 0.4520979376206441, + "grad_norm": 4.5, + "learning_rate": 2.91841076564813e-05, + "loss": 0.1396, + "step": 4450 + }, + { + "epoch": 0.45311388804226355, + "grad_norm": 1.9609375, + "learning_rate": 2.9104611526028808e-05, + "loss": 0.186, + "step": 4460 + }, + { + "epoch": 0.45412983846388294, + "grad_norm": 2.046875, + "learning_rate": 2.902507271934855e-05, + "loss": 0.1706, + "step": 4470 + }, + { + "epoch": 0.4551457888855024, + "grad_norm": 2.390625, + "learning_rate": 2.8945492063416768e-05, + "loss": 0.2191, + "step": 4480 + }, + { + "epoch": 0.45616173930712184, + "grad_norm": 2.734375, + "learning_rate": 2.8865870385644823e-05, + "loss": 0.1651, + "step": 4490 + }, + { + "epoch": 0.45717768972874123, + "grad_norm": 4.4375, + "learning_rate": 2.8786208513870583e-05, + "loss": 0.1907, + "step": 4500 + }, + { + "epoch": 0.4581936401503607, + "grad_norm": 1.9609375, + "learning_rate": 2.8706507276349815e-05, + "loss": 0.2256, + "step": 4510 + }, + { + "epoch": 0.4592095905719801, + "grad_norm": 3.375, + "learning_rate": 2.8626767501747588e-05, + "loss": 0.215, + "step": 4520 + }, + { + "epoch": 0.4602255409935995, + "grad_norm": 2.296875, + "learning_rate": 2.854699001912964e-05, + "loss": 0.2241, + "step": 4530 + }, + { + "epoch": 0.4612414914152189, + "grad_norm": 2.078125, + "learning_rate": 2.846717565795376e-05, + "loss": 0.1541, + "step": 4540 + }, + { + "epoch": 0.46225744183683837, + "grad_norm": 0.81640625, + "learning_rate": 2.8387325248061164e-05, + "loss": 0.1718, + "step": 4550 + }, + { + "epoch": 0.46327339225845776, + "grad_norm": 5.6875, + "learning_rate": 2.8307439619667897e-05, + "loss": 0.259, + "step": 4560 + }, + { + "epoch": 0.4642893426800772, + "grad_norm": 1.78125, + "learning_rate": 2.8227519603356157e-05, + "loss": 0.2205, + "step": 4570 + }, + { + "epoch": 0.46530529310169666, + "grad_norm": 4.78125, + "learning_rate": 2.8147566030065677e-05, + "loss": 0.2256, + "step": 4580 + }, + { + "epoch": 0.46632124352331605, + "grad_norm": 3.296875, + "learning_rate": 2.8067579731085085e-05, + "loss": 0.1671, + "step": 4590 + }, + { + "epoch": 0.4673371939449355, + "grad_norm": 3.265625, + "learning_rate": 2.7987561538043273e-05, + "loss": 0.2471, + "step": 4600 + }, + { + "epoch": 0.4683531443665549, + "grad_norm": 3.390625, + "learning_rate": 2.7907512282900727e-05, + "loss": 0.1749, + "step": 4610 + }, + { + "epoch": 0.46936909478817435, + "grad_norm": 3.140625, + "learning_rate": 2.782743279794091e-05, + "loss": 0.2276, + "step": 4620 + }, + { + "epoch": 0.47038504520979374, + "grad_norm": 2.921875, + "learning_rate": 2.7747323915761574e-05, + "loss": 0.1971, + "step": 4630 + }, + { + "epoch": 0.4714009956314132, + "grad_norm": 4.15625, + "learning_rate": 2.7667186469266122e-05, + "loss": 0.1951, + "step": 4640 + }, + { + "epoch": 0.47241694605303264, + "grad_norm": 2.953125, + "learning_rate": 2.7587021291654924e-05, + "loss": 0.2045, + "step": 4650 + }, + { + "epoch": 0.47343289647465203, + "grad_norm": 1.6640625, + "learning_rate": 2.750682921641672e-05, + "loss": 0.155, + "step": 4660 + }, + { + "epoch": 0.4744488468962715, + "grad_norm": 4.375, + "learning_rate": 2.7426611077319864e-05, + "loss": 0.2038, + "step": 4670 + }, + { + "epoch": 0.4754647973178909, + "grad_norm": 5.5, + "learning_rate": 2.734636770840372e-05, + "loss": 0.159, + "step": 4680 + }, + { + "epoch": 0.4764807477395103, + "grad_norm": 1.703125, + "learning_rate": 2.7266099943969976e-05, + "loss": 0.1566, + "step": 4690 + }, + { + "epoch": 0.4774966981611297, + "grad_norm": 0.81640625, + "learning_rate": 2.7185808618573943e-05, + "loss": 0.1927, + "step": 4700 + }, + { + "epoch": 0.47851264858274917, + "grad_norm": 0.81640625, + "learning_rate": 2.710549456701592e-05, + "loss": 0.1873, + "step": 4710 + }, + { + "epoch": 0.47952859900436856, + "grad_norm": 3.828125, + "learning_rate": 2.702515862433247e-05, + "loss": 0.2474, + "step": 4720 + }, + { + "epoch": 0.480544549425988, + "grad_norm": 1.1640625, + "learning_rate": 2.6944801625787795e-05, + "loss": 0.204, + "step": 4730 + }, + { + "epoch": 0.48156049984760746, + "grad_norm": 2.953125, + "learning_rate": 2.6864424406864984e-05, + "loss": 0.1758, + "step": 4740 + }, + { + "epoch": 0.48257645026922685, + "grad_norm": 3.265625, + "learning_rate": 2.6784027803257377e-05, + "loss": 0.161, + "step": 4750 + }, + { + "epoch": 0.4835924006908463, + "grad_norm": 2.046875, + "learning_rate": 2.6703612650859848e-05, + "loss": 0.1469, + "step": 4760 + }, + { + "epoch": 0.4846083511124657, + "grad_norm": 4.03125, + "learning_rate": 2.6623179785760148e-05, + "loss": 0.1858, + "step": 4770 + }, + { + "epoch": 0.48562430153408515, + "grad_norm": 2.65625, + "learning_rate": 2.6542730044230175e-05, + "loss": 0.176, + "step": 4780 + }, + { + "epoch": 0.48664025195570454, + "grad_norm": 2.59375, + "learning_rate": 2.6462264262717278e-05, + "loss": 0.1657, + "step": 4790 + }, + { + "epoch": 0.487656202377324, + "grad_norm": 4.78125, + "learning_rate": 2.6381783277835605e-05, + "loss": 0.2705, + "step": 4800 + }, + { + "epoch": 0.48867215279894344, + "grad_norm": 3.65625, + "learning_rate": 2.6301287926357355e-05, + "loss": 0.2252, + "step": 4810 + }, + { + "epoch": 0.48968810322056283, + "grad_norm": 0.734375, + "learning_rate": 2.622077904520411e-05, + "loss": 0.2141, + "step": 4820 + }, + { + "epoch": 0.4907040536421823, + "grad_norm": 5.15625, + "learning_rate": 2.6140257471438108e-05, + "loss": 0.1935, + "step": 4830 + }, + { + "epoch": 0.4917200040638017, + "grad_norm": 3.625, + "learning_rate": 2.6059724042253574e-05, + "loss": 0.2121, + "step": 4840 + }, + { + "epoch": 0.4927359544854211, + "grad_norm": 1.2890625, + "learning_rate": 2.5979179594967983e-05, + "loss": 0.1221, + "step": 4850 + }, + { + "epoch": 0.4937519049070405, + "grad_norm": 3.4375, + "learning_rate": 2.5898624967013367e-05, + "loss": 0.2208, + "step": 4860 + }, + { + "epoch": 0.49476785532865997, + "grad_norm": 2.40625, + "learning_rate": 2.5818060995927607e-05, + "loss": 0.1904, + "step": 4870 + }, + { + "epoch": 0.49578380575027936, + "grad_norm": 2.921875, + "learning_rate": 2.573748851934574e-05, + "loss": 0.1658, + "step": 4880 + }, + { + "epoch": 0.4967997561718988, + "grad_norm": 1.6640625, + "learning_rate": 2.5656908374991213e-05, + "loss": 0.1626, + "step": 4890 + }, + { + "epoch": 0.49781570659351826, + "grad_norm": 1.8046875, + "learning_rate": 2.557632140066721e-05, + "loss": 0.1905, + "step": 4900 + }, + { + "epoch": 0.49883165701513765, + "grad_norm": 4.875, + "learning_rate": 2.5495728434247917e-05, + "loss": 0.2591, + "step": 4910 + }, + { + "epoch": 0.4998476074367571, + "grad_norm": 1.4453125, + "learning_rate": 2.5415130313669845e-05, + "loss": 0.1359, + "step": 4920 + }, + { + "epoch": 0.5008635578583766, + "grad_norm": 2.109375, + "learning_rate": 2.5334527876923063e-05, + "loss": 0.2353, + "step": 4930 + }, + { + "epoch": 0.501879508279996, + "grad_norm": 3.546875, + "learning_rate": 2.5253921962042525e-05, + "loss": 0.2173, + "step": 4940 + }, + { + "epoch": 0.5028954587016153, + "grad_norm": 1.8125, + "learning_rate": 2.5173313407099373e-05, + "loss": 0.1631, + "step": 4950 + }, + { + "epoch": 0.5039114091232347, + "grad_norm": 2.671875, + "learning_rate": 2.5092703050192163e-05, + "loss": 0.1884, + "step": 4960 + }, + { + "epoch": 0.5049273595448542, + "grad_norm": 2.5625, + "learning_rate": 2.501209172943819e-05, + "loss": 0.217, + "step": 4970 + }, + { + "epoch": 0.5059433099664736, + "grad_norm": 4.375, + "learning_rate": 2.49314802829648e-05, + "loss": 0.1854, + "step": 4980 + }, + { + "epoch": 0.506959260388093, + "grad_norm": 2.3125, + "learning_rate": 2.4850869548900628e-05, + "loss": 0.2049, + "step": 4990 + }, + { + "epoch": 0.5079752108097125, + "grad_norm": 3.859375, + "learning_rate": 2.477026036536688e-05, + "loss": 0.2093, + "step": 5000 + }, + { + "epoch": 0.5089911612313319, + "grad_norm": 1.09375, + "learning_rate": 2.4689653570468677e-05, + "loss": 0.164, + "step": 5010 + }, + { + "epoch": 0.5100071116529513, + "grad_norm": 3.40625, + "learning_rate": 2.460905000228628e-05, + "loss": 0.1649, + "step": 5020 + }, + { + "epoch": 0.5110230620745707, + "grad_norm": 3.546875, + "learning_rate": 2.4528450498866428e-05, + "loss": 0.1777, + "step": 5030 + }, + { + "epoch": 0.5120390124961902, + "grad_norm": 3.0, + "learning_rate": 2.444785589821356e-05, + "loss": 0.1505, + "step": 5040 + }, + { + "epoch": 0.5130549629178096, + "grad_norm": 1.6484375, + "learning_rate": 2.436726703828118e-05, + "loss": 0.2672, + "step": 5050 + }, + { + "epoch": 0.514070913339429, + "grad_norm": 4.34375, + "learning_rate": 2.428668475696308e-05, + "loss": 0.1756, + "step": 5060 + }, + { + "epoch": 0.5150868637610485, + "grad_norm": 2.78125, + "learning_rate": 2.420610989208465e-05, + "loss": 0.1655, + "step": 5070 + }, + { + "epoch": 0.5161028141826679, + "grad_norm": 1.4609375, + "learning_rate": 2.412554328139419e-05, + "loss": 0.1579, + "step": 5080 + }, + { + "epoch": 0.5171187646042873, + "grad_norm": 2.28125, + "learning_rate": 2.404498576255416e-05, + "loss": 0.1599, + "step": 5090 + }, + { + "epoch": 0.5181347150259067, + "grad_norm": 0.6484375, + "learning_rate": 2.3964438173132522e-05, + "loss": 0.1508, + "step": 5100 + }, + { + "epoch": 0.5191506654475262, + "grad_norm": 3.390625, + "learning_rate": 2.388390135059395e-05, + "loss": 0.1578, + "step": 5110 + }, + { + "epoch": 0.5201666158691456, + "grad_norm": 1.21875, + "learning_rate": 2.3803376132291226e-05, + "loss": 0.1374, + "step": 5120 + }, + { + "epoch": 0.521182566290765, + "grad_norm": 4.0625, + "learning_rate": 2.3722863355456436e-05, + "loss": 0.1854, + "step": 5130 + }, + { + "epoch": 0.5221985167123845, + "grad_norm": 4.71875, + "learning_rate": 2.364236385719236e-05, + "loss": 0.1391, + "step": 5140 + }, + { + "epoch": 0.5232144671340039, + "grad_norm": 3.296875, + "learning_rate": 2.356187847446366e-05, + "loss": 0.2106, + "step": 5150 + }, + { + "epoch": 0.5242304175556233, + "grad_norm": 3.296875, + "learning_rate": 2.348140804408829e-05, + "loss": 0.2383, + "step": 5160 + }, + { + "epoch": 0.5252463679772427, + "grad_norm": 3.359375, + "learning_rate": 2.3400953402728713e-05, + "loss": 0.1537, + "step": 5170 + }, + { + "epoch": 0.5262623183988622, + "grad_norm": 1.4921875, + "learning_rate": 2.332051538688322e-05, + "loss": 0.1841, + "step": 5180 + }, + { + "epoch": 0.5272782688204816, + "grad_norm": 3.25, + "learning_rate": 2.3240094832877287e-05, + "loss": 0.1855, + "step": 5190 + }, + { + "epoch": 0.528294219242101, + "grad_norm": 3.34375, + "learning_rate": 2.3159692576854793e-05, + "loss": 0.2625, + "step": 5200 + }, + { + "epoch": 0.5293101696637205, + "grad_norm": 3.6875, + "learning_rate": 2.3079309454769413e-05, + "loss": 0.1292, + "step": 5210 + }, + { + "epoch": 0.5303261200853399, + "grad_norm": 1.1171875, + "learning_rate": 2.2998946302375827e-05, + "loss": 0.1263, + "step": 5220 + }, + { + "epoch": 0.5313420705069593, + "grad_norm": 2.71875, + "learning_rate": 2.2918603955221148e-05, + "loss": 0.2296, + "step": 5230 + }, + { + "epoch": 0.5323580209285786, + "grad_norm": 2.015625, + "learning_rate": 2.283828324863613e-05, + "loss": 0.1231, + "step": 5240 + }, + { + "epoch": 0.5333739713501982, + "grad_norm": 3.671875, + "learning_rate": 2.2757985017726557e-05, + "loss": 0.1939, + "step": 5250 + }, + { + "epoch": 0.5343899217718175, + "grad_norm": 1.9765625, + "learning_rate": 2.2677710097364495e-05, + "loss": 0.168, + "step": 5260 + }, + { + "epoch": 0.5354058721934369, + "grad_norm": 2.609375, + "learning_rate": 2.259745932217969e-05, + "loss": 0.1883, + "step": 5270 + }, + { + "epoch": 0.5364218226150563, + "grad_norm": 2.8125, + "learning_rate": 2.2517233526550817e-05, + "loss": 0.1898, + "step": 5280 + }, + { + "epoch": 0.5374377730366758, + "grad_norm": 3.125, + "learning_rate": 2.2437033544596837e-05, + "loss": 0.1838, + "step": 5290 + }, + { + "epoch": 0.5384537234582952, + "grad_norm": 4.90625, + "learning_rate": 2.2356860210168336e-05, + "loss": 0.1553, + "step": 5300 + }, + { + "epoch": 0.5394696738799146, + "grad_norm": 3.171875, + "learning_rate": 2.2276714356838824e-05, + "loss": 0.2248, + "step": 5310 + }, + { + "epoch": 0.5404856243015341, + "grad_norm": 1.34375, + "learning_rate": 2.2196596817896118e-05, + "loss": 0.1421, + "step": 5320 + }, + { + "epoch": 0.5415015747231535, + "grad_norm": 3.28125, + "learning_rate": 2.2116508426333596e-05, + "loss": 0.1947, + "step": 5330 + }, + { + "epoch": 0.5425175251447729, + "grad_norm": 1.9296875, + "learning_rate": 2.2036450014841652e-05, + "loss": 0.2207, + "step": 5340 + }, + { + "epoch": 0.5435334755663923, + "grad_norm": 0.5703125, + "learning_rate": 2.19564224157989e-05, + "loss": 0.2208, + "step": 5350 + }, + { + "epoch": 0.5445494259880118, + "grad_norm": 7.5625, + "learning_rate": 2.1876426461263654e-05, + "loss": 0.1739, + "step": 5360 + }, + { + "epoch": 0.5455653764096312, + "grad_norm": 2.15625, + "learning_rate": 2.179646298296519e-05, + "loss": 0.1938, + "step": 5370 + }, + { + "epoch": 0.5465813268312506, + "grad_norm": 4.1875, + "learning_rate": 2.171653281229511e-05, + "loss": 0.1736, + "step": 5380 + }, + { + "epoch": 0.5475972772528701, + "grad_norm": 4.65625, + "learning_rate": 2.1636636780298732e-05, + "loss": 0.2167, + "step": 5390 + }, + { + "epoch": 0.5486132276744895, + "grad_norm": 1.84375, + "learning_rate": 2.1556775717666427e-05, + "loss": 0.1711, + "step": 5400 + }, + { + "epoch": 0.5496291780961089, + "grad_norm": 5.125, + "learning_rate": 2.147695045472499e-05, + "loss": 0.1789, + "step": 5410 + }, + { + "epoch": 0.5506451285177283, + "grad_norm": 3.859375, + "learning_rate": 2.1397161821428973e-05, + "loss": 0.2187, + "step": 5420 + }, + { + "epoch": 0.5516610789393478, + "grad_norm": 2.25, + "learning_rate": 2.131741064735212e-05, + "loss": 0.1367, + "step": 5430 + }, + { + "epoch": 0.5526770293609672, + "grad_norm": 4.65625, + "learning_rate": 2.1237697761678684e-05, + "loss": 0.1574, + "step": 5440 + }, + { + "epoch": 0.5536929797825866, + "grad_norm": 1.2265625, + "learning_rate": 2.1158023993194848e-05, + "loss": 0.1301, + "step": 5450 + }, + { + "epoch": 0.5547089302042061, + "grad_norm": 4.21875, + "learning_rate": 2.107839017028005e-05, + "loss": 0.2782, + "step": 5460 + }, + { + "epoch": 0.5557248806258255, + "grad_norm": 0.52734375, + "learning_rate": 2.0998797120898457e-05, + "loss": 0.2024, + "step": 5470 + }, + { + "epoch": 0.5567408310474449, + "grad_norm": 1.46875, + "learning_rate": 2.0919245672590277e-05, + "loss": 0.1755, + "step": 5480 + }, + { + "epoch": 0.5577567814690643, + "grad_norm": 2.140625, + "learning_rate": 2.083973665246318e-05, + "loss": 0.2058, + "step": 5490 + }, + { + "epoch": 0.5587727318906838, + "grad_norm": 1.5390625, + "learning_rate": 2.076027088718373e-05, + "loss": 0.2159, + "step": 5500 + }, + { + "epoch": 0.5597886823123032, + "grad_norm": 1.9921875, + "learning_rate": 2.0680849202968743e-05, + "loss": 0.2139, + "step": 5510 + }, + { + "epoch": 0.5608046327339226, + "grad_norm": 2.4375, + "learning_rate": 2.060147242557674e-05, + "loss": 0.183, + "step": 5520 + }, + { + "epoch": 0.5618205831555421, + "grad_norm": 5.5, + "learning_rate": 2.0522141380299308e-05, + "loss": 0.1673, + "step": 5530 + }, + { + "epoch": 0.5628365335771615, + "grad_norm": 4.25, + "learning_rate": 2.044285689195258e-05, + "loss": 0.1674, + "step": 5540 + }, + { + "epoch": 0.5638524839987809, + "grad_norm": 2.109375, + "learning_rate": 2.0363619784868604e-05, + "loss": 0.1531, + "step": 5550 + }, + { + "epoch": 0.5648684344204002, + "grad_norm": 2.59375, + "learning_rate": 2.0284430882886836e-05, + "loss": 0.1665, + "step": 5560 + }, + { + "epoch": 0.5658843848420197, + "grad_norm": 3.984375, + "learning_rate": 2.020529100934549e-05, + "loss": 0.1717, + "step": 5570 + }, + { + "epoch": 0.5669003352636391, + "grad_norm": 1.6015625, + "learning_rate": 2.012620098707306e-05, + "loss": 0.1167, + "step": 5580 + }, + { + "epoch": 0.5679162856852585, + "grad_norm": 6.0625, + "learning_rate": 2.004716163837972e-05, + "loss": 0.2084, + "step": 5590 + }, + { + "epoch": 0.5689322361068779, + "grad_norm": 2.5625, + "learning_rate": 1.996817378504876e-05, + "loss": 0.1939, + "step": 5600 + }, + { + "epoch": 0.5699481865284974, + "grad_norm": 3.109375, + "learning_rate": 1.9889238248328108e-05, + "loss": 0.1241, + "step": 5610 + }, + { + "epoch": 0.5709641369501168, + "grad_norm": 4.875, + "learning_rate": 1.981035584892171e-05, + "loss": 0.1865, + "step": 5620 + }, + { + "epoch": 0.5719800873717362, + "grad_norm": 2.984375, + "learning_rate": 1.9731527406981072e-05, + "loss": 0.1639, + "step": 5630 + }, + { + "epoch": 0.5729960377933557, + "grad_norm": 4.4375, + "learning_rate": 1.9652753742096655e-05, + "loss": 0.2019, + "step": 5640 + }, + { + "epoch": 0.5740119882149751, + "grad_norm": 4.3125, + "learning_rate": 1.9574035673289432e-05, + "loss": 0.1829, + "step": 5650 + }, + { + "epoch": 0.5750279386365945, + "grad_norm": 3.203125, + "learning_rate": 1.9495374019002312e-05, + "loss": 0.2267, + "step": 5660 + }, + { + "epoch": 0.5760438890582139, + "grad_norm": 1.765625, + "learning_rate": 1.9416769597091673e-05, + "loss": 0.1411, + "step": 5670 + }, + { + "epoch": 0.5770598394798334, + "grad_norm": 2.640625, + "learning_rate": 1.9338223224818818e-05, + "loss": 0.1476, + "step": 5680 + }, + { + "epoch": 0.5780757899014528, + "grad_norm": 4.84375, + "learning_rate": 1.9259735718841524e-05, + "loss": 0.1417, + "step": 5690 + }, + { + "epoch": 0.5790917403230722, + "grad_norm": 2.421875, + "learning_rate": 1.918130789520551e-05, + "loss": 0.1592, + "step": 5700 + }, + { + "epoch": 0.5801076907446917, + "grad_norm": 2.984375, + "learning_rate": 1.9102940569335963e-05, + "loss": 0.161, + "step": 5710 + }, + { + "epoch": 0.5811236411663111, + "grad_norm": 1.0234375, + "learning_rate": 1.9024634556029093e-05, + "loss": 0.1614, + "step": 5720 + }, + { + "epoch": 0.5821395915879305, + "grad_norm": 2.90625, + "learning_rate": 1.89463906694436e-05, + "loss": 0.1505, + "step": 5730 + }, + { + "epoch": 0.5831555420095499, + "grad_norm": 2.875, + "learning_rate": 1.8868209723092286e-05, + "loss": 0.1674, + "step": 5740 + }, + { + "epoch": 0.5841714924311694, + "grad_norm": 0.408203125, + "learning_rate": 1.8790092529833508e-05, + "loss": 0.1468, + "step": 5750 + }, + { + "epoch": 0.5851874428527888, + "grad_norm": 5.1875, + "learning_rate": 1.871203990186281e-05, + "loss": 0.1903, + "step": 5760 + }, + { + "epoch": 0.5862033932744082, + "grad_norm": 0.5546875, + "learning_rate": 1.8634052650704415e-05, + "loss": 0.2644, + "step": 5770 + }, + { + "epoch": 0.5872193436960277, + "grad_norm": 3.203125, + "learning_rate": 1.8556131587202848e-05, + "loss": 0.1968, + "step": 5780 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 2.484375, + "learning_rate": 1.8478277521514424e-05, + "loss": 0.2249, + "step": 5790 + }, + { + "epoch": 0.5892512445392665, + "grad_norm": 4.0, + "learning_rate": 1.8400491263098906e-05, + "loss": 0.1881, + "step": 5800 + }, + { + "epoch": 0.5902671949608859, + "grad_norm": 1.90625, + "learning_rate": 1.832277362071106e-05, + "loss": 0.1352, + "step": 5810 + }, + { + "epoch": 0.5912831453825054, + "grad_norm": 2.765625, + "learning_rate": 1.824512540239221e-05, + "loss": 0.2737, + "step": 5820 + }, + { + "epoch": 0.5922990958041248, + "grad_norm": 2.609375, + "learning_rate": 1.81675474154619e-05, + "loss": 0.1566, + "step": 5830 + }, + { + "epoch": 0.5933150462257442, + "grad_norm": 2.6875, + "learning_rate": 1.8090040466509444e-05, + "loss": 0.1999, + "step": 5840 + }, + { + "epoch": 0.5943309966473637, + "grad_norm": 2.609375, + "learning_rate": 1.8012605361385592e-05, + "loss": 0.2372, + "step": 5850 + }, + { + "epoch": 0.5953469470689831, + "grad_norm": 8.125, + "learning_rate": 1.7935242905194087e-05, + "loss": 0.2411, + "step": 5860 + }, + { + "epoch": 0.5963628974906025, + "grad_norm": 3.46875, + "learning_rate": 1.785795390228336e-05, + "loss": 0.138, + "step": 5870 + }, + { + "epoch": 0.5973788479122218, + "grad_norm": 2.3125, + "learning_rate": 1.7780739156238125e-05, + "loss": 0.1867, + "step": 5880 + }, + { + "epoch": 0.5983947983338413, + "grad_norm": 4.0625, + "learning_rate": 1.770359946987105e-05, + "loss": 0.2091, + "step": 5890 + }, + { + "epoch": 0.5994107487554607, + "grad_norm": 5.21875, + "learning_rate": 1.7626535645214378e-05, + "loss": 0.2091, + "step": 5900 + }, + { + "epoch": 0.6004266991770801, + "grad_norm": 3.15625, + "learning_rate": 1.7549548483511614e-05, + "loss": 0.1927, + "step": 5910 + }, + { + "epoch": 0.6014426495986995, + "grad_norm": 4.71875, + "learning_rate": 1.7472638785209198e-05, + "loss": 0.1893, + "step": 5920 + }, + { + "epoch": 0.602458600020319, + "grad_norm": 3.015625, + "learning_rate": 1.7395807349948145e-05, + "loss": 0.1557, + "step": 5930 + }, + { + "epoch": 0.6034745504419384, + "grad_norm": 2.9375, + "learning_rate": 1.73190549765558e-05, + "loss": 0.1717, + "step": 5940 + }, + { + "epoch": 0.6044905008635578, + "grad_norm": 3.109375, + "learning_rate": 1.724238246303745e-05, + "loss": 0.1879, + "step": 5950 + }, + { + "epoch": 0.6055064512851773, + "grad_norm": 3.875, + "learning_rate": 1.71657906065681e-05, + "loss": 0.1908, + "step": 5960 + }, + { + "epoch": 0.6065224017067967, + "grad_norm": 5.09375, + "learning_rate": 1.7089280203484115e-05, + "loss": 0.1712, + "step": 5970 + }, + { + "epoch": 0.6075383521284161, + "grad_norm": 3.015625, + "learning_rate": 1.701285204927502e-05, + "loss": 0.1454, + "step": 5980 + }, + { + "epoch": 0.6085543025500355, + "grad_norm": 3.265625, + "learning_rate": 1.693650693857515e-05, + "loss": 0.2283, + "step": 5990 + }, + { + "epoch": 0.609570252971655, + "grad_norm": 3.40625, + "learning_rate": 1.6860245665155466e-05, + "loss": 0.2188, + "step": 6000 + }, + { + "epoch": 0.6105862033932744, + "grad_norm": 2.5625, + "learning_rate": 1.678406902191521e-05, + "loss": 0.1605, + "step": 6010 + }, + { + "epoch": 0.6116021538148938, + "grad_norm": 0.6796875, + "learning_rate": 1.670797780087374e-05, + "loss": 0.1472, + "step": 6020 + }, + { + "epoch": 0.6126181042365133, + "grad_norm": 2.234375, + "learning_rate": 1.6631972793162288e-05, + "loss": 0.1676, + "step": 6030 + }, + { + "epoch": 0.6136340546581327, + "grad_norm": 1.25, + "learning_rate": 1.6556054789015662e-05, + "loss": 0.1508, + "step": 6040 + }, + { + "epoch": 0.6146500050797521, + "grad_norm": 4.78125, + "learning_rate": 1.6480224577764132e-05, + "loss": 0.1981, + "step": 6050 + }, + { + "epoch": 0.6156659555013715, + "grad_norm": 3.46875, + "learning_rate": 1.6404482947825137e-05, + "loss": 0.2514, + "step": 6060 + }, + { + "epoch": 0.616681905922991, + "grad_norm": 1.265625, + "learning_rate": 1.6328830686695154e-05, + "loss": 0.2397, + "step": 6070 + }, + { + "epoch": 0.6176978563446104, + "grad_norm": 1.953125, + "learning_rate": 1.625326858094144e-05, + "loss": 0.1523, + "step": 6080 + }, + { + "epoch": 0.6187138067662298, + "grad_norm": 3.484375, + "learning_rate": 1.6177797416193953e-05, + "loss": 0.218, + "step": 6090 + }, + { + "epoch": 0.6197297571878493, + "grad_norm": 3.484375, + "learning_rate": 1.6102417977137052e-05, + "loss": 0.1476, + "step": 6100 + }, + { + "epoch": 0.6207457076094687, + "grad_norm": 4.90625, + "learning_rate": 1.602713104750147e-05, + "loss": 0.1818, + "step": 6110 + }, + { + "epoch": 0.6217616580310881, + "grad_norm": 4.375, + "learning_rate": 1.5951937410056087e-05, + "loss": 0.2061, + "step": 6120 + }, + { + "epoch": 0.6227776084527075, + "grad_norm": 6.3125, + "learning_rate": 1.587683784659979e-05, + "loss": 0.1566, + "step": 6130 + }, + { + "epoch": 0.623793558874327, + "grad_norm": 2.828125, + "learning_rate": 1.58018331379534e-05, + "loss": 0.1376, + "step": 6140 + }, + { + "epoch": 0.6248095092959464, + "grad_norm": 2.40625, + "learning_rate": 1.572692406395149e-05, + "loss": 0.1655, + "step": 6150 + }, + { + "epoch": 0.6258254597175658, + "grad_norm": 4.34375, + "learning_rate": 1.5652111403434338e-05, + "loss": 0.2363, + "step": 6160 + }, + { + "epoch": 0.6268414101391853, + "grad_norm": 2.453125, + "learning_rate": 1.5577395934239757e-05, + "loss": 0.2464, + "step": 6170 + }, + { + "epoch": 0.6278573605608047, + "grad_norm": 2.53125, + "learning_rate": 1.5502778433195085e-05, + "loss": 0.1898, + "step": 6180 + }, + { + "epoch": 0.628873310982424, + "grad_norm": 2.28125, + "learning_rate": 1.5428259676109048e-05, + "loss": 0.1804, + "step": 6190 + }, + { + "epoch": 0.6298892614040434, + "grad_norm": 4.3125, + "learning_rate": 1.5353840437763732e-05, + "loss": 0.1409, + "step": 6200 + }, + { + "epoch": 0.630905211825663, + "grad_norm": 2.5625, + "learning_rate": 1.5279521491906496e-05, + "loss": 0.2449, + "step": 6210 + }, + { + "epoch": 0.6319211622472823, + "grad_norm": 3.0625, + "learning_rate": 1.520530361124195e-05, + "loss": 0.2103, + "step": 6220 + }, + { + "epoch": 0.6329371126689017, + "grad_norm": 2.609375, + "learning_rate": 1.5131187567423937e-05, + "loss": 0.2156, + "step": 6230 + }, + { + "epoch": 0.6339530630905211, + "grad_norm": 2.703125, + "learning_rate": 1.5057174131047446e-05, + "loss": 0.161, + "step": 6240 + }, + { + "epoch": 0.6349690135121406, + "grad_norm": 3.265625, + "learning_rate": 1.4983264071640679e-05, + "loss": 0.1757, + "step": 6250 + }, + { + "epoch": 0.63598496393376, + "grad_norm": 3.15625, + "learning_rate": 1.490945815765699e-05, + "loss": 0.2011, + "step": 6260 + }, + { + "epoch": 0.6370009143553794, + "grad_norm": 5.375, + "learning_rate": 1.4835757156466945e-05, + "loss": 0.1658, + "step": 6270 + }, + { + "epoch": 0.6380168647769989, + "grad_norm": 2.984375, + "learning_rate": 1.4762161834350271e-05, + "loss": 0.1754, + "step": 6280 + }, + { + "epoch": 0.6390328151986183, + "grad_norm": 2.015625, + "learning_rate": 1.4688672956487987e-05, + "loss": 0.1427, + "step": 6290 + }, + { + "epoch": 0.6400487656202377, + "grad_norm": 3.78125, + "learning_rate": 1.4615291286954352e-05, + "loss": 0.1517, + "step": 6300 + }, + { + "epoch": 0.6410647160418571, + "grad_norm": 2.859375, + "learning_rate": 1.4542017588709005e-05, + "loss": 0.2348, + "step": 6310 + }, + { + "epoch": 0.6420806664634766, + "grad_norm": 2.421875, + "learning_rate": 1.4468852623588961e-05, + "loss": 0.2089, + "step": 6320 + }, + { + "epoch": 0.643096616885096, + "grad_norm": 2.15625, + "learning_rate": 1.4395797152300719e-05, + "loss": 0.1702, + "step": 6330 + }, + { + "epoch": 0.6441125673067154, + "grad_norm": 1.53125, + "learning_rate": 1.4322851934412382e-05, + "loss": 0.1017, + "step": 6340 + }, + { + "epoch": 0.6451285177283349, + "grad_norm": 1.90625, + "learning_rate": 1.4250017728345716e-05, + "loss": 0.1813, + "step": 6350 + }, + { + "epoch": 0.6461444681499543, + "grad_norm": 2.015625, + "learning_rate": 1.4177295291368292e-05, + "loss": 0.1095, + "step": 6360 + }, + { + "epoch": 0.6471604185715737, + "grad_norm": 2.625, + "learning_rate": 1.410468537958558e-05, + "loss": 0.2259, + "step": 6370 + }, + { + "epoch": 0.6481763689931931, + "grad_norm": 3.5, + "learning_rate": 1.4032188747933136e-05, + "loss": 0.1595, + "step": 6380 + }, + { + "epoch": 0.6491923194148126, + "grad_norm": 5.21875, + "learning_rate": 1.39598061501687e-05, + "loss": 0.2226, + "step": 6390 + }, + { + "epoch": 0.650208269836432, + "grad_norm": 5.34375, + "learning_rate": 1.388753833886442e-05, + "loss": 0.2132, + "step": 6400 + }, + { + "epoch": 0.6512242202580514, + "grad_norm": 3.640625, + "learning_rate": 1.3815386065398945e-05, + "loss": 0.1227, + "step": 6410 + }, + { + "epoch": 0.6522401706796709, + "grad_norm": 1.0, + "learning_rate": 1.3743350079949705e-05, + "loss": 0.1755, + "step": 6420 + }, + { + "epoch": 0.6532561211012903, + "grad_norm": 2.359375, + "learning_rate": 1.3671431131485057e-05, + "loss": 0.1552, + "step": 6430 + }, + { + "epoch": 0.6542720715229097, + "grad_norm": 5.3125, + "learning_rate": 1.3599629967756483e-05, + "loss": 0.1917, + "step": 6440 + }, + { + "epoch": 0.6552880219445291, + "grad_norm": 4.625, + "learning_rate": 1.3527947335290877e-05, + "loss": 0.1812, + "step": 6450 + }, + { + "epoch": 0.6563039723661486, + "grad_norm": 1.234375, + "learning_rate": 1.3456383979382708e-05, + "loss": 0.1896, + "step": 6460 + }, + { + "epoch": 0.657319922787768, + "grad_norm": 3.984375, + "learning_rate": 1.3384940644086352e-05, + "loss": 0.1484, + "step": 6470 + }, + { + "epoch": 0.6583358732093874, + "grad_norm": 2.40625, + "learning_rate": 1.3313618072208268e-05, + "loss": 0.1334, + "step": 6480 + }, + { + "epoch": 0.6593518236310069, + "grad_norm": 4.375, + "learning_rate": 1.3242417005299357e-05, + "loss": 0.1351, + "step": 6490 + }, + { + "epoch": 0.6603677740526263, + "grad_norm": 2.640625, + "learning_rate": 1.31713381836472e-05, + "loss": 0.1717, + "step": 6500 + }, + { + "epoch": 0.6613837244742456, + "grad_norm": 2.640625, + "learning_rate": 1.3100382346268392e-05, + "loss": 0.1867, + "step": 6510 + }, + { + "epoch": 0.662399674895865, + "grad_norm": 1.734375, + "learning_rate": 1.3029550230900812e-05, + "loss": 0.1997, + "step": 6520 + }, + { + "epoch": 0.6634156253174845, + "grad_norm": 3.609375, + "learning_rate": 1.2958842573996016e-05, + "loss": 0.1969, + "step": 6530 + }, + { + "epoch": 0.6644315757391039, + "grad_norm": 3.578125, + "learning_rate": 1.2888260110711525e-05, + "loss": 0.1469, + "step": 6540 + }, + { + "epoch": 0.6654475261607233, + "grad_norm": 1.3515625, + "learning_rate": 1.2817803574903212e-05, + "loss": 0.1524, + "step": 6550 + }, + { + "epoch": 0.6664634765823427, + "grad_norm": 2.109375, + "learning_rate": 1.2747473699117668e-05, + "loss": 0.159, + "step": 6560 + }, + { + "epoch": 0.6674794270039622, + "grad_norm": 1.53125, + "learning_rate": 1.267727121458458e-05, + "loss": 0.1999, + "step": 6570 + }, + { + "epoch": 0.6684953774255816, + "grad_norm": 1.7265625, + "learning_rate": 1.2607196851209137e-05, + "loss": 0.2216, + "step": 6580 + }, + { + "epoch": 0.669511327847201, + "grad_norm": 3.125, + "learning_rate": 1.2537251337564412e-05, + "loss": 0.1607, + "step": 6590 + }, + { + "epoch": 0.6705272782688205, + "grad_norm": 2.421875, + "learning_rate": 1.2467435400883839e-05, + "loss": 0.2187, + "step": 6600 + }, + { + "epoch": 0.6715432286904399, + "grad_norm": 1.5078125, + "learning_rate": 1.239774976705359e-05, + "loss": 0.1753, + "step": 6610 + }, + { + "epoch": 0.6725591791120593, + "grad_norm": 1.140625, + "learning_rate": 1.2328195160605092e-05, + "loss": 0.194, + "step": 6620 + }, + { + "epoch": 0.6735751295336787, + "grad_norm": 4.9375, + "learning_rate": 1.225877230470743e-05, + "loss": 0.1485, + "step": 6630 + }, + { + "epoch": 0.6745910799552982, + "grad_norm": 3.65625, + "learning_rate": 1.218948192115988e-05, + "loss": 0.1847, + "step": 6640 + }, + { + "epoch": 0.6756070303769176, + "grad_norm": 3.875, + "learning_rate": 1.21203247303844e-05, + "loss": 0.1874, + "step": 6650 + }, + { + "epoch": 0.676622980798537, + "grad_norm": 2.65625, + "learning_rate": 1.2051301451418073e-05, + "loss": 0.2377, + "step": 6660 + }, + { + "epoch": 0.6776389312201565, + "grad_norm": 2.09375, + "learning_rate": 1.198241280190574e-05, + "loss": 0.1508, + "step": 6670 + }, + { + "epoch": 0.6786548816417759, + "grad_norm": 2.203125, + "learning_rate": 1.1913659498092431e-05, + "loss": 0.1537, + "step": 6680 + }, + { + "epoch": 0.6796708320633953, + "grad_norm": 2.484375, + "learning_rate": 1.184504225481601e-05, + "loss": 0.2339, + "step": 6690 + }, + { + "epoch": 0.6806867824850147, + "grad_norm": 5.625, + "learning_rate": 1.177656178549966e-05, + "loss": 0.2102, + "step": 6700 + }, + { + "epoch": 0.6817027329066342, + "grad_norm": 2.5, + "learning_rate": 1.1708218802144536e-05, + "loss": 0.1435, + "step": 6710 + }, + { + "epoch": 0.6827186833282536, + "grad_norm": 3.84375, + "learning_rate": 1.1640014015322323e-05, + "loss": 0.1823, + "step": 6720 + }, + { + "epoch": 0.683734633749873, + "grad_norm": 2.359375, + "learning_rate": 1.1571948134167862e-05, + "loss": 0.1154, + "step": 6730 + }, + { + "epoch": 0.6847505841714925, + "grad_norm": 2.90625, + "learning_rate": 1.1504021866371761e-05, + "loss": 0.2105, + "step": 6740 + }, + { + "epoch": 0.6857665345931119, + "grad_norm": 5.46875, + "learning_rate": 1.143623591817304e-05, + "loss": 0.1317, + "step": 6750 + }, + { + "epoch": 0.6867824850147313, + "grad_norm": 3.34375, + "learning_rate": 1.1368590994351835e-05, + "loss": 0.1406, + "step": 6760 + }, + { + "epoch": 0.6877984354363507, + "grad_norm": 3.78125, + "learning_rate": 1.130108779822198e-05, + "loss": 0.1425, + "step": 6770 + }, + { + "epoch": 0.6888143858579702, + "grad_norm": 0.77734375, + "learning_rate": 1.1233727031623783e-05, + "loss": 0.1623, + "step": 6780 + }, + { + "epoch": 0.6898303362795896, + "grad_norm": 4.625, + "learning_rate": 1.1166509394916682e-05, + "loss": 0.1591, + "step": 6790 + }, + { + "epoch": 0.690846286701209, + "grad_norm": 3.84375, + "learning_rate": 1.1099435586971982e-05, + "loss": 0.1758, + "step": 6800 + }, + { + "epoch": 0.6918622371228285, + "grad_norm": 2.4375, + "learning_rate": 1.1032506305165555e-05, + "loss": 0.1018, + "step": 6810 + }, + { + "epoch": 0.6928781875444479, + "grad_norm": 3.203125, + "learning_rate": 1.0965722245370641e-05, + "loss": 0.1485, + "step": 6820 + }, + { + "epoch": 0.6938941379660672, + "grad_norm": 0.7109375, + "learning_rate": 1.0899084101950561e-05, + "loss": 0.1762, + "step": 6830 + }, + { + "epoch": 0.6949100883876866, + "grad_norm": 1.9765625, + "learning_rate": 1.0832592567751555e-05, + "loss": 0.1402, + "step": 6840 + }, + { + "epoch": 0.6959260388093061, + "grad_norm": 1.4609375, + "learning_rate": 1.0766248334095505e-05, + "loss": 0.2278, + "step": 6850 + }, + { + "epoch": 0.6969419892309255, + "grad_norm": 3.953125, + "learning_rate": 1.0700052090772828e-05, + "loss": 0.1969, + "step": 6860 + }, + { + "epoch": 0.6979579396525449, + "grad_norm": 2.453125, + "learning_rate": 1.0634004526035249e-05, + "loss": 0.2073, + "step": 6870 + }, + { + "epoch": 0.6989738900741643, + "grad_norm": 1.6171875, + "learning_rate": 1.0568106326588645e-05, + "loss": 0.1902, + "step": 6880 + }, + { + "epoch": 0.6999898404957838, + "grad_norm": 1.2734375, + "learning_rate": 1.0502358177585953e-05, + "loss": 0.2165, + "step": 6890 + }, + { + "epoch": 0.7010057909174032, + "grad_norm": 1.671875, + "learning_rate": 1.0436760762619977e-05, + "loss": 0.1952, + "step": 6900 + }, + { + "epoch": 0.7020217413390226, + "grad_norm": 2.8125, + "learning_rate": 1.0371314763716347e-05, + "loss": 0.1422, + "step": 6910 + }, + { + "epoch": 0.7030376917606421, + "grad_norm": 2.53125, + "learning_rate": 1.0306020861326388e-05, + "loss": 0.0961, + "step": 6920 + }, + { + "epoch": 0.7040536421822615, + "grad_norm": 3.046875, + "learning_rate": 1.0240879734320068e-05, + "loss": 0.1542, + "step": 6930 + }, + { + "epoch": 0.7050695926038809, + "grad_norm": 2.859375, + "learning_rate": 1.0175892059978901e-05, + "loss": 0.1748, + "step": 6940 + }, + { + "epoch": 0.7060855430255003, + "grad_norm": 2.671875, + "learning_rate": 1.0111058513988958e-05, + "loss": 0.0819, + "step": 6950 + }, + { + "epoch": 0.7071014934471198, + "grad_norm": 3.5625, + "learning_rate": 1.0046379770433803e-05, + "loss": 0.1933, + "step": 6960 + }, + { + "epoch": 0.7081174438687392, + "grad_norm": 2.859375, + "learning_rate": 9.98185650178749e-06, + "loss": 0.1891, + "step": 6970 + }, + { + "epoch": 0.7091333942903586, + "grad_norm": 3.15625, + "learning_rate": 9.917489378907591e-06, + "loss": 0.2102, + "step": 6980 + }, + { + "epoch": 0.7101493447119781, + "grad_norm": 6.40625, + "learning_rate": 9.853279071028212e-06, + "loss": 0.1714, + "step": 6990 + }, + { + "epoch": 0.7111652951335975, + "grad_norm": 2.375, + "learning_rate": 9.78922624575303e-06, + "loss": 0.1299, + "step": 7000 + }, + { + "epoch": 0.7121812455552169, + "grad_norm": 2.078125, + "learning_rate": 9.72533156904833e-06, + "loss": 0.1914, + "step": 7010 + }, + { + "epoch": 0.7131971959768363, + "grad_norm": 3.859375, + "learning_rate": 9.661595705236137e-06, + "loss": 0.2377, + "step": 7020 + }, + { + "epoch": 0.7142131463984558, + "grad_norm": 1.171875, + "learning_rate": 9.598019316987244e-06, + "loss": 0.1851, + "step": 7030 + }, + { + "epoch": 0.7152290968200752, + "grad_norm": 1.078125, + "learning_rate": 9.53460306531439e-06, + "loss": 0.2661, + "step": 7040 + }, + { + "epoch": 0.7162450472416946, + "grad_norm": 1.6484375, + "learning_rate": 9.471347609565311e-06, + "loss": 0.1669, + "step": 7050 + }, + { + "epoch": 0.7172609976633141, + "grad_norm": 4.59375, + "learning_rate": 9.408253607415957e-06, + "loss": 0.2487, + "step": 7060 + }, + { + "epoch": 0.7182769480849335, + "grad_norm": 3.09375, + "learning_rate": 9.345321714863614e-06, + "loss": 0.186, + "step": 7070 + }, + { + "epoch": 0.7192928985065529, + "grad_norm": 6.0625, + "learning_rate": 9.282552586220075e-06, + "loss": 0.2249, + "step": 7080 + }, + { + "epoch": 0.7203088489281723, + "grad_norm": 1.5703125, + "learning_rate": 9.219946874104885e-06, + "loss": 0.1255, + "step": 7090 + }, + { + "epoch": 0.7213247993497918, + "grad_norm": 1.9453125, + "learning_rate": 9.157505229438481e-06, + "loss": 0.1999, + "step": 7100 + }, + { + "epoch": 0.7223407497714112, + "grad_norm": 5.1875, + "learning_rate": 9.095228301435518e-06, + "loss": 0.199, + "step": 7110 + }, + { + "epoch": 0.7233567001930306, + "grad_norm": 2.078125, + "learning_rate": 9.03311673759802e-06, + "loss": 0.2182, + "step": 7120 + }, + { + "epoch": 0.7243726506146501, + "grad_norm": 6.46875, + "learning_rate": 8.971171183708733e-06, + "loss": 0.1573, + "step": 7130 + }, + { + "epoch": 0.7253886010362695, + "grad_norm": 3.015625, + "learning_rate": 8.909392283824353e-06, + "loss": 0.2044, + "step": 7140 + }, + { + "epoch": 0.7264045514578888, + "grad_norm": 2.921875, + "learning_rate": 8.847780680268872e-06, + "loss": 0.11, + "step": 7150 + }, + { + "epoch": 0.7274205018795082, + "grad_norm": 2.96875, + "learning_rate": 8.786337013626853e-06, + "loss": 0.1897, + "step": 7160 + }, + { + "epoch": 0.7284364523011277, + "grad_norm": 1.7578125, + "learning_rate": 8.725061922736799e-06, + "loss": 0.153, + "step": 7170 + }, + { + "epoch": 0.7294524027227471, + "grad_norm": 1.609375, + "learning_rate": 8.663956044684532e-06, + "loss": 0.1746, + "step": 7180 + }, + { + "epoch": 0.7304683531443665, + "grad_norm": 1.9375, + "learning_rate": 8.603020014796507e-06, + "loss": 0.2284, + "step": 7190 + }, + { + "epoch": 0.7314843035659859, + "grad_norm": 1.515625, + "learning_rate": 8.542254466633273e-06, + "loss": 0.1186, + "step": 7200 + }, + { + "epoch": 0.7325002539876054, + "grad_norm": 1.671875, + "learning_rate": 8.481660031982844e-06, + "loss": 0.1971, + "step": 7210 + }, + { + "epoch": 0.7335162044092248, + "grad_norm": 1.453125, + "learning_rate": 8.421237340854157e-06, + "loss": 0.196, + "step": 7220 + }, + { + "epoch": 0.7345321548308442, + "grad_norm": 0.65234375, + "learning_rate": 8.360987021470479e-06, + "loss": 0.1724, + "step": 7230 + }, + { + "epoch": 0.7355481052524637, + "grad_norm": 2.84375, + "learning_rate": 8.300909700262929e-06, + "loss": 0.175, + "step": 7240 + }, + { + "epoch": 0.7365640556740831, + "grad_norm": 3.109375, + "learning_rate": 8.241006001863924e-06, + "loss": 0.2276, + "step": 7250 + }, + { + "epoch": 0.7375800060957025, + "grad_norm": 4.8125, + "learning_rate": 8.181276549100714e-06, + "loss": 0.2029, + "step": 7260 + }, + { + "epoch": 0.7385959565173219, + "grad_norm": 4.03125, + "learning_rate": 8.12172196298887e-06, + "loss": 0.175, + "step": 7270 + }, + { + "epoch": 0.7396119069389414, + "grad_norm": 3.046875, + "learning_rate": 8.062342862725878e-06, + "loss": 0.1662, + "step": 7280 + }, + { + "epoch": 0.7406278573605608, + "grad_norm": 3.375, + "learning_rate": 8.003139865684662e-06, + "loss": 0.1616, + "step": 7290 + }, + { + "epoch": 0.7416438077821802, + "grad_norm": 2.5625, + "learning_rate": 7.944113587407157e-06, + "loss": 0.2448, + "step": 7300 + }, + { + "epoch": 0.7426597582037997, + "grad_norm": 4.125, + "learning_rate": 7.885264641597961e-06, + "loss": 0.1618, + "step": 7310 + }, + { + "epoch": 0.7436757086254191, + "grad_norm": 3.5, + "learning_rate": 7.826593640117889e-06, + "loss": 0.1134, + "step": 7320 + }, + { + "epoch": 0.7446916590470385, + "grad_norm": 2.6875, + "learning_rate": 7.76810119297767e-06, + "loss": 0.1795, + "step": 7330 + }, + { + "epoch": 0.7457076094686579, + "grad_norm": 4.34375, + "learning_rate": 7.709787908331556e-06, + "loss": 0.2736, + "step": 7340 + }, + { + "epoch": 0.7467235598902774, + "grad_norm": 1.21875, + "learning_rate": 7.651654392471038e-06, + "loss": 0.139, + "step": 7350 + }, + { + "epoch": 0.7477395103118968, + "grad_norm": 3.578125, + "learning_rate": 7.593701249818521e-06, + "loss": 0.2023, + "step": 7360 + }, + { + "epoch": 0.7487554607335162, + "grad_norm": 2.15625, + "learning_rate": 7.535929082921048e-06, + "loss": 0.1702, + "step": 7370 + }, + { + "epoch": 0.7497714111551357, + "grad_norm": 1.96875, + "learning_rate": 7.47833849244402e-06, + "loss": 0.1835, + "step": 7380 + }, + { + "epoch": 0.7507873615767551, + "grad_norm": 2.796875, + "learning_rate": 7.420930077164959e-06, + "loss": 0.1713, + "step": 7390 + }, + { + "epoch": 0.7518033119983745, + "grad_norm": 4.46875, + "learning_rate": 7.363704433967311e-06, + "loss": 0.1906, + "step": 7400 + }, + { + "epoch": 0.7528192624199939, + "grad_norm": 1.75, + "learning_rate": 7.306662157834185e-06, + "loss": 0.1421, + "step": 7410 + }, + { + "epoch": 0.7538352128416134, + "grad_norm": 1.140625, + "learning_rate": 7.2498038418422145e-06, + "loss": 0.1793, + "step": 7420 + }, + { + "epoch": 0.7548511632632328, + "grad_norm": 2.578125, + "learning_rate": 7.193130077155374e-06, + "loss": 0.1603, + "step": 7430 + }, + { + "epoch": 0.7558671136848522, + "grad_norm": 4.3125, + "learning_rate": 7.13664145301883e-06, + "loss": 0.2169, + "step": 7440 + }, + { + "epoch": 0.7568830641064717, + "grad_norm": 3.078125, + "learning_rate": 7.0803385567528025e-06, + "loss": 0.1685, + "step": 7450 + }, + { + "epoch": 0.757899014528091, + "grad_norm": 3.5625, + "learning_rate": 7.024221973746495e-06, + "loss": 0.2282, + "step": 7460 + }, + { + "epoch": 0.7589149649497104, + "grad_norm": 2.265625, + "learning_rate": 6.968292287451961e-06, + "loss": 0.1786, + "step": 7470 + }, + { + "epoch": 0.7599309153713298, + "grad_norm": 4.71875, + "learning_rate": 6.912550079378091e-06, + "loss": 0.1811, + "step": 7480 + }, + { + "epoch": 0.7609468657929493, + "grad_norm": 2.328125, + "learning_rate": 6.856995929084506e-06, + "loss": 0.1747, + "step": 7490 + }, + { + "epoch": 0.7619628162145687, + "grad_norm": 5.21875, + "learning_rate": 6.801630414175589e-06, + "loss": 0.2028, + "step": 7500 + }, + { + "epoch": 0.7629787666361881, + "grad_norm": 3.78125, + "learning_rate": 6.746454110294451e-06, + "loss": 0.2255, + "step": 7510 + }, + { + "epoch": 0.7639947170578075, + "grad_norm": 1.625, + "learning_rate": 6.691467591116931e-06, + "loss": 0.1604, + "step": 7520 + }, + { + "epoch": 0.765010667479427, + "grad_norm": 1.7734375, + "learning_rate": 6.6366714283456755e-06, + "loss": 0.2559, + "step": 7530 + }, + { + "epoch": 0.7660266179010464, + "grad_norm": 4.59375, + "learning_rate": 6.582066191704142e-06, + "loss": 0.2034, + "step": 7540 + }, + { + "epoch": 0.7670425683226658, + "grad_norm": 1.578125, + "learning_rate": 6.527652448930724e-06, + "loss": 0.148, + "step": 7550 + }, + { + "epoch": 0.7680585187442853, + "grad_norm": 1.7109375, + "learning_rate": 6.4734307657728e-06, + "loss": 0.1811, + "step": 7560 + }, + { + "epoch": 0.7690744691659047, + "grad_norm": 1.2734375, + "learning_rate": 6.419401705980924e-06, + "loss": 0.1407, + "step": 7570 + }, + { + "epoch": 0.7700904195875241, + "grad_norm": 2.25, + "learning_rate": 6.365565831302869e-06, + "loss": 0.1893, + "step": 7580 + }, + { + "epoch": 0.7711063700091435, + "grad_norm": 1.625, + "learning_rate": 6.311923701477854e-06, + "loss": 0.1835, + "step": 7590 + }, + { + "epoch": 0.772122320430763, + "grad_norm": 2.375, + "learning_rate": 6.258475874230713e-06, + "loss": 0.1579, + "step": 7600 + }, + { + "epoch": 0.7731382708523824, + "grad_norm": 4.5, + "learning_rate": 6.205222905266067e-06, + "loss": 0.1794, + "step": 7610 + }, + { + "epoch": 0.7741542212740018, + "grad_norm": 4.25, + "learning_rate": 6.152165348262598e-06, + "loss": 0.1477, + "step": 7620 + }, + { + "epoch": 0.7751701716956213, + "grad_norm": 1.9765625, + "learning_rate": 6.0993037548672246e-06, + "loss": 0.2396, + "step": 7630 + }, + { + "epoch": 0.7761861221172407, + "grad_norm": 2.671875, + "learning_rate": 6.046638674689454e-06, + "loss": 0.1717, + "step": 7640 + }, + { + "epoch": 0.7772020725388601, + "grad_norm": 3.671875, + "learning_rate": 5.994170655295567e-06, + "loss": 0.2646, + "step": 7650 + }, + { + "epoch": 0.7782180229604795, + "grad_norm": 1.3046875, + "learning_rate": 5.9419002422030106e-06, + "loss": 0.1553, + "step": 7660 + }, + { + "epoch": 0.779233973382099, + "grad_norm": 3.734375, + "learning_rate": 5.889827978874665e-06, + "loss": 0.1854, + "step": 7670 + }, + { + "epoch": 0.7802499238037184, + "grad_norm": 2.140625, + "learning_rate": 5.837954406713245e-06, + "loss": 0.1857, + "step": 7680 + }, + { + "epoch": 0.7812658742253378, + "grad_norm": 3.34375, + "learning_rate": 5.786280065055619e-06, + "loss": 0.1797, + "step": 7690 + }, + { + "epoch": 0.7822818246469573, + "grad_norm": 0.97265625, + "learning_rate": 5.734805491167244e-06, + "loss": 0.1488, + "step": 7700 + }, + { + "epoch": 0.7832977750685767, + "grad_norm": 2.078125, + "learning_rate": 5.683531220236576e-06, + "loss": 0.1688, + "step": 7710 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 3.046875, + "learning_rate": 5.632457785369455e-06, + "loss": 0.1503, + "step": 7720 + }, + { + "epoch": 0.7853296759118155, + "grad_norm": 1.6875, + "learning_rate": 5.581585717583637e-06, + "loss": 0.1658, + "step": 7730 + }, + { + "epoch": 0.786345626333435, + "grad_norm": 3.421875, + "learning_rate": 5.530915545803209e-06, + "loss": 0.2112, + "step": 7740 + }, + { + "epoch": 0.7873615767550544, + "grad_norm": 4.1875, + "learning_rate": 5.480447796853141e-06, + "loss": 0.165, + "step": 7750 + }, + { + "epoch": 0.7883775271766738, + "grad_norm": 5.3125, + "learning_rate": 5.430182995453756e-06, + "loss": 0.1499, + "step": 7760 + }, + { + "epoch": 0.7893934775982933, + "grad_norm": 2.1875, + "learning_rate": 5.380121664215329e-06, + "loss": 0.1559, + "step": 7770 + }, + { + "epoch": 0.7904094280199127, + "grad_norm": 1.46875, + "learning_rate": 5.330264323632611e-06, + "loss": 0.2098, + "step": 7780 + }, + { + "epoch": 0.791425378441532, + "grad_norm": 4.65625, + "learning_rate": 5.280611492079449e-06, + "loss": 0.1776, + "step": 7790 + }, + { + "epoch": 0.7924413288631514, + "grad_norm": 1.3359375, + "learning_rate": 5.231163685803361e-06, + "loss": 0.1497, + "step": 7800 + }, + { + "epoch": 0.7934572792847709, + "grad_norm": 2.640625, + "learning_rate": 5.181921418920191e-06, + "loss": 0.12, + "step": 7810 + }, + { + "epoch": 0.7944732297063903, + "grad_norm": 2.328125, + "learning_rate": 5.13288520340878e-06, + "loss": 0.1981, + "step": 7820 + }, + { + "epoch": 0.7954891801280097, + "grad_norm": 3.0625, + "learning_rate": 5.084055549105596e-06, + "loss": 0.1389, + "step": 7830 + }, + { + "epoch": 0.7965051305496291, + "grad_norm": 2.796875, + "learning_rate": 5.035432963699479e-06, + "loss": 0.2293, + "step": 7840 + }, + { + "epoch": 0.7975210809712486, + "grad_norm": 5.0625, + "learning_rate": 4.98701795272635e-06, + "loss": 0.1618, + "step": 7850 + }, + { + "epoch": 0.798537031392868, + "grad_norm": 5.09375, + "learning_rate": 4.938811019563938e-06, + "loss": 0.1755, + "step": 7860 + }, + { + "epoch": 0.7995529818144874, + "grad_norm": 2.140625, + "learning_rate": 4.8908126654265475e-06, + "loss": 0.1565, + "step": 7870 + }, + { + "epoch": 0.8005689322361069, + "grad_norm": 0.76171875, + "learning_rate": 4.843023389359885e-06, + "loss": 0.2176, + "step": 7880 + }, + { + "epoch": 0.8015848826577263, + "grad_norm": 2.625, + "learning_rate": 4.79544368823581e-06, + "loss": 0.2013, + "step": 7890 + }, + { + "epoch": 0.8026008330793457, + "grad_norm": 2.078125, + "learning_rate": 4.748074056747234e-06, + "loss": 0.1246, + "step": 7900 + }, + { + "epoch": 0.8036167835009651, + "grad_norm": 3.5, + "learning_rate": 4.700914987402919e-06, + "loss": 0.1638, + "step": 7910 + }, + { + "epoch": 0.8046327339225846, + "grad_norm": 3.4375, + "learning_rate": 4.6539669705223916e-06, + "loss": 0.2213, + "step": 7920 + }, + { + "epoch": 0.805648684344204, + "grad_norm": 2.96875, + "learning_rate": 4.607230494230849e-06, + "loss": 0.1822, + "step": 7930 + }, + { + "epoch": 0.8066646347658234, + "grad_norm": 2.359375, + "learning_rate": 4.560706044454047e-06, + "loss": 0.1763, + "step": 7940 + }, + { + "epoch": 0.8076805851874429, + "grad_norm": 4.59375, + "learning_rate": 4.514394104913291e-06, + "loss": 0.234, + "step": 7950 + }, + { + "epoch": 0.8086965356090623, + "grad_norm": 1.96875, + "learning_rate": 4.468295157120372e-06, + "loss": 0.1939, + "step": 7960 + }, + { + "epoch": 0.8097124860306817, + "grad_norm": 2.578125, + "learning_rate": 4.422409680372594e-06, + "loss": 0.174, + "step": 7970 + }, + { + "epoch": 0.8107284364523011, + "grad_norm": 4.5625, + "learning_rate": 4.3767381517477505e-06, + "loss": 0.2375, + "step": 7980 + }, + { + "epoch": 0.8117443868739206, + "grad_norm": 0.9609375, + "learning_rate": 4.331281046099203e-06, + "loss": 0.2076, + "step": 7990 + }, + { + "epoch": 0.81276033729554, + "grad_norm": 6.0625, + "learning_rate": 4.286038836050929e-06, + "loss": 0.2504, + "step": 8000 + } + ], + "logging_steps": 10, + "max_steps": 9843, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoints/checkpoint-8000/training_args.bin b/checkpoints/checkpoint-8000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bb0a4ec10ab29ca1942aecdfa4212d352e373f1 --- /dev/null +++ b/checkpoints/checkpoint-8000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 +size 4984 diff --git a/checkpoints/checkpoint-8500/adapter_config.json b/checkpoints/checkpoint-8500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70 --- /dev/null +++ b/checkpoints/checkpoint-8500/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "gaussian", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoints/checkpoint-8500/adapter_model.safetensors b/checkpoints/checkpoint-8500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c4142c2f632b19891dd52f1c81cfe510afb044ab --- /dev/null +++ b/checkpoints/checkpoint-8500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28ac81ab3c97c69dfd1334375e0193bcbeec6d6567bc44b70349e272cd340ee0 +size 5919456 diff --git a/checkpoints/checkpoint-8500/optimizer.pt b/checkpoints/checkpoint-8500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0eb48e2be9f73c79b44660a595421fd4cb1d762c --- /dev/null +++ b/checkpoints/checkpoint-8500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a97970da7a377ddc03d41ac023f8d6e5a218717457c493a71d03647daee1d293 +size 11930938 diff --git a/checkpoints/checkpoint-8500/rng_state_0.pth b/checkpoints/checkpoint-8500/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..de8f82b7c4feb8b730311d766bf44d581384eae7 --- /dev/null +++ b/checkpoints/checkpoint-8500/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a353686e04717107f6f8ef46d31bd168a04a71491dff1b49a95dd841cae20dee +size 15024 diff --git a/checkpoints/checkpoint-8500/rng_state_1.pth b/checkpoints/checkpoint-8500/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..58bca56d46a97b8f2ee454510a5f93c7f3bb7410 --- /dev/null +++ b/checkpoints/checkpoint-8500/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59a024dac60d84768b968adca05416d784c0295ae139bead6f2816285ba6b01b +size 15024 diff --git a/checkpoints/checkpoint-8500/rng_state_2.pth b/checkpoints/checkpoint-8500/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9748eeb88596cf9f27c2821bec97e56aa5f2ccfc --- /dev/null +++ b/checkpoints/checkpoint-8500/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f28ab7ae50115d00765b2d4d2188a1d88b5bcb9be07f45fe7e78cb4752256bd +size 15024 diff --git a/checkpoints/checkpoint-8500/rng_state_3.pth b/checkpoints/checkpoint-8500/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..354e6dd8952c6a6e554d8c56f7f0daa728e13f22 --- /dev/null +++ b/checkpoints/checkpoint-8500/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce60533c28c44e2a4955c454dfc9eac41778818fa945283e28161cac010c0143 +size 15024 diff --git a/checkpoints/checkpoint-8500/scheduler.pt b/checkpoints/checkpoint-8500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f22ac3e321eef132a6a05d855a039dde00c7589a --- /dev/null +++ b/checkpoints/checkpoint-8500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d013f7dc21c771c94b398b7ee47f1c3b60572d9f3b478e50ee4f2b58d385070d +size 1064 diff --git a/checkpoints/checkpoint-8500/trainer_state.json b/checkpoints/checkpoint-8500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..3ef48260e073af534f11c2fb65f8473ce1a4a401 --- /dev/null +++ b/checkpoints/checkpoint-8500/trainer_state.json @@ -0,0 +1,5971 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8635578583765112, + "eval_steps": 500, + "global_step": 8500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001015950421619425, + "grad_norm": 17.625, + "learning_rate": 5e-06, + "loss": 3.4264, + "step": 10 + }, + { + "epoch": 0.00203190084323885, + "grad_norm": 12.5625, + "learning_rate": 1e-05, + "loss": 3.432, + "step": 20 + }, + { + "epoch": 0.003047851264858275, + "grad_norm": 14.0625, + "learning_rate": 1.5e-05, + "loss": 3.23, + "step": 30 + }, + { + "epoch": 0.0040638016864777, + "grad_norm": 12.4375, + "learning_rate": 2e-05, + "loss": 2.9762, + "step": 40 + }, + { + "epoch": 0.005079752108097125, + "grad_norm": 10.0625, + "learning_rate": 2.5e-05, + "loss": 2.6173, + "step": 50 + }, + { + "epoch": 0.00609570252971655, + "grad_norm": 10.1875, + "learning_rate": 3e-05, + "loss": 2.2004, + "step": 60 + }, + { + "epoch": 0.007111652951335975, + "grad_norm": 7.03125, + "learning_rate": 3.5e-05, + "loss": 1.4176, + "step": 70 + }, + { + "epoch": 0.0081276033729554, + "grad_norm": 4.375, + "learning_rate": 4e-05, + "loss": 1.0122, + "step": 80 + }, + { + "epoch": 0.009143553794574825, + "grad_norm": 6.5625, + "learning_rate": 4.5e-05, + "loss": 0.9116, + "step": 90 + }, + { + "epoch": 0.01015950421619425, + "grad_norm": 5.28125, + "learning_rate": 5e-05, + "loss": 0.6832, + "step": 100 + }, + { + "epoch": 0.011175454637813675, + "grad_norm": 5.5, + "learning_rate": 4.9999870035728426e-05, + "loss": 0.7355, + "step": 110 + }, + { + "epoch": 0.0121914050594331, + "grad_norm": 5.1875, + "learning_rate": 4.9999480144264944e-05, + "loss": 0.6673, + "step": 120 + }, + { + "epoch": 0.013207355481052525, + "grad_norm": 4.5, + "learning_rate": 4.9998830329663314e-05, + "loss": 0.6792, + "step": 130 + }, + { + "epoch": 0.01422330590267195, + "grad_norm": 3.9375, + "learning_rate": 4.9997920598679756e-05, + "loss": 0.6207, + "step": 140 + }, + { + "epoch": 0.015239256324291375, + "grad_norm": 3.15625, + "learning_rate": 4.999675096077286e-05, + "loss": 0.483, + "step": 150 + }, + { + "epoch": 0.0162552067459108, + "grad_norm": 5.28125, + "learning_rate": 4.999532142810354e-05, + "loss": 0.5319, + "step": 160 + }, + { + "epoch": 0.017271157167530225, + "grad_norm": 4.59375, + "learning_rate": 4.999363201553483e-05, + "loss": 0.6052, + "step": 170 + }, + { + "epoch": 0.01828710758914965, + "grad_norm": 5.03125, + "learning_rate": 4.9991682740631794e-05, + "loss": 0.4258, + "step": 180 + }, + { + "epoch": 0.019303058010769075, + "grad_norm": 3.859375, + "learning_rate": 4.998947362366133e-05, + "loss": 0.4309, + "step": 190 + }, + { + "epoch": 0.0203190084323885, + "grad_norm": 3.328125, + "learning_rate": 4.998700468759193e-05, + "loss": 0.3957, + "step": 200 + }, + { + "epoch": 0.021334958854007924, + "grad_norm": 4.9375, + "learning_rate": 4.9984275958093475e-05, + "loss": 0.4777, + "step": 210 + }, + { + "epoch": 0.02235090927562735, + "grad_norm": 4.78125, + "learning_rate": 4.998128746353695e-05, + "loss": 0.3549, + "step": 220 + }, + { + "epoch": 0.023366859697246774, + "grad_norm": 4.0625, + "learning_rate": 4.997803923499417e-05, + "loss": 0.4447, + "step": 230 + }, + { + "epoch": 0.0243828101188662, + "grad_norm": 6.375, + "learning_rate": 4.99745313062374e-05, + "loss": 0.3808, + "step": 240 + }, + { + "epoch": 0.025398760540485624, + "grad_norm": 3.59375, + "learning_rate": 4.99707637137391e-05, + "loss": 0.3827, + "step": 250 + }, + { + "epoch": 0.02641471096210505, + "grad_norm": 3.015625, + "learning_rate": 4.996673649667145e-05, + "loss": 0.3694, + "step": 260 + }, + { + "epoch": 0.027430661383724474, + "grad_norm": 2.296875, + "learning_rate": 4.9962449696906e-05, + "loss": 0.3586, + "step": 270 + }, + { + "epoch": 0.0284466118053439, + "grad_norm": 4.125, + "learning_rate": 4.9957903359013214e-05, + "loss": 0.3832, + "step": 280 + }, + { + "epoch": 0.029462562226963324, + "grad_norm": 3.296875, + "learning_rate": 4.995309753026201e-05, + "loss": 0.328, + "step": 290 + }, + { + "epoch": 0.03047851264858275, + "grad_norm": 4.5, + "learning_rate": 4.994803226061927e-05, + "loss": 0.3667, + "step": 300 + }, + { + "epoch": 0.03149446307020217, + "grad_norm": 4.3125, + "learning_rate": 4.994270760274933e-05, + "loss": 0.3811, + "step": 310 + }, + { + "epoch": 0.0325104134918216, + "grad_norm": 3.421875, + "learning_rate": 4.99371236120134e-05, + "loss": 0.3065, + "step": 320 + }, + { + "epoch": 0.03352636391344102, + "grad_norm": 4.6875, + "learning_rate": 4.993128034646902e-05, + "loss": 0.4177, + "step": 330 + }, + { + "epoch": 0.03454231433506045, + "grad_norm": 3.046875, + "learning_rate": 4.992517786686947e-05, + "loss": 0.33, + "step": 340 + }, + { + "epoch": 0.03555826475667987, + "grad_norm": 1.8828125, + "learning_rate": 4.9918816236663077e-05, + "loss": 0.3287, + "step": 350 + }, + { + "epoch": 0.0365742151782993, + "grad_norm": 3.8125, + "learning_rate": 4.991219552199262e-05, + "loss": 0.2934, + "step": 360 + }, + { + "epoch": 0.03759016559991872, + "grad_norm": 4.28125, + "learning_rate": 4.99053157916946e-05, + "loss": 0.3176, + "step": 370 + }, + { + "epoch": 0.03860611602153815, + "grad_norm": 2.609375, + "learning_rate": 4.989817711729856e-05, + "loss": 0.3318, + "step": 380 + }, + { + "epoch": 0.03962206644315757, + "grad_norm": 2.375, + "learning_rate": 4.98907795730263e-05, + "loss": 0.3234, + "step": 390 + }, + { + "epoch": 0.040638016864777, + "grad_norm": 4.46875, + "learning_rate": 4.988312323579114e-05, + "loss": 0.267, + "step": 400 + }, + { + "epoch": 0.04165396728639642, + "grad_norm": 3.75, + "learning_rate": 4.98752081851971e-05, + "loss": 0.3081, + "step": 410 + }, + { + "epoch": 0.04266991770801585, + "grad_norm": 2.203125, + "learning_rate": 4.986703450353809e-05, + "loss": 0.2917, + "step": 420 + }, + { + "epoch": 0.04368586812963527, + "grad_norm": 1.6015625, + "learning_rate": 4.985860227579703e-05, + "loss": 0.2805, + "step": 430 + }, + { + "epoch": 0.0447018185512547, + "grad_norm": 3.140625, + "learning_rate": 4.984991158964499e-05, + "loss": 0.3534, + "step": 440 + }, + { + "epoch": 0.04571776897287412, + "grad_norm": 3.296875, + "learning_rate": 4.9840962535440265e-05, + "loss": 0.335, + "step": 450 + }, + { + "epoch": 0.04673371939449355, + "grad_norm": 3.25, + "learning_rate": 4.983175520622744e-05, + "loss": 0.2544, + "step": 460 + }, + { + "epoch": 0.04774966981611297, + "grad_norm": 2.25, + "learning_rate": 4.982228969773642e-05, + "loss": 0.3449, + "step": 470 + }, + { + "epoch": 0.0487656202377324, + "grad_norm": 4.9375, + "learning_rate": 4.9812566108381435e-05, + "loss": 0.2964, + "step": 480 + }, + { + "epoch": 0.04978157065935182, + "grad_norm": 1.5703125, + "learning_rate": 4.9802584539260035e-05, + "loss": 0.2799, + "step": 490 + }, + { + "epoch": 0.05079752108097125, + "grad_norm": 2.828125, + "learning_rate": 4.979234509415199e-05, + "loss": 0.3231, + "step": 500 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 2.9375, + "learning_rate": 4.978184787951828e-05, + "loss": 0.2943, + "step": 510 + }, + { + "epoch": 0.0528294219242101, + "grad_norm": 2.34375, + "learning_rate": 4.977109300449992e-05, + "loss": 0.2705, + "step": 520 + }, + { + "epoch": 0.05384537234582952, + "grad_norm": 3.140625, + "learning_rate": 4.9760080580916876e-05, + "loss": 0.2998, + "step": 530 + }, + { + "epoch": 0.05486132276744895, + "grad_norm": 3.5625, + "learning_rate": 4.974881072326688e-05, + "loss": 0.2595, + "step": 540 + }, + { + "epoch": 0.05587727318906837, + "grad_norm": 4.25, + "learning_rate": 4.9737283548724236e-05, + "loss": 0.2803, + "step": 550 + }, + { + "epoch": 0.0568932236106878, + "grad_norm": 4.0625, + "learning_rate": 4.97254991771386e-05, + "loss": 0.3511, + "step": 560 + }, + { + "epoch": 0.05790917403230722, + "grad_norm": 2.515625, + "learning_rate": 4.971345773103377e-05, + "loss": 0.312, + "step": 570 + }, + { + "epoch": 0.05892512445392665, + "grad_norm": 3.21875, + "learning_rate": 4.9701159335606365e-05, + "loss": 0.2482, + "step": 580 + }, + { + "epoch": 0.05994107487554607, + "grad_norm": 5.5, + "learning_rate": 4.968860411872454e-05, + "loss": 0.2537, + "step": 590 + }, + { + "epoch": 0.0609570252971655, + "grad_norm": 3.546875, + "learning_rate": 4.967579221092666e-05, + "loss": 0.3125, + "step": 600 + }, + { + "epoch": 0.06197297571878492, + "grad_norm": 2.984375, + "learning_rate": 4.966272374541996e-05, + "loss": 0.2354, + "step": 610 + }, + { + "epoch": 0.06298892614040434, + "grad_norm": 3.6875, + "learning_rate": 4.964939885807912e-05, + "loss": 0.3213, + "step": 620 + }, + { + "epoch": 0.06400487656202378, + "grad_norm": 2.140625, + "learning_rate": 4.9635817687444876e-05, + "loss": 0.3003, + "step": 630 + }, + { + "epoch": 0.0650208269836432, + "grad_norm": 3.484375, + "learning_rate": 4.962198037472259e-05, + "loss": 0.2996, + "step": 640 + }, + { + "epoch": 0.06603677740526262, + "grad_norm": 3.21875, + "learning_rate": 4.9607887063780776e-05, + "loss": 0.2257, + "step": 650 + }, + { + "epoch": 0.06705272782688204, + "grad_norm": 5.375, + "learning_rate": 4.9593537901149564e-05, + "loss": 0.223, + "step": 660 + }, + { + "epoch": 0.06806867824850148, + "grad_norm": 4.1875, + "learning_rate": 4.957893303601924e-05, + "loss": 0.3407, + "step": 670 + }, + { + "epoch": 0.0690846286701209, + "grad_norm": 3.328125, + "learning_rate": 4.956407262023866e-05, + "loss": 0.2589, + "step": 680 + }, + { + "epoch": 0.07010057909174032, + "grad_norm": 2.953125, + "learning_rate": 4.954895680831367e-05, + "loss": 0.2949, + "step": 690 + }, + { + "epoch": 0.07111652951335974, + "grad_norm": 4.0625, + "learning_rate": 4.9533585757405506e-05, + "loss": 0.2995, + "step": 700 + }, + { + "epoch": 0.07213247993497918, + "grad_norm": 4.625, + "learning_rate": 4.951795962732917e-05, + "loss": 0.2894, + "step": 710 + }, + { + "epoch": 0.0731484303565986, + "grad_norm": 3.0, + "learning_rate": 4.9502078580551755e-05, + "loss": 0.3082, + "step": 720 + }, + { + "epoch": 0.07416438077821802, + "grad_norm": 3.65625, + "learning_rate": 4.9485942782190734e-05, + "loss": 0.2308, + "step": 730 + }, + { + "epoch": 0.07518033119983744, + "grad_norm": 4.78125, + "learning_rate": 4.9469552400012306e-05, + "loss": 0.2272, + "step": 740 + }, + { + "epoch": 0.07619628162145688, + "grad_norm": 4.25, + "learning_rate": 4.94529076044296e-05, + "loss": 0.2701, + "step": 750 + }, + { + "epoch": 0.0772122320430763, + "grad_norm": 3.140625, + "learning_rate": 4.94360085685009e-05, + "loss": 0.2686, + "step": 760 + }, + { + "epoch": 0.07822818246469572, + "grad_norm": 0.765625, + "learning_rate": 4.9418855467927894e-05, + "loss": 0.2051, + "step": 770 + }, + { + "epoch": 0.07924413288631514, + "grad_norm": 1.796875, + "learning_rate": 4.940144848105379e-05, + "loss": 0.2267, + "step": 780 + }, + { + "epoch": 0.08026008330793458, + "grad_norm": 4.5625, + "learning_rate": 4.93837877888615e-05, + "loss": 0.2597, + "step": 790 + }, + { + "epoch": 0.081276033729554, + "grad_norm": 3.03125, + "learning_rate": 4.9365873574971745e-05, + "loss": 0.3701, + "step": 800 + }, + { + "epoch": 0.08229198415117342, + "grad_norm": 4.5625, + "learning_rate": 4.9347706025641136e-05, + "loss": 0.2559, + "step": 810 + }, + { + "epoch": 0.08330793457279284, + "grad_norm": 3.90625, + "learning_rate": 4.9329285329760275e-05, + "loss": 0.2799, + "step": 820 + }, + { + "epoch": 0.08432388499441228, + "grad_norm": 3.140625, + "learning_rate": 4.9310611678851735e-05, + "loss": 0.2866, + "step": 830 + }, + { + "epoch": 0.0853398354160317, + "grad_norm": 2.46875, + "learning_rate": 4.929168526706811e-05, + "loss": 0.3105, + "step": 840 + }, + { + "epoch": 0.08635578583765112, + "grad_norm": 13.625, + "learning_rate": 4.927250629119e-05, + "loss": 0.2454, + "step": 850 + }, + { + "epoch": 0.08737173625927054, + "grad_norm": 3.921875, + "learning_rate": 4.9253074950623925e-05, + "loss": 0.2424, + "step": 860 + }, + { + "epoch": 0.08838768668088998, + "grad_norm": 2.90625, + "learning_rate": 4.9233391447400286e-05, + "loss": 0.2481, + "step": 870 + }, + { + "epoch": 0.0894036371025094, + "grad_norm": 2.96875, + "learning_rate": 4.921345598617125e-05, + "loss": 0.2231, + "step": 880 + }, + { + "epoch": 0.09041958752412882, + "grad_norm": 5.375, + "learning_rate": 4.9193268774208654e-05, + "loss": 0.3447, + "step": 890 + }, + { + "epoch": 0.09143553794574824, + "grad_norm": 2.0, + "learning_rate": 4.9172830021401785e-05, + "loss": 0.229, + "step": 900 + }, + { + "epoch": 0.09245148836736768, + "grad_norm": 3.1875, + "learning_rate": 4.9152139940255245e-05, + "loss": 0.2122, + "step": 910 + }, + { + "epoch": 0.0934674387889871, + "grad_norm": 3.40625, + "learning_rate": 4.913119874588677e-05, + "loss": 0.2386, + "step": 920 + }, + { + "epoch": 0.09448338921060652, + "grad_norm": 1.4609375, + "learning_rate": 4.911000665602489e-05, + "loss": 0.1944, + "step": 930 + }, + { + "epoch": 0.09549933963222594, + "grad_norm": 5.0625, + "learning_rate": 4.9088563891006786e-05, + "loss": 0.2038, + "step": 940 + }, + { + "epoch": 0.09651529005384538, + "grad_norm": 4.53125, + "learning_rate": 4.906687067377592e-05, + "loss": 0.3122, + "step": 950 + }, + { + "epoch": 0.0975312404754648, + "grad_norm": 2.84375, + "learning_rate": 4.904492722987976e-05, + "loss": 0.3157, + "step": 960 + }, + { + "epoch": 0.09854719089708422, + "grad_norm": 2.171875, + "learning_rate": 4.902273378746738e-05, + "loss": 0.3077, + "step": 970 + }, + { + "epoch": 0.09956314131870364, + "grad_norm": 2.84375, + "learning_rate": 4.9000290577287165e-05, + "loss": 0.2756, + "step": 980 + }, + { + "epoch": 0.10057909174032308, + "grad_norm": 0.99609375, + "learning_rate": 4.897759783268434e-05, + "loss": 0.2915, + "step": 990 + }, + { + "epoch": 0.1015950421619425, + "grad_norm": 3.53125, + "learning_rate": 4.895465578959859e-05, + "loss": 0.2052, + "step": 1000 + }, + { + "epoch": 0.10261099258356192, + "grad_norm": 4.0, + "learning_rate": 4.893146468656159e-05, + "loss": 0.2499, + "step": 1010 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 1.65625, + "learning_rate": 4.890802476469452e-05, + "loss": 0.278, + "step": 1020 + }, + { + "epoch": 0.10464289342680078, + "grad_norm": 3.625, + "learning_rate": 4.888433626770558e-05, + "loss": 0.2143, + "step": 1030 + }, + { + "epoch": 0.1056588438484202, + "grad_norm": 5.0625, + "learning_rate": 4.886039944188741e-05, + "loss": 0.2878, + "step": 1040 + }, + { + "epoch": 0.10667479427003962, + "grad_norm": 4.5, + "learning_rate": 4.883621453611461e-05, + "loss": 0.2744, + "step": 1050 + }, + { + "epoch": 0.10769074469165904, + "grad_norm": 4.5625, + "learning_rate": 4.881178180184106e-05, + "loss": 0.2734, + "step": 1060 + }, + { + "epoch": 0.10870669511327848, + "grad_norm": 3.125, + "learning_rate": 4.878710149309735e-05, + "loss": 0.3574, + "step": 1070 + }, + { + "epoch": 0.1097226455348979, + "grad_norm": 3.0625, + "learning_rate": 4.876217386648816e-05, + "loss": 0.2625, + "step": 1080 + }, + { + "epoch": 0.11073859595651732, + "grad_norm": 4.0625, + "learning_rate": 4.873699918118955e-05, + "loss": 0.2437, + "step": 1090 + }, + { + "epoch": 0.11175454637813674, + "grad_norm": 1.59375, + "learning_rate": 4.87115776989463e-05, + "loss": 0.2051, + "step": 1100 + }, + { + "epoch": 0.11277049679975618, + "grad_norm": 4.375, + "learning_rate": 4.8685909684069153e-05, + "loss": 0.1727, + "step": 1110 + }, + { + "epoch": 0.1137864472213756, + "grad_norm": 2.28125, + "learning_rate": 4.865999540343211e-05, + "loss": 0.2256, + "step": 1120 + }, + { + "epoch": 0.11480239764299502, + "grad_norm": 2.265625, + "learning_rate": 4.86338351264696e-05, + "loss": 0.3529, + "step": 1130 + }, + { + "epoch": 0.11581834806461444, + "grad_norm": 2.34375, + "learning_rate": 4.8607429125173754e-05, + "loss": 0.2113, + "step": 1140 + }, + { + "epoch": 0.11683429848623388, + "grad_norm": 0.7578125, + "learning_rate": 4.858077767409149e-05, + "loss": 0.2759, + "step": 1150 + }, + { + "epoch": 0.1178502489078533, + "grad_norm": 3.640625, + "learning_rate": 4.855388105032174e-05, + "loss": 0.2482, + "step": 1160 + }, + { + "epoch": 0.11886619932947272, + "grad_norm": 3.5, + "learning_rate": 4.852673953351249e-05, + "loss": 0.1865, + "step": 1170 + }, + { + "epoch": 0.11988214975109214, + "grad_norm": 3.75, + "learning_rate": 4.849935340585796e-05, + "loss": 0.2659, + "step": 1180 + }, + { + "epoch": 0.12089810017271158, + "grad_norm": 3.375, + "learning_rate": 4.8471722952095586e-05, + "loss": 0.1506, + "step": 1190 + }, + { + "epoch": 0.121914050594331, + "grad_norm": 3.34375, + "learning_rate": 4.844384845950312e-05, + "loss": 0.307, + "step": 1200 + }, + { + "epoch": 0.12293000101595042, + "grad_norm": 1.578125, + "learning_rate": 4.841573021789561e-05, + "loss": 0.1952, + "step": 1210 + }, + { + "epoch": 0.12394595143756984, + "grad_norm": 1.2890625, + "learning_rate": 4.838736851962239e-05, + "loss": 0.1779, + "step": 1220 + }, + { + "epoch": 0.12496190185918928, + "grad_norm": 1.265625, + "learning_rate": 4.835876365956408e-05, + "loss": 0.1235, + "step": 1230 + }, + { + "epoch": 0.12597785228080868, + "grad_norm": 1.9609375, + "learning_rate": 4.8329915935129436e-05, + "loss": 0.1876, + "step": 1240 + }, + { + "epoch": 0.12699380270242813, + "grad_norm": 1.6328125, + "learning_rate": 4.830082564625235e-05, + "loss": 0.2188, + "step": 1250 + }, + { + "epoch": 0.12800975312404755, + "grad_norm": 3.96875, + "learning_rate": 4.8271493095388684e-05, + "loss": 0.2622, + "step": 1260 + }, + { + "epoch": 0.12902570354566698, + "grad_norm": 3.765625, + "learning_rate": 4.824191858751312e-05, + "loss": 0.2724, + "step": 1270 + }, + { + "epoch": 0.1300416539672864, + "grad_norm": 5.59375, + "learning_rate": 4.821210243011601e-05, + "loss": 0.2413, + "step": 1280 + }, + { + "epoch": 0.13105760438890582, + "grad_norm": 3.34375, + "learning_rate": 4.818204493320016e-05, + "loss": 0.2618, + "step": 1290 + }, + { + "epoch": 0.13207355481052524, + "grad_norm": 2.78125, + "learning_rate": 4.8151746409277634e-05, + "loss": 0.2295, + "step": 1300 + }, + { + "epoch": 0.13308950523214466, + "grad_norm": 3.1875, + "learning_rate": 4.8121207173366484e-05, + "loss": 0.2733, + "step": 1310 + }, + { + "epoch": 0.13410545565376408, + "grad_norm": 2.28125, + "learning_rate": 4.809042754298746e-05, + "loss": 0.2311, + "step": 1320 + }, + { + "epoch": 0.13512140607538353, + "grad_norm": 2.171875, + "learning_rate": 4.805940783816075e-05, + "loss": 0.2059, + "step": 1330 + }, + { + "epoch": 0.13613735649700295, + "grad_norm": 2.796875, + "learning_rate": 4.8028148381402625e-05, + "loss": 0.2102, + "step": 1340 + }, + { + "epoch": 0.13715330691862238, + "grad_norm": 2.96875, + "learning_rate": 4.7996649497722084e-05, + "loss": 0.2708, + "step": 1350 + }, + { + "epoch": 0.1381692573402418, + "grad_norm": 2.4375, + "learning_rate": 4.7964911514617485e-05, + "loss": 0.2429, + "step": 1360 + }, + { + "epoch": 0.13918520776186122, + "grad_norm": 5.8125, + "learning_rate": 4.793293476207312e-05, + "loss": 0.2725, + "step": 1370 + }, + { + "epoch": 0.14020115818348064, + "grad_norm": 2.40625, + "learning_rate": 4.790071957255585e-05, + "loss": 0.2098, + "step": 1380 + }, + { + "epoch": 0.14121710860510006, + "grad_norm": 4.25, + "learning_rate": 4.786826628101154e-05, + "loss": 0.2101, + "step": 1390 + }, + { + "epoch": 0.14223305902671948, + "grad_norm": 2.578125, + "learning_rate": 4.783557522486167e-05, + "loss": 0.2624, + "step": 1400 + }, + { + "epoch": 0.14324900944833893, + "grad_norm": 3.125, + "learning_rate": 4.780264674399978e-05, + "loss": 0.2518, + "step": 1410 + }, + { + "epoch": 0.14426495986995835, + "grad_norm": 3.671875, + "learning_rate": 4.7769481180787966e-05, + "loss": 0.3112, + "step": 1420 + }, + { + "epoch": 0.14528091029157778, + "grad_norm": 3.984375, + "learning_rate": 4.773607888005327e-05, + "loss": 0.2747, + "step": 1430 + }, + { + "epoch": 0.1462968607131972, + "grad_norm": 3.234375, + "learning_rate": 4.770244018908416e-05, + "loss": 0.1572, + "step": 1440 + }, + { + "epoch": 0.14731281113481662, + "grad_norm": 4.09375, + "learning_rate": 4.766856545762687e-05, + "loss": 0.2148, + "step": 1450 + }, + { + "epoch": 0.14832876155643604, + "grad_norm": 1.6875, + "learning_rate": 4.763445503788178e-05, + "loss": 0.2531, + "step": 1460 + }, + { + "epoch": 0.14934471197805546, + "grad_norm": 2.375, + "learning_rate": 4.760010928449976e-05, + "loss": 0.199, + "step": 1470 + }, + { + "epoch": 0.15036066239967488, + "grad_norm": 4.6875, + "learning_rate": 4.7565528554578485e-05, + "loss": 0.2366, + "step": 1480 + }, + { + "epoch": 0.15137661282129433, + "grad_norm": 5.4375, + "learning_rate": 4.75307132076587e-05, + "loss": 0.1862, + "step": 1490 + }, + { + "epoch": 0.15239256324291375, + "grad_norm": 2.484375, + "learning_rate": 4.749566360572049e-05, + "loss": 0.2143, + "step": 1500 + }, + { + "epoch": 0.15340851366453317, + "grad_norm": 2.1875, + "learning_rate": 4.746038011317955e-05, + "loss": 0.1877, + "step": 1510 + }, + { + "epoch": 0.1544244640861526, + "grad_norm": 2.84375, + "learning_rate": 4.742486309688333e-05, + "loss": 0.2831, + "step": 1520 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 2.015625, + "learning_rate": 4.738911292610732e-05, + "loss": 0.1708, + "step": 1530 + }, + { + "epoch": 0.15645636492939144, + "grad_norm": 3.953125, + "learning_rate": 4.735312997255107e-05, + "loss": 0.192, + "step": 1540 + }, + { + "epoch": 0.15747231535101086, + "grad_norm": 2.09375, + "learning_rate": 4.7316914610334475e-05, + "loss": 0.2586, + "step": 1550 + }, + { + "epoch": 0.15848826577263028, + "grad_norm": 3.6875, + "learning_rate": 4.728046721599378e-05, + "loss": 0.2141, + "step": 1560 + }, + { + "epoch": 0.15950421619424973, + "grad_norm": 2.9375, + "learning_rate": 4.724378816847771e-05, + "loss": 0.193, + "step": 1570 + }, + { + "epoch": 0.16052016661586915, + "grad_norm": 1.5625, + "learning_rate": 4.720687784914352e-05, + "loss": 0.191, + "step": 1580 + }, + { + "epoch": 0.16153611703748857, + "grad_norm": 3.75, + "learning_rate": 4.716973664175304e-05, + "loss": 0.2172, + "step": 1590 + }, + { + "epoch": 0.162552067459108, + "grad_norm": 3.125, + "learning_rate": 4.7132364932468645e-05, + "loss": 0.2134, + "step": 1600 + }, + { + "epoch": 0.16356801788072742, + "grad_norm": 4.09375, + "learning_rate": 4.709476310984932e-05, + "loss": 0.2055, + "step": 1610 + }, + { + "epoch": 0.16458396830234684, + "grad_norm": 3.875, + "learning_rate": 4.705693156484652e-05, + "loss": 0.2136, + "step": 1620 + }, + { + "epoch": 0.16559991872396626, + "grad_norm": 1.1796875, + "learning_rate": 4.7018870690800196e-05, + "loss": 0.1471, + "step": 1630 + }, + { + "epoch": 0.16661586914558568, + "grad_norm": 2.5, + "learning_rate": 4.698058088343465e-05, + "loss": 0.2308, + "step": 1640 + }, + { + "epoch": 0.16763181956720513, + "grad_norm": 1.390625, + "learning_rate": 4.6942062540854425e-05, + "loss": 0.2456, + "step": 1650 + }, + { + "epoch": 0.16864776998882455, + "grad_norm": 3.125, + "learning_rate": 4.69033160635402e-05, + "loss": 0.2654, + "step": 1660 + }, + { + "epoch": 0.16966372041044397, + "grad_norm": 3.984375, + "learning_rate": 4.6864341854344587e-05, + "loss": 0.2226, + "step": 1670 + }, + { + "epoch": 0.1706796708320634, + "grad_norm": 2.328125, + "learning_rate": 4.682514031848795e-05, + "loss": 0.2438, + "step": 1680 + }, + { + "epoch": 0.17169562125368282, + "grad_norm": 3.078125, + "learning_rate": 4.678571186355423e-05, + "loss": 0.1889, + "step": 1690 + }, + { + "epoch": 0.17271157167530224, + "grad_norm": 3.328125, + "learning_rate": 4.6746056899486644e-05, + "loss": 0.2117, + "step": 1700 + }, + { + "epoch": 0.17372752209692166, + "grad_norm": 2.78125, + "learning_rate": 4.67061758385835e-05, + "loss": 0.1953, + "step": 1710 + }, + { + "epoch": 0.17474347251854108, + "grad_norm": 3.09375, + "learning_rate": 4.6666069095493816e-05, + "loss": 0.1844, + "step": 1720 + }, + { + "epoch": 0.17575942294016053, + "grad_norm": 3.234375, + "learning_rate": 4.662573708721309e-05, + "loss": 0.2774, + "step": 1730 + }, + { + "epoch": 0.17677537336177995, + "grad_norm": 4.03125, + "learning_rate": 4.658518023307894e-05, + "loss": 0.2527, + "step": 1740 + }, + { + "epoch": 0.17779132378339937, + "grad_norm": 3.21875, + "learning_rate": 4.654439895476671e-05, + "loss": 0.2164, + "step": 1750 + }, + { + "epoch": 0.1788072742050188, + "grad_norm": 2.390625, + "learning_rate": 4.6503393676285146e-05, + "loss": 0.2424, + "step": 1760 + }, + { + "epoch": 0.17982322462663822, + "grad_norm": 1.8359375, + "learning_rate": 4.646216482397192e-05, + "loss": 0.2428, + "step": 1770 + }, + { + "epoch": 0.18083917504825764, + "grad_norm": 2.796875, + "learning_rate": 4.6420712826489275e-05, + "loss": 0.2155, + "step": 1780 + }, + { + "epoch": 0.18185512546987706, + "grad_norm": 0.69921875, + "learning_rate": 4.6379038114819485e-05, + "loss": 0.1544, + "step": 1790 + }, + { + "epoch": 0.18287107589149648, + "grad_norm": 3.40625, + "learning_rate": 4.6337141122260444e-05, + "loss": 0.2029, + "step": 1800 + }, + { + "epoch": 0.18388702631311593, + "grad_norm": 2.359375, + "learning_rate": 4.629502228442112e-05, + "loss": 0.1489, + "step": 1810 + }, + { + "epoch": 0.18490297673473535, + "grad_norm": 1.4453125, + "learning_rate": 4.6252682039217045e-05, + "loss": 0.2101, + "step": 1820 + }, + { + "epoch": 0.18591892715635477, + "grad_norm": 2.71875, + "learning_rate": 4.621012082686573e-05, + "loss": 0.2076, + "step": 1830 + }, + { + "epoch": 0.1869348775779742, + "grad_norm": 3.0625, + "learning_rate": 4.616733908988216e-05, + "loss": 0.2719, + "step": 1840 + }, + { + "epoch": 0.18795082799959362, + "grad_norm": 1.953125, + "learning_rate": 4.612433727307409e-05, + "loss": 0.2105, + "step": 1850 + }, + { + "epoch": 0.18896677842121304, + "grad_norm": 3.46875, + "learning_rate": 4.608111582353751e-05, + "loss": 0.1877, + "step": 1860 + }, + { + "epoch": 0.18998272884283246, + "grad_norm": 2.546875, + "learning_rate": 4.603767519065197e-05, + "loss": 0.2238, + "step": 1870 + }, + { + "epoch": 0.19099867926445188, + "grad_norm": 1.5703125, + "learning_rate": 4.599401582607589e-05, + "loss": 0.243, + "step": 1880 + }, + { + "epoch": 0.19201462968607133, + "grad_norm": 2.4375, + "learning_rate": 4.595013818374185e-05, + "loss": 0.1867, + "step": 1890 + }, + { + "epoch": 0.19303058010769075, + "grad_norm": 2.203125, + "learning_rate": 4.5906042719851925e-05, + "loss": 0.1994, + "step": 1900 + }, + { + "epoch": 0.19404653052931017, + "grad_norm": 3.984375, + "learning_rate": 4.586172989287291e-05, + "loss": 0.1899, + "step": 1910 + }, + { + "epoch": 0.1950624809509296, + "grad_norm": 2.6875, + "learning_rate": 4.5817200163531534e-05, + "loss": 0.2528, + "step": 1920 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 2.71875, + "learning_rate": 4.577245399480972e-05, + "loss": 0.2336, + "step": 1930 + }, + { + "epoch": 0.19709438179416844, + "grad_norm": 2.640625, + "learning_rate": 4.5727491851939715e-05, + "loss": 0.2204, + "step": 1940 + }, + { + "epoch": 0.19811033221578786, + "grad_norm": 1.78125, + "learning_rate": 4.568231420239929e-05, + "loss": 0.1656, + "step": 1950 + }, + { + "epoch": 0.19912628263740728, + "grad_norm": 3.15625, + "learning_rate": 4.563692151590687e-05, + "loss": 0.2105, + "step": 1960 + }, + { + "epoch": 0.20014223305902673, + "grad_norm": 1.3671875, + "learning_rate": 4.5591314264416666e-05, + "loss": 0.1464, + "step": 1970 + }, + { + "epoch": 0.20115818348064615, + "grad_norm": 4.25, + "learning_rate": 4.554549292211371e-05, + "loss": 0.2103, + "step": 1980 + }, + { + "epoch": 0.20217413390226557, + "grad_norm": 2.984375, + "learning_rate": 4.549945796540901e-05, + "loss": 0.144, + "step": 1990 + }, + { + "epoch": 0.203190084323885, + "grad_norm": 1.859375, + "learning_rate": 4.545320987293453e-05, + "loss": 0.1963, + "step": 2000 + }, + { + "epoch": 0.20420603474550442, + "grad_norm": 1.078125, + "learning_rate": 4.540674912553824e-05, + "loss": 0.2115, + "step": 2010 + }, + { + "epoch": 0.20522198516712384, + "grad_norm": 4.25, + "learning_rate": 4.536007620627911e-05, + "loss": 0.1682, + "step": 2020 + }, + { + "epoch": 0.20623793558874326, + "grad_norm": 2.71875, + "learning_rate": 4.531319160042212e-05, + "loss": 0.1992, + "step": 2030 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 1.2890625, + "learning_rate": 4.5266095795433126e-05, + "loss": 0.1134, + "step": 2040 + }, + { + "epoch": 0.20826983643198213, + "grad_norm": 3.296875, + "learning_rate": 4.5218789280973925e-05, + "loss": 0.1474, + "step": 2050 + }, + { + "epoch": 0.20928578685360155, + "grad_norm": 1.9375, + "learning_rate": 4.5171272548897024e-05, + "loss": 0.1955, + "step": 2060 + }, + { + "epoch": 0.21030173727522097, + "grad_norm": 2.734375, + "learning_rate": 4.512354609324063e-05, + "loss": 0.2042, + "step": 2070 + }, + { + "epoch": 0.2113176876968404, + "grad_norm": 2.921875, + "learning_rate": 4.507561041022347e-05, + "loss": 0.2174, + "step": 2080 + }, + { + "epoch": 0.21233363811845982, + "grad_norm": 2.40625, + "learning_rate": 4.502746599823963e-05, + "loss": 0.2634, + "step": 2090 + }, + { + "epoch": 0.21334958854007924, + "grad_norm": 1.71875, + "learning_rate": 4.497911335785339e-05, + "loss": 0.1884, + "step": 2100 + }, + { + "epoch": 0.21436553896169866, + "grad_norm": 0.79296875, + "learning_rate": 4.4930552991794e-05, + "loss": 0.1872, + "step": 2110 + }, + { + "epoch": 0.21538148938331808, + "grad_norm": 3.171875, + "learning_rate": 4.4881785404950474e-05, + "loss": 0.2233, + "step": 2120 + }, + { + "epoch": 0.21639743980493753, + "grad_norm": 2.59375, + "learning_rate": 4.483281110436631e-05, + "loss": 0.2374, + "step": 2130 + }, + { + "epoch": 0.21741339022655695, + "grad_norm": 3.328125, + "learning_rate": 4.478363059923426e-05, + "loss": 0.2545, + "step": 2140 + }, + { + "epoch": 0.21842934064817637, + "grad_norm": 2.3125, + "learning_rate": 4.4734244400891014e-05, + "loss": 0.2063, + "step": 2150 + }, + { + "epoch": 0.2194452910697958, + "grad_norm": 3.40625, + "learning_rate": 4.4684653022811865e-05, + "loss": 0.1219, + "step": 2160 + }, + { + "epoch": 0.22046124149141522, + "grad_norm": 4.1875, + "learning_rate": 4.463485698060541e-05, + "loss": 0.2805, + "step": 2170 + }, + { + "epoch": 0.22147719191303464, + "grad_norm": 2.3125, + "learning_rate": 4.458485679200814e-05, + "loss": 0.1998, + "step": 2180 + }, + { + "epoch": 0.22249314233465406, + "grad_norm": 3.578125, + "learning_rate": 4.453465297687912e-05, + "loss": 0.2489, + "step": 2190 + }, + { + "epoch": 0.22350909275627348, + "grad_norm": 2.59375, + "learning_rate": 4.448424605719452e-05, + "loss": 0.2731, + "step": 2200 + }, + { + "epoch": 0.22452504317789293, + "grad_norm": 3.28125, + "learning_rate": 4.443363655704224e-05, + "loss": 0.2425, + "step": 2210 + }, + { + "epoch": 0.22554099359951235, + "grad_norm": 2.78125, + "learning_rate": 4.438282500261641e-05, + "loss": 0.2938, + "step": 2220 + }, + { + "epoch": 0.22655694402113177, + "grad_norm": 1.1953125, + "learning_rate": 4.433181192221197e-05, + "loss": 0.1728, + "step": 2230 + }, + { + "epoch": 0.2275728944427512, + "grad_norm": 1.34375, + "learning_rate": 4.4280597846219155e-05, + "loss": 0.216, + "step": 2240 + }, + { + "epoch": 0.22858884486437062, + "grad_norm": 1.8515625, + "learning_rate": 4.422918330711796e-05, + "loss": 0.1612, + "step": 2250 + }, + { + "epoch": 0.22960479528599004, + "grad_norm": 1.90625, + "learning_rate": 4.417756883947263e-05, + "loss": 0.107, + "step": 2260 + }, + { + "epoch": 0.23062074570760946, + "grad_norm": 3.375, + "learning_rate": 4.412575497992611e-05, + "loss": 0.1756, + "step": 2270 + }, + { + "epoch": 0.23163669612922888, + "grad_norm": 4.375, + "learning_rate": 4.407374226719445e-05, + "loss": 0.234, + "step": 2280 + }, + { + "epoch": 0.23265264655084833, + "grad_norm": 3.25, + "learning_rate": 4.402153124206119e-05, + "loss": 0.2144, + "step": 2290 + }, + { + "epoch": 0.23366859697246775, + "grad_norm": 1.703125, + "learning_rate": 4.396912244737173e-05, + "loss": 0.1696, + "step": 2300 + }, + { + "epoch": 0.23468454739408717, + "grad_norm": 2.84375, + "learning_rate": 4.391651642802778e-05, + "loss": 0.2506, + "step": 2310 + }, + { + "epoch": 0.2357004978157066, + "grad_norm": 4.5, + "learning_rate": 4.386371373098155e-05, + "loss": 0.1686, + "step": 2320 + }, + { + "epoch": 0.23671644823732602, + "grad_norm": 2.515625, + "learning_rate": 4.381071490523018e-05, + "loss": 0.2403, + "step": 2330 + }, + { + "epoch": 0.23773239865894544, + "grad_norm": 4.4375, + "learning_rate": 4.3757520501809955e-05, + "loss": 0.1611, + "step": 2340 + }, + { + "epoch": 0.23874834908056486, + "grad_norm": 1.609375, + "learning_rate": 4.370413107379065e-05, + "loss": 0.1698, + "step": 2350 + }, + { + "epoch": 0.23976429950218428, + "grad_norm": 4.96875, + "learning_rate": 4.36505471762697e-05, + "loss": 0.1928, + "step": 2360 + }, + { + "epoch": 0.24078024992380373, + "grad_norm": 0.8984375, + "learning_rate": 4.3596769366366474e-05, + "loss": 0.2035, + "step": 2370 + }, + { + "epoch": 0.24179620034542315, + "grad_norm": 5.75, + "learning_rate": 4.354279820321649e-05, + "loss": 0.16, + "step": 2380 + }, + { + "epoch": 0.24281215076704257, + "grad_norm": 1.9453125, + "learning_rate": 4.34886342479656e-05, + "loss": 0.1851, + "step": 2390 + }, + { + "epoch": 0.243828101188662, + "grad_norm": 1.015625, + "learning_rate": 4.34342780637641e-05, + "loss": 0.1726, + "step": 2400 + }, + { + "epoch": 0.24484405161028142, + "grad_norm": 4.59375, + "learning_rate": 4.337973021576095e-05, + "loss": 0.2847, + "step": 2410 + }, + { + "epoch": 0.24586000203190084, + "grad_norm": 1.03125, + "learning_rate": 4.3324991271097846e-05, + "loss": 0.2528, + "step": 2420 + }, + { + "epoch": 0.24687595245352026, + "grad_norm": 2.1875, + "learning_rate": 4.3270061798903374e-05, + "loss": 0.1573, + "step": 2430 + }, + { + "epoch": 0.24789190287513968, + "grad_norm": 0.98046875, + "learning_rate": 4.321494237028701e-05, + "loss": 0.1703, + "step": 2440 + }, + { + "epoch": 0.24890785329675913, + "grad_norm": 3.8125, + "learning_rate": 4.31596335583333e-05, + "loss": 0.2613, + "step": 2450 + }, + { + "epoch": 0.24992380371837855, + "grad_norm": 4.0625, + "learning_rate": 4.310413593809579e-05, + "loss": 0.22, + "step": 2460 + }, + { + "epoch": 0.250939754139998, + "grad_norm": 3.15625, + "learning_rate": 4.304845008659108e-05, + "loss": 0.1263, + "step": 2470 + }, + { + "epoch": 0.25195570456161737, + "grad_norm": 3.046875, + "learning_rate": 4.2992576582792895e-05, + "loss": 0.1639, + "step": 2480 + }, + { + "epoch": 0.2529716549832368, + "grad_norm": 9.8125, + "learning_rate": 4.293651600762595e-05, + "loss": 0.2681, + "step": 2490 + }, + { + "epoch": 0.25398760540485626, + "grad_norm": 3.734375, + "learning_rate": 4.288026894395999e-05, + "loss": 0.2292, + "step": 2500 + }, + { + "epoch": 0.25500355582647566, + "grad_norm": 0.455078125, + "learning_rate": 4.2823835976603723e-05, + "loss": 0.2324, + "step": 2510 + }, + { + "epoch": 0.2560195062480951, + "grad_norm": 5.625, + "learning_rate": 4.276721769229869e-05, + "loss": 0.1834, + "step": 2520 + }, + { + "epoch": 0.2570354566697145, + "grad_norm": 1.3671875, + "learning_rate": 4.271041467971323e-05, + "loss": 0.1826, + "step": 2530 + }, + { + "epoch": 0.25805140709133395, + "grad_norm": 5.0625, + "learning_rate": 4.265342752943632e-05, + "loss": 0.2463, + "step": 2540 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 2.859375, + "learning_rate": 4.2596256833971425e-05, + "loss": 0.2598, + "step": 2550 + }, + { + "epoch": 0.2600833079345728, + "grad_norm": 1.8515625, + "learning_rate": 4.2538903187730374e-05, + "loss": 0.1148, + "step": 2560 + }, + { + "epoch": 0.26109925835619224, + "grad_norm": 2.71875, + "learning_rate": 4.248136718702716e-05, + "loss": 0.2123, + "step": 2570 + }, + { + "epoch": 0.26211520877781164, + "grad_norm": 4.5625, + "learning_rate": 4.242364943007172e-05, + "loss": 0.2369, + "step": 2580 + }, + { + "epoch": 0.2631311591994311, + "grad_norm": 2.296875, + "learning_rate": 4.236575051696377e-05, + "loss": 0.261, + "step": 2590 + }, + { + "epoch": 0.2641471096210505, + "grad_norm": 2.75, + "learning_rate": 4.2307671049686514e-05, + "loss": 0.1564, + "step": 2600 + }, + { + "epoch": 0.26516306004266993, + "grad_norm": 3.5, + "learning_rate": 4.2249411632100396e-05, + "loss": 0.1563, + "step": 2610 + }, + { + "epoch": 0.2661790104642893, + "grad_norm": 2.84375, + "learning_rate": 4.219097286993684e-05, + "loss": 0.1697, + "step": 2620 + }, + { + "epoch": 0.26719496088590877, + "grad_norm": 2.125, + "learning_rate": 4.2132355370791946e-05, + "loss": 0.1844, + "step": 2630 + }, + { + "epoch": 0.26821091130752817, + "grad_norm": 4.03125, + "learning_rate": 4.2073559744120156e-05, + "loss": 0.2144, + "step": 2640 + }, + { + "epoch": 0.2692268617291476, + "grad_norm": 2.375, + "learning_rate": 4.201458660122793e-05, + "loss": 0.2013, + "step": 2650 + }, + { + "epoch": 0.27024281215076706, + "grad_norm": 3.625, + "learning_rate": 4.1955436555267393e-05, + "loss": 0.2166, + "step": 2660 + }, + { + "epoch": 0.27125876257238646, + "grad_norm": 0.328125, + "learning_rate": 4.189611022122997e-05, + "loss": 0.1934, + "step": 2670 + }, + { + "epoch": 0.2722747129940059, + "grad_norm": 2.75, + "learning_rate": 4.1836608215939944e-05, + "loss": 0.2157, + "step": 2680 + }, + { + "epoch": 0.2732906634156253, + "grad_norm": 3.5, + "learning_rate": 4.17769311580481e-05, + "loss": 0.18, + "step": 2690 + }, + { + "epoch": 0.27430661383724475, + "grad_norm": 2.109375, + "learning_rate": 4.171707966802528e-05, + "loss": 0.2178, + "step": 2700 + }, + { + "epoch": 0.27532256425886414, + "grad_norm": 4.65625, + "learning_rate": 4.16570543681559e-05, + "loss": 0.1896, + "step": 2710 + }, + { + "epoch": 0.2763385146804836, + "grad_norm": 4.8125, + "learning_rate": 4.159685588253151e-05, + "loss": 0.1322, + "step": 2720 + }, + { + "epoch": 0.27735446510210304, + "grad_norm": 3.9375, + "learning_rate": 4.153648483704429e-05, + "loss": 0.184, + "step": 2730 + }, + { + "epoch": 0.27837041552372244, + "grad_norm": 4.53125, + "learning_rate": 4.147594185938057e-05, + "loss": 0.2451, + "step": 2740 + }, + { + "epoch": 0.2793863659453419, + "grad_norm": 1.0390625, + "learning_rate": 4.141522757901426e-05, + "loss": 0.2367, + "step": 2750 + }, + { + "epoch": 0.2804023163669613, + "grad_norm": 3.375, + "learning_rate": 4.1354342627200345e-05, + "loss": 0.179, + "step": 2760 + }, + { + "epoch": 0.28141826678858073, + "grad_norm": 2.953125, + "learning_rate": 4.1293287636968286e-05, + "loss": 0.1396, + "step": 2770 + }, + { + "epoch": 0.2824342172102001, + "grad_norm": 2.546875, + "learning_rate": 4.1232063243115485e-05, + "loss": 0.1963, + "step": 2780 + }, + { + "epoch": 0.28345016763181957, + "grad_norm": 5.09375, + "learning_rate": 4.117067008220063e-05, + "loss": 0.2457, + "step": 2790 + }, + { + "epoch": 0.28446611805343897, + "grad_norm": 2.046875, + "learning_rate": 4.110910879253712e-05, + "loss": 0.2262, + "step": 2800 + }, + { + "epoch": 0.2854820684750584, + "grad_norm": 2.1875, + "learning_rate": 4.104738001418641e-05, + "loss": 0.2499, + "step": 2810 + }, + { + "epoch": 0.28649801889667786, + "grad_norm": 2.59375, + "learning_rate": 4.098548438895135e-05, + "loss": 0.1667, + "step": 2820 + }, + { + "epoch": 0.28751396931829726, + "grad_norm": 2.875, + "learning_rate": 4.092342256036954e-05, + "loss": 0.2288, + "step": 2830 + }, + { + "epoch": 0.2885299197399167, + "grad_norm": 3.015625, + "learning_rate": 4.086119517370659e-05, + "loss": 0.2038, + "step": 2840 + }, + { + "epoch": 0.2895458701615361, + "grad_norm": 3.53125, + "learning_rate": 4.0798802875949485e-05, + "loss": 0.181, + "step": 2850 + }, + { + "epoch": 0.29056182058315555, + "grad_norm": 2.296875, + "learning_rate": 4.073624631579975e-05, + "loss": 0.1886, + "step": 2860 + }, + { + "epoch": 0.29157777100477494, + "grad_norm": 3.609375, + "learning_rate": 4.067352614366685e-05, + "loss": 0.2053, + "step": 2870 + }, + { + "epoch": 0.2925937214263944, + "grad_norm": 2.328125, + "learning_rate": 4.061064301166128e-05, + "loss": 0.1409, + "step": 2880 + }, + { + "epoch": 0.29360967184801384, + "grad_norm": 4.9375, + "learning_rate": 4.054759757358787e-05, + "loss": 0.184, + "step": 2890 + }, + { + "epoch": 0.29462562226963324, + "grad_norm": 4.6875, + "learning_rate": 4.048439048493898e-05, + "loss": 0.2306, + "step": 2900 + }, + { + "epoch": 0.2956415726912527, + "grad_norm": 4.09375, + "learning_rate": 4.0421022402887676e-05, + "loss": 0.1914, + "step": 2910 + }, + { + "epoch": 0.2966575231128721, + "grad_norm": 2.3125, + "learning_rate": 4.035749398628088e-05, + "loss": 0.1653, + "step": 2920 + }, + { + "epoch": 0.29767347353449153, + "grad_norm": 2.515625, + "learning_rate": 4.029380589563256e-05, + "loss": 0.1941, + "step": 2930 + }, + { + "epoch": 0.2986894239561109, + "grad_norm": 1.78125, + "learning_rate": 4.02299587931168e-05, + "loss": 0.1117, + "step": 2940 + }, + { + "epoch": 0.29970537437773037, + "grad_norm": 0.8359375, + "learning_rate": 4.0165953342560974e-05, + "loss": 0.1605, + "step": 2950 + }, + { + "epoch": 0.30072132479934977, + "grad_norm": 3.046875, + "learning_rate": 4.010179020943884e-05, + "loss": 0.1726, + "step": 2960 + }, + { + "epoch": 0.3017372752209692, + "grad_norm": 3.453125, + "learning_rate": 4.003747006086357e-05, + "loss": 0.2208, + "step": 2970 + }, + { + "epoch": 0.30275322564258866, + "grad_norm": 2.515625, + "learning_rate": 3.9972993565580866e-05, + "loss": 0.1325, + "step": 2980 + }, + { + "epoch": 0.30376917606420806, + "grad_norm": 3.046875, + "learning_rate": 3.9908361393962e-05, + "loss": 0.2014, + "step": 2990 + }, + { + "epoch": 0.3047851264858275, + "grad_norm": 2.28125, + "learning_rate": 3.984357421799681e-05, + "loss": 0.165, + "step": 3000 + }, + { + "epoch": 0.3058010769074469, + "grad_norm": 5.09375, + "learning_rate": 3.9778632711286756e-05, + "loss": 0.212, + "step": 3010 + }, + { + "epoch": 0.30681702732906635, + "grad_norm": 4.25, + "learning_rate": 3.971353754903788e-05, + "loss": 0.2388, + "step": 3020 + }, + { + "epoch": 0.30783297775068574, + "grad_norm": 2.34375, + "learning_rate": 3.964828940805381e-05, + "loss": 0.2175, + "step": 3030 + }, + { + "epoch": 0.3088489281723052, + "grad_norm": 4.09375, + "learning_rate": 3.95828889667287e-05, + "loss": 0.2088, + "step": 3040 + }, + { + "epoch": 0.30986487859392464, + "grad_norm": 2.359375, + "learning_rate": 3.9517336905040244e-05, + "loss": 0.1913, + "step": 3050 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 1.1640625, + "learning_rate": 3.9451633904542483e-05, + "loss": 0.2185, + "step": 3060 + }, + { + "epoch": 0.3118967794371635, + "grad_norm": 2.59375, + "learning_rate": 3.9385780648358846e-05, + "loss": 0.2072, + "step": 3070 + }, + { + "epoch": 0.3129127298587829, + "grad_norm": 3.015625, + "learning_rate": 3.9319777821174955e-05, + "loss": 0.1902, + "step": 3080 + }, + { + "epoch": 0.31392868028040233, + "grad_norm": 2.375, + "learning_rate": 3.925362610923158e-05, + "loss": 0.259, + "step": 3090 + }, + { + "epoch": 0.3149446307020217, + "grad_norm": 4.65625, + "learning_rate": 3.918732620031742e-05, + "loss": 0.2026, + "step": 3100 + }, + { + "epoch": 0.31596058112364117, + "grad_norm": 2.1875, + "learning_rate": 3.912087878376205e-05, + "loss": 0.1478, + "step": 3110 + }, + { + "epoch": 0.31697653154526056, + "grad_norm": 2.34375, + "learning_rate": 3.905428455042865e-05, + "loss": 0.167, + "step": 3120 + }, + { + "epoch": 0.31799248196688, + "grad_norm": 2.390625, + "learning_rate": 3.898754419270693e-05, + "loss": 0.1629, + "step": 3130 + }, + { + "epoch": 0.31900843238849946, + "grad_norm": 1.546875, + "learning_rate": 3.892065840450583e-05, + "loss": 0.1308, + "step": 3140 + }, + { + "epoch": 0.32002438281011886, + "grad_norm": 4.625, + "learning_rate": 3.885362788124637e-05, + "loss": 0.2008, + "step": 3150 + }, + { + "epoch": 0.3210403332317383, + "grad_norm": 3.8125, + "learning_rate": 3.8786453319854396e-05, + "loss": 0.2225, + "step": 3160 + }, + { + "epoch": 0.3220562836533577, + "grad_norm": 3.015625, + "learning_rate": 3.8719135418753366e-05, + "loss": 0.2243, + "step": 3170 + }, + { + "epoch": 0.32307223407497715, + "grad_norm": 5.6875, + "learning_rate": 3.865167487785702e-05, + "loss": 0.1981, + "step": 3180 + }, + { + "epoch": 0.32408818449659654, + "grad_norm": 4.84375, + "learning_rate": 3.8584072398562164e-05, + "loss": 0.2031, + "step": 3190 + }, + { + "epoch": 0.325104134918216, + "grad_norm": 4.0625, + "learning_rate": 3.851632868374136e-05, + "loss": 0.1621, + "step": 3200 + }, + { + "epoch": 0.32612008533983544, + "grad_norm": 3.421875, + "learning_rate": 3.844844443773562e-05, + "loss": 0.1674, + "step": 3210 + }, + { + "epoch": 0.32713603576145484, + "grad_norm": 1.3671875, + "learning_rate": 3.8380420366347046e-05, + "loss": 0.1502, + "step": 3220 + }, + { + "epoch": 0.3281519861830743, + "grad_norm": 3.734375, + "learning_rate": 3.831225717683157e-05, + "loss": 0.1868, + "step": 3230 + }, + { + "epoch": 0.3291679366046937, + "grad_norm": 2.703125, + "learning_rate": 3.8243955577891534e-05, + "loss": 0.1818, + "step": 3240 + }, + { + "epoch": 0.3301838870263131, + "grad_norm": 3.796875, + "learning_rate": 3.8175516279668335e-05, + "loss": 0.2215, + "step": 3250 + }, + { + "epoch": 0.3311998374479325, + "grad_norm": 3.203125, + "learning_rate": 3.810693999373505e-05, + "loss": 0.2544, + "step": 3260 + }, + { + "epoch": 0.33221578786955197, + "grad_norm": 4.0, + "learning_rate": 3.8038227433089056e-05, + "loss": 0.1175, + "step": 3270 + }, + { + "epoch": 0.33323173829117136, + "grad_norm": 3.625, + "learning_rate": 3.796937931214458e-05, + "loss": 0.2213, + "step": 3280 + }, + { + "epoch": 0.3342476887127908, + "grad_norm": 1.7265625, + "learning_rate": 3.7900396346725296e-05, + "loss": 0.1711, + "step": 3290 + }, + { + "epoch": 0.33526363913441026, + "grad_norm": 3.140625, + "learning_rate": 3.783127925405686e-05, + "loss": 0.2628, + "step": 3300 + }, + { + "epoch": 0.33627958955602966, + "grad_norm": 2.1875, + "learning_rate": 3.77620287527595e-05, + "loss": 0.1671, + "step": 3310 + }, + { + "epoch": 0.3372955399776491, + "grad_norm": 5.28125, + "learning_rate": 3.769264556284048e-05, + "loss": 0.2109, + "step": 3320 + }, + { + "epoch": 0.3383114903992685, + "grad_norm": 2.875, + "learning_rate": 3.762313040568665e-05, + "loss": 0.1978, + "step": 3330 + }, + { + "epoch": 0.33932744082088795, + "grad_norm": 2.234375, + "learning_rate": 3.755348400405697e-05, + "loss": 0.1275, + "step": 3340 + }, + { + "epoch": 0.34034339124250734, + "grad_norm": 1.9453125, + "learning_rate": 3.7483707082074945e-05, + "loss": 0.1482, + "step": 3350 + }, + { + "epoch": 0.3413593416641268, + "grad_norm": 5.40625, + "learning_rate": 3.741380036522111e-05, + "loss": 0.1933, + "step": 3360 + }, + { + "epoch": 0.34237529208574624, + "grad_norm": 4.53125, + "learning_rate": 3.734376458032551e-05, + "loss": 0.1925, + "step": 3370 + }, + { + "epoch": 0.34339124250736563, + "grad_norm": 4.0625, + "learning_rate": 3.727360045556014e-05, + "loss": 0.2297, + "step": 3380 + }, + { + "epoch": 0.3444071929289851, + "grad_norm": 2.53125, + "learning_rate": 3.7203308720431336e-05, + "loss": 0.1704, + "step": 3390 + }, + { + "epoch": 0.3454231433506045, + "grad_norm": 1.859375, + "learning_rate": 3.7132890105772234e-05, + "loss": 0.258, + "step": 3400 + }, + { + "epoch": 0.3464390937722239, + "grad_norm": 3.90625, + "learning_rate": 3.706234534373515e-05, + "loss": 0.2376, + "step": 3410 + }, + { + "epoch": 0.3474550441938433, + "grad_norm": 1.1015625, + "learning_rate": 3.6991675167783985e-05, + "loss": 0.2403, + "step": 3420 + }, + { + "epoch": 0.34847099461546277, + "grad_norm": 1.1640625, + "learning_rate": 3.6920880312686556e-05, + "loss": 0.1642, + "step": 3430 + }, + { + "epoch": 0.34948694503708216, + "grad_norm": 2.875, + "learning_rate": 3.684996151450702e-05, + "loss": 0.1455, + "step": 3440 + }, + { + "epoch": 0.3505028954587016, + "grad_norm": 0.59765625, + "learning_rate": 3.6778919510598155e-05, + "loss": 0.2175, + "step": 3450 + }, + { + "epoch": 0.35151884588032106, + "grad_norm": 0.93359375, + "learning_rate": 3.670775503959376e-05, + "loss": 0.1858, + "step": 3460 + }, + { + "epoch": 0.35253479630194046, + "grad_norm": 4.1875, + "learning_rate": 3.6636468841400917e-05, + "loss": 0.1911, + "step": 3470 + }, + { + "epoch": 0.3535507467235599, + "grad_norm": 3.734375, + "learning_rate": 3.656506165719233e-05, + "loss": 0.2114, + "step": 3480 + }, + { + "epoch": 0.3545666971451793, + "grad_norm": 1.171875, + "learning_rate": 3.649353422939863e-05, + "loss": 0.1841, + "step": 3490 + }, + { + "epoch": 0.35558264756679875, + "grad_norm": 2.53125, + "learning_rate": 3.6421887301700615e-05, + "loss": 0.1505, + "step": 3500 + }, + { + "epoch": 0.35659859798841814, + "grad_norm": 4.9375, + "learning_rate": 3.6350121619021524e-05, + "loss": 0.2625, + "step": 3510 + }, + { + "epoch": 0.3576145484100376, + "grad_norm": 5.25, + "learning_rate": 3.627823792751936e-05, + "loss": 0.1676, + "step": 3520 + }, + { + "epoch": 0.35863049883165704, + "grad_norm": 1.09375, + "learning_rate": 3.620623697457905e-05, + "loss": 0.1963, + "step": 3530 + }, + { + "epoch": 0.35964644925327643, + "grad_norm": 4.03125, + "learning_rate": 3.613411950880468e-05, + "loss": 0.2048, + "step": 3540 + }, + { + "epoch": 0.3606623996748959, + "grad_norm": 4.40625, + "learning_rate": 3.606188628001178e-05, + "loss": 0.226, + "step": 3550 + }, + { + "epoch": 0.3616783500965153, + "grad_norm": 2.375, + "learning_rate": 3.598953803921947e-05, + "loss": 0.1884, + "step": 3560 + }, + { + "epoch": 0.3626943005181347, + "grad_norm": 3.21875, + "learning_rate": 3.591707553864266e-05, + "loss": 0.224, + "step": 3570 + }, + { + "epoch": 0.3637102509397541, + "grad_norm": 3.5625, + "learning_rate": 3.584449953168423e-05, + "loss": 0.1866, + "step": 3580 + }, + { + "epoch": 0.36472620136137357, + "grad_norm": 2.359375, + "learning_rate": 3.577181077292722e-05, + "loss": 0.1663, + "step": 3590 + }, + { + "epoch": 0.36574215178299296, + "grad_norm": 5.0, + "learning_rate": 3.569901001812696e-05, + "loss": 0.2032, + "step": 3600 + }, + { + "epoch": 0.3667581022046124, + "grad_norm": 1.953125, + "learning_rate": 3.562609802420321e-05, + "loss": 0.2395, + "step": 3610 + }, + { + "epoch": 0.36777405262623186, + "grad_norm": 3.796875, + "learning_rate": 3.555307554923229e-05, + "loss": 0.1799, + "step": 3620 + }, + { + "epoch": 0.36879000304785126, + "grad_norm": 4.4375, + "learning_rate": 3.547994335243925e-05, + "loss": 0.1771, + "step": 3630 + }, + { + "epoch": 0.3698059534694707, + "grad_norm": 1.890625, + "learning_rate": 3.540670219418989e-05, + "loss": 0.2123, + "step": 3640 + }, + { + "epoch": 0.3708219038910901, + "grad_norm": 4.03125, + "learning_rate": 3.53333528359829e-05, + "loss": 0.2159, + "step": 3650 + }, + { + "epoch": 0.37183785431270955, + "grad_norm": 3.265625, + "learning_rate": 3.525989604044198e-05, + "loss": 0.2749, + "step": 3660 + }, + { + "epoch": 0.37285380473432894, + "grad_norm": 1.4375, + "learning_rate": 3.5186332571307826e-05, + "loss": 0.1613, + "step": 3670 + }, + { + "epoch": 0.3738697551559484, + "grad_norm": 3.984375, + "learning_rate": 3.511266319343025e-05, + "loss": 0.1877, + "step": 3680 + }, + { + "epoch": 0.37488570557756784, + "grad_norm": 2.203125, + "learning_rate": 3.503888867276022e-05, + "loss": 0.2185, + "step": 3690 + }, + { + "epoch": 0.37590165599918723, + "grad_norm": 1.5078125, + "learning_rate": 3.4965009776341894e-05, + "loss": 0.2195, + "step": 3700 + }, + { + "epoch": 0.3769176064208067, + "grad_norm": 4.375, + "learning_rate": 3.489102727230461e-05, + "loss": 0.2344, + "step": 3710 + }, + { + "epoch": 0.3779335568424261, + "grad_norm": 2.984375, + "learning_rate": 3.481694192985496e-05, + "loss": 0.1863, + "step": 3720 + }, + { + "epoch": 0.3789495072640455, + "grad_norm": 1.1328125, + "learning_rate": 3.474275451926875e-05, + "loss": 0.1894, + "step": 3730 + }, + { + "epoch": 0.3799654576856649, + "grad_norm": 2.265625, + "learning_rate": 3.4668465811883e-05, + "loss": 0.2127, + "step": 3740 + }, + { + "epoch": 0.38098140810728437, + "grad_norm": 2.921875, + "learning_rate": 3.4594076580087914e-05, + "loss": 0.2125, + "step": 3750 + }, + { + "epoch": 0.38199735852890376, + "grad_norm": 2.390625, + "learning_rate": 3.451958759731889e-05, + "loss": 0.1801, + "step": 3760 + }, + { + "epoch": 0.3830133089505232, + "grad_norm": 3.046875, + "learning_rate": 3.4444999638048456e-05, + "loss": 0.1949, + "step": 3770 + }, + { + "epoch": 0.38402925937214266, + "grad_norm": 2.890625, + "learning_rate": 3.437031347777817e-05, + "loss": 0.2719, + "step": 3780 + }, + { + "epoch": 0.38504520979376206, + "grad_norm": 3.9375, + "learning_rate": 3.4295529893030634e-05, + "loss": 0.1697, + "step": 3790 + }, + { + "epoch": 0.3860611602153815, + "grad_norm": 2.0625, + "learning_rate": 3.422064966134138e-05, + "loss": 0.1557, + "step": 3800 + }, + { + "epoch": 0.3870771106370009, + "grad_norm": 2.234375, + "learning_rate": 3.4145673561250794e-05, + "loss": 0.2129, + "step": 3810 + }, + { + "epoch": 0.38809306105862035, + "grad_norm": 4.96875, + "learning_rate": 3.4070602372296e-05, + "loss": 0.2068, + "step": 3820 + }, + { + "epoch": 0.38910901148023974, + "grad_norm": 2.234375, + "learning_rate": 3.39954368750028e-05, + "loss": 0.1634, + "step": 3830 + }, + { + "epoch": 0.3901249619018592, + "grad_norm": 1.75, + "learning_rate": 3.392017785087752e-05, + "loss": 0.2299, + "step": 3840 + }, + { + "epoch": 0.39114091232347864, + "grad_norm": 3.90625, + "learning_rate": 3.38448260823989e-05, + "loss": 0.1585, + "step": 3850 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 2.8125, + "learning_rate": 3.376938235300996e-05, + "loss": 0.2382, + "step": 3860 + }, + { + "epoch": 0.3931728131667175, + "grad_norm": 5.375, + "learning_rate": 3.369384744710984e-05, + "loss": 0.1987, + "step": 3870 + }, + { + "epoch": 0.3941887635883369, + "grad_norm": 2.578125, + "learning_rate": 3.361822215004566e-05, + "loss": 0.2316, + "step": 3880 + }, + { + "epoch": 0.3952047140099563, + "grad_norm": 2.0, + "learning_rate": 3.354250724810436e-05, + "loss": 0.2019, + "step": 3890 + }, + { + "epoch": 0.3962206644315757, + "grad_norm": 2.3125, + "learning_rate": 3.34667035285045e-05, + "loss": 0.187, + "step": 3900 + }, + { + "epoch": 0.39723661485319517, + "grad_norm": 3.53125, + "learning_rate": 3.339081177938811e-05, + "loss": 0.2353, + "step": 3910 + }, + { + "epoch": 0.39825256527481456, + "grad_norm": 1.9609375, + "learning_rate": 3.331483278981244e-05, + "loss": 0.2078, + "step": 3920 + }, + { + "epoch": 0.399268515696434, + "grad_norm": 1.2109375, + "learning_rate": 3.323876734974183e-05, + "loss": 0.1761, + "step": 3930 + }, + { + "epoch": 0.40028446611805346, + "grad_norm": 4.0625, + "learning_rate": 3.316261625003943e-05, + "loss": 0.2081, + "step": 3940 + }, + { + "epoch": 0.40130041653967286, + "grad_norm": 1.953125, + "learning_rate": 3.308638028245902e-05, + "loss": 0.2087, + "step": 3950 + }, + { + "epoch": 0.4023163669612923, + "grad_norm": 2.390625, + "learning_rate": 3.301006023963676e-05, + "loss": 0.1579, + "step": 3960 + }, + { + "epoch": 0.4033323173829117, + "grad_norm": 3.53125, + "learning_rate": 3.293365691508295e-05, + "loss": 0.1904, + "step": 3970 + }, + { + "epoch": 0.40434826780453115, + "grad_norm": 3.0, + "learning_rate": 3.285717110317379e-05, + "loss": 0.1991, + "step": 3980 + }, + { + "epoch": 0.40536421822615054, + "grad_norm": 7.21875, + "learning_rate": 3.27806035991431e-05, + "loss": 0.1445, + "step": 3990 + }, + { + "epoch": 0.40638016864777, + "grad_norm": 1.0859375, + "learning_rate": 3.2703955199074075e-05, + "loss": 0.2393, + "step": 4000 + }, + { + "epoch": 0.40739611906938944, + "grad_norm": 4.5625, + "learning_rate": 3.262722669989098e-05, + "loss": 0.1789, + "step": 4010 + }, + { + "epoch": 0.40841206949100883, + "grad_norm": 3.09375, + "learning_rate": 3.255041889935092e-05, + "loss": 0.1511, + "step": 4020 + }, + { + "epoch": 0.4094280199126283, + "grad_norm": 1.90625, + "learning_rate": 3.247353259603547e-05, + "loss": 0.2066, + "step": 4030 + }, + { + "epoch": 0.4104439703342477, + "grad_norm": 2.28125, + "learning_rate": 3.239656858934242e-05, + "loss": 0.1564, + "step": 4040 + }, + { + "epoch": 0.4114599207558671, + "grad_norm": 2.609375, + "learning_rate": 3.231952767947746e-05, + "loss": 0.1503, + "step": 4050 + }, + { + "epoch": 0.4124758711774865, + "grad_norm": 1.4453125, + "learning_rate": 3.2242410667445844e-05, + "loss": 0.1633, + "step": 4060 + }, + { + "epoch": 0.41349182159910597, + "grad_norm": 3.015625, + "learning_rate": 3.2165218355044076e-05, + "loss": 0.1492, + "step": 4070 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 3.234375, + "learning_rate": 3.2087951544851566e-05, + "loss": 0.3051, + "step": 4080 + }, + { + "epoch": 0.4155237224423448, + "grad_norm": 2.9375, + "learning_rate": 3.20106110402223e-05, + "loss": 0.2229, + "step": 4090 + }, + { + "epoch": 0.41653967286396426, + "grad_norm": 3.171875, + "learning_rate": 3.1933197645276455e-05, + "loss": 0.2224, + "step": 4100 + }, + { + "epoch": 0.41755562328558365, + "grad_norm": 2.09375, + "learning_rate": 3.185571216489209e-05, + "loss": 0.1297, + "step": 4110 + }, + { + "epoch": 0.4185715737072031, + "grad_norm": 3.625, + "learning_rate": 3.177815540469669e-05, + "loss": 0.2074, + "step": 4120 + }, + { + "epoch": 0.4195875241288225, + "grad_norm": 2.296875, + "learning_rate": 3.1700528171058916e-05, + "loss": 0.1949, + "step": 4130 + }, + { + "epoch": 0.42060347455044195, + "grad_norm": 3.8125, + "learning_rate": 3.162283127108011e-05, + "loss": 0.1661, + "step": 4140 + }, + { + "epoch": 0.42161942497206134, + "grad_norm": 2.5, + "learning_rate": 3.154506551258594e-05, + "loss": 0.2275, + "step": 4150 + }, + { + "epoch": 0.4226353753936808, + "grad_norm": 2.96875, + "learning_rate": 3.146723170411804e-05, + "loss": 0.2242, + "step": 4160 + }, + { + "epoch": 0.42365132581530024, + "grad_norm": 6.625, + "learning_rate": 3.138933065492552e-05, + "loss": 0.1897, + "step": 4170 + }, + { + "epoch": 0.42466727623691963, + "grad_norm": 0.8515625, + "learning_rate": 3.131136317495665e-05, + "loss": 0.1629, + "step": 4180 + }, + { + "epoch": 0.4256832266585391, + "grad_norm": 0.94140625, + "learning_rate": 3.1233330074850364e-05, + "loss": 0.1535, + "step": 4190 + }, + { + "epoch": 0.4266991770801585, + "grad_norm": 2.6875, + "learning_rate": 3.115523216592786e-05, + "loss": 0.2494, + "step": 4200 + }, + { + "epoch": 0.4277151275017779, + "grad_norm": 2.578125, + "learning_rate": 3.107707026018417e-05, + "loss": 0.1705, + "step": 4210 + }, + { + "epoch": 0.4287310779233973, + "grad_norm": 3.0625, + "learning_rate": 3.09988451702797e-05, + "loss": 0.1507, + "step": 4220 + }, + { + "epoch": 0.42974702834501677, + "grad_norm": 2.421875, + "learning_rate": 3.0920557709531804e-05, + "loss": 0.3071, + "step": 4230 + }, + { + "epoch": 0.43076297876663616, + "grad_norm": 3.640625, + "learning_rate": 3.0842208691906306e-05, + "loss": 0.199, + "step": 4240 + }, + { + "epoch": 0.4317789291882556, + "grad_norm": 3.5625, + "learning_rate": 3.076379893200904e-05, + "loss": 0.1987, + "step": 4250 + }, + { + "epoch": 0.43279487960987506, + "grad_norm": 3.65625, + "learning_rate": 3.068532924507739e-05, + "loss": 0.1945, + "step": 4260 + }, + { + "epoch": 0.43381083003149445, + "grad_norm": 5.875, + "learning_rate": 3.060680044697183e-05, + "loss": 0.1937, + "step": 4270 + }, + { + "epoch": 0.4348267804531139, + "grad_norm": 2.859375, + "learning_rate": 3.052821335416739e-05, + "loss": 0.1643, + "step": 4280 + }, + { + "epoch": 0.4358427308747333, + "grad_norm": 3.296875, + "learning_rate": 3.0449568783745203e-05, + "loss": 0.1455, + "step": 4290 + }, + { + "epoch": 0.43685868129635275, + "grad_norm": 0.427734375, + "learning_rate": 3.0370867553384023e-05, + "loss": 0.1891, + "step": 4300 + }, + { + "epoch": 0.43787463171797214, + "grad_norm": 0.361328125, + "learning_rate": 3.029211048135171e-05, + "loss": 0.1377, + "step": 4310 + }, + { + "epoch": 0.4388905821395916, + "grad_norm": 1.8203125, + "learning_rate": 3.021329838649668e-05, + "loss": 0.2194, + "step": 4320 + }, + { + "epoch": 0.43990653256121104, + "grad_norm": 1.8828125, + "learning_rate": 3.0134432088239462e-05, + "loss": 0.1915, + "step": 4330 + }, + { + "epoch": 0.44092248298283043, + "grad_norm": 2.015625, + "learning_rate": 3.0055512406564146e-05, + "loss": 0.1794, + "step": 4340 + }, + { + "epoch": 0.4419384334044499, + "grad_norm": 2.546875, + "learning_rate": 2.9976540162009836e-05, + "loss": 0.2154, + "step": 4350 + }, + { + "epoch": 0.4429543838260693, + "grad_norm": 4.09375, + "learning_rate": 2.9897516175662155e-05, + "loss": 0.1861, + "step": 4360 + }, + { + "epoch": 0.4439703342476887, + "grad_norm": 3.953125, + "learning_rate": 2.9818441269144693e-05, + "loss": 0.1857, + "step": 4370 + }, + { + "epoch": 0.4449862846693081, + "grad_norm": 2.234375, + "learning_rate": 2.9739316264610452e-05, + "loss": 0.1493, + "step": 4380 + }, + { + "epoch": 0.44600223509092757, + "grad_norm": 1.109375, + "learning_rate": 2.966014198473332e-05, + "loss": 0.186, + "step": 4390 + }, + { + "epoch": 0.44701818551254696, + "grad_norm": 4.5625, + "learning_rate": 2.9580919252699502e-05, + "loss": 0.1963, + "step": 4400 + }, + { + "epoch": 0.4480341359341664, + "grad_norm": 7.3125, + "learning_rate": 2.9501648892198984e-05, + "loss": 0.2882, + "step": 4410 + }, + { + "epoch": 0.44905008635578586, + "grad_norm": 3.03125, + "learning_rate": 2.942233172741693e-05, + "loss": 0.2154, + "step": 4420 + }, + { + "epoch": 0.45006603677740525, + "grad_norm": 2.421875, + "learning_rate": 2.934296858302515e-05, + "loss": 0.2228, + "step": 4430 + }, + { + "epoch": 0.4510819871990247, + "grad_norm": 1.6015625, + "learning_rate": 2.9263560284173485e-05, + "loss": 0.1637, + "step": 4440 + }, + { + "epoch": 0.4520979376206441, + "grad_norm": 4.5, + "learning_rate": 2.91841076564813e-05, + "loss": 0.1396, + "step": 4450 + }, + { + "epoch": 0.45311388804226355, + "grad_norm": 1.9609375, + "learning_rate": 2.9104611526028808e-05, + "loss": 0.186, + "step": 4460 + }, + { + "epoch": 0.45412983846388294, + "grad_norm": 2.046875, + "learning_rate": 2.902507271934855e-05, + "loss": 0.1706, + "step": 4470 + }, + { + "epoch": 0.4551457888855024, + "grad_norm": 2.390625, + "learning_rate": 2.8945492063416768e-05, + "loss": 0.2191, + "step": 4480 + }, + { + "epoch": 0.45616173930712184, + "grad_norm": 2.734375, + "learning_rate": 2.8865870385644823e-05, + "loss": 0.1651, + "step": 4490 + }, + { + "epoch": 0.45717768972874123, + "grad_norm": 4.4375, + "learning_rate": 2.8786208513870583e-05, + "loss": 0.1907, + "step": 4500 + }, + { + "epoch": 0.4581936401503607, + "grad_norm": 1.9609375, + "learning_rate": 2.8706507276349815e-05, + "loss": 0.2256, + "step": 4510 + }, + { + "epoch": 0.4592095905719801, + "grad_norm": 3.375, + "learning_rate": 2.8626767501747588e-05, + "loss": 0.215, + "step": 4520 + }, + { + "epoch": 0.4602255409935995, + "grad_norm": 2.296875, + "learning_rate": 2.854699001912964e-05, + "loss": 0.2241, + "step": 4530 + }, + { + "epoch": 0.4612414914152189, + "grad_norm": 2.078125, + "learning_rate": 2.846717565795376e-05, + "loss": 0.1541, + "step": 4540 + }, + { + "epoch": 0.46225744183683837, + "grad_norm": 0.81640625, + "learning_rate": 2.8387325248061164e-05, + "loss": 0.1718, + "step": 4550 + }, + { + "epoch": 0.46327339225845776, + "grad_norm": 5.6875, + "learning_rate": 2.8307439619667897e-05, + "loss": 0.259, + "step": 4560 + }, + { + "epoch": 0.4642893426800772, + "grad_norm": 1.78125, + "learning_rate": 2.8227519603356157e-05, + "loss": 0.2205, + "step": 4570 + }, + { + "epoch": 0.46530529310169666, + "grad_norm": 4.78125, + "learning_rate": 2.8147566030065677e-05, + "loss": 0.2256, + "step": 4580 + }, + { + "epoch": 0.46632124352331605, + "grad_norm": 3.296875, + "learning_rate": 2.8067579731085085e-05, + "loss": 0.1671, + "step": 4590 + }, + { + "epoch": 0.4673371939449355, + "grad_norm": 3.265625, + "learning_rate": 2.7987561538043273e-05, + "loss": 0.2471, + "step": 4600 + }, + { + "epoch": 0.4683531443665549, + "grad_norm": 3.390625, + "learning_rate": 2.7907512282900727e-05, + "loss": 0.1749, + "step": 4610 + }, + { + "epoch": 0.46936909478817435, + "grad_norm": 3.140625, + "learning_rate": 2.782743279794091e-05, + "loss": 0.2276, + "step": 4620 + }, + { + "epoch": 0.47038504520979374, + "grad_norm": 2.921875, + "learning_rate": 2.7747323915761574e-05, + "loss": 0.1971, + "step": 4630 + }, + { + "epoch": 0.4714009956314132, + "grad_norm": 4.15625, + "learning_rate": 2.7667186469266122e-05, + "loss": 0.1951, + "step": 4640 + }, + { + "epoch": 0.47241694605303264, + "grad_norm": 2.953125, + "learning_rate": 2.7587021291654924e-05, + "loss": 0.2045, + "step": 4650 + }, + { + "epoch": 0.47343289647465203, + "grad_norm": 1.6640625, + "learning_rate": 2.750682921641672e-05, + "loss": 0.155, + "step": 4660 + }, + { + "epoch": 0.4744488468962715, + "grad_norm": 4.375, + "learning_rate": 2.7426611077319864e-05, + "loss": 0.2038, + "step": 4670 + }, + { + "epoch": 0.4754647973178909, + "grad_norm": 5.5, + "learning_rate": 2.734636770840372e-05, + "loss": 0.159, + "step": 4680 + }, + { + "epoch": 0.4764807477395103, + "grad_norm": 1.703125, + "learning_rate": 2.7266099943969976e-05, + "loss": 0.1566, + "step": 4690 + }, + { + "epoch": 0.4774966981611297, + "grad_norm": 0.81640625, + "learning_rate": 2.7185808618573943e-05, + "loss": 0.1927, + "step": 4700 + }, + { + "epoch": 0.47851264858274917, + "grad_norm": 0.81640625, + "learning_rate": 2.710549456701592e-05, + "loss": 0.1873, + "step": 4710 + }, + { + "epoch": 0.47952859900436856, + "grad_norm": 3.828125, + "learning_rate": 2.702515862433247e-05, + "loss": 0.2474, + "step": 4720 + }, + { + "epoch": 0.480544549425988, + "grad_norm": 1.1640625, + "learning_rate": 2.6944801625787795e-05, + "loss": 0.204, + "step": 4730 + }, + { + "epoch": 0.48156049984760746, + "grad_norm": 2.953125, + "learning_rate": 2.6864424406864984e-05, + "loss": 0.1758, + "step": 4740 + }, + { + "epoch": 0.48257645026922685, + "grad_norm": 3.265625, + "learning_rate": 2.6784027803257377e-05, + "loss": 0.161, + "step": 4750 + }, + { + "epoch": 0.4835924006908463, + "grad_norm": 2.046875, + "learning_rate": 2.6703612650859848e-05, + "loss": 0.1469, + "step": 4760 + }, + { + "epoch": 0.4846083511124657, + "grad_norm": 4.03125, + "learning_rate": 2.6623179785760148e-05, + "loss": 0.1858, + "step": 4770 + }, + { + "epoch": 0.48562430153408515, + "grad_norm": 2.65625, + "learning_rate": 2.6542730044230175e-05, + "loss": 0.176, + "step": 4780 + }, + { + "epoch": 0.48664025195570454, + "grad_norm": 2.59375, + "learning_rate": 2.6462264262717278e-05, + "loss": 0.1657, + "step": 4790 + }, + { + "epoch": 0.487656202377324, + "grad_norm": 4.78125, + "learning_rate": 2.6381783277835605e-05, + "loss": 0.2705, + "step": 4800 + }, + { + "epoch": 0.48867215279894344, + "grad_norm": 3.65625, + "learning_rate": 2.6301287926357355e-05, + "loss": 0.2252, + "step": 4810 + }, + { + "epoch": 0.48968810322056283, + "grad_norm": 0.734375, + "learning_rate": 2.622077904520411e-05, + "loss": 0.2141, + "step": 4820 + }, + { + "epoch": 0.4907040536421823, + "grad_norm": 5.15625, + "learning_rate": 2.6140257471438108e-05, + "loss": 0.1935, + "step": 4830 + }, + { + "epoch": 0.4917200040638017, + "grad_norm": 3.625, + "learning_rate": 2.6059724042253574e-05, + "loss": 0.2121, + "step": 4840 + }, + { + "epoch": 0.4927359544854211, + "grad_norm": 1.2890625, + "learning_rate": 2.5979179594967983e-05, + "loss": 0.1221, + "step": 4850 + }, + { + "epoch": 0.4937519049070405, + "grad_norm": 3.4375, + "learning_rate": 2.5898624967013367e-05, + "loss": 0.2208, + "step": 4860 + }, + { + "epoch": 0.49476785532865997, + "grad_norm": 2.40625, + "learning_rate": 2.5818060995927607e-05, + "loss": 0.1904, + "step": 4870 + }, + { + "epoch": 0.49578380575027936, + "grad_norm": 2.921875, + "learning_rate": 2.573748851934574e-05, + "loss": 0.1658, + "step": 4880 + }, + { + "epoch": 0.4967997561718988, + "grad_norm": 1.6640625, + "learning_rate": 2.5656908374991213e-05, + "loss": 0.1626, + "step": 4890 + }, + { + "epoch": 0.49781570659351826, + "grad_norm": 1.8046875, + "learning_rate": 2.557632140066721e-05, + "loss": 0.1905, + "step": 4900 + }, + { + "epoch": 0.49883165701513765, + "grad_norm": 4.875, + "learning_rate": 2.5495728434247917e-05, + "loss": 0.2591, + "step": 4910 + }, + { + "epoch": 0.4998476074367571, + "grad_norm": 1.4453125, + "learning_rate": 2.5415130313669845e-05, + "loss": 0.1359, + "step": 4920 + }, + { + "epoch": 0.5008635578583766, + "grad_norm": 2.109375, + "learning_rate": 2.5334527876923063e-05, + "loss": 0.2353, + "step": 4930 + }, + { + "epoch": 0.501879508279996, + "grad_norm": 3.546875, + "learning_rate": 2.5253921962042525e-05, + "loss": 0.2173, + "step": 4940 + }, + { + "epoch": 0.5028954587016153, + "grad_norm": 1.8125, + "learning_rate": 2.5173313407099373e-05, + "loss": 0.1631, + "step": 4950 + }, + { + "epoch": 0.5039114091232347, + "grad_norm": 2.671875, + "learning_rate": 2.5092703050192163e-05, + "loss": 0.1884, + "step": 4960 + }, + { + "epoch": 0.5049273595448542, + "grad_norm": 2.5625, + "learning_rate": 2.501209172943819e-05, + "loss": 0.217, + "step": 4970 + }, + { + "epoch": 0.5059433099664736, + "grad_norm": 4.375, + "learning_rate": 2.49314802829648e-05, + "loss": 0.1854, + "step": 4980 + }, + { + "epoch": 0.506959260388093, + "grad_norm": 2.3125, + "learning_rate": 2.4850869548900628e-05, + "loss": 0.2049, + "step": 4990 + }, + { + "epoch": 0.5079752108097125, + "grad_norm": 3.859375, + "learning_rate": 2.477026036536688e-05, + "loss": 0.2093, + "step": 5000 + }, + { + "epoch": 0.5089911612313319, + "grad_norm": 1.09375, + "learning_rate": 2.4689653570468677e-05, + "loss": 0.164, + "step": 5010 + }, + { + "epoch": 0.5100071116529513, + "grad_norm": 3.40625, + "learning_rate": 2.460905000228628e-05, + "loss": 0.1649, + "step": 5020 + }, + { + "epoch": 0.5110230620745707, + "grad_norm": 3.546875, + "learning_rate": 2.4528450498866428e-05, + "loss": 0.1777, + "step": 5030 + }, + { + "epoch": 0.5120390124961902, + "grad_norm": 3.0, + "learning_rate": 2.444785589821356e-05, + "loss": 0.1505, + "step": 5040 + }, + { + "epoch": 0.5130549629178096, + "grad_norm": 1.6484375, + "learning_rate": 2.436726703828118e-05, + "loss": 0.2672, + "step": 5050 + }, + { + "epoch": 0.514070913339429, + "grad_norm": 4.34375, + "learning_rate": 2.428668475696308e-05, + "loss": 0.1756, + "step": 5060 + }, + { + "epoch": 0.5150868637610485, + "grad_norm": 2.78125, + "learning_rate": 2.420610989208465e-05, + "loss": 0.1655, + "step": 5070 + }, + { + "epoch": 0.5161028141826679, + "grad_norm": 1.4609375, + "learning_rate": 2.412554328139419e-05, + "loss": 0.1579, + "step": 5080 + }, + { + "epoch": 0.5171187646042873, + "grad_norm": 2.28125, + "learning_rate": 2.404498576255416e-05, + "loss": 0.1599, + "step": 5090 + }, + { + "epoch": 0.5181347150259067, + "grad_norm": 0.6484375, + "learning_rate": 2.3964438173132522e-05, + "loss": 0.1508, + "step": 5100 + }, + { + "epoch": 0.5191506654475262, + "grad_norm": 3.390625, + "learning_rate": 2.388390135059395e-05, + "loss": 0.1578, + "step": 5110 + }, + { + "epoch": 0.5201666158691456, + "grad_norm": 1.21875, + "learning_rate": 2.3803376132291226e-05, + "loss": 0.1374, + "step": 5120 + }, + { + "epoch": 0.521182566290765, + "grad_norm": 4.0625, + "learning_rate": 2.3722863355456436e-05, + "loss": 0.1854, + "step": 5130 + }, + { + "epoch": 0.5221985167123845, + "grad_norm": 4.71875, + "learning_rate": 2.364236385719236e-05, + "loss": 0.1391, + "step": 5140 + }, + { + "epoch": 0.5232144671340039, + "grad_norm": 3.296875, + "learning_rate": 2.356187847446366e-05, + "loss": 0.2106, + "step": 5150 + }, + { + "epoch": 0.5242304175556233, + "grad_norm": 3.296875, + "learning_rate": 2.348140804408829e-05, + "loss": 0.2383, + "step": 5160 + }, + { + "epoch": 0.5252463679772427, + "grad_norm": 3.359375, + "learning_rate": 2.3400953402728713e-05, + "loss": 0.1537, + "step": 5170 + }, + { + "epoch": 0.5262623183988622, + "grad_norm": 1.4921875, + "learning_rate": 2.332051538688322e-05, + "loss": 0.1841, + "step": 5180 + }, + { + "epoch": 0.5272782688204816, + "grad_norm": 3.25, + "learning_rate": 2.3240094832877287e-05, + "loss": 0.1855, + "step": 5190 + }, + { + "epoch": 0.528294219242101, + "grad_norm": 3.34375, + "learning_rate": 2.3159692576854793e-05, + "loss": 0.2625, + "step": 5200 + }, + { + "epoch": 0.5293101696637205, + "grad_norm": 3.6875, + "learning_rate": 2.3079309454769413e-05, + "loss": 0.1292, + "step": 5210 + }, + { + "epoch": 0.5303261200853399, + "grad_norm": 1.1171875, + "learning_rate": 2.2998946302375827e-05, + "loss": 0.1263, + "step": 5220 + }, + { + "epoch": 0.5313420705069593, + "grad_norm": 2.71875, + "learning_rate": 2.2918603955221148e-05, + "loss": 0.2296, + "step": 5230 + }, + { + "epoch": 0.5323580209285786, + "grad_norm": 2.015625, + "learning_rate": 2.283828324863613e-05, + "loss": 0.1231, + "step": 5240 + }, + { + "epoch": 0.5333739713501982, + "grad_norm": 3.671875, + "learning_rate": 2.2757985017726557e-05, + "loss": 0.1939, + "step": 5250 + }, + { + "epoch": 0.5343899217718175, + "grad_norm": 1.9765625, + "learning_rate": 2.2677710097364495e-05, + "loss": 0.168, + "step": 5260 + }, + { + "epoch": 0.5354058721934369, + "grad_norm": 2.609375, + "learning_rate": 2.259745932217969e-05, + "loss": 0.1883, + "step": 5270 + }, + { + "epoch": 0.5364218226150563, + "grad_norm": 2.8125, + "learning_rate": 2.2517233526550817e-05, + "loss": 0.1898, + "step": 5280 + }, + { + "epoch": 0.5374377730366758, + "grad_norm": 3.125, + "learning_rate": 2.2437033544596837e-05, + "loss": 0.1838, + "step": 5290 + }, + { + "epoch": 0.5384537234582952, + "grad_norm": 4.90625, + "learning_rate": 2.2356860210168336e-05, + "loss": 0.1553, + "step": 5300 + }, + { + "epoch": 0.5394696738799146, + "grad_norm": 3.171875, + "learning_rate": 2.2276714356838824e-05, + "loss": 0.2248, + "step": 5310 + }, + { + "epoch": 0.5404856243015341, + "grad_norm": 1.34375, + "learning_rate": 2.2196596817896118e-05, + "loss": 0.1421, + "step": 5320 + }, + { + "epoch": 0.5415015747231535, + "grad_norm": 3.28125, + "learning_rate": 2.2116508426333596e-05, + "loss": 0.1947, + "step": 5330 + }, + { + "epoch": 0.5425175251447729, + "grad_norm": 1.9296875, + "learning_rate": 2.2036450014841652e-05, + "loss": 0.2207, + "step": 5340 + }, + { + "epoch": 0.5435334755663923, + "grad_norm": 0.5703125, + "learning_rate": 2.19564224157989e-05, + "loss": 0.2208, + "step": 5350 + }, + { + "epoch": 0.5445494259880118, + "grad_norm": 7.5625, + "learning_rate": 2.1876426461263654e-05, + "loss": 0.1739, + "step": 5360 + }, + { + "epoch": 0.5455653764096312, + "grad_norm": 2.15625, + "learning_rate": 2.179646298296519e-05, + "loss": 0.1938, + "step": 5370 + }, + { + "epoch": 0.5465813268312506, + "grad_norm": 4.1875, + "learning_rate": 2.171653281229511e-05, + "loss": 0.1736, + "step": 5380 + }, + { + "epoch": 0.5475972772528701, + "grad_norm": 4.65625, + "learning_rate": 2.1636636780298732e-05, + "loss": 0.2167, + "step": 5390 + }, + { + "epoch": 0.5486132276744895, + "grad_norm": 1.84375, + "learning_rate": 2.1556775717666427e-05, + "loss": 0.1711, + "step": 5400 + }, + { + "epoch": 0.5496291780961089, + "grad_norm": 5.125, + "learning_rate": 2.147695045472499e-05, + "loss": 0.1789, + "step": 5410 + }, + { + "epoch": 0.5506451285177283, + "grad_norm": 3.859375, + "learning_rate": 2.1397161821428973e-05, + "loss": 0.2187, + "step": 5420 + }, + { + "epoch": 0.5516610789393478, + "grad_norm": 2.25, + "learning_rate": 2.131741064735212e-05, + "loss": 0.1367, + "step": 5430 + }, + { + "epoch": 0.5526770293609672, + "grad_norm": 4.65625, + "learning_rate": 2.1237697761678684e-05, + "loss": 0.1574, + "step": 5440 + }, + { + "epoch": 0.5536929797825866, + "grad_norm": 1.2265625, + "learning_rate": 2.1158023993194848e-05, + "loss": 0.1301, + "step": 5450 + }, + { + "epoch": 0.5547089302042061, + "grad_norm": 4.21875, + "learning_rate": 2.107839017028005e-05, + "loss": 0.2782, + "step": 5460 + }, + { + "epoch": 0.5557248806258255, + "grad_norm": 0.52734375, + "learning_rate": 2.0998797120898457e-05, + "loss": 0.2024, + "step": 5470 + }, + { + "epoch": 0.5567408310474449, + "grad_norm": 1.46875, + "learning_rate": 2.0919245672590277e-05, + "loss": 0.1755, + "step": 5480 + }, + { + "epoch": 0.5577567814690643, + "grad_norm": 2.140625, + "learning_rate": 2.083973665246318e-05, + "loss": 0.2058, + "step": 5490 + }, + { + "epoch": 0.5587727318906838, + "grad_norm": 1.5390625, + "learning_rate": 2.076027088718373e-05, + "loss": 0.2159, + "step": 5500 + }, + { + "epoch": 0.5597886823123032, + "grad_norm": 1.9921875, + "learning_rate": 2.0680849202968743e-05, + "loss": 0.2139, + "step": 5510 + }, + { + "epoch": 0.5608046327339226, + "grad_norm": 2.4375, + "learning_rate": 2.060147242557674e-05, + "loss": 0.183, + "step": 5520 + }, + { + "epoch": 0.5618205831555421, + "grad_norm": 5.5, + "learning_rate": 2.0522141380299308e-05, + "loss": 0.1673, + "step": 5530 + }, + { + "epoch": 0.5628365335771615, + "grad_norm": 4.25, + "learning_rate": 2.044285689195258e-05, + "loss": 0.1674, + "step": 5540 + }, + { + "epoch": 0.5638524839987809, + "grad_norm": 2.109375, + "learning_rate": 2.0363619784868604e-05, + "loss": 0.1531, + "step": 5550 + }, + { + "epoch": 0.5648684344204002, + "grad_norm": 2.59375, + "learning_rate": 2.0284430882886836e-05, + "loss": 0.1665, + "step": 5560 + }, + { + "epoch": 0.5658843848420197, + "grad_norm": 3.984375, + "learning_rate": 2.020529100934549e-05, + "loss": 0.1717, + "step": 5570 + }, + { + "epoch": 0.5669003352636391, + "grad_norm": 1.6015625, + "learning_rate": 2.012620098707306e-05, + "loss": 0.1167, + "step": 5580 + }, + { + "epoch": 0.5679162856852585, + "grad_norm": 6.0625, + "learning_rate": 2.004716163837972e-05, + "loss": 0.2084, + "step": 5590 + }, + { + "epoch": 0.5689322361068779, + "grad_norm": 2.5625, + "learning_rate": 1.996817378504876e-05, + "loss": 0.1939, + "step": 5600 + }, + { + "epoch": 0.5699481865284974, + "grad_norm": 3.109375, + "learning_rate": 1.9889238248328108e-05, + "loss": 0.1241, + "step": 5610 + }, + { + "epoch": 0.5709641369501168, + "grad_norm": 4.875, + "learning_rate": 1.981035584892171e-05, + "loss": 0.1865, + "step": 5620 + }, + { + "epoch": 0.5719800873717362, + "grad_norm": 2.984375, + "learning_rate": 1.9731527406981072e-05, + "loss": 0.1639, + "step": 5630 + }, + { + "epoch": 0.5729960377933557, + "grad_norm": 4.4375, + "learning_rate": 1.9652753742096655e-05, + "loss": 0.2019, + "step": 5640 + }, + { + "epoch": 0.5740119882149751, + "grad_norm": 4.3125, + "learning_rate": 1.9574035673289432e-05, + "loss": 0.1829, + "step": 5650 + }, + { + "epoch": 0.5750279386365945, + "grad_norm": 3.203125, + "learning_rate": 1.9495374019002312e-05, + "loss": 0.2267, + "step": 5660 + }, + { + "epoch": 0.5760438890582139, + "grad_norm": 1.765625, + "learning_rate": 1.9416769597091673e-05, + "loss": 0.1411, + "step": 5670 + }, + { + "epoch": 0.5770598394798334, + "grad_norm": 2.640625, + "learning_rate": 1.9338223224818818e-05, + "loss": 0.1476, + "step": 5680 + }, + { + "epoch": 0.5780757899014528, + "grad_norm": 4.84375, + "learning_rate": 1.9259735718841524e-05, + "loss": 0.1417, + "step": 5690 + }, + { + "epoch": 0.5790917403230722, + "grad_norm": 2.421875, + "learning_rate": 1.918130789520551e-05, + "loss": 0.1592, + "step": 5700 + }, + { + "epoch": 0.5801076907446917, + "grad_norm": 2.984375, + "learning_rate": 1.9102940569335963e-05, + "loss": 0.161, + "step": 5710 + }, + { + "epoch": 0.5811236411663111, + "grad_norm": 1.0234375, + "learning_rate": 1.9024634556029093e-05, + "loss": 0.1614, + "step": 5720 + }, + { + "epoch": 0.5821395915879305, + "grad_norm": 2.90625, + "learning_rate": 1.89463906694436e-05, + "loss": 0.1505, + "step": 5730 + }, + { + "epoch": 0.5831555420095499, + "grad_norm": 2.875, + "learning_rate": 1.8868209723092286e-05, + "loss": 0.1674, + "step": 5740 + }, + { + "epoch": 0.5841714924311694, + "grad_norm": 0.408203125, + "learning_rate": 1.8790092529833508e-05, + "loss": 0.1468, + "step": 5750 + }, + { + "epoch": 0.5851874428527888, + "grad_norm": 5.1875, + "learning_rate": 1.871203990186281e-05, + "loss": 0.1903, + "step": 5760 + }, + { + "epoch": 0.5862033932744082, + "grad_norm": 0.5546875, + "learning_rate": 1.8634052650704415e-05, + "loss": 0.2644, + "step": 5770 + }, + { + "epoch": 0.5872193436960277, + "grad_norm": 3.203125, + "learning_rate": 1.8556131587202848e-05, + "loss": 0.1968, + "step": 5780 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 2.484375, + "learning_rate": 1.8478277521514424e-05, + "loss": 0.2249, + "step": 5790 + }, + { + "epoch": 0.5892512445392665, + "grad_norm": 4.0, + "learning_rate": 1.8400491263098906e-05, + "loss": 0.1881, + "step": 5800 + }, + { + "epoch": 0.5902671949608859, + "grad_norm": 1.90625, + "learning_rate": 1.832277362071106e-05, + "loss": 0.1352, + "step": 5810 + }, + { + "epoch": 0.5912831453825054, + "grad_norm": 2.765625, + "learning_rate": 1.824512540239221e-05, + "loss": 0.2737, + "step": 5820 + }, + { + "epoch": 0.5922990958041248, + "grad_norm": 2.609375, + "learning_rate": 1.81675474154619e-05, + "loss": 0.1566, + "step": 5830 + }, + { + "epoch": 0.5933150462257442, + "grad_norm": 2.6875, + "learning_rate": 1.8090040466509444e-05, + "loss": 0.1999, + "step": 5840 + }, + { + "epoch": 0.5943309966473637, + "grad_norm": 2.609375, + "learning_rate": 1.8012605361385592e-05, + "loss": 0.2372, + "step": 5850 + }, + { + "epoch": 0.5953469470689831, + "grad_norm": 8.125, + "learning_rate": 1.7935242905194087e-05, + "loss": 0.2411, + "step": 5860 + }, + { + "epoch": 0.5963628974906025, + "grad_norm": 3.46875, + "learning_rate": 1.785795390228336e-05, + "loss": 0.138, + "step": 5870 + }, + { + "epoch": 0.5973788479122218, + "grad_norm": 2.3125, + "learning_rate": 1.7780739156238125e-05, + "loss": 0.1867, + "step": 5880 + }, + { + "epoch": 0.5983947983338413, + "grad_norm": 4.0625, + "learning_rate": 1.770359946987105e-05, + "loss": 0.2091, + "step": 5890 + }, + { + "epoch": 0.5994107487554607, + "grad_norm": 5.21875, + "learning_rate": 1.7626535645214378e-05, + "loss": 0.2091, + "step": 5900 + }, + { + "epoch": 0.6004266991770801, + "grad_norm": 3.15625, + "learning_rate": 1.7549548483511614e-05, + "loss": 0.1927, + "step": 5910 + }, + { + "epoch": 0.6014426495986995, + "grad_norm": 4.71875, + "learning_rate": 1.7472638785209198e-05, + "loss": 0.1893, + "step": 5920 + }, + { + "epoch": 0.602458600020319, + "grad_norm": 3.015625, + "learning_rate": 1.7395807349948145e-05, + "loss": 0.1557, + "step": 5930 + }, + { + "epoch": 0.6034745504419384, + "grad_norm": 2.9375, + "learning_rate": 1.73190549765558e-05, + "loss": 0.1717, + "step": 5940 + }, + { + "epoch": 0.6044905008635578, + "grad_norm": 3.109375, + "learning_rate": 1.724238246303745e-05, + "loss": 0.1879, + "step": 5950 + }, + { + "epoch": 0.6055064512851773, + "grad_norm": 3.875, + "learning_rate": 1.71657906065681e-05, + "loss": 0.1908, + "step": 5960 + }, + { + "epoch": 0.6065224017067967, + "grad_norm": 5.09375, + "learning_rate": 1.7089280203484115e-05, + "loss": 0.1712, + "step": 5970 + }, + { + "epoch": 0.6075383521284161, + "grad_norm": 3.015625, + "learning_rate": 1.701285204927502e-05, + "loss": 0.1454, + "step": 5980 + }, + { + "epoch": 0.6085543025500355, + "grad_norm": 3.265625, + "learning_rate": 1.693650693857515e-05, + "loss": 0.2283, + "step": 5990 + }, + { + "epoch": 0.609570252971655, + "grad_norm": 3.40625, + "learning_rate": 1.6860245665155466e-05, + "loss": 0.2188, + "step": 6000 + }, + { + "epoch": 0.6105862033932744, + "grad_norm": 2.5625, + "learning_rate": 1.678406902191521e-05, + "loss": 0.1605, + "step": 6010 + }, + { + "epoch": 0.6116021538148938, + "grad_norm": 0.6796875, + "learning_rate": 1.670797780087374e-05, + "loss": 0.1472, + "step": 6020 + }, + { + "epoch": 0.6126181042365133, + "grad_norm": 2.234375, + "learning_rate": 1.6631972793162288e-05, + "loss": 0.1676, + "step": 6030 + }, + { + "epoch": 0.6136340546581327, + "grad_norm": 1.25, + "learning_rate": 1.6556054789015662e-05, + "loss": 0.1508, + "step": 6040 + }, + { + "epoch": 0.6146500050797521, + "grad_norm": 4.78125, + "learning_rate": 1.6480224577764132e-05, + "loss": 0.1981, + "step": 6050 + }, + { + "epoch": 0.6156659555013715, + "grad_norm": 3.46875, + "learning_rate": 1.6404482947825137e-05, + "loss": 0.2514, + "step": 6060 + }, + { + "epoch": 0.616681905922991, + "grad_norm": 1.265625, + "learning_rate": 1.6328830686695154e-05, + "loss": 0.2397, + "step": 6070 + }, + { + "epoch": 0.6176978563446104, + "grad_norm": 1.953125, + "learning_rate": 1.625326858094144e-05, + "loss": 0.1523, + "step": 6080 + }, + { + "epoch": 0.6187138067662298, + "grad_norm": 3.484375, + "learning_rate": 1.6177797416193953e-05, + "loss": 0.218, + "step": 6090 + }, + { + "epoch": 0.6197297571878493, + "grad_norm": 3.484375, + "learning_rate": 1.6102417977137052e-05, + "loss": 0.1476, + "step": 6100 + }, + { + "epoch": 0.6207457076094687, + "grad_norm": 4.90625, + "learning_rate": 1.602713104750147e-05, + "loss": 0.1818, + "step": 6110 + }, + { + "epoch": 0.6217616580310881, + "grad_norm": 4.375, + "learning_rate": 1.5951937410056087e-05, + "loss": 0.2061, + "step": 6120 + }, + { + "epoch": 0.6227776084527075, + "grad_norm": 6.3125, + "learning_rate": 1.587683784659979e-05, + "loss": 0.1566, + "step": 6130 + }, + { + "epoch": 0.623793558874327, + "grad_norm": 2.828125, + "learning_rate": 1.58018331379534e-05, + "loss": 0.1376, + "step": 6140 + }, + { + "epoch": 0.6248095092959464, + "grad_norm": 2.40625, + "learning_rate": 1.572692406395149e-05, + "loss": 0.1655, + "step": 6150 + }, + { + "epoch": 0.6258254597175658, + "grad_norm": 4.34375, + "learning_rate": 1.5652111403434338e-05, + "loss": 0.2363, + "step": 6160 + }, + { + "epoch": 0.6268414101391853, + "grad_norm": 2.453125, + "learning_rate": 1.5577395934239757e-05, + "loss": 0.2464, + "step": 6170 + }, + { + "epoch": 0.6278573605608047, + "grad_norm": 2.53125, + "learning_rate": 1.5502778433195085e-05, + "loss": 0.1898, + "step": 6180 + }, + { + "epoch": 0.628873310982424, + "grad_norm": 2.28125, + "learning_rate": 1.5428259676109048e-05, + "loss": 0.1804, + "step": 6190 + }, + { + "epoch": 0.6298892614040434, + "grad_norm": 4.3125, + "learning_rate": 1.5353840437763732e-05, + "loss": 0.1409, + "step": 6200 + }, + { + "epoch": 0.630905211825663, + "grad_norm": 2.5625, + "learning_rate": 1.5279521491906496e-05, + "loss": 0.2449, + "step": 6210 + }, + { + "epoch": 0.6319211622472823, + "grad_norm": 3.0625, + "learning_rate": 1.520530361124195e-05, + "loss": 0.2103, + "step": 6220 + }, + { + "epoch": 0.6329371126689017, + "grad_norm": 2.609375, + "learning_rate": 1.5131187567423937e-05, + "loss": 0.2156, + "step": 6230 + }, + { + "epoch": 0.6339530630905211, + "grad_norm": 2.703125, + "learning_rate": 1.5057174131047446e-05, + "loss": 0.161, + "step": 6240 + }, + { + "epoch": 0.6349690135121406, + "grad_norm": 3.265625, + "learning_rate": 1.4983264071640679e-05, + "loss": 0.1757, + "step": 6250 + }, + { + "epoch": 0.63598496393376, + "grad_norm": 3.15625, + "learning_rate": 1.490945815765699e-05, + "loss": 0.2011, + "step": 6260 + }, + { + "epoch": 0.6370009143553794, + "grad_norm": 5.375, + "learning_rate": 1.4835757156466945e-05, + "loss": 0.1658, + "step": 6270 + }, + { + "epoch": 0.6380168647769989, + "grad_norm": 2.984375, + "learning_rate": 1.4762161834350271e-05, + "loss": 0.1754, + "step": 6280 + }, + { + "epoch": 0.6390328151986183, + "grad_norm": 2.015625, + "learning_rate": 1.4688672956487987e-05, + "loss": 0.1427, + "step": 6290 + }, + { + "epoch": 0.6400487656202377, + "grad_norm": 3.78125, + "learning_rate": 1.4615291286954352e-05, + "loss": 0.1517, + "step": 6300 + }, + { + "epoch": 0.6410647160418571, + "grad_norm": 2.859375, + "learning_rate": 1.4542017588709005e-05, + "loss": 0.2348, + "step": 6310 + }, + { + "epoch": 0.6420806664634766, + "grad_norm": 2.421875, + "learning_rate": 1.4468852623588961e-05, + "loss": 0.2089, + "step": 6320 + }, + { + "epoch": 0.643096616885096, + "grad_norm": 2.15625, + "learning_rate": 1.4395797152300719e-05, + "loss": 0.1702, + "step": 6330 + }, + { + "epoch": 0.6441125673067154, + "grad_norm": 1.53125, + "learning_rate": 1.4322851934412382e-05, + "loss": 0.1017, + "step": 6340 + }, + { + "epoch": 0.6451285177283349, + "grad_norm": 1.90625, + "learning_rate": 1.4250017728345716e-05, + "loss": 0.1813, + "step": 6350 + }, + { + "epoch": 0.6461444681499543, + "grad_norm": 2.015625, + "learning_rate": 1.4177295291368292e-05, + "loss": 0.1095, + "step": 6360 + }, + { + "epoch": 0.6471604185715737, + "grad_norm": 2.625, + "learning_rate": 1.410468537958558e-05, + "loss": 0.2259, + "step": 6370 + }, + { + "epoch": 0.6481763689931931, + "grad_norm": 3.5, + "learning_rate": 1.4032188747933136e-05, + "loss": 0.1595, + "step": 6380 + }, + { + "epoch": 0.6491923194148126, + "grad_norm": 5.21875, + "learning_rate": 1.39598061501687e-05, + "loss": 0.2226, + "step": 6390 + }, + { + "epoch": 0.650208269836432, + "grad_norm": 5.34375, + "learning_rate": 1.388753833886442e-05, + "loss": 0.2132, + "step": 6400 + }, + { + "epoch": 0.6512242202580514, + "grad_norm": 3.640625, + "learning_rate": 1.3815386065398945e-05, + "loss": 0.1227, + "step": 6410 + }, + { + "epoch": 0.6522401706796709, + "grad_norm": 1.0, + "learning_rate": 1.3743350079949705e-05, + "loss": 0.1755, + "step": 6420 + }, + { + "epoch": 0.6532561211012903, + "grad_norm": 2.359375, + "learning_rate": 1.3671431131485057e-05, + "loss": 0.1552, + "step": 6430 + }, + { + "epoch": 0.6542720715229097, + "grad_norm": 5.3125, + "learning_rate": 1.3599629967756483e-05, + "loss": 0.1917, + "step": 6440 + }, + { + "epoch": 0.6552880219445291, + "grad_norm": 4.625, + "learning_rate": 1.3527947335290877e-05, + "loss": 0.1812, + "step": 6450 + }, + { + "epoch": 0.6563039723661486, + "grad_norm": 1.234375, + "learning_rate": 1.3456383979382708e-05, + "loss": 0.1896, + "step": 6460 + }, + { + "epoch": 0.657319922787768, + "grad_norm": 3.984375, + "learning_rate": 1.3384940644086352e-05, + "loss": 0.1484, + "step": 6470 + }, + { + "epoch": 0.6583358732093874, + "grad_norm": 2.40625, + "learning_rate": 1.3313618072208268e-05, + "loss": 0.1334, + "step": 6480 + }, + { + "epoch": 0.6593518236310069, + "grad_norm": 4.375, + "learning_rate": 1.3242417005299357e-05, + "loss": 0.1351, + "step": 6490 + }, + { + "epoch": 0.6603677740526263, + "grad_norm": 2.640625, + "learning_rate": 1.31713381836472e-05, + "loss": 0.1717, + "step": 6500 + }, + { + "epoch": 0.6613837244742456, + "grad_norm": 2.640625, + "learning_rate": 1.3100382346268392e-05, + "loss": 0.1867, + "step": 6510 + }, + { + "epoch": 0.662399674895865, + "grad_norm": 1.734375, + "learning_rate": 1.3029550230900812e-05, + "loss": 0.1997, + "step": 6520 + }, + { + "epoch": 0.6634156253174845, + "grad_norm": 3.609375, + "learning_rate": 1.2958842573996016e-05, + "loss": 0.1969, + "step": 6530 + }, + { + "epoch": 0.6644315757391039, + "grad_norm": 3.578125, + "learning_rate": 1.2888260110711525e-05, + "loss": 0.1469, + "step": 6540 + }, + { + "epoch": 0.6654475261607233, + "grad_norm": 1.3515625, + "learning_rate": 1.2817803574903212e-05, + "loss": 0.1524, + "step": 6550 + }, + { + "epoch": 0.6664634765823427, + "grad_norm": 2.109375, + "learning_rate": 1.2747473699117668e-05, + "loss": 0.159, + "step": 6560 + }, + { + "epoch": 0.6674794270039622, + "grad_norm": 1.53125, + "learning_rate": 1.267727121458458e-05, + "loss": 0.1999, + "step": 6570 + }, + { + "epoch": 0.6684953774255816, + "grad_norm": 1.7265625, + "learning_rate": 1.2607196851209137e-05, + "loss": 0.2216, + "step": 6580 + }, + { + "epoch": 0.669511327847201, + "grad_norm": 3.125, + "learning_rate": 1.2537251337564412e-05, + "loss": 0.1607, + "step": 6590 + }, + { + "epoch": 0.6705272782688205, + "grad_norm": 2.421875, + "learning_rate": 1.2467435400883839e-05, + "loss": 0.2187, + "step": 6600 + }, + { + "epoch": 0.6715432286904399, + "grad_norm": 1.5078125, + "learning_rate": 1.239774976705359e-05, + "loss": 0.1753, + "step": 6610 + }, + { + "epoch": 0.6725591791120593, + "grad_norm": 1.140625, + "learning_rate": 1.2328195160605092e-05, + "loss": 0.194, + "step": 6620 + }, + { + "epoch": 0.6735751295336787, + "grad_norm": 4.9375, + "learning_rate": 1.225877230470743e-05, + "loss": 0.1485, + "step": 6630 + }, + { + "epoch": 0.6745910799552982, + "grad_norm": 3.65625, + "learning_rate": 1.218948192115988e-05, + "loss": 0.1847, + "step": 6640 + }, + { + "epoch": 0.6756070303769176, + "grad_norm": 3.875, + "learning_rate": 1.21203247303844e-05, + "loss": 0.1874, + "step": 6650 + }, + { + "epoch": 0.676622980798537, + "grad_norm": 2.65625, + "learning_rate": 1.2051301451418073e-05, + "loss": 0.2377, + "step": 6660 + }, + { + "epoch": 0.6776389312201565, + "grad_norm": 2.09375, + "learning_rate": 1.198241280190574e-05, + "loss": 0.1508, + "step": 6670 + }, + { + "epoch": 0.6786548816417759, + "grad_norm": 2.203125, + "learning_rate": 1.1913659498092431e-05, + "loss": 0.1537, + "step": 6680 + }, + { + "epoch": 0.6796708320633953, + "grad_norm": 2.484375, + "learning_rate": 1.184504225481601e-05, + "loss": 0.2339, + "step": 6690 + }, + { + "epoch": 0.6806867824850147, + "grad_norm": 5.625, + "learning_rate": 1.177656178549966e-05, + "loss": 0.2102, + "step": 6700 + }, + { + "epoch": 0.6817027329066342, + "grad_norm": 2.5, + "learning_rate": 1.1708218802144536e-05, + "loss": 0.1435, + "step": 6710 + }, + { + "epoch": 0.6827186833282536, + "grad_norm": 3.84375, + "learning_rate": 1.1640014015322323e-05, + "loss": 0.1823, + "step": 6720 + }, + { + "epoch": 0.683734633749873, + "grad_norm": 2.359375, + "learning_rate": 1.1571948134167862e-05, + "loss": 0.1154, + "step": 6730 + }, + { + "epoch": 0.6847505841714925, + "grad_norm": 2.90625, + "learning_rate": 1.1504021866371761e-05, + "loss": 0.2105, + "step": 6740 + }, + { + "epoch": 0.6857665345931119, + "grad_norm": 5.46875, + "learning_rate": 1.143623591817304e-05, + "loss": 0.1317, + "step": 6750 + }, + { + "epoch": 0.6867824850147313, + "grad_norm": 3.34375, + "learning_rate": 1.1368590994351835e-05, + "loss": 0.1406, + "step": 6760 + }, + { + "epoch": 0.6877984354363507, + "grad_norm": 3.78125, + "learning_rate": 1.130108779822198e-05, + "loss": 0.1425, + "step": 6770 + }, + { + "epoch": 0.6888143858579702, + "grad_norm": 0.77734375, + "learning_rate": 1.1233727031623783e-05, + "loss": 0.1623, + "step": 6780 + }, + { + "epoch": 0.6898303362795896, + "grad_norm": 4.625, + "learning_rate": 1.1166509394916682e-05, + "loss": 0.1591, + "step": 6790 + }, + { + "epoch": 0.690846286701209, + "grad_norm": 3.84375, + "learning_rate": 1.1099435586971982e-05, + "loss": 0.1758, + "step": 6800 + }, + { + "epoch": 0.6918622371228285, + "grad_norm": 2.4375, + "learning_rate": 1.1032506305165555e-05, + "loss": 0.1018, + "step": 6810 + }, + { + "epoch": 0.6928781875444479, + "grad_norm": 3.203125, + "learning_rate": 1.0965722245370641e-05, + "loss": 0.1485, + "step": 6820 + }, + { + "epoch": 0.6938941379660672, + "grad_norm": 0.7109375, + "learning_rate": 1.0899084101950561e-05, + "loss": 0.1762, + "step": 6830 + }, + { + "epoch": 0.6949100883876866, + "grad_norm": 1.9765625, + "learning_rate": 1.0832592567751555e-05, + "loss": 0.1402, + "step": 6840 + }, + { + "epoch": 0.6959260388093061, + "grad_norm": 1.4609375, + "learning_rate": 1.0766248334095505e-05, + "loss": 0.2278, + "step": 6850 + }, + { + "epoch": 0.6969419892309255, + "grad_norm": 3.953125, + "learning_rate": 1.0700052090772828e-05, + "loss": 0.1969, + "step": 6860 + }, + { + "epoch": 0.6979579396525449, + "grad_norm": 2.453125, + "learning_rate": 1.0634004526035249e-05, + "loss": 0.2073, + "step": 6870 + }, + { + "epoch": 0.6989738900741643, + "grad_norm": 1.6171875, + "learning_rate": 1.0568106326588645e-05, + "loss": 0.1902, + "step": 6880 + }, + { + "epoch": 0.6999898404957838, + "grad_norm": 1.2734375, + "learning_rate": 1.0502358177585953e-05, + "loss": 0.2165, + "step": 6890 + }, + { + "epoch": 0.7010057909174032, + "grad_norm": 1.671875, + "learning_rate": 1.0436760762619977e-05, + "loss": 0.1952, + "step": 6900 + }, + { + "epoch": 0.7020217413390226, + "grad_norm": 2.8125, + "learning_rate": 1.0371314763716347e-05, + "loss": 0.1422, + "step": 6910 + }, + { + "epoch": 0.7030376917606421, + "grad_norm": 2.53125, + "learning_rate": 1.0306020861326388e-05, + "loss": 0.0961, + "step": 6920 + }, + { + "epoch": 0.7040536421822615, + "grad_norm": 3.046875, + "learning_rate": 1.0240879734320068e-05, + "loss": 0.1542, + "step": 6930 + }, + { + "epoch": 0.7050695926038809, + "grad_norm": 2.859375, + "learning_rate": 1.0175892059978901e-05, + "loss": 0.1748, + "step": 6940 + }, + { + "epoch": 0.7060855430255003, + "grad_norm": 2.671875, + "learning_rate": 1.0111058513988958e-05, + "loss": 0.0819, + "step": 6950 + }, + { + "epoch": 0.7071014934471198, + "grad_norm": 3.5625, + "learning_rate": 1.0046379770433803e-05, + "loss": 0.1933, + "step": 6960 + }, + { + "epoch": 0.7081174438687392, + "grad_norm": 2.859375, + "learning_rate": 9.98185650178749e-06, + "loss": 0.1891, + "step": 6970 + }, + { + "epoch": 0.7091333942903586, + "grad_norm": 3.15625, + "learning_rate": 9.917489378907591e-06, + "loss": 0.2102, + "step": 6980 + }, + { + "epoch": 0.7101493447119781, + "grad_norm": 6.40625, + "learning_rate": 9.853279071028212e-06, + "loss": 0.1714, + "step": 6990 + }, + { + "epoch": 0.7111652951335975, + "grad_norm": 2.375, + "learning_rate": 9.78922624575303e-06, + "loss": 0.1299, + "step": 7000 + }, + { + "epoch": 0.7121812455552169, + "grad_norm": 2.078125, + "learning_rate": 9.72533156904833e-06, + "loss": 0.1914, + "step": 7010 + }, + { + "epoch": 0.7131971959768363, + "grad_norm": 3.859375, + "learning_rate": 9.661595705236137e-06, + "loss": 0.2377, + "step": 7020 + }, + { + "epoch": 0.7142131463984558, + "grad_norm": 1.171875, + "learning_rate": 9.598019316987244e-06, + "loss": 0.1851, + "step": 7030 + }, + { + "epoch": 0.7152290968200752, + "grad_norm": 1.078125, + "learning_rate": 9.53460306531439e-06, + "loss": 0.2661, + "step": 7040 + }, + { + "epoch": 0.7162450472416946, + "grad_norm": 1.6484375, + "learning_rate": 9.471347609565311e-06, + "loss": 0.1669, + "step": 7050 + }, + { + "epoch": 0.7172609976633141, + "grad_norm": 4.59375, + "learning_rate": 9.408253607415957e-06, + "loss": 0.2487, + "step": 7060 + }, + { + "epoch": 0.7182769480849335, + "grad_norm": 3.09375, + "learning_rate": 9.345321714863614e-06, + "loss": 0.186, + "step": 7070 + }, + { + "epoch": 0.7192928985065529, + "grad_norm": 6.0625, + "learning_rate": 9.282552586220075e-06, + "loss": 0.2249, + "step": 7080 + }, + { + "epoch": 0.7203088489281723, + "grad_norm": 1.5703125, + "learning_rate": 9.219946874104885e-06, + "loss": 0.1255, + "step": 7090 + }, + { + "epoch": 0.7213247993497918, + "grad_norm": 1.9453125, + "learning_rate": 9.157505229438481e-06, + "loss": 0.1999, + "step": 7100 + }, + { + "epoch": 0.7223407497714112, + "grad_norm": 5.1875, + "learning_rate": 9.095228301435518e-06, + "loss": 0.199, + "step": 7110 + }, + { + "epoch": 0.7233567001930306, + "grad_norm": 2.078125, + "learning_rate": 9.03311673759802e-06, + "loss": 0.2182, + "step": 7120 + }, + { + "epoch": 0.7243726506146501, + "grad_norm": 6.46875, + "learning_rate": 8.971171183708733e-06, + "loss": 0.1573, + "step": 7130 + }, + { + "epoch": 0.7253886010362695, + "grad_norm": 3.015625, + "learning_rate": 8.909392283824353e-06, + "loss": 0.2044, + "step": 7140 + }, + { + "epoch": 0.7264045514578888, + "grad_norm": 2.921875, + "learning_rate": 8.847780680268872e-06, + "loss": 0.11, + "step": 7150 + }, + { + "epoch": 0.7274205018795082, + "grad_norm": 2.96875, + "learning_rate": 8.786337013626853e-06, + "loss": 0.1897, + "step": 7160 + }, + { + "epoch": 0.7284364523011277, + "grad_norm": 1.7578125, + "learning_rate": 8.725061922736799e-06, + "loss": 0.153, + "step": 7170 + }, + { + "epoch": 0.7294524027227471, + "grad_norm": 1.609375, + "learning_rate": 8.663956044684532e-06, + "loss": 0.1746, + "step": 7180 + }, + { + "epoch": 0.7304683531443665, + "grad_norm": 1.9375, + "learning_rate": 8.603020014796507e-06, + "loss": 0.2284, + "step": 7190 + }, + { + "epoch": 0.7314843035659859, + "grad_norm": 1.515625, + "learning_rate": 8.542254466633273e-06, + "loss": 0.1186, + "step": 7200 + }, + { + "epoch": 0.7325002539876054, + "grad_norm": 1.671875, + "learning_rate": 8.481660031982844e-06, + "loss": 0.1971, + "step": 7210 + }, + { + "epoch": 0.7335162044092248, + "grad_norm": 1.453125, + "learning_rate": 8.421237340854157e-06, + "loss": 0.196, + "step": 7220 + }, + { + "epoch": 0.7345321548308442, + "grad_norm": 0.65234375, + "learning_rate": 8.360987021470479e-06, + "loss": 0.1724, + "step": 7230 + }, + { + "epoch": 0.7355481052524637, + "grad_norm": 2.84375, + "learning_rate": 8.300909700262929e-06, + "loss": 0.175, + "step": 7240 + }, + { + "epoch": 0.7365640556740831, + "grad_norm": 3.109375, + "learning_rate": 8.241006001863924e-06, + "loss": 0.2276, + "step": 7250 + }, + { + "epoch": 0.7375800060957025, + "grad_norm": 4.8125, + "learning_rate": 8.181276549100714e-06, + "loss": 0.2029, + "step": 7260 + }, + { + "epoch": 0.7385959565173219, + "grad_norm": 4.03125, + "learning_rate": 8.12172196298887e-06, + "loss": 0.175, + "step": 7270 + }, + { + "epoch": 0.7396119069389414, + "grad_norm": 3.046875, + "learning_rate": 8.062342862725878e-06, + "loss": 0.1662, + "step": 7280 + }, + { + "epoch": 0.7406278573605608, + "grad_norm": 3.375, + "learning_rate": 8.003139865684662e-06, + "loss": 0.1616, + "step": 7290 + }, + { + "epoch": 0.7416438077821802, + "grad_norm": 2.5625, + "learning_rate": 7.944113587407157e-06, + "loss": 0.2448, + "step": 7300 + }, + { + "epoch": 0.7426597582037997, + "grad_norm": 4.125, + "learning_rate": 7.885264641597961e-06, + "loss": 0.1618, + "step": 7310 + }, + { + "epoch": 0.7436757086254191, + "grad_norm": 3.5, + "learning_rate": 7.826593640117889e-06, + "loss": 0.1134, + "step": 7320 + }, + { + "epoch": 0.7446916590470385, + "grad_norm": 2.6875, + "learning_rate": 7.76810119297767e-06, + "loss": 0.1795, + "step": 7330 + }, + { + "epoch": 0.7457076094686579, + "grad_norm": 4.34375, + "learning_rate": 7.709787908331556e-06, + "loss": 0.2736, + "step": 7340 + }, + { + "epoch": 0.7467235598902774, + "grad_norm": 1.21875, + "learning_rate": 7.651654392471038e-06, + "loss": 0.139, + "step": 7350 + }, + { + "epoch": 0.7477395103118968, + "grad_norm": 3.578125, + "learning_rate": 7.593701249818521e-06, + "loss": 0.2023, + "step": 7360 + }, + { + "epoch": 0.7487554607335162, + "grad_norm": 2.15625, + "learning_rate": 7.535929082921048e-06, + "loss": 0.1702, + "step": 7370 + }, + { + "epoch": 0.7497714111551357, + "grad_norm": 1.96875, + "learning_rate": 7.47833849244402e-06, + "loss": 0.1835, + "step": 7380 + }, + { + "epoch": 0.7507873615767551, + "grad_norm": 2.796875, + "learning_rate": 7.420930077164959e-06, + "loss": 0.1713, + "step": 7390 + }, + { + "epoch": 0.7518033119983745, + "grad_norm": 4.46875, + "learning_rate": 7.363704433967311e-06, + "loss": 0.1906, + "step": 7400 + }, + { + "epoch": 0.7528192624199939, + "grad_norm": 1.75, + "learning_rate": 7.306662157834185e-06, + "loss": 0.1421, + "step": 7410 + }, + { + "epoch": 0.7538352128416134, + "grad_norm": 1.140625, + "learning_rate": 7.2498038418422145e-06, + "loss": 0.1793, + "step": 7420 + }, + { + "epoch": 0.7548511632632328, + "grad_norm": 2.578125, + "learning_rate": 7.193130077155374e-06, + "loss": 0.1603, + "step": 7430 + }, + { + "epoch": 0.7558671136848522, + "grad_norm": 4.3125, + "learning_rate": 7.13664145301883e-06, + "loss": 0.2169, + "step": 7440 + }, + { + "epoch": 0.7568830641064717, + "grad_norm": 3.078125, + "learning_rate": 7.0803385567528025e-06, + "loss": 0.1685, + "step": 7450 + }, + { + "epoch": 0.757899014528091, + "grad_norm": 3.5625, + "learning_rate": 7.024221973746495e-06, + "loss": 0.2282, + "step": 7460 + }, + { + "epoch": 0.7589149649497104, + "grad_norm": 2.265625, + "learning_rate": 6.968292287451961e-06, + "loss": 0.1786, + "step": 7470 + }, + { + "epoch": 0.7599309153713298, + "grad_norm": 4.71875, + "learning_rate": 6.912550079378091e-06, + "loss": 0.1811, + "step": 7480 + }, + { + "epoch": 0.7609468657929493, + "grad_norm": 2.328125, + "learning_rate": 6.856995929084506e-06, + "loss": 0.1747, + "step": 7490 + }, + { + "epoch": 0.7619628162145687, + "grad_norm": 5.21875, + "learning_rate": 6.801630414175589e-06, + "loss": 0.2028, + "step": 7500 + }, + { + "epoch": 0.7629787666361881, + "grad_norm": 3.78125, + "learning_rate": 6.746454110294451e-06, + "loss": 0.2255, + "step": 7510 + }, + { + "epoch": 0.7639947170578075, + "grad_norm": 1.625, + "learning_rate": 6.691467591116931e-06, + "loss": 0.1604, + "step": 7520 + }, + { + "epoch": 0.765010667479427, + "grad_norm": 1.7734375, + "learning_rate": 6.6366714283456755e-06, + "loss": 0.2559, + "step": 7530 + }, + { + "epoch": 0.7660266179010464, + "grad_norm": 4.59375, + "learning_rate": 6.582066191704142e-06, + "loss": 0.2034, + "step": 7540 + }, + { + "epoch": 0.7670425683226658, + "grad_norm": 1.578125, + "learning_rate": 6.527652448930724e-06, + "loss": 0.148, + "step": 7550 + }, + { + "epoch": 0.7680585187442853, + "grad_norm": 1.7109375, + "learning_rate": 6.4734307657728e-06, + "loss": 0.1811, + "step": 7560 + }, + { + "epoch": 0.7690744691659047, + "grad_norm": 1.2734375, + "learning_rate": 6.419401705980924e-06, + "loss": 0.1407, + "step": 7570 + }, + { + "epoch": 0.7700904195875241, + "grad_norm": 2.25, + "learning_rate": 6.365565831302869e-06, + "loss": 0.1893, + "step": 7580 + }, + { + "epoch": 0.7711063700091435, + "grad_norm": 1.625, + "learning_rate": 6.311923701477854e-06, + "loss": 0.1835, + "step": 7590 + }, + { + "epoch": 0.772122320430763, + "grad_norm": 2.375, + "learning_rate": 6.258475874230713e-06, + "loss": 0.1579, + "step": 7600 + }, + { + "epoch": 0.7731382708523824, + "grad_norm": 4.5, + "learning_rate": 6.205222905266067e-06, + "loss": 0.1794, + "step": 7610 + }, + { + "epoch": 0.7741542212740018, + "grad_norm": 4.25, + "learning_rate": 6.152165348262598e-06, + "loss": 0.1477, + "step": 7620 + }, + { + "epoch": 0.7751701716956213, + "grad_norm": 1.9765625, + "learning_rate": 6.0993037548672246e-06, + "loss": 0.2396, + "step": 7630 + }, + { + "epoch": 0.7761861221172407, + "grad_norm": 2.671875, + "learning_rate": 6.046638674689454e-06, + "loss": 0.1717, + "step": 7640 + }, + { + "epoch": 0.7772020725388601, + "grad_norm": 3.671875, + "learning_rate": 5.994170655295567e-06, + "loss": 0.2646, + "step": 7650 + }, + { + "epoch": 0.7782180229604795, + "grad_norm": 1.3046875, + "learning_rate": 5.9419002422030106e-06, + "loss": 0.1553, + "step": 7660 + }, + { + "epoch": 0.779233973382099, + "grad_norm": 3.734375, + "learning_rate": 5.889827978874665e-06, + "loss": 0.1854, + "step": 7670 + }, + { + "epoch": 0.7802499238037184, + "grad_norm": 2.140625, + "learning_rate": 5.837954406713245e-06, + "loss": 0.1857, + "step": 7680 + }, + { + "epoch": 0.7812658742253378, + "grad_norm": 3.34375, + "learning_rate": 5.786280065055619e-06, + "loss": 0.1797, + "step": 7690 + }, + { + "epoch": 0.7822818246469573, + "grad_norm": 0.97265625, + "learning_rate": 5.734805491167244e-06, + "loss": 0.1488, + "step": 7700 + }, + { + "epoch": 0.7832977750685767, + "grad_norm": 2.078125, + "learning_rate": 5.683531220236576e-06, + "loss": 0.1688, + "step": 7710 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 3.046875, + "learning_rate": 5.632457785369455e-06, + "loss": 0.1503, + "step": 7720 + }, + { + "epoch": 0.7853296759118155, + "grad_norm": 1.6875, + "learning_rate": 5.581585717583637e-06, + "loss": 0.1658, + "step": 7730 + }, + { + "epoch": 0.786345626333435, + "grad_norm": 3.421875, + "learning_rate": 5.530915545803209e-06, + "loss": 0.2112, + "step": 7740 + }, + { + "epoch": 0.7873615767550544, + "grad_norm": 4.1875, + "learning_rate": 5.480447796853141e-06, + "loss": 0.165, + "step": 7750 + }, + { + "epoch": 0.7883775271766738, + "grad_norm": 5.3125, + "learning_rate": 5.430182995453756e-06, + "loss": 0.1499, + "step": 7760 + }, + { + "epoch": 0.7893934775982933, + "grad_norm": 2.1875, + "learning_rate": 5.380121664215329e-06, + "loss": 0.1559, + "step": 7770 + }, + { + "epoch": 0.7904094280199127, + "grad_norm": 1.46875, + "learning_rate": 5.330264323632611e-06, + "loss": 0.2098, + "step": 7780 + }, + { + "epoch": 0.791425378441532, + "grad_norm": 4.65625, + "learning_rate": 5.280611492079449e-06, + "loss": 0.1776, + "step": 7790 + }, + { + "epoch": 0.7924413288631514, + "grad_norm": 1.3359375, + "learning_rate": 5.231163685803361e-06, + "loss": 0.1497, + "step": 7800 + }, + { + "epoch": 0.7934572792847709, + "grad_norm": 2.640625, + "learning_rate": 5.181921418920191e-06, + "loss": 0.12, + "step": 7810 + }, + { + "epoch": 0.7944732297063903, + "grad_norm": 2.328125, + "learning_rate": 5.13288520340878e-06, + "loss": 0.1981, + "step": 7820 + }, + { + "epoch": 0.7954891801280097, + "grad_norm": 3.0625, + "learning_rate": 5.084055549105596e-06, + "loss": 0.1389, + "step": 7830 + }, + { + "epoch": 0.7965051305496291, + "grad_norm": 2.796875, + "learning_rate": 5.035432963699479e-06, + "loss": 0.2293, + "step": 7840 + }, + { + "epoch": 0.7975210809712486, + "grad_norm": 5.0625, + "learning_rate": 4.98701795272635e-06, + "loss": 0.1618, + "step": 7850 + }, + { + "epoch": 0.798537031392868, + "grad_norm": 5.09375, + "learning_rate": 4.938811019563938e-06, + "loss": 0.1755, + "step": 7860 + }, + { + "epoch": 0.7995529818144874, + "grad_norm": 2.140625, + "learning_rate": 4.8908126654265475e-06, + "loss": 0.1565, + "step": 7870 + }, + { + "epoch": 0.8005689322361069, + "grad_norm": 0.76171875, + "learning_rate": 4.843023389359885e-06, + "loss": 0.2176, + "step": 7880 + }, + { + "epoch": 0.8015848826577263, + "grad_norm": 2.625, + "learning_rate": 4.79544368823581e-06, + "loss": 0.2013, + "step": 7890 + }, + { + "epoch": 0.8026008330793457, + "grad_norm": 2.078125, + "learning_rate": 4.748074056747234e-06, + "loss": 0.1246, + "step": 7900 + }, + { + "epoch": 0.8036167835009651, + "grad_norm": 3.5, + "learning_rate": 4.700914987402919e-06, + "loss": 0.1638, + "step": 7910 + }, + { + "epoch": 0.8046327339225846, + "grad_norm": 3.4375, + "learning_rate": 4.6539669705223916e-06, + "loss": 0.2213, + "step": 7920 + }, + { + "epoch": 0.805648684344204, + "grad_norm": 2.96875, + "learning_rate": 4.607230494230849e-06, + "loss": 0.1822, + "step": 7930 + }, + { + "epoch": 0.8066646347658234, + "grad_norm": 2.359375, + "learning_rate": 4.560706044454047e-06, + "loss": 0.1763, + "step": 7940 + }, + { + "epoch": 0.8076805851874429, + "grad_norm": 4.59375, + "learning_rate": 4.514394104913291e-06, + "loss": 0.234, + "step": 7950 + }, + { + "epoch": 0.8086965356090623, + "grad_norm": 1.96875, + "learning_rate": 4.468295157120372e-06, + "loss": 0.1939, + "step": 7960 + }, + { + "epoch": 0.8097124860306817, + "grad_norm": 2.578125, + "learning_rate": 4.422409680372594e-06, + "loss": 0.174, + "step": 7970 + }, + { + "epoch": 0.8107284364523011, + "grad_norm": 4.5625, + "learning_rate": 4.3767381517477505e-06, + "loss": 0.2375, + "step": 7980 + }, + { + "epoch": 0.8117443868739206, + "grad_norm": 0.9609375, + "learning_rate": 4.331281046099203e-06, + "loss": 0.2076, + "step": 7990 + }, + { + "epoch": 0.81276033729554, + "grad_norm": 6.0625, + "learning_rate": 4.286038836050929e-06, + "loss": 0.2504, + "step": 8000 + }, + { + "epoch": 0.8137762877171594, + "grad_norm": 3.484375, + "learning_rate": 4.241011991992586e-06, + "loss": 0.2102, + "step": 8010 + }, + { + "epoch": 0.8147922381387789, + "grad_norm": 1.9765625, + "learning_rate": 4.1962009820746635e-06, + "loss": 0.1846, + "step": 8020 + }, + { + "epoch": 0.8158081885603983, + "grad_norm": 1.875, + "learning_rate": 4.15160627220357e-06, + "loss": 0.1741, + "step": 8030 + }, + { + "epoch": 0.8168241389820177, + "grad_norm": 5.5625, + "learning_rate": 4.107228326036838e-06, + "loss": 0.2078, + "step": 8040 + }, + { + "epoch": 0.8178400894036371, + "grad_norm": 1.7578125, + "learning_rate": 4.063067604978252e-06, + "loss": 0.212, + "step": 8050 + }, + { + "epoch": 0.8188560398252566, + "grad_norm": 4.09375, + "learning_rate": 4.019124568173094e-06, + "loss": 0.1831, + "step": 8060 + }, + { + "epoch": 0.819871990246876, + "grad_norm": 6.625, + "learning_rate": 3.975399672503341e-06, + "loss": 0.2196, + "step": 8070 + }, + { + "epoch": 0.8208879406684954, + "grad_norm": 2.78125, + "learning_rate": 3.931893372582943e-06, + "loss": 0.2002, + "step": 8080 + }, + { + "epoch": 0.8219038910901149, + "grad_norm": 6.90625, + "learning_rate": 3.888606120753047e-06, + "loss": 0.2138, + "step": 8090 + }, + { + "epoch": 0.8229198415117343, + "grad_norm": 4.09375, + "learning_rate": 3.845538367077362e-06, + "loss": 0.2593, + "step": 8100 + }, + { + "epoch": 0.8239357919333536, + "grad_norm": 1.859375, + "learning_rate": 3.8026905593374213e-06, + "loss": 0.2062, + "step": 8110 + }, + { + "epoch": 0.824951742354973, + "grad_norm": 4.3125, + "learning_rate": 3.760063143027945e-06, + "loss": 0.1343, + "step": 8120 + }, + { + "epoch": 0.8259676927765925, + "grad_norm": 1.984375, + "learning_rate": 3.7176565613522313e-06, + "loss": 0.2494, + "step": 8130 + }, + { + "epoch": 0.8269836431982119, + "grad_norm": 3.71875, + "learning_rate": 3.675471255217516e-06, + "loss": 0.1502, + "step": 8140 + }, + { + "epoch": 0.8279995936198313, + "grad_norm": 2.359375, + "learning_rate": 3.6335076632304175e-06, + "loss": 0.1256, + "step": 8150 + }, + { + "epoch": 0.8290155440414507, + "grad_norm": 1.46875, + "learning_rate": 3.5917662216923332e-06, + "loss": 0.1709, + "step": 8160 + }, + { + "epoch": 0.8300314944630702, + "grad_norm": 2.78125, + "learning_rate": 3.550247364594958e-06, + "loss": 0.1881, + "step": 8170 + }, + { + "epoch": 0.8310474448846896, + "grad_norm": 1.0703125, + "learning_rate": 3.508951523615725e-06, + "loss": 0.1998, + "step": 8180 + }, + { + "epoch": 0.832063395306309, + "grad_norm": 2.40625, + "learning_rate": 3.467879128113352e-06, + "loss": 0.2429, + "step": 8190 + }, + { + "epoch": 0.8330793457279285, + "grad_norm": 2.609375, + "learning_rate": 3.427030605123352e-06, + "loss": 0.1942, + "step": 8200 + }, + { + "epoch": 0.8340952961495479, + "grad_norm": 1.6015625, + "learning_rate": 3.3864063793536043e-06, + "loss": 0.1898, + "step": 8210 + }, + { + "epoch": 0.8351112465711673, + "grad_norm": 5.375, + "learning_rate": 3.3460068731799577e-06, + "loss": 0.1919, + "step": 8220 + }, + { + "epoch": 0.8361271969927867, + "grad_norm": 3.3125, + "learning_rate": 3.3058325066417818e-06, + "loss": 0.1516, + "step": 8230 + }, + { + "epoch": 0.8371431474144062, + "grad_norm": 0.76171875, + "learning_rate": 3.26588369743768e-06, + "loss": 0.1068, + "step": 8240 + }, + { + "epoch": 0.8381590978360256, + "grad_norm": 3.171875, + "learning_rate": 3.2261608609210653e-06, + "loss": 0.1203, + "step": 8250 + }, + { + "epoch": 0.839175048257645, + "grad_norm": 2.359375, + "learning_rate": 3.186664410095913e-06, + "loss": 0.2172, + "step": 8260 + }, + { + "epoch": 0.8401909986792645, + "grad_norm": 3.328125, + "learning_rate": 3.1473947556124093e-06, + "loss": 0.1249, + "step": 8270 + }, + { + "epoch": 0.8412069491008839, + "grad_norm": 2.484375, + "learning_rate": 3.1083523057627213e-06, + "loss": 0.1744, + "step": 8280 + }, + { + "epoch": 0.8422228995225033, + "grad_norm": 4.46875, + "learning_rate": 3.0695374664767353e-06, + "loss": 0.1772, + "step": 8290 + }, + { + "epoch": 0.8432388499441227, + "grad_norm": 0.59375, + "learning_rate": 3.0309506413178397e-06, + "loss": 0.2302, + "step": 8300 + }, + { + "epoch": 0.8442548003657422, + "grad_norm": 2.390625, + "learning_rate": 2.9925922314787136e-06, + "loss": 0.1635, + "step": 8310 + }, + { + "epoch": 0.8452707507873616, + "grad_norm": 2.34375, + "learning_rate": 2.954462635777194e-06, + "loss": 0.1573, + "step": 8320 + }, + { + "epoch": 0.846286701208981, + "grad_norm": 2.015625, + "learning_rate": 2.916562250652083e-06, + "loss": 0.1608, + "step": 8330 + }, + { + "epoch": 0.8473026516306005, + "grad_norm": 4.125, + "learning_rate": 2.878891470159048e-06, + "loss": 0.184, + "step": 8340 + }, + { + "epoch": 0.8483186020522199, + "grad_norm": 2.515625, + "learning_rate": 2.8414506859665514e-06, + "loss": 0.2141, + "step": 8350 + }, + { + "epoch": 0.8493345524738393, + "grad_norm": 3.375, + "learning_rate": 2.8042402873517197e-06, + "loss": 0.1729, + "step": 8360 + }, + { + "epoch": 0.8503505028954587, + "grad_norm": 3.078125, + "learning_rate": 2.76726066119635e-06, + "loss": 0.2252, + "step": 8370 + }, + { + "epoch": 0.8513664533170782, + "grad_norm": 1.5390625, + "learning_rate": 2.730512191982845e-06, + "loss": 0.1644, + "step": 8380 + }, + { + "epoch": 0.8523824037386976, + "grad_norm": 1.9296875, + "learning_rate": 2.693995261790261e-06, + "loss": 0.1822, + "step": 8390 + }, + { + "epoch": 0.853398354160317, + "grad_norm": 3.3125, + "learning_rate": 2.657710250290285e-06, + "loss": 0.2068, + "step": 8400 + }, + { + "epoch": 0.8544143045819365, + "grad_norm": 0.640625, + "learning_rate": 2.621657534743327e-06, + "loss": 0.1224, + "step": 8410 + }, + { + "epoch": 0.8554302550035559, + "grad_norm": 3.421875, + "learning_rate": 2.5858374899945804e-06, + "loss": 0.179, + "step": 8420 + }, + { + "epoch": 0.8564462054251752, + "grad_norm": 3.484375, + "learning_rate": 2.550250488470135e-06, + "loss": 0.1873, + "step": 8430 + }, + { + "epoch": 0.8574621558467946, + "grad_norm": 3.984375, + "learning_rate": 2.5148969001730806e-06, + "loss": 0.1799, + "step": 8440 + }, + { + "epoch": 0.8584781062684141, + "grad_norm": 1.375, + "learning_rate": 2.4797770926796858e-06, + "loss": 0.176, + "step": 8450 + }, + { + "epoch": 0.8594940566900335, + "grad_norm": 1.8984375, + "learning_rate": 2.444891431135571e-06, + "loss": 0.1664, + "step": 8460 + }, + { + "epoch": 0.8605100071116529, + "grad_norm": 4.15625, + "learning_rate": 2.4102402782518936e-06, + "loss": 0.1512, + "step": 8470 + }, + { + "epoch": 0.8615259575332723, + "grad_norm": 1.34375, + "learning_rate": 2.3758239943016096e-06, + "loss": 0.1629, + "step": 8480 + }, + { + "epoch": 0.8625419079548918, + "grad_norm": 5.3125, + "learning_rate": 2.3416429371157013e-06, + "loss": 0.2099, + "step": 8490 + }, + { + "epoch": 0.8635578583765112, + "grad_norm": 5.9375, + "learning_rate": 2.307697462079464e-06, + "loss": 0.2221, + "step": 8500 + } + ], + "logging_steps": 10, + "max_steps": 9843, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoints/checkpoint-8500/training_args.bin b/checkpoints/checkpoint-8500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bb0a4ec10ab29ca1942aecdfa4212d352e373f1 --- /dev/null +++ b/checkpoints/checkpoint-8500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 +size 4984 diff --git a/checkpoints/checkpoint-9000/adapter_config.json b/checkpoints/checkpoint-9000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70 --- /dev/null +++ b/checkpoints/checkpoint-9000/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "gaussian", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoints/checkpoint-9000/adapter_model.safetensors b/checkpoints/checkpoint-9000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8527f2c40f383b1cf651d36a2984f5ff825b8ae2 --- /dev/null +++ b/checkpoints/checkpoint-9000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33bdc4e327ac26650f40cb5930deff06b7f48707f893ee77c1175a4a65f6909b +size 5919456 diff --git a/checkpoints/checkpoint-9000/optimizer.pt b/checkpoints/checkpoint-9000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6bc1be7888a16d8c4b7f04fab92dd9b007b67cc --- /dev/null +++ b/checkpoints/checkpoint-9000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0638566e76474065be273b543ece321c7e8e004ee51a20c1a1d52b2838e78bb6 +size 11930938 diff --git a/checkpoints/checkpoint-9000/rng_state_0.pth b/checkpoints/checkpoint-9000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..2f6ccedf895c2d6ac8b9d081077455c1e01eb40b --- /dev/null +++ b/checkpoints/checkpoint-9000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9ca1c49d5b46a7f77afead045cfb6ddf181a9c78168c26c7eef763523361557 +size 15024 diff --git a/checkpoints/checkpoint-9000/rng_state_1.pth b/checkpoints/checkpoint-9000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..a9526156d7a2b5afe37477f57c9c8c5878b57db6 --- /dev/null +++ b/checkpoints/checkpoint-9000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:321d28c2b0ca1e12bac7c5601220acbf51dba57cb5c81444c67560204eeb23e8 +size 15024 diff --git a/checkpoints/checkpoint-9000/rng_state_2.pth b/checkpoints/checkpoint-9000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..cf5c72ead4046664eac6220c25e849e4fd4ae199 --- /dev/null +++ b/checkpoints/checkpoint-9000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dbe2075823c34d4e2ddbb8628dd2dfb1329f66b7c4225949042b1ffb085b4c4 +size 15024 diff --git a/checkpoints/checkpoint-9000/rng_state_3.pth b/checkpoints/checkpoint-9000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..83bd913f93637b92c9974abbc175838c2fc91877 --- /dev/null +++ b/checkpoints/checkpoint-9000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94cf730a64bd09032ea105de0e172f948a4a2bd002a9ce85d137729cfbbd0578 +size 15024 diff --git a/checkpoints/checkpoint-9000/scheduler.pt b/checkpoints/checkpoint-9000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..581c38180d342706ff7b6c9ed5c479819b60323e --- /dev/null +++ b/checkpoints/checkpoint-9000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:777ecd9b7cc3e4361056b976c635577f61b77b3bbef2bb60189849a484abfe13 +size 1064 diff --git a/checkpoints/checkpoint-9000/trainer_state.json b/checkpoints/checkpoint-9000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ef0193a85dc4d73484dbcd0610b17b8f9fd78a14 --- /dev/null +++ b/checkpoints/checkpoint-9000/trainer_state.json @@ -0,0 +1,6321 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9143553794574825, + "eval_steps": 500, + "global_step": 9000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001015950421619425, + "grad_norm": 17.625, + "learning_rate": 5e-06, + "loss": 3.4264, + "step": 10 + }, + { + "epoch": 0.00203190084323885, + "grad_norm": 12.5625, + "learning_rate": 1e-05, + "loss": 3.432, + "step": 20 + }, + { + "epoch": 0.003047851264858275, + "grad_norm": 14.0625, + "learning_rate": 1.5e-05, + "loss": 3.23, + "step": 30 + }, + { + "epoch": 0.0040638016864777, + "grad_norm": 12.4375, + "learning_rate": 2e-05, + "loss": 2.9762, + "step": 40 + }, + { + "epoch": 0.005079752108097125, + "grad_norm": 10.0625, + "learning_rate": 2.5e-05, + "loss": 2.6173, + "step": 50 + }, + { + "epoch": 0.00609570252971655, + "grad_norm": 10.1875, + "learning_rate": 3e-05, + "loss": 2.2004, + "step": 60 + }, + { + "epoch": 0.007111652951335975, + "grad_norm": 7.03125, + "learning_rate": 3.5e-05, + "loss": 1.4176, + "step": 70 + }, + { + "epoch": 0.0081276033729554, + "grad_norm": 4.375, + "learning_rate": 4e-05, + "loss": 1.0122, + "step": 80 + }, + { + "epoch": 0.009143553794574825, + "grad_norm": 6.5625, + "learning_rate": 4.5e-05, + "loss": 0.9116, + "step": 90 + }, + { + "epoch": 0.01015950421619425, + "grad_norm": 5.28125, + "learning_rate": 5e-05, + "loss": 0.6832, + "step": 100 + }, + { + "epoch": 0.011175454637813675, + "grad_norm": 5.5, + "learning_rate": 4.9999870035728426e-05, + "loss": 0.7355, + "step": 110 + }, + { + "epoch": 0.0121914050594331, + "grad_norm": 5.1875, + "learning_rate": 4.9999480144264944e-05, + "loss": 0.6673, + "step": 120 + }, + { + "epoch": 0.013207355481052525, + "grad_norm": 4.5, + "learning_rate": 4.9998830329663314e-05, + "loss": 0.6792, + "step": 130 + }, + { + "epoch": 0.01422330590267195, + "grad_norm": 3.9375, + "learning_rate": 4.9997920598679756e-05, + "loss": 0.6207, + "step": 140 + }, + { + "epoch": 0.015239256324291375, + "grad_norm": 3.15625, + "learning_rate": 4.999675096077286e-05, + "loss": 0.483, + "step": 150 + }, + { + "epoch": 0.0162552067459108, + "grad_norm": 5.28125, + "learning_rate": 4.999532142810354e-05, + "loss": 0.5319, + "step": 160 + }, + { + "epoch": 0.017271157167530225, + "grad_norm": 4.59375, + "learning_rate": 4.999363201553483e-05, + "loss": 0.6052, + "step": 170 + }, + { + "epoch": 0.01828710758914965, + "grad_norm": 5.03125, + "learning_rate": 4.9991682740631794e-05, + "loss": 0.4258, + "step": 180 + }, + { + "epoch": 0.019303058010769075, + "grad_norm": 3.859375, + "learning_rate": 4.998947362366133e-05, + "loss": 0.4309, + "step": 190 + }, + { + "epoch": 0.0203190084323885, + "grad_norm": 3.328125, + "learning_rate": 4.998700468759193e-05, + "loss": 0.3957, + "step": 200 + }, + { + "epoch": 0.021334958854007924, + "grad_norm": 4.9375, + "learning_rate": 4.9984275958093475e-05, + "loss": 0.4777, + "step": 210 + }, + { + "epoch": 0.02235090927562735, + "grad_norm": 4.78125, + "learning_rate": 4.998128746353695e-05, + "loss": 0.3549, + "step": 220 + }, + { + "epoch": 0.023366859697246774, + "grad_norm": 4.0625, + "learning_rate": 4.997803923499417e-05, + "loss": 0.4447, + "step": 230 + }, + { + "epoch": 0.0243828101188662, + "grad_norm": 6.375, + "learning_rate": 4.99745313062374e-05, + "loss": 0.3808, + "step": 240 + }, + { + "epoch": 0.025398760540485624, + "grad_norm": 3.59375, + "learning_rate": 4.99707637137391e-05, + "loss": 0.3827, + "step": 250 + }, + { + "epoch": 0.02641471096210505, + "grad_norm": 3.015625, + "learning_rate": 4.996673649667145e-05, + "loss": 0.3694, + "step": 260 + }, + { + "epoch": 0.027430661383724474, + "grad_norm": 2.296875, + "learning_rate": 4.9962449696906e-05, + "loss": 0.3586, + "step": 270 + }, + { + "epoch": 0.0284466118053439, + "grad_norm": 4.125, + "learning_rate": 4.9957903359013214e-05, + "loss": 0.3832, + "step": 280 + }, + { + "epoch": 0.029462562226963324, + "grad_norm": 3.296875, + "learning_rate": 4.995309753026201e-05, + "loss": 0.328, + "step": 290 + }, + { + "epoch": 0.03047851264858275, + "grad_norm": 4.5, + "learning_rate": 4.994803226061927e-05, + "loss": 0.3667, + "step": 300 + }, + { + "epoch": 0.03149446307020217, + "grad_norm": 4.3125, + "learning_rate": 4.994270760274933e-05, + "loss": 0.3811, + "step": 310 + }, + { + "epoch": 0.0325104134918216, + "grad_norm": 3.421875, + "learning_rate": 4.99371236120134e-05, + "loss": 0.3065, + "step": 320 + }, + { + "epoch": 0.03352636391344102, + "grad_norm": 4.6875, + "learning_rate": 4.993128034646902e-05, + "loss": 0.4177, + "step": 330 + }, + { + "epoch": 0.03454231433506045, + "grad_norm": 3.046875, + "learning_rate": 4.992517786686947e-05, + "loss": 0.33, + "step": 340 + }, + { + "epoch": 0.03555826475667987, + "grad_norm": 1.8828125, + "learning_rate": 4.9918816236663077e-05, + "loss": 0.3287, + "step": 350 + }, + { + "epoch": 0.0365742151782993, + "grad_norm": 3.8125, + "learning_rate": 4.991219552199262e-05, + "loss": 0.2934, + "step": 360 + }, + { + "epoch": 0.03759016559991872, + "grad_norm": 4.28125, + "learning_rate": 4.99053157916946e-05, + "loss": 0.3176, + "step": 370 + }, + { + "epoch": 0.03860611602153815, + "grad_norm": 2.609375, + "learning_rate": 4.989817711729856e-05, + "loss": 0.3318, + "step": 380 + }, + { + "epoch": 0.03962206644315757, + "grad_norm": 2.375, + "learning_rate": 4.98907795730263e-05, + "loss": 0.3234, + "step": 390 + }, + { + "epoch": 0.040638016864777, + "grad_norm": 4.46875, + "learning_rate": 4.988312323579114e-05, + "loss": 0.267, + "step": 400 + }, + { + "epoch": 0.04165396728639642, + "grad_norm": 3.75, + "learning_rate": 4.98752081851971e-05, + "loss": 0.3081, + "step": 410 + }, + { + "epoch": 0.04266991770801585, + "grad_norm": 2.203125, + "learning_rate": 4.986703450353809e-05, + "loss": 0.2917, + "step": 420 + }, + { + "epoch": 0.04368586812963527, + "grad_norm": 1.6015625, + "learning_rate": 4.985860227579703e-05, + "loss": 0.2805, + "step": 430 + }, + { + "epoch": 0.0447018185512547, + "grad_norm": 3.140625, + "learning_rate": 4.984991158964499e-05, + "loss": 0.3534, + "step": 440 + }, + { + "epoch": 0.04571776897287412, + "grad_norm": 3.296875, + "learning_rate": 4.9840962535440265e-05, + "loss": 0.335, + "step": 450 + }, + { + "epoch": 0.04673371939449355, + "grad_norm": 3.25, + "learning_rate": 4.983175520622744e-05, + "loss": 0.2544, + "step": 460 + }, + { + "epoch": 0.04774966981611297, + "grad_norm": 2.25, + "learning_rate": 4.982228969773642e-05, + "loss": 0.3449, + "step": 470 + }, + { + "epoch": 0.0487656202377324, + "grad_norm": 4.9375, + "learning_rate": 4.9812566108381435e-05, + "loss": 0.2964, + "step": 480 + }, + { + "epoch": 0.04978157065935182, + "grad_norm": 1.5703125, + "learning_rate": 4.9802584539260035e-05, + "loss": 0.2799, + "step": 490 + }, + { + "epoch": 0.05079752108097125, + "grad_norm": 2.828125, + "learning_rate": 4.979234509415199e-05, + "loss": 0.3231, + "step": 500 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 2.9375, + "learning_rate": 4.978184787951828e-05, + "loss": 0.2943, + "step": 510 + }, + { + "epoch": 0.0528294219242101, + "grad_norm": 2.34375, + "learning_rate": 4.977109300449992e-05, + "loss": 0.2705, + "step": 520 + }, + { + "epoch": 0.05384537234582952, + "grad_norm": 3.140625, + "learning_rate": 4.9760080580916876e-05, + "loss": 0.2998, + "step": 530 + }, + { + "epoch": 0.05486132276744895, + "grad_norm": 3.5625, + "learning_rate": 4.974881072326688e-05, + "loss": 0.2595, + "step": 540 + }, + { + "epoch": 0.05587727318906837, + "grad_norm": 4.25, + "learning_rate": 4.9737283548724236e-05, + "loss": 0.2803, + "step": 550 + }, + { + "epoch": 0.0568932236106878, + "grad_norm": 4.0625, + "learning_rate": 4.97254991771386e-05, + "loss": 0.3511, + "step": 560 + }, + { + "epoch": 0.05790917403230722, + "grad_norm": 2.515625, + "learning_rate": 4.971345773103377e-05, + "loss": 0.312, + "step": 570 + }, + { + "epoch": 0.05892512445392665, + "grad_norm": 3.21875, + "learning_rate": 4.9701159335606365e-05, + "loss": 0.2482, + "step": 580 + }, + { + "epoch": 0.05994107487554607, + "grad_norm": 5.5, + "learning_rate": 4.968860411872454e-05, + "loss": 0.2537, + "step": 590 + }, + { + "epoch": 0.0609570252971655, + "grad_norm": 3.546875, + "learning_rate": 4.967579221092666e-05, + "loss": 0.3125, + "step": 600 + }, + { + "epoch": 0.06197297571878492, + "grad_norm": 2.984375, + "learning_rate": 4.966272374541996e-05, + "loss": 0.2354, + "step": 610 + }, + { + "epoch": 0.06298892614040434, + "grad_norm": 3.6875, + "learning_rate": 4.964939885807912e-05, + "loss": 0.3213, + "step": 620 + }, + { + "epoch": 0.06400487656202378, + "grad_norm": 2.140625, + "learning_rate": 4.9635817687444876e-05, + "loss": 0.3003, + "step": 630 + }, + { + "epoch": 0.0650208269836432, + "grad_norm": 3.484375, + "learning_rate": 4.962198037472259e-05, + "loss": 0.2996, + "step": 640 + }, + { + "epoch": 0.06603677740526262, + "grad_norm": 3.21875, + "learning_rate": 4.9607887063780776e-05, + "loss": 0.2257, + "step": 650 + }, + { + "epoch": 0.06705272782688204, + "grad_norm": 5.375, + "learning_rate": 4.9593537901149564e-05, + "loss": 0.223, + "step": 660 + }, + { + "epoch": 0.06806867824850148, + "grad_norm": 4.1875, + "learning_rate": 4.957893303601924e-05, + "loss": 0.3407, + "step": 670 + }, + { + "epoch": 0.0690846286701209, + "grad_norm": 3.328125, + "learning_rate": 4.956407262023866e-05, + "loss": 0.2589, + "step": 680 + }, + { + "epoch": 0.07010057909174032, + "grad_norm": 2.953125, + "learning_rate": 4.954895680831367e-05, + "loss": 0.2949, + "step": 690 + }, + { + "epoch": 0.07111652951335974, + "grad_norm": 4.0625, + "learning_rate": 4.9533585757405506e-05, + "loss": 0.2995, + "step": 700 + }, + { + "epoch": 0.07213247993497918, + "grad_norm": 4.625, + "learning_rate": 4.951795962732917e-05, + "loss": 0.2894, + "step": 710 + }, + { + "epoch": 0.0731484303565986, + "grad_norm": 3.0, + "learning_rate": 4.9502078580551755e-05, + "loss": 0.3082, + "step": 720 + }, + { + "epoch": 0.07416438077821802, + "grad_norm": 3.65625, + "learning_rate": 4.9485942782190734e-05, + "loss": 0.2308, + "step": 730 + }, + { + "epoch": 0.07518033119983744, + "grad_norm": 4.78125, + "learning_rate": 4.9469552400012306e-05, + "loss": 0.2272, + "step": 740 + }, + { + "epoch": 0.07619628162145688, + "grad_norm": 4.25, + "learning_rate": 4.94529076044296e-05, + "loss": 0.2701, + "step": 750 + }, + { + "epoch": 0.0772122320430763, + "grad_norm": 3.140625, + "learning_rate": 4.94360085685009e-05, + "loss": 0.2686, + "step": 760 + }, + { + "epoch": 0.07822818246469572, + "grad_norm": 0.765625, + "learning_rate": 4.9418855467927894e-05, + "loss": 0.2051, + "step": 770 + }, + { + "epoch": 0.07924413288631514, + "grad_norm": 1.796875, + "learning_rate": 4.940144848105379e-05, + "loss": 0.2267, + "step": 780 + }, + { + "epoch": 0.08026008330793458, + "grad_norm": 4.5625, + "learning_rate": 4.93837877888615e-05, + "loss": 0.2597, + "step": 790 + }, + { + "epoch": 0.081276033729554, + "grad_norm": 3.03125, + "learning_rate": 4.9365873574971745e-05, + "loss": 0.3701, + "step": 800 + }, + { + "epoch": 0.08229198415117342, + "grad_norm": 4.5625, + "learning_rate": 4.9347706025641136e-05, + "loss": 0.2559, + "step": 810 + }, + { + "epoch": 0.08330793457279284, + "grad_norm": 3.90625, + "learning_rate": 4.9329285329760275e-05, + "loss": 0.2799, + "step": 820 + }, + { + "epoch": 0.08432388499441228, + "grad_norm": 3.140625, + "learning_rate": 4.9310611678851735e-05, + "loss": 0.2866, + "step": 830 + }, + { + "epoch": 0.0853398354160317, + "grad_norm": 2.46875, + "learning_rate": 4.929168526706811e-05, + "loss": 0.3105, + "step": 840 + }, + { + "epoch": 0.08635578583765112, + "grad_norm": 13.625, + "learning_rate": 4.927250629119e-05, + "loss": 0.2454, + "step": 850 + }, + { + "epoch": 0.08737173625927054, + "grad_norm": 3.921875, + "learning_rate": 4.9253074950623925e-05, + "loss": 0.2424, + "step": 860 + }, + { + "epoch": 0.08838768668088998, + "grad_norm": 2.90625, + "learning_rate": 4.9233391447400286e-05, + "loss": 0.2481, + "step": 870 + }, + { + "epoch": 0.0894036371025094, + "grad_norm": 2.96875, + "learning_rate": 4.921345598617125e-05, + "loss": 0.2231, + "step": 880 + }, + { + "epoch": 0.09041958752412882, + "grad_norm": 5.375, + "learning_rate": 4.9193268774208654e-05, + "loss": 0.3447, + "step": 890 + }, + { + "epoch": 0.09143553794574824, + "grad_norm": 2.0, + "learning_rate": 4.9172830021401785e-05, + "loss": 0.229, + "step": 900 + }, + { + "epoch": 0.09245148836736768, + "grad_norm": 3.1875, + "learning_rate": 4.9152139940255245e-05, + "loss": 0.2122, + "step": 910 + }, + { + "epoch": 0.0934674387889871, + "grad_norm": 3.40625, + "learning_rate": 4.913119874588677e-05, + "loss": 0.2386, + "step": 920 + }, + { + "epoch": 0.09448338921060652, + "grad_norm": 1.4609375, + "learning_rate": 4.911000665602489e-05, + "loss": 0.1944, + "step": 930 + }, + { + "epoch": 0.09549933963222594, + "grad_norm": 5.0625, + "learning_rate": 4.9088563891006786e-05, + "loss": 0.2038, + "step": 940 + }, + { + "epoch": 0.09651529005384538, + "grad_norm": 4.53125, + "learning_rate": 4.906687067377592e-05, + "loss": 0.3122, + "step": 950 + }, + { + "epoch": 0.0975312404754648, + "grad_norm": 2.84375, + "learning_rate": 4.904492722987976e-05, + "loss": 0.3157, + "step": 960 + }, + { + "epoch": 0.09854719089708422, + "grad_norm": 2.171875, + "learning_rate": 4.902273378746738e-05, + "loss": 0.3077, + "step": 970 + }, + { + "epoch": 0.09956314131870364, + "grad_norm": 2.84375, + "learning_rate": 4.9000290577287165e-05, + "loss": 0.2756, + "step": 980 + }, + { + "epoch": 0.10057909174032308, + "grad_norm": 0.99609375, + "learning_rate": 4.897759783268434e-05, + "loss": 0.2915, + "step": 990 + }, + { + "epoch": 0.1015950421619425, + "grad_norm": 3.53125, + "learning_rate": 4.895465578959859e-05, + "loss": 0.2052, + "step": 1000 + }, + { + "epoch": 0.10261099258356192, + "grad_norm": 4.0, + "learning_rate": 4.893146468656159e-05, + "loss": 0.2499, + "step": 1010 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 1.65625, + "learning_rate": 4.890802476469452e-05, + "loss": 0.278, + "step": 1020 + }, + { + "epoch": 0.10464289342680078, + "grad_norm": 3.625, + "learning_rate": 4.888433626770558e-05, + "loss": 0.2143, + "step": 1030 + }, + { + "epoch": 0.1056588438484202, + "grad_norm": 5.0625, + "learning_rate": 4.886039944188741e-05, + "loss": 0.2878, + "step": 1040 + }, + { + "epoch": 0.10667479427003962, + "grad_norm": 4.5, + "learning_rate": 4.883621453611461e-05, + "loss": 0.2744, + "step": 1050 + }, + { + "epoch": 0.10769074469165904, + "grad_norm": 4.5625, + "learning_rate": 4.881178180184106e-05, + "loss": 0.2734, + "step": 1060 + }, + { + "epoch": 0.10870669511327848, + "grad_norm": 3.125, + "learning_rate": 4.878710149309735e-05, + "loss": 0.3574, + "step": 1070 + }, + { + "epoch": 0.1097226455348979, + "grad_norm": 3.0625, + "learning_rate": 4.876217386648816e-05, + "loss": 0.2625, + "step": 1080 + }, + { + "epoch": 0.11073859595651732, + "grad_norm": 4.0625, + "learning_rate": 4.873699918118955e-05, + "loss": 0.2437, + "step": 1090 + }, + { + "epoch": 0.11175454637813674, + "grad_norm": 1.59375, + "learning_rate": 4.87115776989463e-05, + "loss": 0.2051, + "step": 1100 + }, + { + "epoch": 0.11277049679975618, + "grad_norm": 4.375, + "learning_rate": 4.8685909684069153e-05, + "loss": 0.1727, + "step": 1110 + }, + { + "epoch": 0.1137864472213756, + "grad_norm": 2.28125, + "learning_rate": 4.865999540343211e-05, + "loss": 0.2256, + "step": 1120 + }, + { + "epoch": 0.11480239764299502, + "grad_norm": 2.265625, + "learning_rate": 4.86338351264696e-05, + "loss": 0.3529, + "step": 1130 + }, + { + "epoch": 0.11581834806461444, + "grad_norm": 2.34375, + "learning_rate": 4.8607429125173754e-05, + "loss": 0.2113, + "step": 1140 + }, + { + "epoch": 0.11683429848623388, + "grad_norm": 0.7578125, + "learning_rate": 4.858077767409149e-05, + "loss": 0.2759, + "step": 1150 + }, + { + "epoch": 0.1178502489078533, + "grad_norm": 3.640625, + "learning_rate": 4.855388105032174e-05, + "loss": 0.2482, + "step": 1160 + }, + { + "epoch": 0.11886619932947272, + "grad_norm": 3.5, + "learning_rate": 4.852673953351249e-05, + "loss": 0.1865, + "step": 1170 + }, + { + "epoch": 0.11988214975109214, + "grad_norm": 3.75, + "learning_rate": 4.849935340585796e-05, + "loss": 0.2659, + "step": 1180 + }, + { + "epoch": 0.12089810017271158, + "grad_norm": 3.375, + "learning_rate": 4.8471722952095586e-05, + "loss": 0.1506, + "step": 1190 + }, + { + "epoch": 0.121914050594331, + "grad_norm": 3.34375, + "learning_rate": 4.844384845950312e-05, + "loss": 0.307, + "step": 1200 + }, + { + "epoch": 0.12293000101595042, + "grad_norm": 1.578125, + "learning_rate": 4.841573021789561e-05, + "loss": 0.1952, + "step": 1210 + }, + { + "epoch": 0.12394595143756984, + "grad_norm": 1.2890625, + "learning_rate": 4.838736851962239e-05, + "loss": 0.1779, + "step": 1220 + }, + { + "epoch": 0.12496190185918928, + "grad_norm": 1.265625, + "learning_rate": 4.835876365956408e-05, + "loss": 0.1235, + "step": 1230 + }, + { + "epoch": 0.12597785228080868, + "grad_norm": 1.9609375, + "learning_rate": 4.8329915935129436e-05, + "loss": 0.1876, + "step": 1240 + }, + { + "epoch": 0.12699380270242813, + "grad_norm": 1.6328125, + "learning_rate": 4.830082564625235e-05, + "loss": 0.2188, + "step": 1250 + }, + { + "epoch": 0.12800975312404755, + "grad_norm": 3.96875, + "learning_rate": 4.8271493095388684e-05, + "loss": 0.2622, + "step": 1260 + }, + { + "epoch": 0.12902570354566698, + "grad_norm": 3.765625, + "learning_rate": 4.824191858751312e-05, + "loss": 0.2724, + "step": 1270 + }, + { + "epoch": 0.1300416539672864, + "grad_norm": 5.59375, + "learning_rate": 4.821210243011601e-05, + "loss": 0.2413, + "step": 1280 + }, + { + "epoch": 0.13105760438890582, + "grad_norm": 3.34375, + "learning_rate": 4.818204493320016e-05, + "loss": 0.2618, + "step": 1290 + }, + { + "epoch": 0.13207355481052524, + "grad_norm": 2.78125, + "learning_rate": 4.8151746409277634e-05, + "loss": 0.2295, + "step": 1300 + }, + { + "epoch": 0.13308950523214466, + "grad_norm": 3.1875, + "learning_rate": 4.8121207173366484e-05, + "loss": 0.2733, + "step": 1310 + }, + { + "epoch": 0.13410545565376408, + "grad_norm": 2.28125, + "learning_rate": 4.809042754298746e-05, + "loss": 0.2311, + "step": 1320 + }, + { + "epoch": 0.13512140607538353, + "grad_norm": 2.171875, + "learning_rate": 4.805940783816075e-05, + "loss": 0.2059, + "step": 1330 + }, + { + "epoch": 0.13613735649700295, + "grad_norm": 2.796875, + "learning_rate": 4.8028148381402625e-05, + "loss": 0.2102, + "step": 1340 + }, + { + "epoch": 0.13715330691862238, + "grad_norm": 2.96875, + "learning_rate": 4.7996649497722084e-05, + "loss": 0.2708, + "step": 1350 + }, + { + "epoch": 0.1381692573402418, + "grad_norm": 2.4375, + "learning_rate": 4.7964911514617485e-05, + "loss": 0.2429, + "step": 1360 + }, + { + "epoch": 0.13918520776186122, + "grad_norm": 5.8125, + "learning_rate": 4.793293476207312e-05, + "loss": 0.2725, + "step": 1370 + }, + { + "epoch": 0.14020115818348064, + "grad_norm": 2.40625, + "learning_rate": 4.790071957255585e-05, + "loss": 0.2098, + "step": 1380 + }, + { + "epoch": 0.14121710860510006, + "grad_norm": 4.25, + "learning_rate": 4.786826628101154e-05, + "loss": 0.2101, + "step": 1390 + }, + { + "epoch": 0.14223305902671948, + "grad_norm": 2.578125, + "learning_rate": 4.783557522486167e-05, + "loss": 0.2624, + "step": 1400 + }, + { + "epoch": 0.14324900944833893, + "grad_norm": 3.125, + "learning_rate": 4.780264674399978e-05, + "loss": 0.2518, + "step": 1410 + }, + { + "epoch": 0.14426495986995835, + "grad_norm": 3.671875, + "learning_rate": 4.7769481180787966e-05, + "loss": 0.3112, + "step": 1420 + }, + { + "epoch": 0.14528091029157778, + "grad_norm": 3.984375, + "learning_rate": 4.773607888005327e-05, + "loss": 0.2747, + "step": 1430 + }, + { + "epoch": 0.1462968607131972, + "grad_norm": 3.234375, + "learning_rate": 4.770244018908416e-05, + "loss": 0.1572, + "step": 1440 + }, + { + "epoch": 0.14731281113481662, + "grad_norm": 4.09375, + "learning_rate": 4.766856545762687e-05, + "loss": 0.2148, + "step": 1450 + }, + { + "epoch": 0.14832876155643604, + "grad_norm": 1.6875, + "learning_rate": 4.763445503788178e-05, + "loss": 0.2531, + "step": 1460 + }, + { + "epoch": 0.14934471197805546, + "grad_norm": 2.375, + "learning_rate": 4.760010928449976e-05, + "loss": 0.199, + "step": 1470 + }, + { + "epoch": 0.15036066239967488, + "grad_norm": 4.6875, + "learning_rate": 4.7565528554578485e-05, + "loss": 0.2366, + "step": 1480 + }, + { + "epoch": 0.15137661282129433, + "grad_norm": 5.4375, + "learning_rate": 4.75307132076587e-05, + "loss": 0.1862, + "step": 1490 + }, + { + "epoch": 0.15239256324291375, + "grad_norm": 2.484375, + "learning_rate": 4.749566360572049e-05, + "loss": 0.2143, + "step": 1500 + }, + { + "epoch": 0.15340851366453317, + "grad_norm": 2.1875, + "learning_rate": 4.746038011317955e-05, + "loss": 0.1877, + "step": 1510 + }, + { + "epoch": 0.1544244640861526, + "grad_norm": 2.84375, + "learning_rate": 4.742486309688333e-05, + "loss": 0.2831, + "step": 1520 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 2.015625, + "learning_rate": 4.738911292610732e-05, + "loss": 0.1708, + "step": 1530 + }, + { + "epoch": 0.15645636492939144, + "grad_norm": 3.953125, + "learning_rate": 4.735312997255107e-05, + "loss": 0.192, + "step": 1540 + }, + { + "epoch": 0.15747231535101086, + "grad_norm": 2.09375, + "learning_rate": 4.7316914610334475e-05, + "loss": 0.2586, + "step": 1550 + }, + { + "epoch": 0.15848826577263028, + "grad_norm": 3.6875, + "learning_rate": 4.728046721599378e-05, + "loss": 0.2141, + "step": 1560 + }, + { + "epoch": 0.15950421619424973, + "grad_norm": 2.9375, + "learning_rate": 4.724378816847771e-05, + "loss": 0.193, + "step": 1570 + }, + { + "epoch": 0.16052016661586915, + "grad_norm": 1.5625, + "learning_rate": 4.720687784914352e-05, + "loss": 0.191, + "step": 1580 + }, + { + "epoch": 0.16153611703748857, + "grad_norm": 3.75, + "learning_rate": 4.716973664175304e-05, + "loss": 0.2172, + "step": 1590 + }, + { + "epoch": 0.162552067459108, + "grad_norm": 3.125, + "learning_rate": 4.7132364932468645e-05, + "loss": 0.2134, + "step": 1600 + }, + { + "epoch": 0.16356801788072742, + "grad_norm": 4.09375, + "learning_rate": 4.709476310984932e-05, + "loss": 0.2055, + "step": 1610 + }, + { + "epoch": 0.16458396830234684, + "grad_norm": 3.875, + "learning_rate": 4.705693156484652e-05, + "loss": 0.2136, + "step": 1620 + }, + { + "epoch": 0.16559991872396626, + "grad_norm": 1.1796875, + "learning_rate": 4.7018870690800196e-05, + "loss": 0.1471, + "step": 1630 + }, + { + "epoch": 0.16661586914558568, + "grad_norm": 2.5, + "learning_rate": 4.698058088343465e-05, + "loss": 0.2308, + "step": 1640 + }, + { + "epoch": 0.16763181956720513, + "grad_norm": 1.390625, + "learning_rate": 4.6942062540854425e-05, + "loss": 0.2456, + "step": 1650 + }, + { + "epoch": 0.16864776998882455, + "grad_norm": 3.125, + "learning_rate": 4.69033160635402e-05, + "loss": 0.2654, + "step": 1660 + }, + { + "epoch": 0.16966372041044397, + "grad_norm": 3.984375, + "learning_rate": 4.6864341854344587e-05, + "loss": 0.2226, + "step": 1670 + }, + { + "epoch": 0.1706796708320634, + "grad_norm": 2.328125, + "learning_rate": 4.682514031848795e-05, + "loss": 0.2438, + "step": 1680 + }, + { + "epoch": 0.17169562125368282, + "grad_norm": 3.078125, + "learning_rate": 4.678571186355423e-05, + "loss": 0.1889, + "step": 1690 + }, + { + "epoch": 0.17271157167530224, + "grad_norm": 3.328125, + "learning_rate": 4.6746056899486644e-05, + "loss": 0.2117, + "step": 1700 + }, + { + "epoch": 0.17372752209692166, + "grad_norm": 2.78125, + "learning_rate": 4.67061758385835e-05, + "loss": 0.1953, + "step": 1710 + }, + { + "epoch": 0.17474347251854108, + "grad_norm": 3.09375, + "learning_rate": 4.6666069095493816e-05, + "loss": 0.1844, + "step": 1720 + }, + { + "epoch": 0.17575942294016053, + "grad_norm": 3.234375, + "learning_rate": 4.662573708721309e-05, + "loss": 0.2774, + "step": 1730 + }, + { + "epoch": 0.17677537336177995, + "grad_norm": 4.03125, + "learning_rate": 4.658518023307894e-05, + "loss": 0.2527, + "step": 1740 + }, + { + "epoch": 0.17779132378339937, + "grad_norm": 3.21875, + "learning_rate": 4.654439895476671e-05, + "loss": 0.2164, + "step": 1750 + }, + { + "epoch": 0.1788072742050188, + "grad_norm": 2.390625, + "learning_rate": 4.6503393676285146e-05, + "loss": 0.2424, + "step": 1760 + }, + { + "epoch": 0.17982322462663822, + "grad_norm": 1.8359375, + "learning_rate": 4.646216482397192e-05, + "loss": 0.2428, + "step": 1770 + }, + { + "epoch": 0.18083917504825764, + "grad_norm": 2.796875, + "learning_rate": 4.6420712826489275e-05, + "loss": 0.2155, + "step": 1780 + }, + { + "epoch": 0.18185512546987706, + "grad_norm": 0.69921875, + "learning_rate": 4.6379038114819485e-05, + "loss": 0.1544, + "step": 1790 + }, + { + "epoch": 0.18287107589149648, + "grad_norm": 3.40625, + "learning_rate": 4.6337141122260444e-05, + "loss": 0.2029, + "step": 1800 + }, + { + "epoch": 0.18388702631311593, + "grad_norm": 2.359375, + "learning_rate": 4.629502228442112e-05, + "loss": 0.1489, + "step": 1810 + }, + { + "epoch": 0.18490297673473535, + "grad_norm": 1.4453125, + "learning_rate": 4.6252682039217045e-05, + "loss": 0.2101, + "step": 1820 + }, + { + "epoch": 0.18591892715635477, + "grad_norm": 2.71875, + "learning_rate": 4.621012082686573e-05, + "loss": 0.2076, + "step": 1830 + }, + { + "epoch": 0.1869348775779742, + "grad_norm": 3.0625, + "learning_rate": 4.616733908988216e-05, + "loss": 0.2719, + "step": 1840 + }, + { + "epoch": 0.18795082799959362, + "grad_norm": 1.953125, + "learning_rate": 4.612433727307409e-05, + "loss": 0.2105, + "step": 1850 + }, + { + "epoch": 0.18896677842121304, + "grad_norm": 3.46875, + "learning_rate": 4.608111582353751e-05, + "loss": 0.1877, + "step": 1860 + }, + { + "epoch": 0.18998272884283246, + "grad_norm": 2.546875, + "learning_rate": 4.603767519065197e-05, + "loss": 0.2238, + "step": 1870 + }, + { + "epoch": 0.19099867926445188, + "grad_norm": 1.5703125, + "learning_rate": 4.599401582607589e-05, + "loss": 0.243, + "step": 1880 + }, + { + "epoch": 0.19201462968607133, + "grad_norm": 2.4375, + "learning_rate": 4.595013818374185e-05, + "loss": 0.1867, + "step": 1890 + }, + { + "epoch": 0.19303058010769075, + "grad_norm": 2.203125, + "learning_rate": 4.5906042719851925e-05, + "loss": 0.1994, + "step": 1900 + }, + { + "epoch": 0.19404653052931017, + "grad_norm": 3.984375, + "learning_rate": 4.586172989287291e-05, + "loss": 0.1899, + "step": 1910 + }, + { + "epoch": 0.1950624809509296, + "grad_norm": 2.6875, + "learning_rate": 4.5817200163531534e-05, + "loss": 0.2528, + "step": 1920 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 2.71875, + "learning_rate": 4.577245399480972e-05, + "loss": 0.2336, + "step": 1930 + }, + { + "epoch": 0.19709438179416844, + "grad_norm": 2.640625, + "learning_rate": 4.5727491851939715e-05, + "loss": 0.2204, + "step": 1940 + }, + { + "epoch": 0.19811033221578786, + "grad_norm": 1.78125, + "learning_rate": 4.568231420239929e-05, + "loss": 0.1656, + "step": 1950 + }, + { + "epoch": 0.19912628263740728, + "grad_norm": 3.15625, + "learning_rate": 4.563692151590687e-05, + "loss": 0.2105, + "step": 1960 + }, + { + "epoch": 0.20014223305902673, + "grad_norm": 1.3671875, + "learning_rate": 4.5591314264416666e-05, + "loss": 0.1464, + "step": 1970 + }, + { + "epoch": 0.20115818348064615, + "grad_norm": 4.25, + "learning_rate": 4.554549292211371e-05, + "loss": 0.2103, + "step": 1980 + }, + { + "epoch": 0.20217413390226557, + "grad_norm": 2.984375, + "learning_rate": 4.549945796540901e-05, + "loss": 0.144, + "step": 1990 + }, + { + "epoch": 0.203190084323885, + "grad_norm": 1.859375, + "learning_rate": 4.545320987293453e-05, + "loss": 0.1963, + "step": 2000 + }, + { + "epoch": 0.20420603474550442, + "grad_norm": 1.078125, + "learning_rate": 4.540674912553824e-05, + "loss": 0.2115, + "step": 2010 + }, + { + "epoch": 0.20522198516712384, + "grad_norm": 4.25, + "learning_rate": 4.536007620627911e-05, + "loss": 0.1682, + "step": 2020 + }, + { + "epoch": 0.20623793558874326, + "grad_norm": 2.71875, + "learning_rate": 4.531319160042212e-05, + "loss": 0.1992, + "step": 2030 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 1.2890625, + "learning_rate": 4.5266095795433126e-05, + "loss": 0.1134, + "step": 2040 + }, + { + "epoch": 0.20826983643198213, + "grad_norm": 3.296875, + "learning_rate": 4.5218789280973925e-05, + "loss": 0.1474, + "step": 2050 + }, + { + "epoch": 0.20928578685360155, + "grad_norm": 1.9375, + "learning_rate": 4.5171272548897024e-05, + "loss": 0.1955, + "step": 2060 + }, + { + "epoch": 0.21030173727522097, + "grad_norm": 2.734375, + "learning_rate": 4.512354609324063e-05, + "loss": 0.2042, + "step": 2070 + }, + { + "epoch": 0.2113176876968404, + "grad_norm": 2.921875, + "learning_rate": 4.507561041022347e-05, + "loss": 0.2174, + "step": 2080 + }, + { + "epoch": 0.21233363811845982, + "grad_norm": 2.40625, + "learning_rate": 4.502746599823963e-05, + "loss": 0.2634, + "step": 2090 + }, + { + "epoch": 0.21334958854007924, + "grad_norm": 1.71875, + "learning_rate": 4.497911335785339e-05, + "loss": 0.1884, + "step": 2100 + }, + { + "epoch": 0.21436553896169866, + "grad_norm": 0.79296875, + "learning_rate": 4.4930552991794e-05, + "loss": 0.1872, + "step": 2110 + }, + { + "epoch": 0.21538148938331808, + "grad_norm": 3.171875, + "learning_rate": 4.4881785404950474e-05, + "loss": 0.2233, + "step": 2120 + }, + { + "epoch": 0.21639743980493753, + "grad_norm": 2.59375, + "learning_rate": 4.483281110436631e-05, + "loss": 0.2374, + "step": 2130 + }, + { + "epoch": 0.21741339022655695, + "grad_norm": 3.328125, + "learning_rate": 4.478363059923426e-05, + "loss": 0.2545, + "step": 2140 + }, + { + "epoch": 0.21842934064817637, + "grad_norm": 2.3125, + "learning_rate": 4.4734244400891014e-05, + "loss": 0.2063, + "step": 2150 + }, + { + "epoch": 0.2194452910697958, + "grad_norm": 3.40625, + "learning_rate": 4.4684653022811865e-05, + "loss": 0.1219, + "step": 2160 + }, + { + "epoch": 0.22046124149141522, + "grad_norm": 4.1875, + "learning_rate": 4.463485698060541e-05, + "loss": 0.2805, + "step": 2170 + }, + { + "epoch": 0.22147719191303464, + "grad_norm": 2.3125, + "learning_rate": 4.458485679200814e-05, + "loss": 0.1998, + "step": 2180 + }, + { + "epoch": 0.22249314233465406, + "grad_norm": 3.578125, + "learning_rate": 4.453465297687912e-05, + "loss": 0.2489, + "step": 2190 + }, + { + "epoch": 0.22350909275627348, + "grad_norm": 2.59375, + "learning_rate": 4.448424605719452e-05, + "loss": 0.2731, + "step": 2200 + }, + { + "epoch": 0.22452504317789293, + "grad_norm": 3.28125, + "learning_rate": 4.443363655704224e-05, + "loss": 0.2425, + "step": 2210 + }, + { + "epoch": 0.22554099359951235, + "grad_norm": 2.78125, + "learning_rate": 4.438282500261641e-05, + "loss": 0.2938, + "step": 2220 + }, + { + "epoch": 0.22655694402113177, + "grad_norm": 1.1953125, + "learning_rate": 4.433181192221197e-05, + "loss": 0.1728, + "step": 2230 + }, + { + "epoch": 0.2275728944427512, + "grad_norm": 1.34375, + "learning_rate": 4.4280597846219155e-05, + "loss": 0.216, + "step": 2240 + }, + { + "epoch": 0.22858884486437062, + "grad_norm": 1.8515625, + "learning_rate": 4.422918330711796e-05, + "loss": 0.1612, + "step": 2250 + }, + { + "epoch": 0.22960479528599004, + "grad_norm": 1.90625, + "learning_rate": 4.417756883947263e-05, + "loss": 0.107, + "step": 2260 + }, + { + "epoch": 0.23062074570760946, + "grad_norm": 3.375, + "learning_rate": 4.412575497992611e-05, + "loss": 0.1756, + "step": 2270 + }, + { + "epoch": 0.23163669612922888, + "grad_norm": 4.375, + "learning_rate": 4.407374226719445e-05, + "loss": 0.234, + "step": 2280 + }, + { + "epoch": 0.23265264655084833, + "grad_norm": 3.25, + "learning_rate": 4.402153124206119e-05, + "loss": 0.2144, + "step": 2290 + }, + { + "epoch": 0.23366859697246775, + "grad_norm": 1.703125, + "learning_rate": 4.396912244737173e-05, + "loss": 0.1696, + "step": 2300 + }, + { + "epoch": 0.23468454739408717, + "grad_norm": 2.84375, + "learning_rate": 4.391651642802778e-05, + "loss": 0.2506, + "step": 2310 + }, + { + "epoch": 0.2357004978157066, + "grad_norm": 4.5, + "learning_rate": 4.386371373098155e-05, + "loss": 0.1686, + "step": 2320 + }, + { + "epoch": 0.23671644823732602, + "grad_norm": 2.515625, + "learning_rate": 4.381071490523018e-05, + "loss": 0.2403, + "step": 2330 + }, + { + "epoch": 0.23773239865894544, + "grad_norm": 4.4375, + "learning_rate": 4.3757520501809955e-05, + "loss": 0.1611, + "step": 2340 + }, + { + "epoch": 0.23874834908056486, + "grad_norm": 1.609375, + "learning_rate": 4.370413107379065e-05, + "loss": 0.1698, + "step": 2350 + }, + { + "epoch": 0.23976429950218428, + "grad_norm": 4.96875, + "learning_rate": 4.36505471762697e-05, + "loss": 0.1928, + "step": 2360 + }, + { + "epoch": 0.24078024992380373, + "grad_norm": 0.8984375, + "learning_rate": 4.3596769366366474e-05, + "loss": 0.2035, + "step": 2370 + }, + { + "epoch": 0.24179620034542315, + "grad_norm": 5.75, + "learning_rate": 4.354279820321649e-05, + "loss": 0.16, + "step": 2380 + }, + { + "epoch": 0.24281215076704257, + "grad_norm": 1.9453125, + "learning_rate": 4.34886342479656e-05, + "loss": 0.1851, + "step": 2390 + }, + { + "epoch": 0.243828101188662, + "grad_norm": 1.015625, + "learning_rate": 4.34342780637641e-05, + "loss": 0.1726, + "step": 2400 + }, + { + "epoch": 0.24484405161028142, + "grad_norm": 4.59375, + "learning_rate": 4.337973021576095e-05, + "loss": 0.2847, + "step": 2410 + }, + { + "epoch": 0.24586000203190084, + "grad_norm": 1.03125, + "learning_rate": 4.3324991271097846e-05, + "loss": 0.2528, + "step": 2420 + }, + { + "epoch": 0.24687595245352026, + "grad_norm": 2.1875, + "learning_rate": 4.3270061798903374e-05, + "loss": 0.1573, + "step": 2430 + }, + { + "epoch": 0.24789190287513968, + "grad_norm": 0.98046875, + "learning_rate": 4.321494237028701e-05, + "loss": 0.1703, + "step": 2440 + }, + { + "epoch": 0.24890785329675913, + "grad_norm": 3.8125, + "learning_rate": 4.31596335583333e-05, + "loss": 0.2613, + "step": 2450 + }, + { + "epoch": 0.24992380371837855, + "grad_norm": 4.0625, + "learning_rate": 4.310413593809579e-05, + "loss": 0.22, + "step": 2460 + }, + { + "epoch": 0.250939754139998, + "grad_norm": 3.15625, + "learning_rate": 4.304845008659108e-05, + "loss": 0.1263, + "step": 2470 + }, + { + "epoch": 0.25195570456161737, + "grad_norm": 3.046875, + "learning_rate": 4.2992576582792895e-05, + "loss": 0.1639, + "step": 2480 + }, + { + "epoch": 0.2529716549832368, + "grad_norm": 9.8125, + "learning_rate": 4.293651600762595e-05, + "loss": 0.2681, + "step": 2490 + }, + { + "epoch": 0.25398760540485626, + "grad_norm": 3.734375, + "learning_rate": 4.288026894395999e-05, + "loss": 0.2292, + "step": 2500 + }, + { + "epoch": 0.25500355582647566, + "grad_norm": 0.455078125, + "learning_rate": 4.2823835976603723e-05, + "loss": 0.2324, + "step": 2510 + }, + { + "epoch": 0.2560195062480951, + "grad_norm": 5.625, + "learning_rate": 4.276721769229869e-05, + "loss": 0.1834, + "step": 2520 + }, + { + "epoch": 0.2570354566697145, + "grad_norm": 1.3671875, + "learning_rate": 4.271041467971323e-05, + "loss": 0.1826, + "step": 2530 + }, + { + "epoch": 0.25805140709133395, + "grad_norm": 5.0625, + "learning_rate": 4.265342752943632e-05, + "loss": 0.2463, + "step": 2540 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 2.859375, + "learning_rate": 4.2596256833971425e-05, + "loss": 0.2598, + "step": 2550 + }, + { + "epoch": 0.2600833079345728, + "grad_norm": 1.8515625, + "learning_rate": 4.2538903187730374e-05, + "loss": 0.1148, + "step": 2560 + }, + { + "epoch": 0.26109925835619224, + "grad_norm": 2.71875, + "learning_rate": 4.248136718702716e-05, + "loss": 0.2123, + "step": 2570 + }, + { + "epoch": 0.26211520877781164, + "grad_norm": 4.5625, + "learning_rate": 4.242364943007172e-05, + "loss": 0.2369, + "step": 2580 + }, + { + "epoch": 0.2631311591994311, + "grad_norm": 2.296875, + "learning_rate": 4.236575051696377e-05, + "loss": 0.261, + "step": 2590 + }, + { + "epoch": 0.2641471096210505, + "grad_norm": 2.75, + "learning_rate": 4.2307671049686514e-05, + "loss": 0.1564, + "step": 2600 + }, + { + "epoch": 0.26516306004266993, + "grad_norm": 3.5, + "learning_rate": 4.2249411632100396e-05, + "loss": 0.1563, + "step": 2610 + }, + { + "epoch": 0.2661790104642893, + "grad_norm": 2.84375, + "learning_rate": 4.219097286993684e-05, + "loss": 0.1697, + "step": 2620 + }, + { + "epoch": 0.26719496088590877, + "grad_norm": 2.125, + "learning_rate": 4.2132355370791946e-05, + "loss": 0.1844, + "step": 2630 + }, + { + "epoch": 0.26821091130752817, + "grad_norm": 4.03125, + "learning_rate": 4.2073559744120156e-05, + "loss": 0.2144, + "step": 2640 + }, + { + "epoch": 0.2692268617291476, + "grad_norm": 2.375, + "learning_rate": 4.201458660122793e-05, + "loss": 0.2013, + "step": 2650 + }, + { + "epoch": 0.27024281215076706, + "grad_norm": 3.625, + "learning_rate": 4.1955436555267393e-05, + "loss": 0.2166, + "step": 2660 + }, + { + "epoch": 0.27125876257238646, + "grad_norm": 0.328125, + "learning_rate": 4.189611022122997e-05, + "loss": 0.1934, + "step": 2670 + }, + { + "epoch": 0.2722747129940059, + "grad_norm": 2.75, + "learning_rate": 4.1836608215939944e-05, + "loss": 0.2157, + "step": 2680 + }, + { + "epoch": 0.2732906634156253, + "grad_norm": 3.5, + "learning_rate": 4.17769311580481e-05, + "loss": 0.18, + "step": 2690 + }, + { + "epoch": 0.27430661383724475, + "grad_norm": 2.109375, + "learning_rate": 4.171707966802528e-05, + "loss": 0.2178, + "step": 2700 + }, + { + "epoch": 0.27532256425886414, + "grad_norm": 4.65625, + "learning_rate": 4.16570543681559e-05, + "loss": 0.1896, + "step": 2710 + }, + { + "epoch": 0.2763385146804836, + "grad_norm": 4.8125, + "learning_rate": 4.159685588253151e-05, + "loss": 0.1322, + "step": 2720 + }, + { + "epoch": 0.27735446510210304, + "grad_norm": 3.9375, + "learning_rate": 4.153648483704429e-05, + "loss": 0.184, + "step": 2730 + }, + { + "epoch": 0.27837041552372244, + "grad_norm": 4.53125, + "learning_rate": 4.147594185938057e-05, + "loss": 0.2451, + "step": 2740 + }, + { + "epoch": 0.2793863659453419, + "grad_norm": 1.0390625, + "learning_rate": 4.141522757901426e-05, + "loss": 0.2367, + "step": 2750 + }, + { + "epoch": 0.2804023163669613, + "grad_norm": 3.375, + "learning_rate": 4.1354342627200345e-05, + "loss": 0.179, + "step": 2760 + }, + { + "epoch": 0.28141826678858073, + "grad_norm": 2.953125, + "learning_rate": 4.1293287636968286e-05, + "loss": 0.1396, + "step": 2770 + }, + { + "epoch": 0.2824342172102001, + "grad_norm": 2.546875, + "learning_rate": 4.1232063243115485e-05, + "loss": 0.1963, + "step": 2780 + }, + { + "epoch": 0.28345016763181957, + "grad_norm": 5.09375, + "learning_rate": 4.117067008220063e-05, + "loss": 0.2457, + "step": 2790 + }, + { + "epoch": 0.28446611805343897, + "grad_norm": 2.046875, + "learning_rate": 4.110910879253712e-05, + "loss": 0.2262, + "step": 2800 + }, + { + "epoch": 0.2854820684750584, + "grad_norm": 2.1875, + "learning_rate": 4.104738001418641e-05, + "loss": 0.2499, + "step": 2810 + }, + { + "epoch": 0.28649801889667786, + "grad_norm": 2.59375, + "learning_rate": 4.098548438895135e-05, + "loss": 0.1667, + "step": 2820 + }, + { + "epoch": 0.28751396931829726, + "grad_norm": 2.875, + "learning_rate": 4.092342256036954e-05, + "loss": 0.2288, + "step": 2830 + }, + { + "epoch": 0.2885299197399167, + "grad_norm": 3.015625, + "learning_rate": 4.086119517370659e-05, + "loss": 0.2038, + "step": 2840 + }, + { + "epoch": 0.2895458701615361, + "grad_norm": 3.53125, + "learning_rate": 4.0798802875949485e-05, + "loss": 0.181, + "step": 2850 + }, + { + "epoch": 0.29056182058315555, + "grad_norm": 2.296875, + "learning_rate": 4.073624631579975e-05, + "loss": 0.1886, + "step": 2860 + }, + { + "epoch": 0.29157777100477494, + "grad_norm": 3.609375, + "learning_rate": 4.067352614366685e-05, + "loss": 0.2053, + "step": 2870 + }, + { + "epoch": 0.2925937214263944, + "grad_norm": 2.328125, + "learning_rate": 4.061064301166128e-05, + "loss": 0.1409, + "step": 2880 + }, + { + "epoch": 0.29360967184801384, + "grad_norm": 4.9375, + "learning_rate": 4.054759757358787e-05, + "loss": 0.184, + "step": 2890 + }, + { + "epoch": 0.29462562226963324, + "grad_norm": 4.6875, + "learning_rate": 4.048439048493898e-05, + "loss": 0.2306, + "step": 2900 + }, + { + "epoch": 0.2956415726912527, + "grad_norm": 4.09375, + "learning_rate": 4.0421022402887676e-05, + "loss": 0.1914, + "step": 2910 + }, + { + "epoch": 0.2966575231128721, + "grad_norm": 2.3125, + "learning_rate": 4.035749398628088e-05, + "loss": 0.1653, + "step": 2920 + }, + { + "epoch": 0.29767347353449153, + "grad_norm": 2.515625, + "learning_rate": 4.029380589563256e-05, + "loss": 0.1941, + "step": 2930 + }, + { + "epoch": 0.2986894239561109, + "grad_norm": 1.78125, + "learning_rate": 4.02299587931168e-05, + "loss": 0.1117, + "step": 2940 + }, + { + "epoch": 0.29970537437773037, + "grad_norm": 0.8359375, + "learning_rate": 4.0165953342560974e-05, + "loss": 0.1605, + "step": 2950 + }, + { + "epoch": 0.30072132479934977, + "grad_norm": 3.046875, + "learning_rate": 4.010179020943884e-05, + "loss": 0.1726, + "step": 2960 + }, + { + "epoch": 0.3017372752209692, + "grad_norm": 3.453125, + "learning_rate": 4.003747006086357e-05, + "loss": 0.2208, + "step": 2970 + }, + { + "epoch": 0.30275322564258866, + "grad_norm": 2.515625, + "learning_rate": 3.9972993565580866e-05, + "loss": 0.1325, + "step": 2980 + }, + { + "epoch": 0.30376917606420806, + "grad_norm": 3.046875, + "learning_rate": 3.9908361393962e-05, + "loss": 0.2014, + "step": 2990 + }, + { + "epoch": 0.3047851264858275, + "grad_norm": 2.28125, + "learning_rate": 3.984357421799681e-05, + "loss": 0.165, + "step": 3000 + }, + { + "epoch": 0.3058010769074469, + "grad_norm": 5.09375, + "learning_rate": 3.9778632711286756e-05, + "loss": 0.212, + "step": 3010 + }, + { + "epoch": 0.30681702732906635, + "grad_norm": 4.25, + "learning_rate": 3.971353754903788e-05, + "loss": 0.2388, + "step": 3020 + }, + { + "epoch": 0.30783297775068574, + "grad_norm": 2.34375, + "learning_rate": 3.964828940805381e-05, + "loss": 0.2175, + "step": 3030 + }, + { + "epoch": 0.3088489281723052, + "grad_norm": 4.09375, + "learning_rate": 3.95828889667287e-05, + "loss": 0.2088, + "step": 3040 + }, + { + "epoch": 0.30986487859392464, + "grad_norm": 2.359375, + "learning_rate": 3.9517336905040244e-05, + "loss": 0.1913, + "step": 3050 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 1.1640625, + "learning_rate": 3.9451633904542483e-05, + "loss": 0.2185, + "step": 3060 + }, + { + "epoch": 0.3118967794371635, + "grad_norm": 2.59375, + "learning_rate": 3.9385780648358846e-05, + "loss": 0.2072, + "step": 3070 + }, + { + "epoch": 0.3129127298587829, + "grad_norm": 3.015625, + "learning_rate": 3.9319777821174955e-05, + "loss": 0.1902, + "step": 3080 + }, + { + "epoch": 0.31392868028040233, + "grad_norm": 2.375, + "learning_rate": 3.925362610923158e-05, + "loss": 0.259, + "step": 3090 + }, + { + "epoch": 0.3149446307020217, + "grad_norm": 4.65625, + "learning_rate": 3.918732620031742e-05, + "loss": 0.2026, + "step": 3100 + }, + { + "epoch": 0.31596058112364117, + "grad_norm": 2.1875, + "learning_rate": 3.912087878376205e-05, + "loss": 0.1478, + "step": 3110 + }, + { + "epoch": 0.31697653154526056, + "grad_norm": 2.34375, + "learning_rate": 3.905428455042865e-05, + "loss": 0.167, + "step": 3120 + }, + { + "epoch": 0.31799248196688, + "grad_norm": 2.390625, + "learning_rate": 3.898754419270693e-05, + "loss": 0.1629, + "step": 3130 + }, + { + "epoch": 0.31900843238849946, + "grad_norm": 1.546875, + "learning_rate": 3.892065840450583e-05, + "loss": 0.1308, + "step": 3140 + }, + { + "epoch": 0.32002438281011886, + "grad_norm": 4.625, + "learning_rate": 3.885362788124637e-05, + "loss": 0.2008, + "step": 3150 + }, + { + "epoch": 0.3210403332317383, + "grad_norm": 3.8125, + "learning_rate": 3.8786453319854396e-05, + "loss": 0.2225, + "step": 3160 + }, + { + "epoch": 0.3220562836533577, + "grad_norm": 3.015625, + "learning_rate": 3.8719135418753366e-05, + "loss": 0.2243, + "step": 3170 + }, + { + "epoch": 0.32307223407497715, + "grad_norm": 5.6875, + "learning_rate": 3.865167487785702e-05, + "loss": 0.1981, + "step": 3180 + }, + { + "epoch": 0.32408818449659654, + "grad_norm": 4.84375, + "learning_rate": 3.8584072398562164e-05, + "loss": 0.2031, + "step": 3190 + }, + { + "epoch": 0.325104134918216, + "grad_norm": 4.0625, + "learning_rate": 3.851632868374136e-05, + "loss": 0.1621, + "step": 3200 + }, + { + "epoch": 0.32612008533983544, + "grad_norm": 3.421875, + "learning_rate": 3.844844443773562e-05, + "loss": 0.1674, + "step": 3210 + }, + { + "epoch": 0.32713603576145484, + "grad_norm": 1.3671875, + "learning_rate": 3.8380420366347046e-05, + "loss": 0.1502, + "step": 3220 + }, + { + "epoch": 0.3281519861830743, + "grad_norm": 3.734375, + "learning_rate": 3.831225717683157e-05, + "loss": 0.1868, + "step": 3230 + }, + { + "epoch": 0.3291679366046937, + "grad_norm": 2.703125, + "learning_rate": 3.8243955577891534e-05, + "loss": 0.1818, + "step": 3240 + }, + { + "epoch": 0.3301838870263131, + "grad_norm": 3.796875, + "learning_rate": 3.8175516279668335e-05, + "loss": 0.2215, + "step": 3250 + }, + { + "epoch": 0.3311998374479325, + "grad_norm": 3.203125, + "learning_rate": 3.810693999373505e-05, + "loss": 0.2544, + "step": 3260 + }, + { + "epoch": 0.33221578786955197, + "grad_norm": 4.0, + "learning_rate": 3.8038227433089056e-05, + "loss": 0.1175, + "step": 3270 + }, + { + "epoch": 0.33323173829117136, + "grad_norm": 3.625, + "learning_rate": 3.796937931214458e-05, + "loss": 0.2213, + "step": 3280 + }, + { + "epoch": 0.3342476887127908, + "grad_norm": 1.7265625, + "learning_rate": 3.7900396346725296e-05, + "loss": 0.1711, + "step": 3290 + }, + { + "epoch": 0.33526363913441026, + "grad_norm": 3.140625, + "learning_rate": 3.783127925405686e-05, + "loss": 0.2628, + "step": 3300 + }, + { + "epoch": 0.33627958955602966, + "grad_norm": 2.1875, + "learning_rate": 3.77620287527595e-05, + "loss": 0.1671, + "step": 3310 + }, + { + "epoch": 0.3372955399776491, + "grad_norm": 5.28125, + "learning_rate": 3.769264556284048e-05, + "loss": 0.2109, + "step": 3320 + }, + { + "epoch": 0.3383114903992685, + "grad_norm": 2.875, + "learning_rate": 3.762313040568665e-05, + "loss": 0.1978, + "step": 3330 + }, + { + "epoch": 0.33932744082088795, + "grad_norm": 2.234375, + "learning_rate": 3.755348400405697e-05, + "loss": 0.1275, + "step": 3340 + }, + { + "epoch": 0.34034339124250734, + "grad_norm": 1.9453125, + "learning_rate": 3.7483707082074945e-05, + "loss": 0.1482, + "step": 3350 + }, + { + "epoch": 0.3413593416641268, + "grad_norm": 5.40625, + "learning_rate": 3.741380036522111e-05, + "loss": 0.1933, + "step": 3360 + }, + { + "epoch": 0.34237529208574624, + "grad_norm": 4.53125, + "learning_rate": 3.734376458032551e-05, + "loss": 0.1925, + "step": 3370 + }, + { + "epoch": 0.34339124250736563, + "grad_norm": 4.0625, + "learning_rate": 3.727360045556014e-05, + "loss": 0.2297, + "step": 3380 + }, + { + "epoch": 0.3444071929289851, + "grad_norm": 2.53125, + "learning_rate": 3.7203308720431336e-05, + "loss": 0.1704, + "step": 3390 + }, + { + "epoch": 0.3454231433506045, + "grad_norm": 1.859375, + "learning_rate": 3.7132890105772234e-05, + "loss": 0.258, + "step": 3400 + }, + { + "epoch": 0.3464390937722239, + "grad_norm": 3.90625, + "learning_rate": 3.706234534373515e-05, + "loss": 0.2376, + "step": 3410 + }, + { + "epoch": 0.3474550441938433, + "grad_norm": 1.1015625, + "learning_rate": 3.6991675167783985e-05, + "loss": 0.2403, + "step": 3420 + }, + { + "epoch": 0.34847099461546277, + "grad_norm": 1.1640625, + "learning_rate": 3.6920880312686556e-05, + "loss": 0.1642, + "step": 3430 + }, + { + "epoch": 0.34948694503708216, + "grad_norm": 2.875, + "learning_rate": 3.684996151450702e-05, + "loss": 0.1455, + "step": 3440 + }, + { + "epoch": 0.3505028954587016, + "grad_norm": 0.59765625, + "learning_rate": 3.6778919510598155e-05, + "loss": 0.2175, + "step": 3450 + }, + { + "epoch": 0.35151884588032106, + "grad_norm": 0.93359375, + "learning_rate": 3.670775503959376e-05, + "loss": 0.1858, + "step": 3460 + }, + { + "epoch": 0.35253479630194046, + "grad_norm": 4.1875, + "learning_rate": 3.6636468841400917e-05, + "loss": 0.1911, + "step": 3470 + }, + { + "epoch": 0.3535507467235599, + "grad_norm": 3.734375, + "learning_rate": 3.656506165719233e-05, + "loss": 0.2114, + "step": 3480 + }, + { + "epoch": 0.3545666971451793, + "grad_norm": 1.171875, + "learning_rate": 3.649353422939863e-05, + "loss": 0.1841, + "step": 3490 + }, + { + "epoch": 0.35558264756679875, + "grad_norm": 2.53125, + "learning_rate": 3.6421887301700615e-05, + "loss": 0.1505, + "step": 3500 + }, + { + "epoch": 0.35659859798841814, + "grad_norm": 4.9375, + "learning_rate": 3.6350121619021524e-05, + "loss": 0.2625, + "step": 3510 + }, + { + "epoch": 0.3576145484100376, + "grad_norm": 5.25, + "learning_rate": 3.627823792751936e-05, + "loss": 0.1676, + "step": 3520 + }, + { + "epoch": 0.35863049883165704, + "grad_norm": 1.09375, + "learning_rate": 3.620623697457905e-05, + "loss": 0.1963, + "step": 3530 + }, + { + "epoch": 0.35964644925327643, + "grad_norm": 4.03125, + "learning_rate": 3.613411950880468e-05, + "loss": 0.2048, + "step": 3540 + }, + { + "epoch": 0.3606623996748959, + "grad_norm": 4.40625, + "learning_rate": 3.606188628001178e-05, + "loss": 0.226, + "step": 3550 + }, + { + "epoch": 0.3616783500965153, + "grad_norm": 2.375, + "learning_rate": 3.598953803921947e-05, + "loss": 0.1884, + "step": 3560 + }, + { + "epoch": 0.3626943005181347, + "grad_norm": 3.21875, + "learning_rate": 3.591707553864266e-05, + "loss": 0.224, + "step": 3570 + }, + { + "epoch": 0.3637102509397541, + "grad_norm": 3.5625, + "learning_rate": 3.584449953168423e-05, + "loss": 0.1866, + "step": 3580 + }, + { + "epoch": 0.36472620136137357, + "grad_norm": 2.359375, + "learning_rate": 3.577181077292722e-05, + "loss": 0.1663, + "step": 3590 + }, + { + "epoch": 0.36574215178299296, + "grad_norm": 5.0, + "learning_rate": 3.569901001812696e-05, + "loss": 0.2032, + "step": 3600 + }, + { + "epoch": 0.3667581022046124, + "grad_norm": 1.953125, + "learning_rate": 3.562609802420321e-05, + "loss": 0.2395, + "step": 3610 + }, + { + "epoch": 0.36777405262623186, + "grad_norm": 3.796875, + "learning_rate": 3.555307554923229e-05, + "loss": 0.1799, + "step": 3620 + }, + { + "epoch": 0.36879000304785126, + "grad_norm": 4.4375, + "learning_rate": 3.547994335243925e-05, + "loss": 0.1771, + "step": 3630 + }, + { + "epoch": 0.3698059534694707, + "grad_norm": 1.890625, + "learning_rate": 3.540670219418989e-05, + "loss": 0.2123, + "step": 3640 + }, + { + "epoch": 0.3708219038910901, + "grad_norm": 4.03125, + "learning_rate": 3.53333528359829e-05, + "loss": 0.2159, + "step": 3650 + }, + { + "epoch": 0.37183785431270955, + "grad_norm": 3.265625, + "learning_rate": 3.525989604044198e-05, + "loss": 0.2749, + "step": 3660 + }, + { + "epoch": 0.37285380473432894, + "grad_norm": 1.4375, + "learning_rate": 3.5186332571307826e-05, + "loss": 0.1613, + "step": 3670 + }, + { + "epoch": 0.3738697551559484, + "grad_norm": 3.984375, + "learning_rate": 3.511266319343025e-05, + "loss": 0.1877, + "step": 3680 + }, + { + "epoch": 0.37488570557756784, + "grad_norm": 2.203125, + "learning_rate": 3.503888867276022e-05, + "loss": 0.2185, + "step": 3690 + }, + { + "epoch": 0.37590165599918723, + "grad_norm": 1.5078125, + "learning_rate": 3.4965009776341894e-05, + "loss": 0.2195, + "step": 3700 + }, + { + "epoch": 0.3769176064208067, + "grad_norm": 4.375, + "learning_rate": 3.489102727230461e-05, + "loss": 0.2344, + "step": 3710 + }, + { + "epoch": 0.3779335568424261, + "grad_norm": 2.984375, + "learning_rate": 3.481694192985496e-05, + "loss": 0.1863, + "step": 3720 + }, + { + "epoch": 0.3789495072640455, + "grad_norm": 1.1328125, + "learning_rate": 3.474275451926875e-05, + "loss": 0.1894, + "step": 3730 + }, + { + "epoch": 0.3799654576856649, + "grad_norm": 2.265625, + "learning_rate": 3.4668465811883e-05, + "loss": 0.2127, + "step": 3740 + }, + { + "epoch": 0.38098140810728437, + "grad_norm": 2.921875, + "learning_rate": 3.4594076580087914e-05, + "loss": 0.2125, + "step": 3750 + }, + { + "epoch": 0.38199735852890376, + "grad_norm": 2.390625, + "learning_rate": 3.451958759731889e-05, + "loss": 0.1801, + "step": 3760 + }, + { + "epoch": 0.3830133089505232, + "grad_norm": 3.046875, + "learning_rate": 3.4444999638048456e-05, + "loss": 0.1949, + "step": 3770 + }, + { + "epoch": 0.38402925937214266, + "grad_norm": 2.890625, + "learning_rate": 3.437031347777817e-05, + "loss": 0.2719, + "step": 3780 + }, + { + "epoch": 0.38504520979376206, + "grad_norm": 3.9375, + "learning_rate": 3.4295529893030634e-05, + "loss": 0.1697, + "step": 3790 + }, + { + "epoch": 0.3860611602153815, + "grad_norm": 2.0625, + "learning_rate": 3.422064966134138e-05, + "loss": 0.1557, + "step": 3800 + }, + { + "epoch": 0.3870771106370009, + "grad_norm": 2.234375, + "learning_rate": 3.4145673561250794e-05, + "loss": 0.2129, + "step": 3810 + }, + { + "epoch": 0.38809306105862035, + "grad_norm": 4.96875, + "learning_rate": 3.4070602372296e-05, + "loss": 0.2068, + "step": 3820 + }, + { + "epoch": 0.38910901148023974, + "grad_norm": 2.234375, + "learning_rate": 3.39954368750028e-05, + "loss": 0.1634, + "step": 3830 + }, + { + "epoch": 0.3901249619018592, + "grad_norm": 1.75, + "learning_rate": 3.392017785087752e-05, + "loss": 0.2299, + "step": 3840 + }, + { + "epoch": 0.39114091232347864, + "grad_norm": 3.90625, + "learning_rate": 3.38448260823989e-05, + "loss": 0.1585, + "step": 3850 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 2.8125, + "learning_rate": 3.376938235300996e-05, + "loss": 0.2382, + "step": 3860 + }, + { + "epoch": 0.3931728131667175, + "grad_norm": 5.375, + "learning_rate": 3.369384744710984e-05, + "loss": 0.1987, + "step": 3870 + }, + { + "epoch": 0.3941887635883369, + "grad_norm": 2.578125, + "learning_rate": 3.361822215004566e-05, + "loss": 0.2316, + "step": 3880 + }, + { + "epoch": 0.3952047140099563, + "grad_norm": 2.0, + "learning_rate": 3.354250724810436e-05, + "loss": 0.2019, + "step": 3890 + }, + { + "epoch": 0.3962206644315757, + "grad_norm": 2.3125, + "learning_rate": 3.34667035285045e-05, + "loss": 0.187, + "step": 3900 + }, + { + "epoch": 0.39723661485319517, + "grad_norm": 3.53125, + "learning_rate": 3.339081177938811e-05, + "loss": 0.2353, + "step": 3910 + }, + { + "epoch": 0.39825256527481456, + "grad_norm": 1.9609375, + "learning_rate": 3.331483278981244e-05, + "loss": 0.2078, + "step": 3920 + }, + { + "epoch": 0.399268515696434, + "grad_norm": 1.2109375, + "learning_rate": 3.323876734974183e-05, + "loss": 0.1761, + "step": 3930 + }, + { + "epoch": 0.40028446611805346, + "grad_norm": 4.0625, + "learning_rate": 3.316261625003943e-05, + "loss": 0.2081, + "step": 3940 + }, + { + "epoch": 0.40130041653967286, + "grad_norm": 1.953125, + "learning_rate": 3.308638028245902e-05, + "loss": 0.2087, + "step": 3950 + }, + { + "epoch": 0.4023163669612923, + "grad_norm": 2.390625, + "learning_rate": 3.301006023963676e-05, + "loss": 0.1579, + "step": 3960 + }, + { + "epoch": 0.4033323173829117, + "grad_norm": 3.53125, + "learning_rate": 3.293365691508295e-05, + "loss": 0.1904, + "step": 3970 + }, + { + "epoch": 0.40434826780453115, + "grad_norm": 3.0, + "learning_rate": 3.285717110317379e-05, + "loss": 0.1991, + "step": 3980 + }, + { + "epoch": 0.40536421822615054, + "grad_norm": 7.21875, + "learning_rate": 3.27806035991431e-05, + "loss": 0.1445, + "step": 3990 + }, + { + "epoch": 0.40638016864777, + "grad_norm": 1.0859375, + "learning_rate": 3.2703955199074075e-05, + "loss": 0.2393, + "step": 4000 + }, + { + "epoch": 0.40739611906938944, + "grad_norm": 4.5625, + "learning_rate": 3.262722669989098e-05, + "loss": 0.1789, + "step": 4010 + }, + { + "epoch": 0.40841206949100883, + "grad_norm": 3.09375, + "learning_rate": 3.255041889935092e-05, + "loss": 0.1511, + "step": 4020 + }, + { + "epoch": 0.4094280199126283, + "grad_norm": 1.90625, + "learning_rate": 3.247353259603547e-05, + "loss": 0.2066, + "step": 4030 + }, + { + "epoch": 0.4104439703342477, + "grad_norm": 2.28125, + "learning_rate": 3.239656858934242e-05, + "loss": 0.1564, + "step": 4040 + }, + { + "epoch": 0.4114599207558671, + "grad_norm": 2.609375, + "learning_rate": 3.231952767947746e-05, + "loss": 0.1503, + "step": 4050 + }, + { + "epoch": 0.4124758711774865, + "grad_norm": 1.4453125, + "learning_rate": 3.2242410667445844e-05, + "loss": 0.1633, + "step": 4060 + }, + { + "epoch": 0.41349182159910597, + "grad_norm": 3.015625, + "learning_rate": 3.2165218355044076e-05, + "loss": 0.1492, + "step": 4070 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 3.234375, + "learning_rate": 3.2087951544851566e-05, + "loss": 0.3051, + "step": 4080 + }, + { + "epoch": 0.4155237224423448, + "grad_norm": 2.9375, + "learning_rate": 3.20106110402223e-05, + "loss": 0.2229, + "step": 4090 + }, + { + "epoch": 0.41653967286396426, + "grad_norm": 3.171875, + "learning_rate": 3.1933197645276455e-05, + "loss": 0.2224, + "step": 4100 + }, + { + "epoch": 0.41755562328558365, + "grad_norm": 2.09375, + "learning_rate": 3.185571216489209e-05, + "loss": 0.1297, + "step": 4110 + }, + { + "epoch": 0.4185715737072031, + "grad_norm": 3.625, + "learning_rate": 3.177815540469669e-05, + "loss": 0.2074, + "step": 4120 + }, + { + "epoch": 0.4195875241288225, + "grad_norm": 2.296875, + "learning_rate": 3.1700528171058916e-05, + "loss": 0.1949, + "step": 4130 + }, + { + "epoch": 0.42060347455044195, + "grad_norm": 3.8125, + "learning_rate": 3.162283127108011e-05, + "loss": 0.1661, + "step": 4140 + }, + { + "epoch": 0.42161942497206134, + "grad_norm": 2.5, + "learning_rate": 3.154506551258594e-05, + "loss": 0.2275, + "step": 4150 + }, + { + "epoch": 0.4226353753936808, + "grad_norm": 2.96875, + "learning_rate": 3.146723170411804e-05, + "loss": 0.2242, + "step": 4160 + }, + { + "epoch": 0.42365132581530024, + "grad_norm": 6.625, + "learning_rate": 3.138933065492552e-05, + "loss": 0.1897, + "step": 4170 + }, + { + "epoch": 0.42466727623691963, + "grad_norm": 0.8515625, + "learning_rate": 3.131136317495665e-05, + "loss": 0.1629, + "step": 4180 + }, + { + "epoch": 0.4256832266585391, + "grad_norm": 0.94140625, + "learning_rate": 3.1233330074850364e-05, + "loss": 0.1535, + "step": 4190 + }, + { + "epoch": 0.4266991770801585, + "grad_norm": 2.6875, + "learning_rate": 3.115523216592786e-05, + "loss": 0.2494, + "step": 4200 + }, + { + "epoch": 0.4277151275017779, + "grad_norm": 2.578125, + "learning_rate": 3.107707026018417e-05, + "loss": 0.1705, + "step": 4210 + }, + { + "epoch": 0.4287310779233973, + "grad_norm": 3.0625, + "learning_rate": 3.09988451702797e-05, + "loss": 0.1507, + "step": 4220 + }, + { + "epoch": 0.42974702834501677, + "grad_norm": 2.421875, + "learning_rate": 3.0920557709531804e-05, + "loss": 0.3071, + "step": 4230 + }, + { + "epoch": 0.43076297876663616, + "grad_norm": 3.640625, + "learning_rate": 3.0842208691906306e-05, + "loss": 0.199, + "step": 4240 + }, + { + "epoch": 0.4317789291882556, + "grad_norm": 3.5625, + "learning_rate": 3.076379893200904e-05, + "loss": 0.1987, + "step": 4250 + }, + { + "epoch": 0.43279487960987506, + "grad_norm": 3.65625, + "learning_rate": 3.068532924507739e-05, + "loss": 0.1945, + "step": 4260 + }, + { + "epoch": 0.43381083003149445, + "grad_norm": 5.875, + "learning_rate": 3.060680044697183e-05, + "loss": 0.1937, + "step": 4270 + }, + { + "epoch": 0.4348267804531139, + "grad_norm": 2.859375, + "learning_rate": 3.052821335416739e-05, + "loss": 0.1643, + "step": 4280 + }, + { + "epoch": 0.4358427308747333, + "grad_norm": 3.296875, + "learning_rate": 3.0449568783745203e-05, + "loss": 0.1455, + "step": 4290 + }, + { + "epoch": 0.43685868129635275, + "grad_norm": 0.427734375, + "learning_rate": 3.0370867553384023e-05, + "loss": 0.1891, + "step": 4300 + }, + { + "epoch": 0.43787463171797214, + "grad_norm": 0.361328125, + "learning_rate": 3.029211048135171e-05, + "loss": 0.1377, + "step": 4310 + }, + { + "epoch": 0.4388905821395916, + "grad_norm": 1.8203125, + "learning_rate": 3.021329838649668e-05, + "loss": 0.2194, + "step": 4320 + }, + { + "epoch": 0.43990653256121104, + "grad_norm": 1.8828125, + "learning_rate": 3.0134432088239462e-05, + "loss": 0.1915, + "step": 4330 + }, + { + "epoch": 0.44092248298283043, + "grad_norm": 2.015625, + "learning_rate": 3.0055512406564146e-05, + "loss": 0.1794, + "step": 4340 + }, + { + "epoch": 0.4419384334044499, + "grad_norm": 2.546875, + "learning_rate": 2.9976540162009836e-05, + "loss": 0.2154, + "step": 4350 + }, + { + "epoch": 0.4429543838260693, + "grad_norm": 4.09375, + "learning_rate": 2.9897516175662155e-05, + "loss": 0.1861, + "step": 4360 + }, + { + "epoch": 0.4439703342476887, + "grad_norm": 3.953125, + "learning_rate": 2.9818441269144693e-05, + "loss": 0.1857, + "step": 4370 + }, + { + "epoch": 0.4449862846693081, + "grad_norm": 2.234375, + "learning_rate": 2.9739316264610452e-05, + "loss": 0.1493, + "step": 4380 + }, + { + "epoch": 0.44600223509092757, + "grad_norm": 1.109375, + "learning_rate": 2.966014198473332e-05, + "loss": 0.186, + "step": 4390 + }, + { + "epoch": 0.44701818551254696, + "grad_norm": 4.5625, + "learning_rate": 2.9580919252699502e-05, + "loss": 0.1963, + "step": 4400 + }, + { + "epoch": 0.4480341359341664, + "grad_norm": 7.3125, + "learning_rate": 2.9501648892198984e-05, + "loss": 0.2882, + "step": 4410 + }, + { + "epoch": 0.44905008635578586, + "grad_norm": 3.03125, + "learning_rate": 2.942233172741693e-05, + "loss": 0.2154, + "step": 4420 + }, + { + "epoch": 0.45006603677740525, + "grad_norm": 2.421875, + "learning_rate": 2.934296858302515e-05, + "loss": 0.2228, + "step": 4430 + }, + { + "epoch": 0.4510819871990247, + "grad_norm": 1.6015625, + "learning_rate": 2.9263560284173485e-05, + "loss": 0.1637, + "step": 4440 + }, + { + "epoch": 0.4520979376206441, + "grad_norm": 4.5, + "learning_rate": 2.91841076564813e-05, + "loss": 0.1396, + "step": 4450 + }, + { + "epoch": 0.45311388804226355, + "grad_norm": 1.9609375, + "learning_rate": 2.9104611526028808e-05, + "loss": 0.186, + "step": 4460 + }, + { + "epoch": 0.45412983846388294, + "grad_norm": 2.046875, + "learning_rate": 2.902507271934855e-05, + "loss": 0.1706, + "step": 4470 + }, + { + "epoch": 0.4551457888855024, + "grad_norm": 2.390625, + "learning_rate": 2.8945492063416768e-05, + "loss": 0.2191, + "step": 4480 + }, + { + "epoch": 0.45616173930712184, + "grad_norm": 2.734375, + "learning_rate": 2.8865870385644823e-05, + "loss": 0.1651, + "step": 4490 + }, + { + "epoch": 0.45717768972874123, + "grad_norm": 4.4375, + "learning_rate": 2.8786208513870583e-05, + "loss": 0.1907, + "step": 4500 + }, + { + "epoch": 0.4581936401503607, + "grad_norm": 1.9609375, + "learning_rate": 2.8706507276349815e-05, + "loss": 0.2256, + "step": 4510 + }, + { + "epoch": 0.4592095905719801, + "grad_norm": 3.375, + "learning_rate": 2.8626767501747588e-05, + "loss": 0.215, + "step": 4520 + }, + { + "epoch": 0.4602255409935995, + "grad_norm": 2.296875, + "learning_rate": 2.854699001912964e-05, + "loss": 0.2241, + "step": 4530 + }, + { + "epoch": 0.4612414914152189, + "grad_norm": 2.078125, + "learning_rate": 2.846717565795376e-05, + "loss": 0.1541, + "step": 4540 + }, + { + "epoch": 0.46225744183683837, + "grad_norm": 0.81640625, + "learning_rate": 2.8387325248061164e-05, + "loss": 0.1718, + "step": 4550 + }, + { + "epoch": 0.46327339225845776, + "grad_norm": 5.6875, + "learning_rate": 2.8307439619667897e-05, + "loss": 0.259, + "step": 4560 + }, + { + "epoch": 0.4642893426800772, + "grad_norm": 1.78125, + "learning_rate": 2.8227519603356157e-05, + "loss": 0.2205, + "step": 4570 + }, + { + "epoch": 0.46530529310169666, + "grad_norm": 4.78125, + "learning_rate": 2.8147566030065677e-05, + "loss": 0.2256, + "step": 4580 + }, + { + "epoch": 0.46632124352331605, + "grad_norm": 3.296875, + "learning_rate": 2.8067579731085085e-05, + "loss": 0.1671, + "step": 4590 + }, + { + "epoch": 0.4673371939449355, + "grad_norm": 3.265625, + "learning_rate": 2.7987561538043273e-05, + "loss": 0.2471, + "step": 4600 + }, + { + "epoch": 0.4683531443665549, + "grad_norm": 3.390625, + "learning_rate": 2.7907512282900727e-05, + "loss": 0.1749, + "step": 4610 + }, + { + "epoch": 0.46936909478817435, + "grad_norm": 3.140625, + "learning_rate": 2.782743279794091e-05, + "loss": 0.2276, + "step": 4620 + }, + { + "epoch": 0.47038504520979374, + "grad_norm": 2.921875, + "learning_rate": 2.7747323915761574e-05, + "loss": 0.1971, + "step": 4630 + }, + { + "epoch": 0.4714009956314132, + "grad_norm": 4.15625, + "learning_rate": 2.7667186469266122e-05, + "loss": 0.1951, + "step": 4640 + }, + { + "epoch": 0.47241694605303264, + "grad_norm": 2.953125, + "learning_rate": 2.7587021291654924e-05, + "loss": 0.2045, + "step": 4650 + }, + { + "epoch": 0.47343289647465203, + "grad_norm": 1.6640625, + "learning_rate": 2.750682921641672e-05, + "loss": 0.155, + "step": 4660 + }, + { + "epoch": 0.4744488468962715, + "grad_norm": 4.375, + "learning_rate": 2.7426611077319864e-05, + "loss": 0.2038, + "step": 4670 + }, + { + "epoch": 0.4754647973178909, + "grad_norm": 5.5, + "learning_rate": 2.734636770840372e-05, + "loss": 0.159, + "step": 4680 + }, + { + "epoch": 0.4764807477395103, + "grad_norm": 1.703125, + "learning_rate": 2.7266099943969976e-05, + "loss": 0.1566, + "step": 4690 + }, + { + "epoch": 0.4774966981611297, + "grad_norm": 0.81640625, + "learning_rate": 2.7185808618573943e-05, + "loss": 0.1927, + "step": 4700 + }, + { + "epoch": 0.47851264858274917, + "grad_norm": 0.81640625, + "learning_rate": 2.710549456701592e-05, + "loss": 0.1873, + "step": 4710 + }, + { + "epoch": 0.47952859900436856, + "grad_norm": 3.828125, + "learning_rate": 2.702515862433247e-05, + "loss": 0.2474, + "step": 4720 + }, + { + "epoch": 0.480544549425988, + "grad_norm": 1.1640625, + "learning_rate": 2.6944801625787795e-05, + "loss": 0.204, + "step": 4730 + }, + { + "epoch": 0.48156049984760746, + "grad_norm": 2.953125, + "learning_rate": 2.6864424406864984e-05, + "loss": 0.1758, + "step": 4740 + }, + { + "epoch": 0.48257645026922685, + "grad_norm": 3.265625, + "learning_rate": 2.6784027803257377e-05, + "loss": 0.161, + "step": 4750 + }, + { + "epoch": 0.4835924006908463, + "grad_norm": 2.046875, + "learning_rate": 2.6703612650859848e-05, + "loss": 0.1469, + "step": 4760 + }, + { + "epoch": 0.4846083511124657, + "grad_norm": 4.03125, + "learning_rate": 2.6623179785760148e-05, + "loss": 0.1858, + "step": 4770 + }, + { + "epoch": 0.48562430153408515, + "grad_norm": 2.65625, + "learning_rate": 2.6542730044230175e-05, + "loss": 0.176, + "step": 4780 + }, + { + "epoch": 0.48664025195570454, + "grad_norm": 2.59375, + "learning_rate": 2.6462264262717278e-05, + "loss": 0.1657, + "step": 4790 + }, + { + "epoch": 0.487656202377324, + "grad_norm": 4.78125, + "learning_rate": 2.6381783277835605e-05, + "loss": 0.2705, + "step": 4800 + }, + { + "epoch": 0.48867215279894344, + "grad_norm": 3.65625, + "learning_rate": 2.6301287926357355e-05, + "loss": 0.2252, + "step": 4810 + }, + { + "epoch": 0.48968810322056283, + "grad_norm": 0.734375, + "learning_rate": 2.622077904520411e-05, + "loss": 0.2141, + "step": 4820 + }, + { + "epoch": 0.4907040536421823, + "grad_norm": 5.15625, + "learning_rate": 2.6140257471438108e-05, + "loss": 0.1935, + "step": 4830 + }, + { + "epoch": 0.4917200040638017, + "grad_norm": 3.625, + "learning_rate": 2.6059724042253574e-05, + "loss": 0.2121, + "step": 4840 + }, + { + "epoch": 0.4927359544854211, + "grad_norm": 1.2890625, + "learning_rate": 2.5979179594967983e-05, + "loss": 0.1221, + "step": 4850 + }, + { + "epoch": 0.4937519049070405, + "grad_norm": 3.4375, + "learning_rate": 2.5898624967013367e-05, + "loss": 0.2208, + "step": 4860 + }, + { + "epoch": 0.49476785532865997, + "grad_norm": 2.40625, + "learning_rate": 2.5818060995927607e-05, + "loss": 0.1904, + "step": 4870 + }, + { + "epoch": 0.49578380575027936, + "grad_norm": 2.921875, + "learning_rate": 2.573748851934574e-05, + "loss": 0.1658, + "step": 4880 + }, + { + "epoch": 0.4967997561718988, + "grad_norm": 1.6640625, + "learning_rate": 2.5656908374991213e-05, + "loss": 0.1626, + "step": 4890 + }, + { + "epoch": 0.49781570659351826, + "grad_norm": 1.8046875, + "learning_rate": 2.557632140066721e-05, + "loss": 0.1905, + "step": 4900 + }, + { + "epoch": 0.49883165701513765, + "grad_norm": 4.875, + "learning_rate": 2.5495728434247917e-05, + "loss": 0.2591, + "step": 4910 + }, + { + "epoch": 0.4998476074367571, + "grad_norm": 1.4453125, + "learning_rate": 2.5415130313669845e-05, + "loss": 0.1359, + "step": 4920 + }, + { + "epoch": 0.5008635578583766, + "grad_norm": 2.109375, + "learning_rate": 2.5334527876923063e-05, + "loss": 0.2353, + "step": 4930 + }, + { + "epoch": 0.501879508279996, + "grad_norm": 3.546875, + "learning_rate": 2.5253921962042525e-05, + "loss": 0.2173, + "step": 4940 + }, + { + "epoch": 0.5028954587016153, + "grad_norm": 1.8125, + "learning_rate": 2.5173313407099373e-05, + "loss": 0.1631, + "step": 4950 + }, + { + "epoch": 0.5039114091232347, + "grad_norm": 2.671875, + "learning_rate": 2.5092703050192163e-05, + "loss": 0.1884, + "step": 4960 + }, + { + "epoch": 0.5049273595448542, + "grad_norm": 2.5625, + "learning_rate": 2.501209172943819e-05, + "loss": 0.217, + "step": 4970 + }, + { + "epoch": 0.5059433099664736, + "grad_norm": 4.375, + "learning_rate": 2.49314802829648e-05, + "loss": 0.1854, + "step": 4980 + }, + { + "epoch": 0.506959260388093, + "grad_norm": 2.3125, + "learning_rate": 2.4850869548900628e-05, + "loss": 0.2049, + "step": 4990 + }, + { + "epoch": 0.5079752108097125, + "grad_norm": 3.859375, + "learning_rate": 2.477026036536688e-05, + "loss": 0.2093, + "step": 5000 + }, + { + "epoch": 0.5089911612313319, + "grad_norm": 1.09375, + "learning_rate": 2.4689653570468677e-05, + "loss": 0.164, + "step": 5010 + }, + { + "epoch": 0.5100071116529513, + "grad_norm": 3.40625, + "learning_rate": 2.460905000228628e-05, + "loss": 0.1649, + "step": 5020 + }, + { + "epoch": 0.5110230620745707, + "grad_norm": 3.546875, + "learning_rate": 2.4528450498866428e-05, + "loss": 0.1777, + "step": 5030 + }, + { + "epoch": 0.5120390124961902, + "grad_norm": 3.0, + "learning_rate": 2.444785589821356e-05, + "loss": 0.1505, + "step": 5040 + }, + { + "epoch": 0.5130549629178096, + "grad_norm": 1.6484375, + "learning_rate": 2.436726703828118e-05, + "loss": 0.2672, + "step": 5050 + }, + { + "epoch": 0.514070913339429, + "grad_norm": 4.34375, + "learning_rate": 2.428668475696308e-05, + "loss": 0.1756, + "step": 5060 + }, + { + "epoch": 0.5150868637610485, + "grad_norm": 2.78125, + "learning_rate": 2.420610989208465e-05, + "loss": 0.1655, + "step": 5070 + }, + { + "epoch": 0.5161028141826679, + "grad_norm": 1.4609375, + "learning_rate": 2.412554328139419e-05, + "loss": 0.1579, + "step": 5080 + }, + { + "epoch": 0.5171187646042873, + "grad_norm": 2.28125, + "learning_rate": 2.404498576255416e-05, + "loss": 0.1599, + "step": 5090 + }, + { + "epoch": 0.5181347150259067, + "grad_norm": 0.6484375, + "learning_rate": 2.3964438173132522e-05, + "loss": 0.1508, + "step": 5100 + }, + { + "epoch": 0.5191506654475262, + "grad_norm": 3.390625, + "learning_rate": 2.388390135059395e-05, + "loss": 0.1578, + "step": 5110 + }, + { + "epoch": 0.5201666158691456, + "grad_norm": 1.21875, + "learning_rate": 2.3803376132291226e-05, + "loss": 0.1374, + "step": 5120 + }, + { + "epoch": 0.521182566290765, + "grad_norm": 4.0625, + "learning_rate": 2.3722863355456436e-05, + "loss": 0.1854, + "step": 5130 + }, + { + "epoch": 0.5221985167123845, + "grad_norm": 4.71875, + "learning_rate": 2.364236385719236e-05, + "loss": 0.1391, + "step": 5140 + }, + { + "epoch": 0.5232144671340039, + "grad_norm": 3.296875, + "learning_rate": 2.356187847446366e-05, + "loss": 0.2106, + "step": 5150 + }, + { + "epoch": 0.5242304175556233, + "grad_norm": 3.296875, + "learning_rate": 2.348140804408829e-05, + "loss": 0.2383, + "step": 5160 + }, + { + "epoch": 0.5252463679772427, + "grad_norm": 3.359375, + "learning_rate": 2.3400953402728713e-05, + "loss": 0.1537, + "step": 5170 + }, + { + "epoch": 0.5262623183988622, + "grad_norm": 1.4921875, + "learning_rate": 2.332051538688322e-05, + "loss": 0.1841, + "step": 5180 + }, + { + "epoch": 0.5272782688204816, + "grad_norm": 3.25, + "learning_rate": 2.3240094832877287e-05, + "loss": 0.1855, + "step": 5190 + }, + { + "epoch": 0.528294219242101, + "grad_norm": 3.34375, + "learning_rate": 2.3159692576854793e-05, + "loss": 0.2625, + "step": 5200 + }, + { + "epoch": 0.5293101696637205, + "grad_norm": 3.6875, + "learning_rate": 2.3079309454769413e-05, + "loss": 0.1292, + "step": 5210 + }, + { + "epoch": 0.5303261200853399, + "grad_norm": 1.1171875, + "learning_rate": 2.2998946302375827e-05, + "loss": 0.1263, + "step": 5220 + }, + { + "epoch": 0.5313420705069593, + "grad_norm": 2.71875, + "learning_rate": 2.2918603955221148e-05, + "loss": 0.2296, + "step": 5230 + }, + { + "epoch": 0.5323580209285786, + "grad_norm": 2.015625, + "learning_rate": 2.283828324863613e-05, + "loss": 0.1231, + "step": 5240 + }, + { + "epoch": 0.5333739713501982, + "grad_norm": 3.671875, + "learning_rate": 2.2757985017726557e-05, + "loss": 0.1939, + "step": 5250 + }, + { + "epoch": 0.5343899217718175, + "grad_norm": 1.9765625, + "learning_rate": 2.2677710097364495e-05, + "loss": 0.168, + "step": 5260 + }, + { + "epoch": 0.5354058721934369, + "grad_norm": 2.609375, + "learning_rate": 2.259745932217969e-05, + "loss": 0.1883, + "step": 5270 + }, + { + "epoch": 0.5364218226150563, + "grad_norm": 2.8125, + "learning_rate": 2.2517233526550817e-05, + "loss": 0.1898, + "step": 5280 + }, + { + "epoch": 0.5374377730366758, + "grad_norm": 3.125, + "learning_rate": 2.2437033544596837e-05, + "loss": 0.1838, + "step": 5290 + }, + { + "epoch": 0.5384537234582952, + "grad_norm": 4.90625, + "learning_rate": 2.2356860210168336e-05, + "loss": 0.1553, + "step": 5300 + }, + { + "epoch": 0.5394696738799146, + "grad_norm": 3.171875, + "learning_rate": 2.2276714356838824e-05, + "loss": 0.2248, + "step": 5310 + }, + { + "epoch": 0.5404856243015341, + "grad_norm": 1.34375, + "learning_rate": 2.2196596817896118e-05, + "loss": 0.1421, + "step": 5320 + }, + { + "epoch": 0.5415015747231535, + "grad_norm": 3.28125, + "learning_rate": 2.2116508426333596e-05, + "loss": 0.1947, + "step": 5330 + }, + { + "epoch": 0.5425175251447729, + "grad_norm": 1.9296875, + "learning_rate": 2.2036450014841652e-05, + "loss": 0.2207, + "step": 5340 + }, + { + "epoch": 0.5435334755663923, + "grad_norm": 0.5703125, + "learning_rate": 2.19564224157989e-05, + "loss": 0.2208, + "step": 5350 + }, + { + "epoch": 0.5445494259880118, + "grad_norm": 7.5625, + "learning_rate": 2.1876426461263654e-05, + "loss": 0.1739, + "step": 5360 + }, + { + "epoch": 0.5455653764096312, + "grad_norm": 2.15625, + "learning_rate": 2.179646298296519e-05, + "loss": 0.1938, + "step": 5370 + }, + { + "epoch": 0.5465813268312506, + "grad_norm": 4.1875, + "learning_rate": 2.171653281229511e-05, + "loss": 0.1736, + "step": 5380 + }, + { + "epoch": 0.5475972772528701, + "grad_norm": 4.65625, + "learning_rate": 2.1636636780298732e-05, + "loss": 0.2167, + "step": 5390 + }, + { + "epoch": 0.5486132276744895, + "grad_norm": 1.84375, + "learning_rate": 2.1556775717666427e-05, + "loss": 0.1711, + "step": 5400 + }, + { + "epoch": 0.5496291780961089, + "grad_norm": 5.125, + "learning_rate": 2.147695045472499e-05, + "loss": 0.1789, + "step": 5410 + }, + { + "epoch": 0.5506451285177283, + "grad_norm": 3.859375, + "learning_rate": 2.1397161821428973e-05, + "loss": 0.2187, + "step": 5420 + }, + { + "epoch": 0.5516610789393478, + "grad_norm": 2.25, + "learning_rate": 2.131741064735212e-05, + "loss": 0.1367, + "step": 5430 + }, + { + "epoch": 0.5526770293609672, + "grad_norm": 4.65625, + "learning_rate": 2.1237697761678684e-05, + "loss": 0.1574, + "step": 5440 + }, + { + "epoch": 0.5536929797825866, + "grad_norm": 1.2265625, + "learning_rate": 2.1158023993194848e-05, + "loss": 0.1301, + "step": 5450 + }, + { + "epoch": 0.5547089302042061, + "grad_norm": 4.21875, + "learning_rate": 2.107839017028005e-05, + "loss": 0.2782, + "step": 5460 + }, + { + "epoch": 0.5557248806258255, + "grad_norm": 0.52734375, + "learning_rate": 2.0998797120898457e-05, + "loss": 0.2024, + "step": 5470 + }, + { + "epoch": 0.5567408310474449, + "grad_norm": 1.46875, + "learning_rate": 2.0919245672590277e-05, + "loss": 0.1755, + "step": 5480 + }, + { + "epoch": 0.5577567814690643, + "grad_norm": 2.140625, + "learning_rate": 2.083973665246318e-05, + "loss": 0.2058, + "step": 5490 + }, + { + "epoch": 0.5587727318906838, + "grad_norm": 1.5390625, + "learning_rate": 2.076027088718373e-05, + "loss": 0.2159, + "step": 5500 + }, + { + "epoch": 0.5597886823123032, + "grad_norm": 1.9921875, + "learning_rate": 2.0680849202968743e-05, + "loss": 0.2139, + "step": 5510 + }, + { + "epoch": 0.5608046327339226, + "grad_norm": 2.4375, + "learning_rate": 2.060147242557674e-05, + "loss": 0.183, + "step": 5520 + }, + { + "epoch": 0.5618205831555421, + "grad_norm": 5.5, + "learning_rate": 2.0522141380299308e-05, + "loss": 0.1673, + "step": 5530 + }, + { + "epoch": 0.5628365335771615, + "grad_norm": 4.25, + "learning_rate": 2.044285689195258e-05, + "loss": 0.1674, + "step": 5540 + }, + { + "epoch": 0.5638524839987809, + "grad_norm": 2.109375, + "learning_rate": 2.0363619784868604e-05, + "loss": 0.1531, + "step": 5550 + }, + { + "epoch": 0.5648684344204002, + "grad_norm": 2.59375, + "learning_rate": 2.0284430882886836e-05, + "loss": 0.1665, + "step": 5560 + }, + { + "epoch": 0.5658843848420197, + "grad_norm": 3.984375, + "learning_rate": 2.020529100934549e-05, + "loss": 0.1717, + "step": 5570 + }, + { + "epoch": 0.5669003352636391, + "grad_norm": 1.6015625, + "learning_rate": 2.012620098707306e-05, + "loss": 0.1167, + "step": 5580 + }, + { + "epoch": 0.5679162856852585, + "grad_norm": 6.0625, + "learning_rate": 2.004716163837972e-05, + "loss": 0.2084, + "step": 5590 + }, + { + "epoch": 0.5689322361068779, + "grad_norm": 2.5625, + "learning_rate": 1.996817378504876e-05, + "loss": 0.1939, + "step": 5600 + }, + { + "epoch": 0.5699481865284974, + "grad_norm": 3.109375, + "learning_rate": 1.9889238248328108e-05, + "loss": 0.1241, + "step": 5610 + }, + { + "epoch": 0.5709641369501168, + "grad_norm": 4.875, + "learning_rate": 1.981035584892171e-05, + "loss": 0.1865, + "step": 5620 + }, + { + "epoch": 0.5719800873717362, + "grad_norm": 2.984375, + "learning_rate": 1.9731527406981072e-05, + "loss": 0.1639, + "step": 5630 + }, + { + "epoch": 0.5729960377933557, + "grad_norm": 4.4375, + "learning_rate": 1.9652753742096655e-05, + "loss": 0.2019, + "step": 5640 + }, + { + "epoch": 0.5740119882149751, + "grad_norm": 4.3125, + "learning_rate": 1.9574035673289432e-05, + "loss": 0.1829, + "step": 5650 + }, + { + "epoch": 0.5750279386365945, + "grad_norm": 3.203125, + "learning_rate": 1.9495374019002312e-05, + "loss": 0.2267, + "step": 5660 + }, + { + "epoch": 0.5760438890582139, + "grad_norm": 1.765625, + "learning_rate": 1.9416769597091673e-05, + "loss": 0.1411, + "step": 5670 + }, + { + "epoch": 0.5770598394798334, + "grad_norm": 2.640625, + "learning_rate": 1.9338223224818818e-05, + "loss": 0.1476, + "step": 5680 + }, + { + "epoch": 0.5780757899014528, + "grad_norm": 4.84375, + "learning_rate": 1.9259735718841524e-05, + "loss": 0.1417, + "step": 5690 + }, + { + "epoch": 0.5790917403230722, + "grad_norm": 2.421875, + "learning_rate": 1.918130789520551e-05, + "loss": 0.1592, + "step": 5700 + }, + { + "epoch": 0.5801076907446917, + "grad_norm": 2.984375, + "learning_rate": 1.9102940569335963e-05, + "loss": 0.161, + "step": 5710 + }, + { + "epoch": 0.5811236411663111, + "grad_norm": 1.0234375, + "learning_rate": 1.9024634556029093e-05, + "loss": 0.1614, + "step": 5720 + }, + { + "epoch": 0.5821395915879305, + "grad_norm": 2.90625, + "learning_rate": 1.89463906694436e-05, + "loss": 0.1505, + "step": 5730 + }, + { + "epoch": 0.5831555420095499, + "grad_norm": 2.875, + "learning_rate": 1.8868209723092286e-05, + "loss": 0.1674, + "step": 5740 + }, + { + "epoch": 0.5841714924311694, + "grad_norm": 0.408203125, + "learning_rate": 1.8790092529833508e-05, + "loss": 0.1468, + "step": 5750 + }, + { + "epoch": 0.5851874428527888, + "grad_norm": 5.1875, + "learning_rate": 1.871203990186281e-05, + "loss": 0.1903, + "step": 5760 + }, + { + "epoch": 0.5862033932744082, + "grad_norm": 0.5546875, + "learning_rate": 1.8634052650704415e-05, + "loss": 0.2644, + "step": 5770 + }, + { + "epoch": 0.5872193436960277, + "grad_norm": 3.203125, + "learning_rate": 1.8556131587202848e-05, + "loss": 0.1968, + "step": 5780 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 2.484375, + "learning_rate": 1.8478277521514424e-05, + "loss": 0.2249, + "step": 5790 + }, + { + "epoch": 0.5892512445392665, + "grad_norm": 4.0, + "learning_rate": 1.8400491263098906e-05, + "loss": 0.1881, + "step": 5800 + }, + { + "epoch": 0.5902671949608859, + "grad_norm": 1.90625, + "learning_rate": 1.832277362071106e-05, + "loss": 0.1352, + "step": 5810 + }, + { + "epoch": 0.5912831453825054, + "grad_norm": 2.765625, + "learning_rate": 1.824512540239221e-05, + "loss": 0.2737, + "step": 5820 + }, + { + "epoch": 0.5922990958041248, + "grad_norm": 2.609375, + "learning_rate": 1.81675474154619e-05, + "loss": 0.1566, + "step": 5830 + }, + { + "epoch": 0.5933150462257442, + "grad_norm": 2.6875, + "learning_rate": 1.8090040466509444e-05, + "loss": 0.1999, + "step": 5840 + }, + { + "epoch": 0.5943309966473637, + "grad_norm": 2.609375, + "learning_rate": 1.8012605361385592e-05, + "loss": 0.2372, + "step": 5850 + }, + { + "epoch": 0.5953469470689831, + "grad_norm": 8.125, + "learning_rate": 1.7935242905194087e-05, + "loss": 0.2411, + "step": 5860 + }, + { + "epoch": 0.5963628974906025, + "grad_norm": 3.46875, + "learning_rate": 1.785795390228336e-05, + "loss": 0.138, + "step": 5870 + }, + { + "epoch": 0.5973788479122218, + "grad_norm": 2.3125, + "learning_rate": 1.7780739156238125e-05, + "loss": 0.1867, + "step": 5880 + }, + { + "epoch": 0.5983947983338413, + "grad_norm": 4.0625, + "learning_rate": 1.770359946987105e-05, + "loss": 0.2091, + "step": 5890 + }, + { + "epoch": 0.5994107487554607, + "grad_norm": 5.21875, + "learning_rate": 1.7626535645214378e-05, + "loss": 0.2091, + "step": 5900 + }, + { + "epoch": 0.6004266991770801, + "grad_norm": 3.15625, + "learning_rate": 1.7549548483511614e-05, + "loss": 0.1927, + "step": 5910 + }, + { + "epoch": 0.6014426495986995, + "grad_norm": 4.71875, + "learning_rate": 1.7472638785209198e-05, + "loss": 0.1893, + "step": 5920 + }, + { + "epoch": 0.602458600020319, + "grad_norm": 3.015625, + "learning_rate": 1.7395807349948145e-05, + "loss": 0.1557, + "step": 5930 + }, + { + "epoch": 0.6034745504419384, + "grad_norm": 2.9375, + "learning_rate": 1.73190549765558e-05, + "loss": 0.1717, + "step": 5940 + }, + { + "epoch": 0.6044905008635578, + "grad_norm": 3.109375, + "learning_rate": 1.724238246303745e-05, + "loss": 0.1879, + "step": 5950 + }, + { + "epoch": 0.6055064512851773, + "grad_norm": 3.875, + "learning_rate": 1.71657906065681e-05, + "loss": 0.1908, + "step": 5960 + }, + { + "epoch": 0.6065224017067967, + "grad_norm": 5.09375, + "learning_rate": 1.7089280203484115e-05, + "loss": 0.1712, + "step": 5970 + }, + { + "epoch": 0.6075383521284161, + "grad_norm": 3.015625, + "learning_rate": 1.701285204927502e-05, + "loss": 0.1454, + "step": 5980 + }, + { + "epoch": 0.6085543025500355, + "grad_norm": 3.265625, + "learning_rate": 1.693650693857515e-05, + "loss": 0.2283, + "step": 5990 + }, + { + "epoch": 0.609570252971655, + "grad_norm": 3.40625, + "learning_rate": 1.6860245665155466e-05, + "loss": 0.2188, + "step": 6000 + }, + { + "epoch": 0.6105862033932744, + "grad_norm": 2.5625, + "learning_rate": 1.678406902191521e-05, + "loss": 0.1605, + "step": 6010 + }, + { + "epoch": 0.6116021538148938, + "grad_norm": 0.6796875, + "learning_rate": 1.670797780087374e-05, + "loss": 0.1472, + "step": 6020 + }, + { + "epoch": 0.6126181042365133, + "grad_norm": 2.234375, + "learning_rate": 1.6631972793162288e-05, + "loss": 0.1676, + "step": 6030 + }, + { + "epoch": 0.6136340546581327, + "grad_norm": 1.25, + "learning_rate": 1.6556054789015662e-05, + "loss": 0.1508, + "step": 6040 + }, + { + "epoch": 0.6146500050797521, + "grad_norm": 4.78125, + "learning_rate": 1.6480224577764132e-05, + "loss": 0.1981, + "step": 6050 + }, + { + "epoch": 0.6156659555013715, + "grad_norm": 3.46875, + "learning_rate": 1.6404482947825137e-05, + "loss": 0.2514, + "step": 6060 + }, + { + "epoch": 0.616681905922991, + "grad_norm": 1.265625, + "learning_rate": 1.6328830686695154e-05, + "loss": 0.2397, + "step": 6070 + }, + { + "epoch": 0.6176978563446104, + "grad_norm": 1.953125, + "learning_rate": 1.625326858094144e-05, + "loss": 0.1523, + "step": 6080 + }, + { + "epoch": 0.6187138067662298, + "grad_norm": 3.484375, + "learning_rate": 1.6177797416193953e-05, + "loss": 0.218, + "step": 6090 + }, + { + "epoch": 0.6197297571878493, + "grad_norm": 3.484375, + "learning_rate": 1.6102417977137052e-05, + "loss": 0.1476, + "step": 6100 + }, + { + "epoch": 0.6207457076094687, + "grad_norm": 4.90625, + "learning_rate": 1.602713104750147e-05, + "loss": 0.1818, + "step": 6110 + }, + { + "epoch": 0.6217616580310881, + "grad_norm": 4.375, + "learning_rate": 1.5951937410056087e-05, + "loss": 0.2061, + "step": 6120 + }, + { + "epoch": 0.6227776084527075, + "grad_norm": 6.3125, + "learning_rate": 1.587683784659979e-05, + "loss": 0.1566, + "step": 6130 + }, + { + "epoch": 0.623793558874327, + "grad_norm": 2.828125, + "learning_rate": 1.58018331379534e-05, + "loss": 0.1376, + "step": 6140 + }, + { + "epoch": 0.6248095092959464, + "grad_norm": 2.40625, + "learning_rate": 1.572692406395149e-05, + "loss": 0.1655, + "step": 6150 + }, + { + "epoch": 0.6258254597175658, + "grad_norm": 4.34375, + "learning_rate": 1.5652111403434338e-05, + "loss": 0.2363, + "step": 6160 + }, + { + "epoch": 0.6268414101391853, + "grad_norm": 2.453125, + "learning_rate": 1.5577395934239757e-05, + "loss": 0.2464, + "step": 6170 + }, + { + "epoch": 0.6278573605608047, + "grad_norm": 2.53125, + "learning_rate": 1.5502778433195085e-05, + "loss": 0.1898, + "step": 6180 + }, + { + "epoch": 0.628873310982424, + "grad_norm": 2.28125, + "learning_rate": 1.5428259676109048e-05, + "loss": 0.1804, + "step": 6190 + }, + { + "epoch": 0.6298892614040434, + "grad_norm": 4.3125, + "learning_rate": 1.5353840437763732e-05, + "loss": 0.1409, + "step": 6200 + }, + { + "epoch": 0.630905211825663, + "grad_norm": 2.5625, + "learning_rate": 1.5279521491906496e-05, + "loss": 0.2449, + "step": 6210 + }, + { + "epoch": 0.6319211622472823, + "grad_norm": 3.0625, + "learning_rate": 1.520530361124195e-05, + "loss": 0.2103, + "step": 6220 + }, + { + "epoch": 0.6329371126689017, + "grad_norm": 2.609375, + "learning_rate": 1.5131187567423937e-05, + "loss": 0.2156, + "step": 6230 + }, + { + "epoch": 0.6339530630905211, + "grad_norm": 2.703125, + "learning_rate": 1.5057174131047446e-05, + "loss": 0.161, + "step": 6240 + }, + { + "epoch": 0.6349690135121406, + "grad_norm": 3.265625, + "learning_rate": 1.4983264071640679e-05, + "loss": 0.1757, + "step": 6250 + }, + { + "epoch": 0.63598496393376, + "grad_norm": 3.15625, + "learning_rate": 1.490945815765699e-05, + "loss": 0.2011, + "step": 6260 + }, + { + "epoch": 0.6370009143553794, + "grad_norm": 5.375, + "learning_rate": 1.4835757156466945e-05, + "loss": 0.1658, + "step": 6270 + }, + { + "epoch": 0.6380168647769989, + "grad_norm": 2.984375, + "learning_rate": 1.4762161834350271e-05, + "loss": 0.1754, + "step": 6280 + }, + { + "epoch": 0.6390328151986183, + "grad_norm": 2.015625, + "learning_rate": 1.4688672956487987e-05, + "loss": 0.1427, + "step": 6290 + }, + { + "epoch": 0.6400487656202377, + "grad_norm": 3.78125, + "learning_rate": 1.4615291286954352e-05, + "loss": 0.1517, + "step": 6300 + }, + { + "epoch": 0.6410647160418571, + "grad_norm": 2.859375, + "learning_rate": 1.4542017588709005e-05, + "loss": 0.2348, + "step": 6310 + }, + { + "epoch": 0.6420806664634766, + "grad_norm": 2.421875, + "learning_rate": 1.4468852623588961e-05, + "loss": 0.2089, + "step": 6320 + }, + { + "epoch": 0.643096616885096, + "grad_norm": 2.15625, + "learning_rate": 1.4395797152300719e-05, + "loss": 0.1702, + "step": 6330 + }, + { + "epoch": 0.6441125673067154, + "grad_norm": 1.53125, + "learning_rate": 1.4322851934412382e-05, + "loss": 0.1017, + "step": 6340 + }, + { + "epoch": 0.6451285177283349, + "grad_norm": 1.90625, + "learning_rate": 1.4250017728345716e-05, + "loss": 0.1813, + "step": 6350 + }, + { + "epoch": 0.6461444681499543, + "grad_norm": 2.015625, + "learning_rate": 1.4177295291368292e-05, + "loss": 0.1095, + "step": 6360 + }, + { + "epoch": 0.6471604185715737, + "grad_norm": 2.625, + "learning_rate": 1.410468537958558e-05, + "loss": 0.2259, + "step": 6370 + }, + { + "epoch": 0.6481763689931931, + "grad_norm": 3.5, + "learning_rate": 1.4032188747933136e-05, + "loss": 0.1595, + "step": 6380 + }, + { + "epoch": 0.6491923194148126, + "grad_norm": 5.21875, + "learning_rate": 1.39598061501687e-05, + "loss": 0.2226, + "step": 6390 + }, + { + "epoch": 0.650208269836432, + "grad_norm": 5.34375, + "learning_rate": 1.388753833886442e-05, + "loss": 0.2132, + "step": 6400 + }, + { + "epoch": 0.6512242202580514, + "grad_norm": 3.640625, + "learning_rate": 1.3815386065398945e-05, + "loss": 0.1227, + "step": 6410 + }, + { + "epoch": 0.6522401706796709, + "grad_norm": 1.0, + "learning_rate": 1.3743350079949705e-05, + "loss": 0.1755, + "step": 6420 + }, + { + "epoch": 0.6532561211012903, + "grad_norm": 2.359375, + "learning_rate": 1.3671431131485057e-05, + "loss": 0.1552, + "step": 6430 + }, + { + "epoch": 0.6542720715229097, + "grad_norm": 5.3125, + "learning_rate": 1.3599629967756483e-05, + "loss": 0.1917, + "step": 6440 + }, + { + "epoch": 0.6552880219445291, + "grad_norm": 4.625, + "learning_rate": 1.3527947335290877e-05, + "loss": 0.1812, + "step": 6450 + }, + { + "epoch": 0.6563039723661486, + "grad_norm": 1.234375, + "learning_rate": 1.3456383979382708e-05, + "loss": 0.1896, + "step": 6460 + }, + { + "epoch": 0.657319922787768, + "grad_norm": 3.984375, + "learning_rate": 1.3384940644086352e-05, + "loss": 0.1484, + "step": 6470 + }, + { + "epoch": 0.6583358732093874, + "grad_norm": 2.40625, + "learning_rate": 1.3313618072208268e-05, + "loss": 0.1334, + "step": 6480 + }, + { + "epoch": 0.6593518236310069, + "grad_norm": 4.375, + "learning_rate": 1.3242417005299357e-05, + "loss": 0.1351, + "step": 6490 + }, + { + "epoch": 0.6603677740526263, + "grad_norm": 2.640625, + "learning_rate": 1.31713381836472e-05, + "loss": 0.1717, + "step": 6500 + }, + { + "epoch": 0.6613837244742456, + "grad_norm": 2.640625, + "learning_rate": 1.3100382346268392e-05, + "loss": 0.1867, + "step": 6510 + }, + { + "epoch": 0.662399674895865, + "grad_norm": 1.734375, + "learning_rate": 1.3029550230900812e-05, + "loss": 0.1997, + "step": 6520 + }, + { + "epoch": 0.6634156253174845, + "grad_norm": 3.609375, + "learning_rate": 1.2958842573996016e-05, + "loss": 0.1969, + "step": 6530 + }, + { + "epoch": 0.6644315757391039, + "grad_norm": 3.578125, + "learning_rate": 1.2888260110711525e-05, + "loss": 0.1469, + "step": 6540 + }, + { + "epoch": 0.6654475261607233, + "grad_norm": 1.3515625, + "learning_rate": 1.2817803574903212e-05, + "loss": 0.1524, + "step": 6550 + }, + { + "epoch": 0.6664634765823427, + "grad_norm": 2.109375, + "learning_rate": 1.2747473699117668e-05, + "loss": 0.159, + "step": 6560 + }, + { + "epoch": 0.6674794270039622, + "grad_norm": 1.53125, + "learning_rate": 1.267727121458458e-05, + "loss": 0.1999, + "step": 6570 + }, + { + "epoch": 0.6684953774255816, + "grad_norm": 1.7265625, + "learning_rate": 1.2607196851209137e-05, + "loss": 0.2216, + "step": 6580 + }, + { + "epoch": 0.669511327847201, + "grad_norm": 3.125, + "learning_rate": 1.2537251337564412e-05, + "loss": 0.1607, + "step": 6590 + }, + { + "epoch": 0.6705272782688205, + "grad_norm": 2.421875, + "learning_rate": 1.2467435400883839e-05, + "loss": 0.2187, + "step": 6600 + }, + { + "epoch": 0.6715432286904399, + "grad_norm": 1.5078125, + "learning_rate": 1.239774976705359e-05, + "loss": 0.1753, + "step": 6610 + }, + { + "epoch": 0.6725591791120593, + "grad_norm": 1.140625, + "learning_rate": 1.2328195160605092e-05, + "loss": 0.194, + "step": 6620 + }, + { + "epoch": 0.6735751295336787, + "grad_norm": 4.9375, + "learning_rate": 1.225877230470743e-05, + "loss": 0.1485, + "step": 6630 + }, + { + "epoch": 0.6745910799552982, + "grad_norm": 3.65625, + "learning_rate": 1.218948192115988e-05, + "loss": 0.1847, + "step": 6640 + }, + { + "epoch": 0.6756070303769176, + "grad_norm": 3.875, + "learning_rate": 1.21203247303844e-05, + "loss": 0.1874, + "step": 6650 + }, + { + "epoch": 0.676622980798537, + "grad_norm": 2.65625, + "learning_rate": 1.2051301451418073e-05, + "loss": 0.2377, + "step": 6660 + }, + { + "epoch": 0.6776389312201565, + "grad_norm": 2.09375, + "learning_rate": 1.198241280190574e-05, + "loss": 0.1508, + "step": 6670 + }, + { + "epoch": 0.6786548816417759, + "grad_norm": 2.203125, + "learning_rate": 1.1913659498092431e-05, + "loss": 0.1537, + "step": 6680 + }, + { + "epoch": 0.6796708320633953, + "grad_norm": 2.484375, + "learning_rate": 1.184504225481601e-05, + "loss": 0.2339, + "step": 6690 + }, + { + "epoch": 0.6806867824850147, + "grad_norm": 5.625, + "learning_rate": 1.177656178549966e-05, + "loss": 0.2102, + "step": 6700 + }, + { + "epoch": 0.6817027329066342, + "grad_norm": 2.5, + "learning_rate": 1.1708218802144536e-05, + "loss": 0.1435, + "step": 6710 + }, + { + "epoch": 0.6827186833282536, + "grad_norm": 3.84375, + "learning_rate": 1.1640014015322323e-05, + "loss": 0.1823, + "step": 6720 + }, + { + "epoch": 0.683734633749873, + "grad_norm": 2.359375, + "learning_rate": 1.1571948134167862e-05, + "loss": 0.1154, + "step": 6730 + }, + { + "epoch": 0.6847505841714925, + "grad_norm": 2.90625, + "learning_rate": 1.1504021866371761e-05, + "loss": 0.2105, + "step": 6740 + }, + { + "epoch": 0.6857665345931119, + "grad_norm": 5.46875, + "learning_rate": 1.143623591817304e-05, + "loss": 0.1317, + "step": 6750 + }, + { + "epoch": 0.6867824850147313, + "grad_norm": 3.34375, + "learning_rate": 1.1368590994351835e-05, + "loss": 0.1406, + "step": 6760 + }, + { + "epoch": 0.6877984354363507, + "grad_norm": 3.78125, + "learning_rate": 1.130108779822198e-05, + "loss": 0.1425, + "step": 6770 + }, + { + "epoch": 0.6888143858579702, + "grad_norm": 0.77734375, + "learning_rate": 1.1233727031623783e-05, + "loss": 0.1623, + "step": 6780 + }, + { + "epoch": 0.6898303362795896, + "grad_norm": 4.625, + "learning_rate": 1.1166509394916682e-05, + "loss": 0.1591, + "step": 6790 + }, + { + "epoch": 0.690846286701209, + "grad_norm": 3.84375, + "learning_rate": 1.1099435586971982e-05, + "loss": 0.1758, + "step": 6800 + }, + { + "epoch": 0.6918622371228285, + "grad_norm": 2.4375, + "learning_rate": 1.1032506305165555e-05, + "loss": 0.1018, + "step": 6810 + }, + { + "epoch": 0.6928781875444479, + "grad_norm": 3.203125, + "learning_rate": 1.0965722245370641e-05, + "loss": 0.1485, + "step": 6820 + }, + { + "epoch": 0.6938941379660672, + "grad_norm": 0.7109375, + "learning_rate": 1.0899084101950561e-05, + "loss": 0.1762, + "step": 6830 + }, + { + "epoch": 0.6949100883876866, + "grad_norm": 1.9765625, + "learning_rate": 1.0832592567751555e-05, + "loss": 0.1402, + "step": 6840 + }, + { + "epoch": 0.6959260388093061, + "grad_norm": 1.4609375, + "learning_rate": 1.0766248334095505e-05, + "loss": 0.2278, + "step": 6850 + }, + { + "epoch": 0.6969419892309255, + "grad_norm": 3.953125, + "learning_rate": 1.0700052090772828e-05, + "loss": 0.1969, + "step": 6860 + }, + { + "epoch": 0.6979579396525449, + "grad_norm": 2.453125, + "learning_rate": 1.0634004526035249e-05, + "loss": 0.2073, + "step": 6870 + }, + { + "epoch": 0.6989738900741643, + "grad_norm": 1.6171875, + "learning_rate": 1.0568106326588645e-05, + "loss": 0.1902, + "step": 6880 + }, + { + "epoch": 0.6999898404957838, + "grad_norm": 1.2734375, + "learning_rate": 1.0502358177585953e-05, + "loss": 0.2165, + "step": 6890 + }, + { + "epoch": 0.7010057909174032, + "grad_norm": 1.671875, + "learning_rate": 1.0436760762619977e-05, + "loss": 0.1952, + "step": 6900 + }, + { + "epoch": 0.7020217413390226, + "grad_norm": 2.8125, + "learning_rate": 1.0371314763716347e-05, + "loss": 0.1422, + "step": 6910 + }, + { + "epoch": 0.7030376917606421, + "grad_norm": 2.53125, + "learning_rate": 1.0306020861326388e-05, + "loss": 0.0961, + "step": 6920 + }, + { + "epoch": 0.7040536421822615, + "grad_norm": 3.046875, + "learning_rate": 1.0240879734320068e-05, + "loss": 0.1542, + "step": 6930 + }, + { + "epoch": 0.7050695926038809, + "grad_norm": 2.859375, + "learning_rate": 1.0175892059978901e-05, + "loss": 0.1748, + "step": 6940 + }, + { + "epoch": 0.7060855430255003, + "grad_norm": 2.671875, + "learning_rate": 1.0111058513988958e-05, + "loss": 0.0819, + "step": 6950 + }, + { + "epoch": 0.7071014934471198, + "grad_norm": 3.5625, + "learning_rate": 1.0046379770433803e-05, + "loss": 0.1933, + "step": 6960 + }, + { + "epoch": 0.7081174438687392, + "grad_norm": 2.859375, + "learning_rate": 9.98185650178749e-06, + "loss": 0.1891, + "step": 6970 + }, + { + "epoch": 0.7091333942903586, + "grad_norm": 3.15625, + "learning_rate": 9.917489378907591e-06, + "loss": 0.2102, + "step": 6980 + }, + { + "epoch": 0.7101493447119781, + "grad_norm": 6.40625, + "learning_rate": 9.853279071028212e-06, + "loss": 0.1714, + "step": 6990 + }, + { + "epoch": 0.7111652951335975, + "grad_norm": 2.375, + "learning_rate": 9.78922624575303e-06, + "loss": 0.1299, + "step": 7000 + }, + { + "epoch": 0.7121812455552169, + "grad_norm": 2.078125, + "learning_rate": 9.72533156904833e-06, + "loss": 0.1914, + "step": 7010 + }, + { + "epoch": 0.7131971959768363, + "grad_norm": 3.859375, + "learning_rate": 9.661595705236137e-06, + "loss": 0.2377, + "step": 7020 + }, + { + "epoch": 0.7142131463984558, + "grad_norm": 1.171875, + "learning_rate": 9.598019316987244e-06, + "loss": 0.1851, + "step": 7030 + }, + { + "epoch": 0.7152290968200752, + "grad_norm": 1.078125, + "learning_rate": 9.53460306531439e-06, + "loss": 0.2661, + "step": 7040 + }, + { + "epoch": 0.7162450472416946, + "grad_norm": 1.6484375, + "learning_rate": 9.471347609565311e-06, + "loss": 0.1669, + "step": 7050 + }, + { + "epoch": 0.7172609976633141, + "grad_norm": 4.59375, + "learning_rate": 9.408253607415957e-06, + "loss": 0.2487, + "step": 7060 + }, + { + "epoch": 0.7182769480849335, + "grad_norm": 3.09375, + "learning_rate": 9.345321714863614e-06, + "loss": 0.186, + "step": 7070 + }, + { + "epoch": 0.7192928985065529, + "grad_norm": 6.0625, + "learning_rate": 9.282552586220075e-06, + "loss": 0.2249, + "step": 7080 + }, + { + "epoch": 0.7203088489281723, + "grad_norm": 1.5703125, + "learning_rate": 9.219946874104885e-06, + "loss": 0.1255, + "step": 7090 + }, + { + "epoch": 0.7213247993497918, + "grad_norm": 1.9453125, + "learning_rate": 9.157505229438481e-06, + "loss": 0.1999, + "step": 7100 + }, + { + "epoch": 0.7223407497714112, + "grad_norm": 5.1875, + "learning_rate": 9.095228301435518e-06, + "loss": 0.199, + "step": 7110 + }, + { + "epoch": 0.7233567001930306, + "grad_norm": 2.078125, + "learning_rate": 9.03311673759802e-06, + "loss": 0.2182, + "step": 7120 + }, + { + "epoch": 0.7243726506146501, + "grad_norm": 6.46875, + "learning_rate": 8.971171183708733e-06, + "loss": 0.1573, + "step": 7130 + }, + { + "epoch": 0.7253886010362695, + "grad_norm": 3.015625, + "learning_rate": 8.909392283824353e-06, + "loss": 0.2044, + "step": 7140 + }, + { + "epoch": 0.7264045514578888, + "grad_norm": 2.921875, + "learning_rate": 8.847780680268872e-06, + "loss": 0.11, + "step": 7150 + }, + { + "epoch": 0.7274205018795082, + "grad_norm": 2.96875, + "learning_rate": 8.786337013626853e-06, + "loss": 0.1897, + "step": 7160 + }, + { + "epoch": 0.7284364523011277, + "grad_norm": 1.7578125, + "learning_rate": 8.725061922736799e-06, + "loss": 0.153, + "step": 7170 + }, + { + "epoch": 0.7294524027227471, + "grad_norm": 1.609375, + "learning_rate": 8.663956044684532e-06, + "loss": 0.1746, + "step": 7180 + }, + { + "epoch": 0.7304683531443665, + "grad_norm": 1.9375, + "learning_rate": 8.603020014796507e-06, + "loss": 0.2284, + "step": 7190 + }, + { + "epoch": 0.7314843035659859, + "grad_norm": 1.515625, + "learning_rate": 8.542254466633273e-06, + "loss": 0.1186, + "step": 7200 + }, + { + "epoch": 0.7325002539876054, + "grad_norm": 1.671875, + "learning_rate": 8.481660031982844e-06, + "loss": 0.1971, + "step": 7210 + }, + { + "epoch": 0.7335162044092248, + "grad_norm": 1.453125, + "learning_rate": 8.421237340854157e-06, + "loss": 0.196, + "step": 7220 + }, + { + "epoch": 0.7345321548308442, + "grad_norm": 0.65234375, + "learning_rate": 8.360987021470479e-06, + "loss": 0.1724, + "step": 7230 + }, + { + "epoch": 0.7355481052524637, + "grad_norm": 2.84375, + "learning_rate": 8.300909700262929e-06, + "loss": 0.175, + "step": 7240 + }, + { + "epoch": 0.7365640556740831, + "grad_norm": 3.109375, + "learning_rate": 8.241006001863924e-06, + "loss": 0.2276, + "step": 7250 + }, + { + "epoch": 0.7375800060957025, + "grad_norm": 4.8125, + "learning_rate": 8.181276549100714e-06, + "loss": 0.2029, + "step": 7260 + }, + { + "epoch": 0.7385959565173219, + "grad_norm": 4.03125, + "learning_rate": 8.12172196298887e-06, + "loss": 0.175, + "step": 7270 + }, + { + "epoch": 0.7396119069389414, + "grad_norm": 3.046875, + "learning_rate": 8.062342862725878e-06, + "loss": 0.1662, + "step": 7280 + }, + { + "epoch": 0.7406278573605608, + "grad_norm": 3.375, + "learning_rate": 8.003139865684662e-06, + "loss": 0.1616, + "step": 7290 + }, + { + "epoch": 0.7416438077821802, + "grad_norm": 2.5625, + "learning_rate": 7.944113587407157e-06, + "loss": 0.2448, + "step": 7300 + }, + { + "epoch": 0.7426597582037997, + "grad_norm": 4.125, + "learning_rate": 7.885264641597961e-06, + "loss": 0.1618, + "step": 7310 + }, + { + "epoch": 0.7436757086254191, + "grad_norm": 3.5, + "learning_rate": 7.826593640117889e-06, + "loss": 0.1134, + "step": 7320 + }, + { + "epoch": 0.7446916590470385, + "grad_norm": 2.6875, + "learning_rate": 7.76810119297767e-06, + "loss": 0.1795, + "step": 7330 + }, + { + "epoch": 0.7457076094686579, + "grad_norm": 4.34375, + "learning_rate": 7.709787908331556e-06, + "loss": 0.2736, + "step": 7340 + }, + { + "epoch": 0.7467235598902774, + "grad_norm": 1.21875, + "learning_rate": 7.651654392471038e-06, + "loss": 0.139, + "step": 7350 + }, + { + "epoch": 0.7477395103118968, + "grad_norm": 3.578125, + "learning_rate": 7.593701249818521e-06, + "loss": 0.2023, + "step": 7360 + }, + { + "epoch": 0.7487554607335162, + "grad_norm": 2.15625, + "learning_rate": 7.535929082921048e-06, + "loss": 0.1702, + "step": 7370 + }, + { + "epoch": 0.7497714111551357, + "grad_norm": 1.96875, + "learning_rate": 7.47833849244402e-06, + "loss": 0.1835, + "step": 7380 + }, + { + "epoch": 0.7507873615767551, + "grad_norm": 2.796875, + "learning_rate": 7.420930077164959e-06, + "loss": 0.1713, + "step": 7390 + }, + { + "epoch": 0.7518033119983745, + "grad_norm": 4.46875, + "learning_rate": 7.363704433967311e-06, + "loss": 0.1906, + "step": 7400 + }, + { + "epoch": 0.7528192624199939, + "grad_norm": 1.75, + "learning_rate": 7.306662157834185e-06, + "loss": 0.1421, + "step": 7410 + }, + { + "epoch": 0.7538352128416134, + "grad_norm": 1.140625, + "learning_rate": 7.2498038418422145e-06, + "loss": 0.1793, + "step": 7420 + }, + { + "epoch": 0.7548511632632328, + "grad_norm": 2.578125, + "learning_rate": 7.193130077155374e-06, + "loss": 0.1603, + "step": 7430 + }, + { + "epoch": 0.7558671136848522, + "grad_norm": 4.3125, + "learning_rate": 7.13664145301883e-06, + "loss": 0.2169, + "step": 7440 + }, + { + "epoch": 0.7568830641064717, + "grad_norm": 3.078125, + "learning_rate": 7.0803385567528025e-06, + "loss": 0.1685, + "step": 7450 + }, + { + "epoch": 0.757899014528091, + "grad_norm": 3.5625, + "learning_rate": 7.024221973746495e-06, + "loss": 0.2282, + "step": 7460 + }, + { + "epoch": 0.7589149649497104, + "grad_norm": 2.265625, + "learning_rate": 6.968292287451961e-06, + "loss": 0.1786, + "step": 7470 + }, + { + "epoch": 0.7599309153713298, + "grad_norm": 4.71875, + "learning_rate": 6.912550079378091e-06, + "loss": 0.1811, + "step": 7480 + }, + { + "epoch": 0.7609468657929493, + "grad_norm": 2.328125, + "learning_rate": 6.856995929084506e-06, + "loss": 0.1747, + "step": 7490 + }, + { + "epoch": 0.7619628162145687, + "grad_norm": 5.21875, + "learning_rate": 6.801630414175589e-06, + "loss": 0.2028, + "step": 7500 + }, + { + "epoch": 0.7629787666361881, + "grad_norm": 3.78125, + "learning_rate": 6.746454110294451e-06, + "loss": 0.2255, + "step": 7510 + }, + { + "epoch": 0.7639947170578075, + "grad_norm": 1.625, + "learning_rate": 6.691467591116931e-06, + "loss": 0.1604, + "step": 7520 + }, + { + "epoch": 0.765010667479427, + "grad_norm": 1.7734375, + "learning_rate": 6.6366714283456755e-06, + "loss": 0.2559, + "step": 7530 + }, + { + "epoch": 0.7660266179010464, + "grad_norm": 4.59375, + "learning_rate": 6.582066191704142e-06, + "loss": 0.2034, + "step": 7540 + }, + { + "epoch": 0.7670425683226658, + "grad_norm": 1.578125, + "learning_rate": 6.527652448930724e-06, + "loss": 0.148, + "step": 7550 + }, + { + "epoch": 0.7680585187442853, + "grad_norm": 1.7109375, + "learning_rate": 6.4734307657728e-06, + "loss": 0.1811, + "step": 7560 + }, + { + "epoch": 0.7690744691659047, + "grad_norm": 1.2734375, + "learning_rate": 6.419401705980924e-06, + "loss": 0.1407, + "step": 7570 + }, + { + "epoch": 0.7700904195875241, + "grad_norm": 2.25, + "learning_rate": 6.365565831302869e-06, + "loss": 0.1893, + "step": 7580 + }, + { + "epoch": 0.7711063700091435, + "grad_norm": 1.625, + "learning_rate": 6.311923701477854e-06, + "loss": 0.1835, + "step": 7590 + }, + { + "epoch": 0.772122320430763, + "grad_norm": 2.375, + "learning_rate": 6.258475874230713e-06, + "loss": 0.1579, + "step": 7600 + }, + { + "epoch": 0.7731382708523824, + "grad_norm": 4.5, + "learning_rate": 6.205222905266067e-06, + "loss": 0.1794, + "step": 7610 + }, + { + "epoch": 0.7741542212740018, + "grad_norm": 4.25, + "learning_rate": 6.152165348262598e-06, + "loss": 0.1477, + "step": 7620 + }, + { + "epoch": 0.7751701716956213, + "grad_norm": 1.9765625, + "learning_rate": 6.0993037548672246e-06, + "loss": 0.2396, + "step": 7630 + }, + { + "epoch": 0.7761861221172407, + "grad_norm": 2.671875, + "learning_rate": 6.046638674689454e-06, + "loss": 0.1717, + "step": 7640 + }, + { + "epoch": 0.7772020725388601, + "grad_norm": 3.671875, + "learning_rate": 5.994170655295567e-06, + "loss": 0.2646, + "step": 7650 + }, + { + "epoch": 0.7782180229604795, + "grad_norm": 1.3046875, + "learning_rate": 5.9419002422030106e-06, + "loss": 0.1553, + "step": 7660 + }, + { + "epoch": 0.779233973382099, + "grad_norm": 3.734375, + "learning_rate": 5.889827978874665e-06, + "loss": 0.1854, + "step": 7670 + }, + { + "epoch": 0.7802499238037184, + "grad_norm": 2.140625, + "learning_rate": 5.837954406713245e-06, + "loss": 0.1857, + "step": 7680 + }, + { + "epoch": 0.7812658742253378, + "grad_norm": 3.34375, + "learning_rate": 5.786280065055619e-06, + "loss": 0.1797, + "step": 7690 + }, + { + "epoch": 0.7822818246469573, + "grad_norm": 0.97265625, + "learning_rate": 5.734805491167244e-06, + "loss": 0.1488, + "step": 7700 + }, + { + "epoch": 0.7832977750685767, + "grad_norm": 2.078125, + "learning_rate": 5.683531220236576e-06, + "loss": 0.1688, + "step": 7710 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 3.046875, + "learning_rate": 5.632457785369455e-06, + "loss": 0.1503, + "step": 7720 + }, + { + "epoch": 0.7853296759118155, + "grad_norm": 1.6875, + "learning_rate": 5.581585717583637e-06, + "loss": 0.1658, + "step": 7730 + }, + { + "epoch": 0.786345626333435, + "grad_norm": 3.421875, + "learning_rate": 5.530915545803209e-06, + "loss": 0.2112, + "step": 7740 + }, + { + "epoch": 0.7873615767550544, + "grad_norm": 4.1875, + "learning_rate": 5.480447796853141e-06, + "loss": 0.165, + "step": 7750 + }, + { + "epoch": 0.7883775271766738, + "grad_norm": 5.3125, + "learning_rate": 5.430182995453756e-06, + "loss": 0.1499, + "step": 7760 + }, + { + "epoch": 0.7893934775982933, + "grad_norm": 2.1875, + "learning_rate": 5.380121664215329e-06, + "loss": 0.1559, + "step": 7770 + }, + { + "epoch": 0.7904094280199127, + "grad_norm": 1.46875, + "learning_rate": 5.330264323632611e-06, + "loss": 0.2098, + "step": 7780 + }, + { + "epoch": 0.791425378441532, + "grad_norm": 4.65625, + "learning_rate": 5.280611492079449e-06, + "loss": 0.1776, + "step": 7790 + }, + { + "epoch": 0.7924413288631514, + "grad_norm": 1.3359375, + "learning_rate": 5.231163685803361e-06, + "loss": 0.1497, + "step": 7800 + }, + { + "epoch": 0.7934572792847709, + "grad_norm": 2.640625, + "learning_rate": 5.181921418920191e-06, + "loss": 0.12, + "step": 7810 + }, + { + "epoch": 0.7944732297063903, + "grad_norm": 2.328125, + "learning_rate": 5.13288520340878e-06, + "loss": 0.1981, + "step": 7820 + }, + { + "epoch": 0.7954891801280097, + "grad_norm": 3.0625, + "learning_rate": 5.084055549105596e-06, + "loss": 0.1389, + "step": 7830 + }, + { + "epoch": 0.7965051305496291, + "grad_norm": 2.796875, + "learning_rate": 5.035432963699479e-06, + "loss": 0.2293, + "step": 7840 + }, + { + "epoch": 0.7975210809712486, + "grad_norm": 5.0625, + "learning_rate": 4.98701795272635e-06, + "loss": 0.1618, + "step": 7850 + }, + { + "epoch": 0.798537031392868, + "grad_norm": 5.09375, + "learning_rate": 4.938811019563938e-06, + "loss": 0.1755, + "step": 7860 + }, + { + "epoch": 0.7995529818144874, + "grad_norm": 2.140625, + "learning_rate": 4.8908126654265475e-06, + "loss": 0.1565, + "step": 7870 + }, + { + "epoch": 0.8005689322361069, + "grad_norm": 0.76171875, + "learning_rate": 4.843023389359885e-06, + "loss": 0.2176, + "step": 7880 + }, + { + "epoch": 0.8015848826577263, + "grad_norm": 2.625, + "learning_rate": 4.79544368823581e-06, + "loss": 0.2013, + "step": 7890 + }, + { + "epoch": 0.8026008330793457, + "grad_norm": 2.078125, + "learning_rate": 4.748074056747234e-06, + "loss": 0.1246, + "step": 7900 + }, + { + "epoch": 0.8036167835009651, + "grad_norm": 3.5, + "learning_rate": 4.700914987402919e-06, + "loss": 0.1638, + "step": 7910 + }, + { + "epoch": 0.8046327339225846, + "grad_norm": 3.4375, + "learning_rate": 4.6539669705223916e-06, + "loss": 0.2213, + "step": 7920 + }, + { + "epoch": 0.805648684344204, + "grad_norm": 2.96875, + "learning_rate": 4.607230494230849e-06, + "loss": 0.1822, + "step": 7930 + }, + { + "epoch": 0.8066646347658234, + "grad_norm": 2.359375, + "learning_rate": 4.560706044454047e-06, + "loss": 0.1763, + "step": 7940 + }, + { + "epoch": 0.8076805851874429, + "grad_norm": 4.59375, + "learning_rate": 4.514394104913291e-06, + "loss": 0.234, + "step": 7950 + }, + { + "epoch": 0.8086965356090623, + "grad_norm": 1.96875, + "learning_rate": 4.468295157120372e-06, + "loss": 0.1939, + "step": 7960 + }, + { + "epoch": 0.8097124860306817, + "grad_norm": 2.578125, + "learning_rate": 4.422409680372594e-06, + "loss": 0.174, + "step": 7970 + }, + { + "epoch": 0.8107284364523011, + "grad_norm": 4.5625, + "learning_rate": 4.3767381517477505e-06, + "loss": 0.2375, + "step": 7980 + }, + { + "epoch": 0.8117443868739206, + "grad_norm": 0.9609375, + "learning_rate": 4.331281046099203e-06, + "loss": 0.2076, + "step": 7990 + }, + { + "epoch": 0.81276033729554, + "grad_norm": 6.0625, + "learning_rate": 4.286038836050929e-06, + "loss": 0.2504, + "step": 8000 + }, + { + "epoch": 0.8137762877171594, + "grad_norm": 3.484375, + "learning_rate": 4.241011991992586e-06, + "loss": 0.2102, + "step": 8010 + }, + { + "epoch": 0.8147922381387789, + "grad_norm": 1.9765625, + "learning_rate": 4.1962009820746635e-06, + "loss": 0.1846, + "step": 8020 + }, + { + "epoch": 0.8158081885603983, + "grad_norm": 1.875, + "learning_rate": 4.15160627220357e-06, + "loss": 0.1741, + "step": 8030 + }, + { + "epoch": 0.8168241389820177, + "grad_norm": 5.5625, + "learning_rate": 4.107228326036838e-06, + "loss": 0.2078, + "step": 8040 + }, + { + "epoch": 0.8178400894036371, + "grad_norm": 1.7578125, + "learning_rate": 4.063067604978252e-06, + "loss": 0.212, + "step": 8050 + }, + { + "epoch": 0.8188560398252566, + "grad_norm": 4.09375, + "learning_rate": 4.019124568173094e-06, + "loss": 0.1831, + "step": 8060 + }, + { + "epoch": 0.819871990246876, + "grad_norm": 6.625, + "learning_rate": 3.975399672503341e-06, + "loss": 0.2196, + "step": 8070 + }, + { + "epoch": 0.8208879406684954, + "grad_norm": 2.78125, + "learning_rate": 3.931893372582943e-06, + "loss": 0.2002, + "step": 8080 + }, + { + "epoch": 0.8219038910901149, + "grad_norm": 6.90625, + "learning_rate": 3.888606120753047e-06, + "loss": 0.2138, + "step": 8090 + }, + { + "epoch": 0.8229198415117343, + "grad_norm": 4.09375, + "learning_rate": 3.845538367077362e-06, + "loss": 0.2593, + "step": 8100 + }, + { + "epoch": 0.8239357919333536, + "grad_norm": 1.859375, + "learning_rate": 3.8026905593374213e-06, + "loss": 0.2062, + "step": 8110 + }, + { + "epoch": 0.824951742354973, + "grad_norm": 4.3125, + "learning_rate": 3.760063143027945e-06, + "loss": 0.1343, + "step": 8120 + }, + { + "epoch": 0.8259676927765925, + "grad_norm": 1.984375, + "learning_rate": 3.7176565613522313e-06, + "loss": 0.2494, + "step": 8130 + }, + { + "epoch": 0.8269836431982119, + "grad_norm": 3.71875, + "learning_rate": 3.675471255217516e-06, + "loss": 0.1502, + "step": 8140 + }, + { + "epoch": 0.8279995936198313, + "grad_norm": 2.359375, + "learning_rate": 3.6335076632304175e-06, + "loss": 0.1256, + "step": 8150 + }, + { + "epoch": 0.8290155440414507, + "grad_norm": 1.46875, + "learning_rate": 3.5917662216923332e-06, + "loss": 0.1709, + "step": 8160 + }, + { + "epoch": 0.8300314944630702, + "grad_norm": 2.78125, + "learning_rate": 3.550247364594958e-06, + "loss": 0.1881, + "step": 8170 + }, + { + "epoch": 0.8310474448846896, + "grad_norm": 1.0703125, + "learning_rate": 3.508951523615725e-06, + "loss": 0.1998, + "step": 8180 + }, + { + "epoch": 0.832063395306309, + "grad_norm": 2.40625, + "learning_rate": 3.467879128113352e-06, + "loss": 0.2429, + "step": 8190 + }, + { + "epoch": 0.8330793457279285, + "grad_norm": 2.609375, + "learning_rate": 3.427030605123352e-06, + "loss": 0.1942, + "step": 8200 + }, + { + "epoch": 0.8340952961495479, + "grad_norm": 1.6015625, + "learning_rate": 3.3864063793536043e-06, + "loss": 0.1898, + "step": 8210 + }, + { + "epoch": 0.8351112465711673, + "grad_norm": 5.375, + "learning_rate": 3.3460068731799577e-06, + "loss": 0.1919, + "step": 8220 + }, + { + "epoch": 0.8361271969927867, + "grad_norm": 3.3125, + "learning_rate": 3.3058325066417818e-06, + "loss": 0.1516, + "step": 8230 + }, + { + "epoch": 0.8371431474144062, + "grad_norm": 0.76171875, + "learning_rate": 3.26588369743768e-06, + "loss": 0.1068, + "step": 8240 + }, + { + "epoch": 0.8381590978360256, + "grad_norm": 3.171875, + "learning_rate": 3.2261608609210653e-06, + "loss": 0.1203, + "step": 8250 + }, + { + "epoch": 0.839175048257645, + "grad_norm": 2.359375, + "learning_rate": 3.186664410095913e-06, + "loss": 0.2172, + "step": 8260 + }, + { + "epoch": 0.8401909986792645, + "grad_norm": 3.328125, + "learning_rate": 3.1473947556124093e-06, + "loss": 0.1249, + "step": 8270 + }, + { + "epoch": 0.8412069491008839, + "grad_norm": 2.484375, + "learning_rate": 3.1083523057627213e-06, + "loss": 0.1744, + "step": 8280 + }, + { + "epoch": 0.8422228995225033, + "grad_norm": 4.46875, + "learning_rate": 3.0695374664767353e-06, + "loss": 0.1772, + "step": 8290 + }, + { + "epoch": 0.8432388499441227, + "grad_norm": 0.59375, + "learning_rate": 3.0309506413178397e-06, + "loss": 0.2302, + "step": 8300 + }, + { + "epoch": 0.8442548003657422, + "grad_norm": 2.390625, + "learning_rate": 2.9925922314787136e-06, + "loss": 0.1635, + "step": 8310 + }, + { + "epoch": 0.8452707507873616, + "grad_norm": 2.34375, + "learning_rate": 2.954462635777194e-06, + "loss": 0.1573, + "step": 8320 + }, + { + "epoch": 0.846286701208981, + "grad_norm": 2.015625, + "learning_rate": 2.916562250652083e-06, + "loss": 0.1608, + "step": 8330 + }, + { + "epoch": 0.8473026516306005, + "grad_norm": 4.125, + "learning_rate": 2.878891470159048e-06, + "loss": 0.184, + "step": 8340 + }, + { + "epoch": 0.8483186020522199, + "grad_norm": 2.515625, + "learning_rate": 2.8414506859665514e-06, + "loss": 0.2141, + "step": 8350 + }, + { + "epoch": 0.8493345524738393, + "grad_norm": 3.375, + "learning_rate": 2.8042402873517197e-06, + "loss": 0.1729, + "step": 8360 + }, + { + "epoch": 0.8503505028954587, + "grad_norm": 3.078125, + "learning_rate": 2.76726066119635e-06, + "loss": 0.2252, + "step": 8370 + }, + { + "epoch": 0.8513664533170782, + "grad_norm": 1.5390625, + "learning_rate": 2.730512191982845e-06, + "loss": 0.1644, + "step": 8380 + }, + { + "epoch": 0.8523824037386976, + "grad_norm": 1.9296875, + "learning_rate": 2.693995261790261e-06, + "loss": 0.1822, + "step": 8390 + }, + { + "epoch": 0.853398354160317, + "grad_norm": 3.3125, + "learning_rate": 2.657710250290285e-06, + "loss": 0.2068, + "step": 8400 + }, + { + "epoch": 0.8544143045819365, + "grad_norm": 0.640625, + "learning_rate": 2.621657534743327e-06, + "loss": 0.1224, + "step": 8410 + }, + { + "epoch": 0.8554302550035559, + "grad_norm": 3.421875, + "learning_rate": 2.5858374899945804e-06, + "loss": 0.179, + "step": 8420 + }, + { + "epoch": 0.8564462054251752, + "grad_norm": 3.484375, + "learning_rate": 2.550250488470135e-06, + "loss": 0.1873, + "step": 8430 + }, + { + "epoch": 0.8574621558467946, + "grad_norm": 3.984375, + "learning_rate": 2.5148969001730806e-06, + "loss": 0.1799, + "step": 8440 + }, + { + "epoch": 0.8584781062684141, + "grad_norm": 1.375, + "learning_rate": 2.4797770926796858e-06, + "loss": 0.176, + "step": 8450 + }, + { + "epoch": 0.8594940566900335, + "grad_norm": 1.8984375, + "learning_rate": 2.444891431135571e-06, + "loss": 0.1664, + "step": 8460 + }, + { + "epoch": 0.8605100071116529, + "grad_norm": 4.15625, + "learning_rate": 2.4102402782518936e-06, + "loss": 0.1512, + "step": 8470 + }, + { + "epoch": 0.8615259575332723, + "grad_norm": 1.34375, + "learning_rate": 2.3758239943016096e-06, + "loss": 0.1629, + "step": 8480 + }, + { + "epoch": 0.8625419079548918, + "grad_norm": 5.3125, + "learning_rate": 2.3416429371157013e-06, + "loss": 0.2099, + "step": 8490 + }, + { + "epoch": 0.8635578583765112, + "grad_norm": 5.9375, + "learning_rate": 2.307697462079464e-06, + "loss": 0.2221, + "step": 8500 + }, + { + "epoch": 0.8645738087981306, + "grad_norm": 5.4375, + "learning_rate": 2.273987922128809e-06, + "loss": 0.2191, + "step": 8510 + }, + { + "epoch": 0.8655897592197501, + "grad_norm": 2.171875, + "learning_rate": 2.240514667746607e-06, + "loss": 0.1843, + "step": 8520 + }, + { + "epoch": 0.8666057096413695, + "grad_norm": 2.5625, + "learning_rate": 2.2072780469590245e-06, + "loss": 0.2494, + "step": 8530 + }, + { + "epoch": 0.8676216600629889, + "grad_norm": 2.25, + "learning_rate": 2.1742784053319116e-06, + "loss": 0.1712, + "step": 8540 + }, + { + "epoch": 0.8686376104846083, + "grad_norm": 4.5625, + "learning_rate": 2.141516085967224e-06, + "loss": 0.1169, + "step": 8550 + }, + { + "epoch": 0.8696535609062278, + "grad_norm": 4.25, + "learning_rate": 2.1089914294994434e-06, + "loss": 0.1374, + "step": 8560 + }, + { + "epoch": 0.8706695113278472, + "grad_norm": 3.265625, + "learning_rate": 2.0767047740920336e-06, + "loss": 0.2162, + "step": 8570 + }, + { + "epoch": 0.8716854617494666, + "grad_norm": 1.8203125, + "learning_rate": 2.0446564554339187e-06, + "loss": 0.1593, + "step": 8580 + }, + { + "epoch": 0.8727014121710861, + "grad_norm": 2.671875, + "learning_rate": 2.0128468067360185e-06, + "loss": 0.1857, + "step": 8590 + }, + { + "epoch": 0.8737173625927055, + "grad_norm": 2.765625, + "learning_rate": 1.981276158727749e-06, + "loss": 0.1989, + "step": 8600 + }, + { + "epoch": 0.8747333130143249, + "grad_norm": 2.65625, + "learning_rate": 1.949944839653625e-06, + "loss": 0.2077, + "step": 8610 + }, + { + "epoch": 0.8757492634359443, + "grad_norm": 2.625, + "learning_rate": 1.918853175269797e-06, + "loss": 0.2003, + "step": 8620 + }, + { + "epoch": 0.8767652138575638, + "grad_norm": 0.71875, + "learning_rate": 1.8880014888407127e-06, + "loss": 0.2486, + "step": 8630 + }, + { + "epoch": 0.8777811642791832, + "grad_norm": 4.71875, + "learning_rate": 1.8573901011357336e-06, + "loss": 0.1896, + "step": 8640 + }, + { + "epoch": 0.8787971147008026, + "grad_norm": 5.0625, + "learning_rate": 1.8270193304257887e-06, + "loss": 0.1727, + "step": 8650 + }, + { + "epoch": 0.8798130651224221, + "grad_norm": 1.75, + "learning_rate": 1.7968894924800916e-06, + "loss": 0.1687, + "step": 8660 + }, + { + "epoch": 0.8808290155440415, + "grad_norm": 2.65625, + "learning_rate": 1.7670009005628291e-06, + "loss": 0.166, + "step": 8670 + }, + { + "epoch": 0.8818449659656609, + "grad_norm": 4.71875, + "learning_rate": 1.737353865429936e-06, + "loss": 0.1471, + "step": 8680 + }, + { + "epoch": 0.8828609163872803, + "grad_norm": 0.546875, + "learning_rate": 1.7079486953258283e-06, + "loss": 0.1075, + "step": 8690 + }, + { + "epoch": 0.8838768668088998, + "grad_norm": 1.640625, + "learning_rate": 1.6787856959802367e-06, + "loss": 0.2113, + "step": 8700 + }, + { + "epoch": 0.8848928172305192, + "grad_norm": 2.953125, + "learning_rate": 1.6498651706049945e-06, + "loss": 0.1412, + "step": 8710 + }, + { + "epoch": 0.8859087676521386, + "grad_norm": 3.796875, + "learning_rate": 1.6211874198909072e-06, + "loss": 0.1701, + "step": 8720 + }, + { + "epoch": 0.8869247180737581, + "grad_norm": 3.734375, + "learning_rate": 1.592752742004605e-06, + "loss": 0.1348, + "step": 8730 + }, + { + "epoch": 0.8879406684953774, + "grad_norm": 2.21875, + "learning_rate": 1.5645614325854735e-06, + "loss": 0.1931, + "step": 8740 + }, + { + "epoch": 0.8889566189169968, + "grad_norm": 3.4375, + "learning_rate": 1.5366137847425466e-06, + "loss": 0.1705, + "step": 8750 + }, + { + "epoch": 0.8899725693386162, + "grad_norm": 3.5625, + "learning_rate": 1.5089100890514769e-06, + "loss": 0.1889, + "step": 8760 + }, + { + "epoch": 0.8909885197602357, + "grad_norm": 2.65625, + "learning_rate": 1.4814506335515176e-06, + "loss": 0.1837, + "step": 8770 + }, + { + "epoch": 0.8920044701818551, + "grad_norm": 1.421875, + "learning_rate": 1.4542357037425207e-06, + "loss": 0.1728, + "step": 8780 + }, + { + "epoch": 0.8930204206034745, + "grad_norm": 1.625, + "learning_rate": 1.4272655825819713e-06, + "loss": 0.1562, + "step": 8790 + }, + { + "epoch": 0.8940363710250939, + "grad_norm": 4.0625, + "learning_rate": 1.4005405504820351e-06, + "loss": 0.1681, + "step": 8800 + }, + { + "epoch": 0.8950523214467134, + "grad_norm": 2.328125, + "learning_rate": 1.3740608853066634e-06, + "loss": 0.1449, + "step": 8810 + }, + { + "epoch": 0.8960682718683328, + "grad_norm": 4.0625, + "learning_rate": 1.347826862368684e-06, + "loss": 0.2418, + "step": 8820 + }, + { + "epoch": 0.8970842222899522, + "grad_norm": 0.55859375, + "learning_rate": 1.3218387544269545e-06, + "loss": 0.2473, + "step": 8830 + }, + { + "epoch": 0.8981001727115717, + "grad_norm": 4.78125, + "learning_rate": 1.2960968316835132e-06, + "loss": 0.194, + "step": 8840 + }, + { + "epoch": 0.8991161231331911, + "grad_norm": 3.921875, + "learning_rate": 1.2706013617807822e-06, + "loss": 0.2109, + "step": 8850 + }, + { + "epoch": 0.9001320735548105, + "grad_norm": 5.03125, + "learning_rate": 1.2453526097987778e-06, + "loss": 0.151, + "step": 8860 + }, + { + "epoch": 0.9011480239764299, + "grad_norm": 5.96875, + "learning_rate": 1.2203508382523431e-06, + "loss": 0.1811, + "step": 8870 + }, + { + "epoch": 0.9021639743980494, + "grad_norm": 3.828125, + "learning_rate": 1.1955963070884534e-06, + "loss": 0.2004, + "step": 8880 + }, + { + "epoch": 0.9031799248196688, + "grad_norm": 1.9765625, + "learning_rate": 1.171089273683465e-06, + "loss": 0.1395, + "step": 8890 + }, + { + "epoch": 0.9041958752412882, + "grad_norm": 2.328125, + "learning_rate": 1.1468299928404868e-06, + "loss": 0.1915, + "step": 8900 + }, + { + "epoch": 0.9052118256629077, + "grad_norm": 1.265625, + "learning_rate": 1.1228187167866943e-06, + "loss": 0.1281, + "step": 8910 + }, + { + "epoch": 0.9062277760845271, + "grad_norm": 1.4375, + "learning_rate": 1.099055695170728e-06, + "loss": 0.1627, + "step": 8920 + }, + { + "epoch": 0.9072437265061465, + "grad_norm": 0.6953125, + "learning_rate": 1.0755411750600962e-06, + "loss": 0.1768, + "step": 8930 + }, + { + "epoch": 0.9082596769277659, + "grad_norm": 1.046875, + "learning_rate": 1.052275400938596e-06, + "loss": 0.1544, + "step": 8940 + }, + { + "epoch": 0.9092756273493854, + "grad_norm": 2.71875, + "learning_rate": 1.0292586147037764e-06, + "loss": 0.2498, + "step": 8950 + }, + { + "epoch": 0.9102915777710048, + "grad_norm": 3.0625, + "learning_rate": 1.0064910556644214e-06, + "loss": 0.1918, + "step": 8960 + }, + { + "epoch": 0.9113075281926242, + "grad_norm": 4.0, + "learning_rate": 9.839729605380766e-07, + "loss": 0.2388, + "step": 8970 + }, + { + "epoch": 0.9123234786142437, + "grad_norm": 3.765625, + "learning_rate": 9.61704563448565e-07, + "loss": 0.1944, + "step": 8980 + }, + { + "epoch": 0.9133394290358631, + "grad_norm": 2.90625, + "learning_rate": 9.396860959235671e-07, + "loss": 0.1667, + "step": 8990 + }, + { + "epoch": 0.9143553794574825, + "grad_norm": 2.4375, + "learning_rate": 9.179177868922085e-07, + "loss": 0.2143, + "step": 9000 + } + ], + "logging_steps": 10, + "max_steps": 9843, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoints/checkpoint-9000/training_args.bin b/checkpoints/checkpoint-9000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bb0a4ec10ab29ca1942aecdfa4212d352e373f1 --- /dev/null +++ b/checkpoints/checkpoint-9000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 +size 4984 diff --git a/checkpoints/checkpoint-9500/adapter_config.json b/checkpoints/checkpoint-9500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70 --- /dev/null +++ b/checkpoints/checkpoint-9500/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "gaussian", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoints/checkpoint-9500/adapter_model.safetensors b/checkpoints/checkpoint-9500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4873d7ac0dd93d300c5305928706f0491b7f09e2 --- /dev/null +++ b/checkpoints/checkpoint-9500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2b633ee47b5f3a143fc9c71b1dd0cfeb40cc097a762104327a42af91522c1dc +size 5919456 diff --git a/checkpoints/checkpoint-9500/optimizer.pt b/checkpoints/checkpoint-9500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8453ac00ba64242e454f9493f2b7f0058728a8c --- /dev/null +++ b/checkpoints/checkpoint-9500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ada548d4424bd56d11910556a417182e99459e8c5770ee2fd7a75b4cf595f8b5 +size 11930938 diff --git a/checkpoints/checkpoint-9500/rng_state_0.pth b/checkpoints/checkpoint-9500/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..a76a2679c5ec6f028f79881ea7b86547e50b925c --- /dev/null +++ b/checkpoints/checkpoint-9500/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdd3431862bb3caf99bbe632d5500d35b3a4001ba5f79dfb311597988ce7bc07 +size 15024 diff --git a/checkpoints/checkpoint-9500/rng_state_1.pth b/checkpoints/checkpoint-9500/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..d638e87f5f6cf02dbf4bd50290706a308924bd9f --- /dev/null +++ b/checkpoints/checkpoint-9500/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a49b137689bb9baeb88d62569411ec42b18c43387181a8e482c1188959e80ba8 +size 15024 diff --git a/checkpoints/checkpoint-9500/rng_state_2.pth b/checkpoints/checkpoint-9500/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..44200b445027eedae447f8c9abec1fa8bfd8922f --- /dev/null +++ b/checkpoints/checkpoint-9500/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f64f0a9a34ed397df06f2a99347433818ab578da206dd635ee4d819320c29732 +size 15024 diff --git a/checkpoints/checkpoint-9500/rng_state_3.pth b/checkpoints/checkpoint-9500/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..2dd77ff9ed7fd897ba735ad0f9af28473a4e29c2 --- /dev/null +++ b/checkpoints/checkpoint-9500/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f2446089864551b4466c0d305e103a1a414455ac1dd2cffffdee7d5b89d904a +size 15024 diff --git a/checkpoints/checkpoint-9500/scheduler.pt b/checkpoints/checkpoint-9500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..294d12a69b8fec0e3bf520b584bc6bb862f59d5d --- /dev/null +++ b/checkpoints/checkpoint-9500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c40a1ffb822b62d125ac636a29ddd80b085b0fed5f9760c5a078bde337791b1a +size 1064 diff --git a/checkpoints/checkpoint-9500/trainer_state.json b/checkpoints/checkpoint-9500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..92150ac85879e6f7310f59e251c296183fd37122 --- /dev/null +++ b/checkpoints/checkpoint-9500/trainer_state.json @@ -0,0 +1,6671 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9651529005384537, + "eval_steps": 500, + "global_step": 9500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001015950421619425, + "grad_norm": 17.625, + "learning_rate": 5e-06, + "loss": 3.4264, + "step": 10 + }, + { + "epoch": 0.00203190084323885, + "grad_norm": 12.5625, + "learning_rate": 1e-05, + "loss": 3.432, + "step": 20 + }, + { + "epoch": 0.003047851264858275, + "grad_norm": 14.0625, + "learning_rate": 1.5e-05, + "loss": 3.23, + "step": 30 + }, + { + "epoch": 0.0040638016864777, + "grad_norm": 12.4375, + "learning_rate": 2e-05, + "loss": 2.9762, + "step": 40 + }, + { + "epoch": 0.005079752108097125, + "grad_norm": 10.0625, + "learning_rate": 2.5e-05, + "loss": 2.6173, + "step": 50 + }, + { + "epoch": 0.00609570252971655, + "grad_norm": 10.1875, + "learning_rate": 3e-05, + "loss": 2.2004, + "step": 60 + }, + { + "epoch": 0.007111652951335975, + "grad_norm": 7.03125, + "learning_rate": 3.5e-05, + "loss": 1.4176, + "step": 70 + }, + { + "epoch": 0.0081276033729554, + "grad_norm": 4.375, + "learning_rate": 4e-05, + "loss": 1.0122, + "step": 80 + }, + { + "epoch": 0.009143553794574825, + "grad_norm": 6.5625, + "learning_rate": 4.5e-05, + "loss": 0.9116, + "step": 90 + }, + { + "epoch": 0.01015950421619425, + "grad_norm": 5.28125, + "learning_rate": 5e-05, + "loss": 0.6832, + "step": 100 + }, + { + "epoch": 0.011175454637813675, + "grad_norm": 5.5, + "learning_rate": 4.9999870035728426e-05, + "loss": 0.7355, + "step": 110 + }, + { + "epoch": 0.0121914050594331, + "grad_norm": 5.1875, + "learning_rate": 4.9999480144264944e-05, + "loss": 0.6673, + "step": 120 + }, + { + "epoch": 0.013207355481052525, + "grad_norm": 4.5, + "learning_rate": 4.9998830329663314e-05, + "loss": 0.6792, + "step": 130 + }, + { + "epoch": 0.01422330590267195, + "grad_norm": 3.9375, + "learning_rate": 4.9997920598679756e-05, + "loss": 0.6207, + "step": 140 + }, + { + "epoch": 0.015239256324291375, + "grad_norm": 3.15625, + "learning_rate": 4.999675096077286e-05, + "loss": 0.483, + "step": 150 + }, + { + "epoch": 0.0162552067459108, + "grad_norm": 5.28125, + "learning_rate": 4.999532142810354e-05, + "loss": 0.5319, + "step": 160 + }, + { + "epoch": 0.017271157167530225, + "grad_norm": 4.59375, + "learning_rate": 4.999363201553483e-05, + "loss": 0.6052, + "step": 170 + }, + { + "epoch": 0.01828710758914965, + "grad_norm": 5.03125, + "learning_rate": 4.9991682740631794e-05, + "loss": 0.4258, + "step": 180 + }, + { + "epoch": 0.019303058010769075, + "grad_norm": 3.859375, + "learning_rate": 4.998947362366133e-05, + "loss": 0.4309, + "step": 190 + }, + { + "epoch": 0.0203190084323885, + "grad_norm": 3.328125, + "learning_rate": 4.998700468759193e-05, + "loss": 0.3957, + "step": 200 + }, + { + "epoch": 0.021334958854007924, + "grad_norm": 4.9375, + "learning_rate": 4.9984275958093475e-05, + "loss": 0.4777, + "step": 210 + }, + { + "epoch": 0.02235090927562735, + "grad_norm": 4.78125, + "learning_rate": 4.998128746353695e-05, + "loss": 0.3549, + "step": 220 + }, + { + "epoch": 0.023366859697246774, + "grad_norm": 4.0625, + "learning_rate": 4.997803923499417e-05, + "loss": 0.4447, + "step": 230 + }, + { + "epoch": 0.0243828101188662, + "grad_norm": 6.375, + "learning_rate": 4.99745313062374e-05, + "loss": 0.3808, + "step": 240 + }, + { + "epoch": 0.025398760540485624, + "grad_norm": 3.59375, + "learning_rate": 4.99707637137391e-05, + "loss": 0.3827, + "step": 250 + }, + { + "epoch": 0.02641471096210505, + "grad_norm": 3.015625, + "learning_rate": 4.996673649667145e-05, + "loss": 0.3694, + "step": 260 + }, + { + "epoch": 0.027430661383724474, + "grad_norm": 2.296875, + "learning_rate": 4.9962449696906e-05, + "loss": 0.3586, + "step": 270 + }, + { + "epoch": 0.0284466118053439, + "grad_norm": 4.125, + "learning_rate": 4.9957903359013214e-05, + "loss": 0.3832, + "step": 280 + }, + { + "epoch": 0.029462562226963324, + "grad_norm": 3.296875, + "learning_rate": 4.995309753026201e-05, + "loss": 0.328, + "step": 290 + }, + { + "epoch": 0.03047851264858275, + "grad_norm": 4.5, + "learning_rate": 4.994803226061927e-05, + "loss": 0.3667, + "step": 300 + }, + { + "epoch": 0.03149446307020217, + "grad_norm": 4.3125, + "learning_rate": 4.994270760274933e-05, + "loss": 0.3811, + "step": 310 + }, + { + "epoch": 0.0325104134918216, + "grad_norm": 3.421875, + "learning_rate": 4.99371236120134e-05, + "loss": 0.3065, + "step": 320 + }, + { + "epoch": 0.03352636391344102, + "grad_norm": 4.6875, + "learning_rate": 4.993128034646902e-05, + "loss": 0.4177, + "step": 330 + }, + { + "epoch": 0.03454231433506045, + "grad_norm": 3.046875, + "learning_rate": 4.992517786686947e-05, + "loss": 0.33, + "step": 340 + }, + { + "epoch": 0.03555826475667987, + "grad_norm": 1.8828125, + "learning_rate": 4.9918816236663077e-05, + "loss": 0.3287, + "step": 350 + }, + { + "epoch": 0.0365742151782993, + "grad_norm": 3.8125, + "learning_rate": 4.991219552199262e-05, + "loss": 0.2934, + "step": 360 + }, + { + "epoch": 0.03759016559991872, + "grad_norm": 4.28125, + "learning_rate": 4.99053157916946e-05, + "loss": 0.3176, + "step": 370 + }, + { + "epoch": 0.03860611602153815, + "grad_norm": 2.609375, + "learning_rate": 4.989817711729856e-05, + "loss": 0.3318, + "step": 380 + }, + { + "epoch": 0.03962206644315757, + "grad_norm": 2.375, + "learning_rate": 4.98907795730263e-05, + "loss": 0.3234, + "step": 390 + }, + { + "epoch": 0.040638016864777, + "grad_norm": 4.46875, + "learning_rate": 4.988312323579114e-05, + "loss": 0.267, + "step": 400 + }, + { + "epoch": 0.04165396728639642, + "grad_norm": 3.75, + "learning_rate": 4.98752081851971e-05, + "loss": 0.3081, + "step": 410 + }, + { + "epoch": 0.04266991770801585, + "grad_norm": 2.203125, + "learning_rate": 4.986703450353809e-05, + "loss": 0.2917, + "step": 420 + }, + { + "epoch": 0.04368586812963527, + "grad_norm": 1.6015625, + "learning_rate": 4.985860227579703e-05, + "loss": 0.2805, + "step": 430 + }, + { + "epoch": 0.0447018185512547, + "grad_norm": 3.140625, + "learning_rate": 4.984991158964499e-05, + "loss": 0.3534, + "step": 440 + }, + { + "epoch": 0.04571776897287412, + "grad_norm": 3.296875, + "learning_rate": 4.9840962535440265e-05, + "loss": 0.335, + "step": 450 + }, + { + "epoch": 0.04673371939449355, + "grad_norm": 3.25, + "learning_rate": 4.983175520622744e-05, + "loss": 0.2544, + "step": 460 + }, + { + "epoch": 0.04774966981611297, + "grad_norm": 2.25, + "learning_rate": 4.982228969773642e-05, + "loss": 0.3449, + "step": 470 + }, + { + "epoch": 0.0487656202377324, + "grad_norm": 4.9375, + "learning_rate": 4.9812566108381435e-05, + "loss": 0.2964, + "step": 480 + }, + { + "epoch": 0.04978157065935182, + "grad_norm": 1.5703125, + "learning_rate": 4.9802584539260035e-05, + "loss": 0.2799, + "step": 490 + }, + { + "epoch": 0.05079752108097125, + "grad_norm": 2.828125, + "learning_rate": 4.979234509415199e-05, + "loss": 0.3231, + "step": 500 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 2.9375, + "learning_rate": 4.978184787951828e-05, + "loss": 0.2943, + "step": 510 + }, + { + "epoch": 0.0528294219242101, + "grad_norm": 2.34375, + "learning_rate": 4.977109300449992e-05, + "loss": 0.2705, + "step": 520 + }, + { + "epoch": 0.05384537234582952, + "grad_norm": 3.140625, + "learning_rate": 4.9760080580916876e-05, + "loss": 0.2998, + "step": 530 + }, + { + "epoch": 0.05486132276744895, + "grad_norm": 3.5625, + "learning_rate": 4.974881072326688e-05, + "loss": 0.2595, + "step": 540 + }, + { + "epoch": 0.05587727318906837, + "grad_norm": 4.25, + "learning_rate": 4.9737283548724236e-05, + "loss": 0.2803, + "step": 550 + }, + { + "epoch": 0.0568932236106878, + "grad_norm": 4.0625, + "learning_rate": 4.97254991771386e-05, + "loss": 0.3511, + "step": 560 + }, + { + "epoch": 0.05790917403230722, + "grad_norm": 2.515625, + "learning_rate": 4.971345773103377e-05, + "loss": 0.312, + "step": 570 + }, + { + "epoch": 0.05892512445392665, + "grad_norm": 3.21875, + "learning_rate": 4.9701159335606365e-05, + "loss": 0.2482, + "step": 580 + }, + { + "epoch": 0.05994107487554607, + "grad_norm": 5.5, + "learning_rate": 4.968860411872454e-05, + "loss": 0.2537, + "step": 590 + }, + { + "epoch": 0.0609570252971655, + "grad_norm": 3.546875, + "learning_rate": 4.967579221092666e-05, + "loss": 0.3125, + "step": 600 + }, + { + "epoch": 0.06197297571878492, + "grad_norm": 2.984375, + "learning_rate": 4.966272374541996e-05, + "loss": 0.2354, + "step": 610 + }, + { + "epoch": 0.06298892614040434, + "grad_norm": 3.6875, + "learning_rate": 4.964939885807912e-05, + "loss": 0.3213, + "step": 620 + }, + { + "epoch": 0.06400487656202378, + "grad_norm": 2.140625, + "learning_rate": 4.9635817687444876e-05, + "loss": 0.3003, + "step": 630 + }, + { + "epoch": 0.0650208269836432, + "grad_norm": 3.484375, + "learning_rate": 4.962198037472259e-05, + "loss": 0.2996, + "step": 640 + }, + { + "epoch": 0.06603677740526262, + "grad_norm": 3.21875, + "learning_rate": 4.9607887063780776e-05, + "loss": 0.2257, + "step": 650 + }, + { + "epoch": 0.06705272782688204, + "grad_norm": 5.375, + "learning_rate": 4.9593537901149564e-05, + "loss": 0.223, + "step": 660 + }, + { + "epoch": 0.06806867824850148, + "grad_norm": 4.1875, + "learning_rate": 4.957893303601924e-05, + "loss": 0.3407, + "step": 670 + }, + { + "epoch": 0.0690846286701209, + "grad_norm": 3.328125, + "learning_rate": 4.956407262023866e-05, + "loss": 0.2589, + "step": 680 + }, + { + "epoch": 0.07010057909174032, + "grad_norm": 2.953125, + "learning_rate": 4.954895680831367e-05, + "loss": 0.2949, + "step": 690 + }, + { + "epoch": 0.07111652951335974, + "grad_norm": 4.0625, + "learning_rate": 4.9533585757405506e-05, + "loss": 0.2995, + "step": 700 + }, + { + "epoch": 0.07213247993497918, + "grad_norm": 4.625, + "learning_rate": 4.951795962732917e-05, + "loss": 0.2894, + "step": 710 + }, + { + "epoch": 0.0731484303565986, + "grad_norm": 3.0, + "learning_rate": 4.9502078580551755e-05, + "loss": 0.3082, + "step": 720 + }, + { + "epoch": 0.07416438077821802, + "grad_norm": 3.65625, + "learning_rate": 4.9485942782190734e-05, + "loss": 0.2308, + "step": 730 + }, + { + "epoch": 0.07518033119983744, + "grad_norm": 4.78125, + "learning_rate": 4.9469552400012306e-05, + "loss": 0.2272, + "step": 740 + }, + { + "epoch": 0.07619628162145688, + "grad_norm": 4.25, + "learning_rate": 4.94529076044296e-05, + "loss": 0.2701, + "step": 750 + }, + { + "epoch": 0.0772122320430763, + "grad_norm": 3.140625, + "learning_rate": 4.94360085685009e-05, + "loss": 0.2686, + "step": 760 + }, + { + "epoch": 0.07822818246469572, + "grad_norm": 0.765625, + "learning_rate": 4.9418855467927894e-05, + "loss": 0.2051, + "step": 770 + }, + { + "epoch": 0.07924413288631514, + "grad_norm": 1.796875, + "learning_rate": 4.940144848105379e-05, + "loss": 0.2267, + "step": 780 + }, + { + "epoch": 0.08026008330793458, + "grad_norm": 4.5625, + "learning_rate": 4.93837877888615e-05, + "loss": 0.2597, + "step": 790 + }, + { + "epoch": 0.081276033729554, + "grad_norm": 3.03125, + "learning_rate": 4.9365873574971745e-05, + "loss": 0.3701, + "step": 800 + }, + { + "epoch": 0.08229198415117342, + "grad_norm": 4.5625, + "learning_rate": 4.9347706025641136e-05, + "loss": 0.2559, + "step": 810 + }, + { + "epoch": 0.08330793457279284, + "grad_norm": 3.90625, + "learning_rate": 4.9329285329760275e-05, + "loss": 0.2799, + "step": 820 + }, + { + "epoch": 0.08432388499441228, + "grad_norm": 3.140625, + "learning_rate": 4.9310611678851735e-05, + "loss": 0.2866, + "step": 830 + }, + { + "epoch": 0.0853398354160317, + "grad_norm": 2.46875, + "learning_rate": 4.929168526706811e-05, + "loss": 0.3105, + "step": 840 + }, + { + "epoch": 0.08635578583765112, + "grad_norm": 13.625, + "learning_rate": 4.927250629119e-05, + "loss": 0.2454, + "step": 850 + }, + { + "epoch": 0.08737173625927054, + "grad_norm": 3.921875, + "learning_rate": 4.9253074950623925e-05, + "loss": 0.2424, + "step": 860 + }, + { + "epoch": 0.08838768668088998, + "grad_norm": 2.90625, + "learning_rate": 4.9233391447400286e-05, + "loss": 0.2481, + "step": 870 + }, + { + "epoch": 0.0894036371025094, + "grad_norm": 2.96875, + "learning_rate": 4.921345598617125e-05, + "loss": 0.2231, + "step": 880 + }, + { + "epoch": 0.09041958752412882, + "grad_norm": 5.375, + "learning_rate": 4.9193268774208654e-05, + "loss": 0.3447, + "step": 890 + }, + { + "epoch": 0.09143553794574824, + "grad_norm": 2.0, + "learning_rate": 4.9172830021401785e-05, + "loss": 0.229, + "step": 900 + }, + { + "epoch": 0.09245148836736768, + "grad_norm": 3.1875, + "learning_rate": 4.9152139940255245e-05, + "loss": 0.2122, + "step": 910 + }, + { + "epoch": 0.0934674387889871, + "grad_norm": 3.40625, + "learning_rate": 4.913119874588677e-05, + "loss": 0.2386, + "step": 920 + }, + { + "epoch": 0.09448338921060652, + "grad_norm": 1.4609375, + "learning_rate": 4.911000665602489e-05, + "loss": 0.1944, + "step": 930 + }, + { + "epoch": 0.09549933963222594, + "grad_norm": 5.0625, + "learning_rate": 4.9088563891006786e-05, + "loss": 0.2038, + "step": 940 + }, + { + "epoch": 0.09651529005384538, + "grad_norm": 4.53125, + "learning_rate": 4.906687067377592e-05, + "loss": 0.3122, + "step": 950 + }, + { + "epoch": 0.0975312404754648, + "grad_norm": 2.84375, + "learning_rate": 4.904492722987976e-05, + "loss": 0.3157, + "step": 960 + }, + { + "epoch": 0.09854719089708422, + "grad_norm": 2.171875, + "learning_rate": 4.902273378746738e-05, + "loss": 0.3077, + "step": 970 + }, + { + "epoch": 0.09956314131870364, + "grad_norm": 2.84375, + "learning_rate": 4.9000290577287165e-05, + "loss": 0.2756, + "step": 980 + }, + { + "epoch": 0.10057909174032308, + "grad_norm": 0.99609375, + "learning_rate": 4.897759783268434e-05, + "loss": 0.2915, + "step": 990 + }, + { + "epoch": 0.1015950421619425, + "grad_norm": 3.53125, + "learning_rate": 4.895465578959859e-05, + "loss": 0.2052, + "step": 1000 + }, + { + "epoch": 0.10261099258356192, + "grad_norm": 4.0, + "learning_rate": 4.893146468656159e-05, + "loss": 0.2499, + "step": 1010 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 1.65625, + "learning_rate": 4.890802476469452e-05, + "loss": 0.278, + "step": 1020 + }, + { + "epoch": 0.10464289342680078, + "grad_norm": 3.625, + "learning_rate": 4.888433626770558e-05, + "loss": 0.2143, + "step": 1030 + }, + { + "epoch": 0.1056588438484202, + "grad_norm": 5.0625, + "learning_rate": 4.886039944188741e-05, + "loss": 0.2878, + "step": 1040 + }, + { + "epoch": 0.10667479427003962, + "grad_norm": 4.5, + "learning_rate": 4.883621453611461e-05, + "loss": 0.2744, + "step": 1050 + }, + { + "epoch": 0.10769074469165904, + "grad_norm": 4.5625, + "learning_rate": 4.881178180184106e-05, + "loss": 0.2734, + "step": 1060 + }, + { + "epoch": 0.10870669511327848, + "grad_norm": 3.125, + "learning_rate": 4.878710149309735e-05, + "loss": 0.3574, + "step": 1070 + }, + { + "epoch": 0.1097226455348979, + "grad_norm": 3.0625, + "learning_rate": 4.876217386648816e-05, + "loss": 0.2625, + "step": 1080 + }, + { + "epoch": 0.11073859595651732, + "grad_norm": 4.0625, + "learning_rate": 4.873699918118955e-05, + "loss": 0.2437, + "step": 1090 + }, + { + "epoch": 0.11175454637813674, + "grad_norm": 1.59375, + "learning_rate": 4.87115776989463e-05, + "loss": 0.2051, + "step": 1100 + }, + { + "epoch": 0.11277049679975618, + "grad_norm": 4.375, + "learning_rate": 4.8685909684069153e-05, + "loss": 0.1727, + "step": 1110 + }, + { + "epoch": 0.1137864472213756, + "grad_norm": 2.28125, + "learning_rate": 4.865999540343211e-05, + "loss": 0.2256, + "step": 1120 + }, + { + "epoch": 0.11480239764299502, + "grad_norm": 2.265625, + "learning_rate": 4.86338351264696e-05, + "loss": 0.3529, + "step": 1130 + }, + { + "epoch": 0.11581834806461444, + "grad_norm": 2.34375, + "learning_rate": 4.8607429125173754e-05, + "loss": 0.2113, + "step": 1140 + }, + { + "epoch": 0.11683429848623388, + "grad_norm": 0.7578125, + "learning_rate": 4.858077767409149e-05, + "loss": 0.2759, + "step": 1150 + }, + { + "epoch": 0.1178502489078533, + "grad_norm": 3.640625, + "learning_rate": 4.855388105032174e-05, + "loss": 0.2482, + "step": 1160 + }, + { + "epoch": 0.11886619932947272, + "grad_norm": 3.5, + "learning_rate": 4.852673953351249e-05, + "loss": 0.1865, + "step": 1170 + }, + { + "epoch": 0.11988214975109214, + "grad_norm": 3.75, + "learning_rate": 4.849935340585796e-05, + "loss": 0.2659, + "step": 1180 + }, + { + "epoch": 0.12089810017271158, + "grad_norm": 3.375, + "learning_rate": 4.8471722952095586e-05, + "loss": 0.1506, + "step": 1190 + }, + { + "epoch": 0.121914050594331, + "grad_norm": 3.34375, + "learning_rate": 4.844384845950312e-05, + "loss": 0.307, + "step": 1200 + }, + { + "epoch": 0.12293000101595042, + "grad_norm": 1.578125, + "learning_rate": 4.841573021789561e-05, + "loss": 0.1952, + "step": 1210 + }, + { + "epoch": 0.12394595143756984, + "grad_norm": 1.2890625, + "learning_rate": 4.838736851962239e-05, + "loss": 0.1779, + "step": 1220 + }, + { + "epoch": 0.12496190185918928, + "grad_norm": 1.265625, + "learning_rate": 4.835876365956408e-05, + "loss": 0.1235, + "step": 1230 + }, + { + "epoch": 0.12597785228080868, + "grad_norm": 1.9609375, + "learning_rate": 4.8329915935129436e-05, + "loss": 0.1876, + "step": 1240 + }, + { + "epoch": 0.12699380270242813, + "grad_norm": 1.6328125, + "learning_rate": 4.830082564625235e-05, + "loss": 0.2188, + "step": 1250 + }, + { + "epoch": 0.12800975312404755, + "grad_norm": 3.96875, + "learning_rate": 4.8271493095388684e-05, + "loss": 0.2622, + "step": 1260 + }, + { + "epoch": 0.12902570354566698, + "grad_norm": 3.765625, + "learning_rate": 4.824191858751312e-05, + "loss": 0.2724, + "step": 1270 + }, + { + "epoch": 0.1300416539672864, + "grad_norm": 5.59375, + "learning_rate": 4.821210243011601e-05, + "loss": 0.2413, + "step": 1280 + }, + { + "epoch": 0.13105760438890582, + "grad_norm": 3.34375, + "learning_rate": 4.818204493320016e-05, + "loss": 0.2618, + "step": 1290 + }, + { + "epoch": 0.13207355481052524, + "grad_norm": 2.78125, + "learning_rate": 4.8151746409277634e-05, + "loss": 0.2295, + "step": 1300 + }, + { + "epoch": 0.13308950523214466, + "grad_norm": 3.1875, + "learning_rate": 4.8121207173366484e-05, + "loss": 0.2733, + "step": 1310 + }, + { + "epoch": 0.13410545565376408, + "grad_norm": 2.28125, + "learning_rate": 4.809042754298746e-05, + "loss": 0.2311, + "step": 1320 + }, + { + "epoch": 0.13512140607538353, + "grad_norm": 2.171875, + "learning_rate": 4.805940783816075e-05, + "loss": 0.2059, + "step": 1330 + }, + { + "epoch": 0.13613735649700295, + "grad_norm": 2.796875, + "learning_rate": 4.8028148381402625e-05, + "loss": 0.2102, + "step": 1340 + }, + { + "epoch": 0.13715330691862238, + "grad_norm": 2.96875, + "learning_rate": 4.7996649497722084e-05, + "loss": 0.2708, + "step": 1350 + }, + { + "epoch": 0.1381692573402418, + "grad_norm": 2.4375, + "learning_rate": 4.7964911514617485e-05, + "loss": 0.2429, + "step": 1360 + }, + { + "epoch": 0.13918520776186122, + "grad_norm": 5.8125, + "learning_rate": 4.793293476207312e-05, + "loss": 0.2725, + "step": 1370 + }, + { + "epoch": 0.14020115818348064, + "grad_norm": 2.40625, + "learning_rate": 4.790071957255585e-05, + "loss": 0.2098, + "step": 1380 + }, + { + "epoch": 0.14121710860510006, + "grad_norm": 4.25, + "learning_rate": 4.786826628101154e-05, + "loss": 0.2101, + "step": 1390 + }, + { + "epoch": 0.14223305902671948, + "grad_norm": 2.578125, + "learning_rate": 4.783557522486167e-05, + "loss": 0.2624, + "step": 1400 + }, + { + "epoch": 0.14324900944833893, + "grad_norm": 3.125, + "learning_rate": 4.780264674399978e-05, + "loss": 0.2518, + "step": 1410 + }, + { + "epoch": 0.14426495986995835, + "grad_norm": 3.671875, + "learning_rate": 4.7769481180787966e-05, + "loss": 0.3112, + "step": 1420 + }, + { + "epoch": 0.14528091029157778, + "grad_norm": 3.984375, + "learning_rate": 4.773607888005327e-05, + "loss": 0.2747, + "step": 1430 + }, + { + "epoch": 0.1462968607131972, + "grad_norm": 3.234375, + "learning_rate": 4.770244018908416e-05, + "loss": 0.1572, + "step": 1440 + }, + { + "epoch": 0.14731281113481662, + "grad_norm": 4.09375, + "learning_rate": 4.766856545762687e-05, + "loss": 0.2148, + "step": 1450 + }, + { + "epoch": 0.14832876155643604, + "grad_norm": 1.6875, + "learning_rate": 4.763445503788178e-05, + "loss": 0.2531, + "step": 1460 + }, + { + "epoch": 0.14934471197805546, + "grad_norm": 2.375, + "learning_rate": 4.760010928449976e-05, + "loss": 0.199, + "step": 1470 + }, + { + "epoch": 0.15036066239967488, + "grad_norm": 4.6875, + "learning_rate": 4.7565528554578485e-05, + "loss": 0.2366, + "step": 1480 + }, + { + "epoch": 0.15137661282129433, + "grad_norm": 5.4375, + "learning_rate": 4.75307132076587e-05, + "loss": 0.1862, + "step": 1490 + }, + { + "epoch": 0.15239256324291375, + "grad_norm": 2.484375, + "learning_rate": 4.749566360572049e-05, + "loss": 0.2143, + "step": 1500 + }, + { + "epoch": 0.15340851366453317, + "grad_norm": 2.1875, + "learning_rate": 4.746038011317955e-05, + "loss": 0.1877, + "step": 1510 + }, + { + "epoch": 0.1544244640861526, + "grad_norm": 2.84375, + "learning_rate": 4.742486309688333e-05, + "loss": 0.2831, + "step": 1520 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 2.015625, + "learning_rate": 4.738911292610732e-05, + "loss": 0.1708, + "step": 1530 + }, + { + "epoch": 0.15645636492939144, + "grad_norm": 3.953125, + "learning_rate": 4.735312997255107e-05, + "loss": 0.192, + "step": 1540 + }, + { + "epoch": 0.15747231535101086, + "grad_norm": 2.09375, + "learning_rate": 4.7316914610334475e-05, + "loss": 0.2586, + "step": 1550 + }, + { + "epoch": 0.15848826577263028, + "grad_norm": 3.6875, + "learning_rate": 4.728046721599378e-05, + "loss": 0.2141, + "step": 1560 + }, + { + "epoch": 0.15950421619424973, + "grad_norm": 2.9375, + "learning_rate": 4.724378816847771e-05, + "loss": 0.193, + "step": 1570 + }, + { + "epoch": 0.16052016661586915, + "grad_norm": 1.5625, + "learning_rate": 4.720687784914352e-05, + "loss": 0.191, + "step": 1580 + }, + { + "epoch": 0.16153611703748857, + "grad_norm": 3.75, + "learning_rate": 4.716973664175304e-05, + "loss": 0.2172, + "step": 1590 + }, + { + "epoch": 0.162552067459108, + "grad_norm": 3.125, + "learning_rate": 4.7132364932468645e-05, + "loss": 0.2134, + "step": 1600 + }, + { + "epoch": 0.16356801788072742, + "grad_norm": 4.09375, + "learning_rate": 4.709476310984932e-05, + "loss": 0.2055, + "step": 1610 + }, + { + "epoch": 0.16458396830234684, + "grad_norm": 3.875, + "learning_rate": 4.705693156484652e-05, + "loss": 0.2136, + "step": 1620 + }, + { + "epoch": 0.16559991872396626, + "grad_norm": 1.1796875, + "learning_rate": 4.7018870690800196e-05, + "loss": 0.1471, + "step": 1630 + }, + { + "epoch": 0.16661586914558568, + "grad_norm": 2.5, + "learning_rate": 4.698058088343465e-05, + "loss": 0.2308, + "step": 1640 + }, + { + "epoch": 0.16763181956720513, + "grad_norm": 1.390625, + "learning_rate": 4.6942062540854425e-05, + "loss": 0.2456, + "step": 1650 + }, + { + "epoch": 0.16864776998882455, + "grad_norm": 3.125, + "learning_rate": 4.69033160635402e-05, + "loss": 0.2654, + "step": 1660 + }, + { + "epoch": 0.16966372041044397, + "grad_norm": 3.984375, + "learning_rate": 4.6864341854344587e-05, + "loss": 0.2226, + "step": 1670 + }, + { + "epoch": 0.1706796708320634, + "grad_norm": 2.328125, + "learning_rate": 4.682514031848795e-05, + "loss": 0.2438, + "step": 1680 + }, + { + "epoch": 0.17169562125368282, + "grad_norm": 3.078125, + "learning_rate": 4.678571186355423e-05, + "loss": 0.1889, + "step": 1690 + }, + { + "epoch": 0.17271157167530224, + "grad_norm": 3.328125, + "learning_rate": 4.6746056899486644e-05, + "loss": 0.2117, + "step": 1700 + }, + { + "epoch": 0.17372752209692166, + "grad_norm": 2.78125, + "learning_rate": 4.67061758385835e-05, + "loss": 0.1953, + "step": 1710 + }, + { + "epoch": 0.17474347251854108, + "grad_norm": 3.09375, + "learning_rate": 4.6666069095493816e-05, + "loss": 0.1844, + "step": 1720 + }, + { + "epoch": 0.17575942294016053, + "grad_norm": 3.234375, + "learning_rate": 4.662573708721309e-05, + "loss": 0.2774, + "step": 1730 + }, + { + "epoch": 0.17677537336177995, + "grad_norm": 4.03125, + "learning_rate": 4.658518023307894e-05, + "loss": 0.2527, + "step": 1740 + }, + { + "epoch": 0.17779132378339937, + "grad_norm": 3.21875, + "learning_rate": 4.654439895476671e-05, + "loss": 0.2164, + "step": 1750 + }, + { + "epoch": 0.1788072742050188, + "grad_norm": 2.390625, + "learning_rate": 4.6503393676285146e-05, + "loss": 0.2424, + "step": 1760 + }, + { + "epoch": 0.17982322462663822, + "grad_norm": 1.8359375, + "learning_rate": 4.646216482397192e-05, + "loss": 0.2428, + "step": 1770 + }, + { + "epoch": 0.18083917504825764, + "grad_norm": 2.796875, + "learning_rate": 4.6420712826489275e-05, + "loss": 0.2155, + "step": 1780 + }, + { + "epoch": 0.18185512546987706, + "grad_norm": 0.69921875, + "learning_rate": 4.6379038114819485e-05, + "loss": 0.1544, + "step": 1790 + }, + { + "epoch": 0.18287107589149648, + "grad_norm": 3.40625, + "learning_rate": 4.6337141122260444e-05, + "loss": 0.2029, + "step": 1800 + }, + { + "epoch": 0.18388702631311593, + "grad_norm": 2.359375, + "learning_rate": 4.629502228442112e-05, + "loss": 0.1489, + "step": 1810 + }, + { + "epoch": 0.18490297673473535, + "grad_norm": 1.4453125, + "learning_rate": 4.6252682039217045e-05, + "loss": 0.2101, + "step": 1820 + }, + { + "epoch": 0.18591892715635477, + "grad_norm": 2.71875, + "learning_rate": 4.621012082686573e-05, + "loss": 0.2076, + "step": 1830 + }, + { + "epoch": 0.1869348775779742, + "grad_norm": 3.0625, + "learning_rate": 4.616733908988216e-05, + "loss": 0.2719, + "step": 1840 + }, + { + "epoch": 0.18795082799959362, + "grad_norm": 1.953125, + "learning_rate": 4.612433727307409e-05, + "loss": 0.2105, + "step": 1850 + }, + { + "epoch": 0.18896677842121304, + "grad_norm": 3.46875, + "learning_rate": 4.608111582353751e-05, + "loss": 0.1877, + "step": 1860 + }, + { + "epoch": 0.18998272884283246, + "grad_norm": 2.546875, + "learning_rate": 4.603767519065197e-05, + "loss": 0.2238, + "step": 1870 + }, + { + "epoch": 0.19099867926445188, + "grad_norm": 1.5703125, + "learning_rate": 4.599401582607589e-05, + "loss": 0.243, + "step": 1880 + }, + { + "epoch": 0.19201462968607133, + "grad_norm": 2.4375, + "learning_rate": 4.595013818374185e-05, + "loss": 0.1867, + "step": 1890 + }, + { + "epoch": 0.19303058010769075, + "grad_norm": 2.203125, + "learning_rate": 4.5906042719851925e-05, + "loss": 0.1994, + "step": 1900 + }, + { + "epoch": 0.19404653052931017, + "grad_norm": 3.984375, + "learning_rate": 4.586172989287291e-05, + "loss": 0.1899, + "step": 1910 + }, + { + "epoch": 0.1950624809509296, + "grad_norm": 2.6875, + "learning_rate": 4.5817200163531534e-05, + "loss": 0.2528, + "step": 1920 + }, + { + "epoch": 0.19607843137254902, + "grad_norm": 2.71875, + "learning_rate": 4.577245399480972e-05, + "loss": 0.2336, + "step": 1930 + }, + { + "epoch": 0.19709438179416844, + "grad_norm": 2.640625, + "learning_rate": 4.5727491851939715e-05, + "loss": 0.2204, + "step": 1940 + }, + { + "epoch": 0.19811033221578786, + "grad_norm": 1.78125, + "learning_rate": 4.568231420239929e-05, + "loss": 0.1656, + "step": 1950 + }, + { + "epoch": 0.19912628263740728, + "grad_norm": 3.15625, + "learning_rate": 4.563692151590687e-05, + "loss": 0.2105, + "step": 1960 + }, + { + "epoch": 0.20014223305902673, + "grad_norm": 1.3671875, + "learning_rate": 4.5591314264416666e-05, + "loss": 0.1464, + "step": 1970 + }, + { + "epoch": 0.20115818348064615, + "grad_norm": 4.25, + "learning_rate": 4.554549292211371e-05, + "loss": 0.2103, + "step": 1980 + }, + { + "epoch": 0.20217413390226557, + "grad_norm": 2.984375, + "learning_rate": 4.549945796540901e-05, + "loss": 0.144, + "step": 1990 + }, + { + "epoch": 0.203190084323885, + "grad_norm": 1.859375, + "learning_rate": 4.545320987293453e-05, + "loss": 0.1963, + "step": 2000 + }, + { + "epoch": 0.20420603474550442, + "grad_norm": 1.078125, + "learning_rate": 4.540674912553824e-05, + "loss": 0.2115, + "step": 2010 + }, + { + "epoch": 0.20522198516712384, + "grad_norm": 4.25, + "learning_rate": 4.536007620627911e-05, + "loss": 0.1682, + "step": 2020 + }, + { + "epoch": 0.20623793558874326, + "grad_norm": 2.71875, + "learning_rate": 4.531319160042212e-05, + "loss": 0.1992, + "step": 2030 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 1.2890625, + "learning_rate": 4.5266095795433126e-05, + "loss": 0.1134, + "step": 2040 + }, + { + "epoch": 0.20826983643198213, + "grad_norm": 3.296875, + "learning_rate": 4.5218789280973925e-05, + "loss": 0.1474, + "step": 2050 + }, + { + "epoch": 0.20928578685360155, + "grad_norm": 1.9375, + "learning_rate": 4.5171272548897024e-05, + "loss": 0.1955, + "step": 2060 + }, + { + "epoch": 0.21030173727522097, + "grad_norm": 2.734375, + "learning_rate": 4.512354609324063e-05, + "loss": 0.2042, + "step": 2070 + }, + { + "epoch": 0.2113176876968404, + "grad_norm": 2.921875, + "learning_rate": 4.507561041022347e-05, + "loss": 0.2174, + "step": 2080 + }, + { + "epoch": 0.21233363811845982, + "grad_norm": 2.40625, + "learning_rate": 4.502746599823963e-05, + "loss": 0.2634, + "step": 2090 + }, + { + "epoch": 0.21334958854007924, + "grad_norm": 1.71875, + "learning_rate": 4.497911335785339e-05, + "loss": 0.1884, + "step": 2100 + }, + { + "epoch": 0.21436553896169866, + "grad_norm": 0.79296875, + "learning_rate": 4.4930552991794e-05, + "loss": 0.1872, + "step": 2110 + }, + { + "epoch": 0.21538148938331808, + "grad_norm": 3.171875, + "learning_rate": 4.4881785404950474e-05, + "loss": 0.2233, + "step": 2120 + }, + { + "epoch": 0.21639743980493753, + "grad_norm": 2.59375, + "learning_rate": 4.483281110436631e-05, + "loss": 0.2374, + "step": 2130 + }, + { + "epoch": 0.21741339022655695, + "grad_norm": 3.328125, + "learning_rate": 4.478363059923426e-05, + "loss": 0.2545, + "step": 2140 + }, + { + "epoch": 0.21842934064817637, + "grad_norm": 2.3125, + "learning_rate": 4.4734244400891014e-05, + "loss": 0.2063, + "step": 2150 + }, + { + "epoch": 0.2194452910697958, + "grad_norm": 3.40625, + "learning_rate": 4.4684653022811865e-05, + "loss": 0.1219, + "step": 2160 + }, + { + "epoch": 0.22046124149141522, + "grad_norm": 4.1875, + "learning_rate": 4.463485698060541e-05, + "loss": 0.2805, + "step": 2170 + }, + { + "epoch": 0.22147719191303464, + "grad_norm": 2.3125, + "learning_rate": 4.458485679200814e-05, + "loss": 0.1998, + "step": 2180 + }, + { + "epoch": 0.22249314233465406, + "grad_norm": 3.578125, + "learning_rate": 4.453465297687912e-05, + "loss": 0.2489, + "step": 2190 + }, + { + "epoch": 0.22350909275627348, + "grad_norm": 2.59375, + "learning_rate": 4.448424605719452e-05, + "loss": 0.2731, + "step": 2200 + }, + { + "epoch": 0.22452504317789293, + "grad_norm": 3.28125, + "learning_rate": 4.443363655704224e-05, + "loss": 0.2425, + "step": 2210 + }, + { + "epoch": 0.22554099359951235, + "grad_norm": 2.78125, + "learning_rate": 4.438282500261641e-05, + "loss": 0.2938, + "step": 2220 + }, + { + "epoch": 0.22655694402113177, + "grad_norm": 1.1953125, + "learning_rate": 4.433181192221197e-05, + "loss": 0.1728, + "step": 2230 + }, + { + "epoch": 0.2275728944427512, + "grad_norm": 1.34375, + "learning_rate": 4.4280597846219155e-05, + "loss": 0.216, + "step": 2240 + }, + { + "epoch": 0.22858884486437062, + "grad_norm": 1.8515625, + "learning_rate": 4.422918330711796e-05, + "loss": 0.1612, + "step": 2250 + }, + { + "epoch": 0.22960479528599004, + "grad_norm": 1.90625, + "learning_rate": 4.417756883947263e-05, + "loss": 0.107, + "step": 2260 + }, + { + "epoch": 0.23062074570760946, + "grad_norm": 3.375, + "learning_rate": 4.412575497992611e-05, + "loss": 0.1756, + "step": 2270 + }, + { + "epoch": 0.23163669612922888, + "grad_norm": 4.375, + "learning_rate": 4.407374226719445e-05, + "loss": 0.234, + "step": 2280 + }, + { + "epoch": 0.23265264655084833, + "grad_norm": 3.25, + "learning_rate": 4.402153124206119e-05, + "loss": 0.2144, + "step": 2290 + }, + { + "epoch": 0.23366859697246775, + "grad_norm": 1.703125, + "learning_rate": 4.396912244737173e-05, + "loss": 0.1696, + "step": 2300 + }, + { + "epoch": 0.23468454739408717, + "grad_norm": 2.84375, + "learning_rate": 4.391651642802778e-05, + "loss": 0.2506, + "step": 2310 + }, + { + "epoch": 0.2357004978157066, + "grad_norm": 4.5, + "learning_rate": 4.386371373098155e-05, + "loss": 0.1686, + "step": 2320 + }, + { + "epoch": 0.23671644823732602, + "grad_norm": 2.515625, + "learning_rate": 4.381071490523018e-05, + "loss": 0.2403, + "step": 2330 + }, + { + "epoch": 0.23773239865894544, + "grad_norm": 4.4375, + "learning_rate": 4.3757520501809955e-05, + "loss": 0.1611, + "step": 2340 + }, + { + "epoch": 0.23874834908056486, + "grad_norm": 1.609375, + "learning_rate": 4.370413107379065e-05, + "loss": 0.1698, + "step": 2350 + }, + { + "epoch": 0.23976429950218428, + "grad_norm": 4.96875, + "learning_rate": 4.36505471762697e-05, + "loss": 0.1928, + "step": 2360 + }, + { + "epoch": 0.24078024992380373, + "grad_norm": 0.8984375, + "learning_rate": 4.3596769366366474e-05, + "loss": 0.2035, + "step": 2370 + }, + { + "epoch": 0.24179620034542315, + "grad_norm": 5.75, + "learning_rate": 4.354279820321649e-05, + "loss": 0.16, + "step": 2380 + }, + { + "epoch": 0.24281215076704257, + "grad_norm": 1.9453125, + "learning_rate": 4.34886342479656e-05, + "loss": 0.1851, + "step": 2390 + }, + { + "epoch": 0.243828101188662, + "grad_norm": 1.015625, + "learning_rate": 4.34342780637641e-05, + "loss": 0.1726, + "step": 2400 + }, + { + "epoch": 0.24484405161028142, + "grad_norm": 4.59375, + "learning_rate": 4.337973021576095e-05, + "loss": 0.2847, + "step": 2410 + }, + { + "epoch": 0.24586000203190084, + "grad_norm": 1.03125, + "learning_rate": 4.3324991271097846e-05, + "loss": 0.2528, + "step": 2420 + }, + { + "epoch": 0.24687595245352026, + "grad_norm": 2.1875, + "learning_rate": 4.3270061798903374e-05, + "loss": 0.1573, + "step": 2430 + }, + { + "epoch": 0.24789190287513968, + "grad_norm": 0.98046875, + "learning_rate": 4.321494237028701e-05, + "loss": 0.1703, + "step": 2440 + }, + { + "epoch": 0.24890785329675913, + "grad_norm": 3.8125, + "learning_rate": 4.31596335583333e-05, + "loss": 0.2613, + "step": 2450 + }, + { + "epoch": 0.24992380371837855, + "grad_norm": 4.0625, + "learning_rate": 4.310413593809579e-05, + "loss": 0.22, + "step": 2460 + }, + { + "epoch": 0.250939754139998, + "grad_norm": 3.15625, + "learning_rate": 4.304845008659108e-05, + "loss": 0.1263, + "step": 2470 + }, + { + "epoch": 0.25195570456161737, + "grad_norm": 3.046875, + "learning_rate": 4.2992576582792895e-05, + "loss": 0.1639, + "step": 2480 + }, + { + "epoch": 0.2529716549832368, + "grad_norm": 9.8125, + "learning_rate": 4.293651600762595e-05, + "loss": 0.2681, + "step": 2490 + }, + { + "epoch": 0.25398760540485626, + "grad_norm": 3.734375, + "learning_rate": 4.288026894395999e-05, + "loss": 0.2292, + "step": 2500 + }, + { + "epoch": 0.25500355582647566, + "grad_norm": 0.455078125, + "learning_rate": 4.2823835976603723e-05, + "loss": 0.2324, + "step": 2510 + }, + { + "epoch": 0.2560195062480951, + "grad_norm": 5.625, + "learning_rate": 4.276721769229869e-05, + "loss": 0.1834, + "step": 2520 + }, + { + "epoch": 0.2570354566697145, + "grad_norm": 1.3671875, + "learning_rate": 4.271041467971323e-05, + "loss": 0.1826, + "step": 2530 + }, + { + "epoch": 0.25805140709133395, + "grad_norm": 5.0625, + "learning_rate": 4.265342752943632e-05, + "loss": 0.2463, + "step": 2540 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 2.859375, + "learning_rate": 4.2596256833971425e-05, + "loss": 0.2598, + "step": 2550 + }, + { + "epoch": 0.2600833079345728, + "grad_norm": 1.8515625, + "learning_rate": 4.2538903187730374e-05, + "loss": 0.1148, + "step": 2560 + }, + { + "epoch": 0.26109925835619224, + "grad_norm": 2.71875, + "learning_rate": 4.248136718702716e-05, + "loss": 0.2123, + "step": 2570 + }, + { + "epoch": 0.26211520877781164, + "grad_norm": 4.5625, + "learning_rate": 4.242364943007172e-05, + "loss": 0.2369, + "step": 2580 + }, + { + "epoch": 0.2631311591994311, + "grad_norm": 2.296875, + "learning_rate": 4.236575051696377e-05, + "loss": 0.261, + "step": 2590 + }, + { + "epoch": 0.2641471096210505, + "grad_norm": 2.75, + "learning_rate": 4.2307671049686514e-05, + "loss": 0.1564, + "step": 2600 + }, + { + "epoch": 0.26516306004266993, + "grad_norm": 3.5, + "learning_rate": 4.2249411632100396e-05, + "loss": 0.1563, + "step": 2610 + }, + { + "epoch": 0.2661790104642893, + "grad_norm": 2.84375, + "learning_rate": 4.219097286993684e-05, + "loss": 0.1697, + "step": 2620 + }, + { + "epoch": 0.26719496088590877, + "grad_norm": 2.125, + "learning_rate": 4.2132355370791946e-05, + "loss": 0.1844, + "step": 2630 + }, + { + "epoch": 0.26821091130752817, + "grad_norm": 4.03125, + "learning_rate": 4.2073559744120156e-05, + "loss": 0.2144, + "step": 2640 + }, + { + "epoch": 0.2692268617291476, + "grad_norm": 2.375, + "learning_rate": 4.201458660122793e-05, + "loss": 0.2013, + "step": 2650 + }, + { + "epoch": 0.27024281215076706, + "grad_norm": 3.625, + "learning_rate": 4.1955436555267393e-05, + "loss": 0.2166, + "step": 2660 + }, + { + "epoch": 0.27125876257238646, + "grad_norm": 0.328125, + "learning_rate": 4.189611022122997e-05, + "loss": 0.1934, + "step": 2670 + }, + { + "epoch": 0.2722747129940059, + "grad_norm": 2.75, + "learning_rate": 4.1836608215939944e-05, + "loss": 0.2157, + "step": 2680 + }, + { + "epoch": 0.2732906634156253, + "grad_norm": 3.5, + "learning_rate": 4.17769311580481e-05, + "loss": 0.18, + "step": 2690 + }, + { + "epoch": 0.27430661383724475, + "grad_norm": 2.109375, + "learning_rate": 4.171707966802528e-05, + "loss": 0.2178, + "step": 2700 + }, + { + "epoch": 0.27532256425886414, + "grad_norm": 4.65625, + "learning_rate": 4.16570543681559e-05, + "loss": 0.1896, + "step": 2710 + }, + { + "epoch": 0.2763385146804836, + "grad_norm": 4.8125, + "learning_rate": 4.159685588253151e-05, + "loss": 0.1322, + "step": 2720 + }, + { + "epoch": 0.27735446510210304, + "grad_norm": 3.9375, + "learning_rate": 4.153648483704429e-05, + "loss": 0.184, + "step": 2730 + }, + { + "epoch": 0.27837041552372244, + "grad_norm": 4.53125, + "learning_rate": 4.147594185938057e-05, + "loss": 0.2451, + "step": 2740 + }, + { + "epoch": 0.2793863659453419, + "grad_norm": 1.0390625, + "learning_rate": 4.141522757901426e-05, + "loss": 0.2367, + "step": 2750 + }, + { + "epoch": 0.2804023163669613, + "grad_norm": 3.375, + "learning_rate": 4.1354342627200345e-05, + "loss": 0.179, + "step": 2760 + }, + { + "epoch": 0.28141826678858073, + "grad_norm": 2.953125, + "learning_rate": 4.1293287636968286e-05, + "loss": 0.1396, + "step": 2770 + }, + { + "epoch": 0.2824342172102001, + "grad_norm": 2.546875, + "learning_rate": 4.1232063243115485e-05, + "loss": 0.1963, + "step": 2780 + }, + { + "epoch": 0.28345016763181957, + "grad_norm": 5.09375, + "learning_rate": 4.117067008220063e-05, + "loss": 0.2457, + "step": 2790 + }, + { + "epoch": 0.28446611805343897, + "grad_norm": 2.046875, + "learning_rate": 4.110910879253712e-05, + "loss": 0.2262, + "step": 2800 + }, + { + "epoch": 0.2854820684750584, + "grad_norm": 2.1875, + "learning_rate": 4.104738001418641e-05, + "loss": 0.2499, + "step": 2810 + }, + { + "epoch": 0.28649801889667786, + "grad_norm": 2.59375, + "learning_rate": 4.098548438895135e-05, + "loss": 0.1667, + "step": 2820 + }, + { + "epoch": 0.28751396931829726, + "grad_norm": 2.875, + "learning_rate": 4.092342256036954e-05, + "loss": 0.2288, + "step": 2830 + }, + { + "epoch": 0.2885299197399167, + "grad_norm": 3.015625, + "learning_rate": 4.086119517370659e-05, + "loss": 0.2038, + "step": 2840 + }, + { + "epoch": 0.2895458701615361, + "grad_norm": 3.53125, + "learning_rate": 4.0798802875949485e-05, + "loss": 0.181, + "step": 2850 + }, + { + "epoch": 0.29056182058315555, + "grad_norm": 2.296875, + "learning_rate": 4.073624631579975e-05, + "loss": 0.1886, + "step": 2860 + }, + { + "epoch": 0.29157777100477494, + "grad_norm": 3.609375, + "learning_rate": 4.067352614366685e-05, + "loss": 0.2053, + "step": 2870 + }, + { + "epoch": 0.2925937214263944, + "grad_norm": 2.328125, + "learning_rate": 4.061064301166128e-05, + "loss": 0.1409, + "step": 2880 + }, + { + "epoch": 0.29360967184801384, + "grad_norm": 4.9375, + "learning_rate": 4.054759757358787e-05, + "loss": 0.184, + "step": 2890 + }, + { + "epoch": 0.29462562226963324, + "grad_norm": 4.6875, + "learning_rate": 4.048439048493898e-05, + "loss": 0.2306, + "step": 2900 + }, + { + "epoch": 0.2956415726912527, + "grad_norm": 4.09375, + "learning_rate": 4.0421022402887676e-05, + "loss": 0.1914, + "step": 2910 + }, + { + "epoch": 0.2966575231128721, + "grad_norm": 2.3125, + "learning_rate": 4.035749398628088e-05, + "loss": 0.1653, + "step": 2920 + }, + { + "epoch": 0.29767347353449153, + "grad_norm": 2.515625, + "learning_rate": 4.029380589563256e-05, + "loss": 0.1941, + "step": 2930 + }, + { + "epoch": 0.2986894239561109, + "grad_norm": 1.78125, + "learning_rate": 4.02299587931168e-05, + "loss": 0.1117, + "step": 2940 + }, + { + "epoch": 0.29970537437773037, + "grad_norm": 0.8359375, + "learning_rate": 4.0165953342560974e-05, + "loss": 0.1605, + "step": 2950 + }, + { + "epoch": 0.30072132479934977, + "grad_norm": 3.046875, + "learning_rate": 4.010179020943884e-05, + "loss": 0.1726, + "step": 2960 + }, + { + "epoch": 0.3017372752209692, + "grad_norm": 3.453125, + "learning_rate": 4.003747006086357e-05, + "loss": 0.2208, + "step": 2970 + }, + { + "epoch": 0.30275322564258866, + "grad_norm": 2.515625, + "learning_rate": 3.9972993565580866e-05, + "loss": 0.1325, + "step": 2980 + }, + { + "epoch": 0.30376917606420806, + "grad_norm": 3.046875, + "learning_rate": 3.9908361393962e-05, + "loss": 0.2014, + "step": 2990 + }, + { + "epoch": 0.3047851264858275, + "grad_norm": 2.28125, + "learning_rate": 3.984357421799681e-05, + "loss": 0.165, + "step": 3000 + }, + { + "epoch": 0.3058010769074469, + "grad_norm": 5.09375, + "learning_rate": 3.9778632711286756e-05, + "loss": 0.212, + "step": 3010 + }, + { + "epoch": 0.30681702732906635, + "grad_norm": 4.25, + "learning_rate": 3.971353754903788e-05, + "loss": 0.2388, + "step": 3020 + }, + { + "epoch": 0.30783297775068574, + "grad_norm": 2.34375, + "learning_rate": 3.964828940805381e-05, + "loss": 0.2175, + "step": 3030 + }, + { + "epoch": 0.3088489281723052, + "grad_norm": 4.09375, + "learning_rate": 3.95828889667287e-05, + "loss": 0.2088, + "step": 3040 + }, + { + "epoch": 0.30986487859392464, + "grad_norm": 2.359375, + "learning_rate": 3.9517336905040244e-05, + "loss": 0.1913, + "step": 3050 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 1.1640625, + "learning_rate": 3.9451633904542483e-05, + "loss": 0.2185, + "step": 3060 + }, + { + "epoch": 0.3118967794371635, + "grad_norm": 2.59375, + "learning_rate": 3.9385780648358846e-05, + "loss": 0.2072, + "step": 3070 + }, + { + "epoch": 0.3129127298587829, + "grad_norm": 3.015625, + "learning_rate": 3.9319777821174955e-05, + "loss": 0.1902, + "step": 3080 + }, + { + "epoch": 0.31392868028040233, + "grad_norm": 2.375, + "learning_rate": 3.925362610923158e-05, + "loss": 0.259, + "step": 3090 + }, + { + "epoch": 0.3149446307020217, + "grad_norm": 4.65625, + "learning_rate": 3.918732620031742e-05, + "loss": 0.2026, + "step": 3100 + }, + { + "epoch": 0.31596058112364117, + "grad_norm": 2.1875, + "learning_rate": 3.912087878376205e-05, + "loss": 0.1478, + "step": 3110 + }, + { + "epoch": 0.31697653154526056, + "grad_norm": 2.34375, + "learning_rate": 3.905428455042865e-05, + "loss": 0.167, + "step": 3120 + }, + { + "epoch": 0.31799248196688, + "grad_norm": 2.390625, + "learning_rate": 3.898754419270693e-05, + "loss": 0.1629, + "step": 3130 + }, + { + "epoch": 0.31900843238849946, + "grad_norm": 1.546875, + "learning_rate": 3.892065840450583e-05, + "loss": 0.1308, + "step": 3140 + }, + { + "epoch": 0.32002438281011886, + "grad_norm": 4.625, + "learning_rate": 3.885362788124637e-05, + "loss": 0.2008, + "step": 3150 + }, + { + "epoch": 0.3210403332317383, + "grad_norm": 3.8125, + "learning_rate": 3.8786453319854396e-05, + "loss": 0.2225, + "step": 3160 + }, + { + "epoch": 0.3220562836533577, + "grad_norm": 3.015625, + "learning_rate": 3.8719135418753366e-05, + "loss": 0.2243, + "step": 3170 + }, + { + "epoch": 0.32307223407497715, + "grad_norm": 5.6875, + "learning_rate": 3.865167487785702e-05, + "loss": 0.1981, + "step": 3180 + }, + { + "epoch": 0.32408818449659654, + "grad_norm": 4.84375, + "learning_rate": 3.8584072398562164e-05, + "loss": 0.2031, + "step": 3190 + }, + { + "epoch": 0.325104134918216, + "grad_norm": 4.0625, + "learning_rate": 3.851632868374136e-05, + "loss": 0.1621, + "step": 3200 + }, + { + "epoch": 0.32612008533983544, + "grad_norm": 3.421875, + "learning_rate": 3.844844443773562e-05, + "loss": 0.1674, + "step": 3210 + }, + { + "epoch": 0.32713603576145484, + "grad_norm": 1.3671875, + "learning_rate": 3.8380420366347046e-05, + "loss": 0.1502, + "step": 3220 + }, + { + "epoch": 0.3281519861830743, + "grad_norm": 3.734375, + "learning_rate": 3.831225717683157e-05, + "loss": 0.1868, + "step": 3230 + }, + { + "epoch": 0.3291679366046937, + "grad_norm": 2.703125, + "learning_rate": 3.8243955577891534e-05, + "loss": 0.1818, + "step": 3240 + }, + { + "epoch": 0.3301838870263131, + "grad_norm": 3.796875, + "learning_rate": 3.8175516279668335e-05, + "loss": 0.2215, + "step": 3250 + }, + { + "epoch": 0.3311998374479325, + "grad_norm": 3.203125, + "learning_rate": 3.810693999373505e-05, + "loss": 0.2544, + "step": 3260 + }, + { + "epoch": 0.33221578786955197, + "grad_norm": 4.0, + "learning_rate": 3.8038227433089056e-05, + "loss": 0.1175, + "step": 3270 + }, + { + "epoch": 0.33323173829117136, + "grad_norm": 3.625, + "learning_rate": 3.796937931214458e-05, + "loss": 0.2213, + "step": 3280 + }, + { + "epoch": 0.3342476887127908, + "grad_norm": 1.7265625, + "learning_rate": 3.7900396346725296e-05, + "loss": 0.1711, + "step": 3290 + }, + { + "epoch": 0.33526363913441026, + "grad_norm": 3.140625, + "learning_rate": 3.783127925405686e-05, + "loss": 0.2628, + "step": 3300 + }, + { + "epoch": 0.33627958955602966, + "grad_norm": 2.1875, + "learning_rate": 3.77620287527595e-05, + "loss": 0.1671, + "step": 3310 + }, + { + "epoch": 0.3372955399776491, + "grad_norm": 5.28125, + "learning_rate": 3.769264556284048e-05, + "loss": 0.2109, + "step": 3320 + }, + { + "epoch": 0.3383114903992685, + "grad_norm": 2.875, + "learning_rate": 3.762313040568665e-05, + "loss": 0.1978, + "step": 3330 + }, + { + "epoch": 0.33932744082088795, + "grad_norm": 2.234375, + "learning_rate": 3.755348400405697e-05, + "loss": 0.1275, + "step": 3340 + }, + { + "epoch": 0.34034339124250734, + "grad_norm": 1.9453125, + "learning_rate": 3.7483707082074945e-05, + "loss": 0.1482, + "step": 3350 + }, + { + "epoch": 0.3413593416641268, + "grad_norm": 5.40625, + "learning_rate": 3.741380036522111e-05, + "loss": 0.1933, + "step": 3360 + }, + { + "epoch": 0.34237529208574624, + "grad_norm": 4.53125, + "learning_rate": 3.734376458032551e-05, + "loss": 0.1925, + "step": 3370 + }, + { + "epoch": 0.34339124250736563, + "grad_norm": 4.0625, + "learning_rate": 3.727360045556014e-05, + "loss": 0.2297, + "step": 3380 + }, + { + "epoch": 0.3444071929289851, + "grad_norm": 2.53125, + "learning_rate": 3.7203308720431336e-05, + "loss": 0.1704, + "step": 3390 + }, + { + "epoch": 0.3454231433506045, + "grad_norm": 1.859375, + "learning_rate": 3.7132890105772234e-05, + "loss": 0.258, + "step": 3400 + }, + { + "epoch": 0.3464390937722239, + "grad_norm": 3.90625, + "learning_rate": 3.706234534373515e-05, + "loss": 0.2376, + "step": 3410 + }, + { + "epoch": 0.3474550441938433, + "grad_norm": 1.1015625, + "learning_rate": 3.6991675167783985e-05, + "loss": 0.2403, + "step": 3420 + }, + { + "epoch": 0.34847099461546277, + "grad_norm": 1.1640625, + "learning_rate": 3.6920880312686556e-05, + "loss": 0.1642, + "step": 3430 + }, + { + "epoch": 0.34948694503708216, + "grad_norm": 2.875, + "learning_rate": 3.684996151450702e-05, + "loss": 0.1455, + "step": 3440 + }, + { + "epoch": 0.3505028954587016, + "grad_norm": 0.59765625, + "learning_rate": 3.6778919510598155e-05, + "loss": 0.2175, + "step": 3450 + }, + { + "epoch": 0.35151884588032106, + "grad_norm": 0.93359375, + "learning_rate": 3.670775503959376e-05, + "loss": 0.1858, + "step": 3460 + }, + { + "epoch": 0.35253479630194046, + "grad_norm": 4.1875, + "learning_rate": 3.6636468841400917e-05, + "loss": 0.1911, + "step": 3470 + }, + { + "epoch": 0.3535507467235599, + "grad_norm": 3.734375, + "learning_rate": 3.656506165719233e-05, + "loss": 0.2114, + "step": 3480 + }, + { + "epoch": 0.3545666971451793, + "grad_norm": 1.171875, + "learning_rate": 3.649353422939863e-05, + "loss": 0.1841, + "step": 3490 + }, + { + "epoch": 0.35558264756679875, + "grad_norm": 2.53125, + "learning_rate": 3.6421887301700615e-05, + "loss": 0.1505, + "step": 3500 + }, + { + "epoch": 0.35659859798841814, + "grad_norm": 4.9375, + "learning_rate": 3.6350121619021524e-05, + "loss": 0.2625, + "step": 3510 + }, + { + "epoch": 0.3576145484100376, + "grad_norm": 5.25, + "learning_rate": 3.627823792751936e-05, + "loss": 0.1676, + "step": 3520 + }, + { + "epoch": 0.35863049883165704, + "grad_norm": 1.09375, + "learning_rate": 3.620623697457905e-05, + "loss": 0.1963, + "step": 3530 + }, + { + "epoch": 0.35964644925327643, + "grad_norm": 4.03125, + "learning_rate": 3.613411950880468e-05, + "loss": 0.2048, + "step": 3540 + }, + { + "epoch": 0.3606623996748959, + "grad_norm": 4.40625, + "learning_rate": 3.606188628001178e-05, + "loss": 0.226, + "step": 3550 + }, + { + "epoch": 0.3616783500965153, + "grad_norm": 2.375, + "learning_rate": 3.598953803921947e-05, + "loss": 0.1884, + "step": 3560 + }, + { + "epoch": 0.3626943005181347, + "grad_norm": 3.21875, + "learning_rate": 3.591707553864266e-05, + "loss": 0.224, + "step": 3570 + }, + { + "epoch": 0.3637102509397541, + "grad_norm": 3.5625, + "learning_rate": 3.584449953168423e-05, + "loss": 0.1866, + "step": 3580 + }, + { + "epoch": 0.36472620136137357, + "grad_norm": 2.359375, + "learning_rate": 3.577181077292722e-05, + "loss": 0.1663, + "step": 3590 + }, + { + "epoch": 0.36574215178299296, + "grad_norm": 5.0, + "learning_rate": 3.569901001812696e-05, + "loss": 0.2032, + "step": 3600 + }, + { + "epoch": 0.3667581022046124, + "grad_norm": 1.953125, + "learning_rate": 3.562609802420321e-05, + "loss": 0.2395, + "step": 3610 + }, + { + "epoch": 0.36777405262623186, + "grad_norm": 3.796875, + "learning_rate": 3.555307554923229e-05, + "loss": 0.1799, + "step": 3620 + }, + { + "epoch": 0.36879000304785126, + "grad_norm": 4.4375, + "learning_rate": 3.547994335243925e-05, + "loss": 0.1771, + "step": 3630 + }, + { + "epoch": 0.3698059534694707, + "grad_norm": 1.890625, + "learning_rate": 3.540670219418989e-05, + "loss": 0.2123, + "step": 3640 + }, + { + "epoch": 0.3708219038910901, + "grad_norm": 4.03125, + "learning_rate": 3.53333528359829e-05, + "loss": 0.2159, + "step": 3650 + }, + { + "epoch": 0.37183785431270955, + "grad_norm": 3.265625, + "learning_rate": 3.525989604044198e-05, + "loss": 0.2749, + "step": 3660 + }, + { + "epoch": 0.37285380473432894, + "grad_norm": 1.4375, + "learning_rate": 3.5186332571307826e-05, + "loss": 0.1613, + "step": 3670 + }, + { + "epoch": 0.3738697551559484, + "grad_norm": 3.984375, + "learning_rate": 3.511266319343025e-05, + "loss": 0.1877, + "step": 3680 + }, + { + "epoch": 0.37488570557756784, + "grad_norm": 2.203125, + "learning_rate": 3.503888867276022e-05, + "loss": 0.2185, + "step": 3690 + }, + { + "epoch": 0.37590165599918723, + "grad_norm": 1.5078125, + "learning_rate": 3.4965009776341894e-05, + "loss": 0.2195, + "step": 3700 + }, + { + "epoch": 0.3769176064208067, + "grad_norm": 4.375, + "learning_rate": 3.489102727230461e-05, + "loss": 0.2344, + "step": 3710 + }, + { + "epoch": 0.3779335568424261, + "grad_norm": 2.984375, + "learning_rate": 3.481694192985496e-05, + "loss": 0.1863, + "step": 3720 + }, + { + "epoch": 0.3789495072640455, + "grad_norm": 1.1328125, + "learning_rate": 3.474275451926875e-05, + "loss": 0.1894, + "step": 3730 + }, + { + "epoch": 0.3799654576856649, + "grad_norm": 2.265625, + "learning_rate": 3.4668465811883e-05, + "loss": 0.2127, + "step": 3740 + }, + { + "epoch": 0.38098140810728437, + "grad_norm": 2.921875, + "learning_rate": 3.4594076580087914e-05, + "loss": 0.2125, + "step": 3750 + }, + { + "epoch": 0.38199735852890376, + "grad_norm": 2.390625, + "learning_rate": 3.451958759731889e-05, + "loss": 0.1801, + "step": 3760 + }, + { + "epoch": 0.3830133089505232, + "grad_norm": 3.046875, + "learning_rate": 3.4444999638048456e-05, + "loss": 0.1949, + "step": 3770 + }, + { + "epoch": 0.38402925937214266, + "grad_norm": 2.890625, + "learning_rate": 3.437031347777817e-05, + "loss": 0.2719, + "step": 3780 + }, + { + "epoch": 0.38504520979376206, + "grad_norm": 3.9375, + "learning_rate": 3.4295529893030634e-05, + "loss": 0.1697, + "step": 3790 + }, + { + "epoch": 0.3860611602153815, + "grad_norm": 2.0625, + "learning_rate": 3.422064966134138e-05, + "loss": 0.1557, + "step": 3800 + }, + { + "epoch": 0.3870771106370009, + "grad_norm": 2.234375, + "learning_rate": 3.4145673561250794e-05, + "loss": 0.2129, + "step": 3810 + }, + { + "epoch": 0.38809306105862035, + "grad_norm": 4.96875, + "learning_rate": 3.4070602372296e-05, + "loss": 0.2068, + "step": 3820 + }, + { + "epoch": 0.38910901148023974, + "grad_norm": 2.234375, + "learning_rate": 3.39954368750028e-05, + "loss": 0.1634, + "step": 3830 + }, + { + "epoch": 0.3901249619018592, + "grad_norm": 1.75, + "learning_rate": 3.392017785087752e-05, + "loss": 0.2299, + "step": 3840 + }, + { + "epoch": 0.39114091232347864, + "grad_norm": 3.90625, + "learning_rate": 3.38448260823989e-05, + "loss": 0.1585, + "step": 3850 + }, + { + "epoch": 0.39215686274509803, + "grad_norm": 2.8125, + "learning_rate": 3.376938235300996e-05, + "loss": 0.2382, + "step": 3860 + }, + { + "epoch": 0.3931728131667175, + "grad_norm": 5.375, + "learning_rate": 3.369384744710984e-05, + "loss": 0.1987, + "step": 3870 + }, + { + "epoch": 0.3941887635883369, + "grad_norm": 2.578125, + "learning_rate": 3.361822215004566e-05, + "loss": 0.2316, + "step": 3880 + }, + { + "epoch": 0.3952047140099563, + "grad_norm": 2.0, + "learning_rate": 3.354250724810436e-05, + "loss": 0.2019, + "step": 3890 + }, + { + "epoch": 0.3962206644315757, + "grad_norm": 2.3125, + "learning_rate": 3.34667035285045e-05, + "loss": 0.187, + "step": 3900 + }, + { + "epoch": 0.39723661485319517, + "grad_norm": 3.53125, + "learning_rate": 3.339081177938811e-05, + "loss": 0.2353, + "step": 3910 + }, + { + "epoch": 0.39825256527481456, + "grad_norm": 1.9609375, + "learning_rate": 3.331483278981244e-05, + "loss": 0.2078, + "step": 3920 + }, + { + "epoch": 0.399268515696434, + "grad_norm": 1.2109375, + "learning_rate": 3.323876734974183e-05, + "loss": 0.1761, + "step": 3930 + }, + { + "epoch": 0.40028446611805346, + "grad_norm": 4.0625, + "learning_rate": 3.316261625003943e-05, + "loss": 0.2081, + "step": 3940 + }, + { + "epoch": 0.40130041653967286, + "grad_norm": 1.953125, + "learning_rate": 3.308638028245902e-05, + "loss": 0.2087, + "step": 3950 + }, + { + "epoch": 0.4023163669612923, + "grad_norm": 2.390625, + "learning_rate": 3.301006023963676e-05, + "loss": 0.1579, + "step": 3960 + }, + { + "epoch": 0.4033323173829117, + "grad_norm": 3.53125, + "learning_rate": 3.293365691508295e-05, + "loss": 0.1904, + "step": 3970 + }, + { + "epoch": 0.40434826780453115, + "grad_norm": 3.0, + "learning_rate": 3.285717110317379e-05, + "loss": 0.1991, + "step": 3980 + }, + { + "epoch": 0.40536421822615054, + "grad_norm": 7.21875, + "learning_rate": 3.27806035991431e-05, + "loss": 0.1445, + "step": 3990 + }, + { + "epoch": 0.40638016864777, + "grad_norm": 1.0859375, + "learning_rate": 3.2703955199074075e-05, + "loss": 0.2393, + "step": 4000 + }, + { + "epoch": 0.40739611906938944, + "grad_norm": 4.5625, + "learning_rate": 3.262722669989098e-05, + "loss": 0.1789, + "step": 4010 + }, + { + "epoch": 0.40841206949100883, + "grad_norm": 3.09375, + "learning_rate": 3.255041889935092e-05, + "loss": 0.1511, + "step": 4020 + }, + { + "epoch": 0.4094280199126283, + "grad_norm": 1.90625, + "learning_rate": 3.247353259603547e-05, + "loss": 0.2066, + "step": 4030 + }, + { + "epoch": 0.4104439703342477, + "grad_norm": 2.28125, + "learning_rate": 3.239656858934242e-05, + "loss": 0.1564, + "step": 4040 + }, + { + "epoch": 0.4114599207558671, + "grad_norm": 2.609375, + "learning_rate": 3.231952767947746e-05, + "loss": 0.1503, + "step": 4050 + }, + { + "epoch": 0.4124758711774865, + "grad_norm": 1.4453125, + "learning_rate": 3.2242410667445844e-05, + "loss": 0.1633, + "step": 4060 + }, + { + "epoch": 0.41349182159910597, + "grad_norm": 3.015625, + "learning_rate": 3.2165218355044076e-05, + "loss": 0.1492, + "step": 4070 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 3.234375, + "learning_rate": 3.2087951544851566e-05, + "loss": 0.3051, + "step": 4080 + }, + { + "epoch": 0.4155237224423448, + "grad_norm": 2.9375, + "learning_rate": 3.20106110402223e-05, + "loss": 0.2229, + "step": 4090 + }, + { + "epoch": 0.41653967286396426, + "grad_norm": 3.171875, + "learning_rate": 3.1933197645276455e-05, + "loss": 0.2224, + "step": 4100 + }, + { + "epoch": 0.41755562328558365, + "grad_norm": 2.09375, + "learning_rate": 3.185571216489209e-05, + "loss": 0.1297, + "step": 4110 + }, + { + "epoch": 0.4185715737072031, + "grad_norm": 3.625, + "learning_rate": 3.177815540469669e-05, + "loss": 0.2074, + "step": 4120 + }, + { + "epoch": 0.4195875241288225, + "grad_norm": 2.296875, + "learning_rate": 3.1700528171058916e-05, + "loss": 0.1949, + "step": 4130 + }, + { + "epoch": 0.42060347455044195, + "grad_norm": 3.8125, + "learning_rate": 3.162283127108011e-05, + "loss": 0.1661, + "step": 4140 + }, + { + "epoch": 0.42161942497206134, + "grad_norm": 2.5, + "learning_rate": 3.154506551258594e-05, + "loss": 0.2275, + "step": 4150 + }, + { + "epoch": 0.4226353753936808, + "grad_norm": 2.96875, + "learning_rate": 3.146723170411804e-05, + "loss": 0.2242, + "step": 4160 + }, + { + "epoch": 0.42365132581530024, + "grad_norm": 6.625, + "learning_rate": 3.138933065492552e-05, + "loss": 0.1897, + "step": 4170 + }, + { + "epoch": 0.42466727623691963, + "grad_norm": 0.8515625, + "learning_rate": 3.131136317495665e-05, + "loss": 0.1629, + "step": 4180 + }, + { + "epoch": 0.4256832266585391, + "grad_norm": 0.94140625, + "learning_rate": 3.1233330074850364e-05, + "loss": 0.1535, + "step": 4190 + }, + { + "epoch": 0.4266991770801585, + "grad_norm": 2.6875, + "learning_rate": 3.115523216592786e-05, + "loss": 0.2494, + "step": 4200 + }, + { + "epoch": 0.4277151275017779, + "grad_norm": 2.578125, + "learning_rate": 3.107707026018417e-05, + "loss": 0.1705, + "step": 4210 + }, + { + "epoch": 0.4287310779233973, + "grad_norm": 3.0625, + "learning_rate": 3.09988451702797e-05, + "loss": 0.1507, + "step": 4220 + }, + { + "epoch": 0.42974702834501677, + "grad_norm": 2.421875, + "learning_rate": 3.0920557709531804e-05, + "loss": 0.3071, + "step": 4230 + }, + { + "epoch": 0.43076297876663616, + "grad_norm": 3.640625, + "learning_rate": 3.0842208691906306e-05, + "loss": 0.199, + "step": 4240 + }, + { + "epoch": 0.4317789291882556, + "grad_norm": 3.5625, + "learning_rate": 3.076379893200904e-05, + "loss": 0.1987, + "step": 4250 + }, + { + "epoch": 0.43279487960987506, + "grad_norm": 3.65625, + "learning_rate": 3.068532924507739e-05, + "loss": 0.1945, + "step": 4260 + }, + { + "epoch": 0.43381083003149445, + "grad_norm": 5.875, + "learning_rate": 3.060680044697183e-05, + "loss": 0.1937, + "step": 4270 + }, + { + "epoch": 0.4348267804531139, + "grad_norm": 2.859375, + "learning_rate": 3.052821335416739e-05, + "loss": 0.1643, + "step": 4280 + }, + { + "epoch": 0.4358427308747333, + "grad_norm": 3.296875, + "learning_rate": 3.0449568783745203e-05, + "loss": 0.1455, + "step": 4290 + }, + { + "epoch": 0.43685868129635275, + "grad_norm": 0.427734375, + "learning_rate": 3.0370867553384023e-05, + "loss": 0.1891, + "step": 4300 + }, + { + "epoch": 0.43787463171797214, + "grad_norm": 0.361328125, + "learning_rate": 3.029211048135171e-05, + "loss": 0.1377, + "step": 4310 + }, + { + "epoch": 0.4388905821395916, + "grad_norm": 1.8203125, + "learning_rate": 3.021329838649668e-05, + "loss": 0.2194, + "step": 4320 + }, + { + "epoch": 0.43990653256121104, + "grad_norm": 1.8828125, + "learning_rate": 3.0134432088239462e-05, + "loss": 0.1915, + "step": 4330 + }, + { + "epoch": 0.44092248298283043, + "grad_norm": 2.015625, + "learning_rate": 3.0055512406564146e-05, + "loss": 0.1794, + "step": 4340 + }, + { + "epoch": 0.4419384334044499, + "grad_norm": 2.546875, + "learning_rate": 2.9976540162009836e-05, + "loss": 0.2154, + "step": 4350 + }, + { + "epoch": 0.4429543838260693, + "grad_norm": 4.09375, + "learning_rate": 2.9897516175662155e-05, + "loss": 0.1861, + "step": 4360 + }, + { + "epoch": 0.4439703342476887, + "grad_norm": 3.953125, + "learning_rate": 2.9818441269144693e-05, + "loss": 0.1857, + "step": 4370 + }, + { + "epoch": 0.4449862846693081, + "grad_norm": 2.234375, + "learning_rate": 2.9739316264610452e-05, + "loss": 0.1493, + "step": 4380 + }, + { + "epoch": 0.44600223509092757, + "grad_norm": 1.109375, + "learning_rate": 2.966014198473332e-05, + "loss": 0.186, + "step": 4390 + }, + { + "epoch": 0.44701818551254696, + "grad_norm": 4.5625, + "learning_rate": 2.9580919252699502e-05, + "loss": 0.1963, + "step": 4400 + }, + { + "epoch": 0.4480341359341664, + "grad_norm": 7.3125, + "learning_rate": 2.9501648892198984e-05, + "loss": 0.2882, + "step": 4410 + }, + { + "epoch": 0.44905008635578586, + "grad_norm": 3.03125, + "learning_rate": 2.942233172741693e-05, + "loss": 0.2154, + "step": 4420 + }, + { + "epoch": 0.45006603677740525, + "grad_norm": 2.421875, + "learning_rate": 2.934296858302515e-05, + "loss": 0.2228, + "step": 4430 + }, + { + "epoch": 0.4510819871990247, + "grad_norm": 1.6015625, + "learning_rate": 2.9263560284173485e-05, + "loss": 0.1637, + "step": 4440 + }, + { + "epoch": 0.4520979376206441, + "grad_norm": 4.5, + "learning_rate": 2.91841076564813e-05, + "loss": 0.1396, + "step": 4450 + }, + { + "epoch": 0.45311388804226355, + "grad_norm": 1.9609375, + "learning_rate": 2.9104611526028808e-05, + "loss": 0.186, + "step": 4460 + }, + { + "epoch": 0.45412983846388294, + "grad_norm": 2.046875, + "learning_rate": 2.902507271934855e-05, + "loss": 0.1706, + "step": 4470 + }, + { + "epoch": 0.4551457888855024, + "grad_norm": 2.390625, + "learning_rate": 2.8945492063416768e-05, + "loss": 0.2191, + "step": 4480 + }, + { + "epoch": 0.45616173930712184, + "grad_norm": 2.734375, + "learning_rate": 2.8865870385644823e-05, + "loss": 0.1651, + "step": 4490 + }, + { + "epoch": 0.45717768972874123, + "grad_norm": 4.4375, + "learning_rate": 2.8786208513870583e-05, + "loss": 0.1907, + "step": 4500 + }, + { + "epoch": 0.4581936401503607, + "grad_norm": 1.9609375, + "learning_rate": 2.8706507276349815e-05, + "loss": 0.2256, + "step": 4510 + }, + { + "epoch": 0.4592095905719801, + "grad_norm": 3.375, + "learning_rate": 2.8626767501747588e-05, + "loss": 0.215, + "step": 4520 + }, + { + "epoch": 0.4602255409935995, + "grad_norm": 2.296875, + "learning_rate": 2.854699001912964e-05, + "loss": 0.2241, + "step": 4530 + }, + { + "epoch": 0.4612414914152189, + "grad_norm": 2.078125, + "learning_rate": 2.846717565795376e-05, + "loss": 0.1541, + "step": 4540 + }, + { + "epoch": 0.46225744183683837, + "grad_norm": 0.81640625, + "learning_rate": 2.8387325248061164e-05, + "loss": 0.1718, + "step": 4550 + }, + { + "epoch": 0.46327339225845776, + "grad_norm": 5.6875, + "learning_rate": 2.8307439619667897e-05, + "loss": 0.259, + "step": 4560 + }, + { + "epoch": 0.4642893426800772, + "grad_norm": 1.78125, + "learning_rate": 2.8227519603356157e-05, + "loss": 0.2205, + "step": 4570 + }, + { + "epoch": 0.46530529310169666, + "grad_norm": 4.78125, + "learning_rate": 2.8147566030065677e-05, + "loss": 0.2256, + "step": 4580 + }, + { + "epoch": 0.46632124352331605, + "grad_norm": 3.296875, + "learning_rate": 2.8067579731085085e-05, + "loss": 0.1671, + "step": 4590 + }, + { + "epoch": 0.4673371939449355, + "grad_norm": 3.265625, + "learning_rate": 2.7987561538043273e-05, + "loss": 0.2471, + "step": 4600 + }, + { + "epoch": 0.4683531443665549, + "grad_norm": 3.390625, + "learning_rate": 2.7907512282900727e-05, + "loss": 0.1749, + "step": 4610 + }, + { + "epoch": 0.46936909478817435, + "grad_norm": 3.140625, + "learning_rate": 2.782743279794091e-05, + "loss": 0.2276, + "step": 4620 + }, + { + "epoch": 0.47038504520979374, + "grad_norm": 2.921875, + "learning_rate": 2.7747323915761574e-05, + "loss": 0.1971, + "step": 4630 + }, + { + "epoch": 0.4714009956314132, + "grad_norm": 4.15625, + "learning_rate": 2.7667186469266122e-05, + "loss": 0.1951, + "step": 4640 + }, + { + "epoch": 0.47241694605303264, + "grad_norm": 2.953125, + "learning_rate": 2.7587021291654924e-05, + "loss": 0.2045, + "step": 4650 + }, + { + "epoch": 0.47343289647465203, + "grad_norm": 1.6640625, + "learning_rate": 2.750682921641672e-05, + "loss": 0.155, + "step": 4660 + }, + { + "epoch": 0.4744488468962715, + "grad_norm": 4.375, + "learning_rate": 2.7426611077319864e-05, + "loss": 0.2038, + "step": 4670 + }, + { + "epoch": 0.4754647973178909, + "grad_norm": 5.5, + "learning_rate": 2.734636770840372e-05, + "loss": 0.159, + "step": 4680 + }, + { + "epoch": 0.4764807477395103, + "grad_norm": 1.703125, + "learning_rate": 2.7266099943969976e-05, + "loss": 0.1566, + "step": 4690 + }, + { + "epoch": 0.4774966981611297, + "grad_norm": 0.81640625, + "learning_rate": 2.7185808618573943e-05, + "loss": 0.1927, + "step": 4700 + }, + { + "epoch": 0.47851264858274917, + "grad_norm": 0.81640625, + "learning_rate": 2.710549456701592e-05, + "loss": 0.1873, + "step": 4710 + }, + { + "epoch": 0.47952859900436856, + "grad_norm": 3.828125, + "learning_rate": 2.702515862433247e-05, + "loss": 0.2474, + "step": 4720 + }, + { + "epoch": 0.480544549425988, + "grad_norm": 1.1640625, + "learning_rate": 2.6944801625787795e-05, + "loss": 0.204, + "step": 4730 + }, + { + "epoch": 0.48156049984760746, + "grad_norm": 2.953125, + "learning_rate": 2.6864424406864984e-05, + "loss": 0.1758, + "step": 4740 + }, + { + "epoch": 0.48257645026922685, + "grad_norm": 3.265625, + "learning_rate": 2.6784027803257377e-05, + "loss": 0.161, + "step": 4750 + }, + { + "epoch": 0.4835924006908463, + "grad_norm": 2.046875, + "learning_rate": 2.6703612650859848e-05, + "loss": 0.1469, + "step": 4760 + }, + { + "epoch": 0.4846083511124657, + "grad_norm": 4.03125, + "learning_rate": 2.6623179785760148e-05, + "loss": 0.1858, + "step": 4770 + }, + { + "epoch": 0.48562430153408515, + "grad_norm": 2.65625, + "learning_rate": 2.6542730044230175e-05, + "loss": 0.176, + "step": 4780 + }, + { + "epoch": 0.48664025195570454, + "grad_norm": 2.59375, + "learning_rate": 2.6462264262717278e-05, + "loss": 0.1657, + "step": 4790 + }, + { + "epoch": 0.487656202377324, + "grad_norm": 4.78125, + "learning_rate": 2.6381783277835605e-05, + "loss": 0.2705, + "step": 4800 + }, + { + "epoch": 0.48867215279894344, + "grad_norm": 3.65625, + "learning_rate": 2.6301287926357355e-05, + "loss": 0.2252, + "step": 4810 + }, + { + "epoch": 0.48968810322056283, + "grad_norm": 0.734375, + "learning_rate": 2.622077904520411e-05, + "loss": 0.2141, + "step": 4820 + }, + { + "epoch": 0.4907040536421823, + "grad_norm": 5.15625, + "learning_rate": 2.6140257471438108e-05, + "loss": 0.1935, + "step": 4830 + }, + { + "epoch": 0.4917200040638017, + "grad_norm": 3.625, + "learning_rate": 2.6059724042253574e-05, + "loss": 0.2121, + "step": 4840 + }, + { + "epoch": 0.4927359544854211, + "grad_norm": 1.2890625, + "learning_rate": 2.5979179594967983e-05, + "loss": 0.1221, + "step": 4850 + }, + { + "epoch": 0.4937519049070405, + "grad_norm": 3.4375, + "learning_rate": 2.5898624967013367e-05, + "loss": 0.2208, + "step": 4860 + }, + { + "epoch": 0.49476785532865997, + "grad_norm": 2.40625, + "learning_rate": 2.5818060995927607e-05, + "loss": 0.1904, + "step": 4870 + }, + { + "epoch": 0.49578380575027936, + "grad_norm": 2.921875, + "learning_rate": 2.573748851934574e-05, + "loss": 0.1658, + "step": 4880 + }, + { + "epoch": 0.4967997561718988, + "grad_norm": 1.6640625, + "learning_rate": 2.5656908374991213e-05, + "loss": 0.1626, + "step": 4890 + }, + { + "epoch": 0.49781570659351826, + "grad_norm": 1.8046875, + "learning_rate": 2.557632140066721e-05, + "loss": 0.1905, + "step": 4900 + }, + { + "epoch": 0.49883165701513765, + "grad_norm": 4.875, + "learning_rate": 2.5495728434247917e-05, + "loss": 0.2591, + "step": 4910 + }, + { + "epoch": 0.4998476074367571, + "grad_norm": 1.4453125, + "learning_rate": 2.5415130313669845e-05, + "loss": 0.1359, + "step": 4920 + }, + { + "epoch": 0.5008635578583766, + "grad_norm": 2.109375, + "learning_rate": 2.5334527876923063e-05, + "loss": 0.2353, + "step": 4930 + }, + { + "epoch": 0.501879508279996, + "grad_norm": 3.546875, + "learning_rate": 2.5253921962042525e-05, + "loss": 0.2173, + "step": 4940 + }, + { + "epoch": 0.5028954587016153, + "grad_norm": 1.8125, + "learning_rate": 2.5173313407099373e-05, + "loss": 0.1631, + "step": 4950 + }, + { + "epoch": 0.5039114091232347, + "grad_norm": 2.671875, + "learning_rate": 2.5092703050192163e-05, + "loss": 0.1884, + "step": 4960 + }, + { + "epoch": 0.5049273595448542, + "grad_norm": 2.5625, + "learning_rate": 2.501209172943819e-05, + "loss": 0.217, + "step": 4970 + }, + { + "epoch": 0.5059433099664736, + "grad_norm": 4.375, + "learning_rate": 2.49314802829648e-05, + "loss": 0.1854, + "step": 4980 + }, + { + "epoch": 0.506959260388093, + "grad_norm": 2.3125, + "learning_rate": 2.4850869548900628e-05, + "loss": 0.2049, + "step": 4990 + }, + { + "epoch": 0.5079752108097125, + "grad_norm": 3.859375, + "learning_rate": 2.477026036536688e-05, + "loss": 0.2093, + "step": 5000 + }, + { + "epoch": 0.5089911612313319, + "grad_norm": 1.09375, + "learning_rate": 2.4689653570468677e-05, + "loss": 0.164, + "step": 5010 + }, + { + "epoch": 0.5100071116529513, + "grad_norm": 3.40625, + "learning_rate": 2.460905000228628e-05, + "loss": 0.1649, + "step": 5020 + }, + { + "epoch": 0.5110230620745707, + "grad_norm": 3.546875, + "learning_rate": 2.4528450498866428e-05, + "loss": 0.1777, + "step": 5030 + }, + { + "epoch": 0.5120390124961902, + "grad_norm": 3.0, + "learning_rate": 2.444785589821356e-05, + "loss": 0.1505, + "step": 5040 + }, + { + "epoch": 0.5130549629178096, + "grad_norm": 1.6484375, + "learning_rate": 2.436726703828118e-05, + "loss": 0.2672, + "step": 5050 + }, + { + "epoch": 0.514070913339429, + "grad_norm": 4.34375, + "learning_rate": 2.428668475696308e-05, + "loss": 0.1756, + "step": 5060 + }, + { + "epoch": 0.5150868637610485, + "grad_norm": 2.78125, + "learning_rate": 2.420610989208465e-05, + "loss": 0.1655, + "step": 5070 + }, + { + "epoch": 0.5161028141826679, + "grad_norm": 1.4609375, + "learning_rate": 2.412554328139419e-05, + "loss": 0.1579, + "step": 5080 + }, + { + "epoch": 0.5171187646042873, + "grad_norm": 2.28125, + "learning_rate": 2.404498576255416e-05, + "loss": 0.1599, + "step": 5090 + }, + { + "epoch": 0.5181347150259067, + "grad_norm": 0.6484375, + "learning_rate": 2.3964438173132522e-05, + "loss": 0.1508, + "step": 5100 + }, + { + "epoch": 0.5191506654475262, + "grad_norm": 3.390625, + "learning_rate": 2.388390135059395e-05, + "loss": 0.1578, + "step": 5110 + }, + { + "epoch": 0.5201666158691456, + "grad_norm": 1.21875, + "learning_rate": 2.3803376132291226e-05, + "loss": 0.1374, + "step": 5120 + }, + { + "epoch": 0.521182566290765, + "grad_norm": 4.0625, + "learning_rate": 2.3722863355456436e-05, + "loss": 0.1854, + "step": 5130 + }, + { + "epoch": 0.5221985167123845, + "grad_norm": 4.71875, + "learning_rate": 2.364236385719236e-05, + "loss": 0.1391, + "step": 5140 + }, + { + "epoch": 0.5232144671340039, + "grad_norm": 3.296875, + "learning_rate": 2.356187847446366e-05, + "loss": 0.2106, + "step": 5150 + }, + { + "epoch": 0.5242304175556233, + "grad_norm": 3.296875, + "learning_rate": 2.348140804408829e-05, + "loss": 0.2383, + "step": 5160 + }, + { + "epoch": 0.5252463679772427, + "grad_norm": 3.359375, + "learning_rate": 2.3400953402728713e-05, + "loss": 0.1537, + "step": 5170 + }, + { + "epoch": 0.5262623183988622, + "grad_norm": 1.4921875, + "learning_rate": 2.332051538688322e-05, + "loss": 0.1841, + "step": 5180 + }, + { + "epoch": 0.5272782688204816, + "grad_norm": 3.25, + "learning_rate": 2.3240094832877287e-05, + "loss": 0.1855, + "step": 5190 + }, + { + "epoch": 0.528294219242101, + "grad_norm": 3.34375, + "learning_rate": 2.3159692576854793e-05, + "loss": 0.2625, + "step": 5200 + }, + { + "epoch": 0.5293101696637205, + "grad_norm": 3.6875, + "learning_rate": 2.3079309454769413e-05, + "loss": 0.1292, + "step": 5210 + }, + { + "epoch": 0.5303261200853399, + "grad_norm": 1.1171875, + "learning_rate": 2.2998946302375827e-05, + "loss": 0.1263, + "step": 5220 + }, + { + "epoch": 0.5313420705069593, + "grad_norm": 2.71875, + "learning_rate": 2.2918603955221148e-05, + "loss": 0.2296, + "step": 5230 + }, + { + "epoch": 0.5323580209285786, + "grad_norm": 2.015625, + "learning_rate": 2.283828324863613e-05, + "loss": 0.1231, + "step": 5240 + }, + { + "epoch": 0.5333739713501982, + "grad_norm": 3.671875, + "learning_rate": 2.2757985017726557e-05, + "loss": 0.1939, + "step": 5250 + }, + { + "epoch": 0.5343899217718175, + "grad_norm": 1.9765625, + "learning_rate": 2.2677710097364495e-05, + "loss": 0.168, + "step": 5260 + }, + { + "epoch": 0.5354058721934369, + "grad_norm": 2.609375, + "learning_rate": 2.259745932217969e-05, + "loss": 0.1883, + "step": 5270 + }, + { + "epoch": 0.5364218226150563, + "grad_norm": 2.8125, + "learning_rate": 2.2517233526550817e-05, + "loss": 0.1898, + "step": 5280 + }, + { + "epoch": 0.5374377730366758, + "grad_norm": 3.125, + "learning_rate": 2.2437033544596837e-05, + "loss": 0.1838, + "step": 5290 + }, + { + "epoch": 0.5384537234582952, + "grad_norm": 4.90625, + "learning_rate": 2.2356860210168336e-05, + "loss": 0.1553, + "step": 5300 + }, + { + "epoch": 0.5394696738799146, + "grad_norm": 3.171875, + "learning_rate": 2.2276714356838824e-05, + "loss": 0.2248, + "step": 5310 + }, + { + "epoch": 0.5404856243015341, + "grad_norm": 1.34375, + "learning_rate": 2.2196596817896118e-05, + "loss": 0.1421, + "step": 5320 + }, + { + "epoch": 0.5415015747231535, + "grad_norm": 3.28125, + "learning_rate": 2.2116508426333596e-05, + "loss": 0.1947, + "step": 5330 + }, + { + "epoch": 0.5425175251447729, + "grad_norm": 1.9296875, + "learning_rate": 2.2036450014841652e-05, + "loss": 0.2207, + "step": 5340 + }, + { + "epoch": 0.5435334755663923, + "grad_norm": 0.5703125, + "learning_rate": 2.19564224157989e-05, + "loss": 0.2208, + "step": 5350 + }, + { + "epoch": 0.5445494259880118, + "grad_norm": 7.5625, + "learning_rate": 2.1876426461263654e-05, + "loss": 0.1739, + "step": 5360 + }, + { + "epoch": 0.5455653764096312, + "grad_norm": 2.15625, + "learning_rate": 2.179646298296519e-05, + "loss": 0.1938, + "step": 5370 + }, + { + "epoch": 0.5465813268312506, + "grad_norm": 4.1875, + "learning_rate": 2.171653281229511e-05, + "loss": 0.1736, + "step": 5380 + }, + { + "epoch": 0.5475972772528701, + "grad_norm": 4.65625, + "learning_rate": 2.1636636780298732e-05, + "loss": 0.2167, + "step": 5390 + }, + { + "epoch": 0.5486132276744895, + "grad_norm": 1.84375, + "learning_rate": 2.1556775717666427e-05, + "loss": 0.1711, + "step": 5400 + }, + { + "epoch": 0.5496291780961089, + "grad_norm": 5.125, + "learning_rate": 2.147695045472499e-05, + "loss": 0.1789, + "step": 5410 + }, + { + "epoch": 0.5506451285177283, + "grad_norm": 3.859375, + "learning_rate": 2.1397161821428973e-05, + "loss": 0.2187, + "step": 5420 + }, + { + "epoch": 0.5516610789393478, + "grad_norm": 2.25, + "learning_rate": 2.131741064735212e-05, + "loss": 0.1367, + "step": 5430 + }, + { + "epoch": 0.5526770293609672, + "grad_norm": 4.65625, + "learning_rate": 2.1237697761678684e-05, + "loss": 0.1574, + "step": 5440 + }, + { + "epoch": 0.5536929797825866, + "grad_norm": 1.2265625, + "learning_rate": 2.1158023993194848e-05, + "loss": 0.1301, + "step": 5450 + }, + { + "epoch": 0.5547089302042061, + "grad_norm": 4.21875, + "learning_rate": 2.107839017028005e-05, + "loss": 0.2782, + "step": 5460 + }, + { + "epoch": 0.5557248806258255, + "grad_norm": 0.52734375, + "learning_rate": 2.0998797120898457e-05, + "loss": 0.2024, + "step": 5470 + }, + { + "epoch": 0.5567408310474449, + "grad_norm": 1.46875, + "learning_rate": 2.0919245672590277e-05, + "loss": 0.1755, + "step": 5480 + }, + { + "epoch": 0.5577567814690643, + "grad_norm": 2.140625, + "learning_rate": 2.083973665246318e-05, + "loss": 0.2058, + "step": 5490 + }, + { + "epoch": 0.5587727318906838, + "grad_norm": 1.5390625, + "learning_rate": 2.076027088718373e-05, + "loss": 0.2159, + "step": 5500 + }, + { + "epoch": 0.5597886823123032, + "grad_norm": 1.9921875, + "learning_rate": 2.0680849202968743e-05, + "loss": 0.2139, + "step": 5510 + }, + { + "epoch": 0.5608046327339226, + "grad_norm": 2.4375, + "learning_rate": 2.060147242557674e-05, + "loss": 0.183, + "step": 5520 + }, + { + "epoch": 0.5618205831555421, + "grad_norm": 5.5, + "learning_rate": 2.0522141380299308e-05, + "loss": 0.1673, + "step": 5530 + }, + { + "epoch": 0.5628365335771615, + "grad_norm": 4.25, + "learning_rate": 2.044285689195258e-05, + "loss": 0.1674, + "step": 5540 + }, + { + "epoch": 0.5638524839987809, + "grad_norm": 2.109375, + "learning_rate": 2.0363619784868604e-05, + "loss": 0.1531, + "step": 5550 + }, + { + "epoch": 0.5648684344204002, + "grad_norm": 2.59375, + "learning_rate": 2.0284430882886836e-05, + "loss": 0.1665, + "step": 5560 + }, + { + "epoch": 0.5658843848420197, + "grad_norm": 3.984375, + "learning_rate": 2.020529100934549e-05, + "loss": 0.1717, + "step": 5570 + }, + { + "epoch": 0.5669003352636391, + "grad_norm": 1.6015625, + "learning_rate": 2.012620098707306e-05, + "loss": 0.1167, + "step": 5580 + }, + { + "epoch": 0.5679162856852585, + "grad_norm": 6.0625, + "learning_rate": 2.004716163837972e-05, + "loss": 0.2084, + "step": 5590 + }, + { + "epoch": 0.5689322361068779, + "grad_norm": 2.5625, + "learning_rate": 1.996817378504876e-05, + "loss": 0.1939, + "step": 5600 + }, + { + "epoch": 0.5699481865284974, + "grad_norm": 3.109375, + "learning_rate": 1.9889238248328108e-05, + "loss": 0.1241, + "step": 5610 + }, + { + "epoch": 0.5709641369501168, + "grad_norm": 4.875, + "learning_rate": 1.981035584892171e-05, + "loss": 0.1865, + "step": 5620 + }, + { + "epoch": 0.5719800873717362, + "grad_norm": 2.984375, + "learning_rate": 1.9731527406981072e-05, + "loss": 0.1639, + "step": 5630 + }, + { + "epoch": 0.5729960377933557, + "grad_norm": 4.4375, + "learning_rate": 1.9652753742096655e-05, + "loss": 0.2019, + "step": 5640 + }, + { + "epoch": 0.5740119882149751, + "grad_norm": 4.3125, + "learning_rate": 1.9574035673289432e-05, + "loss": 0.1829, + "step": 5650 + }, + { + "epoch": 0.5750279386365945, + "grad_norm": 3.203125, + "learning_rate": 1.9495374019002312e-05, + "loss": 0.2267, + "step": 5660 + }, + { + "epoch": 0.5760438890582139, + "grad_norm": 1.765625, + "learning_rate": 1.9416769597091673e-05, + "loss": 0.1411, + "step": 5670 + }, + { + "epoch": 0.5770598394798334, + "grad_norm": 2.640625, + "learning_rate": 1.9338223224818818e-05, + "loss": 0.1476, + "step": 5680 + }, + { + "epoch": 0.5780757899014528, + "grad_norm": 4.84375, + "learning_rate": 1.9259735718841524e-05, + "loss": 0.1417, + "step": 5690 + }, + { + "epoch": 0.5790917403230722, + "grad_norm": 2.421875, + "learning_rate": 1.918130789520551e-05, + "loss": 0.1592, + "step": 5700 + }, + { + "epoch": 0.5801076907446917, + "grad_norm": 2.984375, + "learning_rate": 1.9102940569335963e-05, + "loss": 0.161, + "step": 5710 + }, + { + "epoch": 0.5811236411663111, + "grad_norm": 1.0234375, + "learning_rate": 1.9024634556029093e-05, + "loss": 0.1614, + "step": 5720 + }, + { + "epoch": 0.5821395915879305, + "grad_norm": 2.90625, + "learning_rate": 1.89463906694436e-05, + "loss": 0.1505, + "step": 5730 + }, + { + "epoch": 0.5831555420095499, + "grad_norm": 2.875, + "learning_rate": 1.8868209723092286e-05, + "loss": 0.1674, + "step": 5740 + }, + { + "epoch": 0.5841714924311694, + "grad_norm": 0.408203125, + "learning_rate": 1.8790092529833508e-05, + "loss": 0.1468, + "step": 5750 + }, + { + "epoch": 0.5851874428527888, + "grad_norm": 5.1875, + "learning_rate": 1.871203990186281e-05, + "loss": 0.1903, + "step": 5760 + }, + { + "epoch": 0.5862033932744082, + "grad_norm": 0.5546875, + "learning_rate": 1.8634052650704415e-05, + "loss": 0.2644, + "step": 5770 + }, + { + "epoch": 0.5872193436960277, + "grad_norm": 3.203125, + "learning_rate": 1.8556131587202848e-05, + "loss": 0.1968, + "step": 5780 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 2.484375, + "learning_rate": 1.8478277521514424e-05, + "loss": 0.2249, + "step": 5790 + }, + { + "epoch": 0.5892512445392665, + "grad_norm": 4.0, + "learning_rate": 1.8400491263098906e-05, + "loss": 0.1881, + "step": 5800 + }, + { + "epoch": 0.5902671949608859, + "grad_norm": 1.90625, + "learning_rate": 1.832277362071106e-05, + "loss": 0.1352, + "step": 5810 + }, + { + "epoch": 0.5912831453825054, + "grad_norm": 2.765625, + "learning_rate": 1.824512540239221e-05, + "loss": 0.2737, + "step": 5820 + }, + { + "epoch": 0.5922990958041248, + "grad_norm": 2.609375, + "learning_rate": 1.81675474154619e-05, + "loss": 0.1566, + "step": 5830 + }, + { + "epoch": 0.5933150462257442, + "grad_norm": 2.6875, + "learning_rate": 1.8090040466509444e-05, + "loss": 0.1999, + "step": 5840 + }, + { + "epoch": 0.5943309966473637, + "grad_norm": 2.609375, + "learning_rate": 1.8012605361385592e-05, + "loss": 0.2372, + "step": 5850 + }, + { + "epoch": 0.5953469470689831, + "grad_norm": 8.125, + "learning_rate": 1.7935242905194087e-05, + "loss": 0.2411, + "step": 5860 + }, + { + "epoch": 0.5963628974906025, + "grad_norm": 3.46875, + "learning_rate": 1.785795390228336e-05, + "loss": 0.138, + "step": 5870 + }, + { + "epoch": 0.5973788479122218, + "grad_norm": 2.3125, + "learning_rate": 1.7780739156238125e-05, + "loss": 0.1867, + "step": 5880 + }, + { + "epoch": 0.5983947983338413, + "grad_norm": 4.0625, + "learning_rate": 1.770359946987105e-05, + "loss": 0.2091, + "step": 5890 + }, + { + "epoch": 0.5994107487554607, + "grad_norm": 5.21875, + "learning_rate": 1.7626535645214378e-05, + "loss": 0.2091, + "step": 5900 + }, + { + "epoch": 0.6004266991770801, + "grad_norm": 3.15625, + "learning_rate": 1.7549548483511614e-05, + "loss": 0.1927, + "step": 5910 + }, + { + "epoch": 0.6014426495986995, + "grad_norm": 4.71875, + "learning_rate": 1.7472638785209198e-05, + "loss": 0.1893, + "step": 5920 + }, + { + "epoch": 0.602458600020319, + "grad_norm": 3.015625, + "learning_rate": 1.7395807349948145e-05, + "loss": 0.1557, + "step": 5930 + }, + { + "epoch": 0.6034745504419384, + "grad_norm": 2.9375, + "learning_rate": 1.73190549765558e-05, + "loss": 0.1717, + "step": 5940 + }, + { + "epoch": 0.6044905008635578, + "grad_norm": 3.109375, + "learning_rate": 1.724238246303745e-05, + "loss": 0.1879, + "step": 5950 + }, + { + "epoch": 0.6055064512851773, + "grad_norm": 3.875, + "learning_rate": 1.71657906065681e-05, + "loss": 0.1908, + "step": 5960 + }, + { + "epoch": 0.6065224017067967, + "grad_norm": 5.09375, + "learning_rate": 1.7089280203484115e-05, + "loss": 0.1712, + "step": 5970 + }, + { + "epoch": 0.6075383521284161, + "grad_norm": 3.015625, + "learning_rate": 1.701285204927502e-05, + "loss": 0.1454, + "step": 5980 + }, + { + "epoch": 0.6085543025500355, + "grad_norm": 3.265625, + "learning_rate": 1.693650693857515e-05, + "loss": 0.2283, + "step": 5990 + }, + { + "epoch": 0.609570252971655, + "grad_norm": 3.40625, + "learning_rate": 1.6860245665155466e-05, + "loss": 0.2188, + "step": 6000 + }, + { + "epoch": 0.6105862033932744, + "grad_norm": 2.5625, + "learning_rate": 1.678406902191521e-05, + "loss": 0.1605, + "step": 6010 + }, + { + "epoch": 0.6116021538148938, + "grad_norm": 0.6796875, + "learning_rate": 1.670797780087374e-05, + "loss": 0.1472, + "step": 6020 + }, + { + "epoch": 0.6126181042365133, + "grad_norm": 2.234375, + "learning_rate": 1.6631972793162288e-05, + "loss": 0.1676, + "step": 6030 + }, + { + "epoch": 0.6136340546581327, + "grad_norm": 1.25, + "learning_rate": 1.6556054789015662e-05, + "loss": 0.1508, + "step": 6040 + }, + { + "epoch": 0.6146500050797521, + "grad_norm": 4.78125, + "learning_rate": 1.6480224577764132e-05, + "loss": 0.1981, + "step": 6050 + }, + { + "epoch": 0.6156659555013715, + "grad_norm": 3.46875, + "learning_rate": 1.6404482947825137e-05, + "loss": 0.2514, + "step": 6060 + }, + { + "epoch": 0.616681905922991, + "grad_norm": 1.265625, + "learning_rate": 1.6328830686695154e-05, + "loss": 0.2397, + "step": 6070 + }, + { + "epoch": 0.6176978563446104, + "grad_norm": 1.953125, + "learning_rate": 1.625326858094144e-05, + "loss": 0.1523, + "step": 6080 + }, + { + "epoch": 0.6187138067662298, + "grad_norm": 3.484375, + "learning_rate": 1.6177797416193953e-05, + "loss": 0.218, + "step": 6090 + }, + { + "epoch": 0.6197297571878493, + "grad_norm": 3.484375, + "learning_rate": 1.6102417977137052e-05, + "loss": 0.1476, + "step": 6100 + }, + { + "epoch": 0.6207457076094687, + "grad_norm": 4.90625, + "learning_rate": 1.602713104750147e-05, + "loss": 0.1818, + "step": 6110 + }, + { + "epoch": 0.6217616580310881, + "grad_norm": 4.375, + "learning_rate": 1.5951937410056087e-05, + "loss": 0.2061, + "step": 6120 + }, + { + "epoch": 0.6227776084527075, + "grad_norm": 6.3125, + "learning_rate": 1.587683784659979e-05, + "loss": 0.1566, + "step": 6130 + }, + { + "epoch": 0.623793558874327, + "grad_norm": 2.828125, + "learning_rate": 1.58018331379534e-05, + "loss": 0.1376, + "step": 6140 + }, + { + "epoch": 0.6248095092959464, + "grad_norm": 2.40625, + "learning_rate": 1.572692406395149e-05, + "loss": 0.1655, + "step": 6150 + }, + { + "epoch": 0.6258254597175658, + "grad_norm": 4.34375, + "learning_rate": 1.5652111403434338e-05, + "loss": 0.2363, + "step": 6160 + }, + { + "epoch": 0.6268414101391853, + "grad_norm": 2.453125, + "learning_rate": 1.5577395934239757e-05, + "loss": 0.2464, + "step": 6170 + }, + { + "epoch": 0.6278573605608047, + "grad_norm": 2.53125, + "learning_rate": 1.5502778433195085e-05, + "loss": 0.1898, + "step": 6180 + }, + { + "epoch": 0.628873310982424, + "grad_norm": 2.28125, + "learning_rate": 1.5428259676109048e-05, + "loss": 0.1804, + "step": 6190 + }, + { + "epoch": 0.6298892614040434, + "grad_norm": 4.3125, + "learning_rate": 1.5353840437763732e-05, + "loss": 0.1409, + "step": 6200 + }, + { + "epoch": 0.630905211825663, + "grad_norm": 2.5625, + "learning_rate": 1.5279521491906496e-05, + "loss": 0.2449, + "step": 6210 + }, + { + "epoch": 0.6319211622472823, + "grad_norm": 3.0625, + "learning_rate": 1.520530361124195e-05, + "loss": 0.2103, + "step": 6220 + }, + { + "epoch": 0.6329371126689017, + "grad_norm": 2.609375, + "learning_rate": 1.5131187567423937e-05, + "loss": 0.2156, + "step": 6230 + }, + { + "epoch": 0.6339530630905211, + "grad_norm": 2.703125, + "learning_rate": 1.5057174131047446e-05, + "loss": 0.161, + "step": 6240 + }, + { + "epoch": 0.6349690135121406, + "grad_norm": 3.265625, + "learning_rate": 1.4983264071640679e-05, + "loss": 0.1757, + "step": 6250 + }, + { + "epoch": 0.63598496393376, + "grad_norm": 3.15625, + "learning_rate": 1.490945815765699e-05, + "loss": 0.2011, + "step": 6260 + }, + { + "epoch": 0.6370009143553794, + "grad_norm": 5.375, + "learning_rate": 1.4835757156466945e-05, + "loss": 0.1658, + "step": 6270 + }, + { + "epoch": 0.6380168647769989, + "grad_norm": 2.984375, + "learning_rate": 1.4762161834350271e-05, + "loss": 0.1754, + "step": 6280 + }, + { + "epoch": 0.6390328151986183, + "grad_norm": 2.015625, + "learning_rate": 1.4688672956487987e-05, + "loss": 0.1427, + "step": 6290 + }, + { + "epoch": 0.6400487656202377, + "grad_norm": 3.78125, + "learning_rate": 1.4615291286954352e-05, + "loss": 0.1517, + "step": 6300 + }, + { + "epoch": 0.6410647160418571, + "grad_norm": 2.859375, + "learning_rate": 1.4542017588709005e-05, + "loss": 0.2348, + "step": 6310 + }, + { + "epoch": 0.6420806664634766, + "grad_norm": 2.421875, + "learning_rate": 1.4468852623588961e-05, + "loss": 0.2089, + "step": 6320 + }, + { + "epoch": 0.643096616885096, + "grad_norm": 2.15625, + "learning_rate": 1.4395797152300719e-05, + "loss": 0.1702, + "step": 6330 + }, + { + "epoch": 0.6441125673067154, + "grad_norm": 1.53125, + "learning_rate": 1.4322851934412382e-05, + "loss": 0.1017, + "step": 6340 + }, + { + "epoch": 0.6451285177283349, + "grad_norm": 1.90625, + "learning_rate": 1.4250017728345716e-05, + "loss": 0.1813, + "step": 6350 + }, + { + "epoch": 0.6461444681499543, + "grad_norm": 2.015625, + "learning_rate": 1.4177295291368292e-05, + "loss": 0.1095, + "step": 6360 + }, + { + "epoch": 0.6471604185715737, + "grad_norm": 2.625, + "learning_rate": 1.410468537958558e-05, + "loss": 0.2259, + "step": 6370 + }, + { + "epoch": 0.6481763689931931, + "grad_norm": 3.5, + "learning_rate": 1.4032188747933136e-05, + "loss": 0.1595, + "step": 6380 + }, + { + "epoch": 0.6491923194148126, + "grad_norm": 5.21875, + "learning_rate": 1.39598061501687e-05, + "loss": 0.2226, + "step": 6390 + }, + { + "epoch": 0.650208269836432, + "grad_norm": 5.34375, + "learning_rate": 1.388753833886442e-05, + "loss": 0.2132, + "step": 6400 + }, + { + "epoch": 0.6512242202580514, + "grad_norm": 3.640625, + "learning_rate": 1.3815386065398945e-05, + "loss": 0.1227, + "step": 6410 + }, + { + "epoch": 0.6522401706796709, + "grad_norm": 1.0, + "learning_rate": 1.3743350079949705e-05, + "loss": 0.1755, + "step": 6420 + }, + { + "epoch": 0.6532561211012903, + "grad_norm": 2.359375, + "learning_rate": 1.3671431131485057e-05, + "loss": 0.1552, + "step": 6430 + }, + { + "epoch": 0.6542720715229097, + "grad_norm": 5.3125, + "learning_rate": 1.3599629967756483e-05, + "loss": 0.1917, + "step": 6440 + }, + { + "epoch": 0.6552880219445291, + "grad_norm": 4.625, + "learning_rate": 1.3527947335290877e-05, + "loss": 0.1812, + "step": 6450 + }, + { + "epoch": 0.6563039723661486, + "grad_norm": 1.234375, + "learning_rate": 1.3456383979382708e-05, + "loss": 0.1896, + "step": 6460 + }, + { + "epoch": 0.657319922787768, + "grad_norm": 3.984375, + "learning_rate": 1.3384940644086352e-05, + "loss": 0.1484, + "step": 6470 + }, + { + "epoch": 0.6583358732093874, + "grad_norm": 2.40625, + "learning_rate": 1.3313618072208268e-05, + "loss": 0.1334, + "step": 6480 + }, + { + "epoch": 0.6593518236310069, + "grad_norm": 4.375, + "learning_rate": 1.3242417005299357e-05, + "loss": 0.1351, + "step": 6490 + }, + { + "epoch": 0.6603677740526263, + "grad_norm": 2.640625, + "learning_rate": 1.31713381836472e-05, + "loss": 0.1717, + "step": 6500 + }, + { + "epoch": 0.6613837244742456, + "grad_norm": 2.640625, + "learning_rate": 1.3100382346268392e-05, + "loss": 0.1867, + "step": 6510 + }, + { + "epoch": 0.662399674895865, + "grad_norm": 1.734375, + "learning_rate": 1.3029550230900812e-05, + "loss": 0.1997, + "step": 6520 + }, + { + "epoch": 0.6634156253174845, + "grad_norm": 3.609375, + "learning_rate": 1.2958842573996016e-05, + "loss": 0.1969, + "step": 6530 + }, + { + "epoch": 0.6644315757391039, + "grad_norm": 3.578125, + "learning_rate": 1.2888260110711525e-05, + "loss": 0.1469, + "step": 6540 + }, + { + "epoch": 0.6654475261607233, + "grad_norm": 1.3515625, + "learning_rate": 1.2817803574903212e-05, + "loss": 0.1524, + "step": 6550 + }, + { + "epoch": 0.6664634765823427, + "grad_norm": 2.109375, + "learning_rate": 1.2747473699117668e-05, + "loss": 0.159, + "step": 6560 + }, + { + "epoch": 0.6674794270039622, + "grad_norm": 1.53125, + "learning_rate": 1.267727121458458e-05, + "loss": 0.1999, + "step": 6570 + }, + { + "epoch": 0.6684953774255816, + "grad_norm": 1.7265625, + "learning_rate": 1.2607196851209137e-05, + "loss": 0.2216, + "step": 6580 + }, + { + "epoch": 0.669511327847201, + "grad_norm": 3.125, + "learning_rate": 1.2537251337564412e-05, + "loss": 0.1607, + "step": 6590 + }, + { + "epoch": 0.6705272782688205, + "grad_norm": 2.421875, + "learning_rate": 1.2467435400883839e-05, + "loss": 0.2187, + "step": 6600 + }, + { + "epoch": 0.6715432286904399, + "grad_norm": 1.5078125, + "learning_rate": 1.239774976705359e-05, + "loss": 0.1753, + "step": 6610 + }, + { + "epoch": 0.6725591791120593, + "grad_norm": 1.140625, + "learning_rate": 1.2328195160605092e-05, + "loss": 0.194, + "step": 6620 + }, + { + "epoch": 0.6735751295336787, + "grad_norm": 4.9375, + "learning_rate": 1.225877230470743e-05, + "loss": 0.1485, + "step": 6630 + }, + { + "epoch": 0.6745910799552982, + "grad_norm": 3.65625, + "learning_rate": 1.218948192115988e-05, + "loss": 0.1847, + "step": 6640 + }, + { + "epoch": 0.6756070303769176, + "grad_norm": 3.875, + "learning_rate": 1.21203247303844e-05, + "loss": 0.1874, + "step": 6650 + }, + { + "epoch": 0.676622980798537, + "grad_norm": 2.65625, + "learning_rate": 1.2051301451418073e-05, + "loss": 0.2377, + "step": 6660 + }, + { + "epoch": 0.6776389312201565, + "grad_norm": 2.09375, + "learning_rate": 1.198241280190574e-05, + "loss": 0.1508, + "step": 6670 + }, + { + "epoch": 0.6786548816417759, + "grad_norm": 2.203125, + "learning_rate": 1.1913659498092431e-05, + "loss": 0.1537, + "step": 6680 + }, + { + "epoch": 0.6796708320633953, + "grad_norm": 2.484375, + "learning_rate": 1.184504225481601e-05, + "loss": 0.2339, + "step": 6690 + }, + { + "epoch": 0.6806867824850147, + "grad_norm": 5.625, + "learning_rate": 1.177656178549966e-05, + "loss": 0.2102, + "step": 6700 + }, + { + "epoch": 0.6817027329066342, + "grad_norm": 2.5, + "learning_rate": 1.1708218802144536e-05, + "loss": 0.1435, + "step": 6710 + }, + { + "epoch": 0.6827186833282536, + "grad_norm": 3.84375, + "learning_rate": 1.1640014015322323e-05, + "loss": 0.1823, + "step": 6720 + }, + { + "epoch": 0.683734633749873, + "grad_norm": 2.359375, + "learning_rate": 1.1571948134167862e-05, + "loss": 0.1154, + "step": 6730 + }, + { + "epoch": 0.6847505841714925, + "grad_norm": 2.90625, + "learning_rate": 1.1504021866371761e-05, + "loss": 0.2105, + "step": 6740 + }, + { + "epoch": 0.6857665345931119, + "grad_norm": 5.46875, + "learning_rate": 1.143623591817304e-05, + "loss": 0.1317, + "step": 6750 + }, + { + "epoch": 0.6867824850147313, + "grad_norm": 3.34375, + "learning_rate": 1.1368590994351835e-05, + "loss": 0.1406, + "step": 6760 + }, + { + "epoch": 0.6877984354363507, + "grad_norm": 3.78125, + "learning_rate": 1.130108779822198e-05, + "loss": 0.1425, + "step": 6770 + }, + { + "epoch": 0.6888143858579702, + "grad_norm": 0.77734375, + "learning_rate": 1.1233727031623783e-05, + "loss": 0.1623, + "step": 6780 + }, + { + "epoch": 0.6898303362795896, + "grad_norm": 4.625, + "learning_rate": 1.1166509394916682e-05, + "loss": 0.1591, + "step": 6790 + }, + { + "epoch": 0.690846286701209, + "grad_norm": 3.84375, + "learning_rate": 1.1099435586971982e-05, + "loss": 0.1758, + "step": 6800 + }, + { + "epoch": 0.6918622371228285, + "grad_norm": 2.4375, + "learning_rate": 1.1032506305165555e-05, + "loss": 0.1018, + "step": 6810 + }, + { + "epoch": 0.6928781875444479, + "grad_norm": 3.203125, + "learning_rate": 1.0965722245370641e-05, + "loss": 0.1485, + "step": 6820 + }, + { + "epoch": 0.6938941379660672, + "grad_norm": 0.7109375, + "learning_rate": 1.0899084101950561e-05, + "loss": 0.1762, + "step": 6830 + }, + { + "epoch": 0.6949100883876866, + "grad_norm": 1.9765625, + "learning_rate": 1.0832592567751555e-05, + "loss": 0.1402, + "step": 6840 + }, + { + "epoch": 0.6959260388093061, + "grad_norm": 1.4609375, + "learning_rate": 1.0766248334095505e-05, + "loss": 0.2278, + "step": 6850 + }, + { + "epoch": 0.6969419892309255, + "grad_norm": 3.953125, + "learning_rate": 1.0700052090772828e-05, + "loss": 0.1969, + "step": 6860 + }, + { + "epoch": 0.6979579396525449, + "grad_norm": 2.453125, + "learning_rate": 1.0634004526035249e-05, + "loss": 0.2073, + "step": 6870 + }, + { + "epoch": 0.6989738900741643, + "grad_norm": 1.6171875, + "learning_rate": 1.0568106326588645e-05, + "loss": 0.1902, + "step": 6880 + }, + { + "epoch": 0.6999898404957838, + "grad_norm": 1.2734375, + "learning_rate": 1.0502358177585953e-05, + "loss": 0.2165, + "step": 6890 + }, + { + "epoch": 0.7010057909174032, + "grad_norm": 1.671875, + "learning_rate": 1.0436760762619977e-05, + "loss": 0.1952, + "step": 6900 + }, + { + "epoch": 0.7020217413390226, + "grad_norm": 2.8125, + "learning_rate": 1.0371314763716347e-05, + "loss": 0.1422, + "step": 6910 + }, + { + "epoch": 0.7030376917606421, + "grad_norm": 2.53125, + "learning_rate": 1.0306020861326388e-05, + "loss": 0.0961, + "step": 6920 + }, + { + "epoch": 0.7040536421822615, + "grad_norm": 3.046875, + "learning_rate": 1.0240879734320068e-05, + "loss": 0.1542, + "step": 6930 + }, + { + "epoch": 0.7050695926038809, + "grad_norm": 2.859375, + "learning_rate": 1.0175892059978901e-05, + "loss": 0.1748, + "step": 6940 + }, + { + "epoch": 0.7060855430255003, + "grad_norm": 2.671875, + "learning_rate": 1.0111058513988958e-05, + "loss": 0.0819, + "step": 6950 + }, + { + "epoch": 0.7071014934471198, + "grad_norm": 3.5625, + "learning_rate": 1.0046379770433803e-05, + "loss": 0.1933, + "step": 6960 + }, + { + "epoch": 0.7081174438687392, + "grad_norm": 2.859375, + "learning_rate": 9.98185650178749e-06, + "loss": 0.1891, + "step": 6970 + }, + { + "epoch": 0.7091333942903586, + "grad_norm": 3.15625, + "learning_rate": 9.917489378907591e-06, + "loss": 0.2102, + "step": 6980 + }, + { + "epoch": 0.7101493447119781, + "grad_norm": 6.40625, + "learning_rate": 9.853279071028212e-06, + "loss": 0.1714, + "step": 6990 + }, + { + "epoch": 0.7111652951335975, + "grad_norm": 2.375, + "learning_rate": 9.78922624575303e-06, + "loss": 0.1299, + "step": 7000 + }, + { + "epoch": 0.7121812455552169, + "grad_norm": 2.078125, + "learning_rate": 9.72533156904833e-06, + "loss": 0.1914, + "step": 7010 + }, + { + "epoch": 0.7131971959768363, + "grad_norm": 3.859375, + "learning_rate": 9.661595705236137e-06, + "loss": 0.2377, + "step": 7020 + }, + { + "epoch": 0.7142131463984558, + "grad_norm": 1.171875, + "learning_rate": 9.598019316987244e-06, + "loss": 0.1851, + "step": 7030 + }, + { + "epoch": 0.7152290968200752, + "grad_norm": 1.078125, + "learning_rate": 9.53460306531439e-06, + "loss": 0.2661, + "step": 7040 + }, + { + "epoch": 0.7162450472416946, + "grad_norm": 1.6484375, + "learning_rate": 9.471347609565311e-06, + "loss": 0.1669, + "step": 7050 + }, + { + "epoch": 0.7172609976633141, + "grad_norm": 4.59375, + "learning_rate": 9.408253607415957e-06, + "loss": 0.2487, + "step": 7060 + }, + { + "epoch": 0.7182769480849335, + "grad_norm": 3.09375, + "learning_rate": 9.345321714863614e-06, + "loss": 0.186, + "step": 7070 + }, + { + "epoch": 0.7192928985065529, + "grad_norm": 6.0625, + "learning_rate": 9.282552586220075e-06, + "loss": 0.2249, + "step": 7080 + }, + { + "epoch": 0.7203088489281723, + "grad_norm": 1.5703125, + "learning_rate": 9.219946874104885e-06, + "loss": 0.1255, + "step": 7090 + }, + { + "epoch": 0.7213247993497918, + "grad_norm": 1.9453125, + "learning_rate": 9.157505229438481e-06, + "loss": 0.1999, + "step": 7100 + }, + { + "epoch": 0.7223407497714112, + "grad_norm": 5.1875, + "learning_rate": 9.095228301435518e-06, + "loss": 0.199, + "step": 7110 + }, + { + "epoch": 0.7233567001930306, + "grad_norm": 2.078125, + "learning_rate": 9.03311673759802e-06, + "loss": 0.2182, + "step": 7120 + }, + { + "epoch": 0.7243726506146501, + "grad_norm": 6.46875, + "learning_rate": 8.971171183708733e-06, + "loss": 0.1573, + "step": 7130 + }, + { + "epoch": 0.7253886010362695, + "grad_norm": 3.015625, + "learning_rate": 8.909392283824353e-06, + "loss": 0.2044, + "step": 7140 + }, + { + "epoch": 0.7264045514578888, + "grad_norm": 2.921875, + "learning_rate": 8.847780680268872e-06, + "loss": 0.11, + "step": 7150 + }, + { + "epoch": 0.7274205018795082, + "grad_norm": 2.96875, + "learning_rate": 8.786337013626853e-06, + "loss": 0.1897, + "step": 7160 + }, + { + "epoch": 0.7284364523011277, + "grad_norm": 1.7578125, + "learning_rate": 8.725061922736799e-06, + "loss": 0.153, + "step": 7170 + }, + { + "epoch": 0.7294524027227471, + "grad_norm": 1.609375, + "learning_rate": 8.663956044684532e-06, + "loss": 0.1746, + "step": 7180 + }, + { + "epoch": 0.7304683531443665, + "grad_norm": 1.9375, + "learning_rate": 8.603020014796507e-06, + "loss": 0.2284, + "step": 7190 + }, + { + "epoch": 0.7314843035659859, + "grad_norm": 1.515625, + "learning_rate": 8.542254466633273e-06, + "loss": 0.1186, + "step": 7200 + }, + { + "epoch": 0.7325002539876054, + "grad_norm": 1.671875, + "learning_rate": 8.481660031982844e-06, + "loss": 0.1971, + "step": 7210 + }, + { + "epoch": 0.7335162044092248, + "grad_norm": 1.453125, + "learning_rate": 8.421237340854157e-06, + "loss": 0.196, + "step": 7220 + }, + { + "epoch": 0.7345321548308442, + "grad_norm": 0.65234375, + "learning_rate": 8.360987021470479e-06, + "loss": 0.1724, + "step": 7230 + }, + { + "epoch": 0.7355481052524637, + "grad_norm": 2.84375, + "learning_rate": 8.300909700262929e-06, + "loss": 0.175, + "step": 7240 + }, + { + "epoch": 0.7365640556740831, + "grad_norm": 3.109375, + "learning_rate": 8.241006001863924e-06, + "loss": 0.2276, + "step": 7250 + }, + { + "epoch": 0.7375800060957025, + "grad_norm": 4.8125, + "learning_rate": 8.181276549100714e-06, + "loss": 0.2029, + "step": 7260 + }, + { + "epoch": 0.7385959565173219, + "grad_norm": 4.03125, + "learning_rate": 8.12172196298887e-06, + "loss": 0.175, + "step": 7270 + }, + { + "epoch": 0.7396119069389414, + "grad_norm": 3.046875, + "learning_rate": 8.062342862725878e-06, + "loss": 0.1662, + "step": 7280 + }, + { + "epoch": 0.7406278573605608, + "grad_norm": 3.375, + "learning_rate": 8.003139865684662e-06, + "loss": 0.1616, + "step": 7290 + }, + { + "epoch": 0.7416438077821802, + "grad_norm": 2.5625, + "learning_rate": 7.944113587407157e-06, + "loss": 0.2448, + "step": 7300 + }, + { + "epoch": 0.7426597582037997, + "grad_norm": 4.125, + "learning_rate": 7.885264641597961e-06, + "loss": 0.1618, + "step": 7310 + }, + { + "epoch": 0.7436757086254191, + "grad_norm": 3.5, + "learning_rate": 7.826593640117889e-06, + "loss": 0.1134, + "step": 7320 + }, + { + "epoch": 0.7446916590470385, + "grad_norm": 2.6875, + "learning_rate": 7.76810119297767e-06, + "loss": 0.1795, + "step": 7330 + }, + { + "epoch": 0.7457076094686579, + "grad_norm": 4.34375, + "learning_rate": 7.709787908331556e-06, + "loss": 0.2736, + "step": 7340 + }, + { + "epoch": 0.7467235598902774, + "grad_norm": 1.21875, + "learning_rate": 7.651654392471038e-06, + "loss": 0.139, + "step": 7350 + }, + { + "epoch": 0.7477395103118968, + "grad_norm": 3.578125, + "learning_rate": 7.593701249818521e-06, + "loss": 0.2023, + "step": 7360 + }, + { + "epoch": 0.7487554607335162, + "grad_norm": 2.15625, + "learning_rate": 7.535929082921048e-06, + "loss": 0.1702, + "step": 7370 + }, + { + "epoch": 0.7497714111551357, + "grad_norm": 1.96875, + "learning_rate": 7.47833849244402e-06, + "loss": 0.1835, + "step": 7380 + }, + { + "epoch": 0.7507873615767551, + "grad_norm": 2.796875, + "learning_rate": 7.420930077164959e-06, + "loss": 0.1713, + "step": 7390 + }, + { + "epoch": 0.7518033119983745, + "grad_norm": 4.46875, + "learning_rate": 7.363704433967311e-06, + "loss": 0.1906, + "step": 7400 + }, + { + "epoch": 0.7528192624199939, + "grad_norm": 1.75, + "learning_rate": 7.306662157834185e-06, + "loss": 0.1421, + "step": 7410 + }, + { + "epoch": 0.7538352128416134, + "grad_norm": 1.140625, + "learning_rate": 7.2498038418422145e-06, + "loss": 0.1793, + "step": 7420 + }, + { + "epoch": 0.7548511632632328, + "grad_norm": 2.578125, + "learning_rate": 7.193130077155374e-06, + "loss": 0.1603, + "step": 7430 + }, + { + "epoch": 0.7558671136848522, + "grad_norm": 4.3125, + "learning_rate": 7.13664145301883e-06, + "loss": 0.2169, + "step": 7440 + }, + { + "epoch": 0.7568830641064717, + "grad_norm": 3.078125, + "learning_rate": 7.0803385567528025e-06, + "loss": 0.1685, + "step": 7450 + }, + { + "epoch": 0.757899014528091, + "grad_norm": 3.5625, + "learning_rate": 7.024221973746495e-06, + "loss": 0.2282, + "step": 7460 + }, + { + "epoch": 0.7589149649497104, + "grad_norm": 2.265625, + "learning_rate": 6.968292287451961e-06, + "loss": 0.1786, + "step": 7470 + }, + { + "epoch": 0.7599309153713298, + "grad_norm": 4.71875, + "learning_rate": 6.912550079378091e-06, + "loss": 0.1811, + "step": 7480 + }, + { + "epoch": 0.7609468657929493, + "grad_norm": 2.328125, + "learning_rate": 6.856995929084506e-06, + "loss": 0.1747, + "step": 7490 + }, + { + "epoch": 0.7619628162145687, + "grad_norm": 5.21875, + "learning_rate": 6.801630414175589e-06, + "loss": 0.2028, + "step": 7500 + }, + { + "epoch": 0.7629787666361881, + "grad_norm": 3.78125, + "learning_rate": 6.746454110294451e-06, + "loss": 0.2255, + "step": 7510 + }, + { + "epoch": 0.7639947170578075, + "grad_norm": 1.625, + "learning_rate": 6.691467591116931e-06, + "loss": 0.1604, + "step": 7520 + }, + { + "epoch": 0.765010667479427, + "grad_norm": 1.7734375, + "learning_rate": 6.6366714283456755e-06, + "loss": 0.2559, + "step": 7530 + }, + { + "epoch": 0.7660266179010464, + "grad_norm": 4.59375, + "learning_rate": 6.582066191704142e-06, + "loss": 0.2034, + "step": 7540 + }, + { + "epoch": 0.7670425683226658, + "grad_norm": 1.578125, + "learning_rate": 6.527652448930724e-06, + "loss": 0.148, + "step": 7550 + }, + { + "epoch": 0.7680585187442853, + "grad_norm": 1.7109375, + "learning_rate": 6.4734307657728e-06, + "loss": 0.1811, + "step": 7560 + }, + { + "epoch": 0.7690744691659047, + "grad_norm": 1.2734375, + "learning_rate": 6.419401705980924e-06, + "loss": 0.1407, + "step": 7570 + }, + { + "epoch": 0.7700904195875241, + "grad_norm": 2.25, + "learning_rate": 6.365565831302869e-06, + "loss": 0.1893, + "step": 7580 + }, + { + "epoch": 0.7711063700091435, + "grad_norm": 1.625, + "learning_rate": 6.311923701477854e-06, + "loss": 0.1835, + "step": 7590 + }, + { + "epoch": 0.772122320430763, + "grad_norm": 2.375, + "learning_rate": 6.258475874230713e-06, + "loss": 0.1579, + "step": 7600 + }, + { + "epoch": 0.7731382708523824, + "grad_norm": 4.5, + "learning_rate": 6.205222905266067e-06, + "loss": 0.1794, + "step": 7610 + }, + { + "epoch": 0.7741542212740018, + "grad_norm": 4.25, + "learning_rate": 6.152165348262598e-06, + "loss": 0.1477, + "step": 7620 + }, + { + "epoch": 0.7751701716956213, + "grad_norm": 1.9765625, + "learning_rate": 6.0993037548672246e-06, + "loss": 0.2396, + "step": 7630 + }, + { + "epoch": 0.7761861221172407, + "grad_norm": 2.671875, + "learning_rate": 6.046638674689454e-06, + "loss": 0.1717, + "step": 7640 + }, + { + "epoch": 0.7772020725388601, + "grad_norm": 3.671875, + "learning_rate": 5.994170655295567e-06, + "loss": 0.2646, + "step": 7650 + }, + { + "epoch": 0.7782180229604795, + "grad_norm": 1.3046875, + "learning_rate": 5.9419002422030106e-06, + "loss": 0.1553, + "step": 7660 + }, + { + "epoch": 0.779233973382099, + "grad_norm": 3.734375, + "learning_rate": 5.889827978874665e-06, + "loss": 0.1854, + "step": 7670 + }, + { + "epoch": 0.7802499238037184, + "grad_norm": 2.140625, + "learning_rate": 5.837954406713245e-06, + "loss": 0.1857, + "step": 7680 + }, + { + "epoch": 0.7812658742253378, + "grad_norm": 3.34375, + "learning_rate": 5.786280065055619e-06, + "loss": 0.1797, + "step": 7690 + }, + { + "epoch": 0.7822818246469573, + "grad_norm": 0.97265625, + "learning_rate": 5.734805491167244e-06, + "loss": 0.1488, + "step": 7700 + }, + { + "epoch": 0.7832977750685767, + "grad_norm": 2.078125, + "learning_rate": 5.683531220236576e-06, + "loss": 0.1688, + "step": 7710 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 3.046875, + "learning_rate": 5.632457785369455e-06, + "loss": 0.1503, + "step": 7720 + }, + { + "epoch": 0.7853296759118155, + "grad_norm": 1.6875, + "learning_rate": 5.581585717583637e-06, + "loss": 0.1658, + "step": 7730 + }, + { + "epoch": 0.786345626333435, + "grad_norm": 3.421875, + "learning_rate": 5.530915545803209e-06, + "loss": 0.2112, + "step": 7740 + }, + { + "epoch": 0.7873615767550544, + "grad_norm": 4.1875, + "learning_rate": 5.480447796853141e-06, + "loss": 0.165, + "step": 7750 + }, + { + "epoch": 0.7883775271766738, + "grad_norm": 5.3125, + "learning_rate": 5.430182995453756e-06, + "loss": 0.1499, + "step": 7760 + }, + { + "epoch": 0.7893934775982933, + "grad_norm": 2.1875, + "learning_rate": 5.380121664215329e-06, + "loss": 0.1559, + "step": 7770 + }, + { + "epoch": 0.7904094280199127, + "grad_norm": 1.46875, + "learning_rate": 5.330264323632611e-06, + "loss": 0.2098, + "step": 7780 + }, + { + "epoch": 0.791425378441532, + "grad_norm": 4.65625, + "learning_rate": 5.280611492079449e-06, + "loss": 0.1776, + "step": 7790 + }, + { + "epoch": 0.7924413288631514, + "grad_norm": 1.3359375, + "learning_rate": 5.231163685803361e-06, + "loss": 0.1497, + "step": 7800 + }, + { + "epoch": 0.7934572792847709, + "grad_norm": 2.640625, + "learning_rate": 5.181921418920191e-06, + "loss": 0.12, + "step": 7810 + }, + { + "epoch": 0.7944732297063903, + "grad_norm": 2.328125, + "learning_rate": 5.13288520340878e-06, + "loss": 0.1981, + "step": 7820 + }, + { + "epoch": 0.7954891801280097, + "grad_norm": 3.0625, + "learning_rate": 5.084055549105596e-06, + "loss": 0.1389, + "step": 7830 + }, + { + "epoch": 0.7965051305496291, + "grad_norm": 2.796875, + "learning_rate": 5.035432963699479e-06, + "loss": 0.2293, + "step": 7840 + }, + { + "epoch": 0.7975210809712486, + "grad_norm": 5.0625, + "learning_rate": 4.98701795272635e-06, + "loss": 0.1618, + "step": 7850 + }, + { + "epoch": 0.798537031392868, + "grad_norm": 5.09375, + "learning_rate": 4.938811019563938e-06, + "loss": 0.1755, + "step": 7860 + }, + { + "epoch": 0.7995529818144874, + "grad_norm": 2.140625, + "learning_rate": 4.8908126654265475e-06, + "loss": 0.1565, + "step": 7870 + }, + { + "epoch": 0.8005689322361069, + "grad_norm": 0.76171875, + "learning_rate": 4.843023389359885e-06, + "loss": 0.2176, + "step": 7880 + }, + { + "epoch": 0.8015848826577263, + "grad_norm": 2.625, + "learning_rate": 4.79544368823581e-06, + "loss": 0.2013, + "step": 7890 + }, + { + "epoch": 0.8026008330793457, + "grad_norm": 2.078125, + "learning_rate": 4.748074056747234e-06, + "loss": 0.1246, + "step": 7900 + }, + { + "epoch": 0.8036167835009651, + "grad_norm": 3.5, + "learning_rate": 4.700914987402919e-06, + "loss": 0.1638, + "step": 7910 + }, + { + "epoch": 0.8046327339225846, + "grad_norm": 3.4375, + "learning_rate": 4.6539669705223916e-06, + "loss": 0.2213, + "step": 7920 + }, + { + "epoch": 0.805648684344204, + "grad_norm": 2.96875, + "learning_rate": 4.607230494230849e-06, + "loss": 0.1822, + "step": 7930 + }, + { + "epoch": 0.8066646347658234, + "grad_norm": 2.359375, + "learning_rate": 4.560706044454047e-06, + "loss": 0.1763, + "step": 7940 + }, + { + "epoch": 0.8076805851874429, + "grad_norm": 4.59375, + "learning_rate": 4.514394104913291e-06, + "loss": 0.234, + "step": 7950 + }, + { + "epoch": 0.8086965356090623, + "grad_norm": 1.96875, + "learning_rate": 4.468295157120372e-06, + "loss": 0.1939, + "step": 7960 + }, + { + "epoch": 0.8097124860306817, + "grad_norm": 2.578125, + "learning_rate": 4.422409680372594e-06, + "loss": 0.174, + "step": 7970 + }, + { + "epoch": 0.8107284364523011, + "grad_norm": 4.5625, + "learning_rate": 4.3767381517477505e-06, + "loss": 0.2375, + "step": 7980 + }, + { + "epoch": 0.8117443868739206, + "grad_norm": 0.9609375, + "learning_rate": 4.331281046099203e-06, + "loss": 0.2076, + "step": 7990 + }, + { + "epoch": 0.81276033729554, + "grad_norm": 6.0625, + "learning_rate": 4.286038836050929e-06, + "loss": 0.2504, + "step": 8000 + }, + { + "epoch": 0.8137762877171594, + "grad_norm": 3.484375, + "learning_rate": 4.241011991992586e-06, + "loss": 0.2102, + "step": 8010 + }, + { + "epoch": 0.8147922381387789, + "grad_norm": 1.9765625, + "learning_rate": 4.1962009820746635e-06, + "loss": 0.1846, + "step": 8020 + }, + { + "epoch": 0.8158081885603983, + "grad_norm": 1.875, + "learning_rate": 4.15160627220357e-06, + "loss": 0.1741, + "step": 8030 + }, + { + "epoch": 0.8168241389820177, + "grad_norm": 5.5625, + "learning_rate": 4.107228326036838e-06, + "loss": 0.2078, + "step": 8040 + }, + { + "epoch": 0.8178400894036371, + "grad_norm": 1.7578125, + "learning_rate": 4.063067604978252e-06, + "loss": 0.212, + "step": 8050 + }, + { + "epoch": 0.8188560398252566, + "grad_norm": 4.09375, + "learning_rate": 4.019124568173094e-06, + "loss": 0.1831, + "step": 8060 + }, + { + "epoch": 0.819871990246876, + "grad_norm": 6.625, + "learning_rate": 3.975399672503341e-06, + "loss": 0.2196, + "step": 8070 + }, + { + "epoch": 0.8208879406684954, + "grad_norm": 2.78125, + "learning_rate": 3.931893372582943e-06, + "loss": 0.2002, + "step": 8080 + }, + { + "epoch": 0.8219038910901149, + "grad_norm": 6.90625, + "learning_rate": 3.888606120753047e-06, + "loss": 0.2138, + "step": 8090 + }, + { + "epoch": 0.8229198415117343, + "grad_norm": 4.09375, + "learning_rate": 3.845538367077362e-06, + "loss": 0.2593, + "step": 8100 + }, + { + "epoch": 0.8239357919333536, + "grad_norm": 1.859375, + "learning_rate": 3.8026905593374213e-06, + "loss": 0.2062, + "step": 8110 + }, + { + "epoch": 0.824951742354973, + "grad_norm": 4.3125, + "learning_rate": 3.760063143027945e-06, + "loss": 0.1343, + "step": 8120 + }, + { + "epoch": 0.8259676927765925, + "grad_norm": 1.984375, + "learning_rate": 3.7176565613522313e-06, + "loss": 0.2494, + "step": 8130 + }, + { + "epoch": 0.8269836431982119, + "grad_norm": 3.71875, + "learning_rate": 3.675471255217516e-06, + "loss": 0.1502, + "step": 8140 + }, + { + "epoch": 0.8279995936198313, + "grad_norm": 2.359375, + "learning_rate": 3.6335076632304175e-06, + "loss": 0.1256, + "step": 8150 + }, + { + "epoch": 0.8290155440414507, + "grad_norm": 1.46875, + "learning_rate": 3.5917662216923332e-06, + "loss": 0.1709, + "step": 8160 + }, + { + "epoch": 0.8300314944630702, + "grad_norm": 2.78125, + "learning_rate": 3.550247364594958e-06, + "loss": 0.1881, + "step": 8170 + }, + { + "epoch": 0.8310474448846896, + "grad_norm": 1.0703125, + "learning_rate": 3.508951523615725e-06, + "loss": 0.1998, + "step": 8180 + }, + { + "epoch": 0.832063395306309, + "grad_norm": 2.40625, + "learning_rate": 3.467879128113352e-06, + "loss": 0.2429, + "step": 8190 + }, + { + "epoch": 0.8330793457279285, + "grad_norm": 2.609375, + "learning_rate": 3.427030605123352e-06, + "loss": 0.1942, + "step": 8200 + }, + { + "epoch": 0.8340952961495479, + "grad_norm": 1.6015625, + "learning_rate": 3.3864063793536043e-06, + "loss": 0.1898, + "step": 8210 + }, + { + "epoch": 0.8351112465711673, + "grad_norm": 5.375, + "learning_rate": 3.3460068731799577e-06, + "loss": 0.1919, + "step": 8220 + }, + { + "epoch": 0.8361271969927867, + "grad_norm": 3.3125, + "learning_rate": 3.3058325066417818e-06, + "loss": 0.1516, + "step": 8230 + }, + { + "epoch": 0.8371431474144062, + "grad_norm": 0.76171875, + "learning_rate": 3.26588369743768e-06, + "loss": 0.1068, + "step": 8240 + }, + { + "epoch": 0.8381590978360256, + "grad_norm": 3.171875, + "learning_rate": 3.2261608609210653e-06, + "loss": 0.1203, + "step": 8250 + }, + { + "epoch": 0.839175048257645, + "grad_norm": 2.359375, + "learning_rate": 3.186664410095913e-06, + "loss": 0.2172, + "step": 8260 + }, + { + "epoch": 0.8401909986792645, + "grad_norm": 3.328125, + "learning_rate": 3.1473947556124093e-06, + "loss": 0.1249, + "step": 8270 + }, + { + "epoch": 0.8412069491008839, + "grad_norm": 2.484375, + "learning_rate": 3.1083523057627213e-06, + "loss": 0.1744, + "step": 8280 + }, + { + "epoch": 0.8422228995225033, + "grad_norm": 4.46875, + "learning_rate": 3.0695374664767353e-06, + "loss": 0.1772, + "step": 8290 + }, + { + "epoch": 0.8432388499441227, + "grad_norm": 0.59375, + "learning_rate": 3.0309506413178397e-06, + "loss": 0.2302, + "step": 8300 + }, + { + "epoch": 0.8442548003657422, + "grad_norm": 2.390625, + "learning_rate": 2.9925922314787136e-06, + "loss": 0.1635, + "step": 8310 + }, + { + "epoch": 0.8452707507873616, + "grad_norm": 2.34375, + "learning_rate": 2.954462635777194e-06, + "loss": 0.1573, + "step": 8320 + }, + { + "epoch": 0.846286701208981, + "grad_norm": 2.015625, + "learning_rate": 2.916562250652083e-06, + "loss": 0.1608, + "step": 8330 + }, + { + "epoch": 0.8473026516306005, + "grad_norm": 4.125, + "learning_rate": 2.878891470159048e-06, + "loss": 0.184, + "step": 8340 + }, + { + "epoch": 0.8483186020522199, + "grad_norm": 2.515625, + "learning_rate": 2.8414506859665514e-06, + "loss": 0.2141, + "step": 8350 + }, + { + "epoch": 0.8493345524738393, + "grad_norm": 3.375, + "learning_rate": 2.8042402873517197e-06, + "loss": 0.1729, + "step": 8360 + }, + { + "epoch": 0.8503505028954587, + "grad_norm": 3.078125, + "learning_rate": 2.76726066119635e-06, + "loss": 0.2252, + "step": 8370 + }, + { + "epoch": 0.8513664533170782, + "grad_norm": 1.5390625, + "learning_rate": 2.730512191982845e-06, + "loss": 0.1644, + "step": 8380 + }, + { + "epoch": 0.8523824037386976, + "grad_norm": 1.9296875, + "learning_rate": 2.693995261790261e-06, + "loss": 0.1822, + "step": 8390 + }, + { + "epoch": 0.853398354160317, + "grad_norm": 3.3125, + "learning_rate": 2.657710250290285e-06, + "loss": 0.2068, + "step": 8400 + }, + { + "epoch": 0.8544143045819365, + "grad_norm": 0.640625, + "learning_rate": 2.621657534743327e-06, + "loss": 0.1224, + "step": 8410 + }, + { + "epoch": 0.8554302550035559, + "grad_norm": 3.421875, + "learning_rate": 2.5858374899945804e-06, + "loss": 0.179, + "step": 8420 + }, + { + "epoch": 0.8564462054251752, + "grad_norm": 3.484375, + "learning_rate": 2.550250488470135e-06, + "loss": 0.1873, + "step": 8430 + }, + { + "epoch": 0.8574621558467946, + "grad_norm": 3.984375, + "learning_rate": 2.5148969001730806e-06, + "loss": 0.1799, + "step": 8440 + }, + { + "epoch": 0.8584781062684141, + "grad_norm": 1.375, + "learning_rate": 2.4797770926796858e-06, + "loss": 0.176, + "step": 8450 + }, + { + "epoch": 0.8594940566900335, + "grad_norm": 1.8984375, + "learning_rate": 2.444891431135571e-06, + "loss": 0.1664, + "step": 8460 + }, + { + "epoch": 0.8605100071116529, + "grad_norm": 4.15625, + "learning_rate": 2.4102402782518936e-06, + "loss": 0.1512, + "step": 8470 + }, + { + "epoch": 0.8615259575332723, + "grad_norm": 1.34375, + "learning_rate": 2.3758239943016096e-06, + "loss": 0.1629, + "step": 8480 + }, + { + "epoch": 0.8625419079548918, + "grad_norm": 5.3125, + "learning_rate": 2.3416429371157013e-06, + "loss": 0.2099, + "step": 8490 + }, + { + "epoch": 0.8635578583765112, + "grad_norm": 5.9375, + "learning_rate": 2.307697462079464e-06, + "loss": 0.2221, + "step": 8500 + }, + { + "epoch": 0.8645738087981306, + "grad_norm": 5.4375, + "learning_rate": 2.273987922128809e-06, + "loss": 0.2191, + "step": 8510 + }, + { + "epoch": 0.8655897592197501, + "grad_norm": 2.171875, + "learning_rate": 2.240514667746607e-06, + "loss": 0.1843, + "step": 8520 + }, + { + "epoch": 0.8666057096413695, + "grad_norm": 2.5625, + "learning_rate": 2.2072780469590245e-06, + "loss": 0.2494, + "step": 8530 + }, + { + "epoch": 0.8676216600629889, + "grad_norm": 2.25, + "learning_rate": 2.1742784053319116e-06, + "loss": 0.1712, + "step": 8540 + }, + { + "epoch": 0.8686376104846083, + "grad_norm": 4.5625, + "learning_rate": 2.141516085967224e-06, + "loss": 0.1169, + "step": 8550 + }, + { + "epoch": 0.8696535609062278, + "grad_norm": 4.25, + "learning_rate": 2.1089914294994434e-06, + "loss": 0.1374, + "step": 8560 + }, + { + "epoch": 0.8706695113278472, + "grad_norm": 3.265625, + "learning_rate": 2.0767047740920336e-06, + "loss": 0.2162, + "step": 8570 + }, + { + "epoch": 0.8716854617494666, + "grad_norm": 1.8203125, + "learning_rate": 2.0446564554339187e-06, + "loss": 0.1593, + "step": 8580 + }, + { + "epoch": 0.8727014121710861, + "grad_norm": 2.671875, + "learning_rate": 2.0128468067360185e-06, + "loss": 0.1857, + "step": 8590 + }, + { + "epoch": 0.8737173625927055, + "grad_norm": 2.765625, + "learning_rate": 1.981276158727749e-06, + "loss": 0.1989, + "step": 8600 + }, + { + "epoch": 0.8747333130143249, + "grad_norm": 2.65625, + "learning_rate": 1.949944839653625e-06, + "loss": 0.2077, + "step": 8610 + }, + { + "epoch": 0.8757492634359443, + "grad_norm": 2.625, + "learning_rate": 1.918853175269797e-06, + "loss": 0.2003, + "step": 8620 + }, + { + "epoch": 0.8767652138575638, + "grad_norm": 0.71875, + "learning_rate": 1.8880014888407127e-06, + "loss": 0.2486, + "step": 8630 + }, + { + "epoch": 0.8777811642791832, + "grad_norm": 4.71875, + "learning_rate": 1.8573901011357336e-06, + "loss": 0.1896, + "step": 8640 + }, + { + "epoch": 0.8787971147008026, + "grad_norm": 5.0625, + "learning_rate": 1.8270193304257887e-06, + "loss": 0.1727, + "step": 8650 + }, + { + "epoch": 0.8798130651224221, + "grad_norm": 1.75, + "learning_rate": 1.7968894924800916e-06, + "loss": 0.1687, + "step": 8660 + }, + { + "epoch": 0.8808290155440415, + "grad_norm": 2.65625, + "learning_rate": 1.7670009005628291e-06, + "loss": 0.166, + "step": 8670 + }, + { + "epoch": 0.8818449659656609, + "grad_norm": 4.71875, + "learning_rate": 1.737353865429936e-06, + "loss": 0.1471, + "step": 8680 + }, + { + "epoch": 0.8828609163872803, + "grad_norm": 0.546875, + "learning_rate": 1.7079486953258283e-06, + "loss": 0.1075, + "step": 8690 + }, + { + "epoch": 0.8838768668088998, + "grad_norm": 1.640625, + "learning_rate": 1.6787856959802367e-06, + "loss": 0.2113, + "step": 8700 + }, + { + "epoch": 0.8848928172305192, + "grad_norm": 2.953125, + "learning_rate": 1.6498651706049945e-06, + "loss": 0.1412, + "step": 8710 + }, + { + "epoch": 0.8859087676521386, + "grad_norm": 3.796875, + "learning_rate": 1.6211874198909072e-06, + "loss": 0.1701, + "step": 8720 + }, + { + "epoch": 0.8869247180737581, + "grad_norm": 3.734375, + "learning_rate": 1.592752742004605e-06, + "loss": 0.1348, + "step": 8730 + }, + { + "epoch": 0.8879406684953774, + "grad_norm": 2.21875, + "learning_rate": 1.5645614325854735e-06, + "loss": 0.1931, + "step": 8740 + }, + { + "epoch": 0.8889566189169968, + "grad_norm": 3.4375, + "learning_rate": 1.5366137847425466e-06, + "loss": 0.1705, + "step": 8750 + }, + { + "epoch": 0.8899725693386162, + "grad_norm": 3.5625, + "learning_rate": 1.5089100890514769e-06, + "loss": 0.1889, + "step": 8760 + }, + { + "epoch": 0.8909885197602357, + "grad_norm": 2.65625, + "learning_rate": 1.4814506335515176e-06, + "loss": 0.1837, + "step": 8770 + }, + { + "epoch": 0.8920044701818551, + "grad_norm": 1.421875, + "learning_rate": 1.4542357037425207e-06, + "loss": 0.1728, + "step": 8780 + }, + { + "epoch": 0.8930204206034745, + "grad_norm": 1.625, + "learning_rate": 1.4272655825819713e-06, + "loss": 0.1562, + "step": 8790 + }, + { + "epoch": 0.8940363710250939, + "grad_norm": 4.0625, + "learning_rate": 1.4005405504820351e-06, + "loss": 0.1681, + "step": 8800 + }, + { + "epoch": 0.8950523214467134, + "grad_norm": 2.328125, + "learning_rate": 1.3740608853066634e-06, + "loss": 0.1449, + "step": 8810 + }, + { + "epoch": 0.8960682718683328, + "grad_norm": 4.0625, + "learning_rate": 1.347826862368684e-06, + "loss": 0.2418, + "step": 8820 + }, + { + "epoch": 0.8970842222899522, + "grad_norm": 0.55859375, + "learning_rate": 1.3218387544269545e-06, + "loss": 0.2473, + "step": 8830 + }, + { + "epoch": 0.8981001727115717, + "grad_norm": 4.78125, + "learning_rate": 1.2960968316835132e-06, + "loss": 0.194, + "step": 8840 + }, + { + "epoch": 0.8991161231331911, + "grad_norm": 3.921875, + "learning_rate": 1.2706013617807822e-06, + "loss": 0.2109, + "step": 8850 + }, + { + "epoch": 0.9001320735548105, + "grad_norm": 5.03125, + "learning_rate": 1.2453526097987778e-06, + "loss": 0.151, + "step": 8860 + }, + { + "epoch": 0.9011480239764299, + "grad_norm": 5.96875, + "learning_rate": 1.2203508382523431e-06, + "loss": 0.1811, + "step": 8870 + }, + { + "epoch": 0.9021639743980494, + "grad_norm": 3.828125, + "learning_rate": 1.1955963070884534e-06, + "loss": 0.2004, + "step": 8880 + }, + { + "epoch": 0.9031799248196688, + "grad_norm": 1.9765625, + "learning_rate": 1.171089273683465e-06, + "loss": 0.1395, + "step": 8890 + }, + { + "epoch": 0.9041958752412882, + "grad_norm": 2.328125, + "learning_rate": 1.1468299928404868e-06, + "loss": 0.1915, + "step": 8900 + }, + { + "epoch": 0.9052118256629077, + "grad_norm": 1.265625, + "learning_rate": 1.1228187167866943e-06, + "loss": 0.1281, + "step": 8910 + }, + { + "epoch": 0.9062277760845271, + "grad_norm": 1.4375, + "learning_rate": 1.099055695170728e-06, + "loss": 0.1627, + "step": 8920 + }, + { + "epoch": 0.9072437265061465, + "grad_norm": 0.6953125, + "learning_rate": 1.0755411750600962e-06, + "loss": 0.1768, + "step": 8930 + }, + { + "epoch": 0.9082596769277659, + "grad_norm": 1.046875, + "learning_rate": 1.052275400938596e-06, + "loss": 0.1544, + "step": 8940 + }, + { + "epoch": 0.9092756273493854, + "grad_norm": 2.71875, + "learning_rate": 1.0292586147037764e-06, + "loss": 0.2498, + "step": 8950 + }, + { + "epoch": 0.9102915777710048, + "grad_norm": 3.0625, + "learning_rate": 1.0064910556644214e-06, + "loss": 0.1918, + "step": 8960 + }, + { + "epoch": 0.9113075281926242, + "grad_norm": 4.0, + "learning_rate": 9.839729605380766e-07, + "loss": 0.2388, + "step": 8970 + }, + { + "epoch": 0.9123234786142437, + "grad_norm": 3.765625, + "learning_rate": 9.61704563448565e-07, + "loss": 0.1944, + "step": 8980 + }, + { + "epoch": 0.9133394290358631, + "grad_norm": 2.90625, + "learning_rate": 9.396860959235671e-07, + "loss": 0.1667, + "step": 8990 + }, + { + "epoch": 0.9143553794574825, + "grad_norm": 2.4375, + "learning_rate": 9.179177868922085e-07, + "loss": 0.2143, + "step": 9000 + }, + { + "epoch": 0.9153713298791019, + "grad_norm": 3.03125, + "learning_rate": 8.963998626826925e-07, + "loss": 0.1994, + "step": 9010 + }, + { + "epoch": 0.9163872803007214, + "grad_norm": 3.859375, + "learning_rate": 8.751325470199134e-07, + "loss": 0.1714, + "step": 9020 + }, + { + "epoch": 0.9174032307223408, + "grad_norm": 3.375, + "learning_rate": 8.541160610231803e-07, + "loss": 0.144, + "step": 9030 + }, + { + "epoch": 0.9184191811439602, + "grad_norm": 1.3046875, + "learning_rate": 8.333506232038629e-07, + "loss": 0.1333, + "step": 9040 + }, + { + "epoch": 0.9194351315655795, + "grad_norm": 1.734375, + "learning_rate": 8.128364494631724e-07, + "loss": 0.1504, + "step": 9050 + }, + { + "epoch": 0.920451081987199, + "grad_norm": 2.90625, + "learning_rate": 7.925737530898702e-07, + "loss": 0.2235, + "step": 9060 + }, + { + "epoch": 0.9214670324088184, + "grad_norm": 3.359375, + "learning_rate": 7.725627447580902e-07, + "loss": 0.1256, + "step": 9070 + }, + { + "epoch": 0.9224829828304378, + "grad_norm": 6.125, + "learning_rate": 7.528036325251231e-07, + "loss": 0.1963, + "step": 9080 + }, + { + "epoch": 0.9234989332520573, + "grad_norm": 2.359375, + "learning_rate": 7.33296621829252e-07, + "loss": 0.2208, + "step": 9090 + }, + { + "epoch": 0.9245148836736767, + "grad_norm": 3.546875, + "learning_rate": 7.140419154876372e-07, + "loss": 0.184, + "step": 9100 + }, + { + "epoch": 0.9255308340952961, + "grad_norm": 2.640625, + "learning_rate": 6.950397136941872e-07, + "loss": 0.2097, + "step": 9110 + }, + { + "epoch": 0.9265467845169155, + "grad_norm": 3.671875, + "learning_rate": 6.762902140174888e-07, + "loss": 0.19, + "step": 9120 + }, + { + "epoch": 0.927562734938535, + "grad_norm": 2.03125, + "learning_rate": 6.577936113987437e-07, + "loss": 0.1427, + "step": 9130 + }, + { + "epoch": 0.9285786853601544, + "grad_norm": 2.1875, + "learning_rate": 6.395500981497577e-07, + "loss": 0.2116, + "step": 9140 + }, + { + "epoch": 0.9295946357817738, + "grad_norm": 5.25, + "learning_rate": 6.215598639509185e-07, + "loss": 0.2384, + "step": 9150 + }, + { + "epoch": 0.9306105862033933, + "grad_norm": 3.953125, + "learning_rate": 6.038230958492403e-07, + "loss": 0.2406, + "step": 9160 + }, + { + "epoch": 0.9316265366250127, + "grad_norm": 4.09375, + "learning_rate": 5.863399782564199e-07, + "loss": 0.1889, + "step": 9170 + }, + { + "epoch": 0.9326424870466321, + "grad_norm": 3.109375, + "learning_rate": 5.691106929469004e-07, + "loss": 0.1916, + "step": 9180 + }, + { + "epoch": 0.9336584374682515, + "grad_norm": 1.65625, + "learning_rate": 5.521354190560102e-07, + "loss": 0.1752, + "step": 9190 + }, + { + "epoch": 0.934674387889871, + "grad_norm": 5.21875, + "learning_rate": 5.354143330780714e-07, + "loss": 0.1779, + "step": 9200 + }, + { + "epoch": 0.9356903383114904, + "grad_norm": 2.359375, + "learning_rate": 5.18947608864595e-07, + "loss": 0.1461, + "step": 9210 + }, + { + "epoch": 0.9367062887331098, + "grad_norm": 1.0859375, + "learning_rate": 5.027354176224353e-07, + "loss": 0.2565, + "step": 9220 + }, + { + "epoch": 0.9377222391547293, + "grad_norm": 4.5, + "learning_rate": 4.867779279120493e-07, + "loss": 0.2301, + "step": 9230 + }, + { + "epoch": 0.9387381895763487, + "grad_norm": 2.984375, + "learning_rate": 4.710753056457157e-07, + "loss": 0.1916, + "step": 9240 + }, + { + "epoch": 0.9397541399979681, + "grad_norm": 4.40625, + "learning_rate": 4.556277140858267e-07, + "loss": 0.1808, + "step": 9250 + }, + { + "epoch": 0.9407700904195875, + "grad_norm": 4.3125, + "learning_rate": 4.404353138431766e-07, + "loss": 0.1552, + "step": 9260 + }, + { + "epoch": 0.941786040841207, + "grad_norm": 2.640625, + "learning_rate": 4.254982628753096e-07, + "loss": 0.1995, + "step": 9270 + }, + { + "epoch": 0.9428019912628264, + "grad_norm": 3.03125, + "learning_rate": 4.108167164848575e-07, + "loss": 0.1495, + "step": 9280 + }, + { + "epoch": 0.9438179416844458, + "grad_norm": 3.203125, + "learning_rate": 3.963908273179384e-07, + "loss": 0.1787, + "step": 9290 + }, + { + "epoch": 0.9448338921060653, + "grad_norm": 0.8671875, + "learning_rate": 3.8222074536257144e-07, + "loss": 0.1742, + "step": 9300 + }, + { + "epoch": 0.9458498425276847, + "grad_norm": 0.734375, + "learning_rate": 3.683066179470979e-07, + "loss": 0.1386, + "step": 9310 + }, + { + "epoch": 0.9468657929493041, + "grad_norm": 2.796875, + "learning_rate": 3.5464858973868476e-07, + "loss": 0.1806, + "step": 9320 + }, + { + "epoch": 0.9478817433709235, + "grad_norm": 2.734375, + "learning_rate": 3.4124680274177646e-07, + "loss": 0.1873, + "step": 9330 + }, + { + "epoch": 0.948897693792543, + "grad_norm": 2.390625, + "learning_rate": 3.2810139629665393e-07, + "loss": 0.1745, + "step": 9340 + }, + { + "epoch": 0.9499136442141624, + "grad_norm": 3.546875, + "learning_rate": 3.152125070779749e-07, + "loss": 0.2116, + "step": 9350 + }, + { + "epoch": 0.9509295946357817, + "grad_norm": 5.5625, + "learning_rate": 3.0258026909334713e-07, + "loss": 0.2088, + "step": 9360 + }, + { + "epoch": 0.9519455450574011, + "grad_norm": 4.5, + "learning_rate": 2.9020481368193795e-07, + "loss": 0.111, + "step": 9370 + }, + { + "epoch": 0.9529614954790206, + "grad_norm": 3.15625, + "learning_rate": 2.7808626951310867e-07, + "loss": 0.2391, + "step": 9380 + }, + { + "epoch": 0.95397744590064, + "grad_norm": 2.6875, + "learning_rate": 2.662247625850822e-07, + "loss": 0.3217, + "step": 9390 + }, + { + "epoch": 0.9549933963222594, + "grad_norm": 2.65625, + "learning_rate": 2.5462041622362767e-07, + "loss": 0.1667, + "step": 9400 + }, + { + "epoch": 0.9560093467438789, + "grad_norm": 0.8984375, + "learning_rate": 2.4327335108077773e-07, + "loss": 0.1709, + "step": 9410 + }, + { + "epoch": 0.9570252971654983, + "grad_norm": 4.65625, + "learning_rate": 2.3218368513357737e-07, + "loss": 0.1912, + "step": 9420 + }, + { + "epoch": 0.9580412475871177, + "grad_norm": 3.390625, + "learning_rate": 2.213515336828592e-07, + "loss": 0.1544, + "step": 9430 + }, + { + "epoch": 0.9590571980087371, + "grad_norm": 2.921875, + "learning_rate": 2.1077700935202836e-07, + "loss": 0.1806, + "step": 9440 + }, + { + "epoch": 0.9600731484303566, + "grad_norm": 3.5625, + "learning_rate": 2.004602220859214e-07, + "loss": 0.152, + "step": 9450 + }, + { + "epoch": 0.961089098851976, + "grad_norm": 2.671875, + "learning_rate": 1.9040127914963514e-07, + "loss": 0.1799, + "step": 9460 + }, + { + "epoch": 0.9621050492735954, + "grad_norm": 2.828125, + "learning_rate": 1.8060028512742188e-07, + "loss": 0.1811, + "step": 9470 + }, + { + "epoch": 0.9631209996952149, + "grad_norm": 5.875, + "learning_rate": 1.7105734192160717e-07, + "loss": 0.2012, + "step": 9480 + }, + { + "epoch": 0.9641369501168343, + "grad_norm": 3.21875, + "learning_rate": 1.6177254875152647e-07, + "loss": 0.2129, + "step": 9490 + }, + { + "epoch": 0.9651529005384537, + "grad_norm": 1.078125, + "learning_rate": 1.5274600215248736e-07, + "loss": 0.1498, + "step": 9500 + } + ], + "logging_steps": 10, + "max_steps": 9843, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoints/checkpoint-9500/training_args.bin b/checkpoints/checkpoint-9500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bb0a4ec10ab29ca1942aecdfa4212d352e373f1 --- /dev/null +++ b/checkpoints/checkpoint-9500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 +size 4984 diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6bb0a4ec10ab29ca1942aecdfa4212d352e373f1 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 +size 4984