diff --git a/adapter_config.json b/adapter_config.json index 2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70..c836ae77959e70554d42ba16c549bf81b3a7a2a4 100644 --- a/adapter_config.json +++ b/adapter_config.json @@ -1,7 +1,7 @@ { "alpha_pattern": {}, "auto_mapping": null, - "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", + "base_model_name_or_path": "../pretrained/MiniCPM-2B-dpo-bf16", "bias": "none", "fan_in_fan_out": false, "inference_mode": true, @@ -20,8 +20,8 @@ "rank_pattern": {}, "revision": null, "target_modules": [ - "v_proj", - "q_proj" + "q_proj", + "v_proj" ], "task_type": "CAUSAL_LM", "use_dora": false, diff --git a/adapter_model.safetensors b/adapter_model.safetensors index d4507be88293606900a542574f09ce87322b185b..1eb3689198e9cf5913ef9a7a1384a1eb24fe9836 100644 --- a/adapter_model.safetensors +++ b/adapter_model.safetensors @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e566c61bed3b05cde1ebb3a325febfce0fafe3d3f3e648acb48bf8bd33cedb20 +oid sha256:98a6673650f0341a09b85b29df958305686f39a2c562102c86dbf0ba9443f436 size 5919456 diff --git a/checkpoints/.DS_Store b/checkpoints/.DS_Store deleted file mode 100644 index 319da2ad88eb2d840c5fe704eb9e32c055d29681..0000000000000000000000000000000000000000 Binary files a/checkpoints/.DS_Store and /dev/null differ diff --git a/checkpoints/checkpoint-1000/.DS_Store b/checkpoints/checkpoint-1000/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 Binary files a/checkpoints/checkpoint-1000/.DS_Store and /dev/null differ diff --git a/checkpoints/checkpoint-1000/adapter_config.json b/checkpoints/checkpoint-1000/adapter_config.json deleted file mode 100644 index 2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-1000/adapter_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "alpha_pattern": {}, - "auto_mapping": null, - "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": "gaussian", - "layer_replication": null, - "layers_pattern": null, - "layers_to_transform": null, - "loftq_config": {}, - "lora_alpha": 32, - "lora_dropout": 0.1, - "megatron_config": null, - "megatron_core": "megatron.core", - "modules_to_save": null, - "peft_type": "LORA", - "r": 8, - "rank_pattern": {}, - "revision": null, - "target_modules": [ - "v_proj", - "q_proj" - ], - "task_type": "CAUSAL_LM", - "use_dora": false, - "use_rslora": false -} \ No newline at end of file diff --git a/checkpoints/checkpoint-1000/adapter_model.safetensors b/checkpoints/checkpoint-1000/adapter_model.safetensors deleted file mode 100644 index ada62a9056d2bc3e2d2cbb277e551f55e6b4aa0b..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-1000/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e7021d0f53b2af54bc0fb2b36f002603368f00cc5e37cf8ffa4d35e14e850cfc -size 5919456 diff --git a/checkpoints/checkpoint-1000/optimizer.pt b/checkpoints/checkpoint-1000/optimizer.pt deleted file mode 100644 index e68730afa32e5f46d317ce668b5fa018beb68976..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-1000/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cae149eaee16a95e002bf921dbb3bd8869b631f62813911b3680edf18d7ff723 -size 11930938 diff --git a/checkpoints/checkpoint-1000/rng_state_0.pth b/checkpoints/checkpoint-1000/rng_state_0.pth deleted file mode 100644 index 63c03aea1b9f7d3ab56583f85946ad1c03bba717..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-1000/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:287014c23e0c4e9613e974e969516cd0fc0bdce46daf1c6f1c0b66e3eb091e50 -size 15024 diff --git a/checkpoints/checkpoint-1000/rng_state_1.pth b/checkpoints/checkpoint-1000/rng_state_1.pth deleted file mode 100644 index fced8d099bdfbcd4544fef78e3314059f786c5cd..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-1000/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2ab87bd92d10c436f5b79e618401a8481e12e99083829692c2fc0de84edcad99 -size 15024 diff --git a/checkpoints/checkpoint-1000/rng_state_2.pth b/checkpoints/checkpoint-1000/rng_state_2.pth deleted file mode 100644 index 10c8b81e903b01c272edbdb09a6d11cdd8111bb9..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-1000/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8553566dff78f7c07a5a3e5517a0a640c6ef80a0e95eb328fed4c566945f6fd0 -size 15024 diff --git a/checkpoints/checkpoint-1000/rng_state_3.pth b/checkpoints/checkpoint-1000/rng_state_3.pth deleted file mode 100644 index 0cc13eb6a1f6df01cb785380fb1717d312f38f54..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-1000/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:60c899ef713ad40446331266b18149afa90e55eed1210839243a923ea8aa772d -size 15024 diff --git a/checkpoints/checkpoint-1000/scheduler.pt b/checkpoints/checkpoint-1000/scheduler.pt deleted file mode 100644 index a0d6f7f9da279b71ea03802b24c083ad94591354..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-1000/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:feeecf82b24e0ddf3c8d8285a678fe39a1184c1c961cf677b5ac8d36409a9a05 -size 1064 diff --git a/checkpoints/checkpoint-1000/trainer_state.json b/checkpoints/checkpoint-1000/trainer_state.json deleted file mode 100644 index 390fc3ea2539619e97f2debda2245ee0140a82df..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-1000/trainer_state.json +++ /dev/null @@ -1,721 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.1015950421619425, - "eval_steps": 500, - "global_step": 1000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001015950421619425, - "grad_norm": 17.625, - "learning_rate": 5e-06, - "loss": 3.4264, - "step": 10 - }, - { - "epoch": 0.00203190084323885, - "grad_norm": 12.5625, - "learning_rate": 1e-05, - "loss": 3.432, - "step": 20 - }, - { - "epoch": 0.003047851264858275, - "grad_norm": 14.0625, - "learning_rate": 1.5e-05, - "loss": 3.23, - "step": 30 - }, - { - "epoch": 0.0040638016864777, - "grad_norm": 12.4375, - "learning_rate": 2e-05, - "loss": 2.9762, - "step": 40 - }, - { - "epoch": 0.005079752108097125, - "grad_norm": 10.0625, - "learning_rate": 2.5e-05, - "loss": 2.6173, - "step": 50 - }, - { - "epoch": 0.00609570252971655, - "grad_norm": 10.1875, - "learning_rate": 3e-05, - "loss": 2.2004, - "step": 60 - }, - { - "epoch": 0.007111652951335975, - "grad_norm": 7.03125, - "learning_rate": 3.5e-05, - "loss": 1.4176, - "step": 70 - }, - { - "epoch": 0.0081276033729554, - "grad_norm": 4.375, - "learning_rate": 4e-05, - "loss": 1.0122, - "step": 80 - }, - { - "epoch": 0.009143553794574825, - "grad_norm": 6.5625, - "learning_rate": 4.5e-05, - "loss": 0.9116, - "step": 90 - }, - { - "epoch": 0.01015950421619425, - "grad_norm": 5.28125, - "learning_rate": 5e-05, - "loss": 0.6832, - "step": 100 - }, - { - "epoch": 0.011175454637813675, - "grad_norm": 5.5, - "learning_rate": 4.9999870035728426e-05, - "loss": 0.7355, - "step": 110 - }, - { - "epoch": 0.0121914050594331, - "grad_norm": 5.1875, - "learning_rate": 4.9999480144264944e-05, - "loss": 0.6673, - "step": 120 - }, - { - "epoch": 0.013207355481052525, - "grad_norm": 4.5, - "learning_rate": 4.9998830329663314e-05, - "loss": 0.6792, - "step": 130 - }, - { - "epoch": 0.01422330590267195, - "grad_norm": 3.9375, - "learning_rate": 4.9997920598679756e-05, - "loss": 0.6207, - "step": 140 - }, - { - "epoch": 0.015239256324291375, - "grad_norm": 3.15625, - "learning_rate": 4.999675096077286e-05, - "loss": 0.483, - "step": 150 - }, - { - "epoch": 0.0162552067459108, - "grad_norm": 5.28125, - "learning_rate": 4.999532142810354e-05, - "loss": 0.5319, - "step": 160 - }, - { - "epoch": 0.017271157167530225, - "grad_norm": 4.59375, - "learning_rate": 4.999363201553483e-05, - "loss": 0.6052, - "step": 170 - }, - { - "epoch": 0.01828710758914965, - "grad_norm": 5.03125, - "learning_rate": 4.9991682740631794e-05, - "loss": 0.4258, - "step": 180 - }, - { - "epoch": 0.019303058010769075, - "grad_norm": 3.859375, - "learning_rate": 4.998947362366133e-05, - "loss": 0.4309, - "step": 190 - }, - { - "epoch": 0.0203190084323885, - "grad_norm": 3.328125, - "learning_rate": 4.998700468759193e-05, - "loss": 0.3957, - "step": 200 - }, - { - "epoch": 0.021334958854007924, - "grad_norm": 4.9375, - "learning_rate": 4.9984275958093475e-05, - "loss": 0.4777, - "step": 210 - }, - { - "epoch": 0.02235090927562735, - "grad_norm": 4.78125, - "learning_rate": 4.998128746353695e-05, - "loss": 0.3549, - "step": 220 - }, - { - "epoch": 0.023366859697246774, - "grad_norm": 4.0625, - "learning_rate": 4.997803923499417e-05, - "loss": 0.4447, - "step": 230 - }, - { - "epoch": 0.0243828101188662, - "grad_norm": 6.375, - "learning_rate": 4.99745313062374e-05, - "loss": 0.3808, - "step": 240 - }, - { - "epoch": 0.025398760540485624, - "grad_norm": 3.59375, - "learning_rate": 4.99707637137391e-05, - "loss": 0.3827, - "step": 250 - }, - { - "epoch": 0.02641471096210505, - "grad_norm": 3.015625, - "learning_rate": 4.996673649667145e-05, - "loss": 0.3694, - "step": 260 - }, - { - "epoch": 0.027430661383724474, - "grad_norm": 2.296875, - "learning_rate": 4.9962449696906e-05, - "loss": 0.3586, - "step": 270 - }, - { - "epoch": 0.0284466118053439, - "grad_norm": 4.125, - "learning_rate": 4.9957903359013214e-05, - "loss": 0.3832, - "step": 280 - }, - { - "epoch": 0.029462562226963324, - "grad_norm": 3.296875, - "learning_rate": 4.995309753026201e-05, - "loss": 0.328, - "step": 290 - }, - { - "epoch": 0.03047851264858275, - "grad_norm": 4.5, - "learning_rate": 4.994803226061927e-05, - "loss": 0.3667, - "step": 300 - }, - { - "epoch": 0.03149446307020217, - "grad_norm": 4.3125, - "learning_rate": 4.994270760274933e-05, - "loss": 0.3811, - "step": 310 - }, - { - "epoch": 0.0325104134918216, - "grad_norm": 3.421875, - "learning_rate": 4.99371236120134e-05, - "loss": 0.3065, - "step": 320 - }, - { - "epoch": 0.03352636391344102, - "grad_norm": 4.6875, - "learning_rate": 4.993128034646902e-05, - "loss": 0.4177, - "step": 330 - }, - { - "epoch": 0.03454231433506045, - "grad_norm": 3.046875, - "learning_rate": 4.992517786686947e-05, - "loss": 0.33, - "step": 340 - }, - { - "epoch": 0.03555826475667987, - "grad_norm": 1.8828125, - "learning_rate": 4.9918816236663077e-05, - "loss": 0.3287, - "step": 350 - }, - { - "epoch": 0.0365742151782993, - "grad_norm": 3.8125, - "learning_rate": 4.991219552199262e-05, - "loss": 0.2934, - "step": 360 - }, - { - "epoch": 0.03759016559991872, - "grad_norm": 4.28125, - "learning_rate": 4.99053157916946e-05, - "loss": 0.3176, - "step": 370 - }, - { - "epoch": 0.03860611602153815, - "grad_norm": 2.609375, - "learning_rate": 4.989817711729856e-05, - "loss": 0.3318, - "step": 380 - }, - { - "epoch": 0.03962206644315757, - "grad_norm": 2.375, - "learning_rate": 4.98907795730263e-05, - "loss": 0.3234, - "step": 390 - }, - { - "epoch": 0.040638016864777, - "grad_norm": 4.46875, - "learning_rate": 4.988312323579114e-05, - "loss": 0.267, - "step": 400 - }, - { - "epoch": 0.04165396728639642, - "grad_norm": 3.75, - "learning_rate": 4.98752081851971e-05, - "loss": 0.3081, - "step": 410 - }, - { - "epoch": 0.04266991770801585, - "grad_norm": 2.203125, - "learning_rate": 4.986703450353809e-05, - "loss": 0.2917, - "step": 420 - }, - { - "epoch": 0.04368586812963527, - "grad_norm": 1.6015625, - "learning_rate": 4.985860227579703e-05, - "loss": 0.2805, - "step": 430 - }, - { - "epoch": 0.0447018185512547, - "grad_norm": 3.140625, - "learning_rate": 4.984991158964499e-05, - "loss": 0.3534, - "step": 440 - }, - { - "epoch": 0.04571776897287412, - "grad_norm": 3.296875, - "learning_rate": 4.9840962535440265e-05, - "loss": 0.335, - "step": 450 - }, - { - "epoch": 0.04673371939449355, - "grad_norm": 3.25, - "learning_rate": 4.983175520622744e-05, - "loss": 0.2544, - "step": 460 - }, - { - "epoch": 0.04774966981611297, - "grad_norm": 2.25, - "learning_rate": 4.982228969773642e-05, - "loss": 0.3449, - "step": 470 - }, - { - "epoch": 0.0487656202377324, - "grad_norm": 4.9375, - "learning_rate": 4.9812566108381435e-05, - "loss": 0.2964, - "step": 480 - }, - { - "epoch": 0.04978157065935182, - "grad_norm": 1.5703125, - "learning_rate": 4.9802584539260035e-05, - "loss": 0.2799, - "step": 490 - }, - { - "epoch": 0.05079752108097125, - "grad_norm": 2.828125, - "learning_rate": 4.979234509415199e-05, - "loss": 0.3231, - "step": 500 - }, - { - "epoch": 0.05181347150259067, - "grad_norm": 2.9375, - "learning_rate": 4.978184787951828e-05, - "loss": 0.2943, - "step": 510 - }, - { - "epoch": 0.0528294219242101, - "grad_norm": 2.34375, - "learning_rate": 4.977109300449992e-05, - "loss": 0.2705, - "step": 520 - }, - { - "epoch": 0.05384537234582952, - "grad_norm": 3.140625, - "learning_rate": 4.9760080580916876e-05, - "loss": 0.2998, - "step": 530 - }, - { - "epoch": 0.05486132276744895, - "grad_norm": 3.5625, - "learning_rate": 4.974881072326688e-05, - "loss": 0.2595, - "step": 540 - }, - { - "epoch": 0.05587727318906837, - "grad_norm": 4.25, - "learning_rate": 4.9737283548724236e-05, - "loss": 0.2803, - "step": 550 - }, - { - "epoch": 0.0568932236106878, - "grad_norm": 4.0625, - "learning_rate": 4.97254991771386e-05, - "loss": 0.3511, - "step": 560 - }, - { - "epoch": 0.05790917403230722, - "grad_norm": 2.515625, - "learning_rate": 4.971345773103377e-05, - "loss": 0.312, - "step": 570 - }, - { - "epoch": 0.05892512445392665, - "grad_norm": 3.21875, - "learning_rate": 4.9701159335606365e-05, - "loss": 0.2482, - "step": 580 - }, - { - "epoch": 0.05994107487554607, - "grad_norm": 5.5, - "learning_rate": 4.968860411872454e-05, - "loss": 0.2537, - "step": 590 - }, - { - "epoch": 0.0609570252971655, - "grad_norm": 3.546875, - "learning_rate": 4.967579221092666e-05, - "loss": 0.3125, - "step": 600 - }, - { - "epoch": 0.06197297571878492, - "grad_norm": 2.984375, - "learning_rate": 4.966272374541996e-05, - "loss": 0.2354, - "step": 610 - }, - { - "epoch": 0.06298892614040434, - "grad_norm": 3.6875, - "learning_rate": 4.964939885807912e-05, - "loss": 0.3213, - "step": 620 - }, - { - "epoch": 0.06400487656202378, - "grad_norm": 2.140625, - "learning_rate": 4.9635817687444876e-05, - "loss": 0.3003, - "step": 630 - }, - { - "epoch": 0.0650208269836432, - "grad_norm": 3.484375, - "learning_rate": 4.962198037472259e-05, - "loss": 0.2996, - "step": 640 - }, - { - "epoch": 0.06603677740526262, - "grad_norm": 3.21875, - "learning_rate": 4.9607887063780776e-05, - "loss": 0.2257, - "step": 650 - }, - { - "epoch": 0.06705272782688204, - "grad_norm": 5.375, - "learning_rate": 4.9593537901149564e-05, - "loss": 0.223, - "step": 660 - }, - { - "epoch": 0.06806867824850148, - "grad_norm": 4.1875, - "learning_rate": 4.957893303601924e-05, - "loss": 0.3407, - "step": 670 - }, - { - "epoch": 0.0690846286701209, - "grad_norm": 3.328125, - "learning_rate": 4.956407262023866e-05, - "loss": 0.2589, - "step": 680 - }, - { - "epoch": 0.07010057909174032, - "grad_norm": 2.953125, - "learning_rate": 4.954895680831367e-05, - "loss": 0.2949, - "step": 690 - }, - { - "epoch": 0.07111652951335974, - "grad_norm": 4.0625, - "learning_rate": 4.9533585757405506e-05, - "loss": 0.2995, - "step": 700 - }, - { - "epoch": 0.07213247993497918, - "grad_norm": 4.625, - "learning_rate": 4.951795962732917e-05, - "loss": 0.2894, - "step": 710 - }, - { - "epoch": 0.0731484303565986, - "grad_norm": 3.0, - "learning_rate": 4.9502078580551755e-05, - "loss": 0.3082, - "step": 720 - }, - { - "epoch": 0.07416438077821802, - "grad_norm": 3.65625, - "learning_rate": 4.9485942782190734e-05, - "loss": 0.2308, - "step": 730 - }, - { - "epoch": 0.07518033119983744, - "grad_norm": 4.78125, - "learning_rate": 4.9469552400012306e-05, - "loss": 0.2272, - "step": 740 - }, - { - "epoch": 0.07619628162145688, - "grad_norm": 4.25, - "learning_rate": 4.94529076044296e-05, - "loss": 0.2701, - "step": 750 - }, - { - "epoch": 0.0772122320430763, - "grad_norm": 3.140625, - "learning_rate": 4.94360085685009e-05, - "loss": 0.2686, - "step": 760 - }, - { - "epoch": 0.07822818246469572, - "grad_norm": 0.765625, - "learning_rate": 4.9418855467927894e-05, - "loss": 0.2051, - "step": 770 - }, - { - "epoch": 0.07924413288631514, - "grad_norm": 1.796875, - "learning_rate": 4.940144848105379e-05, - "loss": 0.2267, - "step": 780 - }, - { - "epoch": 0.08026008330793458, - "grad_norm": 4.5625, - "learning_rate": 4.93837877888615e-05, - "loss": 0.2597, - "step": 790 - }, - { - "epoch": 0.081276033729554, - "grad_norm": 3.03125, - "learning_rate": 4.9365873574971745e-05, - "loss": 0.3701, - "step": 800 - }, - { - "epoch": 0.08229198415117342, - "grad_norm": 4.5625, - "learning_rate": 4.9347706025641136e-05, - "loss": 0.2559, - "step": 810 - }, - { - "epoch": 0.08330793457279284, - "grad_norm": 3.90625, - "learning_rate": 4.9329285329760275e-05, - "loss": 0.2799, - "step": 820 - }, - { - "epoch": 0.08432388499441228, - "grad_norm": 3.140625, - "learning_rate": 4.9310611678851735e-05, - "loss": 0.2866, - "step": 830 - }, - { - "epoch": 0.0853398354160317, - "grad_norm": 2.46875, - "learning_rate": 4.929168526706811e-05, - "loss": 0.3105, - "step": 840 - }, - { - "epoch": 0.08635578583765112, - "grad_norm": 13.625, - "learning_rate": 4.927250629119e-05, - "loss": 0.2454, - "step": 850 - }, - { - "epoch": 0.08737173625927054, - "grad_norm": 3.921875, - "learning_rate": 4.9253074950623925e-05, - "loss": 0.2424, - "step": 860 - }, - { - "epoch": 0.08838768668088998, - "grad_norm": 2.90625, - "learning_rate": 4.9233391447400286e-05, - "loss": 0.2481, - "step": 870 - }, - { - "epoch": 0.0894036371025094, - "grad_norm": 2.96875, - "learning_rate": 4.921345598617125e-05, - "loss": 0.2231, - "step": 880 - }, - { - "epoch": 0.09041958752412882, - "grad_norm": 5.375, - "learning_rate": 4.9193268774208654e-05, - "loss": 0.3447, - "step": 890 - }, - { - "epoch": 0.09143553794574824, - "grad_norm": 2.0, - "learning_rate": 4.9172830021401785e-05, - "loss": 0.229, - "step": 900 - }, - { - "epoch": 0.09245148836736768, - "grad_norm": 3.1875, - "learning_rate": 4.9152139940255245e-05, - "loss": 0.2122, - "step": 910 - }, - { - "epoch": 0.0934674387889871, - "grad_norm": 3.40625, - "learning_rate": 4.913119874588677e-05, - "loss": 0.2386, - "step": 920 - }, - { - "epoch": 0.09448338921060652, - "grad_norm": 1.4609375, - "learning_rate": 4.911000665602489e-05, - "loss": 0.1944, - "step": 930 - }, - { - "epoch": 0.09549933963222594, - "grad_norm": 5.0625, - "learning_rate": 4.9088563891006786e-05, - "loss": 0.2038, - "step": 940 - }, - { - "epoch": 0.09651529005384538, - "grad_norm": 4.53125, - "learning_rate": 4.906687067377592e-05, - "loss": 0.3122, - "step": 950 - }, - { - "epoch": 0.0975312404754648, - "grad_norm": 2.84375, - "learning_rate": 4.904492722987976e-05, - "loss": 0.3157, - "step": 960 - }, - { - "epoch": 0.09854719089708422, - "grad_norm": 2.171875, - "learning_rate": 4.902273378746738e-05, - "loss": 0.3077, - "step": 970 - }, - { - "epoch": 0.09956314131870364, - "grad_norm": 2.84375, - "learning_rate": 4.9000290577287165e-05, - "loss": 0.2756, - "step": 980 - }, - { - "epoch": 0.10057909174032308, - "grad_norm": 0.99609375, - "learning_rate": 4.897759783268434e-05, - "loss": 0.2915, - "step": 990 - }, - { - "epoch": 0.1015950421619425, - "grad_norm": 3.53125, - "learning_rate": 4.895465578959859e-05, - "loss": 0.2052, - "step": 1000 - } - ], - "logging_steps": 10, - "max_steps": 9843, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "total_flos": 0.0, - "train_batch_size": 7, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoints/checkpoint-1000/training_args.bin b/checkpoints/checkpoint-1000/training_args.bin deleted file mode 100644 index 6bb0a4ec10ab29ca1942aecdfa4212d352e373f1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-1000/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 -size 4984 diff --git a/checkpoints/checkpoint-1500/.DS_Store b/checkpoints/checkpoint-1500/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 Binary files a/checkpoints/checkpoint-1500/.DS_Store and /dev/null differ diff --git a/checkpoints/checkpoint-1500/adapter_config.json b/checkpoints/checkpoint-1500/adapter_config.json deleted file mode 100644 index 2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-1500/adapter_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "alpha_pattern": {}, - "auto_mapping": null, - "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": "gaussian", - "layer_replication": null, - "layers_pattern": null, - "layers_to_transform": null, - "loftq_config": {}, - "lora_alpha": 32, - "lora_dropout": 0.1, - "megatron_config": null, - "megatron_core": "megatron.core", - "modules_to_save": null, - "peft_type": "LORA", - "r": 8, - "rank_pattern": {}, - "revision": null, - "target_modules": [ - "v_proj", - "q_proj" - ], - "task_type": "CAUSAL_LM", - "use_dora": false, - "use_rslora": false -} \ No newline at end of file diff --git a/checkpoints/checkpoint-1500/adapter_model.safetensors b/checkpoints/checkpoint-1500/adapter_model.safetensors deleted file mode 100644 index 019ae5acc3c39c11dcf29cb8a850039b38bc7fa3..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-1500/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c0429f41a7679282d393919c7b40c076ed2bde721f9f95c9c14f1f57fa0b63f6 -size 5919456 diff --git a/checkpoints/checkpoint-1500/optimizer.pt b/checkpoints/checkpoint-1500/optimizer.pt deleted file mode 100644 index b8ea20aac602d99eebb3facca793da7481fb829f..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-1500/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f1cffc930b18a6507977b7928c70647d37429193d760799a24e4aa466c787fd3 -size 11930938 diff --git a/checkpoints/checkpoint-1500/rng_state_0.pth b/checkpoints/checkpoint-1500/rng_state_0.pth deleted file mode 100644 index dd23fb6362f7699ea3bcc22f01e5b102e65d2118..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-1500/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:af2583d0490e9c0c8a5832ae63a5d486d9951078b0ec9594ee0125a1807e7528 -size 15024 diff --git a/checkpoints/checkpoint-1500/rng_state_1.pth b/checkpoints/checkpoint-1500/rng_state_1.pth deleted file mode 100644 index d2851666bc50646272a2546dd1712a3eea0259bd..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-1500/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:af9fd6e5a8f29754daaa4f1a3c57f904108d748a165ca0e1ad16571d90e39fa3 -size 15024 diff --git a/checkpoints/checkpoint-1500/rng_state_2.pth b/checkpoints/checkpoint-1500/rng_state_2.pth deleted file mode 100644 index ac093b6104318e553a760be63eb06cd0d3f1ec17..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-1500/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bd55da1784f3e2dba7244fb29d3bbc59fbefd6b4bb1357a4ded5822c60485304 -size 15024 diff --git a/checkpoints/checkpoint-1500/rng_state_3.pth b/checkpoints/checkpoint-1500/rng_state_3.pth deleted file mode 100644 index 7ddbc69dcf0ed5d1b65deb88a8f1131eccc34e07..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-1500/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8b10bab8fcd02c56978a924d6d0be691a36e321523009f8caa015318d52823f2 -size 15024 diff --git a/checkpoints/checkpoint-1500/scheduler.pt b/checkpoints/checkpoint-1500/scheduler.pt deleted file mode 100644 index 51876d3aaf0be1aae071cb0c017d51b61448d7ed..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-1500/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:68a6cef91a2e578166a3e870fa8585312c20b665553a57976a77b1e7d2ca0ef3 -size 1064 diff --git a/checkpoints/checkpoint-1500/trainer_state.json b/checkpoints/checkpoint-1500/trainer_state.json deleted file mode 100644 index 228276cc158ac36aef26a0b87e0e57950412ba46..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-1500/trainer_state.json +++ /dev/null @@ -1,1071 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.15239256324291375, - "eval_steps": 500, - "global_step": 1500, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001015950421619425, - "grad_norm": 17.625, - "learning_rate": 5e-06, - "loss": 3.4264, - "step": 10 - }, - { - "epoch": 0.00203190084323885, - "grad_norm": 12.5625, - "learning_rate": 1e-05, - "loss": 3.432, - "step": 20 - }, - { - "epoch": 0.003047851264858275, - "grad_norm": 14.0625, - "learning_rate": 1.5e-05, - "loss": 3.23, - "step": 30 - }, - { - "epoch": 0.0040638016864777, - "grad_norm": 12.4375, - "learning_rate": 2e-05, - "loss": 2.9762, - "step": 40 - }, - { - "epoch": 0.005079752108097125, - "grad_norm": 10.0625, - "learning_rate": 2.5e-05, - "loss": 2.6173, - "step": 50 - }, - { - "epoch": 0.00609570252971655, - "grad_norm": 10.1875, - "learning_rate": 3e-05, - "loss": 2.2004, - "step": 60 - }, - { - "epoch": 0.007111652951335975, - "grad_norm": 7.03125, - "learning_rate": 3.5e-05, - "loss": 1.4176, - "step": 70 - }, - { - "epoch": 0.0081276033729554, - "grad_norm": 4.375, - "learning_rate": 4e-05, - "loss": 1.0122, - "step": 80 - }, - { - "epoch": 0.009143553794574825, - "grad_norm": 6.5625, - "learning_rate": 4.5e-05, - "loss": 0.9116, - "step": 90 - }, - { - "epoch": 0.01015950421619425, - "grad_norm": 5.28125, - "learning_rate": 5e-05, - "loss": 0.6832, - "step": 100 - }, - { - "epoch": 0.011175454637813675, - "grad_norm": 5.5, - "learning_rate": 4.9999870035728426e-05, - "loss": 0.7355, - "step": 110 - }, - { - "epoch": 0.0121914050594331, - "grad_norm": 5.1875, - "learning_rate": 4.9999480144264944e-05, - "loss": 0.6673, - "step": 120 - }, - { - "epoch": 0.013207355481052525, - "grad_norm": 4.5, - "learning_rate": 4.9998830329663314e-05, - "loss": 0.6792, - "step": 130 - }, - { - "epoch": 0.01422330590267195, - "grad_norm": 3.9375, - "learning_rate": 4.9997920598679756e-05, - "loss": 0.6207, - "step": 140 - }, - { - "epoch": 0.015239256324291375, - "grad_norm": 3.15625, - "learning_rate": 4.999675096077286e-05, - "loss": 0.483, - "step": 150 - }, - { - "epoch": 0.0162552067459108, - "grad_norm": 5.28125, - "learning_rate": 4.999532142810354e-05, - "loss": 0.5319, - "step": 160 - }, - { - "epoch": 0.017271157167530225, - "grad_norm": 4.59375, - "learning_rate": 4.999363201553483e-05, - "loss": 0.6052, - "step": 170 - }, - { - "epoch": 0.01828710758914965, - "grad_norm": 5.03125, - "learning_rate": 4.9991682740631794e-05, - "loss": 0.4258, - "step": 180 - }, - { - "epoch": 0.019303058010769075, - "grad_norm": 3.859375, - "learning_rate": 4.998947362366133e-05, - "loss": 0.4309, - "step": 190 - }, - { - "epoch": 0.0203190084323885, - "grad_norm": 3.328125, - "learning_rate": 4.998700468759193e-05, - "loss": 0.3957, - "step": 200 - }, - { - "epoch": 0.021334958854007924, - "grad_norm": 4.9375, - "learning_rate": 4.9984275958093475e-05, - "loss": 0.4777, - "step": 210 - }, - { - "epoch": 0.02235090927562735, - "grad_norm": 4.78125, - "learning_rate": 4.998128746353695e-05, - "loss": 0.3549, - "step": 220 - }, - { - "epoch": 0.023366859697246774, - "grad_norm": 4.0625, - "learning_rate": 4.997803923499417e-05, - "loss": 0.4447, - "step": 230 - }, - { - "epoch": 0.0243828101188662, - "grad_norm": 6.375, - "learning_rate": 4.99745313062374e-05, - "loss": 0.3808, - "step": 240 - }, - { - "epoch": 0.025398760540485624, - "grad_norm": 3.59375, - "learning_rate": 4.99707637137391e-05, - "loss": 0.3827, - "step": 250 - }, - { - "epoch": 0.02641471096210505, - "grad_norm": 3.015625, - "learning_rate": 4.996673649667145e-05, - "loss": 0.3694, - "step": 260 - }, - { - "epoch": 0.027430661383724474, - "grad_norm": 2.296875, - "learning_rate": 4.9962449696906e-05, - "loss": 0.3586, - "step": 270 - }, - { - "epoch": 0.0284466118053439, - "grad_norm": 4.125, - "learning_rate": 4.9957903359013214e-05, - "loss": 0.3832, - "step": 280 - }, - { - "epoch": 0.029462562226963324, - "grad_norm": 3.296875, - "learning_rate": 4.995309753026201e-05, - "loss": 0.328, - "step": 290 - }, - { - "epoch": 0.03047851264858275, - "grad_norm": 4.5, - "learning_rate": 4.994803226061927e-05, - "loss": 0.3667, - "step": 300 - }, - { - "epoch": 0.03149446307020217, - "grad_norm": 4.3125, - "learning_rate": 4.994270760274933e-05, - "loss": 0.3811, - "step": 310 - }, - { - "epoch": 0.0325104134918216, - "grad_norm": 3.421875, - "learning_rate": 4.99371236120134e-05, - "loss": 0.3065, - "step": 320 - }, - { - "epoch": 0.03352636391344102, - "grad_norm": 4.6875, - "learning_rate": 4.993128034646902e-05, - "loss": 0.4177, - "step": 330 - }, - { - "epoch": 0.03454231433506045, - "grad_norm": 3.046875, - "learning_rate": 4.992517786686947e-05, - "loss": 0.33, - "step": 340 - }, - { - "epoch": 0.03555826475667987, - "grad_norm": 1.8828125, - "learning_rate": 4.9918816236663077e-05, - "loss": 0.3287, - "step": 350 - }, - { - "epoch": 0.0365742151782993, - "grad_norm": 3.8125, - "learning_rate": 4.991219552199262e-05, - "loss": 0.2934, - "step": 360 - }, - { - "epoch": 0.03759016559991872, - "grad_norm": 4.28125, - "learning_rate": 4.99053157916946e-05, - "loss": 0.3176, - "step": 370 - }, - { - "epoch": 0.03860611602153815, - "grad_norm": 2.609375, - "learning_rate": 4.989817711729856e-05, - "loss": 0.3318, - "step": 380 - }, - { - "epoch": 0.03962206644315757, - "grad_norm": 2.375, - "learning_rate": 4.98907795730263e-05, - "loss": 0.3234, - "step": 390 - }, - { - "epoch": 0.040638016864777, - "grad_norm": 4.46875, - "learning_rate": 4.988312323579114e-05, - "loss": 0.267, - "step": 400 - }, - { - "epoch": 0.04165396728639642, - "grad_norm": 3.75, - "learning_rate": 4.98752081851971e-05, - "loss": 0.3081, - "step": 410 - }, - { - "epoch": 0.04266991770801585, - "grad_norm": 2.203125, - "learning_rate": 4.986703450353809e-05, - "loss": 0.2917, - "step": 420 - }, - { - "epoch": 0.04368586812963527, - "grad_norm": 1.6015625, - "learning_rate": 4.985860227579703e-05, - "loss": 0.2805, - "step": 430 - }, - { - "epoch": 0.0447018185512547, - "grad_norm": 3.140625, - "learning_rate": 4.984991158964499e-05, - "loss": 0.3534, - "step": 440 - }, - { - "epoch": 0.04571776897287412, - "grad_norm": 3.296875, - "learning_rate": 4.9840962535440265e-05, - "loss": 0.335, - "step": 450 - }, - { - "epoch": 0.04673371939449355, - "grad_norm": 3.25, - "learning_rate": 4.983175520622744e-05, - "loss": 0.2544, - "step": 460 - }, - { - "epoch": 0.04774966981611297, - "grad_norm": 2.25, - "learning_rate": 4.982228969773642e-05, - "loss": 0.3449, - "step": 470 - }, - { - "epoch": 0.0487656202377324, - "grad_norm": 4.9375, - "learning_rate": 4.9812566108381435e-05, - "loss": 0.2964, - "step": 480 - }, - { - "epoch": 0.04978157065935182, - "grad_norm": 1.5703125, - "learning_rate": 4.9802584539260035e-05, - "loss": 0.2799, - "step": 490 - }, - { - "epoch": 0.05079752108097125, - "grad_norm": 2.828125, - "learning_rate": 4.979234509415199e-05, - "loss": 0.3231, - "step": 500 - }, - { - "epoch": 0.05181347150259067, - "grad_norm": 2.9375, - "learning_rate": 4.978184787951828e-05, - "loss": 0.2943, - "step": 510 - }, - { - "epoch": 0.0528294219242101, - "grad_norm": 2.34375, - "learning_rate": 4.977109300449992e-05, - "loss": 0.2705, - "step": 520 - }, - { - "epoch": 0.05384537234582952, - "grad_norm": 3.140625, - "learning_rate": 4.9760080580916876e-05, - "loss": 0.2998, - "step": 530 - }, - { - "epoch": 0.05486132276744895, - "grad_norm": 3.5625, - "learning_rate": 4.974881072326688e-05, - "loss": 0.2595, - "step": 540 - }, - { - "epoch": 0.05587727318906837, - "grad_norm": 4.25, - "learning_rate": 4.9737283548724236e-05, - "loss": 0.2803, - "step": 550 - }, - { - "epoch": 0.0568932236106878, - "grad_norm": 4.0625, - "learning_rate": 4.97254991771386e-05, - "loss": 0.3511, - "step": 560 - }, - { - "epoch": 0.05790917403230722, - "grad_norm": 2.515625, - "learning_rate": 4.971345773103377e-05, - "loss": 0.312, - "step": 570 - }, - { - "epoch": 0.05892512445392665, - "grad_norm": 3.21875, - "learning_rate": 4.9701159335606365e-05, - "loss": 0.2482, - "step": 580 - }, - { - "epoch": 0.05994107487554607, - "grad_norm": 5.5, - "learning_rate": 4.968860411872454e-05, - "loss": 0.2537, - "step": 590 - }, - { - "epoch": 0.0609570252971655, - "grad_norm": 3.546875, - "learning_rate": 4.967579221092666e-05, - "loss": 0.3125, - "step": 600 - }, - { - "epoch": 0.06197297571878492, - "grad_norm": 2.984375, - "learning_rate": 4.966272374541996e-05, - "loss": 0.2354, - "step": 610 - }, - { - "epoch": 0.06298892614040434, - "grad_norm": 3.6875, - "learning_rate": 4.964939885807912e-05, - "loss": 0.3213, - "step": 620 - }, - { - "epoch": 0.06400487656202378, - "grad_norm": 2.140625, - "learning_rate": 4.9635817687444876e-05, - "loss": 0.3003, - "step": 630 - }, - { - "epoch": 0.0650208269836432, - "grad_norm": 3.484375, - "learning_rate": 4.962198037472259e-05, - "loss": 0.2996, - "step": 640 - }, - { - "epoch": 0.06603677740526262, - "grad_norm": 3.21875, - "learning_rate": 4.9607887063780776e-05, - "loss": 0.2257, - "step": 650 - }, - { - "epoch": 0.06705272782688204, - "grad_norm": 5.375, - "learning_rate": 4.9593537901149564e-05, - "loss": 0.223, - "step": 660 - }, - { - "epoch": 0.06806867824850148, - "grad_norm": 4.1875, - "learning_rate": 4.957893303601924e-05, - "loss": 0.3407, - "step": 670 - }, - { - "epoch": 0.0690846286701209, - "grad_norm": 3.328125, - "learning_rate": 4.956407262023866e-05, - "loss": 0.2589, - "step": 680 - }, - { - "epoch": 0.07010057909174032, - "grad_norm": 2.953125, - "learning_rate": 4.954895680831367e-05, - "loss": 0.2949, - "step": 690 - }, - { - "epoch": 0.07111652951335974, - "grad_norm": 4.0625, - "learning_rate": 4.9533585757405506e-05, - "loss": 0.2995, - "step": 700 - }, - { - "epoch": 0.07213247993497918, - "grad_norm": 4.625, - "learning_rate": 4.951795962732917e-05, - "loss": 0.2894, - "step": 710 - }, - { - "epoch": 0.0731484303565986, - "grad_norm": 3.0, - "learning_rate": 4.9502078580551755e-05, - "loss": 0.3082, - "step": 720 - }, - { - "epoch": 0.07416438077821802, - "grad_norm": 3.65625, - "learning_rate": 4.9485942782190734e-05, - "loss": 0.2308, - "step": 730 - }, - { - "epoch": 0.07518033119983744, - "grad_norm": 4.78125, - "learning_rate": 4.9469552400012306e-05, - "loss": 0.2272, - "step": 740 - }, - { - "epoch": 0.07619628162145688, - "grad_norm": 4.25, - "learning_rate": 4.94529076044296e-05, - "loss": 0.2701, - "step": 750 - }, - { - "epoch": 0.0772122320430763, - "grad_norm": 3.140625, - "learning_rate": 4.94360085685009e-05, - "loss": 0.2686, - "step": 760 - }, - { - "epoch": 0.07822818246469572, - "grad_norm": 0.765625, - "learning_rate": 4.9418855467927894e-05, - "loss": 0.2051, - "step": 770 - }, - { - "epoch": 0.07924413288631514, - "grad_norm": 1.796875, - "learning_rate": 4.940144848105379e-05, - "loss": 0.2267, - "step": 780 - }, - { - "epoch": 0.08026008330793458, - "grad_norm": 4.5625, - "learning_rate": 4.93837877888615e-05, - "loss": 0.2597, - "step": 790 - }, - { - "epoch": 0.081276033729554, - "grad_norm": 3.03125, - "learning_rate": 4.9365873574971745e-05, - "loss": 0.3701, - "step": 800 - }, - { - "epoch": 0.08229198415117342, - "grad_norm": 4.5625, - "learning_rate": 4.9347706025641136e-05, - "loss": 0.2559, - "step": 810 - }, - { - "epoch": 0.08330793457279284, - "grad_norm": 3.90625, - "learning_rate": 4.9329285329760275e-05, - "loss": 0.2799, - "step": 820 - }, - { - "epoch": 0.08432388499441228, - "grad_norm": 3.140625, - "learning_rate": 4.9310611678851735e-05, - "loss": 0.2866, - "step": 830 - }, - { - "epoch": 0.0853398354160317, - "grad_norm": 2.46875, - "learning_rate": 4.929168526706811e-05, - "loss": 0.3105, - "step": 840 - }, - { - "epoch": 0.08635578583765112, - "grad_norm": 13.625, - "learning_rate": 4.927250629119e-05, - "loss": 0.2454, - "step": 850 - }, - { - "epoch": 0.08737173625927054, - "grad_norm": 3.921875, - "learning_rate": 4.9253074950623925e-05, - "loss": 0.2424, - "step": 860 - }, - { - "epoch": 0.08838768668088998, - "grad_norm": 2.90625, - "learning_rate": 4.9233391447400286e-05, - "loss": 0.2481, - "step": 870 - }, - { - "epoch": 0.0894036371025094, - "grad_norm": 2.96875, - "learning_rate": 4.921345598617125e-05, - "loss": 0.2231, - "step": 880 - }, - { - "epoch": 0.09041958752412882, - "grad_norm": 5.375, - "learning_rate": 4.9193268774208654e-05, - "loss": 0.3447, - "step": 890 - }, - { - "epoch": 0.09143553794574824, - "grad_norm": 2.0, - "learning_rate": 4.9172830021401785e-05, - "loss": 0.229, - "step": 900 - }, - { - "epoch": 0.09245148836736768, - "grad_norm": 3.1875, - "learning_rate": 4.9152139940255245e-05, - "loss": 0.2122, - "step": 910 - }, - { - "epoch": 0.0934674387889871, - "grad_norm": 3.40625, - "learning_rate": 4.913119874588677e-05, - "loss": 0.2386, - "step": 920 - }, - { - "epoch": 0.09448338921060652, - "grad_norm": 1.4609375, - "learning_rate": 4.911000665602489e-05, - "loss": 0.1944, - "step": 930 - }, - { - "epoch": 0.09549933963222594, - "grad_norm": 5.0625, - "learning_rate": 4.9088563891006786e-05, - "loss": 0.2038, - "step": 940 - }, - { - "epoch": 0.09651529005384538, - "grad_norm": 4.53125, - "learning_rate": 4.906687067377592e-05, - "loss": 0.3122, - "step": 950 - }, - { - "epoch": 0.0975312404754648, - "grad_norm": 2.84375, - "learning_rate": 4.904492722987976e-05, - "loss": 0.3157, - "step": 960 - }, - { - "epoch": 0.09854719089708422, - "grad_norm": 2.171875, - "learning_rate": 4.902273378746738e-05, - "loss": 0.3077, - "step": 970 - }, - { - "epoch": 0.09956314131870364, - "grad_norm": 2.84375, - "learning_rate": 4.9000290577287165e-05, - "loss": 0.2756, - "step": 980 - }, - { - "epoch": 0.10057909174032308, - "grad_norm": 0.99609375, - "learning_rate": 4.897759783268434e-05, - "loss": 0.2915, - "step": 990 - }, - { - "epoch": 0.1015950421619425, - "grad_norm": 3.53125, - "learning_rate": 4.895465578959859e-05, - "loss": 0.2052, - "step": 1000 - }, - { - "epoch": 0.10261099258356192, - "grad_norm": 4.0, - "learning_rate": 4.893146468656159e-05, - "loss": 0.2499, - "step": 1010 - }, - { - "epoch": 0.10362694300518134, - "grad_norm": 1.65625, - "learning_rate": 4.890802476469452e-05, - "loss": 0.278, - "step": 1020 - }, - { - "epoch": 0.10464289342680078, - "grad_norm": 3.625, - "learning_rate": 4.888433626770558e-05, - "loss": 0.2143, - "step": 1030 - }, - { - "epoch": 0.1056588438484202, - "grad_norm": 5.0625, - "learning_rate": 4.886039944188741e-05, - "loss": 0.2878, - "step": 1040 - }, - { - "epoch": 0.10667479427003962, - "grad_norm": 4.5, - "learning_rate": 4.883621453611461e-05, - "loss": 0.2744, - "step": 1050 - }, - { - "epoch": 0.10769074469165904, - "grad_norm": 4.5625, - "learning_rate": 4.881178180184106e-05, - "loss": 0.2734, - "step": 1060 - }, - { - "epoch": 0.10870669511327848, - "grad_norm": 3.125, - "learning_rate": 4.878710149309735e-05, - "loss": 0.3574, - "step": 1070 - }, - { - "epoch": 0.1097226455348979, - "grad_norm": 3.0625, - "learning_rate": 4.876217386648816e-05, - "loss": 0.2625, - "step": 1080 - }, - { - "epoch": 0.11073859595651732, - "grad_norm": 4.0625, - "learning_rate": 4.873699918118955e-05, - "loss": 0.2437, - "step": 1090 - }, - { - "epoch": 0.11175454637813674, - "grad_norm": 1.59375, - "learning_rate": 4.87115776989463e-05, - "loss": 0.2051, - "step": 1100 - }, - { - "epoch": 0.11277049679975618, - "grad_norm": 4.375, - "learning_rate": 4.8685909684069153e-05, - "loss": 0.1727, - "step": 1110 - }, - { - "epoch": 0.1137864472213756, - "grad_norm": 2.28125, - "learning_rate": 4.865999540343211e-05, - "loss": 0.2256, - "step": 1120 - }, - { - "epoch": 0.11480239764299502, - "grad_norm": 2.265625, - "learning_rate": 4.86338351264696e-05, - "loss": 0.3529, - "step": 1130 - }, - { - "epoch": 0.11581834806461444, - "grad_norm": 2.34375, - "learning_rate": 4.8607429125173754e-05, - "loss": 0.2113, - "step": 1140 - }, - { - "epoch": 0.11683429848623388, - "grad_norm": 0.7578125, - "learning_rate": 4.858077767409149e-05, - "loss": 0.2759, - "step": 1150 - }, - { - "epoch": 0.1178502489078533, - "grad_norm": 3.640625, - "learning_rate": 4.855388105032174e-05, - "loss": 0.2482, - "step": 1160 - }, - { - "epoch": 0.11886619932947272, - "grad_norm": 3.5, - "learning_rate": 4.852673953351249e-05, - "loss": 0.1865, - "step": 1170 - }, - { - "epoch": 0.11988214975109214, - "grad_norm": 3.75, - "learning_rate": 4.849935340585796e-05, - "loss": 0.2659, - "step": 1180 - }, - { - "epoch": 0.12089810017271158, - "grad_norm": 3.375, - "learning_rate": 4.8471722952095586e-05, - "loss": 0.1506, - "step": 1190 - }, - { - "epoch": 0.121914050594331, - "grad_norm": 3.34375, - "learning_rate": 4.844384845950312e-05, - "loss": 0.307, - "step": 1200 - }, - { - "epoch": 0.12293000101595042, - "grad_norm": 1.578125, - "learning_rate": 4.841573021789561e-05, - "loss": 0.1952, - "step": 1210 - }, - { - "epoch": 0.12394595143756984, - "grad_norm": 1.2890625, - "learning_rate": 4.838736851962239e-05, - "loss": 0.1779, - "step": 1220 - }, - { - "epoch": 0.12496190185918928, - "grad_norm": 1.265625, - "learning_rate": 4.835876365956408e-05, - "loss": 0.1235, - "step": 1230 - }, - { - "epoch": 0.12597785228080868, - "grad_norm": 1.9609375, - "learning_rate": 4.8329915935129436e-05, - "loss": 0.1876, - "step": 1240 - }, - { - "epoch": 0.12699380270242813, - "grad_norm": 1.6328125, - "learning_rate": 4.830082564625235e-05, - "loss": 0.2188, - "step": 1250 - }, - { - "epoch": 0.12800975312404755, - "grad_norm": 3.96875, - "learning_rate": 4.8271493095388684e-05, - "loss": 0.2622, - "step": 1260 - }, - { - "epoch": 0.12902570354566698, - "grad_norm": 3.765625, - "learning_rate": 4.824191858751312e-05, - "loss": 0.2724, - "step": 1270 - }, - { - "epoch": 0.1300416539672864, - "grad_norm": 5.59375, - "learning_rate": 4.821210243011601e-05, - "loss": 0.2413, - "step": 1280 - }, - { - "epoch": 0.13105760438890582, - "grad_norm": 3.34375, - "learning_rate": 4.818204493320016e-05, - "loss": 0.2618, - "step": 1290 - }, - { - "epoch": 0.13207355481052524, - "grad_norm": 2.78125, - "learning_rate": 4.8151746409277634e-05, - "loss": 0.2295, - "step": 1300 - }, - { - "epoch": 0.13308950523214466, - "grad_norm": 3.1875, - "learning_rate": 4.8121207173366484e-05, - "loss": 0.2733, - "step": 1310 - }, - { - "epoch": 0.13410545565376408, - "grad_norm": 2.28125, - "learning_rate": 4.809042754298746e-05, - "loss": 0.2311, - "step": 1320 - }, - { - "epoch": 0.13512140607538353, - "grad_norm": 2.171875, - "learning_rate": 4.805940783816075e-05, - "loss": 0.2059, - "step": 1330 - }, - { - "epoch": 0.13613735649700295, - "grad_norm": 2.796875, - "learning_rate": 4.8028148381402625e-05, - "loss": 0.2102, - "step": 1340 - }, - { - "epoch": 0.13715330691862238, - "grad_norm": 2.96875, - "learning_rate": 4.7996649497722084e-05, - "loss": 0.2708, - "step": 1350 - }, - { - "epoch": 0.1381692573402418, - "grad_norm": 2.4375, - "learning_rate": 4.7964911514617485e-05, - "loss": 0.2429, - "step": 1360 - }, - { - "epoch": 0.13918520776186122, - "grad_norm": 5.8125, - "learning_rate": 4.793293476207312e-05, - "loss": 0.2725, - "step": 1370 - }, - { - "epoch": 0.14020115818348064, - "grad_norm": 2.40625, - "learning_rate": 4.790071957255585e-05, - "loss": 0.2098, - "step": 1380 - }, - { - "epoch": 0.14121710860510006, - "grad_norm": 4.25, - "learning_rate": 4.786826628101154e-05, - "loss": 0.2101, - "step": 1390 - }, - { - "epoch": 0.14223305902671948, - "grad_norm": 2.578125, - "learning_rate": 4.783557522486167e-05, - "loss": 0.2624, - "step": 1400 - }, - { - "epoch": 0.14324900944833893, - "grad_norm": 3.125, - "learning_rate": 4.780264674399978e-05, - "loss": 0.2518, - "step": 1410 - }, - { - "epoch": 0.14426495986995835, - "grad_norm": 3.671875, - "learning_rate": 4.7769481180787966e-05, - "loss": 0.3112, - "step": 1420 - }, - { - "epoch": 0.14528091029157778, - "grad_norm": 3.984375, - "learning_rate": 4.773607888005327e-05, - "loss": 0.2747, - "step": 1430 - }, - { - "epoch": 0.1462968607131972, - "grad_norm": 3.234375, - "learning_rate": 4.770244018908416e-05, - "loss": 0.1572, - "step": 1440 - }, - { - "epoch": 0.14731281113481662, - "grad_norm": 4.09375, - "learning_rate": 4.766856545762687e-05, - "loss": 0.2148, - "step": 1450 - }, - { - "epoch": 0.14832876155643604, - "grad_norm": 1.6875, - "learning_rate": 4.763445503788178e-05, - "loss": 0.2531, - "step": 1460 - }, - { - "epoch": 0.14934471197805546, - "grad_norm": 2.375, - "learning_rate": 4.760010928449976e-05, - "loss": 0.199, - "step": 1470 - }, - { - "epoch": 0.15036066239967488, - "grad_norm": 4.6875, - "learning_rate": 4.7565528554578485e-05, - "loss": 0.2366, - "step": 1480 - }, - { - "epoch": 0.15137661282129433, - "grad_norm": 5.4375, - "learning_rate": 4.75307132076587e-05, - "loss": 0.1862, - "step": 1490 - }, - { - "epoch": 0.15239256324291375, - "grad_norm": 2.484375, - "learning_rate": 4.749566360572049e-05, - "loss": 0.2143, - "step": 1500 - } - ], - "logging_steps": 10, - "max_steps": 9843, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "total_flos": 0.0, - "train_batch_size": 7, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoints/checkpoint-1500/training_args.bin b/checkpoints/checkpoint-1500/training_args.bin deleted file mode 100644 index 6bb0a4ec10ab29ca1942aecdfa4212d352e373f1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-1500/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 -size 4984 diff --git a/checkpoints/checkpoint-2000/.DS_Store b/checkpoints/checkpoint-2000/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 Binary files a/checkpoints/checkpoint-2000/.DS_Store and /dev/null differ diff --git a/checkpoints/checkpoint-2000/adapter_config.json b/checkpoints/checkpoint-2000/adapter_config.json deleted file mode 100644 index 2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-2000/adapter_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "alpha_pattern": {}, - "auto_mapping": null, - "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": "gaussian", - "layer_replication": null, - "layers_pattern": null, - "layers_to_transform": null, - "loftq_config": {}, - "lora_alpha": 32, - "lora_dropout": 0.1, - "megatron_config": null, - "megatron_core": "megatron.core", - "modules_to_save": null, - "peft_type": "LORA", - "r": 8, - "rank_pattern": {}, - "revision": null, - "target_modules": [ - "v_proj", - "q_proj" - ], - "task_type": "CAUSAL_LM", - "use_dora": false, - "use_rslora": false -} \ No newline at end of file diff --git a/checkpoints/checkpoint-2000/adapter_model.safetensors b/checkpoints/checkpoint-2000/adapter_model.safetensors deleted file mode 100644 index 4c42278d439b28a8eb84d8f3d7b93d55df223451..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-2000/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4f511e364452e06b28c03299a544dc2d5fd730fdfd45007cd4ec530e30144139 -size 5919456 diff --git a/checkpoints/checkpoint-2000/optimizer.pt b/checkpoints/checkpoint-2000/optimizer.pt deleted file mode 100644 index 27718bc4fc2e46b271db6199e877b9ca8ddec162..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-2000/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4121e055d593f91209d1cfca077444bb962a8848054f778e04d22d7e269a91e3 -size 11930938 diff --git a/checkpoints/checkpoint-2000/rng_state_0.pth b/checkpoints/checkpoint-2000/rng_state_0.pth deleted file mode 100644 index 3b1b550785550980db2dab7c4db776074d507397..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-2000/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7814979e8476866bd135a55e64d375b840c3c1436aa60c3d69ece3f0a10c3408 -size 15024 diff --git a/checkpoints/checkpoint-2000/rng_state_1.pth b/checkpoints/checkpoint-2000/rng_state_1.pth deleted file mode 100644 index 7a3ed4fbc5afe1f89fd45284b352bf27deedb8ea..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-2000/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b4b8a9f5b33e3bc1f4b31217176bbabc65ace6c56a7bf77b1b7153dc062ba709 -size 15024 diff --git a/checkpoints/checkpoint-2000/rng_state_2.pth b/checkpoints/checkpoint-2000/rng_state_2.pth deleted file mode 100644 index 6f9d319e7d0f09d65d040354c5e0320b59dec0f0..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-2000/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:70445c98497e5f1d7bea8f44a93bc0211d0226177b834ac2949fe24e3b538d05 -size 15024 diff --git a/checkpoints/checkpoint-2000/rng_state_3.pth b/checkpoints/checkpoint-2000/rng_state_3.pth deleted file mode 100644 index c23f68acc06264aeb9b09bf60269640ce8b147c8..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-2000/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4c244dce3fafef8abbf503718fb81319ff12831928a8845134b78a845e0c6e14 -size 15024 diff --git a/checkpoints/checkpoint-2000/scheduler.pt b/checkpoints/checkpoint-2000/scheduler.pt deleted file mode 100644 index c133553a2b6f7143f327ed61a33a83fcc4a43b74..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-2000/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f9cf822c73733a63e3bebbb2802abc0acedf3d9423d9f25c6bb103f490864c06 -size 1064 diff --git a/checkpoints/checkpoint-2000/trainer_state.json b/checkpoints/checkpoint-2000/trainer_state.json deleted file mode 100644 index e7a9301a9ac4e7f3efde74d978b93fbd74160b85..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-2000/trainer_state.json +++ /dev/null @@ -1,1421 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.203190084323885, - "eval_steps": 500, - "global_step": 2000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001015950421619425, - "grad_norm": 17.625, - "learning_rate": 5e-06, - "loss": 3.4264, - "step": 10 - }, - { - "epoch": 0.00203190084323885, - "grad_norm": 12.5625, - "learning_rate": 1e-05, - "loss": 3.432, - "step": 20 - }, - { - "epoch": 0.003047851264858275, - "grad_norm": 14.0625, - "learning_rate": 1.5e-05, - "loss": 3.23, - "step": 30 - }, - { - "epoch": 0.0040638016864777, - "grad_norm": 12.4375, - "learning_rate": 2e-05, - "loss": 2.9762, - "step": 40 - }, - { - "epoch": 0.005079752108097125, - "grad_norm": 10.0625, - "learning_rate": 2.5e-05, - "loss": 2.6173, - "step": 50 - }, - { - "epoch": 0.00609570252971655, - "grad_norm": 10.1875, - "learning_rate": 3e-05, - "loss": 2.2004, - "step": 60 - }, - { - "epoch": 0.007111652951335975, - "grad_norm": 7.03125, - "learning_rate": 3.5e-05, - "loss": 1.4176, - "step": 70 - }, - { - "epoch": 0.0081276033729554, - "grad_norm": 4.375, - "learning_rate": 4e-05, - "loss": 1.0122, - "step": 80 - }, - { - "epoch": 0.009143553794574825, - "grad_norm": 6.5625, - "learning_rate": 4.5e-05, - "loss": 0.9116, - "step": 90 - }, - { - "epoch": 0.01015950421619425, - "grad_norm": 5.28125, - "learning_rate": 5e-05, - "loss": 0.6832, - "step": 100 - }, - { - "epoch": 0.011175454637813675, - "grad_norm": 5.5, - "learning_rate": 4.9999870035728426e-05, - "loss": 0.7355, - "step": 110 - }, - { - "epoch": 0.0121914050594331, - "grad_norm": 5.1875, - "learning_rate": 4.9999480144264944e-05, - "loss": 0.6673, - "step": 120 - }, - { - "epoch": 0.013207355481052525, - "grad_norm": 4.5, - "learning_rate": 4.9998830329663314e-05, - "loss": 0.6792, - "step": 130 - }, - { - "epoch": 0.01422330590267195, - "grad_norm": 3.9375, - "learning_rate": 4.9997920598679756e-05, - "loss": 0.6207, - "step": 140 - }, - { - "epoch": 0.015239256324291375, - "grad_norm": 3.15625, - "learning_rate": 4.999675096077286e-05, - "loss": 0.483, - "step": 150 - }, - { - "epoch": 0.0162552067459108, - "grad_norm": 5.28125, - "learning_rate": 4.999532142810354e-05, - "loss": 0.5319, - "step": 160 - }, - { - "epoch": 0.017271157167530225, - "grad_norm": 4.59375, - "learning_rate": 4.999363201553483e-05, - "loss": 0.6052, - "step": 170 - }, - { - "epoch": 0.01828710758914965, - "grad_norm": 5.03125, - "learning_rate": 4.9991682740631794e-05, - "loss": 0.4258, - "step": 180 - }, - { - "epoch": 0.019303058010769075, - "grad_norm": 3.859375, - "learning_rate": 4.998947362366133e-05, - "loss": 0.4309, - "step": 190 - }, - { - "epoch": 0.0203190084323885, - "grad_norm": 3.328125, - "learning_rate": 4.998700468759193e-05, - "loss": 0.3957, - "step": 200 - }, - { - "epoch": 0.021334958854007924, - "grad_norm": 4.9375, - "learning_rate": 4.9984275958093475e-05, - "loss": 0.4777, - "step": 210 - }, - { - "epoch": 0.02235090927562735, - "grad_norm": 4.78125, - "learning_rate": 4.998128746353695e-05, - "loss": 0.3549, - "step": 220 - }, - { - "epoch": 0.023366859697246774, - "grad_norm": 4.0625, - "learning_rate": 4.997803923499417e-05, - "loss": 0.4447, - "step": 230 - }, - { - "epoch": 0.0243828101188662, - "grad_norm": 6.375, - "learning_rate": 4.99745313062374e-05, - "loss": 0.3808, - "step": 240 - }, - { - "epoch": 0.025398760540485624, - "grad_norm": 3.59375, - "learning_rate": 4.99707637137391e-05, - "loss": 0.3827, - "step": 250 - }, - { - "epoch": 0.02641471096210505, - "grad_norm": 3.015625, - "learning_rate": 4.996673649667145e-05, - "loss": 0.3694, - "step": 260 - }, - { - "epoch": 0.027430661383724474, - "grad_norm": 2.296875, - "learning_rate": 4.9962449696906e-05, - "loss": 0.3586, - "step": 270 - }, - { - "epoch": 0.0284466118053439, - "grad_norm": 4.125, - "learning_rate": 4.9957903359013214e-05, - "loss": 0.3832, - "step": 280 - }, - { - "epoch": 0.029462562226963324, - "grad_norm": 3.296875, - "learning_rate": 4.995309753026201e-05, - "loss": 0.328, - "step": 290 - }, - { - "epoch": 0.03047851264858275, - "grad_norm": 4.5, - "learning_rate": 4.994803226061927e-05, - "loss": 0.3667, - "step": 300 - }, - { - "epoch": 0.03149446307020217, - "grad_norm": 4.3125, - "learning_rate": 4.994270760274933e-05, - "loss": 0.3811, - "step": 310 - }, - { - "epoch": 0.0325104134918216, - "grad_norm": 3.421875, - "learning_rate": 4.99371236120134e-05, - "loss": 0.3065, - "step": 320 - }, - { - "epoch": 0.03352636391344102, - "grad_norm": 4.6875, - "learning_rate": 4.993128034646902e-05, - "loss": 0.4177, - "step": 330 - }, - { - "epoch": 0.03454231433506045, - "grad_norm": 3.046875, - "learning_rate": 4.992517786686947e-05, - "loss": 0.33, - "step": 340 - }, - { - "epoch": 0.03555826475667987, - "grad_norm": 1.8828125, - "learning_rate": 4.9918816236663077e-05, - "loss": 0.3287, - "step": 350 - }, - { - "epoch": 0.0365742151782993, - "grad_norm": 3.8125, - "learning_rate": 4.991219552199262e-05, - "loss": 0.2934, - "step": 360 - }, - { - "epoch": 0.03759016559991872, - "grad_norm": 4.28125, - "learning_rate": 4.99053157916946e-05, - "loss": 0.3176, - "step": 370 - }, - { - "epoch": 0.03860611602153815, - "grad_norm": 2.609375, - "learning_rate": 4.989817711729856e-05, - "loss": 0.3318, - "step": 380 - }, - { - "epoch": 0.03962206644315757, - "grad_norm": 2.375, - "learning_rate": 4.98907795730263e-05, - "loss": 0.3234, - "step": 390 - }, - { - "epoch": 0.040638016864777, - "grad_norm": 4.46875, - "learning_rate": 4.988312323579114e-05, - "loss": 0.267, - "step": 400 - }, - { - "epoch": 0.04165396728639642, - "grad_norm": 3.75, - "learning_rate": 4.98752081851971e-05, - "loss": 0.3081, - "step": 410 - }, - { - "epoch": 0.04266991770801585, - "grad_norm": 2.203125, - "learning_rate": 4.986703450353809e-05, - "loss": 0.2917, - "step": 420 - }, - { - "epoch": 0.04368586812963527, - "grad_norm": 1.6015625, - "learning_rate": 4.985860227579703e-05, - "loss": 0.2805, - "step": 430 - }, - { - "epoch": 0.0447018185512547, - "grad_norm": 3.140625, - "learning_rate": 4.984991158964499e-05, - "loss": 0.3534, - "step": 440 - }, - { - "epoch": 0.04571776897287412, - "grad_norm": 3.296875, - "learning_rate": 4.9840962535440265e-05, - "loss": 0.335, - "step": 450 - }, - { - "epoch": 0.04673371939449355, - "grad_norm": 3.25, - "learning_rate": 4.983175520622744e-05, - "loss": 0.2544, - "step": 460 - }, - { - "epoch": 0.04774966981611297, - "grad_norm": 2.25, - "learning_rate": 4.982228969773642e-05, - "loss": 0.3449, - "step": 470 - }, - { - "epoch": 0.0487656202377324, - "grad_norm": 4.9375, - "learning_rate": 4.9812566108381435e-05, - "loss": 0.2964, - "step": 480 - }, - { - "epoch": 0.04978157065935182, - "grad_norm": 1.5703125, - "learning_rate": 4.9802584539260035e-05, - "loss": 0.2799, - "step": 490 - }, - { - "epoch": 0.05079752108097125, - "grad_norm": 2.828125, - "learning_rate": 4.979234509415199e-05, - "loss": 0.3231, - "step": 500 - }, - { - "epoch": 0.05181347150259067, - "grad_norm": 2.9375, - "learning_rate": 4.978184787951828e-05, - "loss": 0.2943, - "step": 510 - }, - { - "epoch": 0.0528294219242101, - "grad_norm": 2.34375, - "learning_rate": 4.977109300449992e-05, - "loss": 0.2705, - "step": 520 - }, - { - "epoch": 0.05384537234582952, - "grad_norm": 3.140625, - "learning_rate": 4.9760080580916876e-05, - "loss": 0.2998, - "step": 530 - }, - { - "epoch": 0.05486132276744895, - "grad_norm": 3.5625, - "learning_rate": 4.974881072326688e-05, - "loss": 0.2595, - "step": 540 - }, - { - "epoch": 0.05587727318906837, - "grad_norm": 4.25, - "learning_rate": 4.9737283548724236e-05, - "loss": 0.2803, - "step": 550 - }, - { - "epoch": 0.0568932236106878, - "grad_norm": 4.0625, - "learning_rate": 4.97254991771386e-05, - "loss": 0.3511, - "step": 560 - }, - { - "epoch": 0.05790917403230722, - "grad_norm": 2.515625, - "learning_rate": 4.971345773103377e-05, - "loss": 0.312, - "step": 570 - }, - { - "epoch": 0.05892512445392665, - "grad_norm": 3.21875, - "learning_rate": 4.9701159335606365e-05, - "loss": 0.2482, - "step": 580 - }, - { - "epoch": 0.05994107487554607, - "grad_norm": 5.5, - "learning_rate": 4.968860411872454e-05, - "loss": 0.2537, - "step": 590 - }, - { - "epoch": 0.0609570252971655, - "grad_norm": 3.546875, - "learning_rate": 4.967579221092666e-05, - "loss": 0.3125, - "step": 600 - }, - { - "epoch": 0.06197297571878492, - "grad_norm": 2.984375, - "learning_rate": 4.966272374541996e-05, - "loss": 0.2354, - "step": 610 - }, - { - "epoch": 0.06298892614040434, - "grad_norm": 3.6875, - "learning_rate": 4.964939885807912e-05, - "loss": 0.3213, - "step": 620 - }, - { - "epoch": 0.06400487656202378, - "grad_norm": 2.140625, - "learning_rate": 4.9635817687444876e-05, - "loss": 0.3003, - "step": 630 - }, - { - "epoch": 0.0650208269836432, - "grad_norm": 3.484375, - "learning_rate": 4.962198037472259e-05, - "loss": 0.2996, - "step": 640 - }, - { - "epoch": 0.06603677740526262, - "grad_norm": 3.21875, - "learning_rate": 4.9607887063780776e-05, - "loss": 0.2257, - "step": 650 - }, - { - "epoch": 0.06705272782688204, - "grad_norm": 5.375, - "learning_rate": 4.9593537901149564e-05, - "loss": 0.223, - "step": 660 - }, - { - "epoch": 0.06806867824850148, - "grad_norm": 4.1875, - "learning_rate": 4.957893303601924e-05, - "loss": 0.3407, - "step": 670 - }, - { - "epoch": 0.0690846286701209, - "grad_norm": 3.328125, - "learning_rate": 4.956407262023866e-05, - "loss": 0.2589, - "step": 680 - }, - { - "epoch": 0.07010057909174032, - "grad_norm": 2.953125, - "learning_rate": 4.954895680831367e-05, - "loss": 0.2949, - "step": 690 - }, - { - "epoch": 0.07111652951335974, - "grad_norm": 4.0625, - "learning_rate": 4.9533585757405506e-05, - "loss": 0.2995, - "step": 700 - }, - { - "epoch": 0.07213247993497918, - "grad_norm": 4.625, - "learning_rate": 4.951795962732917e-05, - "loss": 0.2894, - "step": 710 - }, - { - "epoch": 0.0731484303565986, - "grad_norm": 3.0, - "learning_rate": 4.9502078580551755e-05, - "loss": 0.3082, - "step": 720 - }, - { - "epoch": 0.07416438077821802, - "grad_norm": 3.65625, - "learning_rate": 4.9485942782190734e-05, - "loss": 0.2308, - "step": 730 - }, - { - "epoch": 0.07518033119983744, - "grad_norm": 4.78125, - "learning_rate": 4.9469552400012306e-05, - "loss": 0.2272, - "step": 740 - }, - { - "epoch": 0.07619628162145688, - "grad_norm": 4.25, - "learning_rate": 4.94529076044296e-05, - "loss": 0.2701, - "step": 750 - }, - { - "epoch": 0.0772122320430763, - "grad_norm": 3.140625, - "learning_rate": 4.94360085685009e-05, - "loss": 0.2686, - "step": 760 - }, - { - "epoch": 0.07822818246469572, - "grad_norm": 0.765625, - "learning_rate": 4.9418855467927894e-05, - "loss": 0.2051, - "step": 770 - }, - { - "epoch": 0.07924413288631514, - "grad_norm": 1.796875, - "learning_rate": 4.940144848105379e-05, - "loss": 0.2267, - "step": 780 - }, - { - "epoch": 0.08026008330793458, - "grad_norm": 4.5625, - "learning_rate": 4.93837877888615e-05, - "loss": 0.2597, - "step": 790 - }, - { - "epoch": 0.081276033729554, - "grad_norm": 3.03125, - "learning_rate": 4.9365873574971745e-05, - "loss": 0.3701, - "step": 800 - }, - { - "epoch": 0.08229198415117342, - "grad_norm": 4.5625, - "learning_rate": 4.9347706025641136e-05, - "loss": 0.2559, - "step": 810 - }, - { - "epoch": 0.08330793457279284, - "grad_norm": 3.90625, - "learning_rate": 4.9329285329760275e-05, - "loss": 0.2799, - "step": 820 - }, - { - "epoch": 0.08432388499441228, - "grad_norm": 3.140625, - "learning_rate": 4.9310611678851735e-05, - "loss": 0.2866, - "step": 830 - }, - { - "epoch": 0.0853398354160317, - "grad_norm": 2.46875, - "learning_rate": 4.929168526706811e-05, - "loss": 0.3105, - "step": 840 - }, - { - "epoch": 0.08635578583765112, - "grad_norm": 13.625, - "learning_rate": 4.927250629119e-05, - "loss": 0.2454, - "step": 850 - }, - { - "epoch": 0.08737173625927054, - "grad_norm": 3.921875, - "learning_rate": 4.9253074950623925e-05, - "loss": 0.2424, - "step": 860 - }, - { - "epoch": 0.08838768668088998, - "grad_norm": 2.90625, - "learning_rate": 4.9233391447400286e-05, - "loss": 0.2481, - "step": 870 - }, - { - "epoch": 0.0894036371025094, - "grad_norm": 2.96875, - "learning_rate": 4.921345598617125e-05, - "loss": 0.2231, - "step": 880 - }, - { - "epoch": 0.09041958752412882, - "grad_norm": 5.375, - "learning_rate": 4.9193268774208654e-05, - "loss": 0.3447, - "step": 890 - }, - { - "epoch": 0.09143553794574824, - "grad_norm": 2.0, - "learning_rate": 4.9172830021401785e-05, - "loss": 0.229, - "step": 900 - }, - { - "epoch": 0.09245148836736768, - "grad_norm": 3.1875, - "learning_rate": 4.9152139940255245e-05, - "loss": 0.2122, - "step": 910 - }, - { - "epoch": 0.0934674387889871, - "grad_norm": 3.40625, - "learning_rate": 4.913119874588677e-05, - "loss": 0.2386, - "step": 920 - }, - { - "epoch": 0.09448338921060652, - "grad_norm": 1.4609375, - "learning_rate": 4.911000665602489e-05, - "loss": 0.1944, - "step": 930 - }, - { - "epoch": 0.09549933963222594, - "grad_norm": 5.0625, - "learning_rate": 4.9088563891006786e-05, - "loss": 0.2038, - "step": 940 - }, - { - "epoch": 0.09651529005384538, - "grad_norm": 4.53125, - "learning_rate": 4.906687067377592e-05, - "loss": 0.3122, - "step": 950 - }, - { - "epoch": 0.0975312404754648, - "grad_norm": 2.84375, - "learning_rate": 4.904492722987976e-05, - "loss": 0.3157, - "step": 960 - }, - { - "epoch": 0.09854719089708422, - "grad_norm": 2.171875, - "learning_rate": 4.902273378746738e-05, - "loss": 0.3077, - "step": 970 - }, - { - "epoch": 0.09956314131870364, - "grad_norm": 2.84375, - "learning_rate": 4.9000290577287165e-05, - "loss": 0.2756, - "step": 980 - }, - { - "epoch": 0.10057909174032308, - "grad_norm": 0.99609375, - "learning_rate": 4.897759783268434e-05, - "loss": 0.2915, - "step": 990 - }, - { - "epoch": 0.1015950421619425, - "grad_norm": 3.53125, - "learning_rate": 4.895465578959859e-05, - "loss": 0.2052, - "step": 1000 - }, - { - "epoch": 0.10261099258356192, - "grad_norm": 4.0, - "learning_rate": 4.893146468656159e-05, - "loss": 0.2499, - "step": 1010 - }, - { - "epoch": 0.10362694300518134, - "grad_norm": 1.65625, - "learning_rate": 4.890802476469452e-05, - "loss": 0.278, - "step": 1020 - }, - { - "epoch": 0.10464289342680078, - "grad_norm": 3.625, - "learning_rate": 4.888433626770558e-05, - "loss": 0.2143, - "step": 1030 - }, - { - "epoch": 0.1056588438484202, - "grad_norm": 5.0625, - "learning_rate": 4.886039944188741e-05, - "loss": 0.2878, - "step": 1040 - }, - { - "epoch": 0.10667479427003962, - "grad_norm": 4.5, - "learning_rate": 4.883621453611461e-05, - "loss": 0.2744, - "step": 1050 - }, - { - "epoch": 0.10769074469165904, - "grad_norm": 4.5625, - "learning_rate": 4.881178180184106e-05, - "loss": 0.2734, - "step": 1060 - }, - { - "epoch": 0.10870669511327848, - "grad_norm": 3.125, - "learning_rate": 4.878710149309735e-05, - "loss": 0.3574, - "step": 1070 - }, - { - "epoch": 0.1097226455348979, - "grad_norm": 3.0625, - "learning_rate": 4.876217386648816e-05, - "loss": 0.2625, - "step": 1080 - }, - { - "epoch": 0.11073859595651732, - "grad_norm": 4.0625, - "learning_rate": 4.873699918118955e-05, - "loss": 0.2437, - "step": 1090 - }, - { - "epoch": 0.11175454637813674, - "grad_norm": 1.59375, - "learning_rate": 4.87115776989463e-05, - "loss": 0.2051, - "step": 1100 - }, - { - "epoch": 0.11277049679975618, - "grad_norm": 4.375, - "learning_rate": 4.8685909684069153e-05, - "loss": 0.1727, - "step": 1110 - }, - { - "epoch": 0.1137864472213756, - "grad_norm": 2.28125, - "learning_rate": 4.865999540343211e-05, - "loss": 0.2256, - "step": 1120 - }, - { - "epoch": 0.11480239764299502, - "grad_norm": 2.265625, - "learning_rate": 4.86338351264696e-05, - "loss": 0.3529, - "step": 1130 - }, - { - "epoch": 0.11581834806461444, - "grad_norm": 2.34375, - "learning_rate": 4.8607429125173754e-05, - "loss": 0.2113, - "step": 1140 - }, - { - "epoch": 0.11683429848623388, - "grad_norm": 0.7578125, - "learning_rate": 4.858077767409149e-05, - "loss": 0.2759, - "step": 1150 - }, - { - "epoch": 0.1178502489078533, - "grad_norm": 3.640625, - "learning_rate": 4.855388105032174e-05, - "loss": 0.2482, - "step": 1160 - }, - { - "epoch": 0.11886619932947272, - "grad_norm": 3.5, - "learning_rate": 4.852673953351249e-05, - "loss": 0.1865, - "step": 1170 - }, - { - "epoch": 0.11988214975109214, - "grad_norm": 3.75, - "learning_rate": 4.849935340585796e-05, - "loss": 0.2659, - "step": 1180 - }, - { - "epoch": 0.12089810017271158, - "grad_norm": 3.375, - "learning_rate": 4.8471722952095586e-05, - "loss": 0.1506, - "step": 1190 - }, - { - "epoch": 0.121914050594331, - "grad_norm": 3.34375, - "learning_rate": 4.844384845950312e-05, - "loss": 0.307, - "step": 1200 - }, - { - "epoch": 0.12293000101595042, - "grad_norm": 1.578125, - "learning_rate": 4.841573021789561e-05, - "loss": 0.1952, - "step": 1210 - }, - { - "epoch": 0.12394595143756984, - "grad_norm": 1.2890625, - "learning_rate": 4.838736851962239e-05, - "loss": 0.1779, - "step": 1220 - }, - { - "epoch": 0.12496190185918928, - "grad_norm": 1.265625, - "learning_rate": 4.835876365956408e-05, - "loss": 0.1235, - "step": 1230 - }, - { - "epoch": 0.12597785228080868, - "grad_norm": 1.9609375, - "learning_rate": 4.8329915935129436e-05, - "loss": 0.1876, - "step": 1240 - }, - { - "epoch": 0.12699380270242813, - "grad_norm": 1.6328125, - "learning_rate": 4.830082564625235e-05, - "loss": 0.2188, - "step": 1250 - }, - { - "epoch": 0.12800975312404755, - "grad_norm": 3.96875, - "learning_rate": 4.8271493095388684e-05, - "loss": 0.2622, - "step": 1260 - }, - { - "epoch": 0.12902570354566698, - "grad_norm": 3.765625, - "learning_rate": 4.824191858751312e-05, - "loss": 0.2724, - "step": 1270 - }, - { - "epoch": 0.1300416539672864, - "grad_norm": 5.59375, - "learning_rate": 4.821210243011601e-05, - "loss": 0.2413, - "step": 1280 - }, - { - "epoch": 0.13105760438890582, - "grad_norm": 3.34375, - "learning_rate": 4.818204493320016e-05, - "loss": 0.2618, - "step": 1290 - }, - { - "epoch": 0.13207355481052524, - "grad_norm": 2.78125, - "learning_rate": 4.8151746409277634e-05, - "loss": 0.2295, - "step": 1300 - }, - { - "epoch": 0.13308950523214466, - "grad_norm": 3.1875, - "learning_rate": 4.8121207173366484e-05, - "loss": 0.2733, - "step": 1310 - }, - { - "epoch": 0.13410545565376408, - "grad_norm": 2.28125, - "learning_rate": 4.809042754298746e-05, - "loss": 0.2311, - "step": 1320 - }, - { - "epoch": 0.13512140607538353, - "grad_norm": 2.171875, - "learning_rate": 4.805940783816075e-05, - "loss": 0.2059, - "step": 1330 - }, - { - "epoch": 0.13613735649700295, - "grad_norm": 2.796875, - "learning_rate": 4.8028148381402625e-05, - "loss": 0.2102, - "step": 1340 - }, - { - "epoch": 0.13715330691862238, - "grad_norm": 2.96875, - "learning_rate": 4.7996649497722084e-05, - "loss": 0.2708, - "step": 1350 - }, - { - "epoch": 0.1381692573402418, - "grad_norm": 2.4375, - "learning_rate": 4.7964911514617485e-05, - "loss": 0.2429, - "step": 1360 - }, - { - "epoch": 0.13918520776186122, - "grad_norm": 5.8125, - "learning_rate": 4.793293476207312e-05, - "loss": 0.2725, - "step": 1370 - }, - { - "epoch": 0.14020115818348064, - "grad_norm": 2.40625, - "learning_rate": 4.790071957255585e-05, - "loss": 0.2098, - "step": 1380 - }, - { - "epoch": 0.14121710860510006, - "grad_norm": 4.25, - "learning_rate": 4.786826628101154e-05, - "loss": 0.2101, - "step": 1390 - }, - { - "epoch": 0.14223305902671948, - "grad_norm": 2.578125, - "learning_rate": 4.783557522486167e-05, - "loss": 0.2624, - "step": 1400 - }, - { - "epoch": 0.14324900944833893, - "grad_norm": 3.125, - "learning_rate": 4.780264674399978e-05, - "loss": 0.2518, - "step": 1410 - }, - { - "epoch": 0.14426495986995835, - "grad_norm": 3.671875, - "learning_rate": 4.7769481180787966e-05, - "loss": 0.3112, - "step": 1420 - }, - { - "epoch": 0.14528091029157778, - "grad_norm": 3.984375, - "learning_rate": 4.773607888005327e-05, - "loss": 0.2747, - "step": 1430 - }, - { - "epoch": 0.1462968607131972, - "grad_norm": 3.234375, - "learning_rate": 4.770244018908416e-05, - "loss": 0.1572, - "step": 1440 - }, - { - "epoch": 0.14731281113481662, - "grad_norm": 4.09375, - "learning_rate": 4.766856545762687e-05, - "loss": 0.2148, - "step": 1450 - }, - { - "epoch": 0.14832876155643604, - "grad_norm": 1.6875, - "learning_rate": 4.763445503788178e-05, - "loss": 0.2531, - "step": 1460 - }, - { - "epoch": 0.14934471197805546, - "grad_norm": 2.375, - "learning_rate": 4.760010928449976e-05, - "loss": 0.199, - "step": 1470 - }, - { - "epoch": 0.15036066239967488, - "grad_norm": 4.6875, - "learning_rate": 4.7565528554578485e-05, - "loss": 0.2366, - "step": 1480 - }, - { - "epoch": 0.15137661282129433, - "grad_norm": 5.4375, - "learning_rate": 4.75307132076587e-05, - "loss": 0.1862, - "step": 1490 - }, - { - "epoch": 0.15239256324291375, - "grad_norm": 2.484375, - "learning_rate": 4.749566360572049e-05, - "loss": 0.2143, - "step": 1500 - }, - { - "epoch": 0.15340851366453317, - "grad_norm": 2.1875, - "learning_rate": 4.746038011317955e-05, - "loss": 0.1877, - "step": 1510 - }, - { - "epoch": 0.1544244640861526, - "grad_norm": 2.84375, - "learning_rate": 4.742486309688333e-05, - "loss": 0.2831, - "step": 1520 - }, - { - "epoch": 0.15544041450777202, - "grad_norm": 2.015625, - "learning_rate": 4.738911292610732e-05, - "loss": 0.1708, - "step": 1530 - }, - { - "epoch": 0.15645636492939144, - "grad_norm": 3.953125, - "learning_rate": 4.735312997255107e-05, - "loss": 0.192, - "step": 1540 - }, - { - "epoch": 0.15747231535101086, - "grad_norm": 2.09375, - "learning_rate": 4.7316914610334475e-05, - "loss": 0.2586, - "step": 1550 - }, - { - "epoch": 0.15848826577263028, - "grad_norm": 3.6875, - "learning_rate": 4.728046721599378e-05, - "loss": 0.2141, - "step": 1560 - }, - { - "epoch": 0.15950421619424973, - "grad_norm": 2.9375, - "learning_rate": 4.724378816847771e-05, - "loss": 0.193, - "step": 1570 - }, - { - "epoch": 0.16052016661586915, - "grad_norm": 1.5625, - "learning_rate": 4.720687784914352e-05, - "loss": 0.191, - "step": 1580 - }, - { - "epoch": 0.16153611703748857, - "grad_norm": 3.75, - "learning_rate": 4.716973664175304e-05, - "loss": 0.2172, - "step": 1590 - }, - { - "epoch": 0.162552067459108, - "grad_norm": 3.125, - "learning_rate": 4.7132364932468645e-05, - "loss": 0.2134, - "step": 1600 - }, - { - "epoch": 0.16356801788072742, - "grad_norm": 4.09375, - "learning_rate": 4.709476310984932e-05, - "loss": 0.2055, - "step": 1610 - }, - { - "epoch": 0.16458396830234684, - "grad_norm": 3.875, - "learning_rate": 4.705693156484652e-05, - "loss": 0.2136, - "step": 1620 - }, - { - "epoch": 0.16559991872396626, - "grad_norm": 1.1796875, - "learning_rate": 4.7018870690800196e-05, - "loss": 0.1471, - "step": 1630 - }, - { - "epoch": 0.16661586914558568, - "grad_norm": 2.5, - "learning_rate": 4.698058088343465e-05, - "loss": 0.2308, - "step": 1640 - }, - { - "epoch": 0.16763181956720513, - "grad_norm": 1.390625, - "learning_rate": 4.6942062540854425e-05, - "loss": 0.2456, - "step": 1650 - }, - { - "epoch": 0.16864776998882455, - "grad_norm": 3.125, - "learning_rate": 4.69033160635402e-05, - "loss": 0.2654, - "step": 1660 - }, - { - "epoch": 0.16966372041044397, - "grad_norm": 3.984375, - "learning_rate": 4.6864341854344587e-05, - "loss": 0.2226, - "step": 1670 - }, - { - "epoch": 0.1706796708320634, - "grad_norm": 2.328125, - "learning_rate": 4.682514031848795e-05, - "loss": 0.2438, - "step": 1680 - }, - { - "epoch": 0.17169562125368282, - "grad_norm": 3.078125, - "learning_rate": 4.678571186355423e-05, - "loss": 0.1889, - "step": 1690 - }, - { - "epoch": 0.17271157167530224, - "grad_norm": 3.328125, - "learning_rate": 4.6746056899486644e-05, - "loss": 0.2117, - "step": 1700 - }, - { - "epoch": 0.17372752209692166, - "grad_norm": 2.78125, - "learning_rate": 4.67061758385835e-05, - "loss": 0.1953, - "step": 1710 - }, - { - "epoch": 0.17474347251854108, - "grad_norm": 3.09375, - "learning_rate": 4.6666069095493816e-05, - "loss": 0.1844, - "step": 1720 - }, - { - "epoch": 0.17575942294016053, - "grad_norm": 3.234375, - "learning_rate": 4.662573708721309e-05, - "loss": 0.2774, - "step": 1730 - }, - { - "epoch": 0.17677537336177995, - "grad_norm": 4.03125, - "learning_rate": 4.658518023307894e-05, - "loss": 0.2527, - "step": 1740 - }, - { - "epoch": 0.17779132378339937, - "grad_norm": 3.21875, - "learning_rate": 4.654439895476671e-05, - "loss": 0.2164, - "step": 1750 - }, - { - "epoch": 0.1788072742050188, - "grad_norm": 2.390625, - "learning_rate": 4.6503393676285146e-05, - "loss": 0.2424, - "step": 1760 - }, - { - "epoch": 0.17982322462663822, - "grad_norm": 1.8359375, - "learning_rate": 4.646216482397192e-05, - "loss": 0.2428, - "step": 1770 - }, - { - "epoch": 0.18083917504825764, - "grad_norm": 2.796875, - "learning_rate": 4.6420712826489275e-05, - "loss": 0.2155, - "step": 1780 - }, - { - "epoch": 0.18185512546987706, - "grad_norm": 0.69921875, - "learning_rate": 4.6379038114819485e-05, - "loss": 0.1544, - "step": 1790 - }, - { - "epoch": 0.18287107589149648, - "grad_norm": 3.40625, - "learning_rate": 4.6337141122260444e-05, - "loss": 0.2029, - "step": 1800 - }, - { - "epoch": 0.18388702631311593, - "grad_norm": 2.359375, - "learning_rate": 4.629502228442112e-05, - "loss": 0.1489, - "step": 1810 - }, - { - "epoch": 0.18490297673473535, - "grad_norm": 1.4453125, - "learning_rate": 4.6252682039217045e-05, - "loss": 0.2101, - "step": 1820 - }, - { - "epoch": 0.18591892715635477, - "grad_norm": 2.71875, - "learning_rate": 4.621012082686573e-05, - "loss": 0.2076, - "step": 1830 - }, - { - "epoch": 0.1869348775779742, - "grad_norm": 3.0625, - "learning_rate": 4.616733908988216e-05, - "loss": 0.2719, - "step": 1840 - }, - { - "epoch": 0.18795082799959362, - "grad_norm": 1.953125, - "learning_rate": 4.612433727307409e-05, - "loss": 0.2105, - "step": 1850 - }, - { - "epoch": 0.18896677842121304, - "grad_norm": 3.46875, - "learning_rate": 4.608111582353751e-05, - "loss": 0.1877, - "step": 1860 - }, - { - "epoch": 0.18998272884283246, - "grad_norm": 2.546875, - "learning_rate": 4.603767519065197e-05, - "loss": 0.2238, - "step": 1870 - }, - { - "epoch": 0.19099867926445188, - "grad_norm": 1.5703125, - "learning_rate": 4.599401582607589e-05, - "loss": 0.243, - "step": 1880 - }, - { - "epoch": 0.19201462968607133, - "grad_norm": 2.4375, - "learning_rate": 4.595013818374185e-05, - "loss": 0.1867, - "step": 1890 - }, - { - "epoch": 0.19303058010769075, - "grad_norm": 2.203125, - "learning_rate": 4.5906042719851925e-05, - "loss": 0.1994, - "step": 1900 - }, - { - "epoch": 0.19404653052931017, - "grad_norm": 3.984375, - "learning_rate": 4.586172989287291e-05, - "loss": 0.1899, - "step": 1910 - }, - { - "epoch": 0.1950624809509296, - "grad_norm": 2.6875, - "learning_rate": 4.5817200163531534e-05, - "loss": 0.2528, - "step": 1920 - }, - { - "epoch": 0.19607843137254902, - "grad_norm": 2.71875, - "learning_rate": 4.577245399480972e-05, - "loss": 0.2336, - "step": 1930 - }, - { - "epoch": 0.19709438179416844, - "grad_norm": 2.640625, - "learning_rate": 4.5727491851939715e-05, - "loss": 0.2204, - "step": 1940 - }, - { - "epoch": 0.19811033221578786, - "grad_norm": 1.78125, - "learning_rate": 4.568231420239929e-05, - "loss": 0.1656, - "step": 1950 - }, - { - "epoch": 0.19912628263740728, - "grad_norm": 3.15625, - "learning_rate": 4.563692151590687e-05, - "loss": 0.2105, - "step": 1960 - }, - { - "epoch": 0.20014223305902673, - "grad_norm": 1.3671875, - "learning_rate": 4.5591314264416666e-05, - "loss": 0.1464, - "step": 1970 - }, - { - "epoch": 0.20115818348064615, - "grad_norm": 4.25, - "learning_rate": 4.554549292211371e-05, - "loss": 0.2103, - "step": 1980 - }, - { - "epoch": 0.20217413390226557, - "grad_norm": 2.984375, - "learning_rate": 4.549945796540901e-05, - "loss": 0.144, - "step": 1990 - }, - { - "epoch": 0.203190084323885, - "grad_norm": 1.859375, - "learning_rate": 4.545320987293453e-05, - "loss": 0.1963, - "step": 2000 - } - ], - "logging_steps": 10, - "max_steps": 9843, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "total_flos": 0.0, - "train_batch_size": 7, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoints/checkpoint-2000/training_args.bin b/checkpoints/checkpoint-2000/training_args.bin deleted file mode 100644 index 6bb0a4ec10ab29ca1942aecdfa4212d352e373f1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-2000/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 -size 4984 diff --git a/checkpoints/checkpoint-2500/.DS_Store b/checkpoints/checkpoint-2500/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 Binary files a/checkpoints/checkpoint-2500/.DS_Store and /dev/null differ diff --git a/checkpoints/checkpoint-2500/adapter_config.json b/checkpoints/checkpoint-2500/adapter_config.json deleted file mode 100644 index 2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-2500/adapter_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "alpha_pattern": {}, - "auto_mapping": null, - "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": "gaussian", - "layer_replication": null, - "layers_pattern": null, - "layers_to_transform": null, - "loftq_config": {}, - "lora_alpha": 32, - "lora_dropout": 0.1, - "megatron_config": null, - "megatron_core": "megatron.core", - "modules_to_save": null, - "peft_type": "LORA", - "r": 8, - "rank_pattern": {}, - "revision": null, - "target_modules": [ - "v_proj", - "q_proj" - ], - "task_type": "CAUSAL_LM", - "use_dora": false, - "use_rslora": false -} \ No newline at end of file diff --git a/checkpoints/checkpoint-2500/adapter_model.safetensors b/checkpoints/checkpoint-2500/adapter_model.safetensors deleted file mode 100644 index b72c393034479ae51f68bfdb488baeb0eca04321..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-2500/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:56179446117ddaff95696487189a032d48a215d8456339c2ae5eda1870df93b9 -size 5919456 diff --git a/checkpoints/checkpoint-2500/optimizer.pt b/checkpoints/checkpoint-2500/optimizer.pt deleted file mode 100644 index 0b52ed1ee54b291770f15300bf140925bca02fde..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-2500/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dd04baeeb8fe514953f04bca66abb944e11003693bffa63c977e38849273cb49 -size 11930938 diff --git a/checkpoints/checkpoint-2500/rng_state_0.pth b/checkpoints/checkpoint-2500/rng_state_0.pth deleted file mode 100644 index 17380c6e5ac5082c526e6f93eccf203432124116..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-2500/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:460a81dc2425be030afb2c9930b7e9fc9de54ad9cf988c330851fefe47a118c2 -size 15024 diff --git a/checkpoints/checkpoint-2500/rng_state_1.pth b/checkpoints/checkpoint-2500/rng_state_1.pth deleted file mode 100644 index 38dd76771028ab39d8a7742325c670728c9d9b17..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-2500/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f39c713bcb800d5d9d55e44b2d2e744bbee1e449a935b9a681868e507ac58f86 -size 15024 diff --git a/checkpoints/checkpoint-2500/rng_state_2.pth b/checkpoints/checkpoint-2500/rng_state_2.pth deleted file mode 100644 index ba8fc1fc2b8078c75a9a126da3a3f68cd3411b42..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-2500/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b6b1c81cf4c5daaf0fe1a5aa4e87259ede9f631f0867a6d085b8eba9a03f7275 -size 15024 diff --git a/checkpoints/checkpoint-2500/rng_state_3.pth b/checkpoints/checkpoint-2500/rng_state_3.pth deleted file mode 100644 index 7f389e22cd4c2984db2f53ef472201c762aff9e6..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-2500/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d7ae3d71695a016b734120d0e84d8510429872cccd4cdfc5ad051249bdceb709 -size 15024 diff --git a/checkpoints/checkpoint-2500/scheduler.pt b/checkpoints/checkpoint-2500/scheduler.pt deleted file mode 100644 index cfe96bf12bb338be15a0e572c935955e022dddb6..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-2500/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b0992ea36b796a9efd875f53514a1a7b72426fb94846549bcad84ea3eae0acee -size 1064 diff --git a/checkpoints/checkpoint-2500/trainer_state.json b/checkpoints/checkpoint-2500/trainer_state.json deleted file mode 100644 index 085f59bbcf897cba748b6ce5aee019fdbd65bb0d..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-2500/trainer_state.json +++ /dev/null @@ -1,1771 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.25398760540485626, - "eval_steps": 500, - "global_step": 2500, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001015950421619425, - "grad_norm": 17.625, - "learning_rate": 5e-06, - "loss": 3.4264, - "step": 10 - }, - { - "epoch": 0.00203190084323885, - "grad_norm": 12.5625, - "learning_rate": 1e-05, - "loss": 3.432, - "step": 20 - }, - { - "epoch": 0.003047851264858275, - "grad_norm": 14.0625, - "learning_rate": 1.5e-05, - "loss": 3.23, - "step": 30 - }, - { - "epoch": 0.0040638016864777, - "grad_norm": 12.4375, - "learning_rate": 2e-05, - "loss": 2.9762, - "step": 40 - }, - { - "epoch": 0.005079752108097125, - "grad_norm": 10.0625, - "learning_rate": 2.5e-05, - "loss": 2.6173, - "step": 50 - }, - { - "epoch": 0.00609570252971655, - "grad_norm": 10.1875, - "learning_rate": 3e-05, - "loss": 2.2004, - "step": 60 - }, - { - "epoch": 0.007111652951335975, - "grad_norm": 7.03125, - "learning_rate": 3.5e-05, - "loss": 1.4176, - "step": 70 - }, - { - "epoch": 0.0081276033729554, - "grad_norm": 4.375, - "learning_rate": 4e-05, - "loss": 1.0122, - "step": 80 - }, - { - "epoch": 0.009143553794574825, - "grad_norm": 6.5625, - "learning_rate": 4.5e-05, - "loss": 0.9116, - "step": 90 - }, - { - "epoch": 0.01015950421619425, - "grad_norm": 5.28125, - "learning_rate": 5e-05, - "loss": 0.6832, - "step": 100 - }, - { - "epoch": 0.011175454637813675, - "grad_norm": 5.5, - "learning_rate": 4.9999870035728426e-05, - "loss": 0.7355, - "step": 110 - }, - { - "epoch": 0.0121914050594331, - "grad_norm": 5.1875, - "learning_rate": 4.9999480144264944e-05, - "loss": 0.6673, - "step": 120 - }, - { - "epoch": 0.013207355481052525, - "grad_norm": 4.5, - "learning_rate": 4.9998830329663314e-05, - "loss": 0.6792, - "step": 130 - }, - { - "epoch": 0.01422330590267195, - "grad_norm": 3.9375, - "learning_rate": 4.9997920598679756e-05, - "loss": 0.6207, - "step": 140 - }, - { - "epoch": 0.015239256324291375, - "grad_norm": 3.15625, - "learning_rate": 4.999675096077286e-05, - "loss": 0.483, - "step": 150 - }, - { - "epoch": 0.0162552067459108, - "grad_norm": 5.28125, - "learning_rate": 4.999532142810354e-05, - "loss": 0.5319, - "step": 160 - }, - { - "epoch": 0.017271157167530225, - "grad_norm": 4.59375, - "learning_rate": 4.999363201553483e-05, - "loss": 0.6052, - "step": 170 - }, - { - "epoch": 0.01828710758914965, - "grad_norm": 5.03125, - "learning_rate": 4.9991682740631794e-05, - "loss": 0.4258, - "step": 180 - }, - { - "epoch": 0.019303058010769075, - "grad_norm": 3.859375, - "learning_rate": 4.998947362366133e-05, - "loss": 0.4309, - "step": 190 - }, - { - "epoch": 0.0203190084323885, - "grad_norm": 3.328125, - "learning_rate": 4.998700468759193e-05, - "loss": 0.3957, - "step": 200 - }, - { - "epoch": 0.021334958854007924, - "grad_norm": 4.9375, - "learning_rate": 4.9984275958093475e-05, - "loss": 0.4777, - "step": 210 - }, - { - "epoch": 0.02235090927562735, - "grad_norm": 4.78125, - "learning_rate": 4.998128746353695e-05, - "loss": 0.3549, - "step": 220 - }, - { - "epoch": 0.023366859697246774, - "grad_norm": 4.0625, - "learning_rate": 4.997803923499417e-05, - "loss": 0.4447, - "step": 230 - }, - { - "epoch": 0.0243828101188662, - "grad_norm": 6.375, - "learning_rate": 4.99745313062374e-05, - "loss": 0.3808, - "step": 240 - }, - { - "epoch": 0.025398760540485624, - "grad_norm": 3.59375, - "learning_rate": 4.99707637137391e-05, - "loss": 0.3827, - "step": 250 - }, - { - "epoch": 0.02641471096210505, - "grad_norm": 3.015625, - "learning_rate": 4.996673649667145e-05, - "loss": 0.3694, - "step": 260 - }, - { - "epoch": 0.027430661383724474, - "grad_norm": 2.296875, - "learning_rate": 4.9962449696906e-05, - "loss": 0.3586, - "step": 270 - }, - { - "epoch": 0.0284466118053439, - "grad_norm": 4.125, - "learning_rate": 4.9957903359013214e-05, - "loss": 0.3832, - "step": 280 - }, - { - "epoch": 0.029462562226963324, - "grad_norm": 3.296875, - "learning_rate": 4.995309753026201e-05, - "loss": 0.328, - "step": 290 - }, - { - "epoch": 0.03047851264858275, - "grad_norm": 4.5, - "learning_rate": 4.994803226061927e-05, - "loss": 0.3667, - "step": 300 - }, - { - "epoch": 0.03149446307020217, - "grad_norm": 4.3125, - "learning_rate": 4.994270760274933e-05, - "loss": 0.3811, - "step": 310 - }, - { - "epoch": 0.0325104134918216, - "grad_norm": 3.421875, - "learning_rate": 4.99371236120134e-05, - "loss": 0.3065, - "step": 320 - }, - { - "epoch": 0.03352636391344102, - "grad_norm": 4.6875, - "learning_rate": 4.993128034646902e-05, - "loss": 0.4177, - "step": 330 - }, - { - "epoch": 0.03454231433506045, - "grad_norm": 3.046875, - "learning_rate": 4.992517786686947e-05, - "loss": 0.33, - "step": 340 - }, - { - "epoch": 0.03555826475667987, - "grad_norm": 1.8828125, - "learning_rate": 4.9918816236663077e-05, - "loss": 0.3287, - "step": 350 - }, - { - "epoch": 0.0365742151782993, - "grad_norm": 3.8125, - "learning_rate": 4.991219552199262e-05, - "loss": 0.2934, - "step": 360 - }, - { - "epoch": 0.03759016559991872, - "grad_norm": 4.28125, - "learning_rate": 4.99053157916946e-05, - "loss": 0.3176, - "step": 370 - }, - { - "epoch": 0.03860611602153815, - "grad_norm": 2.609375, - "learning_rate": 4.989817711729856e-05, - "loss": 0.3318, - "step": 380 - }, - { - "epoch": 0.03962206644315757, - "grad_norm": 2.375, - "learning_rate": 4.98907795730263e-05, - "loss": 0.3234, - "step": 390 - }, - { - "epoch": 0.040638016864777, - "grad_norm": 4.46875, - "learning_rate": 4.988312323579114e-05, - "loss": 0.267, - "step": 400 - }, - { - "epoch": 0.04165396728639642, - "grad_norm": 3.75, - "learning_rate": 4.98752081851971e-05, - "loss": 0.3081, - "step": 410 - }, - { - "epoch": 0.04266991770801585, - "grad_norm": 2.203125, - "learning_rate": 4.986703450353809e-05, - "loss": 0.2917, - "step": 420 - }, - { - "epoch": 0.04368586812963527, - "grad_norm": 1.6015625, - "learning_rate": 4.985860227579703e-05, - "loss": 0.2805, - "step": 430 - }, - { - "epoch": 0.0447018185512547, - "grad_norm": 3.140625, - "learning_rate": 4.984991158964499e-05, - "loss": 0.3534, - "step": 440 - }, - { - "epoch": 0.04571776897287412, - "grad_norm": 3.296875, - "learning_rate": 4.9840962535440265e-05, - "loss": 0.335, - "step": 450 - }, - { - "epoch": 0.04673371939449355, - "grad_norm": 3.25, - "learning_rate": 4.983175520622744e-05, - "loss": 0.2544, - "step": 460 - }, - { - "epoch": 0.04774966981611297, - "grad_norm": 2.25, - "learning_rate": 4.982228969773642e-05, - "loss": 0.3449, - "step": 470 - }, - { - "epoch": 0.0487656202377324, - "grad_norm": 4.9375, - "learning_rate": 4.9812566108381435e-05, - "loss": 0.2964, - "step": 480 - }, - { - "epoch": 0.04978157065935182, - "grad_norm": 1.5703125, - "learning_rate": 4.9802584539260035e-05, - "loss": 0.2799, - "step": 490 - }, - { - "epoch": 0.05079752108097125, - "grad_norm": 2.828125, - "learning_rate": 4.979234509415199e-05, - "loss": 0.3231, - "step": 500 - }, - { - "epoch": 0.05181347150259067, - "grad_norm": 2.9375, - "learning_rate": 4.978184787951828e-05, - "loss": 0.2943, - "step": 510 - }, - { - "epoch": 0.0528294219242101, - "grad_norm": 2.34375, - "learning_rate": 4.977109300449992e-05, - "loss": 0.2705, - "step": 520 - }, - { - "epoch": 0.05384537234582952, - "grad_norm": 3.140625, - "learning_rate": 4.9760080580916876e-05, - "loss": 0.2998, - "step": 530 - }, - { - "epoch": 0.05486132276744895, - "grad_norm": 3.5625, - "learning_rate": 4.974881072326688e-05, - "loss": 0.2595, - "step": 540 - }, - { - "epoch": 0.05587727318906837, - "grad_norm": 4.25, - "learning_rate": 4.9737283548724236e-05, - "loss": 0.2803, - "step": 550 - }, - { - "epoch": 0.0568932236106878, - "grad_norm": 4.0625, - "learning_rate": 4.97254991771386e-05, - "loss": 0.3511, - "step": 560 - }, - { - "epoch": 0.05790917403230722, - "grad_norm": 2.515625, - "learning_rate": 4.971345773103377e-05, - "loss": 0.312, - "step": 570 - }, - { - "epoch": 0.05892512445392665, - "grad_norm": 3.21875, - "learning_rate": 4.9701159335606365e-05, - "loss": 0.2482, - "step": 580 - }, - { - "epoch": 0.05994107487554607, - "grad_norm": 5.5, - "learning_rate": 4.968860411872454e-05, - "loss": 0.2537, - "step": 590 - }, - { - "epoch": 0.0609570252971655, - "grad_norm": 3.546875, - "learning_rate": 4.967579221092666e-05, - "loss": 0.3125, - "step": 600 - }, - { - "epoch": 0.06197297571878492, - "grad_norm": 2.984375, - "learning_rate": 4.966272374541996e-05, - "loss": 0.2354, - "step": 610 - }, - { - "epoch": 0.06298892614040434, - "grad_norm": 3.6875, - "learning_rate": 4.964939885807912e-05, - "loss": 0.3213, - "step": 620 - }, - { - "epoch": 0.06400487656202378, - "grad_norm": 2.140625, - "learning_rate": 4.9635817687444876e-05, - "loss": 0.3003, - "step": 630 - }, - { - "epoch": 0.0650208269836432, - "grad_norm": 3.484375, - "learning_rate": 4.962198037472259e-05, - "loss": 0.2996, - "step": 640 - }, - { - "epoch": 0.06603677740526262, - "grad_norm": 3.21875, - "learning_rate": 4.9607887063780776e-05, - "loss": 0.2257, - "step": 650 - }, - { - "epoch": 0.06705272782688204, - "grad_norm": 5.375, - "learning_rate": 4.9593537901149564e-05, - "loss": 0.223, - "step": 660 - }, - { - "epoch": 0.06806867824850148, - "grad_norm": 4.1875, - "learning_rate": 4.957893303601924e-05, - "loss": 0.3407, - "step": 670 - }, - { - "epoch": 0.0690846286701209, - "grad_norm": 3.328125, - "learning_rate": 4.956407262023866e-05, - "loss": 0.2589, - "step": 680 - }, - { - "epoch": 0.07010057909174032, - "grad_norm": 2.953125, - "learning_rate": 4.954895680831367e-05, - "loss": 0.2949, - "step": 690 - }, - { - "epoch": 0.07111652951335974, - "grad_norm": 4.0625, - "learning_rate": 4.9533585757405506e-05, - "loss": 0.2995, - "step": 700 - }, - { - "epoch": 0.07213247993497918, - "grad_norm": 4.625, - "learning_rate": 4.951795962732917e-05, - "loss": 0.2894, - "step": 710 - }, - { - "epoch": 0.0731484303565986, - "grad_norm": 3.0, - "learning_rate": 4.9502078580551755e-05, - "loss": 0.3082, - "step": 720 - }, - { - "epoch": 0.07416438077821802, - "grad_norm": 3.65625, - "learning_rate": 4.9485942782190734e-05, - "loss": 0.2308, - "step": 730 - }, - { - "epoch": 0.07518033119983744, - "grad_norm": 4.78125, - "learning_rate": 4.9469552400012306e-05, - "loss": 0.2272, - "step": 740 - }, - { - "epoch": 0.07619628162145688, - "grad_norm": 4.25, - "learning_rate": 4.94529076044296e-05, - "loss": 0.2701, - "step": 750 - }, - { - "epoch": 0.0772122320430763, - "grad_norm": 3.140625, - "learning_rate": 4.94360085685009e-05, - "loss": 0.2686, - "step": 760 - }, - { - "epoch": 0.07822818246469572, - "grad_norm": 0.765625, - "learning_rate": 4.9418855467927894e-05, - "loss": 0.2051, - "step": 770 - }, - { - "epoch": 0.07924413288631514, - "grad_norm": 1.796875, - "learning_rate": 4.940144848105379e-05, - "loss": 0.2267, - "step": 780 - }, - { - "epoch": 0.08026008330793458, - "grad_norm": 4.5625, - "learning_rate": 4.93837877888615e-05, - "loss": 0.2597, - "step": 790 - }, - { - "epoch": 0.081276033729554, - "grad_norm": 3.03125, - "learning_rate": 4.9365873574971745e-05, - "loss": 0.3701, - "step": 800 - }, - { - "epoch": 0.08229198415117342, - "grad_norm": 4.5625, - "learning_rate": 4.9347706025641136e-05, - "loss": 0.2559, - "step": 810 - }, - { - "epoch": 0.08330793457279284, - "grad_norm": 3.90625, - "learning_rate": 4.9329285329760275e-05, - "loss": 0.2799, - "step": 820 - }, - { - "epoch": 0.08432388499441228, - "grad_norm": 3.140625, - "learning_rate": 4.9310611678851735e-05, - "loss": 0.2866, - "step": 830 - }, - { - "epoch": 0.0853398354160317, - "grad_norm": 2.46875, - "learning_rate": 4.929168526706811e-05, - "loss": 0.3105, - "step": 840 - }, - { - "epoch": 0.08635578583765112, - "grad_norm": 13.625, - "learning_rate": 4.927250629119e-05, - "loss": 0.2454, - "step": 850 - }, - { - "epoch": 0.08737173625927054, - "grad_norm": 3.921875, - "learning_rate": 4.9253074950623925e-05, - "loss": 0.2424, - "step": 860 - }, - { - "epoch": 0.08838768668088998, - "grad_norm": 2.90625, - "learning_rate": 4.9233391447400286e-05, - "loss": 0.2481, - "step": 870 - }, - { - "epoch": 0.0894036371025094, - "grad_norm": 2.96875, - "learning_rate": 4.921345598617125e-05, - "loss": 0.2231, - "step": 880 - }, - { - "epoch": 0.09041958752412882, - "grad_norm": 5.375, - "learning_rate": 4.9193268774208654e-05, - "loss": 0.3447, - "step": 890 - }, - { - "epoch": 0.09143553794574824, - "grad_norm": 2.0, - "learning_rate": 4.9172830021401785e-05, - "loss": 0.229, - "step": 900 - }, - { - "epoch": 0.09245148836736768, - "grad_norm": 3.1875, - "learning_rate": 4.9152139940255245e-05, - "loss": 0.2122, - "step": 910 - }, - { - "epoch": 0.0934674387889871, - "grad_norm": 3.40625, - "learning_rate": 4.913119874588677e-05, - "loss": 0.2386, - "step": 920 - }, - { - "epoch": 0.09448338921060652, - "grad_norm": 1.4609375, - "learning_rate": 4.911000665602489e-05, - "loss": 0.1944, - "step": 930 - }, - { - "epoch": 0.09549933963222594, - "grad_norm": 5.0625, - "learning_rate": 4.9088563891006786e-05, - "loss": 0.2038, - "step": 940 - }, - { - "epoch": 0.09651529005384538, - "grad_norm": 4.53125, - "learning_rate": 4.906687067377592e-05, - "loss": 0.3122, - "step": 950 - }, - { - "epoch": 0.0975312404754648, - "grad_norm": 2.84375, - "learning_rate": 4.904492722987976e-05, - "loss": 0.3157, - "step": 960 - }, - { - "epoch": 0.09854719089708422, - "grad_norm": 2.171875, - "learning_rate": 4.902273378746738e-05, - "loss": 0.3077, - "step": 970 - }, - { - "epoch": 0.09956314131870364, - "grad_norm": 2.84375, - "learning_rate": 4.9000290577287165e-05, - "loss": 0.2756, - "step": 980 - }, - { - "epoch": 0.10057909174032308, - "grad_norm": 0.99609375, - "learning_rate": 4.897759783268434e-05, - "loss": 0.2915, - "step": 990 - }, - { - "epoch": 0.1015950421619425, - "grad_norm": 3.53125, - "learning_rate": 4.895465578959859e-05, - "loss": 0.2052, - "step": 1000 - }, - { - "epoch": 0.10261099258356192, - "grad_norm": 4.0, - "learning_rate": 4.893146468656159e-05, - "loss": 0.2499, - "step": 1010 - }, - { - "epoch": 0.10362694300518134, - "grad_norm": 1.65625, - "learning_rate": 4.890802476469452e-05, - "loss": 0.278, - "step": 1020 - }, - { - "epoch": 0.10464289342680078, - "grad_norm": 3.625, - "learning_rate": 4.888433626770558e-05, - "loss": 0.2143, - "step": 1030 - }, - { - "epoch": 0.1056588438484202, - "grad_norm": 5.0625, - "learning_rate": 4.886039944188741e-05, - "loss": 0.2878, - "step": 1040 - }, - { - "epoch": 0.10667479427003962, - "grad_norm": 4.5, - "learning_rate": 4.883621453611461e-05, - "loss": 0.2744, - "step": 1050 - }, - { - "epoch": 0.10769074469165904, - "grad_norm": 4.5625, - "learning_rate": 4.881178180184106e-05, - "loss": 0.2734, - "step": 1060 - }, - { - "epoch": 0.10870669511327848, - "grad_norm": 3.125, - "learning_rate": 4.878710149309735e-05, - "loss": 0.3574, - "step": 1070 - }, - { - "epoch": 0.1097226455348979, - "grad_norm": 3.0625, - "learning_rate": 4.876217386648816e-05, - "loss": 0.2625, - "step": 1080 - }, - { - "epoch": 0.11073859595651732, - "grad_norm": 4.0625, - "learning_rate": 4.873699918118955e-05, - "loss": 0.2437, - "step": 1090 - }, - { - "epoch": 0.11175454637813674, - "grad_norm": 1.59375, - "learning_rate": 4.87115776989463e-05, - "loss": 0.2051, - "step": 1100 - }, - { - "epoch": 0.11277049679975618, - "grad_norm": 4.375, - "learning_rate": 4.8685909684069153e-05, - "loss": 0.1727, - "step": 1110 - }, - { - "epoch": 0.1137864472213756, - "grad_norm": 2.28125, - "learning_rate": 4.865999540343211e-05, - "loss": 0.2256, - "step": 1120 - }, - { - "epoch": 0.11480239764299502, - "grad_norm": 2.265625, - "learning_rate": 4.86338351264696e-05, - "loss": 0.3529, - "step": 1130 - }, - { - "epoch": 0.11581834806461444, - "grad_norm": 2.34375, - "learning_rate": 4.8607429125173754e-05, - "loss": 0.2113, - "step": 1140 - }, - { - "epoch": 0.11683429848623388, - "grad_norm": 0.7578125, - "learning_rate": 4.858077767409149e-05, - "loss": 0.2759, - "step": 1150 - }, - { - "epoch": 0.1178502489078533, - "grad_norm": 3.640625, - "learning_rate": 4.855388105032174e-05, - "loss": 0.2482, - "step": 1160 - }, - { - "epoch": 0.11886619932947272, - "grad_norm": 3.5, - "learning_rate": 4.852673953351249e-05, - "loss": 0.1865, - "step": 1170 - }, - { - "epoch": 0.11988214975109214, - "grad_norm": 3.75, - "learning_rate": 4.849935340585796e-05, - "loss": 0.2659, - "step": 1180 - }, - { - "epoch": 0.12089810017271158, - "grad_norm": 3.375, - "learning_rate": 4.8471722952095586e-05, - "loss": 0.1506, - "step": 1190 - }, - { - "epoch": 0.121914050594331, - "grad_norm": 3.34375, - "learning_rate": 4.844384845950312e-05, - "loss": 0.307, - "step": 1200 - }, - { - "epoch": 0.12293000101595042, - "grad_norm": 1.578125, - "learning_rate": 4.841573021789561e-05, - "loss": 0.1952, - "step": 1210 - }, - { - "epoch": 0.12394595143756984, - "grad_norm": 1.2890625, - "learning_rate": 4.838736851962239e-05, - "loss": 0.1779, - "step": 1220 - }, - { - "epoch": 0.12496190185918928, - "grad_norm": 1.265625, - "learning_rate": 4.835876365956408e-05, - "loss": 0.1235, - "step": 1230 - }, - { - "epoch": 0.12597785228080868, - "grad_norm": 1.9609375, - "learning_rate": 4.8329915935129436e-05, - "loss": 0.1876, - "step": 1240 - }, - { - "epoch": 0.12699380270242813, - "grad_norm": 1.6328125, - "learning_rate": 4.830082564625235e-05, - "loss": 0.2188, - "step": 1250 - }, - { - "epoch": 0.12800975312404755, - "grad_norm": 3.96875, - "learning_rate": 4.8271493095388684e-05, - "loss": 0.2622, - "step": 1260 - }, - { - "epoch": 0.12902570354566698, - "grad_norm": 3.765625, - "learning_rate": 4.824191858751312e-05, - "loss": 0.2724, - "step": 1270 - }, - { - "epoch": 0.1300416539672864, - "grad_norm": 5.59375, - "learning_rate": 4.821210243011601e-05, - "loss": 0.2413, - "step": 1280 - }, - { - "epoch": 0.13105760438890582, - "grad_norm": 3.34375, - "learning_rate": 4.818204493320016e-05, - "loss": 0.2618, - "step": 1290 - }, - { - "epoch": 0.13207355481052524, - "grad_norm": 2.78125, - "learning_rate": 4.8151746409277634e-05, - "loss": 0.2295, - "step": 1300 - }, - { - "epoch": 0.13308950523214466, - "grad_norm": 3.1875, - "learning_rate": 4.8121207173366484e-05, - "loss": 0.2733, - "step": 1310 - }, - { - "epoch": 0.13410545565376408, - "grad_norm": 2.28125, - "learning_rate": 4.809042754298746e-05, - "loss": 0.2311, - "step": 1320 - }, - { - "epoch": 0.13512140607538353, - "grad_norm": 2.171875, - "learning_rate": 4.805940783816075e-05, - "loss": 0.2059, - "step": 1330 - }, - { - "epoch": 0.13613735649700295, - "grad_norm": 2.796875, - "learning_rate": 4.8028148381402625e-05, - "loss": 0.2102, - "step": 1340 - }, - { - "epoch": 0.13715330691862238, - "grad_norm": 2.96875, - "learning_rate": 4.7996649497722084e-05, - "loss": 0.2708, - "step": 1350 - }, - { - "epoch": 0.1381692573402418, - "grad_norm": 2.4375, - "learning_rate": 4.7964911514617485e-05, - "loss": 0.2429, - "step": 1360 - }, - { - "epoch": 0.13918520776186122, - "grad_norm": 5.8125, - "learning_rate": 4.793293476207312e-05, - "loss": 0.2725, - "step": 1370 - }, - { - "epoch": 0.14020115818348064, - "grad_norm": 2.40625, - "learning_rate": 4.790071957255585e-05, - "loss": 0.2098, - "step": 1380 - }, - { - "epoch": 0.14121710860510006, - "grad_norm": 4.25, - "learning_rate": 4.786826628101154e-05, - "loss": 0.2101, - "step": 1390 - }, - { - "epoch": 0.14223305902671948, - "grad_norm": 2.578125, - "learning_rate": 4.783557522486167e-05, - "loss": 0.2624, - "step": 1400 - }, - { - "epoch": 0.14324900944833893, - "grad_norm": 3.125, - "learning_rate": 4.780264674399978e-05, - "loss": 0.2518, - "step": 1410 - }, - { - "epoch": 0.14426495986995835, - "grad_norm": 3.671875, - "learning_rate": 4.7769481180787966e-05, - "loss": 0.3112, - "step": 1420 - }, - { - "epoch": 0.14528091029157778, - "grad_norm": 3.984375, - "learning_rate": 4.773607888005327e-05, - "loss": 0.2747, - "step": 1430 - }, - { - "epoch": 0.1462968607131972, - "grad_norm": 3.234375, - "learning_rate": 4.770244018908416e-05, - "loss": 0.1572, - "step": 1440 - }, - { - "epoch": 0.14731281113481662, - "grad_norm": 4.09375, - "learning_rate": 4.766856545762687e-05, - "loss": 0.2148, - "step": 1450 - }, - { - "epoch": 0.14832876155643604, - "grad_norm": 1.6875, - "learning_rate": 4.763445503788178e-05, - "loss": 0.2531, - "step": 1460 - }, - { - "epoch": 0.14934471197805546, - "grad_norm": 2.375, - "learning_rate": 4.760010928449976e-05, - "loss": 0.199, - "step": 1470 - }, - { - "epoch": 0.15036066239967488, - "grad_norm": 4.6875, - "learning_rate": 4.7565528554578485e-05, - "loss": 0.2366, - "step": 1480 - }, - { - "epoch": 0.15137661282129433, - "grad_norm": 5.4375, - "learning_rate": 4.75307132076587e-05, - "loss": 0.1862, - "step": 1490 - }, - { - "epoch": 0.15239256324291375, - "grad_norm": 2.484375, - "learning_rate": 4.749566360572049e-05, - "loss": 0.2143, - "step": 1500 - }, - { - "epoch": 0.15340851366453317, - "grad_norm": 2.1875, - "learning_rate": 4.746038011317955e-05, - "loss": 0.1877, - "step": 1510 - }, - { - "epoch": 0.1544244640861526, - "grad_norm": 2.84375, - "learning_rate": 4.742486309688333e-05, - "loss": 0.2831, - "step": 1520 - }, - { - "epoch": 0.15544041450777202, - "grad_norm": 2.015625, - "learning_rate": 4.738911292610732e-05, - "loss": 0.1708, - "step": 1530 - }, - { - "epoch": 0.15645636492939144, - "grad_norm": 3.953125, - "learning_rate": 4.735312997255107e-05, - "loss": 0.192, - "step": 1540 - }, - { - "epoch": 0.15747231535101086, - "grad_norm": 2.09375, - "learning_rate": 4.7316914610334475e-05, - "loss": 0.2586, - "step": 1550 - }, - { - "epoch": 0.15848826577263028, - "grad_norm": 3.6875, - "learning_rate": 4.728046721599378e-05, - "loss": 0.2141, - "step": 1560 - }, - { - "epoch": 0.15950421619424973, - "grad_norm": 2.9375, - "learning_rate": 4.724378816847771e-05, - "loss": 0.193, - "step": 1570 - }, - { - "epoch": 0.16052016661586915, - "grad_norm": 1.5625, - "learning_rate": 4.720687784914352e-05, - "loss": 0.191, - "step": 1580 - }, - { - "epoch": 0.16153611703748857, - "grad_norm": 3.75, - "learning_rate": 4.716973664175304e-05, - "loss": 0.2172, - "step": 1590 - }, - { - "epoch": 0.162552067459108, - "grad_norm": 3.125, - "learning_rate": 4.7132364932468645e-05, - "loss": 0.2134, - "step": 1600 - }, - { - "epoch": 0.16356801788072742, - "grad_norm": 4.09375, - "learning_rate": 4.709476310984932e-05, - "loss": 0.2055, - "step": 1610 - }, - { - "epoch": 0.16458396830234684, - "grad_norm": 3.875, - "learning_rate": 4.705693156484652e-05, - "loss": 0.2136, - "step": 1620 - }, - { - "epoch": 0.16559991872396626, - "grad_norm": 1.1796875, - "learning_rate": 4.7018870690800196e-05, - "loss": 0.1471, - "step": 1630 - }, - { - "epoch": 0.16661586914558568, - "grad_norm": 2.5, - "learning_rate": 4.698058088343465e-05, - "loss": 0.2308, - "step": 1640 - }, - { - "epoch": 0.16763181956720513, - "grad_norm": 1.390625, - "learning_rate": 4.6942062540854425e-05, - "loss": 0.2456, - "step": 1650 - }, - { - "epoch": 0.16864776998882455, - "grad_norm": 3.125, - "learning_rate": 4.69033160635402e-05, - "loss": 0.2654, - "step": 1660 - }, - { - "epoch": 0.16966372041044397, - "grad_norm": 3.984375, - "learning_rate": 4.6864341854344587e-05, - "loss": 0.2226, - "step": 1670 - }, - { - "epoch": 0.1706796708320634, - "grad_norm": 2.328125, - "learning_rate": 4.682514031848795e-05, - "loss": 0.2438, - "step": 1680 - }, - { - "epoch": 0.17169562125368282, - "grad_norm": 3.078125, - "learning_rate": 4.678571186355423e-05, - "loss": 0.1889, - "step": 1690 - }, - { - "epoch": 0.17271157167530224, - "grad_norm": 3.328125, - "learning_rate": 4.6746056899486644e-05, - "loss": 0.2117, - "step": 1700 - }, - { - "epoch": 0.17372752209692166, - "grad_norm": 2.78125, - "learning_rate": 4.67061758385835e-05, - "loss": 0.1953, - "step": 1710 - }, - { - "epoch": 0.17474347251854108, - "grad_norm": 3.09375, - "learning_rate": 4.6666069095493816e-05, - "loss": 0.1844, - "step": 1720 - }, - { - "epoch": 0.17575942294016053, - "grad_norm": 3.234375, - "learning_rate": 4.662573708721309e-05, - "loss": 0.2774, - "step": 1730 - }, - { - "epoch": 0.17677537336177995, - "grad_norm": 4.03125, - "learning_rate": 4.658518023307894e-05, - "loss": 0.2527, - "step": 1740 - }, - { - "epoch": 0.17779132378339937, - "grad_norm": 3.21875, - "learning_rate": 4.654439895476671e-05, - "loss": 0.2164, - "step": 1750 - }, - { - "epoch": 0.1788072742050188, - "grad_norm": 2.390625, - "learning_rate": 4.6503393676285146e-05, - "loss": 0.2424, - "step": 1760 - }, - { - "epoch": 0.17982322462663822, - "grad_norm": 1.8359375, - "learning_rate": 4.646216482397192e-05, - "loss": 0.2428, - "step": 1770 - }, - { - "epoch": 0.18083917504825764, - "grad_norm": 2.796875, - "learning_rate": 4.6420712826489275e-05, - "loss": 0.2155, - "step": 1780 - }, - { - "epoch": 0.18185512546987706, - "grad_norm": 0.69921875, - "learning_rate": 4.6379038114819485e-05, - "loss": 0.1544, - "step": 1790 - }, - { - "epoch": 0.18287107589149648, - "grad_norm": 3.40625, - "learning_rate": 4.6337141122260444e-05, - "loss": 0.2029, - "step": 1800 - }, - { - "epoch": 0.18388702631311593, - "grad_norm": 2.359375, - "learning_rate": 4.629502228442112e-05, - "loss": 0.1489, - "step": 1810 - }, - { - "epoch": 0.18490297673473535, - "grad_norm": 1.4453125, - "learning_rate": 4.6252682039217045e-05, - "loss": 0.2101, - "step": 1820 - }, - { - "epoch": 0.18591892715635477, - "grad_norm": 2.71875, - "learning_rate": 4.621012082686573e-05, - "loss": 0.2076, - "step": 1830 - }, - { - "epoch": 0.1869348775779742, - "grad_norm": 3.0625, - "learning_rate": 4.616733908988216e-05, - "loss": 0.2719, - "step": 1840 - }, - { - "epoch": 0.18795082799959362, - "grad_norm": 1.953125, - "learning_rate": 4.612433727307409e-05, - "loss": 0.2105, - "step": 1850 - }, - { - "epoch": 0.18896677842121304, - "grad_norm": 3.46875, - "learning_rate": 4.608111582353751e-05, - "loss": 0.1877, - "step": 1860 - }, - { - "epoch": 0.18998272884283246, - "grad_norm": 2.546875, - "learning_rate": 4.603767519065197e-05, - "loss": 0.2238, - "step": 1870 - }, - { - "epoch": 0.19099867926445188, - "grad_norm": 1.5703125, - "learning_rate": 4.599401582607589e-05, - "loss": 0.243, - "step": 1880 - }, - { - "epoch": 0.19201462968607133, - "grad_norm": 2.4375, - "learning_rate": 4.595013818374185e-05, - "loss": 0.1867, - "step": 1890 - }, - { - "epoch": 0.19303058010769075, - "grad_norm": 2.203125, - "learning_rate": 4.5906042719851925e-05, - "loss": 0.1994, - "step": 1900 - }, - { - "epoch": 0.19404653052931017, - "grad_norm": 3.984375, - "learning_rate": 4.586172989287291e-05, - "loss": 0.1899, - "step": 1910 - }, - { - "epoch": 0.1950624809509296, - "grad_norm": 2.6875, - "learning_rate": 4.5817200163531534e-05, - "loss": 0.2528, - "step": 1920 - }, - { - "epoch": 0.19607843137254902, - "grad_norm": 2.71875, - "learning_rate": 4.577245399480972e-05, - "loss": 0.2336, - "step": 1930 - }, - { - "epoch": 0.19709438179416844, - "grad_norm": 2.640625, - "learning_rate": 4.5727491851939715e-05, - "loss": 0.2204, - "step": 1940 - }, - { - "epoch": 0.19811033221578786, - "grad_norm": 1.78125, - "learning_rate": 4.568231420239929e-05, - "loss": 0.1656, - "step": 1950 - }, - { - "epoch": 0.19912628263740728, - "grad_norm": 3.15625, - "learning_rate": 4.563692151590687e-05, - "loss": 0.2105, - "step": 1960 - }, - { - "epoch": 0.20014223305902673, - "grad_norm": 1.3671875, - "learning_rate": 4.5591314264416666e-05, - "loss": 0.1464, - "step": 1970 - }, - { - "epoch": 0.20115818348064615, - "grad_norm": 4.25, - "learning_rate": 4.554549292211371e-05, - "loss": 0.2103, - "step": 1980 - }, - { - "epoch": 0.20217413390226557, - "grad_norm": 2.984375, - "learning_rate": 4.549945796540901e-05, - "loss": 0.144, - "step": 1990 - }, - { - "epoch": 0.203190084323885, - "grad_norm": 1.859375, - "learning_rate": 4.545320987293453e-05, - "loss": 0.1963, - "step": 2000 - }, - { - "epoch": 0.20420603474550442, - "grad_norm": 1.078125, - "learning_rate": 4.540674912553824e-05, - "loss": 0.2115, - "step": 2010 - }, - { - "epoch": 0.20522198516712384, - "grad_norm": 4.25, - "learning_rate": 4.536007620627911e-05, - "loss": 0.1682, - "step": 2020 - }, - { - "epoch": 0.20623793558874326, - "grad_norm": 2.71875, - "learning_rate": 4.531319160042212e-05, - "loss": 0.1992, - "step": 2030 - }, - { - "epoch": 0.20725388601036268, - "grad_norm": 1.2890625, - "learning_rate": 4.5266095795433126e-05, - "loss": 0.1134, - "step": 2040 - }, - { - "epoch": 0.20826983643198213, - "grad_norm": 3.296875, - "learning_rate": 4.5218789280973925e-05, - "loss": 0.1474, - "step": 2050 - }, - { - "epoch": 0.20928578685360155, - "grad_norm": 1.9375, - "learning_rate": 4.5171272548897024e-05, - "loss": 0.1955, - "step": 2060 - }, - { - "epoch": 0.21030173727522097, - "grad_norm": 2.734375, - "learning_rate": 4.512354609324063e-05, - "loss": 0.2042, - "step": 2070 - }, - { - "epoch": 0.2113176876968404, - "grad_norm": 2.921875, - "learning_rate": 4.507561041022347e-05, - "loss": 0.2174, - "step": 2080 - }, - { - "epoch": 0.21233363811845982, - "grad_norm": 2.40625, - "learning_rate": 4.502746599823963e-05, - "loss": 0.2634, - "step": 2090 - }, - { - "epoch": 0.21334958854007924, - "grad_norm": 1.71875, - "learning_rate": 4.497911335785339e-05, - "loss": 0.1884, - "step": 2100 - }, - { - "epoch": 0.21436553896169866, - "grad_norm": 0.79296875, - "learning_rate": 4.4930552991794e-05, - "loss": 0.1872, - "step": 2110 - }, - { - "epoch": 0.21538148938331808, - "grad_norm": 3.171875, - "learning_rate": 4.4881785404950474e-05, - "loss": 0.2233, - "step": 2120 - }, - { - "epoch": 0.21639743980493753, - "grad_norm": 2.59375, - "learning_rate": 4.483281110436631e-05, - "loss": 0.2374, - "step": 2130 - }, - { - "epoch": 0.21741339022655695, - "grad_norm": 3.328125, - "learning_rate": 4.478363059923426e-05, - "loss": 0.2545, - "step": 2140 - }, - { - "epoch": 0.21842934064817637, - "grad_norm": 2.3125, - "learning_rate": 4.4734244400891014e-05, - "loss": 0.2063, - "step": 2150 - }, - { - "epoch": 0.2194452910697958, - "grad_norm": 3.40625, - "learning_rate": 4.4684653022811865e-05, - "loss": 0.1219, - "step": 2160 - }, - { - "epoch": 0.22046124149141522, - "grad_norm": 4.1875, - "learning_rate": 4.463485698060541e-05, - "loss": 0.2805, - "step": 2170 - }, - { - "epoch": 0.22147719191303464, - "grad_norm": 2.3125, - "learning_rate": 4.458485679200814e-05, - "loss": 0.1998, - "step": 2180 - }, - { - "epoch": 0.22249314233465406, - "grad_norm": 3.578125, - "learning_rate": 4.453465297687912e-05, - "loss": 0.2489, - "step": 2190 - }, - { - "epoch": 0.22350909275627348, - "grad_norm": 2.59375, - "learning_rate": 4.448424605719452e-05, - "loss": 0.2731, - "step": 2200 - }, - { - "epoch": 0.22452504317789293, - "grad_norm": 3.28125, - "learning_rate": 4.443363655704224e-05, - "loss": 0.2425, - "step": 2210 - }, - { - "epoch": 0.22554099359951235, - "grad_norm": 2.78125, - "learning_rate": 4.438282500261641e-05, - "loss": 0.2938, - "step": 2220 - }, - { - "epoch": 0.22655694402113177, - "grad_norm": 1.1953125, - "learning_rate": 4.433181192221197e-05, - "loss": 0.1728, - "step": 2230 - }, - { - "epoch": 0.2275728944427512, - "grad_norm": 1.34375, - "learning_rate": 4.4280597846219155e-05, - "loss": 0.216, - "step": 2240 - }, - { - "epoch": 0.22858884486437062, - "grad_norm": 1.8515625, - "learning_rate": 4.422918330711796e-05, - "loss": 0.1612, - "step": 2250 - }, - { - "epoch": 0.22960479528599004, - "grad_norm": 1.90625, - "learning_rate": 4.417756883947263e-05, - "loss": 0.107, - "step": 2260 - }, - { - "epoch": 0.23062074570760946, - "grad_norm": 3.375, - "learning_rate": 4.412575497992611e-05, - "loss": 0.1756, - "step": 2270 - }, - { - "epoch": 0.23163669612922888, - "grad_norm": 4.375, - "learning_rate": 4.407374226719445e-05, - "loss": 0.234, - "step": 2280 - }, - { - "epoch": 0.23265264655084833, - "grad_norm": 3.25, - "learning_rate": 4.402153124206119e-05, - "loss": 0.2144, - "step": 2290 - }, - { - "epoch": 0.23366859697246775, - "grad_norm": 1.703125, - "learning_rate": 4.396912244737173e-05, - "loss": 0.1696, - "step": 2300 - }, - { - "epoch": 0.23468454739408717, - "grad_norm": 2.84375, - "learning_rate": 4.391651642802778e-05, - "loss": 0.2506, - "step": 2310 - }, - { - "epoch": 0.2357004978157066, - "grad_norm": 4.5, - "learning_rate": 4.386371373098155e-05, - "loss": 0.1686, - "step": 2320 - }, - { - "epoch": 0.23671644823732602, - "grad_norm": 2.515625, - "learning_rate": 4.381071490523018e-05, - "loss": 0.2403, - "step": 2330 - }, - { - "epoch": 0.23773239865894544, - "grad_norm": 4.4375, - "learning_rate": 4.3757520501809955e-05, - "loss": 0.1611, - "step": 2340 - }, - { - "epoch": 0.23874834908056486, - "grad_norm": 1.609375, - "learning_rate": 4.370413107379065e-05, - "loss": 0.1698, - "step": 2350 - }, - { - "epoch": 0.23976429950218428, - "grad_norm": 4.96875, - "learning_rate": 4.36505471762697e-05, - "loss": 0.1928, - "step": 2360 - }, - { - "epoch": 0.24078024992380373, - "grad_norm": 0.8984375, - "learning_rate": 4.3596769366366474e-05, - "loss": 0.2035, - "step": 2370 - }, - { - "epoch": 0.24179620034542315, - "grad_norm": 5.75, - "learning_rate": 4.354279820321649e-05, - "loss": 0.16, - "step": 2380 - }, - { - "epoch": 0.24281215076704257, - "grad_norm": 1.9453125, - "learning_rate": 4.34886342479656e-05, - "loss": 0.1851, - "step": 2390 - }, - { - "epoch": 0.243828101188662, - "grad_norm": 1.015625, - "learning_rate": 4.34342780637641e-05, - "loss": 0.1726, - "step": 2400 - }, - { - "epoch": 0.24484405161028142, - "grad_norm": 4.59375, - "learning_rate": 4.337973021576095e-05, - "loss": 0.2847, - "step": 2410 - }, - { - "epoch": 0.24586000203190084, - "grad_norm": 1.03125, - "learning_rate": 4.3324991271097846e-05, - "loss": 0.2528, - "step": 2420 - }, - { - "epoch": 0.24687595245352026, - "grad_norm": 2.1875, - "learning_rate": 4.3270061798903374e-05, - "loss": 0.1573, - "step": 2430 - }, - { - "epoch": 0.24789190287513968, - "grad_norm": 0.98046875, - "learning_rate": 4.321494237028701e-05, - "loss": 0.1703, - "step": 2440 - }, - { - "epoch": 0.24890785329675913, - "grad_norm": 3.8125, - "learning_rate": 4.31596335583333e-05, - "loss": 0.2613, - "step": 2450 - }, - { - "epoch": 0.24992380371837855, - "grad_norm": 4.0625, - "learning_rate": 4.310413593809579e-05, - "loss": 0.22, - "step": 2460 - }, - { - "epoch": 0.250939754139998, - "grad_norm": 3.15625, - "learning_rate": 4.304845008659108e-05, - "loss": 0.1263, - "step": 2470 - }, - { - "epoch": 0.25195570456161737, - "grad_norm": 3.046875, - "learning_rate": 4.2992576582792895e-05, - "loss": 0.1639, - "step": 2480 - }, - { - "epoch": 0.2529716549832368, - "grad_norm": 9.8125, - "learning_rate": 4.293651600762595e-05, - "loss": 0.2681, - "step": 2490 - }, - { - "epoch": 0.25398760540485626, - "grad_norm": 3.734375, - "learning_rate": 4.288026894395999e-05, - "loss": 0.2292, - "step": 2500 - } - ], - "logging_steps": 10, - "max_steps": 9843, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "total_flos": 0.0, - "train_batch_size": 7, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoints/checkpoint-2500/training_args.bin b/checkpoints/checkpoint-2500/training_args.bin deleted file mode 100644 index 6bb0a4ec10ab29ca1942aecdfa4212d352e373f1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-2500/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 -size 4984 diff --git a/checkpoints/checkpoint-3000/.DS_Store b/checkpoints/checkpoint-3000/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 Binary files a/checkpoints/checkpoint-3000/.DS_Store and /dev/null differ diff --git a/checkpoints/checkpoint-3000/adapter_config.json b/checkpoints/checkpoint-3000/adapter_config.json deleted file mode 100644 index 2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-3000/adapter_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "alpha_pattern": {}, - "auto_mapping": null, - "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": "gaussian", - "layer_replication": null, - "layers_pattern": null, - "layers_to_transform": null, - "loftq_config": {}, - "lora_alpha": 32, - "lora_dropout": 0.1, - "megatron_config": null, - "megatron_core": "megatron.core", - "modules_to_save": null, - "peft_type": "LORA", - "r": 8, - "rank_pattern": {}, - "revision": null, - "target_modules": [ - "v_proj", - "q_proj" - ], - "task_type": "CAUSAL_LM", - "use_dora": false, - "use_rslora": false -} \ No newline at end of file diff --git a/checkpoints/checkpoint-3000/adapter_model.safetensors b/checkpoints/checkpoint-3000/adapter_model.safetensors deleted file mode 100644 index f00b979e893c419aa35f668c6083983f38a702aa..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-3000/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:24a237f0667aea50351b9a62bb2ddead2e09a567815c526d250be96397fa3798 -size 5919456 diff --git a/checkpoints/checkpoint-3000/optimizer.pt b/checkpoints/checkpoint-3000/optimizer.pt deleted file mode 100644 index 5dfd65a95d4b75ec54a7c63d0fd72ce2529b8de1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-3000/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cb7f56c657475efd0fee1d492240e0c5e5d10ed8ec46630177e04192f2950d28 -size 11930938 diff --git a/checkpoints/checkpoint-3000/rng_state_0.pth b/checkpoints/checkpoint-3000/rng_state_0.pth deleted file mode 100644 index fd0da84997e5db079fe7df2a31aeef3efb336831..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-3000/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:75565d94ecdf86007209f425495e6767052cfd0684203dcaf72c23b7c2dc2740 -size 15024 diff --git a/checkpoints/checkpoint-3000/rng_state_1.pth b/checkpoints/checkpoint-3000/rng_state_1.pth deleted file mode 100644 index 9582aaaef271b2855ca000b54510b991df4f866d..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-3000/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9f986f3238448cdfd898312547f70e6317c7c4c40d4f49e7782f5380769c5f64 -size 15024 diff --git a/checkpoints/checkpoint-3000/rng_state_2.pth b/checkpoints/checkpoint-3000/rng_state_2.pth deleted file mode 100644 index 0673c505e8967506093c68f5b6c6140a292635b5..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-3000/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:113f522722f00018489ce132b51c24506ef9b8b849c0c2a783913565b35f6cba -size 15024 diff --git a/checkpoints/checkpoint-3000/rng_state_3.pth b/checkpoints/checkpoint-3000/rng_state_3.pth deleted file mode 100644 index af1dca7b2f752be096e9e7b1b84f0d1897e9f780..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-3000/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1134f64cfe9f598e01f68bfb6bc3e69460c4b0c7ef7617bf88d7f539babada99 -size 15024 diff --git a/checkpoints/checkpoint-3000/scheduler.pt b/checkpoints/checkpoint-3000/scheduler.pt deleted file mode 100644 index 78eed6ed36accb49b60ba0c6db316b6bf427c96b..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-3000/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4c5a5e7ee1063db317ee978d346cf3b726f5d9858cb8bd50568280ada9f34910 -size 1064 diff --git a/checkpoints/checkpoint-3000/trainer_state.json b/checkpoints/checkpoint-3000/trainer_state.json deleted file mode 100644 index b813032acb2fc29774636a82567e34aeaaddb1b4..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-3000/trainer_state.json +++ /dev/null @@ -1,2121 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.3047851264858275, - "eval_steps": 500, - "global_step": 3000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001015950421619425, - "grad_norm": 17.625, - "learning_rate": 5e-06, - "loss": 3.4264, - "step": 10 - }, - { - "epoch": 0.00203190084323885, - "grad_norm": 12.5625, - "learning_rate": 1e-05, - "loss": 3.432, - "step": 20 - }, - { - "epoch": 0.003047851264858275, - "grad_norm": 14.0625, - "learning_rate": 1.5e-05, - "loss": 3.23, - "step": 30 - }, - { - "epoch": 0.0040638016864777, - "grad_norm": 12.4375, - "learning_rate": 2e-05, - "loss": 2.9762, - "step": 40 - }, - { - "epoch": 0.005079752108097125, - "grad_norm": 10.0625, - "learning_rate": 2.5e-05, - "loss": 2.6173, - "step": 50 - }, - { - "epoch": 0.00609570252971655, - "grad_norm": 10.1875, - "learning_rate": 3e-05, - "loss": 2.2004, - "step": 60 - }, - { - "epoch": 0.007111652951335975, - "grad_norm": 7.03125, - "learning_rate": 3.5e-05, - "loss": 1.4176, - "step": 70 - }, - { - "epoch": 0.0081276033729554, - "grad_norm": 4.375, - "learning_rate": 4e-05, - "loss": 1.0122, - "step": 80 - }, - { - "epoch": 0.009143553794574825, - "grad_norm": 6.5625, - "learning_rate": 4.5e-05, - "loss": 0.9116, - "step": 90 - }, - { - "epoch": 0.01015950421619425, - "grad_norm": 5.28125, - "learning_rate": 5e-05, - "loss": 0.6832, - "step": 100 - }, - { - "epoch": 0.011175454637813675, - "grad_norm": 5.5, - "learning_rate": 4.9999870035728426e-05, - "loss": 0.7355, - "step": 110 - }, - { - "epoch": 0.0121914050594331, - "grad_norm": 5.1875, - "learning_rate": 4.9999480144264944e-05, - "loss": 0.6673, - "step": 120 - }, - { - "epoch": 0.013207355481052525, - "grad_norm": 4.5, - "learning_rate": 4.9998830329663314e-05, - "loss": 0.6792, - "step": 130 - }, - { - "epoch": 0.01422330590267195, - "grad_norm": 3.9375, - "learning_rate": 4.9997920598679756e-05, - "loss": 0.6207, - "step": 140 - }, - { - "epoch": 0.015239256324291375, - "grad_norm": 3.15625, - "learning_rate": 4.999675096077286e-05, - "loss": 0.483, - "step": 150 - }, - { - "epoch": 0.0162552067459108, - "grad_norm": 5.28125, - "learning_rate": 4.999532142810354e-05, - "loss": 0.5319, - "step": 160 - }, - { - "epoch": 0.017271157167530225, - "grad_norm": 4.59375, - "learning_rate": 4.999363201553483e-05, - "loss": 0.6052, - "step": 170 - }, - { - "epoch": 0.01828710758914965, - "grad_norm": 5.03125, - "learning_rate": 4.9991682740631794e-05, - "loss": 0.4258, - "step": 180 - }, - { - "epoch": 0.019303058010769075, - "grad_norm": 3.859375, - "learning_rate": 4.998947362366133e-05, - "loss": 0.4309, - "step": 190 - }, - { - "epoch": 0.0203190084323885, - "grad_norm": 3.328125, - "learning_rate": 4.998700468759193e-05, - "loss": 0.3957, - "step": 200 - }, - { - "epoch": 0.021334958854007924, - "grad_norm": 4.9375, - "learning_rate": 4.9984275958093475e-05, - "loss": 0.4777, - "step": 210 - }, - { - "epoch": 0.02235090927562735, - "grad_norm": 4.78125, - "learning_rate": 4.998128746353695e-05, - "loss": 0.3549, - "step": 220 - }, - { - "epoch": 0.023366859697246774, - "grad_norm": 4.0625, - "learning_rate": 4.997803923499417e-05, - "loss": 0.4447, - "step": 230 - }, - { - "epoch": 0.0243828101188662, - "grad_norm": 6.375, - "learning_rate": 4.99745313062374e-05, - "loss": 0.3808, - "step": 240 - }, - { - "epoch": 0.025398760540485624, - "grad_norm": 3.59375, - "learning_rate": 4.99707637137391e-05, - "loss": 0.3827, - "step": 250 - }, - { - "epoch": 0.02641471096210505, - "grad_norm": 3.015625, - "learning_rate": 4.996673649667145e-05, - "loss": 0.3694, - "step": 260 - }, - { - "epoch": 0.027430661383724474, - "grad_norm": 2.296875, - "learning_rate": 4.9962449696906e-05, - "loss": 0.3586, - "step": 270 - }, - { - "epoch": 0.0284466118053439, - "grad_norm": 4.125, - "learning_rate": 4.9957903359013214e-05, - "loss": 0.3832, - "step": 280 - }, - { - "epoch": 0.029462562226963324, - "grad_norm": 3.296875, - "learning_rate": 4.995309753026201e-05, - "loss": 0.328, - "step": 290 - }, - { - "epoch": 0.03047851264858275, - "grad_norm": 4.5, - "learning_rate": 4.994803226061927e-05, - "loss": 0.3667, - "step": 300 - }, - { - "epoch": 0.03149446307020217, - "grad_norm": 4.3125, - "learning_rate": 4.994270760274933e-05, - "loss": 0.3811, - "step": 310 - }, - { - "epoch": 0.0325104134918216, - "grad_norm": 3.421875, - "learning_rate": 4.99371236120134e-05, - "loss": 0.3065, - "step": 320 - }, - { - "epoch": 0.03352636391344102, - "grad_norm": 4.6875, - "learning_rate": 4.993128034646902e-05, - "loss": 0.4177, - "step": 330 - }, - { - "epoch": 0.03454231433506045, - "grad_norm": 3.046875, - "learning_rate": 4.992517786686947e-05, - "loss": 0.33, - "step": 340 - }, - { - "epoch": 0.03555826475667987, - "grad_norm": 1.8828125, - "learning_rate": 4.9918816236663077e-05, - "loss": 0.3287, - "step": 350 - }, - { - "epoch": 0.0365742151782993, - "grad_norm": 3.8125, - "learning_rate": 4.991219552199262e-05, - "loss": 0.2934, - "step": 360 - }, - { - "epoch": 0.03759016559991872, - "grad_norm": 4.28125, - "learning_rate": 4.99053157916946e-05, - "loss": 0.3176, - "step": 370 - }, - { - "epoch": 0.03860611602153815, - "grad_norm": 2.609375, - "learning_rate": 4.989817711729856e-05, - "loss": 0.3318, - "step": 380 - }, - { - "epoch": 0.03962206644315757, - "grad_norm": 2.375, - "learning_rate": 4.98907795730263e-05, - "loss": 0.3234, - "step": 390 - }, - { - "epoch": 0.040638016864777, - "grad_norm": 4.46875, - "learning_rate": 4.988312323579114e-05, - "loss": 0.267, - "step": 400 - }, - { - "epoch": 0.04165396728639642, - "grad_norm": 3.75, - "learning_rate": 4.98752081851971e-05, - "loss": 0.3081, - "step": 410 - }, - { - "epoch": 0.04266991770801585, - "grad_norm": 2.203125, - "learning_rate": 4.986703450353809e-05, - "loss": 0.2917, - "step": 420 - }, - { - "epoch": 0.04368586812963527, - "grad_norm": 1.6015625, - "learning_rate": 4.985860227579703e-05, - "loss": 0.2805, - "step": 430 - }, - { - "epoch": 0.0447018185512547, - "grad_norm": 3.140625, - "learning_rate": 4.984991158964499e-05, - "loss": 0.3534, - "step": 440 - }, - { - "epoch": 0.04571776897287412, - "grad_norm": 3.296875, - "learning_rate": 4.9840962535440265e-05, - "loss": 0.335, - "step": 450 - }, - { - "epoch": 0.04673371939449355, - "grad_norm": 3.25, - "learning_rate": 4.983175520622744e-05, - "loss": 0.2544, - "step": 460 - }, - { - "epoch": 0.04774966981611297, - "grad_norm": 2.25, - "learning_rate": 4.982228969773642e-05, - "loss": 0.3449, - "step": 470 - }, - { - "epoch": 0.0487656202377324, - "grad_norm": 4.9375, - "learning_rate": 4.9812566108381435e-05, - "loss": 0.2964, - "step": 480 - }, - { - "epoch": 0.04978157065935182, - "grad_norm": 1.5703125, - "learning_rate": 4.9802584539260035e-05, - "loss": 0.2799, - "step": 490 - }, - { - "epoch": 0.05079752108097125, - "grad_norm": 2.828125, - "learning_rate": 4.979234509415199e-05, - "loss": 0.3231, - "step": 500 - }, - { - "epoch": 0.05181347150259067, - "grad_norm": 2.9375, - "learning_rate": 4.978184787951828e-05, - "loss": 0.2943, - "step": 510 - }, - { - "epoch": 0.0528294219242101, - "grad_norm": 2.34375, - "learning_rate": 4.977109300449992e-05, - "loss": 0.2705, - "step": 520 - }, - { - "epoch": 0.05384537234582952, - "grad_norm": 3.140625, - "learning_rate": 4.9760080580916876e-05, - "loss": 0.2998, - "step": 530 - }, - { - "epoch": 0.05486132276744895, - "grad_norm": 3.5625, - "learning_rate": 4.974881072326688e-05, - "loss": 0.2595, - "step": 540 - }, - { - "epoch": 0.05587727318906837, - "grad_norm": 4.25, - "learning_rate": 4.9737283548724236e-05, - "loss": 0.2803, - "step": 550 - }, - { - "epoch": 0.0568932236106878, - "grad_norm": 4.0625, - "learning_rate": 4.97254991771386e-05, - "loss": 0.3511, - "step": 560 - }, - { - "epoch": 0.05790917403230722, - "grad_norm": 2.515625, - "learning_rate": 4.971345773103377e-05, - "loss": 0.312, - "step": 570 - }, - { - "epoch": 0.05892512445392665, - "grad_norm": 3.21875, - "learning_rate": 4.9701159335606365e-05, - "loss": 0.2482, - "step": 580 - }, - { - "epoch": 0.05994107487554607, - "grad_norm": 5.5, - "learning_rate": 4.968860411872454e-05, - "loss": 0.2537, - "step": 590 - }, - { - "epoch": 0.0609570252971655, - "grad_norm": 3.546875, - "learning_rate": 4.967579221092666e-05, - "loss": 0.3125, - "step": 600 - }, - { - "epoch": 0.06197297571878492, - "grad_norm": 2.984375, - "learning_rate": 4.966272374541996e-05, - "loss": 0.2354, - "step": 610 - }, - { - "epoch": 0.06298892614040434, - "grad_norm": 3.6875, - "learning_rate": 4.964939885807912e-05, - "loss": 0.3213, - "step": 620 - }, - { - "epoch": 0.06400487656202378, - "grad_norm": 2.140625, - "learning_rate": 4.9635817687444876e-05, - "loss": 0.3003, - "step": 630 - }, - { - "epoch": 0.0650208269836432, - "grad_norm": 3.484375, - "learning_rate": 4.962198037472259e-05, - "loss": 0.2996, - "step": 640 - }, - { - "epoch": 0.06603677740526262, - "grad_norm": 3.21875, - "learning_rate": 4.9607887063780776e-05, - "loss": 0.2257, - "step": 650 - }, - { - "epoch": 0.06705272782688204, - "grad_norm": 5.375, - "learning_rate": 4.9593537901149564e-05, - "loss": 0.223, - "step": 660 - }, - { - "epoch": 0.06806867824850148, - "grad_norm": 4.1875, - "learning_rate": 4.957893303601924e-05, - "loss": 0.3407, - "step": 670 - }, - { - "epoch": 0.0690846286701209, - "grad_norm": 3.328125, - "learning_rate": 4.956407262023866e-05, - "loss": 0.2589, - "step": 680 - }, - { - "epoch": 0.07010057909174032, - "grad_norm": 2.953125, - "learning_rate": 4.954895680831367e-05, - "loss": 0.2949, - "step": 690 - }, - { - "epoch": 0.07111652951335974, - "grad_norm": 4.0625, - "learning_rate": 4.9533585757405506e-05, - "loss": 0.2995, - "step": 700 - }, - { - "epoch": 0.07213247993497918, - "grad_norm": 4.625, - "learning_rate": 4.951795962732917e-05, - "loss": 0.2894, - "step": 710 - }, - { - "epoch": 0.0731484303565986, - "grad_norm": 3.0, - "learning_rate": 4.9502078580551755e-05, - "loss": 0.3082, - "step": 720 - }, - { - "epoch": 0.07416438077821802, - "grad_norm": 3.65625, - "learning_rate": 4.9485942782190734e-05, - "loss": 0.2308, - "step": 730 - }, - { - "epoch": 0.07518033119983744, - "grad_norm": 4.78125, - "learning_rate": 4.9469552400012306e-05, - "loss": 0.2272, - "step": 740 - }, - { - "epoch": 0.07619628162145688, - "grad_norm": 4.25, - "learning_rate": 4.94529076044296e-05, - "loss": 0.2701, - "step": 750 - }, - { - "epoch": 0.0772122320430763, - "grad_norm": 3.140625, - "learning_rate": 4.94360085685009e-05, - "loss": 0.2686, - "step": 760 - }, - { - "epoch": 0.07822818246469572, - "grad_norm": 0.765625, - "learning_rate": 4.9418855467927894e-05, - "loss": 0.2051, - "step": 770 - }, - { - "epoch": 0.07924413288631514, - "grad_norm": 1.796875, - "learning_rate": 4.940144848105379e-05, - "loss": 0.2267, - "step": 780 - }, - { - "epoch": 0.08026008330793458, - "grad_norm": 4.5625, - "learning_rate": 4.93837877888615e-05, - "loss": 0.2597, - "step": 790 - }, - { - "epoch": 0.081276033729554, - "grad_norm": 3.03125, - "learning_rate": 4.9365873574971745e-05, - "loss": 0.3701, - "step": 800 - }, - { - "epoch": 0.08229198415117342, - "grad_norm": 4.5625, - "learning_rate": 4.9347706025641136e-05, - "loss": 0.2559, - "step": 810 - }, - { - "epoch": 0.08330793457279284, - "grad_norm": 3.90625, - "learning_rate": 4.9329285329760275e-05, - "loss": 0.2799, - "step": 820 - }, - { - "epoch": 0.08432388499441228, - "grad_norm": 3.140625, - "learning_rate": 4.9310611678851735e-05, - "loss": 0.2866, - "step": 830 - }, - { - "epoch": 0.0853398354160317, - "grad_norm": 2.46875, - "learning_rate": 4.929168526706811e-05, - "loss": 0.3105, - "step": 840 - }, - { - "epoch": 0.08635578583765112, - "grad_norm": 13.625, - "learning_rate": 4.927250629119e-05, - "loss": 0.2454, - "step": 850 - }, - { - "epoch": 0.08737173625927054, - "grad_norm": 3.921875, - "learning_rate": 4.9253074950623925e-05, - "loss": 0.2424, - "step": 860 - }, - { - "epoch": 0.08838768668088998, - "grad_norm": 2.90625, - "learning_rate": 4.9233391447400286e-05, - "loss": 0.2481, - "step": 870 - }, - { - "epoch": 0.0894036371025094, - "grad_norm": 2.96875, - "learning_rate": 4.921345598617125e-05, - "loss": 0.2231, - "step": 880 - }, - { - "epoch": 0.09041958752412882, - "grad_norm": 5.375, - "learning_rate": 4.9193268774208654e-05, - "loss": 0.3447, - "step": 890 - }, - { - "epoch": 0.09143553794574824, - "grad_norm": 2.0, - "learning_rate": 4.9172830021401785e-05, - "loss": 0.229, - "step": 900 - }, - { - "epoch": 0.09245148836736768, - "grad_norm": 3.1875, - "learning_rate": 4.9152139940255245e-05, - "loss": 0.2122, - "step": 910 - }, - { - "epoch": 0.0934674387889871, - "grad_norm": 3.40625, - "learning_rate": 4.913119874588677e-05, - "loss": 0.2386, - "step": 920 - }, - { - "epoch": 0.09448338921060652, - "grad_norm": 1.4609375, - "learning_rate": 4.911000665602489e-05, - "loss": 0.1944, - "step": 930 - }, - { - "epoch": 0.09549933963222594, - "grad_norm": 5.0625, - "learning_rate": 4.9088563891006786e-05, - "loss": 0.2038, - "step": 940 - }, - { - "epoch": 0.09651529005384538, - "grad_norm": 4.53125, - "learning_rate": 4.906687067377592e-05, - "loss": 0.3122, - "step": 950 - }, - { - "epoch": 0.0975312404754648, - "grad_norm": 2.84375, - "learning_rate": 4.904492722987976e-05, - "loss": 0.3157, - "step": 960 - }, - { - "epoch": 0.09854719089708422, - "grad_norm": 2.171875, - "learning_rate": 4.902273378746738e-05, - "loss": 0.3077, - "step": 970 - }, - { - "epoch": 0.09956314131870364, - "grad_norm": 2.84375, - "learning_rate": 4.9000290577287165e-05, - "loss": 0.2756, - "step": 980 - }, - { - "epoch": 0.10057909174032308, - "grad_norm": 0.99609375, - "learning_rate": 4.897759783268434e-05, - "loss": 0.2915, - "step": 990 - }, - { - "epoch": 0.1015950421619425, - "grad_norm": 3.53125, - "learning_rate": 4.895465578959859e-05, - "loss": 0.2052, - "step": 1000 - }, - { - "epoch": 0.10261099258356192, - "grad_norm": 4.0, - "learning_rate": 4.893146468656159e-05, - "loss": 0.2499, - "step": 1010 - }, - { - "epoch": 0.10362694300518134, - "grad_norm": 1.65625, - "learning_rate": 4.890802476469452e-05, - "loss": 0.278, - "step": 1020 - }, - { - "epoch": 0.10464289342680078, - "grad_norm": 3.625, - "learning_rate": 4.888433626770558e-05, - "loss": 0.2143, - "step": 1030 - }, - { - "epoch": 0.1056588438484202, - "grad_norm": 5.0625, - "learning_rate": 4.886039944188741e-05, - "loss": 0.2878, - "step": 1040 - }, - { - "epoch": 0.10667479427003962, - "grad_norm": 4.5, - "learning_rate": 4.883621453611461e-05, - "loss": 0.2744, - "step": 1050 - }, - { - "epoch": 0.10769074469165904, - "grad_norm": 4.5625, - "learning_rate": 4.881178180184106e-05, - "loss": 0.2734, - "step": 1060 - }, - { - "epoch": 0.10870669511327848, - "grad_norm": 3.125, - "learning_rate": 4.878710149309735e-05, - "loss": 0.3574, - "step": 1070 - }, - { - "epoch": 0.1097226455348979, - "grad_norm": 3.0625, - "learning_rate": 4.876217386648816e-05, - "loss": 0.2625, - "step": 1080 - }, - { - "epoch": 0.11073859595651732, - "grad_norm": 4.0625, - "learning_rate": 4.873699918118955e-05, - "loss": 0.2437, - "step": 1090 - }, - { - "epoch": 0.11175454637813674, - "grad_norm": 1.59375, - "learning_rate": 4.87115776989463e-05, - "loss": 0.2051, - "step": 1100 - }, - { - "epoch": 0.11277049679975618, - "grad_norm": 4.375, - "learning_rate": 4.8685909684069153e-05, - "loss": 0.1727, - "step": 1110 - }, - { - "epoch": 0.1137864472213756, - "grad_norm": 2.28125, - "learning_rate": 4.865999540343211e-05, - "loss": 0.2256, - "step": 1120 - }, - { - "epoch": 0.11480239764299502, - "grad_norm": 2.265625, - "learning_rate": 4.86338351264696e-05, - "loss": 0.3529, - "step": 1130 - }, - { - "epoch": 0.11581834806461444, - "grad_norm": 2.34375, - "learning_rate": 4.8607429125173754e-05, - "loss": 0.2113, - "step": 1140 - }, - { - "epoch": 0.11683429848623388, - "grad_norm": 0.7578125, - "learning_rate": 4.858077767409149e-05, - "loss": 0.2759, - "step": 1150 - }, - { - "epoch": 0.1178502489078533, - "grad_norm": 3.640625, - "learning_rate": 4.855388105032174e-05, - "loss": 0.2482, - "step": 1160 - }, - { - "epoch": 0.11886619932947272, - "grad_norm": 3.5, - "learning_rate": 4.852673953351249e-05, - "loss": 0.1865, - "step": 1170 - }, - { - "epoch": 0.11988214975109214, - "grad_norm": 3.75, - "learning_rate": 4.849935340585796e-05, - "loss": 0.2659, - "step": 1180 - }, - { - "epoch": 0.12089810017271158, - "grad_norm": 3.375, - "learning_rate": 4.8471722952095586e-05, - "loss": 0.1506, - "step": 1190 - }, - { - "epoch": 0.121914050594331, - "grad_norm": 3.34375, - "learning_rate": 4.844384845950312e-05, - "loss": 0.307, - "step": 1200 - }, - { - "epoch": 0.12293000101595042, - "grad_norm": 1.578125, - "learning_rate": 4.841573021789561e-05, - "loss": 0.1952, - "step": 1210 - }, - { - "epoch": 0.12394595143756984, - "grad_norm": 1.2890625, - "learning_rate": 4.838736851962239e-05, - "loss": 0.1779, - "step": 1220 - }, - { - "epoch": 0.12496190185918928, - "grad_norm": 1.265625, - "learning_rate": 4.835876365956408e-05, - "loss": 0.1235, - "step": 1230 - }, - { - "epoch": 0.12597785228080868, - "grad_norm": 1.9609375, - "learning_rate": 4.8329915935129436e-05, - "loss": 0.1876, - "step": 1240 - }, - { - "epoch": 0.12699380270242813, - "grad_norm": 1.6328125, - "learning_rate": 4.830082564625235e-05, - "loss": 0.2188, - "step": 1250 - }, - { - "epoch": 0.12800975312404755, - "grad_norm": 3.96875, - "learning_rate": 4.8271493095388684e-05, - "loss": 0.2622, - "step": 1260 - }, - { - "epoch": 0.12902570354566698, - "grad_norm": 3.765625, - "learning_rate": 4.824191858751312e-05, - "loss": 0.2724, - "step": 1270 - }, - { - "epoch": 0.1300416539672864, - "grad_norm": 5.59375, - "learning_rate": 4.821210243011601e-05, - "loss": 0.2413, - "step": 1280 - }, - { - "epoch": 0.13105760438890582, - "grad_norm": 3.34375, - "learning_rate": 4.818204493320016e-05, - "loss": 0.2618, - "step": 1290 - }, - { - "epoch": 0.13207355481052524, - "grad_norm": 2.78125, - "learning_rate": 4.8151746409277634e-05, - "loss": 0.2295, - "step": 1300 - }, - { - "epoch": 0.13308950523214466, - "grad_norm": 3.1875, - "learning_rate": 4.8121207173366484e-05, - "loss": 0.2733, - "step": 1310 - }, - { - "epoch": 0.13410545565376408, - "grad_norm": 2.28125, - "learning_rate": 4.809042754298746e-05, - "loss": 0.2311, - "step": 1320 - }, - { - "epoch": 0.13512140607538353, - "grad_norm": 2.171875, - "learning_rate": 4.805940783816075e-05, - "loss": 0.2059, - "step": 1330 - }, - { - "epoch": 0.13613735649700295, - "grad_norm": 2.796875, - "learning_rate": 4.8028148381402625e-05, - "loss": 0.2102, - "step": 1340 - }, - { - "epoch": 0.13715330691862238, - "grad_norm": 2.96875, - "learning_rate": 4.7996649497722084e-05, - "loss": 0.2708, - "step": 1350 - }, - { - "epoch": 0.1381692573402418, - "grad_norm": 2.4375, - "learning_rate": 4.7964911514617485e-05, - "loss": 0.2429, - "step": 1360 - }, - { - "epoch": 0.13918520776186122, - "grad_norm": 5.8125, - "learning_rate": 4.793293476207312e-05, - "loss": 0.2725, - "step": 1370 - }, - { - "epoch": 0.14020115818348064, - "grad_norm": 2.40625, - "learning_rate": 4.790071957255585e-05, - "loss": 0.2098, - "step": 1380 - }, - { - "epoch": 0.14121710860510006, - "grad_norm": 4.25, - "learning_rate": 4.786826628101154e-05, - "loss": 0.2101, - "step": 1390 - }, - { - "epoch": 0.14223305902671948, - "grad_norm": 2.578125, - "learning_rate": 4.783557522486167e-05, - "loss": 0.2624, - "step": 1400 - }, - { - "epoch": 0.14324900944833893, - "grad_norm": 3.125, - "learning_rate": 4.780264674399978e-05, - "loss": 0.2518, - "step": 1410 - }, - { - "epoch": 0.14426495986995835, - "grad_norm": 3.671875, - "learning_rate": 4.7769481180787966e-05, - "loss": 0.3112, - "step": 1420 - }, - { - "epoch": 0.14528091029157778, - "grad_norm": 3.984375, - "learning_rate": 4.773607888005327e-05, - "loss": 0.2747, - "step": 1430 - }, - { - "epoch": 0.1462968607131972, - "grad_norm": 3.234375, - "learning_rate": 4.770244018908416e-05, - "loss": 0.1572, - "step": 1440 - }, - { - "epoch": 0.14731281113481662, - "grad_norm": 4.09375, - "learning_rate": 4.766856545762687e-05, - "loss": 0.2148, - "step": 1450 - }, - { - "epoch": 0.14832876155643604, - "grad_norm": 1.6875, - "learning_rate": 4.763445503788178e-05, - "loss": 0.2531, - "step": 1460 - }, - { - "epoch": 0.14934471197805546, - "grad_norm": 2.375, - "learning_rate": 4.760010928449976e-05, - "loss": 0.199, - "step": 1470 - }, - { - "epoch": 0.15036066239967488, - "grad_norm": 4.6875, - "learning_rate": 4.7565528554578485e-05, - "loss": 0.2366, - "step": 1480 - }, - { - "epoch": 0.15137661282129433, - "grad_norm": 5.4375, - "learning_rate": 4.75307132076587e-05, - "loss": 0.1862, - "step": 1490 - }, - { - "epoch": 0.15239256324291375, - "grad_norm": 2.484375, - "learning_rate": 4.749566360572049e-05, - "loss": 0.2143, - "step": 1500 - }, - { - "epoch": 0.15340851366453317, - "grad_norm": 2.1875, - "learning_rate": 4.746038011317955e-05, - "loss": 0.1877, - "step": 1510 - }, - { - "epoch": 0.1544244640861526, - "grad_norm": 2.84375, - "learning_rate": 4.742486309688333e-05, - "loss": 0.2831, - "step": 1520 - }, - { - "epoch": 0.15544041450777202, - "grad_norm": 2.015625, - "learning_rate": 4.738911292610732e-05, - "loss": 0.1708, - "step": 1530 - }, - { - "epoch": 0.15645636492939144, - "grad_norm": 3.953125, - "learning_rate": 4.735312997255107e-05, - "loss": 0.192, - "step": 1540 - }, - { - "epoch": 0.15747231535101086, - "grad_norm": 2.09375, - "learning_rate": 4.7316914610334475e-05, - "loss": 0.2586, - "step": 1550 - }, - { - "epoch": 0.15848826577263028, - "grad_norm": 3.6875, - "learning_rate": 4.728046721599378e-05, - "loss": 0.2141, - "step": 1560 - }, - { - "epoch": 0.15950421619424973, - "grad_norm": 2.9375, - "learning_rate": 4.724378816847771e-05, - "loss": 0.193, - "step": 1570 - }, - { - "epoch": 0.16052016661586915, - "grad_norm": 1.5625, - "learning_rate": 4.720687784914352e-05, - "loss": 0.191, - "step": 1580 - }, - { - "epoch": 0.16153611703748857, - "grad_norm": 3.75, - "learning_rate": 4.716973664175304e-05, - "loss": 0.2172, - "step": 1590 - }, - { - "epoch": 0.162552067459108, - "grad_norm": 3.125, - "learning_rate": 4.7132364932468645e-05, - "loss": 0.2134, - "step": 1600 - }, - { - "epoch": 0.16356801788072742, - "grad_norm": 4.09375, - "learning_rate": 4.709476310984932e-05, - "loss": 0.2055, - "step": 1610 - }, - { - "epoch": 0.16458396830234684, - "grad_norm": 3.875, - "learning_rate": 4.705693156484652e-05, - "loss": 0.2136, - "step": 1620 - }, - { - "epoch": 0.16559991872396626, - "grad_norm": 1.1796875, - "learning_rate": 4.7018870690800196e-05, - "loss": 0.1471, - "step": 1630 - }, - { - "epoch": 0.16661586914558568, - "grad_norm": 2.5, - "learning_rate": 4.698058088343465e-05, - "loss": 0.2308, - "step": 1640 - }, - { - "epoch": 0.16763181956720513, - "grad_norm": 1.390625, - "learning_rate": 4.6942062540854425e-05, - "loss": 0.2456, - "step": 1650 - }, - { - "epoch": 0.16864776998882455, - "grad_norm": 3.125, - "learning_rate": 4.69033160635402e-05, - "loss": 0.2654, - "step": 1660 - }, - { - "epoch": 0.16966372041044397, - "grad_norm": 3.984375, - "learning_rate": 4.6864341854344587e-05, - "loss": 0.2226, - "step": 1670 - }, - { - "epoch": 0.1706796708320634, - "grad_norm": 2.328125, - "learning_rate": 4.682514031848795e-05, - "loss": 0.2438, - "step": 1680 - }, - { - "epoch": 0.17169562125368282, - "grad_norm": 3.078125, - "learning_rate": 4.678571186355423e-05, - "loss": 0.1889, - "step": 1690 - }, - { - "epoch": 0.17271157167530224, - "grad_norm": 3.328125, - "learning_rate": 4.6746056899486644e-05, - "loss": 0.2117, - "step": 1700 - }, - { - "epoch": 0.17372752209692166, - "grad_norm": 2.78125, - "learning_rate": 4.67061758385835e-05, - "loss": 0.1953, - "step": 1710 - }, - { - "epoch": 0.17474347251854108, - "grad_norm": 3.09375, - "learning_rate": 4.6666069095493816e-05, - "loss": 0.1844, - "step": 1720 - }, - { - "epoch": 0.17575942294016053, - "grad_norm": 3.234375, - "learning_rate": 4.662573708721309e-05, - "loss": 0.2774, - "step": 1730 - }, - { - "epoch": 0.17677537336177995, - "grad_norm": 4.03125, - "learning_rate": 4.658518023307894e-05, - "loss": 0.2527, - "step": 1740 - }, - { - "epoch": 0.17779132378339937, - "grad_norm": 3.21875, - "learning_rate": 4.654439895476671e-05, - "loss": 0.2164, - "step": 1750 - }, - { - "epoch": 0.1788072742050188, - "grad_norm": 2.390625, - "learning_rate": 4.6503393676285146e-05, - "loss": 0.2424, - "step": 1760 - }, - { - "epoch": 0.17982322462663822, - "grad_norm": 1.8359375, - "learning_rate": 4.646216482397192e-05, - "loss": 0.2428, - "step": 1770 - }, - { - "epoch": 0.18083917504825764, - "grad_norm": 2.796875, - "learning_rate": 4.6420712826489275e-05, - "loss": 0.2155, - "step": 1780 - }, - { - "epoch": 0.18185512546987706, - "grad_norm": 0.69921875, - "learning_rate": 4.6379038114819485e-05, - "loss": 0.1544, - "step": 1790 - }, - { - "epoch": 0.18287107589149648, - "grad_norm": 3.40625, - "learning_rate": 4.6337141122260444e-05, - "loss": 0.2029, - "step": 1800 - }, - { - "epoch": 0.18388702631311593, - "grad_norm": 2.359375, - "learning_rate": 4.629502228442112e-05, - "loss": 0.1489, - "step": 1810 - }, - { - "epoch": 0.18490297673473535, - "grad_norm": 1.4453125, - "learning_rate": 4.6252682039217045e-05, - "loss": 0.2101, - "step": 1820 - }, - { - "epoch": 0.18591892715635477, - "grad_norm": 2.71875, - "learning_rate": 4.621012082686573e-05, - "loss": 0.2076, - "step": 1830 - }, - { - "epoch": 0.1869348775779742, - "grad_norm": 3.0625, - "learning_rate": 4.616733908988216e-05, - "loss": 0.2719, - "step": 1840 - }, - { - "epoch": 0.18795082799959362, - "grad_norm": 1.953125, - "learning_rate": 4.612433727307409e-05, - "loss": 0.2105, - "step": 1850 - }, - { - "epoch": 0.18896677842121304, - "grad_norm": 3.46875, - "learning_rate": 4.608111582353751e-05, - "loss": 0.1877, - "step": 1860 - }, - { - "epoch": 0.18998272884283246, - "grad_norm": 2.546875, - "learning_rate": 4.603767519065197e-05, - "loss": 0.2238, - "step": 1870 - }, - { - "epoch": 0.19099867926445188, - "grad_norm": 1.5703125, - "learning_rate": 4.599401582607589e-05, - "loss": 0.243, - "step": 1880 - }, - { - "epoch": 0.19201462968607133, - "grad_norm": 2.4375, - "learning_rate": 4.595013818374185e-05, - "loss": 0.1867, - "step": 1890 - }, - { - "epoch": 0.19303058010769075, - "grad_norm": 2.203125, - "learning_rate": 4.5906042719851925e-05, - "loss": 0.1994, - "step": 1900 - }, - { - "epoch": 0.19404653052931017, - "grad_norm": 3.984375, - "learning_rate": 4.586172989287291e-05, - "loss": 0.1899, - "step": 1910 - }, - { - "epoch": 0.1950624809509296, - "grad_norm": 2.6875, - "learning_rate": 4.5817200163531534e-05, - "loss": 0.2528, - "step": 1920 - }, - { - "epoch": 0.19607843137254902, - "grad_norm": 2.71875, - "learning_rate": 4.577245399480972e-05, - "loss": 0.2336, - "step": 1930 - }, - { - "epoch": 0.19709438179416844, - "grad_norm": 2.640625, - "learning_rate": 4.5727491851939715e-05, - "loss": 0.2204, - "step": 1940 - }, - { - "epoch": 0.19811033221578786, - "grad_norm": 1.78125, - "learning_rate": 4.568231420239929e-05, - "loss": 0.1656, - "step": 1950 - }, - { - "epoch": 0.19912628263740728, - "grad_norm": 3.15625, - "learning_rate": 4.563692151590687e-05, - "loss": 0.2105, - "step": 1960 - }, - { - "epoch": 0.20014223305902673, - "grad_norm": 1.3671875, - "learning_rate": 4.5591314264416666e-05, - "loss": 0.1464, - "step": 1970 - }, - { - "epoch": 0.20115818348064615, - "grad_norm": 4.25, - "learning_rate": 4.554549292211371e-05, - "loss": 0.2103, - "step": 1980 - }, - { - "epoch": 0.20217413390226557, - "grad_norm": 2.984375, - "learning_rate": 4.549945796540901e-05, - "loss": 0.144, - "step": 1990 - }, - { - "epoch": 0.203190084323885, - "grad_norm": 1.859375, - "learning_rate": 4.545320987293453e-05, - "loss": 0.1963, - "step": 2000 - }, - { - "epoch": 0.20420603474550442, - "grad_norm": 1.078125, - "learning_rate": 4.540674912553824e-05, - "loss": 0.2115, - "step": 2010 - }, - { - "epoch": 0.20522198516712384, - "grad_norm": 4.25, - "learning_rate": 4.536007620627911e-05, - "loss": 0.1682, - "step": 2020 - }, - { - "epoch": 0.20623793558874326, - "grad_norm": 2.71875, - "learning_rate": 4.531319160042212e-05, - "loss": 0.1992, - "step": 2030 - }, - { - "epoch": 0.20725388601036268, - "grad_norm": 1.2890625, - "learning_rate": 4.5266095795433126e-05, - "loss": 0.1134, - "step": 2040 - }, - { - "epoch": 0.20826983643198213, - "grad_norm": 3.296875, - "learning_rate": 4.5218789280973925e-05, - "loss": 0.1474, - "step": 2050 - }, - { - "epoch": 0.20928578685360155, - "grad_norm": 1.9375, - "learning_rate": 4.5171272548897024e-05, - "loss": 0.1955, - "step": 2060 - }, - { - "epoch": 0.21030173727522097, - "grad_norm": 2.734375, - "learning_rate": 4.512354609324063e-05, - "loss": 0.2042, - "step": 2070 - }, - { - "epoch": 0.2113176876968404, - "grad_norm": 2.921875, - "learning_rate": 4.507561041022347e-05, - "loss": 0.2174, - "step": 2080 - }, - { - "epoch": 0.21233363811845982, - "grad_norm": 2.40625, - "learning_rate": 4.502746599823963e-05, - "loss": 0.2634, - "step": 2090 - }, - { - "epoch": 0.21334958854007924, - "grad_norm": 1.71875, - "learning_rate": 4.497911335785339e-05, - "loss": 0.1884, - "step": 2100 - }, - { - "epoch": 0.21436553896169866, - "grad_norm": 0.79296875, - "learning_rate": 4.4930552991794e-05, - "loss": 0.1872, - "step": 2110 - }, - { - "epoch": 0.21538148938331808, - "grad_norm": 3.171875, - "learning_rate": 4.4881785404950474e-05, - "loss": 0.2233, - "step": 2120 - }, - { - "epoch": 0.21639743980493753, - "grad_norm": 2.59375, - "learning_rate": 4.483281110436631e-05, - "loss": 0.2374, - "step": 2130 - }, - { - "epoch": 0.21741339022655695, - "grad_norm": 3.328125, - "learning_rate": 4.478363059923426e-05, - "loss": 0.2545, - "step": 2140 - }, - { - "epoch": 0.21842934064817637, - "grad_norm": 2.3125, - "learning_rate": 4.4734244400891014e-05, - "loss": 0.2063, - "step": 2150 - }, - { - "epoch": 0.2194452910697958, - "grad_norm": 3.40625, - "learning_rate": 4.4684653022811865e-05, - "loss": 0.1219, - "step": 2160 - }, - { - "epoch": 0.22046124149141522, - "grad_norm": 4.1875, - "learning_rate": 4.463485698060541e-05, - "loss": 0.2805, - "step": 2170 - }, - { - "epoch": 0.22147719191303464, - "grad_norm": 2.3125, - "learning_rate": 4.458485679200814e-05, - "loss": 0.1998, - "step": 2180 - }, - { - "epoch": 0.22249314233465406, - "grad_norm": 3.578125, - "learning_rate": 4.453465297687912e-05, - "loss": 0.2489, - "step": 2190 - }, - { - "epoch": 0.22350909275627348, - "grad_norm": 2.59375, - "learning_rate": 4.448424605719452e-05, - "loss": 0.2731, - "step": 2200 - }, - { - "epoch": 0.22452504317789293, - "grad_norm": 3.28125, - "learning_rate": 4.443363655704224e-05, - "loss": 0.2425, - "step": 2210 - }, - { - "epoch": 0.22554099359951235, - "grad_norm": 2.78125, - "learning_rate": 4.438282500261641e-05, - "loss": 0.2938, - "step": 2220 - }, - { - "epoch": 0.22655694402113177, - "grad_norm": 1.1953125, - "learning_rate": 4.433181192221197e-05, - "loss": 0.1728, - "step": 2230 - }, - { - "epoch": 0.2275728944427512, - "grad_norm": 1.34375, - "learning_rate": 4.4280597846219155e-05, - "loss": 0.216, - "step": 2240 - }, - { - "epoch": 0.22858884486437062, - "grad_norm": 1.8515625, - "learning_rate": 4.422918330711796e-05, - "loss": 0.1612, - "step": 2250 - }, - { - "epoch": 0.22960479528599004, - "grad_norm": 1.90625, - "learning_rate": 4.417756883947263e-05, - "loss": 0.107, - "step": 2260 - }, - { - "epoch": 0.23062074570760946, - "grad_norm": 3.375, - "learning_rate": 4.412575497992611e-05, - "loss": 0.1756, - "step": 2270 - }, - { - "epoch": 0.23163669612922888, - "grad_norm": 4.375, - "learning_rate": 4.407374226719445e-05, - "loss": 0.234, - "step": 2280 - }, - { - "epoch": 0.23265264655084833, - "grad_norm": 3.25, - "learning_rate": 4.402153124206119e-05, - "loss": 0.2144, - "step": 2290 - }, - { - "epoch": 0.23366859697246775, - "grad_norm": 1.703125, - "learning_rate": 4.396912244737173e-05, - "loss": 0.1696, - "step": 2300 - }, - { - "epoch": 0.23468454739408717, - "grad_norm": 2.84375, - "learning_rate": 4.391651642802778e-05, - "loss": 0.2506, - "step": 2310 - }, - { - "epoch": 0.2357004978157066, - "grad_norm": 4.5, - "learning_rate": 4.386371373098155e-05, - "loss": 0.1686, - "step": 2320 - }, - { - "epoch": 0.23671644823732602, - "grad_norm": 2.515625, - "learning_rate": 4.381071490523018e-05, - "loss": 0.2403, - "step": 2330 - }, - { - "epoch": 0.23773239865894544, - "grad_norm": 4.4375, - "learning_rate": 4.3757520501809955e-05, - "loss": 0.1611, - "step": 2340 - }, - { - "epoch": 0.23874834908056486, - "grad_norm": 1.609375, - "learning_rate": 4.370413107379065e-05, - "loss": 0.1698, - "step": 2350 - }, - { - "epoch": 0.23976429950218428, - "grad_norm": 4.96875, - "learning_rate": 4.36505471762697e-05, - "loss": 0.1928, - "step": 2360 - }, - { - "epoch": 0.24078024992380373, - "grad_norm": 0.8984375, - "learning_rate": 4.3596769366366474e-05, - "loss": 0.2035, - "step": 2370 - }, - { - "epoch": 0.24179620034542315, - "grad_norm": 5.75, - "learning_rate": 4.354279820321649e-05, - "loss": 0.16, - "step": 2380 - }, - { - "epoch": 0.24281215076704257, - "grad_norm": 1.9453125, - "learning_rate": 4.34886342479656e-05, - "loss": 0.1851, - "step": 2390 - }, - { - "epoch": 0.243828101188662, - "grad_norm": 1.015625, - "learning_rate": 4.34342780637641e-05, - "loss": 0.1726, - "step": 2400 - }, - { - "epoch": 0.24484405161028142, - "grad_norm": 4.59375, - "learning_rate": 4.337973021576095e-05, - "loss": 0.2847, - "step": 2410 - }, - { - "epoch": 0.24586000203190084, - "grad_norm": 1.03125, - "learning_rate": 4.3324991271097846e-05, - "loss": 0.2528, - "step": 2420 - }, - { - "epoch": 0.24687595245352026, - "grad_norm": 2.1875, - "learning_rate": 4.3270061798903374e-05, - "loss": 0.1573, - "step": 2430 - }, - { - "epoch": 0.24789190287513968, - "grad_norm": 0.98046875, - "learning_rate": 4.321494237028701e-05, - "loss": 0.1703, - "step": 2440 - }, - { - "epoch": 0.24890785329675913, - "grad_norm": 3.8125, - "learning_rate": 4.31596335583333e-05, - "loss": 0.2613, - "step": 2450 - }, - { - "epoch": 0.24992380371837855, - "grad_norm": 4.0625, - "learning_rate": 4.310413593809579e-05, - "loss": 0.22, - "step": 2460 - }, - { - "epoch": 0.250939754139998, - "grad_norm": 3.15625, - "learning_rate": 4.304845008659108e-05, - "loss": 0.1263, - "step": 2470 - }, - { - "epoch": 0.25195570456161737, - "grad_norm": 3.046875, - "learning_rate": 4.2992576582792895e-05, - "loss": 0.1639, - "step": 2480 - }, - { - "epoch": 0.2529716549832368, - "grad_norm": 9.8125, - "learning_rate": 4.293651600762595e-05, - "loss": 0.2681, - "step": 2490 - }, - { - "epoch": 0.25398760540485626, - "grad_norm": 3.734375, - "learning_rate": 4.288026894395999e-05, - "loss": 0.2292, - "step": 2500 - }, - { - "epoch": 0.25500355582647566, - "grad_norm": 0.455078125, - "learning_rate": 4.2823835976603723e-05, - "loss": 0.2324, - "step": 2510 - }, - { - "epoch": 0.2560195062480951, - "grad_norm": 5.625, - "learning_rate": 4.276721769229869e-05, - "loss": 0.1834, - "step": 2520 - }, - { - "epoch": 0.2570354566697145, - "grad_norm": 1.3671875, - "learning_rate": 4.271041467971323e-05, - "loss": 0.1826, - "step": 2530 - }, - { - "epoch": 0.25805140709133395, - "grad_norm": 5.0625, - "learning_rate": 4.265342752943632e-05, - "loss": 0.2463, - "step": 2540 - }, - { - "epoch": 0.25906735751295334, - "grad_norm": 2.859375, - "learning_rate": 4.2596256833971425e-05, - "loss": 0.2598, - "step": 2550 - }, - { - "epoch": 0.2600833079345728, - "grad_norm": 1.8515625, - "learning_rate": 4.2538903187730374e-05, - "loss": 0.1148, - "step": 2560 - }, - { - "epoch": 0.26109925835619224, - "grad_norm": 2.71875, - "learning_rate": 4.248136718702716e-05, - "loss": 0.2123, - "step": 2570 - }, - { - "epoch": 0.26211520877781164, - "grad_norm": 4.5625, - "learning_rate": 4.242364943007172e-05, - "loss": 0.2369, - "step": 2580 - }, - { - "epoch": 0.2631311591994311, - "grad_norm": 2.296875, - "learning_rate": 4.236575051696377e-05, - "loss": 0.261, - "step": 2590 - }, - { - "epoch": 0.2641471096210505, - "grad_norm": 2.75, - "learning_rate": 4.2307671049686514e-05, - "loss": 0.1564, - "step": 2600 - }, - { - "epoch": 0.26516306004266993, - "grad_norm": 3.5, - "learning_rate": 4.2249411632100396e-05, - "loss": 0.1563, - "step": 2610 - }, - { - "epoch": 0.2661790104642893, - "grad_norm": 2.84375, - "learning_rate": 4.219097286993684e-05, - "loss": 0.1697, - "step": 2620 - }, - { - "epoch": 0.26719496088590877, - "grad_norm": 2.125, - "learning_rate": 4.2132355370791946e-05, - "loss": 0.1844, - "step": 2630 - }, - { - "epoch": 0.26821091130752817, - "grad_norm": 4.03125, - "learning_rate": 4.2073559744120156e-05, - "loss": 0.2144, - "step": 2640 - }, - { - "epoch": 0.2692268617291476, - "grad_norm": 2.375, - "learning_rate": 4.201458660122793e-05, - "loss": 0.2013, - "step": 2650 - }, - { - "epoch": 0.27024281215076706, - "grad_norm": 3.625, - "learning_rate": 4.1955436555267393e-05, - "loss": 0.2166, - "step": 2660 - }, - { - "epoch": 0.27125876257238646, - "grad_norm": 0.328125, - "learning_rate": 4.189611022122997e-05, - "loss": 0.1934, - "step": 2670 - }, - { - "epoch": 0.2722747129940059, - "grad_norm": 2.75, - "learning_rate": 4.1836608215939944e-05, - "loss": 0.2157, - "step": 2680 - }, - { - "epoch": 0.2732906634156253, - "grad_norm": 3.5, - "learning_rate": 4.17769311580481e-05, - "loss": 0.18, - "step": 2690 - }, - { - "epoch": 0.27430661383724475, - "grad_norm": 2.109375, - "learning_rate": 4.171707966802528e-05, - "loss": 0.2178, - "step": 2700 - }, - { - "epoch": 0.27532256425886414, - "grad_norm": 4.65625, - "learning_rate": 4.16570543681559e-05, - "loss": 0.1896, - "step": 2710 - }, - { - "epoch": 0.2763385146804836, - "grad_norm": 4.8125, - "learning_rate": 4.159685588253151e-05, - "loss": 0.1322, - "step": 2720 - }, - { - "epoch": 0.27735446510210304, - "grad_norm": 3.9375, - "learning_rate": 4.153648483704429e-05, - "loss": 0.184, - "step": 2730 - }, - { - "epoch": 0.27837041552372244, - "grad_norm": 4.53125, - "learning_rate": 4.147594185938057e-05, - "loss": 0.2451, - "step": 2740 - }, - { - "epoch": 0.2793863659453419, - "grad_norm": 1.0390625, - "learning_rate": 4.141522757901426e-05, - "loss": 0.2367, - "step": 2750 - }, - { - "epoch": 0.2804023163669613, - "grad_norm": 3.375, - "learning_rate": 4.1354342627200345e-05, - "loss": 0.179, - "step": 2760 - }, - { - "epoch": 0.28141826678858073, - "grad_norm": 2.953125, - "learning_rate": 4.1293287636968286e-05, - "loss": 0.1396, - "step": 2770 - }, - { - "epoch": 0.2824342172102001, - "grad_norm": 2.546875, - "learning_rate": 4.1232063243115485e-05, - "loss": 0.1963, - "step": 2780 - }, - { - "epoch": 0.28345016763181957, - "grad_norm": 5.09375, - "learning_rate": 4.117067008220063e-05, - "loss": 0.2457, - "step": 2790 - }, - { - "epoch": 0.28446611805343897, - "grad_norm": 2.046875, - "learning_rate": 4.110910879253712e-05, - "loss": 0.2262, - "step": 2800 - }, - { - "epoch": 0.2854820684750584, - "grad_norm": 2.1875, - "learning_rate": 4.104738001418641e-05, - "loss": 0.2499, - "step": 2810 - }, - { - "epoch": 0.28649801889667786, - "grad_norm": 2.59375, - "learning_rate": 4.098548438895135e-05, - "loss": 0.1667, - "step": 2820 - }, - { - "epoch": 0.28751396931829726, - "grad_norm": 2.875, - "learning_rate": 4.092342256036954e-05, - "loss": 0.2288, - "step": 2830 - }, - { - "epoch": 0.2885299197399167, - "grad_norm": 3.015625, - "learning_rate": 4.086119517370659e-05, - "loss": 0.2038, - "step": 2840 - }, - { - "epoch": 0.2895458701615361, - "grad_norm": 3.53125, - "learning_rate": 4.0798802875949485e-05, - "loss": 0.181, - "step": 2850 - }, - { - "epoch": 0.29056182058315555, - "grad_norm": 2.296875, - "learning_rate": 4.073624631579975e-05, - "loss": 0.1886, - "step": 2860 - }, - { - "epoch": 0.29157777100477494, - "grad_norm": 3.609375, - "learning_rate": 4.067352614366685e-05, - "loss": 0.2053, - "step": 2870 - }, - { - "epoch": 0.2925937214263944, - "grad_norm": 2.328125, - "learning_rate": 4.061064301166128e-05, - "loss": 0.1409, - "step": 2880 - }, - { - "epoch": 0.29360967184801384, - "grad_norm": 4.9375, - "learning_rate": 4.054759757358787e-05, - "loss": 0.184, - "step": 2890 - }, - { - "epoch": 0.29462562226963324, - "grad_norm": 4.6875, - "learning_rate": 4.048439048493898e-05, - "loss": 0.2306, - "step": 2900 - }, - { - "epoch": 0.2956415726912527, - "grad_norm": 4.09375, - "learning_rate": 4.0421022402887676e-05, - "loss": 0.1914, - "step": 2910 - }, - { - "epoch": 0.2966575231128721, - "grad_norm": 2.3125, - "learning_rate": 4.035749398628088e-05, - "loss": 0.1653, - "step": 2920 - }, - { - "epoch": 0.29767347353449153, - "grad_norm": 2.515625, - "learning_rate": 4.029380589563256e-05, - "loss": 0.1941, - "step": 2930 - }, - { - "epoch": 0.2986894239561109, - "grad_norm": 1.78125, - "learning_rate": 4.02299587931168e-05, - "loss": 0.1117, - "step": 2940 - }, - { - "epoch": 0.29970537437773037, - "grad_norm": 0.8359375, - "learning_rate": 4.0165953342560974e-05, - "loss": 0.1605, - "step": 2950 - }, - { - "epoch": 0.30072132479934977, - "grad_norm": 3.046875, - "learning_rate": 4.010179020943884e-05, - "loss": 0.1726, - "step": 2960 - }, - { - "epoch": 0.3017372752209692, - "grad_norm": 3.453125, - "learning_rate": 4.003747006086357e-05, - "loss": 0.2208, - "step": 2970 - }, - { - "epoch": 0.30275322564258866, - "grad_norm": 2.515625, - "learning_rate": 3.9972993565580866e-05, - "loss": 0.1325, - "step": 2980 - }, - { - "epoch": 0.30376917606420806, - "grad_norm": 3.046875, - "learning_rate": 3.9908361393962e-05, - "loss": 0.2014, - "step": 2990 - }, - { - "epoch": 0.3047851264858275, - "grad_norm": 2.28125, - "learning_rate": 3.984357421799681e-05, - "loss": 0.165, - "step": 3000 - } - ], - "logging_steps": 10, - "max_steps": 9843, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "total_flos": 0.0, - "train_batch_size": 7, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoints/checkpoint-3000/training_args.bin b/checkpoints/checkpoint-3000/training_args.bin deleted file mode 100644 index 6bb0a4ec10ab29ca1942aecdfa4212d352e373f1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-3000/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 -size 4984 diff --git a/checkpoints/checkpoint-3500/adapter_config.json b/checkpoints/checkpoint-3500/adapter_config.json deleted file mode 100644 index 2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-3500/adapter_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "alpha_pattern": {}, - "auto_mapping": null, - "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": "gaussian", - "layer_replication": null, - "layers_pattern": null, - "layers_to_transform": null, - "loftq_config": {}, - "lora_alpha": 32, - "lora_dropout": 0.1, - "megatron_config": null, - "megatron_core": "megatron.core", - "modules_to_save": null, - "peft_type": "LORA", - "r": 8, - "rank_pattern": {}, - "revision": null, - "target_modules": [ - "v_proj", - "q_proj" - ], - "task_type": "CAUSAL_LM", - "use_dora": false, - "use_rslora": false -} \ No newline at end of file diff --git a/checkpoints/checkpoint-3500/adapter_model.safetensors b/checkpoints/checkpoint-3500/adapter_model.safetensors deleted file mode 100644 index def26c035ec01dbe6399e1deda89633a3951ba73..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-3500/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cbf46c695f24f5160623672ddb7b6e3230a79b1c62b8448e3dedb83005ca9935 -size 5919456 diff --git a/checkpoints/checkpoint-3500/optimizer.pt b/checkpoints/checkpoint-3500/optimizer.pt deleted file mode 100644 index 5c107db7039ba7af8dcfa32fd7aa7a8fbd21c74f..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-3500/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:22f791945b5138ee83b222ac582e4f208ff52e423ba51084873ae89d31ac1845 -size 11930938 diff --git a/checkpoints/checkpoint-3500/rng_state_0.pth b/checkpoints/checkpoint-3500/rng_state_0.pth deleted file mode 100644 index 6e2e5670f0b559873d46a22a975469d24350dae5..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-3500/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:937c0b4f9e0770f6eca6c4cdbab48674491808073f9a0008242c7747205b4b0a -size 15024 diff --git a/checkpoints/checkpoint-3500/rng_state_1.pth b/checkpoints/checkpoint-3500/rng_state_1.pth deleted file mode 100644 index 7eb9d5104e7e96e55a0cd774e3249194f48334fc..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-3500/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:db86a3a2768366792f057841f721820dff6815adc16d464af7828df941223c01 -size 15024 diff --git a/checkpoints/checkpoint-3500/rng_state_2.pth b/checkpoints/checkpoint-3500/rng_state_2.pth deleted file mode 100644 index fda380880dfe7ab57f96a6a7914e3ae9c2b57485..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-3500/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6fc2e3f389bdedf3f7c940ce7cfd74a54d5ef411c9271a0b570f0f8078360d61 -size 15024 diff --git a/checkpoints/checkpoint-3500/rng_state_3.pth b/checkpoints/checkpoint-3500/rng_state_3.pth deleted file mode 100644 index 8422b115f583e37a495ec690fab1f7a2441ec0a8..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-3500/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:700a170dc517eaa370c876455bedd04e0b25e7140ee505e562880ac0c268e199 -size 15024 diff --git a/checkpoints/checkpoint-3500/scheduler.pt b/checkpoints/checkpoint-3500/scheduler.pt deleted file mode 100644 index 4c79541e6a9cb3467a5d3ceaa6d2065ec4525bfb..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-3500/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:586f3b2ba8da5de2fa8f36edc9313f3dbd8056e41828a6210b6c8e0807689cc2 -size 1064 diff --git a/checkpoints/checkpoint-3500/trainer_state.json b/checkpoints/checkpoint-3500/trainer_state.json deleted file mode 100644 index d8f21bb043f14ed3122d67861c839124c000b40f..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-3500/trainer_state.json +++ /dev/null @@ -1,2471 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.35558264756679875, - "eval_steps": 500, - "global_step": 3500, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001015950421619425, - "grad_norm": 17.625, - "learning_rate": 5e-06, - "loss": 3.4264, - "step": 10 - }, - { - "epoch": 0.00203190084323885, - "grad_norm": 12.5625, - "learning_rate": 1e-05, - "loss": 3.432, - "step": 20 - }, - { - "epoch": 0.003047851264858275, - "grad_norm": 14.0625, - "learning_rate": 1.5e-05, - "loss": 3.23, - "step": 30 - }, - { - "epoch": 0.0040638016864777, - "grad_norm": 12.4375, - "learning_rate": 2e-05, - "loss": 2.9762, - "step": 40 - }, - { - "epoch": 0.005079752108097125, - "grad_norm": 10.0625, - "learning_rate": 2.5e-05, - "loss": 2.6173, - "step": 50 - }, - { - "epoch": 0.00609570252971655, - "grad_norm": 10.1875, - "learning_rate": 3e-05, - "loss": 2.2004, - "step": 60 - }, - { - "epoch": 0.007111652951335975, - "grad_norm": 7.03125, - "learning_rate": 3.5e-05, - "loss": 1.4176, - "step": 70 - }, - { - "epoch": 0.0081276033729554, - "grad_norm": 4.375, - "learning_rate": 4e-05, - "loss": 1.0122, - "step": 80 - }, - { - "epoch": 0.009143553794574825, - "grad_norm": 6.5625, - "learning_rate": 4.5e-05, - "loss": 0.9116, - "step": 90 - }, - { - "epoch": 0.01015950421619425, - "grad_norm": 5.28125, - "learning_rate": 5e-05, - "loss": 0.6832, - "step": 100 - }, - { - "epoch": 0.011175454637813675, - "grad_norm": 5.5, - "learning_rate": 4.9999870035728426e-05, - "loss": 0.7355, - "step": 110 - }, - { - "epoch": 0.0121914050594331, - "grad_norm": 5.1875, - "learning_rate": 4.9999480144264944e-05, - "loss": 0.6673, - "step": 120 - }, - { - "epoch": 0.013207355481052525, - "grad_norm": 4.5, - "learning_rate": 4.9998830329663314e-05, - "loss": 0.6792, - "step": 130 - }, - { - "epoch": 0.01422330590267195, - "grad_norm": 3.9375, - "learning_rate": 4.9997920598679756e-05, - "loss": 0.6207, - "step": 140 - }, - { - "epoch": 0.015239256324291375, - "grad_norm": 3.15625, - "learning_rate": 4.999675096077286e-05, - "loss": 0.483, - "step": 150 - }, - { - "epoch": 0.0162552067459108, - "grad_norm": 5.28125, - "learning_rate": 4.999532142810354e-05, - "loss": 0.5319, - "step": 160 - }, - { - "epoch": 0.017271157167530225, - "grad_norm": 4.59375, - "learning_rate": 4.999363201553483e-05, - "loss": 0.6052, - "step": 170 - }, - { - "epoch": 0.01828710758914965, - "grad_norm": 5.03125, - "learning_rate": 4.9991682740631794e-05, - "loss": 0.4258, - "step": 180 - }, - { - "epoch": 0.019303058010769075, - "grad_norm": 3.859375, - "learning_rate": 4.998947362366133e-05, - "loss": 0.4309, - "step": 190 - }, - { - "epoch": 0.0203190084323885, - "grad_norm": 3.328125, - "learning_rate": 4.998700468759193e-05, - "loss": 0.3957, - "step": 200 - }, - { - "epoch": 0.021334958854007924, - "grad_norm": 4.9375, - "learning_rate": 4.9984275958093475e-05, - "loss": 0.4777, - "step": 210 - }, - { - "epoch": 0.02235090927562735, - "grad_norm": 4.78125, - "learning_rate": 4.998128746353695e-05, - "loss": 0.3549, - "step": 220 - }, - { - "epoch": 0.023366859697246774, - "grad_norm": 4.0625, - "learning_rate": 4.997803923499417e-05, - "loss": 0.4447, - "step": 230 - }, - { - "epoch": 0.0243828101188662, - "grad_norm": 6.375, - "learning_rate": 4.99745313062374e-05, - "loss": 0.3808, - "step": 240 - }, - { - "epoch": 0.025398760540485624, - "grad_norm": 3.59375, - "learning_rate": 4.99707637137391e-05, - "loss": 0.3827, - "step": 250 - }, - { - "epoch": 0.02641471096210505, - "grad_norm": 3.015625, - "learning_rate": 4.996673649667145e-05, - "loss": 0.3694, - "step": 260 - }, - { - "epoch": 0.027430661383724474, - "grad_norm": 2.296875, - "learning_rate": 4.9962449696906e-05, - "loss": 0.3586, - "step": 270 - }, - { - "epoch": 0.0284466118053439, - "grad_norm": 4.125, - "learning_rate": 4.9957903359013214e-05, - "loss": 0.3832, - "step": 280 - }, - { - "epoch": 0.029462562226963324, - "grad_norm": 3.296875, - "learning_rate": 4.995309753026201e-05, - "loss": 0.328, - "step": 290 - }, - { - "epoch": 0.03047851264858275, - "grad_norm": 4.5, - "learning_rate": 4.994803226061927e-05, - "loss": 0.3667, - "step": 300 - }, - { - "epoch": 0.03149446307020217, - "grad_norm": 4.3125, - "learning_rate": 4.994270760274933e-05, - "loss": 0.3811, - "step": 310 - }, - { - "epoch": 0.0325104134918216, - "grad_norm": 3.421875, - "learning_rate": 4.99371236120134e-05, - "loss": 0.3065, - "step": 320 - }, - { - "epoch": 0.03352636391344102, - "grad_norm": 4.6875, - "learning_rate": 4.993128034646902e-05, - "loss": 0.4177, - "step": 330 - }, - { - "epoch": 0.03454231433506045, - "grad_norm": 3.046875, - "learning_rate": 4.992517786686947e-05, - "loss": 0.33, - "step": 340 - }, - { - "epoch": 0.03555826475667987, - "grad_norm": 1.8828125, - "learning_rate": 4.9918816236663077e-05, - "loss": 0.3287, - "step": 350 - }, - { - "epoch": 0.0365742151782993, - "grad_norm": 3.8125, - "learning_rate": 4.991219552199262e-05, - "loss": 0.2934, - "step": 360 - }, - { - "epoch": 0.03759016559991872, - "grad_norm": 4.28125, - "learning_rate": 4.99053157916946e-05, - "loss": 0.3176, - "step": 370 - }, - { - "epoch": 0.03860611602153815, - "grad_norm": 2.609375, - "learning_rate": 4.989817711729856e-05, - "loss": 0.3318, - "step": 380 - }, - { - "epoch": 0.03962206644315757, - "grad_norm": 2.375, - "learning_rate": 4.98907795730263e-05, - "loss": 0.3234, - "step": 390 - }, - { - "epoch": 0.040638016864777, - "grad_norm": 4.46875, - "learning_rate": 4.988312323579114e-05, - "loss": 0.267, - "step": 400 - }, - { - "epoch": 0.04165396728639642, - "grad_norm": 3.75, - "learning_rate": 4.98752081851971e-05, - "loss": 0.3081, - "step": 410 - }, - { - "epoch": 0.04266991770801585, - "grad_norm": 2.203125, - "learning_rate": 4.986703450353809e-05, - "loss": 0.2917, - "step": 420 - }, - { - "epoch": 0.04368586812963527, - "grad_norm": 1.6015625, - "learning_rate": 4.985860227579703e-05, - "loss": 0.2805, - "step": 430 - }, - { - "epoch": 0.0447018185512547, - "grad_norm": 3.140625, - "learning_rate": 4.984991158964499e-05, - "loss": 0.3534, - "step": 440 - }, - { - "epoch": 0.04571776897287412, - "grad_norm": 3.296875, - "learning_rate": 4.9840962535440265e-05, - "loss": 0.335, - "step": 450 - }, - { - "epoch": 0.04673371939449355, - "grad_norm": 3.25, - "learning_rate": 4.983175520622744e-05, - "loss": 0.2544, - "step": 460 - }, - { - "epoch": 0.04774966981611297, - "grad_norm": 2.25, - "learning_rate": 4.982228969773642e-05, - "loss": 0.3449, - "step": 470 - }, - { - "epoch": 0.0487656202377324, - "grad_norm": 4.9375, - "learning_rate": 4.9812566108381435e-05, - "loss": 0.2964, - "step": 480 - }, - { - "epoch": 0.04978157065935182, - "grad_norm": 1.5703125, - "learning_rate": 4.9802584539260035e-05, - "loss": 0.2799, - "step": 490 - }, - { - "epoch": 0.05079752108097125, - "grad_norm": 2.828125, - "learning_rate": 4.979234509415199e-05, - "loss": 0.3231, - "step": 500 - }, - { - "epoch": 0.05181347150259067, - "grad_norm": 2.9375, - "learning_rate": 4.978184787951828e-05, - "loss": 0.2943, - "step": 510 - }, - { - "epoch": 0.0528294219242101, - "grad_norm": 2.34375, - "learning_rate": 4.977109300449992e-05, - "loss": 0.2705, - "step": 520 - }, - { - "epoch": 0.05384537234582952, - "grad_norm": 3.140625, - "learning_rate": 4.9760080580916876e-05, - "loss": 0.2998, - "step": 530 - }, - { - "epoch": 0.05486132276744895, - "grad_norm": 3.5625, - "learning_rate": 4.974881072326688e-05, - "loss": 0.2595, - "step": 540 - }, - { - "epoch": 0.05587727318906837, - "grad_norm": 4.25, - "learning_rate": 4.9737283548724236e-05, - "loss": 0.2803, - "step": 550 - }, - { - "epoch": 0.0568932236106878, - "grad_norm": 4.0625, - "learning_rate": 4.97254991771386e-05, - "loss": 0.3511, - "step": 560 - }, - { - "epoch": 0.05790917403230722, - "grad_norm": 2.515625, - "learning_rate": 4.971345773103377e-05, - "loss": 0.312, - "step": 570 - }, - { - "epoch": 0.05892512445392665, - "grad_norm": 3.21875, - "learning_rate": 4.9701159335606365e-05, - "loss": 0.2482, - "step": 580 - }, - { - "epoch": 0.05994107487554607, - "grad_norm": 5.5, - "learning_rate": 4.968860411872454e-05, - "loss": 0.2537, - "step": 590 - }, - { - "epoch": 0.0609570252971655, - "grad_norm": 3.546875, - "learning_rate": 4.967579221092666e-05, - "loss": 0.3125, - "step": 600 - }, - { - "epoch": 0.06197297571878492, - "grad_norm": 2.984375, - "learning_rate": 4.966272374541996e-05, - "loss": 0.2354, - "step": 610 - }, - { - "epoch": 0.06298892614040434, - "grad_norm": 3.6875, - "learning_rate": 4.964939885807912e-05, - "loss": 0.3213, - "step": 620 - }, - { - "epoch": 0.06400487656202378, - "grad_norm": 2.140625, - "learning_rate": 4.9635817687444876e-05, - "loss": 0.3003, - "step": 630 - }, - { - "epoch": 0.0650208269836432, - "grad_norm": 3.484375, - "learning_rate": 4.962198037472259e-05, - "loss": 0.2996, - "step": 640 - }, - { - "epoch": 0.06603677740526262, - "grad_norm": 3.21875, - "learning_rate": 4.9607887063780776e-05, - "loss": 0.2257, - "step": 650 - }, - { - "epoch": 0.06705272782688204, - "grad_norm": 5.375, - "learning_rate": 4.9593537901149564e-05, - "loss": 0.223, - "step": 660 - }, - { - "epoch": 0.06806867824850148, - "grad_norm": 4.1875, - "learning_rate": 4.957893303601924e-05, - "loss": 0.3407, - "step": 670 - }, - { - "epoch": 0.0690846286701209, - "grad_norm": 3.328125, - "learning_rate": 4.956407262023866e-05, - "loss": 0.2589, - "step": 680 - }, - { - "epoch": 0.07010057909174032, - "grad_norm": 2.953125, - "learning_rate": 4.954895680831367e-05, - "loss": 0.2949, - "step": 690 - }, - { - "epoch": 0.07111652951335974, - "grad_norm": 4.0625, - "learning_rate": 4.9533585757405506e-05, - "loss": 0.2995, - "step": 700 - }, - { - "epoch": 0.07213247993497918, - "grad_norm": 4.625, - "learning_rate": 4.951795962732917e-05, - "loss": 0.2894, - "step": 710 - }, - { - "epoch": 0.0731484303565986, - "grad_norm": 3.0, - "learning_rate": 4.9502078580551755e-05, - "loss": 0.3082, - "step": 720 - }, - { - "epoch": 0.07416438077821802, - "grad_norm": 3.65625, - "learning_rate": 4.9485942782190734e-05, - "loss": 0.2308, - "step": 730 - }, - { - "epoch": 0.07518033119983744, - "grad_norm": 4.78125, - "learning_rate": 4.9469552400012306e-05, - "loss": 0.2272, - "step": 740 - }, - { - "epoch": 0.07619628162145688, - "grad_norm": 4.25, - "learning_rate": 4.94529076044296e-05, - "loss": 0.2701, - "step": 750 - }, - { - "epoch": 0.0772122320430763, - "grad_norm": 3.140625, - "learning_rate": 4.94360085685009e-05, - "loss": 0.2686, - "step": 760 - }, - { - "epoch": 0.07822818246469572, - "grad_norm": 0.765625, - "learning_rate": 4.9418855467927894e-05, - "loss": 0.2051, - "step": 770 - }, - { - "epoch": 0.07924413288631514, - "grad_norm": 1.796875, - "learning_rate": 4.940144848105379e-05, - "loss": 0.2267, - "step": 780 - }, - { - "epoch": 0.08026008330793458, - "grad_norm": 4.5625, - "learning_rate": 4.93837877888615e-05, - "loss": 0.2597, - "step": 790 - }, - { - "epoch": 0.081276033729554, - "grad_norm": 3.03125, - "learning_rate": 4.9365873574971745e-05, - "loss": 0.3701, - "step": 800 - }, - { - "epoch": 0.08229198415117342, - "grad_norm": 4.5625, - "learning_rate": 4.9347706025641136e-05, - "loss": 0.2559, - "step": 810 - }, - { - "epoch": 0.08330793457279284, - "grad_norm": 3.90625, - "learning_rate": 4.9329285329760275e-05, - "loss": 0.2799, - "step": 820 - }, - { - "epoch": 0.08432388499441228, - "grad_norm": 3.140625, - "learning_rate": 4.9310611678851735e-05, - "loss": 0.2866, - "step": 830 - }, - { - "epoch": 0.0853398354160317, - "grad_norm": 2.46875, - "learning_rate": 4.929168526706811e-05, - "loss": 0.3105, - "step": 840 - }, - { - "epoch": 0.08635578583765112, - "grad_norm": 13.625, - "learning_rate": 4.927250629119e-05, - "loss": 0.2454, - "step": 850 - }, - { - "epoch": 0.08737173625927054, - "grad_norm": 3.921875, - "learning_rate": 4.9253074950623925e-05, - "loss": 0.2424, - "step": 860 - }, - { - "epoch": 0.08838768668088998, - "grad_norm": 2.90625, - "learning_rate": 4.9233391447400286e-05, - "loss": 0.2481, - "step": 870 - }, - { - "epoch": 0.0894036371025094, - "grad_norm": 2.96875, - "learning_rate": 4.921345598617125e-05, - "loss": 0.2231, - "step": 880 - }, - { - "epoch": 0.09041958752412882, - "grad_norm": 5.375, - "learning_rate": 4.9193268774208654e-05, - "loss": 0.3447, - "step": 890 - }, - { - "epoch": 0.09143553794574824, - "grad_norm": 2.0, - "learning_rate": 4.9172830021401785e-05, - "loss": 0.229, - "step": 900 - }, - { - "epoch": 0.09245148836736768, - "grad_norm": 3.1875, - "learning_rate": 4.9152139940255245e-05, - "loss": 0.2122, - "step": 910 - }, - { - "epoch": 0.0934674387889871, - "grad_norm": 3.40625, - "learning_rate": 4.913119874588677e-05, - "loss": 0.2386, - "step": 920 - }, - { - "epoch": 0.09448338921060652, - "grad_norm": 1.4609375, - "learning_rate": 4.911000665602489e-05, - "loss": 0.1944, - "step": 930 - }, - { - "epoch": 0.09549933963222594, - "grad_norm": 5.0625, - "learning_rate": 4.9088563891006786e-05, - "loss": 0.2038, - "step": 940 - }, - { - "epoch": 0.09651529005384538, - "grad_norm": 4.53125, - "learning_rate": 4.906687067377592e-05, - "loss": 0.3122, - "step": 950 - }, - { - "epoch": 0.0975312404754648, - "grad_norm": 2.84375, - "learning_rate": 4.904492722987976e-05, - "loss": 0.3157, - "step": 960 - }, - { - "epoch": 0.09854719089708422, - "grad_norm": 2.171875, - "learning_rate": 4.902273378746738e-05, - "loss": 0.3077, - "step": 970 - }, - { - "epoch": 0.09956314131870364, - "grad_norm": 2.84375, - "learning_rate": 4.9000290577287165e-05, - "loss": 0.2756, - "step": 980 - }, - { - "epoch": 0.10057909174032308, - "grad_norm": 0.99609375, - "learning_rate": 4.897759783268434e-05, - "loss": 0.2915, - "step": 990 - }, - { - "epoch": 0.1015950421619425, - "grad_norm": 3.53125, - "learning_rate": 4.895465578959859e-05, - "loss": 0.2052, - "step": 1000 - }, - { - "epoch": 0.10261099258356192, - "grad_norm": 4.0, - "learning_rate": 4.893146468656159e-05, - "loss": 0.2499, - "step": 1010 - }, - { - "epoch": 0.10362694300518134, - "grad_norm": 1.65625, - "learning_rate": 4.890802476469452e-05, - "loss": 0.278, - "step": 1020 - }, - { - "epoch": 0.10464289342680078, - "grad_norm": 3.625, - "learning_rate": 4.888433626770558e-05, - "loss": 0.2143, - "step": 1030 - }, - { - "epoch": 0.1056588438484202, - "grad_norm": 5.0625, - "learning_rate": 4.886039944188741e-05, - "loss": 0.2878, - "step": 1040 - }, - { - "epoch": 0.10667479427003962, - "grad_norm": 4.5, - "learning_rate": 4.883621453611461e-05, - "loss": 0.2744, - "step": 1050 - }, - { - "epoch": 0.10769074469165904, - "grad_norm": 4.5625, - "learning_rate": 4.881178180184106e-05, - "loss": 0.2734, - "step": 1060 - }, - { - "epoch": 0.10870669511327848, - "grad_norm": 3.125, - "learning_rate": 4.878710149309735e-05, - "loss": 0.3574, - "step": 1070 - }, - { - "epoch": 0.1097226455348979, - "grad_norm": 3.0625, - "learning_rate": 4.876217386648816e-05, - "loss": 0.2625, - "step": 1080 - }, - { - "epoch": 0.11073859595651732, - "grad_norm": 4.0625, - "learning_rate": 4.873699918118955e-05, - "loss": 0.2437, - "step": 1090 - }, - { - "epoch": 0.11175454637813674, - "grad_norm": 1.59375, - "learning_rate": 4.87115776989463e-05, - "loss": 0.2051, - "step": 1100 - }, - { - "epoch": 0.11277049679975618, - "grad_norm": 4.375, - "learning_rate": 4.8685909684069153e-05, - "loss": 0.1727, - "step": 1110 - }, - { - "epoch": 0.1137864472213756, - "grad_norm": 2.28125, - "learning_rate": 4.865999540343211e-05, - "loss": 0.2256, - "step": 1120 - }, - { - "epoch": 0.11480239764299502, - "grad_norm": 2.265625, - "learning_rate": 4.86338351264696e-05, - "loss": 0.3529, - "step": 1130 - }, - { - "epoch": 0.11581834806461444, - "grad_norm": 2.34375, - "learning_rate": 4.8607429125173754e-05, - "loss": 0.2113, - "step": 1140 - }, - { - "epoch": 0.11683429848623388, - "grad_norm": 0.7578125, - "learning_rate": 4.858077767409149e-05, - "loss": 0.2759, - "step": 1150 - }, - { - "epoch": 0.1178502489078533, - "grad_norm": 3.640625, - "learning_rate": 4.855388105032174e-05, - "loss": 0.2482, - "step": 1160 - }, - { - "epoch": 0.11886619932947272, - "grad_norm": 3.5, - "learning_rate": 4.852673953351249e-05, - "loss": 0.1865, - "step": 1170 - }, - { - "epoch": 0.11988214975109214, - "grad_norm": 3.75, - "learning_rate": 4.849935340585796e-05, - "loss": 0.2659, - "step": 1180 - }, - { - "epoch": 0.12089810017271158, - "grad_norm": 3.375, - "learning_rate": 4.8471722952095586e-05, - "loss": 0.1506, - "step": 1190 - }, - { - "epoch": 0.121914050594331, - "grad_norm": 3.34375, - "learning_rate": 4.844384845950312e-05, - "loss": 0.307, - "step": 1200 - }, - { - "epoch": 0.12293000101595042, - "grad_norm": 1.578125, - "learning_rate": 4.841573021789561e-05, - "loss": 0.1952, - "step": 1210 - }, - { - "epoch": 0.12394595143756984, - "grad_norm": 1.2890625, - "learning_rate": 4.838736851962239e-05, - "loss": 0.1779, - "step": 1220 - }, - { - "epoch": 0.12496190185918928, - "grad_norm": 1.265625, - "learning_rate": 4.835876365956408e-05, - "loss": 0.1235, - "step": 1230 - }, - { - "epoch": 0.12597785228080868, - "grad_norm": 1.9609375, - "learning_rate": 4.8329915935129436e-05, - "loss": 0.1876, - "step": 1240 - }, - { - "epoch": 0.12699380270242813, - "grad_norm": 1.6328125, - "learning_rate": 4.830082564625235e-05, - "loss": 0.2188, - "step": 1250 - }, - { - "epoch": 0.12800975312404755, - "grad_norm": 3.96875, - "learning_rate": 4.8271493095388684e-05, - "loss": 0.2622, - "step": 1260 - }, - { - "epoch": 0.12902570354566698, - "grad_norm": 3.765625, - "learning_rate": 4.824191858751312e-05, - "loss": 0.2724, - "step": 1270 - }, - { - "epoch": 0.1300416539672864, - "grad_norm": 5.59375, - "learning_rate": 4.821210243011601e-05, - "loss": 0.2413, - "step": 1280 - }, - { - "epoch": 0.13105760438890582, - "grad_norm": 3.34375, - "learning_rate": 4.818204493320016e-05, - "loss": 0.2618, - "step": 1290 - }, - { - "epoch": 0.13207355481052524, - "grad_norm": 2.78125, - "learning_rate": 4.8151746409277634e-05, - "loss": 0.2295, - "step": 1300 - }, - { - "epoch": 0.13308950523214466, - "grad_norm": 3.1875, - "learning_rate": 4.8121207173366484e-05, - "loss": 0.2733, - "step": 1310 - }, - { - "epoch": 0.13410545565376408, - "grad_norm": 2.28125, - "learning_rate": 4.809042754298746e-05, - "loss": 0.2311, - "step": 1320 - }, - { - "epoch": 0.13512140607538353, - "grad_norm": 2.171875, - "learning_rate": 4.805940783816075e-05, - "loss": 0.2059, - "step": 1330 - }, - { - "epoch": 0.13613735649700295, - "grad_norm": 2.796875, - "learning_rate": 4.8028148381402625e-05, - "loss": 0.2102, - "step": 1340 - }, - { - "epoch": 0.13715330691862238, - "grad_norm": 2.96875, - "learning_rate": 4.7996649497722084e-05, - "loss": 0.2708, - "step": 1350 - }, - { - "epoch": 0.1381692573402418, - "grad_norm": 2.4375, - "learning_rate": 4.7964911514617485e-05, - "loss": 0.2429, - "step": 1360 - }, - { - "epoch": 0.13918520776186122, - "grad_norm": 5.8125, - "learning_rate": 4.793293476207312e-05, - "loss": 0.2725, - "step": 1370 - }, - { - "epoch": 0.14020115818348064, - "grad_norm": 2.40625, - "learning_rate": 4.790071957255585e-05, - "loss": 0.2098, - "step": 1380 - }, - { - "epoch": 0.14121710860510006, - "grad_norm": 4.25, - "learning_rate": 4.786826628101154e-05, - "loss": 0.2101, - "step": 1390 - }, - { - "epoch": 0.14223305902671948, - "grad_norm": 2.578125, - "learning_rate": 4.783557522486167e-05, - "loss": 0.2624, - "step": 1400 - }, - { - "epoch": 0.14324900944833893, - "grad_norm": 3.125, - "learning_rate": 4.780264674399978e-05, - "loss": 0.2518, - "step": 1410 - }, - { - "epoch": 0.14426495986995835, - "grad_norm": 3.671875, - "learning_rate": 4.7769481180787966e-05, - "loss": 0.3112, - "step": 1420 - }, - { - "epoch": 0.14528091029157778, - "grad_norm": 3.984375, - "learning_rate": 4.773607888005327e-05, - "loss": 0.2747, - "step": 1430 - }, - { - "epoch": 0.1462968607131972, - "grad_norm": 3.234375, - "learning_rate": 4.770244018908416e-05, - "loss": 0.1572, - "step": 1440 - }, - { - "epoch": 0.14731281113481662, - "grad_norm": 4.09375, - "learning_rate": 4.766856545762687e-05, - "loss": 0.2148, - "step": 1450 - }, - { - "epoch": 0.14832876155643604, - "grad_norm": 1.6875, - "learning_rate": 4.763445503788178e-05, - "loss": 0.2531, - "step": 1460 - }, - { - "epoch": 0.14934471197805546, - "grad_norm": 2.375, - "learning_rate": 4.760010928449976e-05, - "loss": 0.199, - "step": 1470 - }, - { - "epoch": 0.15036066239967488, - "grad_norm": 4.6875, - "learning_rate": 4.7565528554578485e-05, - "loss": 0.2366, - "step": 1480 - }, - { - "epoch": 0.15137661282129433, - "grad_norm": 5.4375, - "learning_rate": 4.75307132076587e-05, - "loss": 0.1862, - "step": 1490 - }, - { - "epoch": 0.15239256324291375, - "grad_norm": 2.484375, - "learning_rate": 4.749566360572049e-05, - "loss": 0.2143, - "step": 1500 - }, - { - "epoch": 0.15340851366453317, - "grad_norm": 2.1875, - "learning_rate": 4.746038011317955e-05, - "loss": 0.1877, - "step": 1510 - }, - { - "epoch": 0.1544244640861526, - "grad_norm": 2.84375, - "learning_rate": 4.742486309688333e-05, - "loss": 0.2831, - "step": 1520 - }, - { - "epoch": 0.15544041450777202, - "grad_norm": 2.015625, - "learning_rate": 4.738911292610732e-05, - "loss": 0.1708, - "step": 1530 - }, - { - "epoch": 0.15645636492939144, - "grad_norm": 3.953125, - "learning_rate": 4.735312997255107e-05, - "loss": 0.192, - "step": 1540 - }, - { - "epoch": 0.15747231535101086, - "grad_norm": 2.09375, - "learning_rate": 4.7316914610334475e-05, - "loss": 0.2586, - "step": 1550 - }, - { - "epoch": 0.15848826577263028, - "grad_norm": 3.6875, - "learning_rate": 4.728046721599378e-05, - "loss": 0.2141, - "step": 1560 - }, - { - "epoch": 0.15950421619424973, - "grad_norm": 2.9375, - "learning_rate": 4.724378816847771e-05, - "loss": 0.193, - "step": 1570 - }, - { - "epoch": 0.16052016661586915, - "grad_norm": 1.5625, - "learning_rate": 4.720687784914352e-05, - "loss": 0.191, - "step": 1580 - }, - { - "epoch": 0.16153611703748857, - "grad_norm": 3.75, - "learning_rate": 4.716973664175304e-05, - "loss": 0.2172, - "step": 1590 - }, - { - "epoch": 0.162552067459108, - "grad_norm": 3.125, - "learning_rate": 4.7132364932468645e-05, - "loss": 0.2134, - "step": 1600 - }, - { - "epoch": 0.16356801788072742, - "grad_norm": 4.09375, - "learning_rate": 4.709476310984932e-05, - "loss": 0.2055, - "step": 1610 - }, - { - "epoch": 0.16458396830234684, - "grad_norm": 3.875, - "learning_rate": 4.705693156484652e-05, - "loss": 0.2136, - "step": 1620 - }, - { - "epoch": 0.16559991872396626, - "grad_norm": 1.1796875, - "learning_rate": 4.7018870690800196e-05, - "loss": 0.1471, - "step": 1630 - }, - { - "epoch": 0.16661586914558568, - "grad_norm": 2.5, - "learning_rate": 4.698058088343465e-05, - "loss": 0.2308, - "step": 1640 - }, - { - "epoch": 0.16763181956720513, - "grad_norm": 1.390625, - "learning_rate": 4.6942062540854425e-05, - "loss": 0.2456, - "step": 1650 - }, - { - "epoch": 0.16864776998882455, - "grad_norm": 3.125, - "learning_rate": 4.69033160635402e-05, - "loss": 0.2654, - "step": 1660 - }, - { - "epoch": 0.16966372041044397, - "grad_norm": 3.984375, - "learning_rate": 4.6864341854344587e-05, - "loss": 0.2226, - "step": 1670 - }, - { - "epoch": 0.1706796708320634, - "grad_norm": 2.328125, - "learning_rate": 4.682514031848795e-05, - "loss": 0.2438, - "step": 1680 - }, - { - "epoch": 0.17169562125368282, - "grad_norm": 3.078125, - "learning_rate": 4.678571186355423e-05, - "loss": 0.1889, - "step": 1690 - }, - { - "epoch": 0.17271157167530224, - "grad_norm": 3.328125, - "learning_rate": 4.6746056899486644e-05, - "loss": 0.2117, - "step": 1700 - }, - { - "epoch": 0.17372752209692166, - "grad_norm": 2.78125, - "learning_rate": 4.67061758385835e-05, - "loss": 0.1953, - "step": 1710 - }, - { - "epoch": 0.17474347251854108, - "grad_norm": 3.09375, - "learning_rate": 4.6666069095493816e-05, - "loss": 0.1844, - "step": 1720 - }, - { - "epoch": 0.17575942294016053, - "grad_norm": 3.234375, - "learning_rate": 4.662573708721309e-05, - "loss": 0.2774, - "step": 1730 - }, - { - "epoch": 0.17677537336177995, - "grad_norm": 4.03125, - "learning_rate": 4.658518023307894e-05, - "loss": 0.2527, - "step": 1740 - }, - { - "epoch": 0.17779132378339937, - "grad_norm": 3.21875, - "learning_rate": 4.654439895476671e-05, - "loss": 0.2164, - "step": 1750 - }, - { - "epoch": 0.1788072742050188, - "grad_norm": 2.390625, - "learning_rate": 4.6503393676285146e-05, - "loss": 0.2424, - "step": 1760 - }, - { - "epoch": 0.17982322462663822, - "grad_norm": 1.8359375, - "learning_rate": 4.646216482397192e-05, - "loss": 0.2428, - "step": 1770 - }, - { - "epoch": 0.18083917504825764, - "grad_norm": 2.796875, - "learning_rate": 4.6420712826489275e-05, - "loss": 0.2155, - "step": 1780 - }, - { - "epoch": 0.18185512546987706, - "grad_norm": 0.69921875, - "learning_rate": 4.6379038114819485e-05, - "loss": 0.1544, - "step": 1790 - }, - { - "epoch": 0.18287107589149648, - "grad_norm": 3.40625, - "learning_rate": 4.6337141122260444e-05, - "loss": 0.2029, - "step": 1800 - }, - { - "epoch": 0.18388702631311593, - "grad_norm": 2.359375, - "learning_rate": 4.629502228442112e-05, - "loss": 0.1489, - "step": 1810 - }, - { - "epoch": 0.18490297673473535, - "grad_norm": 1.4453125, - "learning_rate": 4.6252682039217045e-05, - "loss": 0.2101, - "step": 1820 - }, - { - "epoch": 0.18591892715635477, - "grad_norm": 2.71875, - "learning_rate": 4.621012082686573e-05, - "loss": 0.2076, - "step": 1830 - }, - { - "epoch": 0.1869348775779742, - "grad_norm": 3.0625, - "learning_rate": 4.616733908988216e-05, - "loss": 0.2719, - "step": 1840 - }, - { - "epoch": 0.18795082799959362, - "grad_norm": 1.953125, - "learning_rate": 4.612433727307409e-05, - "loss": 0.2105, - "step": 1850 - }, - { - "epoch": 0.18896677842121304, - "grad_norm": 3.46875, - "learning_rate": 4.608111582353751e-05, - "loss": 0.1877, - "step": 1860 - }, - { - "epoch": 0.18998272884283246, - "grad_norm": 2.546875, - "learning_rate": 4.603767519065197e-05, - "loss": 0.2238, - "step": 1870 - }, - { - "epoch": 0.19099867926445188, - "grad_norm": 1.5703125, - "learning_rate": 4.599401582607589e-05, - "loss": 0.243, - "step": 1880 - }, - { - "epoch": 0.19201462968607133, - "grad_norm": 2.4375, - "learning_rate": 4.595013818374185e-05, - "loss": 0.1867, - "step": 1890 - }, - { - "epoch": 0.19303058010769075, - "grad_norm": 2.203125, - "learning_rate": 4.5906042719851925e-05, - "loss": 0.1994, - "step": 1900 - }, - { - "epoch": 0.19404653052931017, - "grad_norm": 3.984375, - "learning_rate": 4.586172989287291e-05, - "loss": 0.1899, - "step": 1910 - }, - { - "epoch": 0.1950624809509296, - "grad_norm": 2.6875, - "learning_rate": 4.5817200163531534e-05, - "loss": 0.2528, - "step": 1920 - }, - { - "epoch": 0.19607843137254902, - "grad_norm": 2.71875, - "learning_rate": 4.577245399480972e-05, - "loss": 0.2336, - "step": 1930 - }, - { - "epoch": 0.19709438179416844, - "grad_norm": 2.640625, - "learning_rate": 4.5727491851939715e-05, - "loss": 0.2204, - "step": 1940 - }, - { - "epoch": 0.19811033221578786, - "grad_norm": 1.78125, - "learning_rate": 4.568231420239929e-05, - "loss": 0.1656, - "step": 1950 - }, - { - "epoch": 0.19912628263740728, - "grad_norm": 3.15625, - "learning_rate": 4.563692151590687e-05, - "loss": 0.2105, - "step": 1960 - }, - { - "epoch": 0.20014223305902673, - "grad_norm": 1.3671875, - "learning_rate": 4.5591314264416666e-05, - "loss": 0.1464, - "step": 1970 - }, - { - "epoch": 0.20115818348064615, - "grad_norm": 4.25, - "learning_rate": 4.554549292211371e-05, - "loss": 0.2103, - "step": 1980 - }, - { - "epoch": 0.20217413390226557, - "grad_norm": 2.984375, - "learning_rate": 4.549945796540901e-05, - "loss": 0.144, - "step": 1990 - }, - { - "epoch": 0.203190084323885, - "grad_norm": 1.859375, - "learning_rate": 4.545320987293453e-05, - "loss": 0.1963, - "step": 2000 - }, - { - "epoch": 0.20420603474550442, - "grad_norm": 1.078125, - "learning_rate": 4.540674912553824e-05, - "loss": 0.2115, - "step": 2010 - }, - { - "epoch": 0.20522198516712384, - "grad_norm": 4.25, - "learning_rate": 4.536007620627911e-05, - "loss": 0.1682, - "step": 2020 - }, - { - "epoch": 0.20623793558874326, - "grad_norm": 2.71875, - "learning_rate": 4.531319160042212e-05, - "loss": 0.1992, - "step": 2030 - }, - { - "epoch": 0.20725388601036268, - "grad_norm": 1.2890625, - "learning_rate": 4.5266095795433126e-05, - "loss": 0.1134, - "step": 2040 - }, - { - "epoch": 0.20826983643198213, - "grad_norm": 3.296875, - "learning_rate": 4.5218789280973925e-05, - "loss": 0.1474, - "step": 2050 - }, - { - "epoch": 0.20928578685360155, - "grad_norm": 1.9375, - "learning_rate": 4.5171272548897024e-05, - "loss": 0.1955, - "step": 2060 - }, - { - "epoch": 0.21030173727522097, - "grad_norm": 2.734375, - "learning_rate": 4.512354609324063e-05, - "loss": 0.2042, - "step": 2070 - }, - { - "epoch": 0.2113176876968404, - "grad_norm": 2.921875, - "learning_rate": 4.507561041022347e-05, - "loss": 0.2174, - "step": 2080 - }, - { - "epoch": 0.21233363811845982, - "grad_norm": 2.40625, - "learning_rate": 4.502746599823963e-05, - "loss": 0.2634, - "step": 2090 - }, - { - "epoch": 0.21334958854007924, - "grad_norm": 1.71875, - "learning_rate": 4.497911335785339e-05, - "loss": 0.1884, - "step": 2100 - }, - { - "epoch": 0.21436553896169866, - "grad_norm": 0.79296875, - "learning_rate": 4.4930552991794e-05, - "loss": 0.1872, - "step": 2110 - }, - { - "epoch": 0.21538148938331808, - "grad_norm": 3.171875, - "learning_rate": 4.4881785404950474e-05, - "loss": 0.2233, - "step": 2120 - }, - { - "epoch": 0.21639743980493753, - "grad_norm": 2.59375, - "learning_rate": 4.483281110436631e-05, - "loss": 0.2374, - "step": 2130 - }, - { - "epoch": 0.21741339022655695, - "grad_norm": 3.328125, - "learning_rate": 4.478363059923426e-05, - "loss": 0.2545, - "step": 2140 - }, - { - "epoch": 0.21842934064817637, - "grad_norm": 2.3125, - "learning_rate": 4.4734244400891014e-05, - "loss": 0.2063, - "step": 2150 - }, - { - "epoch": 0.2194452910697958, - "grad_norm": 3.40625, - "learning_rate": 4.4684653022811865e-05, - "loss": 0.1219, - "step": 2160 - }, - { - "epoch": 0.22046124149141522, - "grad_norm": 4.1875, - "learning_rate": 4.463485698060541e-05, - "loss": 0.2805, - "step": 2170 - }, - { - "epoch": 0.22147719191303464, - "grad_norm": 2.3125, - "learning_rate": 4.458485679200814e-05, - "loss": 0.1998, - "step": 2180 - }, - { - "epoch": 0.22249314233465406, - "grad_norm": 3.578125, - "learning_rate": 4.453465297687912e-05, - "loss": 0.2489, - "step": 2190 - }, - { - "epoch": 0.22350909275627348, - "grad_norm": 2.59375, - "learning_rate": 4.448424605719452e-05, - "loss": 0.2731, - "step": 2200 - }, - { - "epoch": 0.22452504317789293, - "grad_norm": 3.28125, - "learning_rate": 4.443363655704224e-05, - "loss": 0.2425, - "step": 2210 - }, - { - "epoch": 0.22554099359951235, - "grad_norm": 2.78125, - "learning_rate": 4.438282500261641e-05, - "loss": 0.2938, - "step": 2220 - }, - { - "epoch": 0.22655694402113177, - "grad_norm": 1.1953125, - "learning_rate": 4.433181192221197e-05, - "loss": 0.1728, - "step": 2230 - }, - { - "epoch": 0.2275728944427512, - "grad_norm": 1.34375, - "learning_rate": 4.4280597846219155e-05, - "loss": 0.216, - "step": 2240 - }, - { - "epoch": 0.22858884486437062, - "grad_norm": 1.8515625, - "learning_rate": 4.422918330711796e-05, - "loss": 0.1612, - "step": 2250 - }, - { - "epoch": 0.22960479528599004, - "grad_norm": 1.90625, - "learning_rate": 4.417756883947263e-05, - "loss": 0.107, - "step": 2260 - }, - { - "epoch": 0.23062074570760946, - "grad_norm": 3.375, - "learning_rate": 4.412575497992611e-05, - "loss": 0.1756, - "step": 2270 - }, - { - "epoch": 0.23163669612922888, - "grad_norm": 4.375, - "learning_rate": 4.407374226719445e-05, - "loss": 0.234, - "step": 2280 - }, - { - "epoch": 0.23265264655084833, - "grad_norm": 3.25, - "learning_rate": 4.402153124206119e-05, - "loss": 0.2144, - "step": 2290 - }, - { - "epoch": 0.23366859697246775, - "grad_norm": 1.703125, - "learning_rate": 4.396912244737173e-05, - "loss": 0.1696, - "step": 2300 - }, - { - "epoch": 0.23468454739408717, - "grad_norm": 2.84375, - "learning_rate": 4.391651642802778e-05, - "loss": 0.2506, - "step": 2310 - }, - { - "epoch": 0.2357004978157066, - "grad_norm": 4.5, - "learning_rate": 4.386371373098155e-05, - "loss": 0.1686, - "step": 2320 - }, - { - "epoch": 0.23671644823732602, - "grad_norm": 2.515625, - "learning_rate": 4.381071490523018e-05, - "loss": 0.2403, - "step": 2330 - }, - { - "epoch": 0.23773239865894544, - "grad_norm": 4.4375, - "learning_rate": 4.3757520501809955e-05, - "loss": 0.1611, - "step": 2340 - }, - { - "epoch": 0.23874834908056486, - "grad_norm": 1.609375, - "learning_rate": 4.370413107379065e-05, - "loss": 0.1698, - "step": 2350 - }, - { - "epoch": 0.23976429950218428, - "grad_norm": 4.96875, - "learning_rate": 4.36505471762697e-05, - "loss": 0.1928, - "step": 2360 - }, - { - "epoch": 0.24078024992380373, - "grad_norm": 0.8984375, - "learning_rate": 4.3596769366366474e-05, - "loss": 0.2035, - "step": 2370 - }, - { - "epoch": 0.24179620034542315, - "grad_norm": 5.75, - "learning_rate": 4.354279820321649e-05, - "loss": 0.16, - "step": 2380 - }, - { - "epoch": 0.24281215076704257, - "grad_norm": 1.9453125, - "learning_rate": 4.34886342479656e-05, - "loss": 0.1851, - "step": 2390 - }, - { - "epoch": 0.243828101188662, - "grad_norm": 1.015625, - "learning_rate": 4.34342780637641e-05, - "loss": 0.1726, - "step": 2400 - }, - { - "epoch": 0.24484405161028142, - "grad_norm": 4.59375, - "learning_rate": 4.337973021576095e-05, - "loss": 0.2847, - "step": 2410 - }, - { - "epoch": 0.24586000203190084, - "grad_norm": 1.03125, - "learning_rate": 4.3324991271097846e-05, - "loss": 0.2528, - "step": 2420 - }, - { - "epoch": 0.24687595245352026, - "grad_norm": 2.1875, - "learning_rate": 4.3270061798903374e-05, - "loss": 0.1573, - "step": 2430 - }, - { - "epoch": 0.24789190287513968, - "grad_norm": 0.98046875, - "learning_rate": 4.321494237028701e-05, - "loss": 0.1703, - "step": 2440 - }, - { - "epoch": 0.24890785329675913, - "grad_norm": 3.8125, - "learning_rate": 4.31596335583333e-05, - "loss": 0.2613, - "step": 2450 - }, - { - "epoch": 0.24992380371837855, - "grad_norm": 4.0625, - "learning_rate": 4.310413593809579e-05, - "loss": 0.22, - "step": 2460 - }, - { - "epoch": 0.250939754139998, - "grad_norm": 3.15625, - "learning_rate": 4.304845008659108e-05, - "loss": 0.1263, - "step": 2470 - }, - { - "epoch": 0.25195570456161737, - "grad_norm": 3.046875, - "learning_rate": 4.2992576582792895e-05, - "loss": 0.1639, - "step": 2480 - }, - { - "epoch": 0.2529716549832368, - "grad_norm": 9.8125, - "learning_rate": 4.293651600762595e-05, - "loss": 0.2681, - "step": 2490 - }, - { - "epoch": 0.25398760540485626, - "grad_norm": 3.734375, - "learning_rate": 4.288026894395999e-05, - "loss": 0.2292, - "step": 2500 - }, - { - "epoch": 0.25500355582647566, - "grad_norm": 0.455078125, - "learning_rate": 4.2823835976603723e-05, - "loss": 0.2324, - "step": 2510 - }, - { - "epoch": 0.2560195062480951, - "grad_norm": 5.625, - "learning_rate": 4.276721769229869e-05, - "loss": 0.1834, - "step": 2520 - }, - { - "epoch": 0.2570354566697145, - "grad_norm": 1.3671875, - "learning_rate": 4.271041467971323e-05, - "loss": 0.1826, - "step": 2530 - }, - { - "epoch": 0.25805140709133395, - "grad_norm": 5.0625, - "learning_rate": 4.265342752943632e-05, - "loss": 0.2463, - "step": 2540 - }, - { - "epoch": 0.25906735751295334, - "grad_norm": 2.859375, - "learning_rate": 4.2596256833971425e-05, - "loss": 0.2598, - "step": 2550 - }, - { - "epoch": 0.2600833079345728, - "grad_norm": 1.8515625, - "learning_rate": 4.2538903187730374e-05, - "loss": 0.1148, - "step": 2560 - }, - { - "epoch": 0.26109925835619224, - "grad_norm": 2.71875, - "learning_rate": 4.248136718702716e-05, - "loss": 0.2123, - "step": 2570 - }, - { - "epoch": 0.26211520877781164, - "grad_norm": 4.5625, - "learning_rate": 4.242364943007172e-05, - "loss": 0.2369, - "step": 2580 - }, - { - "epoch": 0.2631311591994311, - "grad_norm": 2.296875, - "learning_rate": 4.236575051696377e-05, - "loss": 0.261, - "step": 2590 - }, - { - "epoch": 0.2641471096210505, - "grad_norm": 2.75, - "learning_rate": 4.2307671049686514e-05, - "loss": 0.1564, - "step": 2600 - }, - { - "epoch": 0.26516306004266993, - "grad_norm": 3.5, - "learning_rate": 4.2249411632100396e-05, - "loss": 0.1563, - "step": 2610 - }, - { - "epoch": 0.2661790104642893, - "grad_norm": 2.84375, - "learning_rate": 4.219097286993684e-05, - "loss": 0.1697, - "step": 2620 - }, - { - "epoch": 0.26719496088590877, - "grad_norm": 2.125, - "learning_rate": 4.2132355370791946e-05, - "loss": 0.1844, - "step": 2630 - }, - { - "epoch": 0.26821091130752817, - "grad_norm": 4.03125, - "learning_rate": 4.2073559744120156e-05, - "loss": 0.2144, - "step": 2640 - }, - { - "epoch": 0.2692268617291476, - "grad_norm": 2.375, - "learning_rate": 4.201458660122793e-05, - "loss": 0.2013, - "step": 2650 - }, - { - "epoch": 0.27024281215076706, - "grad_norm": 3.625, - "learning_rate": 4.1955436555267393e-05, - "loss": 0.2166, - "step": 2660 - }, - { - "epoch": 0.27125876257238646, - "grad_norm": 0.328125, - "learning_rate": 4.189611022122997e-05, - "loss": 0.1934, - "step": 2670 - }, - { - "epoch": 0.2722747129940059, - "grad_norm": 2.75, - "learning_rate": 4.1836608215939944e-05, - "loss": 0.2157, - "step": 2680 - }, - { - "epoch": 0.2732906634156253, - "grad_norm": 3.5, - "learning_rate": 4.17769311580481e-05, - "loss": 0.18, - "step": 2690 - }, - { - "epoch": 0.27430661383724475, - "grad_norm": 2.109375, - "learning_rate": 4.171707966802528e-05, - "loss": 0.2178, - "step": 2700 - }, - { - "epoch": 0.27532256425886414, - "grad_norm": 4.65625, - "learning_rate": 4.16570543681559e-05, - "loss": 0.1896, - "step": 2710 - }, - { - "epoch": 0.2763385146804836, - "grad_norm": 4.8125, - "learning_rate": 4.159685588253151e-05, - "loss": 0.1322, - "step": 2720 - }, - { - "epoch": 0.27735446510210304, - "grad_norm": 3.9375, - "learning_rate": 4.153648483704429e-05, - "loss": 0.184, - "step": 2730 - }, - { - "epoch": 0.27837041552372244, - "grad_norm": 4.53125, - "learning_rate": 4.147594185938057e-05, - "loss": 0.2451, - "step": 2740 - }, - { - "epoch": 0.2793863659453419, - "grad_norm": 1.0390625, - "learning_rate": 4.141522757901426e-05, - "loss": 0.2367, - "step": 2750 - }, - { - "epoch": 0.2804023163669613, - "grad_norm": 3.375, - "learning_rate": 4.1354342627200345e-05, - "loss": 0.179, - "step": 2760 - }, - { - "epoch": 0.28141826678858073, - "grad_norm": 2.953125, - "learning_rate": 4.1293287636968286e-05, - "loss": 0.1396, - "step": 2770 - }, - { - "epoch": 0.2824342172102001, - "grad_norm": 2.546875, - "learning_rate": 4.1232063243115485e-05, - "loss": 0.1963, - "step": 2780 - }, - { - "epoch": 0.28345016763181957, - "grad_norm": 5.09375, - "learning_rate": 4.117067008220063e-05, - "loss": 0.2457, - "step": 2790 - }, - { - "epoch": 0.28446611805343897, - "grad_norm": 2.046875, - "learning_rate": 4.110910879253712e-05, - "loss": 0.2262, - "step": 2800 - }, - { - "epoch": 0.2854820684750584, - "grad_norm": 2.1875, - "learning_rate": 4.104738001418641e-05, - "loss": 0.2499, - "step": 2810 - }, - { - "epoch": 0.28649801889667786, - "grad_norm": 2.59375, - "learning_rate": 4.098548438895135e-05, - "loss": 0.1667, - "step": 2820 - }, - { - "epoch": 0.28751396931829726, - "grad_norm": 2.875, - "learning_rate": 4.092342256036954e-05, - "loss": 0.2288, - "step": 2830 - }, - { - "epoch": 0.2885299197399167, - "grad_norm": 3.015625, - "learning_rate": 4.086119517370659e-05, - "loss": 0.2038, - "step": 2840 - }, - { - "epoch": 0.2895458701615361, - "grad_norm": 3.53125, - "learning_rate": 4.0798802875949485e-05, - "loss": 0.181, - "step": 2850 - }, - { - "epoch": 0.29056182058315555, - "grad_norm": 2.296875, - "learning_rate": 4.073624631579975e-05, - "loss": 0.1886, - "step": 2860 - }, - { - "epoch": 0.29157777100477494, - "grad_norm": 3.609375, - "learning_rate": 4.067352614366685e-05, - "loss": 0.2053, - "step": 2870 - }, - { - "epoch": 0.2925937214263944, - "grad_norm": 2.328125, - "learning_rate": 4.061064301166128e-05, - "loss": 0.1409, - "step": 2880 - }, - { - "epoch": 0.29360967184801384, - "grad_norm": 4.9375, - "learning_rate": 4.054759757358787e-05, - "loss": 0.184, - "step": 2890 - }, - { - "epoch": 0.29462562226963324, - "grad_norm": 4.6875, - "learning_rate": 4.048439048493898e-05, - "loss": 0.2306, - "step": 2900 - }, - { - "epoch": 0.2956415726912527, - "grad_norm": 4.09375, - "learning_rate": 4.0421022402887676e-05, - "loss": 0.1914, - "step": 2910 - }, - { - "epoch": 0.2966575231128721, - "grad_norm": 2.3125, - "learning_rate": 4.035749398628088e-05, - "loss": 0.1653, - "step": 2920 - }, - { - "epoch": 0.29767347353449153, - "grad_norm": 2.515625, - "learning_rate": 4.029380589563256e-05, - "loss": 0.1941, - "step": 2930 - }, - { - "epoch": 0.2986894239561109, - "grad_norm": 1.78125, - "learning_rate": 4.02299587931168e-05, - "loss": 0.1117, - "step": 2940 - }, - { - "epoch": 0.29970537437773037, - "grad_norm": 0.8359375, - "learning_rate": 4.0165953342560974e-05, - "loss": 0.1605, - "step": 2950 - }, - { - "epoch": 0.30072132479934977, - "grad_norm": 3.046875, - "learning_rate": 4.010179020943884e-05, - "loss": 0.1726, - "step": 2960 - }, - { - "epoch": 0.3017372752209692, - "grad_norm": 3.453125, - "learning_rate": 4.003747006086357e-05, - "loss": 0.2208, - "step": 2970 - }, - { - "epoch": 0.30275322564258866, - "grad_norm": 2.515625, - "learning_rate": 3.9972993565580866e-05, - "loss": 0.1325, - "step": 2980 - }, - { - "epoch": 0.30376917606420806, - "grad_norm": 3.046875, - "learning_rate": 3.9908361393962e-05, - "loss": 0.2014, - "step": 2990 - }, - { - "epoch": 0.3047851264858275, - "grad_norm": 2.28125, - "learning_rate": 3.984357421799681e-05, - "loss": 0.165, - "step": 3000 - }, - { - "epoch": 0.3058010769074469, - "grad_norm": 5.09375, - "learning_rate": 3.9778632711286756e-05, - "loss": 0.212, - "step": 3010 - }, - { - "epoch": 0.30681702732906635, - "grad_norm": 4.25, - "learning_rate": 3.971353754903788e-05, - "loss": 0.2388, - "step": 3020 - }, - { - "epoch": 0.30783297775068574, - "grad_norm": 2.34375, - "learning_rate": 3.964828940805381e-05, - "loss": 0.2175, - "step": 3030 - }, - { - "epoch": 0.3088489281723052, - "grad_norm": 4.09375, - "learning_rate": 3.95828889667287e-05, - "loss": 0.2088, - "step": 3040 - }, - { - "epoch": 0.30986487859392464, - "grad_norm": 2.359375, - "learning_rate": 3.9517336905040244e-05, - "loss": 0.1913, - "step": 3050 - }, - { - "epoch": 0.31088082901554404, - "grad_norm": 1.1640625, - "learning_rate": 3.9451633904542483e-05, - "loss": 0.2185, - "step": 3060 - }, - { - "epoch": 0.3118967794371635, - "grad_norm": 2.59375, - "learning_rate": 3.9385780648358846e-05, - "loss": 0.2072, - "step": 3070 - }, - { - "epoch": 0.3129127298587829, - "grad_norm": 3.015625, - "learning_rate": 3.9319777821174955e-05, - "loss": 0.1902, - "step": 3080 - }, - { - "epoch": 0.31392868028040233, - "grad_norm": 2.375, - "learning_rate": 3.925362610923158e-05, - "loss": 0.259, - "step": 3090 - }, - { - "epoch": 0.3149446307020217, - "grad_norm": 4.65625, - "learning_rate": 3.918732620031742e-05, - "loss": 0.2026, - "step": 3100 - }, - { - "epoch": 0.31596058112364117, - "grad_norm": 2.1875, - "learning_rate": 3.912087878376205e-05, - "loss": 0.1478, - "step": 3110 - }, - { - "epoch": 0.31697653154526056, - "grad_norm": 2.34375, - "learning_rate": 3.905428455042865e-05, - "loss": 0.167, - "step": 3120 - }, - { - "epoch": 0.31799248196688, - "grad_norm": 2.390625, - "learning_rate": 3.898754419270693e-05, - "loss": 0.1629, - "step": 3130 - }, - { - "epoch": 0.31900843238849946, - "grad_norm": 1.546875, - "learning_rate": 3.892065840450583e-05, - "loss": 0.1308, - "step": 3140 - }, - { - "epoch": 0.32002438281011886, - "grad_norm": 4.625, - "learning_rate": 3.885362788124637e-05, - "loss": 0.2008, - "step": 3150 - }, - { - "epoch": 0.3210403332317383, - "grad_norm": 3.8125, - "learning_rate": 3.8786453319854396e-05, - "loss": 0.2225, - "step": 3160 - }, - { - "epoch": 0.3220562836533577, - "grad_norm": 3.015625, - "learning_rate": 3.8719135418753366e-05, - "loss": 0.2243, - "step": 3170 - }, - { - "epoch": 0.32307223407497715, - "grad_norm": 5.6875, - "learning_rate": 3.865167487785702e-05, - "loss": 0.1981, - "step": 3180 - }, - { - "epoch": 0.32408818449659654, - "grad_norm": 4.84375, - "learning_rate": 3.8584072398562164e-05, - "loss": 0.2031, - "step": 3190 - }, - { - "epoch": 0.325104134918216, - "grad_norm": 4.0625, - "learning_rate": 3.851632868374136e-05, - "loss": 0.1621, - "step": 3200 - }, - { - "epoch": 0.32612008533983544, - "grad_norm": 3.421875, - "learning_rate": 3.844844443773562e-05, - "loss": 0.1674, - "step": 3210 - }, - { - "epoch": 0.32713603576145484, - "grad_norm": 1.3671875, - "learning_rate": 3.8380420366347046e-05, - "loss": 0.1502, - "step": 3220 - }, - { - "epoch": 0.3281519861830743, - "grad_norm": 3.734375, - "learning_rate": 3.831225717683157e-05, - "loss": 0.1868, - "step": 3230 - }, - { - "epoch": 0.3291679366046937, - "grad_norm": 2.703125, - "learning_rate": 3.8243955577891534e-05, - "loss": 0.1818, - "step": 3240 - }, - { - "epoch": 0.3301838870263131, - "grad_norm": 3.796875, - "learning_rate": 3.8175516279668335e-05, - "loss": 0.2215, - "step": 3250 - }, - { - "epoch": 0.3311998374479325, - "grad_norm": 3.203125, - "learning_rate": 3.810693999373505e-05, - "loss": 0.2544, - "step": 3260 - }, - { - "epoch": 0.33221578786955197, - "grad_norm": 4.0, - "learning_rate": 3.8038227433089056e-05, - "loss": 0.1175, - "step": 3270 - }, - { - "epoch": 0.33323173829117136, - "grad_norm": 3.625, - "learning_rate": 3.796937931214458e-05, - "loss": 0.2213, - "step": 3280 - }, - { - "epoch": 0.3342476887127908, - "grad_norm": 1.7265625, - "learning_rate": 3.7900396346725296e-05, - "loss": 0.1711, - "step": 3290 - }, - { - "epoch": 0.33526363913441026, - "grad_norm": 3.140625, - "learning_rate": 3.783127925405686e-05, - "loss": 0.2628, - "step": 3300 - }, - { - "epoch": 0.33627958955602966, - "grad_norm": 2.1875, - "learning_rate": 3.77620287527595e-05, - "loss": 0.1671, - "step": 3310 - }, - { - "epoch": 0.3372955399776491, - "grad_norm": 5.28125, - "learning_rate": 3.769264556284048e-05, - "loss": 0.2109, - "step": 3320 - }, - { - "epoch": 0.3383114903992685, - "grad_norm": 2.875, - "learning_rate": 3.762313040568665e-05, - "loss": 0.1978, - "step": 3330 - }, - { - "epoch": 0.33932744082088795, - "grad_norm": 2.234375, - "learning_rate": 3.755348400405697e-05, - "loss": 0.1275, - "step": 3340 - }, - { - "epoch": 0.34034339124250734, - "grad_norm": 1.9453125, - "learning_rate": 3.7483707082074945e-05, - "loss": 0.1482, - "step": 3350 - }, - { - "epoch": 0.3413593416641268, - "grad_norm": 5.40625, - "learning_rate": 3.741380036522111e-05, - "loss": 0.1933, - "step": 3360 - }, - { - "epoch": 0.34237529208574624, - "grad_norm": 4.53125, - "learning_rate": 3.734376458032551e-05, - "loss": 0.1925, - "step": 3370 - }, - { - "epoch": 0.34339124250736563, - "grad_norm": 4.0625, - "learning_rate": 3.727360045556014e-05, - "loss": 0.2297, - "step": 3380 - }, - { - "epoch": 0.3444071929289851, - "grad_norm": 2.53125, - "learning_rate": 3.7203308720431336e-05, - "loss": 0.1704, - "step": 3390 - }, - { - "epoch": 0.3454231433506045, - "grad_norm": 1.859375, - "learning_rate": 3.7132890105772234e-05, - "loss": 0.258, - "step": 3400 - }, - { - "epoch": 0.3464390937722239, - "grad_norm": 3.90625, - "learning_rate": 3.706234534373515e-05, - "loss": 0.2376, - "step": 3410 - }, - { - "epoch": 0.3474550441938433, - "grad_norm": 1.1015625, - "learning_rate": 3.6991675167783985e-05, - "loss": 0.2403, - "step": 3420 - }, - { - "epoch": 0.34847099461546277, - "grad_norm": 1.1640625, - "learning_rate": 3.6920880312686556e-05, - "loss": 0.1642, - "step": 3430 - }, - { - "epoch": 0.34948694503708216, - "grad_norm": 2.875, - "learning_rate": 3.684996151450702e-05, - "loss": 0.1455, - "step": 3440 - }, - { - "epoch": 0.3505028954587016, - "grad_norm": 0.59765625, - "learning_rate": 3.6778919510598155e-05, - "loss": 0.2175, - "step": 3450 - }, - { - "epoch": 0.35151884588032106, - "grad_norm": 0.93359375, - "learning_rate": 3.670775503959376e-05, - "loss": 0.1858, - "step": 3460 - }, - { - "epoch": 0.35253479630194046, - "grad_norm": 4.1875, - "learning_rate": 3.6636468841400917e-05, - "loss": 0.1911, - "step": 3470 - }, - { - "epoch": 0.3535507467235599, - "grad_norm": 3.734375, - "learning_rate": 3.656506165719233e-05, - "loss": 0.2114, - "step": 3480 - }, - { - "epoch": 0.3545666971451793, - "grad_norm": 1.171875, - "learning_rate": 3.649353422939863e-05, - "loss": 0.1841, - "step": 3490 - }, - { - "epoch": 0.35558264756679875, - "grad_norm": 2.53125, - "learning_rate": 3.6421887301700615e-05, - "loss": 0.1505, - "step": 3500 - } - ], - "logging_steps": 10, - "max_steps": 9843, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "total_flos": 0.0, - "train_batch_size": 7, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoints/checkpoint-3500/training_args.bin b/checkpoints/checkpoint-3500/training_args.bin deleted file mode 100644 index 6bb0a4ec10ab29ca1942aecdfa4212d352e373f1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-3500/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 -size 4984 diff --git a/checkpoints/checkpoint-4000/adapter_config.json b/checkpoints/checkpoint-4000/adapter_config.json deleted file mode 100644 index 2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-4000/adapter_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "alpha_pattern": {}, - "auto_mapping": null, - "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": "gaussian", - "layer_replication": null, - "layers_pattern": null, - "layers_to_transform": null, - "loftq_config": {}, - "lora_alpha": 32, - "lora_dropout": 0.1, - "megatron_config": null, - "megatron_core": "megatron.core", - "modules_to_save": null, - "peft_type": "LORA", - "r": 8, - "rank_pattern": {}, - "revision": null, - "target_modules": [ - "v_proj", - "q_proj" - ], - "task_type": "CAUSAL_LM", - "use_dora": false, - "use_rslora": false -} \ No newline at end of file diff --git a/checkpoints/checkpoint-4000/adapter_model.safetensors b/checkpoints/checkpoint-4000/adapter_model.safetensors deleted file mode 100644 index 0010e62e10adb58b59a4171d87f3fa37c74f4b27..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-4000/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7bfc0a45e20d761b2e036c7f419351073780fc963f478f6726d61bf6913308b5 -size 5919456 diff --git a/checkpoints/checkpoint-4000/optimizer.pt b/checkpoints/checkpoint-4000/optimizer.pt deleted file mode 100644 index 7c063fe1aecc2f351c80b43394373815b139c4c7..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-4000/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:030a97b642b9ca1bca5998b5c77cfdcf063fd297b4aded2538f62462ffcb0d5f -size 11930938 diff --git a/checkpoints/checkpoint-4000/rng_state_0.pth b/checkpoints/checkpoint-4000/rng_state_0.pth deleted file mode 100644 index 5c7dcab2c1fa96e39da4e4616e5e8b282a2f6923..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-4000/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cdcaaa7cfa8093e078769990ecfb571790fea6d1f1143531eadfe0ef4d53d941 -size 15024 diff --git a/checkpoints/checkpoint-4000/rng_state_1.pth b/checkpoints/checkpoint-4000/rng_state_1.pth deleted file mode 100644 index db8664d3862d07cb99bf09b73c3eb39bf3a8cfb0..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-4000/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d1e818c945017fe6ffde32b0e04843796059b044ff660b74cf4c5e8397ef5bc3 -size 15024 diff --git a/checkpoints/checkpoint-4000/rng_state_2.pth b/checkpoints/checkpoint-4000/rng_state_2.pth deleted file mode 100644 index 485a59f7e6666a852992eb89a47736cd878a4cf8..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-4000/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2847ec9ff5d020b7601c16b4794123d4fc5216fa782ba5350f8d8d42d63dec99 -size 15024 diff --git a/checkpoints/checkpoint-4000/rng_state_3.pth b/checkpoints/checkpoint-4000/rng_state_3.pth deleted file mode 100644 index a9f9eb06dbb4d30c19a4bbda9fb0234299c087a7..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-4000/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5d3b7dd272ec99203448b0a7997c01b2d5e414ac34fcaf4a77f23495f1864166 -size 15024 diff --git a/checkpoints/checkpoint-4000/scheduler.pt b/checkpoints/checkpoint-4000/scheduler.pt deleted file mode 100644 index 2f9cc3da0d207e2f0ffb59d7ccbff46198b1d23f..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-4000/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:aa29240df220021e7e33aee35824e879cf6ebdc5ef315b12a0da849e15ca9816 -size 1064 diff --git a/checkpoints/checkpoint-4000/trainer_state.json b/checkpoints/checkpoint-4000/trainer_state.json deleted file mode 100644 index f1ed37679ac992e6c6dcdfdce95ee04973dd71d7..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-4000/trainer_state.json +++ /dev/null @@ -1,2821 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.40638016864777, - "eval_steps": 500, - "global_step": 4000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001015950421619425, - "grad_norm": 17.625, - "learning_rate": 5e-06, - "loss": 3.4264, - "step": 10 - }, - { - "epoch": 0.00203190084323885, - "grad_norm": 12.5625, - "learning_rate": 1e-05, - "loss": 3.432, - "step": 20 - }, - { - "epoch": 0.003047851264858275, - "grad_norm": 14.0625, - "learning_rate": 1.5e-05, - "loss": 3.23, - "step": 30 - }, - { - "epoch": 0.0040638016864777, - "grad_norm": 12.4375, - "learning_rate": 2e-05, - "loss": 2.9762, - "step": 40 - }, - { - "epoch": 0.005079752108097125, - "grad_norm": 10.0625, - "learning_rate": 2.5e-05, - "loss": 2.6173, - "step": 50 - }, - { - "epoch": 0.00609570252971655, - "grad_norm": 10.1875, - "learning_rate": 3e-05, - "loss": 2.2004, - "step": 60 - }, - { - "epoch": 0.007111652951335975, - "grad_norm": 7.03125, - "learning_rate": 3.5e-05, - "loss": 1.4176, - "step": 70 - }, - { - "epoch": 0.0081276033729554, - "grad_norm": 4.375, - "learning_rate": 4e-05, - "loss": 1.0122, - "step": 80 - }, - { - "epoch": 0.009143553794574825, - "grad_norm": 6.5625, - "learning_rate": 4.5e-05, - "loss": 0.9116, - "step": 90 - }, - { - "epoch": 0.01015950421619425, - "grad_norm": 5.28125, - "learning_rate": 5e-05, - "loss": 0.6832, - "step": 100 - }, - { - "epoch": 0.011175454637813675, - "grad_norm": 5.5, - "learning_rate": 4.9999870035728426e-05, - "loss": 0.7355, - "step": 110 - }, - { - "epoch": 0.0121914050594331, - "grad_norm": 5.1875, - "learning_rate": 4.9999480144264944e-05, - "loss": 0.6673, - "step": 120 - }, - { - "epoch": 0.013207355481052525, - "grad_norm": 4.5, - "learning_rate": 4.9998830329663314e-05, - "loss": 0.6792, - "step": 130 - }, - { - "epoch": 0.01422330590267195, - "grad_norm": 3.9375, - "learning_rate": 4.9997920598679756e-05, - "loss": 0.6207, - "step": 140 - }, - { - "epoch": 0.015239256324291375, - "grad_norm": 3.15625, - "learning_rate": 4.999675096077286e-05, - "loss": 0.483, - "step": 150 - }, - { - "epoch": 0.0162552067459108, - "grad_norm": 5.28125, - "learning_rate": 4.999532142810354e-05, - "loss": 0.5319, - "step": 160 - }, - { - "epoch": 0.017271157167530225, - "grad_norm": 4.59375, - "learning_rate": 4.999363201553483e-05, - "loss": 0.6052, - "step": 170 - }, - { - "epoch": 0.01828710758914965, - "grad_norm": 5.03125, - "learning_rate": 4.9991682740631794e-05, - "loss": 0.4258, - "step": 180 - }, - { - "epoch": 0.019303058010769075, - "grad_norm": 3.859375, - "learning_rate": 4.998947362366133e-05, - "loss": 0.4309, - "step": 190 - }, - { - "epoch": 0.0203190084323885, - "grad_norm": 3.328125, - "learning_rate": 4.998700468759193e-05, - "loss": 0.3957, - "step": 200 - }, - { - "epoch": 0.021334958854007924, - "grad_norm": 4.9375, - "learning_rate": 4.9984275958093475e-05, - "loss": 0.4777, - "step": 210 - }, - { - "epoch": 0.02235090927562735, - "grad_norm": 4.78125, - "learning_rate": 4.998128746353695e-05, - "loss": 0.3549, - "step": 220 - }, - { - "epoch": 0.023366859697246774, - "grad_norm": 4.0625, - "learning_rate": 4.997803923499417e-05, - "loss": 0.4447, - "step": 230 - }, - { - "epoch": 0.0243828101188662, - "grad_norm": 6.375, - "learning_rate": 4.99745313062374e-05, - "loss": 0.3808, - "step": 240 - }, - { - "epoch": 0.025398760540485624, - "grad_norm": 3.59375, - "learning_rate": 4.99707637137391e-05, - "loss": 0.3827, - "step": 250 - }, - { - "epoch": 0.02641471096210505, - "grad_norm": 3.015625, - "learning_rate": 4.996673649667145e-05, - "loss": 0.3694, - "step": 260 - }, - { - "epoch": 0.027430661383724474, - "grad_norm": 2.296875, - "learning_rate": 4.9962449696906e-05, - "loss": 0.3586, - "step": 270 - }, - { - "epoch": 0.0284466118053439, - "grad_norm": 4.125, - "learning_rate": 4.9957903359013214e-05, - "loss": 0.3832, - "step": 280 - }, - { - "epoch": 0.029462562226963324, - "grad_norm": 3.296875, - "learning_rate": 4.995309753026201e-05, - "loss": 0.328, - "step": 290 - }, - { - "epoch": 0.03047851264858275, - "grad_norm": 4.5, - "learning_rate": 4.994803226061927e-05, - "loss": 0.3667, - "step": 300 - }, - { - "epoch": 0.03149446307020217, - "grad_norm": 4.3125, - "learning_rate": 4.994270760274933e-05, - "loss": 0.3811, - "step": 310 - }, - { - "epoch": 0.0325104134918216, - "grad_norm": 3.421875, - "learning_rate": 4.99371236120134e-05, - "loss": 0.3065, - "step": 320 - }, - { - "epoch": 0.03352636391344102, - "grad_norm": 4.6875, - "learning_rate": 4.993128034646902e-05, - "loss": 0.4177, - "step": 330 - }, - { - "epoch": 0.03454231433506045, - "grad_norm": 3.046875, - "learning_rate": 4.992517786686947e-05, - "loss": 0.33, - "step": 340 - }, - { - "epoch": 0.03555826475667987, - "grad_norm": 1.8828125, - "learning_rate": 4.9918816236663077e-05, - "loss": 0.3287, - "step": 350 - }, - { - "epoch": 0.0365742151782993, - "grad_norm": 3.8125, - "learning_rate": 4.991219552199262e-05, - "loss": 0.2934, - "step": 360 - }, - { - "epoch": 0.03759016559991872, - "grad_norm": 4.28125, - "learning_rate": 4.99053157916946e-05, - "loss": 0.3176, - "step": 370 - }, - { - "epoch": 0.03860611602153815, - "grad_norm": 2.609375, - "learning_rate": 4.989817711729856e-05, - "loss": 0.3318, - "step": 380 - }, - { - "epoch": 0.03962206644315757, - "grad_norm": 2.375, - "learning_rate": 4.98907795730263e-05, - "loss": 0.3234, - "step": 390 - }, - { - "epoch": 0.040638016864777, - "grad_norm": 4.46875, - "learning_rate": 4.988312323579114e-05, - "loss": 0.267, - "step": 400 - }, - { - "epoch": 0.04165396728639642, - "grad_norm": 3.75, - "learning_rate": 4.98752081851971e-05, - "loss": 0.3081, - "step": 410 - }, - { - "epoch": 0.04266991770801585, - "grad_norm": 2.203125, - "learning_rate": 4.986703450353809e-05, - "loss": 0.2917, - "step": 420 - }, - { - "epoch": 0.04368586812963527, - "grad_norm": 1.6015625, - "learning_rate": 4.985860227579703e-05, - "loss": 0.2805, - "step": 430 - }, - { - "epoch": 0.0447018185512547, - "grad_norm": 3.140625, - "learning_rate": 4.984991158964499e-05, - "loss": 0.3534, - "step": 440 - }, - { - "epoch": 0.04571776897287412, - "grad_norm": 3.296875, - "learning_rate": 4.9840962535440265e-05, - "loss": 0.335, - "step": 450 - }, - { - "epoch": 0.04673371939449355, - "grad_norm": 3.25, - "learning_rate": 4.983175520622744e-05, - "loss": 0.2544, - "step": 460 - }, - { - "epoch": 0.04774966981611297, - "grad_norm": 2.25, - "learning_rate": 4.982228969773642e-05, - "loss": 0.3449, - "step": 470 - }, - { - "epoch": 0.0487656202377324, - "grad_norm": 4.9375, - "learning_rate": 4.9812566108381435e-05, - "loss": 0.2964, - "step": 480 - }, - { - "epoch": 0.04978157065935182, - "grad_norm": 1.5703125, - "learning_rate": 4.9802584539260035e-05, - "loss": 0.2799, - "step": 490 - }, - { - "epoch": 0.05079752108097125, - "grad_norm": 2.828125, - "learning_rate": 4.979234509415199e-05, - "loss": 0.3231, - "step": 500 - }, - { - "epoch": 0.05181347150259067, - "grad_norm": 2.9375, - "learning_rate": 4.978184787951828e-05, - "loss": 0.2943, - "step": 510 - }, - { - "epoch": 0.0528294219242101, - "grad_norm": 2.34375, - "learning_rate": 4.977109300449992e-05, - "loss": 0.2705, - "step": 520 - }, - { - "epoch": 0.05384537234582952, - "grad_norm": 3.140625, - "learning_rate": 4.9760080580916876e-05, - "loss": 0.2998, - "step": 530 - }, - { - "epoch": 0.05486132276744895, - "grad_norm": 3.5625, - "learning_rate": 4.974881072326688e-05, - "loss": 0.2595, - "step": 540 - }, - { - "epoch": 0.05587727318906837, - "grad_norm": 4.25, - "learning_rate": 4.9737283548724236e-05, - "loss": 0.2803, - "step": 550 - }, - { - "epoch": 0.0568932236106878, - "grad_norm": 4.0625, - "learning_rate": 4.97254991771386e-05, - "loss": 0.3511, - "step": 560 - }, - { - "epoch": 0.05790917403230722, - "grad_norm": 2.515625, - "learning_rate": 4.971345773103377e-05, - "loss": 0.312, - "step": 570 - }, - { - "epoch": 0.05892512445392665, - "grad_norm": 3.21875, - "learning_rate": 4.9701159335606365e-05, - "loss": 0.2482, - "step": 580 - }, - { - "epoch": 0.05994107487554607, - "grad_norm": 5.5, - "learning_rate": 4.968860411872454e-05, - "loss": 0.2537, - "step": 590 - }, - { - "epoch": 0.0609570252971655, - "grad_norm": 3.546875, - "learning_rate": 4.967579221092666e-05, - "loss": 0.3125, - "step": 600 - }, - { - "epoch": 0.06197297571878492, - "grad_norm": 2.984375, - "learning_rate": 4.966272374541996e-05, - "loss": 0.2354, - "step": 610 - }, - { - "epoch": 0.06298892614040434, - "grad_norm": 3.6875, - "learning_rate": 4.964939885807912e-05, - "loss": 0.3213, - "step": 620 - }, - { - "epoch": 0.06400487656202378, - "grad_norm": 2.140625, - "learning_rate": 4.9635817687444876e-05, - "loss": 0.3003, - "step": 630 - }, - { - "epoch": 0.0650208269836432, - "grad_norm": 3.484375, - "learning_rate": 4.962198037472259e-05, - "loss": 0.2996, - "step": 640 - }, - { - "epoch": 0.06603677740526262, - "grad_norm": 3.21875, - "learning_rate": 4.9607887063780776e-05, - "loss": 0.2257, - "step": 650 - }, - { - "epoch": 0.06705272782688204, - "grad_norm": 5.375, - "learning_rate": 4.9593537901149564e-05, - "loss": 0.223, - "step": 660 - }, - { - "epoch": 0.06806867824850148, - "grad_norm": 4.1875, - "learning_rate": 4.957893303601924e-05, - "loss": 0.3407, - "step": 670 - }, - { - "epoch": 0.0690846286701209, - "grad_norm": 3.328125, - "learning_rate": 4.956407262023866e-05, - "loss": 0.2589, - "step": 680 - }, - { - "epoch": 0.07010057909174032, - "grad_norm": 2.953125, - "learning_rate": 4.954895680831367e-05, - "loss": 0.2949, - "step": 690 - }, - { - "epoch": 0.07111652951335974, - "grad_norm": 4.0625, - "learning_rate": 4.9533585757405506e-05, - "loss": 0.2995, - "step": 700 - }, - { - "epoch": 0.07213247993497918, - "grad_norm": 4.625, - "learning_rate": 4.951795962732917e-05, - "loss": 0.2894, - "step": 710 - }, - { - "epoch": 0.0731484303565986, - "grad_norm": 3.0, - "learning_rate": 4.9502078580551755e-05, - "loss": 0.3082, - "step": 720 - }, - { - "epoch": 0.07416438077821802, - "grad_norm": 3.65625, - "learning_rate": 4.9485942782190734e-05, - "loss": 0.2308, - "step": 730 - }, - { - "epoch": 0.07518033119983744, - "grad_norm": 4.78125, - "learning_rate": 4.9469552400012306e-05, - "loss": 0.2272, - "step": 740 - }, - { - "epoch": 0.07619628162145688, - "grad_norm": 4.25, - "learning_rate": 4.94529076044296e-05, - "loss": 0.2701, - "step": 750 - }, - { - "epoch": 0.0772122320430763, - "grad_norm": 3.140625, - "learning_rate": 4.94360085685009e-05, - "loss": 0.2686, - "step": 760 - }, - { - "epoch": 0.07822818246469572, - "grad_norm": 0.765625, - "learning_rate": 4.9418855467927894e-05, - "loss": 0.2051, - "step": 770 - }, - { - "epoch": 0.07924413288631514, - "grad_norm": 1.796875, - "learning_rate": 4.940144848105379e-05, - "loss": 0.2267, - "step": 780 - }, - { - "epoch": 0.08026008330793458, - "grad_norm": 4.5625, - "learning_rate": 4.93837877888615e-05, - "loss": 0.2597, - "step": 790 - }, - { - "epoch": 0.081276033729554, - "grad_norm": 3.03125, - "learning_rate": 4.9365873574971745e-05, - "loss": 0.3701, - "step": 800 - }, - { - "epoch": 0.08229198415117342, - "grad_norm": 4.5625, - "learning_rate": 4.9347706025641136e-05, - "loss": 0.2559, - "step": 810 - }, - { - "epoch": 0.08330793457279284, - "grad_norm": 3.90625, - "learning_rate": 4.9329285329760275e-05, - "loss": 0.2799, - "step": 820 - }, - { - "epoch": 0.08432388499441228, - "grad_norm": 3.140625, - "learning_rate": 4.9310611678851735e-05, - "loss": 0.2866, - "step": 830 - }, - { - "epoch": 0.0853398354160317, - "grad_norm": 2.46875, - "learning_rate": 4.929168526706811e-05, - "loss": 0.3105, - "step": 840 - }, - { - "epoch": 0.08635578583765112, - "grad_norm": 13.625, - "learning_rate": 4.927250629119e-05, - "loss": 0.2454, - "step": 850 - }, - { - "epoch": 0.08737173625927054, - "grad_norm": 3.921875, - "learning_rate": 4.9253074950623925e-05, - "loss": 0.2424, - "step": 860 - }, - { - "epoch": 0.08838768668088998, - "grad_norm": 2.90625, - "learning_rate": 4.9233391447400286e-05, - "loss": 0.2481, - "step": 870 - }, - { - "epoch": 0.0894036371025094, - "grad_norm": 2.96875, - "learning_rate": 4.921345598617125e-05, - "loss": 0.2231, - "step": 880 - }, - { - "epoch": 0.09041958752412882, - "grad_norm": 5.375, - "learning_rate": 4.9193268774208654e-05, - "loss": 0.3447, - "step": 890 - }, - { - "epoch": 0.09143553794574824, - "grad_norm": 2.0, - "learning_rate": 4.9172830021401785e-05, - "loss": 0.229, - "step": 900 - }, - { - "epoch": 0.09245148836736768, - "grad_norm": 3.1875, - "learning_rate": 4.9152139940255245e-05, - "loss": 0.2122, - "step": 910 - }, - { - "epoch": 0.0934674387889871, - "grad_norm": 3.40625, - "learning_rate": 4.913119874588677e-05, - "loss": 0.2386, - "step": 920 - }, - { - "epoch": 0.09448338921060652, - "grad_norm": 1.4609375, - "learning_rate": 4.911000665602489e-05, - "loss": 0.1944, - "step": 930 - }, - { - "epoch": 0.09549933963222594, - "grad_norm": 5.0625, - "learning_rate": 4.9088563891006786e-05, - "loss": 0.2038, - "step": 940 - }, - { - "epoch": 0.09651529005384538, - "grad_norm": 4.53125, - "learning_rate": 4.906687067377592e-05, - "loss": 0.3122, - "step": 950 - }, - { - "epoch": 0.0975312404754648, - "grad_norm": 2.84375, - "learning_rate": 4.904492722987976e-05, - "loss": 0.3157, - "step": 960 - }, - { - "epoch": 0.09854719089708422, - "grad_norm": 2.171875, - "learning_rate": 4.902273378746738e-05, - "loss": 0.3077, - "step": 970 - }, - { - "epoch": 0.09956314131870364, - "grad_norm": 2.84375, - "learning_rate": 4.9000290577287165e-05, - "loss": 0.2756, - "step": 980 - }, - { - "epoch": 0.10057909174032308, - "grad_norm": 0.99609375, - "learning_rate": 4.897759783268434e-05, - "loss": 0.2915, - "step": 990 - }, - { - "epoch": 0.1015950421619425, - "grad_norm": 3.53125, - "learning_rate": 4.895465578959859e-05, - "loss": 0.2052, - "step": 1000 - }, - { - "epoch": 0.10261099258356192, - "grad_norm": 4.0, - "learning_rate": 4.893146468656159e-05, - "loss": 0.2499, - "step": 1010 - }, - { - "epoch": 0.10362694300518134, - "grad_norm": 1.65625, - "learning_rate": 4.890802476469452e-05, - "loss": 0.278, - "step": 1020 - }, - { - "epoch": 0.10464289342680078, - "grad_norm": 3.625, - "learning_rate": 4.888433626770558e-05, - "loss": 0.2143, - "step": 1030 - }, - { - "epoch": 0.1056588438484202, - "grad_norm": 5.0625, - "learning_rate": 4.886039944188741e-05, - "loss": 0.2878, - "step": 1040 - }, - { - "epoch": 0.10667479427003962, - "grad_norm": 4.5, - "learning_rate": 4.883621453611461e-05, - "loss": 0.2744, - "step": 1050 - }, - { - "epoch": 0.10769074469165904, - "grad_norm": 4.5625, - "learning_rate": 4.881178180184106e-05, - "loss": 0.2734, - "step": 1060 - }, - { - "epoch": 0.10870669511327848, - "grad_norm": 3.125, - "learning_rate": 4.878710149309735e-05, - "loss": 0.3574, - "step": 1070 - }, - { - "epoch": 0.1097226455348979, - "grad_norm": 3.0625, - "learning_rate": 4.876217386648816e-05, - "loss": 0.2625, - "step": 1080 - }, - { - "epoch": 0.11073859595651732, - "grad_norm": 4.0625, - "learning_rate": 4.873699918118955e-05, - "loss": 0.2437, - "step": 1090 - }, - { - "epoch": 0.11175454637813674, - "grad_norm": 1.59375, - "learning_rate": 4.87115776989463e-05, - "loss": 0.2051, - "step": 1100 - }, - { - "epoch": 0.11277049679975618, - "grad_norm": 4.375, - "learning_rate": 4.8685909684069153e-05, - "loss": 0.1727, - "step": 1110 - }, - { - "epoch": 0.1137864472213756, - "grad_norm": 2.28125, - "learning_rate": 4.865999540343211e-05, - "loss": 0.2256, - "step": 1120 - }, - { - "epoch": 0.11480239764299502, - "grad_norm": 2.265625, - "learning_rate": 4.86338351264696e-05, - "loss": 0.3529, - "step": 1130 - }, - { - "epoch": 0.11581834806461444, - "grad_norm": 2.34375, - "learning_rate": 4.8607429125173754e-05, - "loss": 0.2113, - "step": 1140 - }, - { - "epoch": 0.11683429848623388, - "grad_norm": 0.7578125, - "learning_rate": 4.858077767409149e-05, - "loss": 0.2759, - "step": 1150 - }, - { - "epoch": 0.1178502489078533, - "grad_norm": 3.640625, - "learning_rate": 4.855388105032174e-05, - "loss": 0.2482, - "step": 1160 - }, - { - "epoch": 0.11886619932947272, - "grad_norm": 3.5, - "learning_rate": 4.852673953351249e-05, - "loss": 0.1865, - "step": 1170 - }, - { - "epoch": 0.11988214975109214, - "grad_norm": 3.75, - "learning_rate": 4.849935340585796e-05, - "loss": 0.2659, - "step": 1180 - }, - { - "epoch": 0.12089810017271158, - "grad_norm": 3.375, - "learning_rate": 4.8471722952095586e-05, - "loss": 0.1506, - "step": 1190 - }, - { - "epoch": 0.121914050594331, - "grad_norm": 3.34375, - "learning_rate": 4.844384845950312e-05, - "loss": 0.307, - "step": 1200 - }, - { - "epoch": 0.12293000101595042, - "grad_norm": 1.578125, - "learning_rate": 4.841573021789561e-05, - "loss": 0.1952, - "step": 1210 - }, - { - "epoch": 0.12394595143756984, - "grad_norm": 1.2890625, - "learning_rate": 4.838736851962239e-05, - "loss": 0.1779, - "step": 1220 - }, - { - "epoch": 0.12496190185918928, - "grad_norm": 1.265625, - "learning_rate": 4.835876365956408e-05, - "loss": 0.1235, - "step": 1230 - }, - { - "epoch": 0.12597785228080868, - "grad_norm": 1.9609375, - "learning_rate": 4.8329915935129436e-05, - "loss": 0.1876, - "step": 1240 - }, - { - "epoch": 0.12699380270242813, - "grad_norm": 1.6328125, - "learning_rate": 4.830082564625235e-05, - "loss": 0.2188, - "step": 1250 - }, - { - "epoch": 0.12800975312404755, - "grad_norm": 3.96875, - "learning_rate": 4.8271493095388684e-05, - "loss": 0.2622, - "step": 1260 - }, - { - "epoch": 0.12902570354566698, - "grad_norm": 3.765625, - "learning_rate": 4.824191858751312e-05, - "loss": 0.2724, - "step": 1270 - }, - { - "epoch": 0.1300416539672864, - "grad_norm": 5.59375, - "learning_rate": 4.821210243011601e-05, - "loss": 0.2413, - "step": 1280 - }, - { - "epoch": 0.13105760438890582, - "grad_norm": 3.34375, - "learning_rate": 4.818204493320016e-05, - "loss": 0.2618, - "step": 1290 - }, - { - "epoch": 0.13207355481052524, - "grad_norm": 2.78125, - "learning_rate": 4.8151746409277634e-05, - "loss": 0.2295, - "step": 1300 - }, - { - "epoch": 0.13308950523214466, - "grad_norm": 3.1875, - "learning_rate": 4.8121207173366484e-05, - "loss": 0.2733, - "step": 1310 - }, - { - "epoch": 0.13410545565376408, - "grad_norm": 2.28125, - "learning_rate": 4.809042754298746e-05, - "loss": 0.2311, - "step": 1320 - }, - { - "epoch": 0.13512140607538353, - "grad_norm": 2.171875, - "learning_rate": 4.805940783816075e-05, - "loss": 0.2059, - "step": 1330 - }, - { - "epoch": 0.13613735649700295, - "grad_norm": 2.796875, - "learning_rate": 4.8028148381402625e-05, - "loss": 0.2102, - "step": 1340 - }, - { - "epoch": 0.13715330691862238, - "grad_norm": 2.96875, - "learning_rate": 4.7996649497722084e-05, - "loss": 0.2708, - "step": 1350 - }, - { - "epoch": 0.1381692573402418, - "grad_norm": 2.4375, - "learning_rate": 4.7964911514617485e-05, - "loss": 0.2429, - "step": 1360 - }, - { - "epoch": 0.13918520776186122, - "grad_norm": 5.8125, - "learning_rate": 4.793293476207312e-05, - "loss": 0.2725, - "step": 1370 - }, - { - "epoch": 0.14020115818348064, - "grad_norm": 2.40625, - "learning_rate": 4.790071957255585e-05, - "loss": 0.2098, - "step": 1380 - }, - { - "epoch": 0.14121710860510006, - "grad_norm": 4.25, - "learning_rate": 4.786826628101154e-05, - "loss": 0.2101, - "step": 1390 - }, - { - "epoch": 0.14223305902671948, - "grad_norm": 2.578125, - "learning_rate": 4.783557522486167e-05, - "loss": 0.2624, - "step": 1400 - }, - { - "epoch": 0.14324900944833893, - "grad_norm": 3.125, - "learning_rate": 4.780264674399978e-05, - "loss": 0.2518, - "step": 1410 - }, - { - "epoch": 0.14426495986995835, - "grad_norm": 3.671875, - "learning_rate": 4.7769481180787966e-05, - "loss": 0.3112, - "step": 1420 - }, - { - "epoch": 0.14528091029157778, - "grad_norm": 3.984375, - "learning_rate": 4.773607888005327e-05, - "loss": 0.2747, - "step": 1430 - }, - { - "epoch": 0.1462968607131972, - "grad_norm": 3.234375, - "learning_rate": 4.770244018908416e-05, - "loss": 0.1572, - "step": 1440 - }, - { - "epoch": 0.14731281113481662, - "grad_norm": 4.09375, - "learning_rate": 4.766856545762687e-05, - "loss": 0.2148, - "step": 1450 - }, - { - "epoch": 0.14832876155643604, - "grad_norm": 1.6875, - "learning_rate": 4.763445503788178e-05, - "loss": 0.2531, - "step": 1460 - }, - { - "epoch": 0.14934471197805546, - "grad_norm": 2.375, - "learning_rate": 4.760010928449976e-05, - "loss": 0.199, - "step": 1470 - }, - { - "epoch": 0.15036066239967488, - "grad_norm": 4.6875, - "learning_rate": 4.7565528554578485e-05, - "loss": 0.2366, - "step": 1480 - }, - { - "epoch": 0.15137661282129433, - "grad_norm": 5.4375, - "learning_rate": 4.75307132076587e-05, - "loss": 0.1862, - "step": 1490 - }, - { - "epoch": 0.15239256324291375, - "grad_norm": 2.484375, - "learning_rate": 4.749566360572049e-05, - "loss": 0.2143, - "step": 1500 - }, - { - "epoch": 0.15340851366453317, - "grad_norm": 2.1875, - "learning_rate": 4.746038011317955e-05, - "loss": 0.1877, - "step": 1510 - }, - { - "epoch": 0.1544244640861526, - "grad_norm": 2.84375, - "learning_rate": 4.742486309688333e-05, - "loss": 0.2831, - "step": 1520 - }, - { - "epoch": 0.15544041450777202, - "grad_norm": 2.015625, - "learning_rate": 4.738911292610732e-05, - "loss": 0.1708, - "step": 1530 - }, - { - "epoch": 0.15645636492939144, - "grad_norm": 3.953125, - "learning_rate": 4.735312997255107e-05, - "loss": 0.192, - "step": 1540 - }, - { - "epoch": 0.15747231535101086, - "grad_norm": 2.09375, - "learning_rate": 4.7316914610334475e-05, - "loss": 0.2586, - "step": 1550 - }, - { - "epoch": 0.15848826577263028, - "grad_norm": 3.6875, - "learning_rate": 4.728046721599378e-05, - "loss": 0.2141, - "step": 1560 - }, - { - "epoch": 0.15950421619424973, - "grad_norm": 2.9375, - "learning_rate": 4.724378816847771e-05, - "loss": 0.193, - "step": 1570 - }, - { - "epoch": 0.16052016661586915, - "grad_norm": 1.5625, - "learning_rate": 4.720687784914352e-05, - "loss": 0.191, - "step": 1580 - }, - { - "epoch": 0.16153611703748857, - "grad_norm": 3.75, - "learning_rate": 4.716973664175304e-05, - "loss": 0.2172, - "step": 1590 - }, - { - "epoch": 0.162552067459108, - "grad_norm": 3.125, - "learning_rate": 4.7132364932468645e-05, - "loss": 0.2134, - "step": 1600 - }, - { - "epoch": 0.16356801788072742, - "grad_norm": 4.09375, - "learning_rate": 4.709476310984932e-05, - "loss": 0.2055, - "step": 1610 - }, - { - "epoch": 0.16458396830234684, - "grad_norm": 3.875, - "learning_rate": 4.705693156484652e-05, - "loss": 0.2136, - "step": 1620 - }, - { - "epoch": 0.16559991872396626, - "grad_norm": 1.1796875, - "learning_rate": 4.7018870690800196e-05, - "loss": 0.1471, - "step": 1630 - }, - { - "epoch": 0.16661586914558568, - "grad_norm": 2.5, - "learning_rate": 4.698058088343465e-05, - "loss": 0.2308, - "step": 1640 - }, - { - "epoch": 0.16763181956720513, - "grad_norm": 1.390625, - "learning_rate": 4.6942062540854425e-05, - "loss": 0.2456, - "step": 1650 - }, - { - "epoch": 0.16864776998882455, - "grad_norm": 3.125, - "learning_rate": 4.69033160635402e-05, - "loss": 0.2654, - "step": 1660 - }, - { - "epoch": 0.16966372041044397, - "grad_norm": 3.984375, - "learning_rate": 4.6864341854344587e-05, - "loss": 0.2226, - "step": 1670 - }, - { - "epoch": 0.1706796708320634, - "grad_norm": 2.328125, - "learning_rate": 4.682514031848795e-05, - "loss": 0.2438, - "step": 1680 - }, - { - "epoch": 0.17169562125368282, - "grad_norm": 3.078125, - "learning_rate": 4.678571186355423e-05, - "loss": 0.1889, - "step": 1690 - }, - { - "epoch": 0.17271157167530224, - "grad_norm": 3.328125, - "learning_rate": 4.6746056899486644e-05, - "loss": 0.2117, - "step": 1700 - }, - { - "epoch": 0.17372752209692166, - "grad_norm": 2.78125, - "learning_rate": 4.67061758385835e-05, - "loss": 0.1953, - "step": 1710 - }, - { - "epoch": 0.17474347251854108, - "grad_norm": 3.09375, - "learning_rate": 4.6666069095493816e-05, - "loss": 0.1844, - "step": 1720 - }, - { - "epoch": 0.17575942294016053, - "grad_norm": 3.234375, - "learning_rate": 4.662573708721309e-05, - "loss": 0.2774, - "step": 1730 - }, - { - "epoch": 0.17677537336177995, - "grad_norm": 4.03125, - "learning_rate": 4.658518023307894e-05, - "loss": 0.2527, - "step": 1740 - }, - { - "epoch": 0.17779132378339937, - "grad_norm": 3.21875, - "learning_rate": 4.654439895476671e-05, - "loss": 0.2164, - "step": 1750 - }, - { - "epoch": 0.1788072742050188, - "grad_norm": 2.390625, - "learning_rate": 4.6503393676285146e-05, - "loss": 0.2424, - "step": 1760 - }, - { - "epoch": 0.17982322462663822, - "grad_norm": 1.8359375, - "learning_rate": 4.646216482397192e-05, - "loss": 0.2428, - "step": 1770 - }, - { - "epoch": 0.18083917504825764, - "grad_norm": 2.796875, - "learning_rate": 4.6420712826489275e-05, - "loss": 0.2155, - "step": 1780 - }, - { - "epoch": 0.18185512546987706, - "grad_norm": 0.69921875, - "learning_rate": 4.6379038114819485e-05, - "loss": 0.1544, - "step": 1790 - }, - { - "epoch": 0.18287107589149648, - "grad_norm": 3.40625, - "learning_rate": 4.6337141122260444e-05, - "loss": 0.2029, - "step": 1800 - }, - { - "epoch": 0.18388702631311593, - "grad_norm": 2.359375, - "learning_rate": 4.629502228442112e-05, - "loss": 0.1489, - "step": 1810 - }, - { - "epoch": 0.18490297673473535, - "grad_norm": 1.4453125, - "learning_rate": 4.6252682039217045e-05, - "loss": 0.2101, - "step": 1820 - }, - { - "epoch": 0.18591892715635477, - "grad_norm": 2.71875, - "learning_rate": 4.621012082686573e-05, - "loss": 0.2076, - "step": 1830 - }, - { - "epoch": 0.1869348775779742, - "grad_norm": 3.0625, - "learning_rate": 4.616733908988216e-05, - "loss": 0.2719, - "step": 1840 - }, - { - "epoch": 0.18795082799959362, - "grad_norm": 1.953125, - "learning_rate": 4.612433727307409e-05, - "loss": 0.2105, - "step": 1850 - }, - { - "epoch": 0.18896677842121304, - "grad_norm": 3.46875, - "learning_rate": 4.608111582353751e-05, - "loss": 0.1877, - "step": 1860 - }, - { - "epoch": 0.18998272884283246, - "grad_norm": 2.546875, - "learning_rate": 4.603767519065197e-05, - "loss": 0.2238, - "step": 1870 - }, - { - "epoch": 0.19099867926445188, - "grad_norm": 1.5703125, - "learning_rate": 4.599401582607589e-05, - "loss": 0.243, - "step": 1880 - }, - { - "epoch": 0.19201462968607133, - "grad_norm": 2.4375, - "learning_rate": 4.595013818374185e-05, - "loss": 0.1867, - "step": 1890 - }, - { - "epoch": 0.19303058010769075, - "grad_norm": 2.203125, - "learning_rate": 4.5906042719851925e-05, - "loss": 0.1994, - "step": 1900 - }, - { - "epoch": 0.19404653052931017, - "grad_norm": 3.984375, - "learning_rate": 4.586172989287291e-05, - "loss": 0.1899, - "step": 1910 - }, - { - "epoch": 0.1950624809509296, - "grad_norm": 2.6875, - "learning_rate": 4.5817200163531534e-05, - "loss": 0.2528, - "step": 1920 - }, - { - "epoch": 0.19607843137254902, - "grad_norm": 2.71875, - "learning_rate": 4.577245399480972e-05, - "loss": 0.2336, - "step": 1930 - }, - { - "epoch": 0.19709438179416844, - "grad_norm": 2.640625, - "learning_rate": 4.5727491851939715e-05, - "loss": 0.2204, - "step": 1940 - }, - { - "epoch": 0.19811033221578786, - "grad_norm": 1.78125, - "learning_rate": 4.568231420239929e-05, - "loss": 0.1656, - "step": 1950 - }, - { - "epoch": 0.19912628263740728, - "grad_norm": 3.15625, - "learning_rate": 4.563692151590687e-05, - "loss": 0.2105, - "step": 1960 - }, - { - "epoch": 0.20014223305902673, - "grad_norm": 1.3671875, - "learning_rate": 4.5591314264416666e-05, - "loss": 0.1464, - "step": 1970 - }, - { - "epoch": 0.20115818348064615, - "grad_norm": 4.25, - "learning_rate": 4.554549292211371e-05, - "loss": 0.2103, - "step": 1980 - }, - { - "epoch": 0.20217413390226557, - "grad_norm": 2.984375, - "learning_rate": 4.549945796540901e-05, - "loss": 0.144, - "step": 1990 - }, - { - "epoch": 0.203190084323885, - "grad_norm": 1.859375, - "learning_rate": 4.545320987293453e-05, - "loss": 0.1963, - "step": 2000 - }, - { - "epoch": 0.20420603474550442, - "grad_norm": 1.078125, - "learning_rate": 4.540674912553824e-05, - "loss": 0.2115, - "step": 2010 - }, - { - "epoch": 0.20522198516712384, - "grad_norm": 4.25, - "learning_rate": 4.536007620627911e-05, - "loss": 0.1682, - "step": 2020 - }, - { - "epoch": 0.20623793558874326, - "grad_norm": 2.71875, - "learning_rate": 4.531319160042212e-05, - "loss": 0.1992, - "step": 2030 - }, - { - "epoch": 0.20725388601036268, - "grad_norm": 1.2890625, - "learning_rate": 4.5266095795433126e-05, - "loss": 0.1134, - "step": 2040 - }, - { - "epoch": 0.20826983643198213, - "grad_norm": 3.296875, - "learning_rate": 4.5218789280973925e-05, - "loss": 0.1474, - "step": 2050 - }, - { - "epoch": 0.20928578685360155, - "grad_norm": 1.9375, - "learning_rate": 4.5171272548897024e-05, - "loss": 0.1955, - "step": 2060 - }, - { - "epoch": 0.21030173727522097, - "grad_norm": 2.734375, - "learning_rate": 4.512354609324063e-05, - "loss": 0.2042, - "step": 2070 - }, - { - "epoch": 0.2113176876968404, - "grad_norm": 2.921875, - "learning_rate": 4.507561041022347e-05, - "loss": 0.2174, - "step": 2080 - }, - { - "epoch": 0.21233363811845982, - "grad_norm": 2.40625, - "learning_rate": 4.502746599823963e-05, - "loss": 0.2634, - "step": 2090 - }, - { - "epoch": 0.21334958854007924, - "grad_norm": 1.71875, - "learning_rate": 4.497911335785339e-05, - "loss": 0.1884, - "step": 2100 - }, - { - "epoch": 0.21436553896169866, - "grad_norm": 0.79296875, - "learning_rate": 4.4930552991794e-05, - "loss": 0.1872, - "step": 2110 - }, - { - "epoch": 0.21538148938331808, - "grad_norm": 3.171875, - "learning_rate": 4.4881785404950474e-05, - "loss": 0.2233, - "step": 2120 - }, - { - "epoch": 0.21639743980493753, - "grad_norm": 2.59375, - "learning_rate": 4.483281110436631e-05, - "loss": 0.2374, - "step": 2130 - }, - { - "epoch": 0.21741339022655695, - "grad_norm": 3.328125, - "learning_rate": 4.478363059923426e-05, - "loss": 0.2545, - "step": 2140 - }, - { - "epoch": 0.21842934064817637, - "grad_norm": 2.3125, - "learning_rate": 4.4734244400891014e-05, - "loss": 0.2063, - "step": 2150 - }, - { - "epoch": 0.2194452910697958, - "grad_norm": 3.40625, - "learning_rate": 4.4684653022811865e-05, - "loss": 0.1219, - "step": 2160 - }, - { - "epoch": 0.22046124149141522, - "grad_norm": 4.1875, - "learning_rate": 4.463485698060541e-05, - "loss": 0.2805, - "step": 2170 - }, - { - "epoch": 0.22147719191303464, - "grad_norm": 2.3125, - "learning_rate": 4.458485679200814e-05, - "loss": 0.1998, - "step": 2180 - }, - { - "epoch": 0.22249314233465406, - "grad_norm": 3.578125, - "learning_rate": 4.453465297687912e-05, - "loss": 0.2489, - "step": 2190 - }, - { - "epoch": 0.22350909275627348, - "grad_norm": 2.59375, - "learning_rate": 4.448424605719452e-05, - "loss": 0.2731, - "step": 2200 - }, - { - "epoch": 0.22452504317789293, - "grad_norm": 3.28125, - "learning_rate": 4.443363655704224e-05, - "loss": 0.2425, - "step": 2210 - }, - { - "epoch": 0.22554099359951235, - "grad_norm": 2.78125, - "learning_rate": 4.438282500261641e-05, - "loss": 0.2938, - "step": 2220 - }, - { - "epoch": 0.22655694402113177, - "grad_norm": 1.1953125, - "learning_rate": 4.433181192221197e-05, - "loss": 0.1728, - "step": 2230 - }, - { - "epoch": 0.2275728944427512, - "grad_norm": 1.34375, - "learning_rate": 4.4280597846219155e-05, - "loss": 0.216, - "step": 2240 - }, - { - "epoch": 0.22858884486437062, - "grad_norm": 1.8515625, - "learning_rate": 4.422918330711796e-05, - "loss": 0.1612, - "step": 2250 - }, - { - "epoch": 0.22960479528599004, - "grad_norm": 1.90625, - "learning_rate": 4.417756883947263e-05, - "loss": 0.107, - "step": 2260 - }, - { - "epoch": 0.23062074570760946, - "grad_norm": 3.375, - "learning_rate": 4.412575497992611e-05, - "loss": 0.1756, - "step": 2270 - }, - { - "epoch": 0.23163669612922888, - "grad_norm": 4.375, - "learning_rate": 4.407374226719445e-05, - "loss": 0.234, - "step": 2280 - }, - { - "epoch": 0.23265264655084833, - "grad_norm": 3.25, - "learning_rate": 4.402153124206119e-05, - "loss": 0.2144, - "step": 2290 - }, - { - "epoch": 0.23366859697246775, - "grad_norm": 1.703125, - "learning_rate": 4.396912244737173e-05, - "loss": 0.1696, - "step": 2300 - }, - { - "epoch": 0.23468454739408717, - "grad_norm": 2.84375, - "learning_rate": 4.391651642802778e-05, - "loss": 0.2506, - "step": 2310 - }, - { - "epoch": 0.2357004978157066, - "grad_norm": 4.5, - "learning_rate": 4.386371373098155e-05, - "loss": 0.1686, - "step": 2320 - }, - { - "epoch": 0.23671644823732602, - "grad_norm": 2.515625, - "learning_rate": 4.381071490523018e-05, - "loss": 0.2403, - "step": 2330 - }, - { - "epoch": 0.23773239865894544, - "grad_norm": 4.4375, - "learning_rate": 4.3757520501809955e-05, - "loss": 0.1611, - "step": 2340 - }, - { - "epoch": 0.23874834908056486, - "grad_norm": 1.609375, - "learning_rate": 4.370413107379065e-05, - "loss": 0.1698, - "step": 2350 - }, - { - "epoch": 0.23976429950218428, - "grad_norm": 4.96875, - "learning_rate": 4.36505471762697e-05, - "loss": 0.1928, - "step": 2360 - }, - { - "epoch": 0.24078024992380373, - "grad_norm": 0.8984375, - "learning_rate": 4.3596769366366474e-05, - "loss": 0.2035, - "step": 2370 - }, - { - "epoch": 0.24179620034542315, - "grad_norm": 5.75, - "learning_rate": 4.354279820321649e-05, - "loss": 0.16, - "step": 2380 - }, - { - "epoch": 0.24281215076704257, - "grad_norm": 1.9453125, - "learning_rate": 4.34886342479656e-05, - "loss": 0.1851, - "step": 2390 - }, - { - "epoch": 0.243828101188662, - "grad_norm": 1.015625, - "learning_rate": 4.34342780637641e-05, - "loss": 0.1726, - "step": 2400 - }, - { - "epoch": 0.24484405161028142, - "grad_norm": 4.59375, - "learning_rate": 4.337973021576095e-05, - "loss": 0.2847, - "step": 2410 - }, - { - "epoch": 0.24586000203190084, - "grad_norm": 1.03125, - "learning_rate": 4.3324991271097846e-05, - "loss": 0.2528, - "step": 2420 - }, - { - "epoch": 0.24687595245352026, - "grad_norm": 2.1875, - "learning_rate": 4.3270061798903374e-05, - "loss": 0.1573, - "step": 2430 - }, - { - "epoch": 0.24789190287513968, - "grad_norm": 0.98046875, - "learning_rate": 4.321494237028701e-05, - "loss": 0.1703, - "step": 2440 - }, - { - "epoch": 0.24890785329675913, - "grad_norm": 3.8125, - "learning_rate": 4.31596335583333e-05, - "loss": 0.2613, - "step": 2450 - }, - { - "epoch": 0.24992380371837855, - "grad_norm": 4.0625, - "learning_rate": 4.310413593809579e-05, - "loss": 0.22, - "step": 2460 - }, - { - "epoch": 0.250939754139998, - "grad_norm": 3.15625, - "learning_rate": 4.304845008659108e-05, - "loss": 0.1263, - "step": 2470 - }, - { - "epoch": 0.25195570456161737, - "grad_norm": 3.046875, - "learning_rate": 4.2992576582792895e-05, - "loss": 0.1639, - "step": 2480 - }, - { - "epoch": 0.2529716549832368, - "grad_norm": 9.8125, - "learning_rate": 4.293651600762595e-05, - "loss": 0.2681, - "step": 2490 - }, - { - "epoch": 0.25398760540485626, - "grad_norm": 3.734375, - "learning_rate": 4.288026894395999e-05, - "loss": 0.2292, - "step": 2500 - }, - { - "epoch": 0.25500355582647566, - "grad_norm": 0.455078125, - "learning_rate": 4.2823835976603723e-05, - "loss": 0.2324, - "step": 2510 - }, - { - "epoch": 0.2560195062480951, - "grad_norm": 5.625, - "learning_rate": 4.276721769229869e-05, - "loss": 0.1834, - "step": 2520 - }, - { - "epoch": 0.2570354566697145, - "grad_norm": 1.3671875, - "learning_rate": 4.271041467971323e-05, - "loss": 0.1826, - "step": 2530 - }, - { - "epoch": 0.25805140709133395, - "grad_norm": 5.0625, - "learning_rate": 4.265342752943632e-05, - "loss": 0.2463, - "step": 2540 - }, - { - "epoch": 0.25906735751295334, - "grad_norm": 2.859375, - "learning_rate": 4.2596256833971425e-05, - "loss": 0.2598, - "step": 2550 - }, - { - "epoch": 0.2600833079345728, - "grad_norm": 1.8515625, - "learning_rate": 4.2538903187730374e-05, - "loss": 0.1148, - "step": 2560 - }, - { - "epoch": 0.26109925835619224, - "grad_norm": 2.71875, - "learning_rate": 4.248136718702716e-05, - "loss": 0.2123, - "step": 2570 - }, - { - "epoch": 0.26211520877781164, - "grad_norm": 4.5625, - "learning_rate": 4.242364943007172e-05, - "loss": 0.2369, - "step": 2580 - }, - { - "epoch": 0.2631311591994311, - "grad_norm": 2.296875, - "learning_rate": 4.236575051696377e-05, - "loss": 0.261, - "step": 2590 - }, - { - "epoch": 0.2641471096210505, - "grad_norm": 2.75, - "learning_rate": 4.2307671049686514e-05, - "loss": 0.1564, - "step": 2600 - }, - { - "epoch": 0.26516306004266993, - "grad_norm": 3.5, - "learning_rate": 4.2249411632100396e-05, - "loss": 0.1563, - "step": 2610 - }, - { - "epoch": 0.2661790104642893, - "grad_norm": 2.84375, - "learning_rate": 4.219097286993684e-05, - "loss": 0.1697, - "step": 2620 - }, - { - "epoch": 0.26719496088590877, - "grad_norm": 2.125, - "learning_rate": 4.2132355370791946e-05, - "loss": 0.1844, - "step": 2630 - }, - { - "epoch": 0.26821091130752817, - "grad_norm": 4.03125, - "learning_rate": 4.2073559744120156e-05, - "loss": 0.2144, - "step": 2640 - }, - { - "epoch": 0.2692268617291476, - "grad_norm": 2.375, - "learning_rate": 4.201458660122793e-05, - "loss": 0.2013, - "step": 2650 - }, - { - "epoch": 0.27024281215076706, - "grad_norm": 3.625, - "learning_rate": 4.1955436555267393e-05, - "loss": 0.2166, - "step": 2660 - }, - { - "epoch": 0.27125876257238646, - "grad_norm": 0.328125, - "learning_rate": 4.189611022122997e-05, - "loss": 0.1934, - "step": 2670 - }, - { - "epoch": 0.2722747129940059, - "grad_norm": 2.75, - "learning_rate": 4.1836608215939944e-05, - "loss": 0.2157, - "step": 2680 - }, - { - "epoch": 0.2732906634156253, - "grad_norm": 3.5, - "learning_rate": 4.17769311580481e-05, - "loss": 0.18, - "step": 2690 - }, - { - "epoch": 0.27430661383724475, - "grad_norm": 2.109375, - "learning_rate": 4.171707966802528e-05, - "loss": 0.2178, - "step": 2700 - }, - { - "epoch": 0.27532256425886414, - "grad_norm": 4.65625, - "learning_rate": 4.16570543681559e-05, - "loss": 0.1896, - "step": 2710 - }, - { - "epoch": 0.2763385146804836, - "grad_norm": 4.8125, - "learning_rate": 4.159685588253151e-05, - "loss": 0.1322, - "step": 2720 - }, - { - "epoch": 0.27735446510210304, - "grad_norm": 3.9375, - "learning_rate": 4.153648483704429e-05, - "loss": 0.184, - "step": 2730 - }, - { - "epoch": 0.27837041552372244, - "grad_norm": 4.53125, - "learning_rate": 4.147594185938057e-05, - "loss": 0.2451, - "step": 2740 - }, - { - "epoch": 0.2793863659453419, - "grad_norm": 1.0390625, - "learning_rate": 4.141522757901426e-05, - "loss": 0.2367, - "step": 2750 - }, - { - "epoch": 0.2804023163669613, - "grad_norm": 3.375, - "learning_rate": 4.1354342627200345e-05, - "loss": 0.179, - "step": 2760 - }, - { - "epoch": 0.28141826678858073, - "grad_norm": 2.953125, - "learning_rate": 4.1293287636968286e-05, - "loss": 0.1396, - "step": 2770 - }, - { - "epoch": 0.2824342172102001, - "grad_norm": 2.546875, - "learning_rate": 4.1232063243115485e-05, - "loss": 0.1963, - "step": 2780 - }, - { - "epoch": 0.28345016763181957, - "grad_norm": 5.09375, - "learning_rate": 4.117067008220063e-05, - "loss": 0.2457, - "step": 2790 - }, - { - "epoch": 0.28446611805343897, - "grad_norm": 2.046875, - "learning_rate": 4.110910879253712e-05, - "loss": 0.2262, - "step": 2800 - }, - { - "epoch": 0.2854820684750584, - "grad_norm": 2.1875, - "learning_rate": 4.104738001418641e-05, - "loss": 0.2499, - "step": 2810 - }, - { - "epoch": 0.28649801889667786, - "grad_norm": 2.59375, - "learning_rate": 4.098548438895135e-05, - "loss": 0.1667, - "step": 2820 - }, - { - "epoch": 0.28751396931829726, - "grad_norm": 2.875, - "learning_rate": 4.092342256036954e-05, - "loss": 0.2288, - "step": 2830 - }, - { - "epoch": 0.2885299197399167, - "grad_norm": 3.015625, - "learning_rate": 4.086119517370659e-05, - "loss": 0.2038, - "step": 2840 - }, - { - "epoch": 0.2895458701615361, - "grad_norm": 3.53125, - "learning_rate": 4.0798802875949485e-05, - "loss": 0.181, - "step": 2850 - }, - { - "epoch": 0.29056182058315555, - "grad_norm": 2.296875, - "learning_rate": 4.073624631579975e-05, - "loss": 0.1886, - "step": 2860 - }, - { - "epoch": 0.29157777100477494, - "grad_norm": 3.609375, - "learning_rate": 4.067352614366685e-05, - "loss": 0.2053, - "step": 2870 - }, - { - "epoch": 0.2925937214263944, - "grad_norm": 2.328125, - "learning_rate": 4.061064301166128e-05, - "loss": 0.1409, - "step": 2880 - }, - { - "epoch": 0.29360967184801384, - "grad_norm": 4.9375, - "learning_rate": 4.054759757358787e-05, - "loss": 0.184, - "step": 2890 - }, - { - "epoch": 0.29462562226963324, - "grad_norm": 4.6875, - "learning_rate": 4.048439048493898e-05, - "loss": 0.2306, - "step": 2900 - }, - { - "epoch": 0.2956415726912527, - "grad_norm": 4.09375, - "learning_rate": 4.0421022402887676e-05, - "loss": 0.1914, - "step": 2910 - }, - { - "epoch": 0.2966575231128721, - "grad_norm": 2.3125, - "learning_rate": 4.035749398628088e-05, - "loss": 0.1653, - "step": 2920 - }, - { - "epoch": 0.29767347353449153, - "grad_norm": 2.515625, - "learning_rate": 4.029380589563256e-05, - "loss": 0.1941, - "step": 2930 - }, - { - "epoch": 0.2986894239561109, - "grad_norm": 1.78125, - "learning_rate": 4.02299587931168e-05, - "loss": 0.1117, - "step": 2940 - }, - { - "epoch": 0.29970537437773037, - "grad_norm": 0.8359375, - "learning_rate": 4.0165953342560974e-05, - "loss": 0.1605, - "step": 2950 - }, - { - "epoch": 0.30072132479934977, - "grad_norm": 3.046875, - "learning_rate": 4.010179020943884e-05, - "loss": 0.1726, - "step": 2960 - }, - { - "epoch": 0.3017372752209692, - "grad_norm": 3.453125, - "learning_rate": 4.003747006086357e-05, - "loss": 0.2208, - "step": 2970 - }, - { - "epoch": 0.30275322564258866, - "grad_norm": 2.515625, - "learning_rate": 3.9972993565580866e-05, - "loss": 0.1325, - "step": 2980 - }, - { - "epoch": 0.30376917606420806, - "grad_norm": 3.046875, - "learning_rate": 3.9908361393962e-05, - "loss": 0.2014, - "step": 2990 - }, - { - "epoch": 0.3047851264858275, - "grad_norm": 2.28125, - "learning_rate": 3.984357421799681e-05, - "loss": 0.165, - "step": 3000 - }, - { - "epoch": 0.3058010769074469, - "grad_norm": 5.09375, - "learning_rate": 3.9778632711286756e-05, - "loss": 0.212, - "step": 3010 - }, - { - "epoch": 0.30681702732906635, - "grad_norm": 4.25, - "learning_rate": 3.971353754903788e-05, - "loss": 0.2388, - "step": 3020 - }, - { - "epoch": 0.30783297775068574, - "grad_norm": 2.34375, - "learning_rate": 3.964828940805381e-05, - "loss": 0.2175, - "step": 3030 - }, - { - "epoch": 0.3088489281723052, - "grad_norm": 4.09375, - "learning_rate": 3.95828889667287e-05, - "loss": 0.2088, - "step": 3040 - }, - { - "epoch": 0.30986487859392464, - "grad_norm": 2.359375, - "learning_rate": 3.9517336905040244e-05, - "loss": 0.1913, - "step": 3050 - }, - { - "epoch": 0.31088082901554404, - "grad_norm": 1.1640625, - "learning_rate": 3.9451633904542483e-05, - "loss": 0.2185, - "step": 3060 - }, - { - "epoch": 0.3118967794371635, - "grad_norm": 2.59375, - "learning_rate": 3.9385780648358846e-05, - "loss": 0.2072, - "step": 3070 - }, - { - "epoch": 0.3129127298587829, - "grad_norm": 3.015625, - "learning_rate": 3.9319777821174955e-05, - "loss": 0.1902, - "step": 3080 - }, - { - "epoch": 0.31392868028040233, - "grad_norm": 2.375, - "learning_rate": 3.925362610923158e-05, - "loss": 0.259, - "step": 3090 - }, - { - "epoch": 0.3149446307020217, - "grad_norm": 4.65625, - "learning_rate": 3.918732620031742e-05, - "loss": 0.2026, - "step": 3100 - }, - { - "epoch": 0.31596058112364117, - "grad_norm": 2.1875, - "learning_rate": 3.912087878376205e-05, - "loss": 0.1478, - "step": 3110 - }, - { - "epoch": 0.31697653154526056, - "grad_norm": 2.34375, - "learning_rate": 3.905428455042865e-05, - "loss": 0.167, - "step": 3120 - }, - { - "epoch": 0.31799248196688, - "grad_norm": 2.390625, - "learning_rate": 3.898754419270693e-05, - "loss": 0.1629, - "step": 3130 - }, - { - "epoch": 0.31900843238849946, - "grad_norm": 1.546875, - "learning_rate": 3.892065840450583e-05, - "loss": 0.1308, - "step": 3140 - }, - { - "epoch": 0.32002438281011886, - "grad_norm": 4.625, - "learning_rate": 3.885362788124637e-05, - "loss": 0.2008, - "step": 3150 - }, - { - "epoch": 0.3210403332317383, - "grad_norm": 3.8125, - "learning_rate": 3.8786453319854396e-05, - "loss": 0.2225, - "step": 3160 - }, - { - "epoch": 0.3220562836533577, - "grad_norm": 3.015625, - "learning_rate": 3.8719135418753366e-05, - "loss": 0.2243, - "step": 3170 - }, - { - "epoch": 0.32307223407497715, - "grad_norm": 5.6875, - "learning_rate": 3.865167487785702e-05, - "loss": 0.1981, - "step": 3180 - }, - { - "epoch": 0.32408818449659654, - "grad_norm": 4.84375, - "learning_rate": 3.8584072398562164e-05, - "loss": 0.2031, - "step": 3190 - }, - { - "epoch": 0.325104134918216, - "grad_norm": 4.0625, - "learning_rate": 3.851632868374136e-05, - "loss": 0.1621, - "step": 3200 - }, - { - "epoch": 0.32612008533983544, - "grad_norm": 3.421875, - "learning_rate": 3.844844443773562e-05, - "loss": 0.1674, - "step": 3210 - }, - { - "epoch": 0.32713603576145484, - "grad_norm": 1.3671875, - "learning_rate": 3.8380420366347046e-05, - "loss": 0.1502, - "step": 3220 - }, - { - "epoch": 0.3281519861830743, - "grad_norm": 3.734375, - "learning_rate": 3.831225717683157e-05, - "loss": 0.1868, - "step": 3230 - }, - { - "epoch": 0.3291679366046937, - "grad_norm": 2.703125, - "learning_rate": 3.8243955577891534e-05, - "loss": 0.1818, - "step": 3240 - }, - { - "epoch": 0.3301838870263131, - "grad_norm": 3.796875, - "learning_rate": 3.8175516279668335e-05, - "loss": 0.2215, - "step": 3250 - }, - { - "epoch": 0.3311998374479325, - "grad_norm": 3.203125, - "learning_rate": 3.810693999373505e-05, - "loss": 0.2544, - "step": 3260 - }, - { - "epoch": 0.33221578786955197, - "grad_norm": 4.0, - "learning_rate": 3.8038227433089056e-05, - "loss": 0.1175, - "step": 3270 - }, - { - "epoch": 0.33323173829117136, - "grad_norm": 3.625, - "learning_rate": 3.796937931214458e-05, - "loss": 0.2213, - "step": 3280 - }, - { - "epoch": 0.3342476887127908, - "grad_norm": 1.7265625, - "learning_rate": 3.7900396346725296e-05, - "loss": 0.1711, - "step": 3290 - }, - { - "epoch": 0.33526363913441026, - "grad_norm": 3.140625, - "learning_rate": 3.783127925405686e-05, - "loss": 0.2628, - "step": 3300 - }, - { - "epoch": 0.33627958955602966, - "grad_norm": 2.1875, - "learning_rate": 3.77620287527595e-05, - "loss": 0.1671, - "step": 3310 - }, - { - "epoch": 0.3372955399776491, - "grad_norm": 5.28125, - "learning_rate": 3.769264556284048e-05, - "loss": 0.2109, - "step": 3320 - }, - { - "epoch": 0.3383114903992685, - "grad_norm": 2.875, - "learning_rate": 3.762313040568665e-05, - "loss": 0.1978, - "step": 3330 - }, - { - "epoch": 0.33932744082088795, - "grad_norm": 2.234375, - "learning_rate": 3.755348400405697e-05, - "loss": 0.1275, - "step": 3340 - }, - { - "epoch": 0.34034339124250734, - "grad_norm": 1.9453125, - "learning_rate": 3.7483707082074945e-05, - "loss": 0.1482, - "step": 3350 - }, - { - "epoch": 0.3413593416641268, - "grad_norm": 5.40625, - "learning_rate": 3.741380036522111e-05, - "loss": 0.1933, - "step": 3360 - }, - { - "epoch": 0.34237529208574624, - "grad_norm": 4.53125, - "learning_rate": 3.734376458032551e-05, - "loss": 0.1925, - "step": 3370 - }, - { - "epoch": 0.34339124250736563, - "grad_norm": 4.0625, - "learning_rate": 3.727360045556014e-05, - "loss": 0.2297, - "step": 3380 - }, - { - "epoch": 0.3444071929289851, - "grad_norm": 2.53125, - "learning_rate": 3.7203308720431336e-05, - "loss": 0.1704, - "step": 3390 - }, - { - "epoch": 0.3454231433506045, - "grad_norm": 1.859375, - "learning_rate": 3.7132890105772234e-05, - "loss": 0.258, - "step": 3400 - }, - { - "epoch": 0.3464390937722239, - "grad_norm": 3.90625, - "learning_rate": 3.706234534373515e-05, - "loss": 0.2376, - "step": 3410 - }, - { - "epoch": 0.3474550441938433, - "grad_norm": 1.1015625, - "learning_rate": 3.6991675167783985e-05, - "loss": 0.2403, - "step": 3420 - }, - { - "epoch": 0.34847099461546277, - "grad_norm": 1.1640625, - "learning_rate": 3.6920880312686556e-05, - "loss": 0.1642, - "step": 3430 - }, - { - "epoch": 0.34948694503708216, - "grad_norm": 2.875, - "learning_rate": 3.684996151450702e-05, - "loss": 0.1455, - "step": 3440 - }, - { - "epoch": 0.3505028954587016, - "grad_norm": 0.59765625, - "learning_rate": 3.6778919510598155e-05, - "loss": 0.2175, - "step": 3450 - }, - { - "epoch": 0.35151884588032106, - "grad_norm": 0.93359375, - "learning_rate": 3.670775503959376e-05, - "loss": 0.1858, - "step": 3460 - }, - { - "epoch": 0.35253479630194046, - "grad_norm": 4.1875, - "learning_rate": 3.6636468841400917e-05, - "loss": 0.1911, - "step": 3470 - }, - { - "epoch": 0.3535507467235599, - "grad_norm": 3.734375, - "learning_rate": 3.656506165719233e-05, - "loss": 0.2114, - "step": 3480 - }, - { - "epoch": 0.3545666971451793, - "grad_norm": 1.171875, - "learning_rate": 3.649353422939863e-05, - "loss": 0.1841, - "step": 3490 - }, - { - "epoch": 0.35558264756679875, - "grad_norm": 2.53125, - "learning_rate": 3.6421887301700615e-05, - "loss": 0.1505, - "step": 3500 - }, - { - "epoch": 0.35659859798841814, - "grad_norm": 4.9375, - "learning_rate": 3.6350121619021524e-05, - "loss": 0.2625, - "step": 3510 - }, - { - "epoch": 0.3576145484100376, - "grad_norm": 5.25, - "learning_rate": 3.627823792751936e-05, - "loss": 0.1676, - "step": 3520 - }, - { - "epoch": 0.35863049883165704, - "grad_norm": 1.09375, - "learning_rate": 3.620623697457905e-05, - "loss": 0.1963, - "step": 3530 - }, - { - "epoch": 0.35964644925327643, - "grad_norm": 4.03125, - "learning_rate": 3.613411950880468e-05, - "loss": 0.2048, - "step": 3540 - }, - { - "epoch": 0.3606623996748959, - "grad_norm": 4.40625, - "learning_rate": 3.606188628001178e-05, - "loss": 0.226, - "step": 3550 - }, - { - "epoch": 0.3616783500965153, - "grad_norm": 2.375, - "learning_rate": 3.598953803921947e-05, - "loss": 0.1884, - "step": 3560 - }, - { - "epoch": 0.3626943005181347, - "grad_norm": 3.21875, - "learning_rate": 3.591707553864266e-05, - "loss": 0.224, - "step": 3570 - }, - { - "epoch": 0.3637102509397541, - "grad_norm": 3.5625, - "learning_rate": 3.584449953168423e-05, - "loss": 0.1866, - "step": 3580 - }, - { - "epoch": 0.36472620136137357, - "grad_norm": 2.359375, - "learning_rate": 3.577181077292722e-05, - "loss": 0.1663, - "step": 3590 - }, - { - "epoch": 0.36574215178299296, - "grad_norm": 5.0, - "learning_rate": 3.569901001812696e-05, - "loss": 0.2032, - "step": 3600 - }, - { - "epoch": 0.3667581022046124, - "grad_norm": 1.953125, - "learning_rate": 3.562609802420321e-05, - "loss": 0.2395, - "step": 3610 - }, - { - "epoch": 0.36777405262623186, - "grad_norm": 3.796875, - "learning_rate": 3.555307554923229e-05, - "loss": 0.1799, - "step": 3620 - }, - { - "epoch": 0.36879000304785126, - "grad_norm": 4.4375, - "learning_rate": 3.547994335243925e-05, - "loss": 0.1771, - "step": 3630 - }, - { - "epoch": 0.3698059534694707, - "grad_norm": 1.890625, - "learning_rate": 3.540670219418989e-05, - "loss": 0.2123, - "step": 3640 - }, - { - "epoch": 0.3708219038910901, - "grad_norm": 4.03125, - "learning_rate": 3.53333528359829e-05, - "loss": 0.2159, - "step": 3650 - }, - { - "epoch": 0.37183785431270955, - "grad_norm": 3.265625, - "learning_rate": 3.525989604044198e-05, - "loss": 0.2749, - "step": 3660 - }, - { - "epoch": 0.37285380473432894, - "grad_norm": 1.4375, - "learning_rate": 3.5186332571307826e-05, - "loss": 0.1613, - "step": 3670 - }, - { - "epoch": 0.3738697551559484, - "grad_norm": 3.984375, - "learning_rate": 3.511266319343025e-05, - "loss": 0.1877, - "step": 3680 - }, - { - "epoch": 0.37488570557756784, - "grad_norm": 2.203125, - "learning_rate": 3.503888867276022e-05, - "loss": 0.2185, - "step": 3690 - }, - { - "epoch": 0.37590165599918723, - "grad_norm": 1.5078125, - "learning_rate": 3.4965009776341894e-05, - "loss": 0.2195, - "step": 3700 - }, - { - "epoch": 0.3769176064208067, - "grad_norm": 4.375, - "learning_rate": 3.489102727230461e-05, - "loss": 0.2344, - "step": 3710 - }, - { - "epoch": 0.3779335568424261, - "grad_norm": 2.984375, - "learning_rate": 3.481694192985496e-05, - "loss": 0.1863, - "step": 3720 - }, - { - "epoch": 0.3789495072640455, - "grad_norm": 1.1328125, - "learning_rate": 3.474275451926875e-05, - "loss": 0.1894, - "step": 3730 - }, - { - "epoch": 0.3799654576856649, - "grad_norm": 2.265625, - "learning_rate": 3.4668465811883e-05, - "loss": 0.2127, - "step": 3740 - }, - { - "epoch": 0.38098140810728437, - "grad_norm": 2.921875, - "learning_rate": 3.4594076580087914e-05, - "loss": 0.2125, - "step": 3750 - }, - { - "epoch": 0.38199735852890376, - "grad_norm": 2.390625, - "learning_rate": 3.451958759731889e-05, - "loss": 0.1801, - "step": 3760 - }, - { - "epoch": 0.3830133089505232, - "grad_norm": 3.046875, - "learning_rate": 3.4444999638048456e-05, - "loss": 0.1949, - "step": 3770 - }, - { - "epoch": 0.38402925937214266, - "grad_norm": 2.890625, - "learning_rate": 3.437031347777817e-05, - "loss": 0.2719, - "step": 3780 - }, - { - "epoch": 0.38504520979376206, - "grad_norm": 3.9375, - "learning_rate": 3.4295529893030634e-05, - "loss": 0.1697, - "step": 3790 - }, - { - "epoch": 0.3860611602153815, - "grad_norm": 2.0625, - "learning_rate": 3.422064966134138e-05, - "loss": 0.1557, - "step": 3800 - }, - { - "epoch": 0.3870771106370009, - "grad_norm": 2.234375, - "learning_rate": 3.4145673561250794e-05, - "loss": 0.2129, - "step": 3810 - }, - { - "epoch": 0.38809306105862035, - "grad_norm": 4.96875, - "learning_rate": 3.4070602372296e-05, - "loss": 0.2068, - "step": 3820 - }, - { - "epoch": 0.38910901148023974, - "grad_norm": 2.234375, - "learning_rate": 3.39954368750028e-05, - "loss": 0.1634, - "step": 3830 - }, - { - "epoch": 0.3901249619018592, - "grad_norm": 1.75, - "learning_rate": 3.392017785087752e-05, - "loss": 0.2299, - "step": 3840 - }, - { - "epoch": 0.39114091232347864, - "grad_norm": 3.90625, - "learning_rate": 3.38448260823989e-05, - "loss": 0.1585, - "step": 3850 - }, - { - "epoch": 0.39215686274509803, - "grad_norm": 2.8125, - "learning_rate": 3.376938235300996e-05, - "loss": 0.2382, - "step": 3860 - }, - { - "epoch": 0.3931728131667175, - "grad_norm": 5.375, - "learning_rate": 3.369384744710984e-05, - "loss": 0.1987, - "step": 3870 - }, - { - "epoch": 0.3941887635883369, - "grad_norm": 2.578125, - "learning_rate": 3.361822215004566e-05, - "loss": 0.2316, - "step": 3880 - }, - { - "epoch": 0.3952047140099563, - "grad_norm": 2.0, - "learning_rate": 3.354250724810436e-05, - "loss": 0.2019, - "step": 3890 - }, - { - "epoch": 0.3962206644315757, - "grad_norm": 2.3125, - "learning_rate": 3.34667035285045e-05, - "loss": 0.187, - "step": 3900 - }, - { - "epoch": 0.39723661485319517, - "grad_norm": 3.53125, - "learning_rate": 3.339081177938811e-05, - "loss": 0.2353, - "step": 3910 - }, - { - "epoch": 0.39825256527481456, - "grad_norm": 1.9609375, - "learning_rate": 3.331483278981244e-05, - "loss": 0.2078, - "step": 3920 - }, - { - "epoch": 0.399268515696434, - "grad_norm": 1.2109375, - "learning_rate": 3.323876734974183e-05, - "loss": 0.1761, - "step": 3930 - }, - { - "epoch": 0.40028446611805346, - "grad_norm": 4.0625, - "learning_rate": 3.316261625003943e-05, - "loss": 0.2081, - "step": 3940 - }, - { - "epoch": 0.40130041653967286, - "grad_norm": 1.953125, - "learning_rate": 3.308638028245902e-05, - "loss": 0.2087, - "step": 3950 - }, - { - "epoch": 0.4023163669612923, - "grad_norm": 2.390625, - "learning_rate": 3.301006023963676e-05, - "loss": 0.1579, - "step": 3960 - }, - { - "epoch": 0.4033323173829117, - "grad_norm": 3.53125, - "learning_rate": 3.293365691508295e-05, - "loss": 0.1904, - "step": 3970 - }, - { - "epoch": 0.40434826780453115, - "grad_norm": 3.0, - "learning_rate": 3.285717110317379e-05, - "loss": 0.1991, - "step": 3980 - }, - { - "epoch": 0.40536421822615054, - "grad_norm": 7.21875, - "learning_rate": 3.27806035991431e-05, - "loss": 0.1445, - "step": 3990 - }, - { - "epoch": 0.40638016864777, - "grad_norm": 1.0859375, - "learning_rate": 3.2703955199074075e-05, - "loss": 0.2393, - "step": 4000 - } - ], - "logging_steps": 10, - "max_steps": 9843, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "total_flos": 0.0, - "train_batch_size": 7, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoints/checkpoint-4000/training_args.bin b/checkpoints/checkpoint-4000/training_args.bin deleted file mode 100644 index 6bb0a4ec10ab29ca1942aecdfa4212d352e373f1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-4000/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 -size 4984 diff --git a/checkpoints/checkpoint-4500/adapter_config.json b/checkpoints/checkpoint-4500/adapter_config.json deleted file mode 100644 index 2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-4500/adapter_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "alpha_pattern": {}, - "auto_mapping": null, - "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": "gaussian", - "layer_replication": null, - "layers_pattern": null, - "layers_to_transform": null, - "loftq_config": {}, - "lora_alpha": 32, - "lora_dropout": 0.1, - "megatron_config": null, - "megatron_core": "megatron.core", - "modules_to_save": null, - "peft_type": "LORA", - "r": 8, - "rank_pattern": {}, - "revision": null, - "target_modules": [ - "v_proj", - "q_proj" - ], - "task_type": "CAUSAL_LM", - "use_dora": false, - "use_rslora": false -} \ No newline at end of file diff --git a/checkpoints/checkpoint-4500/adapter_model.safetensors b/checkpoints/checkpoint-4500/adapter_model.safetensors deleted file mode 100644 index 7aa3071a8b60a1911581ad4fc94db44a264cd1e1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-4500/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d7c6963c0425b0c2a06a71e6d47d4fbd98e2a133e8d06948e611eee31aa731b5 -size 5919456 diff --git a/checkpoints/checkpoint-4500/optimizer.pt b/checkpoints/checkpoint-4500/optimizer.pt deleted file mode 100644 index d0cf2fc1750b9ba6635b3270b2477e9c69023282..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-4500/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c0ca7ae9cafbccffe17ee656b749d23931f579d38d02f6c6b7b74c6a48567952 -size 11930938 diff --git a/checkpoints/checkpoint-4500/rng_state_0.pth b/checkpoints/checkpoint-4500/rng_state_0.pth deleted file mode 100644 index 79c43bfab83f7a282b4b114757cd92312afef77e..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-4500/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7db56d699f6c4a471d74081289b1dc738167e2c8cb96555bccff59ced1cca60e -size 15024 diff --git a/checkpoints/checkpoint-4500/rng_state_1.pth b/checkpoints/checkpoint-4500/rng_state_1.pth deleted file mode 100644 index 0c66d8f438b500c92afd06878b0368f9291d1969..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-4500/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fcf572c6921c759e3118791b83271fd15d5f57461b5f6071b7e35d40b08d7c33 -size 15024 diff --git a/checkpoints/checkpoint-4500/rng_state_2.pth b/checkpoints/checkpoint-4500/rng_state_2.pth deleted file mode 100644 index b19443304b49a814d70104106104e9252ab03822..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-4500/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ce499e2b0e0eea0c44c5e4fe25e271573ced5a8fefd0acff896293acd6102969 -size 15024 diff --git a/checkpoints/checkpoint-4500/rng_state_3.pth b/checkpoints/checkpoint-4500/rng_state_3.pth deleted file mode 100644 index 118e3785c401a599c8cce9ab1d24766b140cee99..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-4500/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6b4a17f4cb7c3a2fc3e2abbc1b812e3de44942fd00e096dc1860327c3e174cf1 -size 15024 diff --git a/checkpoints/checkpoint-4500/scheduler.pt b/checkpoints/checkpoint-4500/scheduler.pt deleted file mode 100644 index 3cf7e49f7f8f9c6c02c1fea57551f9b19ff36b35..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-4500/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:157e40aa454761c5046fdbfd6ead1c155f32a8d8a91eba044250c5c2d1b7f7fa -size 1064 diff --git a/checkpoints/checkpoint-4500/trainer_state.json b/checkpoints/checkpoint-4500/trainer_state.json deleted file mode 100644 index f17c9ab3c88953159e52b1f4f07808a0b09ac082..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-4500/trainer_state.json +++ /dev/null @@ -1,3171 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.45717768972874123, - "eval_steps": 500, - "global_step": 4500, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001015950421619425, - "grad_norm": 17.625, - "learning_rate": 5e-06, - "loss": 3.4264, - "step": 10 - }, - { - "epoch": 0.00203190084323885, - "grad_norm": 12.5625, - "learning_rate": 1e-05, - "loss": 3.432, - "step": 20 - }, - { - "epoch": 0.003047851264858275, - "grad_norm": 14.0625, - "learning_rate": 1.5e-05, - "loss": 3.23, - "step": 30 - }, - { - "epoch": 0.0040638016864777, - "grad_norm": 12.4375, - "learning_rate": 2e-05, - "loss": 2.9762, - "step": 40 - }, - { - "epoch": 0.005079752108097125, - "grad_norm": 10.0625, - "learning_rate": 2.5e-05, - "loss": 2.6173, - "step": 50 - }, - { - "epoch": 0.00609570252971655, - "grad_norm": 10.1875, - "learning_rate": 3e-05, - "loss": 2.2004, - "step": 60 - }, - { - "epoch": 0.007111652951335975, - "grad_norm": 7.03125, - "learning_rate": 3.5e-05, - "loss": 1.4176, - "step": 70 - }, - { - "epoch": 0.0081276033729554, - "grad_norm": 4.375, - "learning_rate": 4e-05, - "loss": 1.0122, - "step": 80 - }, - { - "epoch": 0.009143553794574825, - "grad_norm": 6.5625, - "learning_rate": 4.5e-05, - "loss": 0.9116, - "step": 90 - }, - { - "epoch": 0.01015950421619425, - "grad_norm": 5.28125, - "learning_rate": 5e-05, - "loss": 0.6832, - "step": 100 - }, - { - "epoch": 0.011175454637813675, - "grad_norm": 5.5, - "learning_rate": 4.9999870035728426e-05, - "loss": 0.7355, - "step": 110 - }, - { - "epoch": 0.0121914050594331, - "grad_norm": 5.1875, - "learning_rate": 4.9999480144264944e-05, - "loss": 0.6673, - "step": 120 - }, - { - "epoch": 0.013207355481052525, - "grad_norm": 4.5, - "learning_rate": 4.9998830329663314e-05, - "loss": 0.6792, - "step": 130 - }, - { - "epoch": 0.01422330590267195, - "grad_norm": 3.9375, - "learning_rate": 4.9997920598679756e-05, - "loss": 0.6207, - "step": 140 - }, - { - "epoch": 0.015239256324291375, - "grad_norm": 3.15625, - "learning_rate": 4.999675096077286e-05, - "loss": 0.483, - "step": 150 - }, - { - "epoch": 0.0162552067459108, - "grad_norm": 5.28125, - "learning_rate": 4.999532142810354e-05, - "loss": 0.5319, - "step": 160 - }, - { - "epoch": 0.017271157167530225, - "grad_norm": 4.59375, - "learning_rate": 4.999363201553483e-05, - "loss": 0.6052, - "step": 170 - }, - { - "epoch": 0.01828710758914965, - "grad_norm": 5.03125, - "learning_rate": 4.9991682740631794e-05, - "loss": 0.4258, - "step": 180 - }, - { - "epoch": 0.019303058010769075, - "grad_norm": 3.859375, - "learning_rate": 4.998947362366133e-05, - "loss": 0.4309, - "step": 190 - }, - { - "epoch": 0.0203190084323885, - "grad_norm": 3.328125, - "learning_rate": 4.998700468759193e-05, - "loss": 0.3957, - "step": 200 - }, - { - "epoch": 0.021334958854007924, - "grad_norm": 4.9375, - "learning_rate": 4.9984275958093475e-05, - "loss": 0.4777, - "step": 210 - }, - { - "epoch": 0.02235090927562735, - "grad_norm": 4.78125, - "learning_rate": 4.998128746353695e-05, - "loss": 0.3549, - "step": 220 - }, - { - "epoch": 0.023366859697246774, - "grad_norm": 4.0625, - "learning_rate": 4.997803923499417e-05, - "loss": 0.4447, - "step": 230 - }, - { - "epoch": 0.0243828101188662, - "grad_norm": 6.375, - "learning_rate": 4.99745313062374e-05, - "loss": 0.3808, - "step": 240 - }, - { - "epoch": 0.025398760540485624, - "grad_norm": 3.59375, - "learning_rate": 4.99707637137391e-05, - "loss": 0.3827, - "step": 250 - }, - { - "epoch": 0.02641471096210505, - "grad_norm": 3.015625, - "learning_rate": 4.996673649667145e-05, - "loss": 0.3694, - "step": 260 - }, - { - "epoch": 0.027430661383724474, - "grad_norm": 2.296875, - "learning_rate": 4.9962449696906e-05, - "loss": 0.3586, - "step": 270 - }, - { - "epoch": 0.0284466118053439, - "grad_norm": 4.125, - "learning_rate": 4.9957903359013214e-05, - "loss": 0.3832, - "step": 280 - }, - { - "epoch": 0.029462562226963324, - "grad_norm": 3.296875, - "learning_rate": 4.995309753026201e-05, - "loss": 0.328, - "step": 290 - }, - { - "epoch": 0.03047851264858275, - "grad_norm": 4.5, - "learning_rate": 4.994803226061927e-05, - "loss": 0.3667, - "step": 300 - }, - { - "epoch": 0.03149446307020217, - "grad_norm": 4.3125, - "learning_rate": 4.994270760274933e-05, - "loss": 0.3811, - "step": 310 - }, - { - "epoch": 0.0325104134918216, - "grad_norm": 3.421875, - "learning_rate": 4.99371236120134e-05, - "loss": 0.3065, - "step": 320 - }, - { - "epoch": 0.03352636391344102, - "grad_norm": 4.6875, - "learning_rate": 4.993128034646902e-05, - "loss": 0.4177, - "step": 330 - }, - { - "epoch": 0.03454231433506045, - "grad_norm": 3.046875, - "learning_rate": 4.992517786686947e-05, - "loss": 0.33, - "step": 340 - }, - { - "epoch": 0.03555826475667987, - "grad_norm": 1.8828125, - "learning_rate": 4.9918816236663077e-05, - "loss": 0.3287, - "step": 350 - }, - { - "epoch": 0.0365742151782993, - "grad_norm": 3.8125, - "learning_rate": 4.991219552199262e-05, - "loss": 0.2934, - "step": 360 - }, - { - "epoch": 0.03759016559991872, - "grad_norm": 4.28125, - "learning_rate": 4.99053157916946e-05, - "loss": 0.3176, - "step": 370 - }, - { - "epoch": 0.03860611602153815, - "grad_norm": 2.609375, - "learning_rate": 4.989817711729856e-05, - "loss": 0.3318, - "step": 380 - }, - { - "epoch": 0.03962206644315757, - "grad_norm": 2.375, - "learning_rate": 4.98907795730263e-05, - "loss": 0.3234, - "step": 390 - }, - { - "epoch": 0.040638016864777, - "grad_norm": 4.46875, - "learning_rate": 4.988312323579114e-05, - "loss": 0.267, - "step": 400 - }, - { - "epoch": 0.04165396728639642, - "grad_norm": 3.75, - "learning_rate": 4.98752081851971e-05, - "loss": 0.3081, - "step": 410 - }, - { - "epoch": 0.04266991770801585, - "grad_norm": 2.203125, - "learning_rate": 4.986703450353809e-05, - "loss": 0.2917, - "step": 420 - }, - { - "epoch": 0.04368586812963527, - "grad_norm": 1.6015625, - "learning_rate": 4.985860227579703e-05, - "loss": 0.2805, - "step": 430 - }, - { - "epoch": 0.0447018185512547, - "grad_norm": 3.140625, - "learning_rate": 4.984991158964499e-05, - "loss": 0.3534, - "step": 440 - }, - { - "epoch": 0.04571776897287412, - "grad_norm": 3.296875, - "learning_rate": 4.9840962535440265e-05, - "loss": 0.335, - "step": 450 - }, - { - "epoch": 0.04673371939449355, - "grad_norm": 3.25, - "learning_rate": 4.983175520622744e-05, - "loss": 0.2544, - "step": 460 - }, - { - "epoch": 0.04774966981611297, - "grad_norm": 2.25, - "learning_rate": 4.982228969773642e-05, - "loss": 0.3449, - "step": 470 - }, - { - "epoch": 0.0487656202377324, - "grad_norm": 4.9375, - "learning_rate": 4.9812566108381435e-05, - "loss": 0.2964, - "step": 480 - }, - { - "epoch": 0.04978157065935182, - "grad_norm": 1.5703125, - "learning_rate": 4.9802584539260035e-05, - "loss": 0.2799, - "step": 490 - }, - { - "epoch": 0.05079752108097125, - "grad_norm": 2.828125, - "learning_rate": 4.979234509415199e-05, - "loss": 0.3231, - "step": 500 - }, - { - "epoch": 0.05181347150259067, - "grad_norm": 2.9375, - "learning_rate": 4.978184787951828e-05, - "loss": 0.2943, - "step": 510 - }, - { - "epoch": 0.0528294219242101, - "grad_norm": 2.34375, - "learning_rate": 4.977109300449992e-05, - "loss": 0.2705, - "step": 520 - }, - { - "epoch": 0.05384537234582952, - "grad_norm": 3.140625, - "learning_rate": 4.9760080580916876e-05, - "loss": 0.2998, - "step": 530 - }, - { - "epoch": 0.05486132276744895, - "grad_norm": 3.5625, - "learning_rate": 4.974881072326688e-05, - "loss": 0.2595, - "step": 540 - }, - { - "epoch": 0.05587727318906837, - "grad_norm": 4.25, - "learning_rate": 4.9737283548724236e-05, - "loss": 0.2803, - "step": 550 - }, - { - "epoch": 0.0568932236106878, - "grad_norm": 4.0625, - "learning_rate": 4.97254991771386e-05, - "loss": 0.3511, - "step": 560 - }, - { - "epoch": 0.05790917403230722, - "grad_norm": 2.515625, - "learning_rate": 4.971345773103377e-05, - "loss": 0.312, - "step": 570 - }, - { - "epoch": 0.05892512445392665, - "grad_norm": 3.21875, - "learning_rate": 4.9701159335606365e-05, - "loss": 0.2482, - "step": 580 - }, - { - "epoch": 0.05994107487554607, - "grad_norm": 5.5, - "learning_rate": 4.968860411872454e-05, - "loss": 0.2537, - "step": 590 - }, - { - "epoch": 0.0609570252971655, - "grad_norm": 3.546875, - "learning_rate": 4.967579221092666e-05, - "loss": 0.3125, - "step": 600 - }, - { - "epoch": 0.06197297571878492, - "grad_norm": 2.984375, - "learning_rate": 4.966272374541996e-05, - "loss": 0.2354, - "step": 610 - }, - { - "epoch": 0.06298892614040434, - "grad_norm": 3.6875, - "learning_rate": 4.964939885807912e-05, - "loss": 0.3213, - "step": 620 - }, - { - "epoch": 0.06400487656202378, - "grad_norm": 2.140625, - "learning_rate": 4.9635817687444876e-05, - "loss": 0.3003, - "step": 630 - }, - { - "epoch": 0.0650208269836432, - "grad_norm": 3.484375, - "learning_rate": 4.962198037472259e-05, - "loss": 0.2996, - "step": 640 - }, - { - "epoch": 0.06603677740526262, - "grad_norm": 3.21875, - "learning_rate": 4.9607887063780776e-05, - "loss": 0.2257, - "step": 650 - }, - { - "epoch": 0.06705272782688204, - "grad_norm": 5.375, - "learning_rate": 4.9593537901149564e-05, - "loss": 0.223, - "step": 660 - }, - { - "epoch": 0.06806867824850148, - "grad_norm": 4.1875, - "learning_rate": 4.957893303601924e-05, - "loss": 0.3407, - "step": 670 - }, - { - "epoch": 0.0690846286701209, - "grad_norm": 3.328125, - "learning_rate": 4.956407262023866e-05, - "loss": 0.2589, - "step": 680 - }, - { - "epoch": 0.07010057909174032, - "grad_norm": 2.953125, - "learning_rate": 4.954895680831367e-05, - "loss": 0.2949, - "step": 690 - }, - { - "epoch": 0.07111652951335974, - "grad_norm": 4.0625, - "learning_rate": 4.9533585757405506e-05, - "loss": 0.2995, - "step": 700 - }, - { - "epoch": 0.07213247993497918, - "grad_norm": 4.625, - "learning_rate": 4.951795962732917e-05, - "loss": 0.2894, - "step": 710 - }, - { - "epoch": 0.0731484303565986, - "grad_norm": 3.0, - "learning_rate": 4.9502078580551755e-05, - "loss": 0.3082, - "step": 720 - }, - { - "epoch": 0.07416438077821802, - "grad_norm": 3.65625, - "learning_rate": 4.9485942782190734e-05, - "loss": 0.2308, - "step": 730 - }, - { - "epoch": 0.07518033119983744, - "grad_norm": 4.78125, - "learning_rate": 4.9469552400012306e-05, - "loss": 0.2272, - "step": 740 - }, - { - "epoch": 0.07619628162145688, - "grad_norm": 4.25, - "learning_rate": 4.94529076044296e-05, - "loss": 0.2701, - "step": 750 - }, - { - "epoch": 0.0772122320430763, - "grad_norm": 3.140625, - "learning_rate": 4.94360085685009e-05, - "loss": 0.2686, - "step": 760 - }, - { - "epoch": 0.07822818246469572, - "grad_norm": 0.765625, - "learning_rate": 4.9418855467927894e-05, - "loss": 0.2051, - "step": 770 - }, - { - "epoch": 0.07924413288631514, - "grad_norm": 1.796875, - "learning_rate": 4.940144848105379e-05, - "loss": 0.2267, - "step": 780 - }, - { - "epoch": 0.08026008330793458, - "grad_norm": 4.5625, - "learning_rate": 4.93837877888615e-05, - "loss": 0.2597, - "step": 790 - }, - { - "epoch": 0.081276033729554, - "grad_norm": 3.03125, - "learning_rate": 4.9365873574971745e-05, - "loss": 0.3701, - "step": 800 - }, - { - "epoch": 0.08229198415117342, - "grad_norm": 4.5625, - "learning_rate": 4.9347706025641136e-05, - "loss": 0.2559, - "step": 810 - }, - { - "epoch": 0.08330793457279284, - "grad_norm": 3.90625, - "learning_rate": 4.9329285329760275e-05, - "loss": 0.2799, - "step": 820 - }, - { - "epoch": 0.08432388499441228, - "grad_norm": 3.140625, - "learning_rate": 4.9310611678851735e-05, - "loss": 0.2866, - "step": 830 - }, - { - "epoch": 0.0853398354160317, - "grad_norm": 2.46875, - "learning_rate": 4.929168526706811e-05, - "loss": 0.3105, - "step": 840 - }, - { - "epoch": 0.08635578583765112, - "grad_norm": 13.625, - "learning_rate": 4.927250629119e-05, - "loss": 0.2454, - "step": 850 - }, - { - "epoch": 0.08737173625927054, - "grad_norm": 3.921875, - "learning_rate": 4.9253074950623925e-05, - "loss": 0.2424, - "step": 860 - }, - { - "epoch": 0.08838768668088998, - "grad_norm": 2.90625, - "learning_rate": 4.9233391447400286e-05, - "loss": 0.2481, - "step": 870 - }, - { - "epoch": 0.0894036371025094, - "grad_norm": 2.96875, - "learning_rate": 4.921345598617125e-05, - "loss": 0.2231, - "step": 880 - }, - { - "epoch": 0.09041958752412882, - "grad_norm": 5.375, - "learning_rate": 4.9193268774208654e-05, - "loss": 0.3447, - "step": 890 - }, - { - "epoch": 0.09143553794574824, - "grad_norm": 2.0, - "learning_rate": 4.9172830021401785e-05, - "loss": 0.229, - "step": 900 - }, - { - "epoch": 0.09245148836736768, - "grad_norm": 3.1875, - "learning_rate": 4.9152139940255245e-05, - "loss": 0.2122, - "step": 910 - }, - { - "epoch": 0.0934674387889871, - "grad_norm": 3.40625, - "learning_rate": 4.913119874588677e-05, - "loss": 0.2386, - "step": 920 - }, - { - "epoch": 0.09448338921060652, - "grad_norm": 1.4609375, - "learning_rate": 4.911000665602489e-05, - "loss": 0.1944, - "step": 930 - }, - { - "epoch": 0.09549933963222594, - "grad_norm": 5.0625, - "learning_rate": 4.9088563891006786e-05, - "loss": 0.2038, - "step": 940 - }, - { - "epoch": 0.09651529005384538, - "grad_norm": 4.53125, - "learning_rate": 4.906687067377592e-05, - "loss": 0.3122, - "step": 950 - }, - { - "epoch": 0.0975312404754648, - "grad_norm": 2.84375, - "learning_rate": 4.904492722987976e-05, - "loss": 0.3157, - "step": 960 - }, - { - "epoch": 0.09854719089708422, - "grad_norm": 2.171875, - "learning_rate": 4.902273378746738e-05, - "loss": 0.3077, - "step": 970 - }, - { - "epoch": 0.09956314131870364, - "grad_norm": 2.84375, - "learning_rate": 4.9000290577287165e-05, - "loss": 0.2756, - "step": 980 - }, - { - "epoch": 0.10057909174032308, - "grad_norm": 0.99609375, - "learning_rate": 4.897759783268434e-05, - "loss": 0.2915, - "step": 990 - }, - { - "epoch": 0.1015950421619425, - "grad_norm": 3.53125, - "learning_rate": 4.895465578959859e-05, - "loss": 0.2052, - "step": 1000 - }, - { - "epoch": 0.10261099258356192, - "grad_norm": 4.0, - "learning_rate": 4.893146468656159e-05, - "loss": 0.2499, - "step": 1010 - }, - { - "epoch": 0.10362694300518134, - "grad_norm": 1.65625, - "learning_rate": 4.890802476469452e-05, - "loss": 0.278, - "step": 1020 - }, - { - "epoch": 0.10464289342680078, - "grad_norm": 3.625, - "learning_rate": 4.888433626770558e-05, - "loss": 0.2143, - "step": 1030 - }, - { - "epoch": 0.1056588438484202, - "grad_norm": 5.0625, - "learning_rate": 4.886039944188741e-05, - "loss": 0.2878, - "step": 1040 - }, - { - "epoch": 0.10667479427003962, - "grad_norm": 4.5, - "learning_rate": 4.883621453611461e-05, - "loss": 0.2744, - "step": 1050 - }, - { - "epoch": 0.10769074469165904, - "grad_norm": 4.5625, - "learning_rate": 4.881178180184106e-05, - "loss": 0.2734, - "step": 1060 - }, - { - "epoch": 0.10870669511327848, - "grad_norm": 3.125, - "learning_rate": 4.878710149309735e-05, - "loss": 0.3574, - "step": 1070 - }, - { - "epoch": 0.1097226455348979, - "grad_norm": 3.0625, - "learning_rate": 4.876217386648816e-05, - "loss": 0.2625, - "step": 1080 - }, - { - "epoch": 0.11073859595651732, - "grad_norm": 4.0625, - "learning_rate": 4.873699918118955e-05, - "loss": 0.2437, - "step": 1090 - }, - { - "epoch": 0.11175454637813674, - "grad_norm": 1.59375, - "learning_rate": 4.87115776989463e-05, - "loss": 0.2051, - "step": 1100 - }, - { - "epoch": 0.11277049679975618, - "grad_norm": 4.375, - "learning_rate": 4.8685909684069153e-05, - "loss": 0.1727, - "step": 1110 - }, - { - "epoch": 0.1137864472213756, - "grad_norm": 2.28125, - "learning_rate": 4.865999540343211e-05, - "loss": 0.2256, - "step": 1120 - }, - { - "epoch": 0.11480239764299502, - "grad_norm": 2.265625, - "learning_rate": 4.86338351264696e-05, - "loss": 0.3529, - "step": 1130 - }, - { - "epoch": 0.11581834806461444, - "grad_norm": 2.34375, - "learning_rate": 4.8607429125173754e-05, - "loss": 0.2113, - "step": 1140 - }, - { - "epoch": 0.11683429848623388, - "grad_norm": 0.7578125, - "learning_rate": 4.858077767409149e-05, - "loss": 0.2759, - "step": 1150 - }, - { - "epoch": 0.1178502489078533, - "grad_norm": 3.640625, - "learning_rate": 4.855388105032174e-05, - "loss": 0.2482, - "step": 1160 - }, - { - "epoch": 0.11886619932947272, - "grad_norm": 3.5, - "learning_rate": 4.852673953351249e-05, - "loss": 0.1865, - "step": 1170 - }, - { - "epoch": 0.11988214975109214, - "grad_norm": 3.75, - "learning_rate": 4.849935340585796e-05, - "loss": 0.2659, - "step": 1180 - }, - { - "epoch": 0.12089810017271158, - "grad_norm": 3.375, - "learning_rate": 4.8471722952095586e-05, - "loss": 0.1506, - "step": 1190 - }, - { - "epoch": 0.121914050594331, - "grad_norm": 3.34375, - "learning_rate": 4.844384845950312e-05, - "loss": 0.307, - "step": 1200 - }, - { - "epoch": 0.12293000101595042, - "grad_norm": 1.578125, - "learning_rate": 4.841573021789561e-05, - "loss": 0.1952, - "step": 1210 - }, - { - "epoch": 0.12394595143756984, - "grad_norm": 1.2890625, - "learning_rate": 4.838736851962239e-05, - "loss": 0.1779, - "step": 1220 - }, - { - "epoch": 0.12496190185918928, - "grad_norm": 1.265625, - "learning_rate": 4.835876365956408e-05, - "loss": 0.1235, - "step": 1230 - }, - { - "epoch": 0.12597785228080868, - "grad_norm": 1.9609375, - "learning_rate": 4.8329915935129436e-05, - "loss": 0.1876, - "step": 1240 - }, - { - "epoch": 0.12699380270242813, - "grad_norm": 1.6328125, - "learning_rate": 4.830082564625235e-05, - "loss": 0.2188, - "step": 1250 - }, - { - "epoch": 0.12800975312404755, - "grad_norm": 3.96875, - "learning_rate": 4.8271493095388684e-05, - "loss": 0.2622, - "step": 1260 - }, - { - "epoch": 0.12902570354566698, - "grad_norm": 3.765625, - "learning_rate": 4.824191858751312e-05, - "loss": 0.2724, - "step": 1270 - }, - { - "epoch": 0.1300416539672864, - "grad_norm": 5.59375, - "learning_rate": 4.821210243011601e-05, - "loss": 0.2413, - "step": 1280 - }, - { - "epoch": 0.13105760438890582, - "grad_norm": 3.34375, - "learning_rate": 4.818204493320016e-05, - "loss": 0.2618, - "step": 1290 - }, - { - "epoch": 0.13207355481052524, - "grad_norm": 2.78125, - "learning_rate": 4.8151746409277634e-05, - "loss": 0.2295, - "step": 1300 - }, - { - "epoch": 0.13308950523214466, - "grad_norm": 3.1875, - "learning_rate": 4.8121207173366484e-05, - "loss": 0.2733, - "step": 1310 - }, - { - "epoch": 0.13410545565376408, - "grad_norm": 2.28125, - "learning_rate": 4.809042754298746e-05, - "loss": 0.2311, - "step": 1320 - }, - { - "epoch": 0.13512140607538353, - "grad_norm": 2.171875, - "learning_rate": 4.805940783816075e-05, - "loss": 0.2059, - "step": 1330 - }, - { - "epoch": 0.13613735649700295, - "grad_norm": 2.796875, - "learning_rate": 4.8028148381402625e-05, - "loss": 0.2102, - "step": 1340 - }, - { - "epoch": 0.13715330691862238, - "grad_norm": 2.96875, - "learning_rate": 4.7996649497722084e-05, - "loss": 0.2708, - "step": 1350 - }, - { - "epoch": 0.1381692573402418, - "grad_norm": 2.4375, - "learning_rate": 4.7964911514617485e-05, - "loss": 0.2429, - "step": 1360 - }, - { - "epoch": 0.13918520776186122, - "grad_norm": 5.8125, - "learning_rate": 4.793293476207312e-05, - "loss": 0.2725, - "step": 1370 - }, - { - "epoch": 0.14020115818348064, - "grad_norm": 2.40625, - "learning_rate": 4.790071957255585e-05, - "loss": 0.2098, - "step": 1380 - }, - { - "epoch": 0.14121710860510006, - "grad_norm": 4.25, - "learning_rate": 4.786826628101154e-05, - "loss": 0.2101, - "step": 1390 - }, - { - "epoch": 0.14223305902671948, - "grad_norm": 2.578125, - "learning_rate": 4.783557522486167e-05, - "loss": 0.2624, - "step": 1400 - }, - { - "epoch": 0.14324900944833893, - "grad_norm": 3.125, - "learning_rate": 4.780264674399978e-05, - "loss": 0.2518, - "step": 1410 - }, - { - "epoch": 0.14426495986995835, - "grad_norm": 3.671875, - "learning_rate": 4.7769481180787966e-05, - "loss": 0.3112, - "step": 1420 - }, - { - "epoch": 0.14528091029157778, - "grad_norm": 3.984375, - "learning_rate": 4.773607888005327e-05, - "loss": 0.2747, - "step": 1430 - }, - { - "epoch": 0.1462968607131972, - "grad_norm": 3.234375, - "learning_rate": 4.770244018908416e-05, - "loss": 0.1572, - "step": 1440 - }, - { - "epoch": 0.14731281113481662, - "grad_norm": 4.09375, - "learning_rate": 4.766856545762687e-05, - "loss": 0.2148, - "step": 1450 - }, - { - "epoch": 0.14832876155643604, - "grad_norm": 1.6875, - "learning_rate": 4.763445503788178e-05, - "loss": 0.2531, - "step": 1460 - }, - { - "epoch": 0.14934471197805546, - "grad_norm": 2.375, - "learning_rate": 4.760010928449976e-05, - "loss": 0.199, - "step": 1470 - }, - { - "epoch": 0.15036066239967488, - "grad_norm": 4.6875, - "learning_rate": 4.7565528554578485e-05, - "loss": 0.2366, - "step": 1480 - }, - { - "epoch": 0.15137661282129433, - "grad_norm": 5.4375, - "learning_rate": 4.75307132076587e-05, - "loss": 0.1862, - "step": 1490 - }, - { - "epoch": 0.15239256324291375, - "grad_norm": 2.484375, - "learning_rate": 4.749566360572049e-05, - "loss": 0.2143, - "step": 1500 - }, - { - "epoch": 0.15340851366453317, - "grad_norm": 2.1875, - "learning_rate": 4.746038011317955e-05, - "loss": 0.1877, - "step": 1510 - }, - { - "epoch": 0.1544244640861526, - "grad_norm": 2.84375, - "learning_rate": 4.742486309688333e-05, - "loss": 0.2831, - "step": 1520 - }, - { - "epoch": 0.15544041450777202, - "grad_norm": 2.015625, - "learning_rate": 4.738911292610732e-05, - "loss": 0.1708, - "step": 1530 - }, - { - "epoch": 0.15645636492939144, - "grad_norm": 3.953125, - "learning_rate": 4.735312997255107e-05, - "loss": 0.192, - "step": 1540 - }, - { - "epoch": 0.15747231535101086, - "grad_norm": 2.09375, - "learning_rate": 4.7316914610334475e-05, - "loss": 0.2586, - "step": 1550 - }, - { - "epoch": 0.15848826577263028, - "grad_norm": 3.6875, - "learning_rate": 4.728046721599378e-05, - "loss": 0.2141, - "step": 1560 - }, - { - "epoch": 0.15950421619424973, - "grad_norm": 2.9375, - "learning_rate": 4.724378816847771e-05, - "loss": 0.193, - "step": 1570 - }, - { - "epoch": 0.16052016661586915, - "grad_norm": 1.5625, - "learning_rate": 4.720687784914352e-05, - "loss": 0.191, - "step": 1580 - }, - { - "epoch": 0.16153611703748857, - "grad_norm": 3.75, - "learning_rate": 4.716973664175304e-05, - "loss": 0.2172, - "step": 1590 - }, - { - "epoch": 0.162552067459108, - "grad_norm": 3.125, - "learning_rate": 4.7132364932468645e-05, - "loss": 0.2134, - "step": 1600 - }, - { - "epoch": 0.16356801788072742, - "grad_norm": 4.09375, - "learning_rate": 4.709476310984932e-05, - "loss": 0.2055, - "step": 1610 - }, - { - "epoch": 0.16458396830234684, - "grad_norm": 3.875, - "learning_rate": 4.705693156484652e-05, - "loss": 0.2136, - "step": 1620 - }, - { - "epoch": 0.16559991872396626, - "grad_norm": 1.1796875, - "learning_rate": 4.7018870690800196e-05, - "loss": 0.1471, - "step": 1630 - }, - { - "epoch": 0.16661586914558568, - "grad_norm": 2.5, - "learning_rate": 4.698058088343465e-05, - "loss": 0.2308, - "step": 1640 - }, - { - "epoch": 0.16763181956720513, - "grad_norm": 1.390625, - "learning_rate": 4.6942062540854425e-05, - "loss": 0.2456, - "step": 1650 - }, - { - "epoch": 0.16864776998882455, - "grad_norm": 3.125, - "learning_rate": 4.69033160635402e-05, - "loss": 0.2654, - "step": 1660 - }, - { - "epoch": 0.16966372041044397, - "grad_norm": 3.984375, - "learning_rate": 4.6864341854344587e-05, - "loss": 0.2226, - "step": 1670 - }, - { - "epoch": 0.1706796708320634, - "grad_norm": 2.328125, - "learning_rate": 4.682514031848795e-05, - "loss": 0.2438, - "step": 1680 - }, - { - "epoch": 0.17169562125368282, - "grad_norm": 3.078125, - "learning_rate": 4.678571186355423e-05, - "loss": 0.1889, - "step": 1690 - }, - { - "epoch": 0.17271157167530224, - "grad_norm": 3.328125, - "learning_rate": 4.6746056899486644e-05, - "loss": 0.2117, - "step": 1700 - }, - { - "epoch": 0.17372752209692166, - "grad_norm": 2.78125, - "learning_rate": 4.67061758385835e-05, - "loss": 0.1953, - "step": 1710 - }, - { - "epoch": 0.17474347251854108, - "grad_norm": 3.09375, - "learning_rate": 4.6666069095493816e-05, - "loss": 0.1844, - "step": 1720 - }, - { - "epoch": 0.17575942294016053, - "grad_norm": 3.234375, - "learning_rate": 4.662573708721309e-05, - "loss": 0.2774, - "step": 1730 - }, - { - "epoch": 0.17677537336177995, - "grad_norm": 4.03125, - "learning_rate": 4.658518023307894e-05, - "loss": 0.2527, - "step": 1740 - }, - { - "epoch": 0.17779132378339937, - "grad_norm": 3.21875, - "learning_rate": 4.654439895476671e-05, - "loss": 0.2164, - "step": 1750 - }, - { - "epoch": 0.1788072742050188, - "grad_norm": 2.390625, - "learning_rate": 4.6503393676285146e-05, - "loss": 0.2424, - "step": 1760 - }, - { - "epoch": 0.17982322462663822, - "grad_norm": 1.8359375, - "learning_rate": 4.646216482397192e-05, - "loss": 0.2428, - "step": 1770 - }, - { - "epoch": 0.18083917504825764, - "grad_norm": 2.796875, - "learning_rate": 4.6420712826489275e-05, - "loss": 0.2155, - "step": 1780 - }, - { - "epoch": 0.18185512546987706, - "grad_norm": 0.69921875, - "learning_rate": 4.6379038114819485e-05, - "loss": 0.1544, - "step": 1790 - }, - { - "epoch": 0.18287107589149648, - "grad_norm": 3.40625, - "learning_rate": 4.6337141122260444e-05, - "loss": 0.2029, - "step": 1800 - }, - { - "epoch": 0.18388702631311593, - "grad_norm": 2.359375, - "learning_rate": 4.629502228442112e-05, - "loss": 0.1489, - "step": 1810 - }, - { - "epoch": 0.18490297673473535, - "grad_norm": 1.4453125, - "learning_rate": 4.6252682039217045e-05, - "loss": 0.2101, - "step": 1820 - }, - { - "epoch": 0.18591892715635477, - "grad_norm": 2.71875, - "learning_rate": 4.621012082686573e-05, - "loss": 0.2076, - "step": 1830 - }, - { - "epoch": 0.1869348775779742, - "grad_norm": 3.0625, - "learning_rate": 4.616733908988216e-05, - "loss": 0.2719, - "step": 1840 - }, - { - "epoch": 0.18795082799959362, - "grad_norm": 1.953125, - "learning_rate": 4.612433727307409e-05, - "loss": 0.2105, - "step": 1850 - }, - { - "epoch": 0.18896677842121304, - "grad_norm": 3.46875, - "learning_rate": 4.608111582353751e-05, - "loss": 0.1877, - "step": 1860 - }, - { - "epoch": 0.18998272884283246, - "grad_norm": 2.546875, - "learning_rate": 4.603767519065197e-05, - "loss": 0.2238, - "step": 1870 - }, - { - "epoch": 0.19099867926445188, - "grad_norm": 1.5703125, - "learning_rate": 4.599401582607589e-05, - "loss": 0.243, - "step": 1880 - }, - { - "epoch": 0.19201462968607133, - "grad_norm": 2.4375, - "learning_rate": 4.595013818374185e-05, - "loss": 0.1867, - "step": 1890 - }, - { - "epoch": 0.19303058010769075, - "grad_norm": 2.203125, - "learning_rate": 4.5906042719851925e-05, - "loss": 0.1994, - "step": 1900 - }, - { - "epoch": 0.19404653052931017, - "grad_norm": 3.984375, - "learning_rate": 4.586172989287291e-05, - "loss": 0.1899, - "step": 1910 - }, - { - "epoch": 0.1950624809509296, - "grad_norm": 2.6875, - "learning_rate": 4.5817200163531534e-05, - "loss": 0.2528, - "step": 1920 - }, - { - "epoch": 0.19607843137254902, - "grad_norm": 2.71875, - "learning_rate": 4.577245399480972e-05, - "loss": 0.2336, - "step": 1930 - }, - { - "epoch": 0.19709438179416844, - "grad_norm": 2.640625, - "learning_rate": 4.5727491851939715e-05, - "loss": 0.2204, - "step": 1940 - }, - { - "epoch": 0.19811033221578786, - "grad_norm": 1.78125, - "learning_rate": 4.568231420239929e-05, - "loss": 0.1656, - "step": 1950 - }, - { - "epoch": 0.19912628263740728, - "grad_norm": 3.15625, - "learning_rate": 4.563692151590687e-05, - "loss": 0.2105, - "step": 1960 - }, - { - "epoch": 0.20014223305902673, - "grad_norm": 1.3671875, - "learning_rate": 4.5591314264416666e-05, - "loss": 0.1464, - "step": 1970 - }, - { - "epoch": 0.20115818348064615, - "grad_norm": 4.25, - "learning_rate": 4.554549292211371e-05, - "loss": 0.2103, - "step": 1980 - }, - { - "epoch": 0.20217413390226557, - "grad_norm": 2.984375, - "learning_rate": 4.549945796540901e-05, - "loss": 0.144, - "step": 1990 - }, - { - "epoch": 0.203190084323885, - "grad_norm": 1.859375, - "learning_rate": 4.545320987293453e-05, - "loss": 0.1963, - "step": 2000 - }, - { - "epoch": 0.20420603474550442, - "grad_norm": 1.078125, - "learning_rate": 4.540674912553824e-05, - "loss": 0.2115, - "step": 2010 - }, - { - "epoch": 0.20522198516712384, - "grad_norm": 4.25, - "learning_rate": 4.536007620627911e-05, - "loss": 0.1682, - "step": 2020 - }, - { - "epoch": 0.20623793558874326, - "grad_norm": 2.71875, - "learning_rate": 4.531319160042212e-05, - "loss": 0.1992, - "step": 2030 - }, - { - "epoch": 0.20725388601036268, - "grad_norm": 1.2890625, - "learning_rate": 4.5266095795433126e-05, - "loss": 0.1134, - "step": 2040 - }, - { - "epoch": 0.20826983643198213, - "grad_norm": 3.296875, - "learning_rate": 4.5218789280973925e-05, - "loss": 0.1474, - "step": 2050 - }, - { - "epoch": 0.20928578685360155, - "grad_norm": 1.9375, - "learning_rate": 4.5171272548897024e-05, - "loss": 0.1955, - "step": 2060 - }, - { - "epoch": 0.21030173727522097, - "grad_norm": 2.734375, - "learning_rate": 4.512354609324063e-05, - "loss": 0.2042, - "step": 2070 - }, - { - "epoch": 0.2113176876968404, - "grad_norm": 2.921875, - "learning_rate": 4.507561041022347e-05, - "loss": 0.2174, - "step": 2080 - }, - { - "epoch": 0.21233363811845982, - "grad_norm": 2.40625, - "learning_rate": 4.502746599823963e-05, - "loss": 0.2634, - "step": 2090 - }, - { - "epoch": 0.21334958854007924, - "grad_norm": 1.71875, - "learning_rate": 4.497911335785339e-05, - "loss": 0.1884, - "step": 2100 - }, - { - "epoch": 0.21436553896169866, - "grad_norm": 0.79296875, - "learning_rate": 4.4930552991794e-05, - "loss": 0.1872, - "step": 2110 - }, - { - "epoch": 0.21538148938331808, - "grad_norm": 3.171875, - "learning_rate": 4.4881785404950474e-05, - "loss": 0.2233, - "step": 2120 - }, - { - "epoch": 0.21639743980493753, - "grad_norm": 2.59375, - "learning_rate": 4.483281110436631e-05, - "loss": 0.2374, - "step": 2130 - }, - { - "epoch": 0.21741339022655695, - "grad_norm": 3.328125, - "learning_rate": 4.478363059923426e-05, - "loss": 0.2545, - "step": 2140 - }, - { - "epoch": 0.21842934064817637, - "grad_norm": 2.3125, - "learning_rate": 4.4734244400891014e-05, - "loss": 0.2063, - "step": 2150 - }, - { - "epoch": 0.2194452910697958, - "grad_norm": 3.40625, - "learning_rate": 4.4684653022811865e-05, - "loss": 0.1219, - "step": 2160 - }, - { - "epoch": 0.22046124149141522, - "grad_norm": 4.1875, - "learning_rate": 4.463485698060541e-05, - "loss": 0.2805, - "step": 2170 - }, - { - "epoch": 0.22147719191303464, - "grad_norm": 2.3125, - "learning_rate": 4.458485679200814e-05, - "loss": 0.1998, - "step": 2180 - }, - { - "epoch": 0.22249314233465406, - "grad_norm": 3.578125, - "learning_rate": 4.453465297687912e-05, - "loss": 0.2489, - "step": 2190 - }, - { - "epoch": 0.22350909275627348, - "grad_norm": 2.59375, - "learning_rate": 4.448424605719452e-05, - "loss": 0.2731, - "step": 2200 - }, - { - "epoch": 0.22452504317789293, - "grad_norm": 3.28125, - "learning_rate": 4.443363655704224e-05, - "loss": 0.2425, - "step": 2210 - }, - { - "epoch": 0.22554099359951235, - "grad_norm": 2.78125, - "learning_rate": 4.438282500261641e-05, - "loss": 0.2938, - "step": 2220 - }, - { - "epoch": 0.22655694402113177, - "grad_norm": 1.1953125, - "learning_rate": 4.433181192221197e-05, - "loss": 0.1728, - "step": 2230 - }, - { - "epoch": 0.2275728944427512, - "grad_norm": 1.34375, - "learning_rate": 4.4280597846219155e-05, - "loss": 0.216, - "step": 2240 - }, - { - "epoch": 0.22858884486437062, - "grad_norm": 1.8515625, - "learning_rate": 4.422918330711796e-05, - "loss": 0.1612, - "step": 2250 - }, - { - "epoch": 0.22960479528599004, - "grad_norm": 1.90625, - "learning_rate": 4.417756883947263e-05, - "loss": 0.107, - "step": 2260 - }, - { - "epoch": 0.23062074570760946, - "grad_norm": 3.375, - "learning_rate": 4.412575497992611e-05, - "loss": 0.1756, - "step": 2270 - }, - { - "epoch": 0.23163669612922888, - "grad_norm": 4.375, - "learning_rate": 4.407374226719445e-05, - "loss": 0.234, - "step": 2280 - }, - { - "epoch": 0.23265264655084833, - "grad_norm": 3.25, - "learning_rate": 4.402153124206119e-05, - "loss": 0.2144, - "step": 2290 - }, - { - "epoch": 0.23366859697246775, - "grad_norm": 1.703125, - "learning_rate": 4.396912244737173e-05, - "loss": 0.1696, - "step": 2300 - }, - { - "epoch": 0.23468454739408717, - "grad_norm": 2.84375, - "learning_rate": 4.391651642802778e-05, - "loss": 0.2506, - "step": 2310 - }, - { - "epoch": 0.2357004978157066, - "grad_norm": 4.5, - "learning_rate": 4.386371373098155e-05, - "loss": 0.1686, - "step": 2320 - }, - { - "epoch": 0.23671644823732602, - "grad_norm": 2.515625, - "learning_rate": 4.381071490523018e-05, - "loss": 0.2403, - "step": 2330 - }, - { - "epoch": 0.23773239865894544, - "grad_norm": 4.4375, - "learning_rate": 4.3757520501809955e-05, - "loss": 0.1611, - "step": 2340 - }, - { - "epoch": 0.23874834908056486, - "grad_norm": 1.609375, - "learning_rate": 4.370413107379065e-05, - "loss": 0.1698, - "step": 2350 - }, - { - "epoch": 0.23976429950218428, - "grad_norm": 4.96875, - "learning_rate": 4.36505471762697e-05, - "loss": 0.1928, - "step": 2360 - }, - { - "epoch": 0.24078024992380373, - "grad_norm": 0.8984375, - "learning_rate": 4.3596769366366474e-05, - "loss": 0.2035, - "step": 2370 - }, - { - "epoch": 0.24179620034542315, - "grad_norm": 5.75, - "learning_rate": 4.354279820321649e-05, - "loss": 0.16, - "step": 2380 - }, - { - "epoch": 0.24281215076704257, - "grad_norm": 1.9453125, - "learning_rate": 4.34886342479656e-05, - "loss": 0.1851, - "step": 2390 - }, - { - "epoch": 0.243828101188662, - "grad_norm": 1.015625, - "learning_rate": 4.34342780637641e-05, - "loss": 0.1726, - "step": 2400 - }, - { - "epoch": 0.24484405161028142, - "grad_norm": 4.59375, - "learning_rate": 4.337973021576095e-05, - "loss": 0.2847, - "step": 2410 - }, - { - "epoch": 0.24586000203190084, - "grad_norm": 1.03125, - "learning_rate": 4.3324991271097846e-05, - "loss": 0.2528, - "step": 2420 - }, - { - "epoch": 0.24687595245352026, - "grad_norm": 2.1875, - "learning_rate": 4.3270061798903374e-05, - "loss": 0.1573, - "step": 2430 - }, - { - "epoch": 0.24789190287513968, - "grad_norm": 0.98046875, - "learning_rate": 4.321494237028701e-05, - "loss": 0.1703, - "step": 2440 - }, - { - "epoch": 0.24890785329675913, - "grad_norm": 3.8125, - "learning_rate": 4.31596335583333e-05, - "loss": 0.2613, - "step": 2450 - }, - { - "epoch": 0.24992380371837855, - "grad_norm": 4.0625, - "learning_rate": 4.310413593809579e-05, - "loss": 0.22, - "step": 2460 - }, - { - "epoch": 0.250939754139998, - "grad_norm": 3.15625, - "learning_rate": 4.304845008659108e-05, - "loss": 0.1263, - "step": 2470 - }, - { - "epoch": 0.25195570456161737, - "grad_norm": 3.046875, - "learning_rate": 4.2992576582792895e-05, - "loss": 0.1639, - "step": 2480 - }, - { - "epoch": 0.2529716549832368, - "grad_norm": 9.8125, - "learning_rate": 4.293651600762595e-05, - "loss": 0.2681, - "step": 2490 - }, - { - "epoch": 0.25398760540485626, - "grad_norm": 3.734375, - "learning_rate": 4.288026894395999e-05, - "loss": 0.2292, - "step": 2500 - }, - { - "epoch": 0.25500355582647566, - "grad_norm": 0.455078125, - "learning_rate": 4.2823835976603723e-05, - "loss": 0.2324, - "step": 2510 - }, - { - "epoch": 0.2560195062480951, - "grad_norm": 5.625, - "learning_rate": 4.276721769229869e-05, - "loss": 0.1834, - "step": 2520 - }, - { - "epoch": 0.2570354566697145, - "grad_norm": 1.3671875, - "learning_rate": 4.271041467971323e-05, - "loss": 0.1826, - "step": 2530 - }, - { - "epoch": 0.25805140709133395, - "grad_norm": 5.0625, - "learning_rate": 4.265342752943632e-05, - "loss": 0.2463, - "step": 2540 - }, - { - "epoch": 0.25906735751295334, - "grad_norm": 2.859375, - "learning_rate": 4.2596256833971425e-05, - "loss": 0.2598, - "step": 2550 - }, - { - "epoch": 0.2600833079345728, - "grad_norm": 1.8515625, - "learning_rate": 4.2538903187730374e-05, - "loss": 0.1148, - "step": 2560 - }, - { - "epoch": 0.26109925835619224, - "grad_norm": 2.71875, - "learning_rate": 4.248136718702716e-05, - "loss": 0.2123, - "step": 2570 - }, - { - "epoch": 0.26211520877781164, - "grad_norm": 4.5625, - "learning_rate": 4.242364943007172e-05, - "loss": 0.2369, - "step": 2580 - }, - { - "epoch": 0.2631311591994311, - "grad_norm": 2.296875, - "learning_rate": 4.236575051696377e-05, - "loss": 0.261, - "step": 2590 - }, - { - "epoch": 0.2641471096210505, - "grad_norm": 2.75, - "learning_rate": 4.2307671049686514e-05, - "loss": 0.1564, - "step": 2600 - }, - { - "epoch": 0.26516306004266993, - "grad_norm": 3.5, - "learning_rate": 4.2249411632100396e-05, - "loss": 0.1563, - "step": 2610 - }, - { - "epoch": 0.2661790104642893, - "grad_norm": 2.84375, - "learning_rate": 4.219097286993684e-05, - "loss": 0.1697, - "step": 2620 - }, - { - "epoch": 0.26719496088590877, - "grad_norm": 2.125, - "learning_rate": 4.2132355370791946e-05, - "loss": 0.1844, - "step": 2630 - }, - { - "epoch": 0.26821091130752817, - "grad_norm": 4.03125, - "learning_rate": 4.2073559744120156e-05, - "loss": 0.2144, - "step": 2640 - }, - { - "epoch": 0.2692268617291476, - "grad_norm": 2.375, - "learning_rate": 4.201458660122793e-05, - "loss": 0.2013, - "step": 2650 - }, - { - "epoch": 0.27024281215076706, - "grad_norm": 3.625, - "learning_rate": 4.1955436555267393e-05, - "loss": 0.2166, - "step": 2660 - }, - { - "epoch": 0.27125876257238646, - "grad_norm": 0.328125, - "learning_rate": 4.189611022122997e-05, - "loss": 0.1934, - "step": 2670 - }, - { - "epoch": 0.2722747129940059, - "grad_norm": 2.75, - "learning_rate": 4.1836608215939944e-05, - "loss": 0.2157, - "step": 2680 - }, - { - "epoch": 0.2732906634156253, - "grad_norm": 3.5, - "learning_rate": 4.17769311580481e-05, - "loss": 0.18, - "step": 2690 - }, - { - "epoch": 0.27430661383724475, - "grad_norm": 2.109375, - "learning_rate": 4.171707966802528e-05, - "loss": 0.2178, - "step": 2700 - }, - { - "epoch": 0.27532256425886414, - "grad_norm": 4.65625, - "learning_rate": 4.16570543681559e-05, - "loss": 0.1896, - "step": 2710 - }, - { - "epoch": 0.2763385146804836, - "grad_norm": 4.8125, - "learning_rate": 4.159685588253151e-05, - "loss": 0.1322, - "step": 2720 - }, - { - "epoch": 0.27735446510210304, - "grad_norm": 3.9375, - "learning_rate": 4.153648483704429e-05, - "loss": 0.184, - "step": 2730 - }, - { - "epoch": 0.27837041552372244, - "grad_norm": 4.53125, - "learning_rate": 4.147594185938057e-05, - "loss": 0.2451, - "step": 2740 - }, - { - "epoch": 0.2793863659453419, - "grad_norm": 1.0390625, - "learning_rate": 4.141522757901426e-05, - "loss": 0.2367, - "step": 2750 - }, - { - "epoch": 0.2804023163669613, - "grad_norm": 3.375, - "learning_rate": 4.1354342627200345e-05, - "loss": 0.179, - "step": 2760 - }, - { - "epoch": 0.28141826678858073, - "grad_norm": 2.953125, - "learning_rate": 4.1293287636968286e-05, - "loss": 0.1396, - "step": 2770 - }, - { - "epoch": 0.2824342172102001, - "grad_norm": 2.546875, - "learning_rate": 4.1232063243115485e-05, - "loss": 0.1963, - "step": 2780 - }, - { - "epoch": 0.28345016763181957, - "grad_norm": 5.09375, - "learning_rate": 4.117067008220063e-05, - "loss": 0.2457, - "step": 2790 - }, - { - "epoch": 0.28446611805343897, - "grad_norm": 2.046875, - "learning_rate": 4.110910879253712e-05, - "loss": 0.2262, - "step": 2800 - }, - { - "epoch": 0.2854820684750584, - "grad_norm": 2.1875, - "learning_rate": 4.104738001418641e-05, - "loss": 0.2499, - "step": 2810 - }, - { - "epoch": 0.28649801889667786, - "grad_norm": 2.59375, - "learning_rate": 4.098548438895135e-05, - "loss": 0.1667, - "step": 2820 - }, - { - "epoch": 0.28751396931829726, - "grad_norm": 2.875, - "learning_rate": 4.092342256036954e-05, - "loss": 0.2288, - "step": 2830 - }, - { - "epoch": 0.2885299197399167, - "grad_norm": 3.015625, - "learning_rate": 4.086119517370659e-05, - "loss": 0.2038, - "step": 2840 - }, - { - "epoch": 0.2895458701615361, - "grad_norm": 3.53125, - "learning_rate": 4.0798802875949485e-05, - "loss": 0.181, - "step": 2850 - }, - { - "epoch": 0.29056182058315555, - "grad_norm": 2.296875, - "learning_rate": 4.073624631579975e-05, - "loss": 0.1886, - "step": 2860 - }, - { - "epoch": 0.29157777100477494, - "grad_norm": 3.609375, - "learning_rate": 4.067352614366685e-05, - "loss": 0.2053, - "step": 2870 - }, - { - "epoch": 0.2925937214263944, - "grad_norm": 2.328125, - "learning_rate": 4.061064301166128e-05, - "loss": 0.1409, - "step": 2880 - }, - { - "epoch": 0.29360967184801384, - "grad_norm": 4.9375, - "learning_rate": 4.054759757358787e-05, - "loss": 0.184, - "step": 2890 - }, - { - "epoch": 0.29462562226963324, - "grad_norm": 4.6875, - "learning_rate": 4.048439048493898e-05, - "loss": 0.2306, - "step": 2900 - }, - { - "epoch": 0.2956415726912527, - "grad_norm": 4.09375, - "learning_rate": 4.0421022402887676e-05, - "loss": 0.1914, - "step": 2910 - }, - { - "epoch": 0.2966575231128721, - "grad_norm": 2.3125, - "learning_rate": 4.035749398628088e-05, - "loss": 0.1653, - "step": 2920 - }, - { - "epoch": 0.29767347353449153, - "grad_norm": 2.515625, - "learning_rate": 4.029380589563256e-05, - "loss": 0.1941, - "step": 2930 - }, - { - "epoch": 0.2986894239561109, - "grad_norm": 1.78125, - "learning_rate": 4.02299587931168e-05, - "loss": 0.1117, - "step": 2940 - }, - { - "epoch": 0.29970537437773037, - "grad_norm": 0.8359375, - "learning_rate": 4.0165953342560974e-05, - "loss": 0.1605, - "step": 2950 - }, - { - "epoch": 0.30072132479934977, - "grad_norm": 3.046875, - "learning_rate": 4.010179020943884e-05, - "loss": 0.1726, - "step": 2960 - }, - { - "epoch": 0.3017372752209692, - "grad_norm": 3.453125, - "learning_rate": 4.003747006086357e-05, - "loss": 0.2208, - "step": 2970 - }, - { - "epoch": 0.30275322564258866, - "grad_norm": 2.515625, - "learning_rate": 3.9972993565580866e-05, - "loss": 0.1325, - "step": 2980 - }, - { - "epoch": 0.30376917606420806, - "grad_norm": 3.046875, - "learning_rate": 3.9908361393962e-05, - "loss": 0.2014, - "step": 2990 - }, - { - "epoch": 0.3047851264858275, - "grad_norm": 2.28125, - "learning_rate": 3.984357421799681e-05, - "loss": 0.165, - "step": 3000 - }, - { - "epoch": 0.3058010769074469, - "grad_norm": 5.09375, - "learning_rate": 3.9778632711286756e-05, - "loss": 0.212, - "step": 3010 - }, - { - "epoch": 0.30681702732906635, - "grad_norm": 4.25, - "learning_rate": 3.971353754903788e-05, - "loss": 0.2388, - "step": 3020 - }, - { - "epoch": 0.30783297775068574, - "grad_norm": 2.34375, - "learning_rate": 3.964828940805381e-05, - "loss": 0.2175, - "step": 3030 - }, - { - "epoch": 0.3088489281723052, - "grad_norm": 4.09375, - "learning_rate": 3.95828889667287e-05, - "loss": 0.2088, - "step": 3040 - }, - { - "epoch": 0.30986487859392464, - "grad_norm": 2.359375, - "learning_rate": 3.9517336905040244e-05, - "loss": 0.1913, - "step": 3050 - }, - { - "epoch": 0.31088082901554404, - "grad_norm": 1.1640625, - "learning_rate": 3.9451633904542483e-05, - "loss": 0.2185, - "step": 3060 - }, - { - "epoch": 0.3118967794371635, - "grad_norm": 2.59375, - "learning_rate": 3.9385780648358846e-05, - "loss": 0.2072, - "step": 3070 - }, - { - "epoch": 0.3129127298587829, - "grad_norm": 3.015625, - "learning_rate": 3.9319777821174955e-05, - "loss": 0.1902, - "step": 3080 - }, - { - "epoch": 0.31392868028040233, - "grad_norm": 2.375, - "learning_rate": 3.925362610923158e-05, - "loss": 0.259, - "step": 3090 - }, - { - "epoch": 0.3149446307020217, - "grad_norm": 4.65625, - "learning_rate": 3.918732620031742e-05, - "loss": 0.2026, - "step": 3100 - }, - { - "epoch": 0.31596058112364117, - "grad_norm": 2.1875, - "learning_rate": 3.912087878376205e-05, - "loss": 0.1478, - "step": 3110 - }, - { - "epoch": 0.31697653154526056, - "grad_norm": 2.34375, - "learning_rate": 3.905428455042865e-05, - "loss": 0.167, - "step": 3120 - }, - { - "epoch": 0.31799248196688, - "grad_norm": 2.390625, - "learning_rate": 3.898754419270693e-05, - "loss": 0.1629, - "step": 3130 - }, - { - "epoch": 0.31900843238849946, - "grad_norm": 1.546875, - "learning_rate": 3.892065840450583e-05, - "loss": 0.1308, - "step": 3140 - }, - { - "epoch": 0.32002438281011886, - "grad_norm": 4.625, - "learning_rate": 3.885362788124637e-05, - "loss": 0.2008, - "step": 3150 - }, - { - "epoch": 0.3210403332317383, - "grad_norm": 3.8125, - "learning_rate": 3.8786453319854396e-05, - "loss": 0.2225, - "step": 3160 - }, - { - "epoch": 0.3220562836533577, - "grad_norm": 3.015625, - "learning_rate": 3.8719135418753366e-05, - "loss": 0.2243, - "step": 3170 - }, - { - "epoch": 0.32307223407497715, - "grad_norm": 5.6875, - "learning_rate": 3.865167487785702e-05, - "loss": 0.1981, - "step": 3180 - }, - { - "epoch": 0.32408818449659654, - "grad_norm": 4.84375, - "learning_rate": 3.8584072398562164e-05, - "loss": 0.2031, - "step": 3190 - }, - { - "epoch": 0.325104134918216, - "grad_norm": 4.0625, - "learning_rate": 3.851632868374136e-05, - "loss": 0.1621, - "step": 3200 - }, - { - "epoch": 0.32612008533983544, - "grad_norm": 3.421875, - "learning_rate": 3.844844443773562e-05, - "loss": 0.1674, - "step": 3210 - }, - { - "epoch": 0.32713603576145484, - "grad_norm": 1.3671875, - "learning_rate": 3.8380420366347046e-05, - "loss": 0.1502, - "step": 3220 - }, - { - "epoch": 0.3281519861830743, - "grad_norm": 3.734375, - "learning_rate": 3.831225717683157e-05, - "loss": 0.1868, - "step": 3230 - }, - { - "epoch": 0.3291679366046937, - "grad_norm": 2.703125, - "learning_rate": 3.8243955577891534e-05, - "loss": 0.1818, - "step": 3240 - }, - { - "epoch": 0.3301838870263131, - "grad_norm": 3.796875, - "learning_rate": 3.8175516279668335e-05, - "loss": 0.2215, - "step": 3250 - }, - { - "epoch": 0.3311998374479325, - "grad_norm": 3.203125, - "learning_rate": 3.810693999373505e-05, - "loss": 0.2544, - "step": 3260 - }, - { - "epoch": 0.33221578786955197, - "grad_norm": 4.0, - "learning_rate": 3.8038227433089056e-05, - "loss": 0.1175, - "step": 3270 - }, - { - "epoch": 0.33323173829117136, - "grad_norm": 3.625, - "learning_rate": 3.796937931214458e-05, - "loss": 0.2213, - "step": 3280 - }, - { - "epoch": 0.3342476887127908, - "grad_norm": 1.7265625, - "learning_rate": 3.7900396346725296e-05, - "loss": 0.1711, - "step": 3290 - }, - { - "epoch": 0.33526363913441026, - "grad_norm": 3.140625, - "learning_rate": 3.783127925405686e-05, - "loss": 0.2628, - "step": 3300 - }, - { - "epoch": 0.33627958955602966, - "grad_norm": 2.1875, - "learning_rate": 3.77620287527595e-05, - "loss": 0.1671, - "step": 3310 - }, - { - "epoch": 0.3372955399776491, - "grad_norm": 5.28125, - "learning_rate": 3.769264556284048e-05, - "loss": 0.2109, - "step": 3320 - }, - { - "epoch": 0.3383114903992685, - "grad_norm": 2.875, - "learning_rate": 3.762313040568665e-05, - "loss": 0.1978, - "step": 3330 - }, - { - "epoch": 0.33932744082088795, - "grad_norm": 2.234375, - "learning_rate": 3.755348400405697e-05, - "loss": 0.1275, - "step": 3340 - }, - { - "epoch": 0.34034339124250734, - "grad_norm": 1.9453125, - "learning_rate": 3.7483707082074945e-05, - "loss": 0.1482, - "step": 3350 - }, - { - "epoch": 0.3413593416641268, - "grad_norm": 5.40625, - "learning_rate": 3.741380036522111e-05, - "loss": 0.1933, - "step": 3360 - }, - { - "epoch": 0.34237529208574624, - "grad_norm": 4.53125, - "learning_rate": 3.734376458032551e-05, - "loss": 0.1925, - "step": 3370 - }, - { - "epoch": 0.34339124250736563, - "grad_norm": 4.0625, - "learning_rate": 3.727360045556014e-05, - "loss": 0.2297, - "step": 3380 - }, - { - "epoch": 0.3444071929289851, - "grad_norm": 2.53125, - "learning_rate": 3.7203308720431336e-05, - "loss": 0.1704, - "step": 3390 - }, - { - "epoch": 0.3454231433506045, - "grad_norm": 1.859375, - "learning_rate": 3.7132890105772234e-05, - "loss": 0.258, - "step": 3400 - }, - { - "epoch": 0.3464390937722239, - "grad_norm": 3.90625, - "learning_rate": 3.706234534373515e-05, - "loss": 0.2376, - "step": 3410 - }, - { - "epoch": 0.3474550441938433, - "grad_norm": 1.1015625, - "learning_rate": 3.6991675167783985e-05, - "loss": 0.2403, - "step": 3420 - }, - { - "epoch": 0.34847099461546277, - "grad_norm": 1.1640625, - "learning_rate": 3.6920880312686556e-05, - "loss": 0.1642, - "step": 3430 - }, - { - "epoch": 0.34948694503708216, - "grad_norm": 2.875, - "learning_rate": 3.684996151450702e-05, - "loss": 0.1455, - "step": 3440 - }, - { - "epoch": 0.3505028954587016, - "grad_norm": 0.59765625, - "learning_rate": 3.6778919510598155e-05, - "loss": 0.2175, - "step": 3450 - }, - { - "epoch": 0.35151884588032106, - "grad_norm": 0.93359375, - "learning_rate": 3.670775503959376e-05, - "loss": 0.1858, - "step": 3460 - }, - { - "epoch": 0.35253479630194046, - "grad_norm": 4.1875, - "learning_rate": 3.6636468841400917e-05, - "loss": 0.1911, - "step": 3470 - }, - { - "epoch": 0.3535507467235599, - "grad_norm": 3.734375, - "learning_rate": 3.656506165719233e-05, - "loss": 0.2114, - "step": 3480 - }, - { - "epoch": 0.3545666971451793, - "grad_norm": 1.171875, - "learning_rate": 3.649353422939863e-05, - "loss": 0.1841, - "step": 3490 - }, - { - "epoch": 0.35558264756679875, - "grad_norm": 2.53125, - "learning_rate": 3.6421887301700615e-05, - "loss": 0.1505, - "step": 3500 - }, - { - "epoch": 0.35659859798841814, - "grad_norm": 4.9375, - "learning_rate": 3.6350121619021524e-05, - "loss": 0.2625, - "step": 3510 - }, - { - "epoch": 0.3576145484100376, - "grad_norm": 5.25, - "learning_rate": 3.627823792751936e-05, - "loss": 0.1676, - "step": 3520 - }, - { - "epoch": 0.35863049883165704, - "grad_norm": 1.09375, - "learning_rate": 3.620623697457905e-05, - "loss": 0.1963, - "step": 3530 - }, - { - "epoch": 0.35964644925327643, - "grad_norm": 4.03125, - "learning_rate": 3.613411950880468e-05, - "loss": 0.2048, - "step": 3540 - }, - { - "epoch": 0.3606623996748959, - "grad_norm": 4.40625, - "learning_rate": 3.606188628001178e-05, - "loss": 0.226, - "step": 3550 - }, - { - "epoch": 0.3616783500965153, - "grad_norm": 2.375, - "learning_rate": 3.598953803921947e-05, - "loss": 0.1884, - "step": 3560 - }, - { - "epoch": 0.3626943005181347, - "grad_norm": 3.21875, - "learning_rate": 3.591707553864266e-05, - "loss": 0.224, - "step": 3570 - }, - { - "epoch": 0.3637102509397541, - "grad_norm": 3.5625, - "learning_rate": 3.584449953168423e-05, - "loss": 0.1866, - "step": 3580 - }, - { - "epoch": 0.36472620136137357, - "grad_norm": 2.359375, - "learning_rate": 3.577181077292722e-05, - "loss": 0.1663, - "step": 3590 - }, - { - "epoch": 0.36574215178299296, - "grad_norm": 5.0, - "learning_rate": 3.569901001812696e-05, - "loss": 0.2032, - "step": 3600 - }, - { - "epoch": 0.3667581022046124, - "grad_norm": 1.953125, - "learning_rate": 3.562609802420321e-05, - "loss": 0.2395, - "step": 3610 - }, - { - "epoch": 0.36777405262623186, - "grad_norm": 3.796875, - "learning_rate": 3.555307554923229e-05, - "loss": 0.1799, - "step": 3620 - }, - { - "epoch": 0.36879000304785126, - "grad_norm": 4.4375, - "learning_rate": 3.547994335243925e-05, - "loss": 0.1771, - "step": 3630 - }, - { - "epoch": 0.3698059534694707, - "grad_norm": 1.890625, - "learning_rate": 3.540670219418989e-05, - "loss": 0.2123, - "step": 3640 - }, - { - "epoch": 0.3708219038910901, - "grad_norm": 4.03125, - "learning_rate": 3.53333528359829e-05, - "loss": 0.2159, - "step": 3650 - }, - { - "epoch": 0.37183785431270955, - "grad_norm": 3.265625, - "learning_rate": 3.525989604044198e-05, - "loss": 0.2749, - "step": 3660 - }, - { - "epoch": 0.37285380473432894, - "grad_norm": 1.4375, - "learning_rate": 3.5186332571307826e-05, - "loss": 0.1613, - "step": 3670 - }, - { - "epoch": 0.3738697551559484, - "grad_norm": 3.984375, - "learning_rate": 3.511266319343025e-05, - "loss": 0.1877, - "step": 3680 - }, - { - "epoch": 0.37488570557756784, - "grad_norm": 2.203125, - "learning_rate": 3.503888867276022e-05, - "loss": 0.2185, - "step": 3690 - }, - { - "epoch": 0.37590165599918723, - "grad_norm": 1.5078125, - "learning_rate": 3.4965009776341894e-05, - "loss": 0.2195, - "step": 3700 - }, - { - "epoch": 0.3769176064208067, - "grad_norm": 4.375, - "learning_rate": 3.489102727230461e-05, - "loss": 0.2344, - "step": 3710 - }, - { - "epoch": 0.3779335568424261, - "grad_norm": 2.984375, - "learning_rate": 3.481694192985496e-05, - "loss": 0.1863, - "step": 3720 - }, - { - "epoch": 0.3789495072640455, - "grad_norm": 1.1328125, - "learning_rate": 3.474275451926875e-05, - "loss": 0.1894, - "step": 3730 - }, - { - "epoch": 0.3799654576856649, - "grad_norm": 2.265625, - "learning_rate": 3.4668465811883e-05, - "loss": 0.2127, - "step": 3740 - }, - { - "epoch": 0.38098140810728437, - "grad_norm": 2.921875, - "learning_rate": 3.4594076580087914e-05, - "loss": 0.2125, - "step": 3750 - }, - { - "epoch": 0.38199735852890376, - "grad_norm": 2.390625, - "learning_rate": 3.451958759731889e-05, - "loss": 0.1801, - "step": 3760 - }, - { - "epoch": 0.3830133089505232, - "grad_norm": 3.046875, - "learning_rate": 3.4444999638048456e-05, - "loss": 0.1949, - "step": 3770 - }, - { - "epoch": 0.38402925937214266, - "grad_norm": 2.890625, - "learning_rate": 3.437031347777817e-05, - "loss": 0.2719, - "step": 3780 - }, - { - "epoch": 0.38504520979376206, - "grad_norm": 3.9375, - "learning_rate": 3.4295529893030634e-05, - "loss": 0.1697, - "step": 3790 - }, - { - "epoch": 0.3860611602153815, - "grad_norm": 2.0625, - "learning_rate": 3.422064966134138e-05, - "loss": 0.1557, - "step": 3800 - }, - { - "epoch": 0.3870771106370009, - "grad_norm": 2.234375, - "learning_rate": 3.4145673561250794e-05, - "loss": 0.2129, - "step": 3810 - }, - { - "epoch": 0.38809306105862035, - "grad_norm": 4.96875, - "learning_rate": 3.4070602372296e-05, - "loss": 0.2068, - "step": 3820 - }, - { - "epoch": 0.38910901148023974, - "grad_norm": 2.234375, - "learning_rate": 3.39954368750028e-05, - "loss": 0.1634, - "step": 3830 - }, - { - "epoch": 0.3901249619018592, - "grad_norm": 1.75, - "learning_rate": 3.392017785087752e-05, - "loss": 0.2299, - "step": 3840 - }, - { - "epoch": 0.39114091232347864, - "grad_norm": 3.90625, - "learning_rate": 3.38448260823989e-05, - "loss": 0.1585, - "step": 3850 - }, - { - "epoch": 0.39215686274509803, - "grad_norm": 2.8125, - "learning_rate": 3.376938235300996e-05, - "loss": 0.2382, - "step": 3860 - }, - { - "epoch": 0.3931728131667175, - "grad_norm": 5.375, - "learning_rate": 3.369384744710984e-05, - "loss": 0.1987, - "step": 3870 - }, - { - "epoch": 0.3941887635883369, - "grad_norm": 2.578125, - "learning_rate": 3.361822215004566e-05, - "loss": 0.2316, - "step": 3880 - }, - { - "epoch": 0.3952047140099563, - "grad_norm": 2.0, - "learning_rate": 3.354250724810436e-05, - "loss": 0.2019, - "step": 3890 - }, - { - "epoch": 0.3962206644315757, - "grad_norm": 2.3125, - "learning_rate": 3.34667035285045e-05, - "loss": 0.187, - "step": 3900 - }, - { - "epoch": 0.39723661485319517, - "grad_norm": 3.53125, - "learning_rate": 3.339081177938811e-05, - "loss": 0.2353, - "step": 3910 - }, - { - "epoch": 0.39825256527481456, - "grad_norm": 1.9609375, - "learning_rate": 3.331483278981244e-05, - "loss": 0.2078, - "step": 3920 - }, - { - "epoch": 0.399268515696434, - "grad_norm": 1.2109375, - "learning_rate": 3.323876734974183e-05, - "loss": 0.1761, - "step": 3930 - }, - { - "epoch": 0.40028446611805346, - "grad_norm": 4.0625, - "learning_rate": 3.316261625003943e-05, - "loss": 0.2081, - "step": 3940 - }, - { - "epoch": 0.40130041653967286, - "grad_norm": 1.953125, - "learning_rate": 3.308638028245902e-05, - "loss": 0.2087, - "step": 3950 - }, - { - "epoch": 0.4023163669612923, - "grad_norm": 2.390625, - "learning_rate": 3.301006023963676e-05, - "loss": 0.1579, - "step": 3960 - }, - { - "epoch": 0.4033323173829117, - "grad_norm": 3.53125, - "learning_rate": 3.293365691508295e-05, - "loss": 0.1904, - "step": 3970 - }, - { - "epoch": 0.40434826780453115, - "grad_norm": 3.0, - "learning_rate": 3.285717110317379e-05, - "loss": 0.1991, - "step": 3980 - }, - { - "epoch": 0.40536421822615054, - "grad_norm": 7.21875, - "learning_rate": 3.27806035991431e-05, - "loss": 0.1445, - "step": 3990 - }, - { - "epoch": 0.40638016864777, - "grad_norm": 1.0859375, - "learning_rate": 3.2703955199074075e-05, - "loss": 0.2393, - "step": 4000 - }, - { - "epoch": 0.40739611906938944, - "grad_norm": 4.5625, - "learning_rate": 3.262722669989098e-05, - "loss": 0.1789, - "step": 4010 - }, - { - "epoch": 0.40841206949100883, - "grad_norm": 3.09375, - "learning_rate": 3.255041889935092e-05, - "loss": 0.1511, - "step": 4020 - }, - { - "epoch": 0.4094280199126283, - "grad_norm": 1.90625, - "learning_rate": 3.247353259603547e-05, - "loss": 0.2066, - "step": 4030 - }, - { - "epoch": 0.4104439703342477, - "grad_norm": 2.28125, - "learning_rate": 3.239656858934242e-05, - "loss": 0.1564, - "step": 4040 - }, - { - "epoch": 0.4114599207558671, - "grad_norm": 2.609375, - "learning_rate": 3.231952767947746e-05, - "loss": 0.1503, - "step": 4050 - }, - { - "epoch": 0.4124758711774865, - "grad_norm": 1.4453125, - "learning_rate": 3.2242410667445844e-05, - "loss": 0.1633, - "step": 4060 - }, - { - "epoch": 0.41349182159910597, - "grad_norm": 3.015625, - "learning_rate": 3.2165218355044076e-05, - "loss": 0.1492, - "step": 4070 - }, - { - "epoch": 0.41450777202072536, - "grad_norm": 3.234375, - "learning_rate": 3.2087951544851566e-05, - "loss": 0.3051, - "step": 4080 - }, - { - "epoch": 0.4155237224423448, - "grad_norm": 2.9375, - "learning_rate": 3.20106110402223e-05, - "loss": 0.2229, - "step": 4090 - }, - { - "epoch": 0.41653967286396426, - "grad_norm": 3.171875, - "learning_rate": 3.1933197645276455e-05, - "loss": 0.2224, - "step": 4100 - }, - { - "epoch": 0.41755562328558365, - "grad_norm": 2.09375, - "learning_rate": 3.185571216489209e-05, - "loss": 0.1297, - "step": 4110 - }, - { - "epoch": 0.4185715737072031, - "grad_norm": 3.625, - "learning_rate": 3.177815540469669e-05, - "loss": 0.2074, - "step": 4120 - }, - { - "epoch": 0.4195875241288225, - "grad_norm": 2.296875, - "learning_rate": 3.1700528171058916e-05, - "loss": 0.1949, - "step": 4130 - }, - { - "epoch": 0.42060347455044195, - "grad_norm": 3.8125, - "learning_rate": 3.162283127108011e-05, - "loss": 0.1661, - "step": 4140 - }, - { - "epoch": 0.42161942497206134, - "grad_norm": 2.5, - "learning_rate": 3.154506551258594e-05, - "loss": 0.2275, - "step": 4150 - }, - { - "epoch": 0.4226353753936808, - "grad_norm": 2.96875, - "learning_rate": 3.146723170411804e-05, - "loss": 0.2242, - "step": 4160 - }, - { - "epoch": 0.42365132581530024, - "grad_norm": 6.625, - "learning_rate": 3.138933065492552e-05, - "loss": 0.1897, - "step": 4170 - }, - { - "epoch": 0.42466727623691963, - "grad_norm": 0.8515625, - "learning_rate": 3.131136317495665e-05, - "loss": 0.1629, - "step": 4180 - }, - { - "epoch": 0.4256832266585391, - "grad_norm": 0.94140625, - "learning_rate": 3.1233330074850364e-05, - "loss": 0.1535, - "step": 4190 - }, - { - "epoch": 0.4266991770801585, - "grad_norm": 2.6875, - "learning_rate": 3.115523216592786e-05, - "loss": 0.2494, - "step": 4200 - }, - { - "epoch": 0.4277151275017779, - "grad_norm": 2.578125, - "learning_rate": 3.107707026018417e-05, - "loss": 0.1705, - "step": 4210 - }, - { - "epoch": 0.4287310779233973, - "grad_norm": 3.0625, - "learning_rate": 3.09988451702797e-05, - "loss": 0.1507, - "step": 4220 - }, - { - "epoch": 0.42974702834501677, - "grad_norm": 2.421875, - "learning_rate": 3.0920557709531804e-05, - "loss": 0.3071, - "step": 4230 - }, - { - "epoch": 0.43076297876663616, - "grad_norm": 3.640625, - "learning_rate": 3.0842208691906306e-05, - "loss": 0.199, - "step": 4240 - }, - { - "epoch": 0.4317789291882556, - "grad_norm": 3.5625, - "learning_rate": 3.076379893200904e-05, - "loss": 0.1987, - "step": 4250 - }, - { - "epoch": 0.43279487960987506, - "grad_norm": 3.65625, - "learning_rate": 3.068532924507739e-05, - "loss": 0.1945, - "step": 4260 - }, - { - "epoch": 0.43381083003149445, - "grad_norm": 5.875, - "learning_rate": 3.060680044697183e-05, - "loss": 0.1937, - "step": 4270 - }, - { - "epoch": 0.4348267804531139, - "grad_norm": 2.859375, - "learning_rate": 3.052821335416739e-05, - "loss": 0.1643, - "step": 4280 - }, - { - "epoch": 0.4358427308747333, - "grad_norm": 3.296875, - "learning_rate": 3.0449568783745203e-05, - "loss": 0.1455, - "step": 4290 - }, - { - "epoch": 0.43685868129635275, - "grad_norm": 0.427734375, - "learning_rate": 3.0370867553384023e-05, - "loss": 0.1891, - "step": 4300 - }, - { - "epoch": 0.43787463171797214, - "grad_norm": 0.361328125, - "learning_rate": 3.029211048135171e-05, - "loss": 0.1377, - "step": 4310 - }, - { - "epoch": 0.4388905821395916, - "grad_norm": 1.8203125, - "learning_rate": 3.021329838649668e-05, - "loss": 0.2194, - "step": 4320 - }, - { - "epoch": 0.43990653256121104, - "grad_norm": 1.8828125, - "learning_rate": 3.0134432088239462e-05, - "loss": 0.1915, - "step": 4330 - }, - { - "epoch": 0.44092248298283043, - "grad_norm": 2.015625, - "learning_rate": 3.0055512406564146e-05, - "loss": 0.1794, - "step": 4340 - }, - { - "epoch": 0.4419384334044499, - "grad_norm": 2.546875, - "learning_rate": 2.9976540162009836e-05, - "loss": 0.2154, - "step": 4350 - }, - { - "epoch": 0.4429543838260693, - "grad_norm": 4.09375, - "learning_rate": 2.9897516175662155e-05, - "loss": 0.1861, - "step": 4360 - }, - { - "epoch": 0.4439703342476887, - "grad_norm": 3.953125, - "learning_rate": 2.9818441269144693e-05, - "loss": 0.1857, - "step": 4370 - }, - { - "epoch": 0.4449862846693081, - "grad_norm": 2.234375, - "learning_rate": 2.9739316264610452e-05, - "loss": 0.1493, - "step": 4380 - }, - { - "epoch": 0.44600223509092757, - "grad_norm": 1.109375, - "learning_rate": 2.966014198473332e-05, - "loss": 0.186, - "step": 4390 - }, - { - "epoch": 0.44701818551254696, - "grad_norm": 4.5625, - "learning_rate": 2.9580919252699502e-05, - "loss": 0.1963, - "step": 4400 - }, - { - "epoch": 0.4480341359341664, - "grad_norm": 7.3125, - "learning_rate": 2.9501648892198984e-05, - "loss": 0.2882, - "step": 4410 - }, - { - "epoch": 0.44905008635578586, - "grad_norm": 3.03125, - "learning_rate": 2.942233172741693e-05, - "loss": 0.2154, - "step": 4420 - }, - { - "epoch": 0.45006603677740525, - "grad_norm": 2.421875, - "learning_rate": 2.934296858302515e-05, - "loss": 0.2228, - "step": 4430 - }, - { - "epoch": 0.4510819871990247, - "grad_norm": 1.6015625, - "learning_rate": 2.9263560284173485e-05, - "loss": 0.1637, - "step": 4440 - }, - { - "epoch": 0.4520979376206441, - "grad_norm": 4.5, - "learning_rate": 2.91841076564813e-05, - "loss": 0.1396, - "step": 4450 - }, - { - "epoch": 0.45311388804226355, - "grad_norm": 1.9609375, - "learning_rate": 2.9104611526028808e-05, - "loss": 0.186, - "step": 4460 - }, - { - "epoch": 0.45412983846388294, - "grad_norm": 2.046875, - "learning_rate": 2.902507271934855e-05, - "loss": 0.1706, - "step": 4470 - }, - { - "epoch": 0.4551457888855024, - "grad_norm": 2.390625, - "learning_rate": 2.8945492063416768e-05, - "loss": 0.2191, - "step": 4480 - }, - { - "epoch": 0.45616173930712184, - "grad_norm": 2.734375, - "learning_rate": 2.8865870385644823e-05, - "loss": 0.1651, - "step": 4490 - }, - { - "epoch": 0.45717768972874123, - "grad_norm": 4.4375, - "learning_rate": 2.8786208513870583e-05, - "loss": 0.1907, - "step": 4500 - } - ], - "logging_steps": 10, - "max_steps": 9843, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "total_flos": 0.0, - "train_batch_size": 7, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoints/checkpoint-4500/training_args.bin b/checkpoints/checkpoint-4500/training_args.bin deleted file mode 100644 index 6bb0a4ec10ab29ca1942aecdfa4212d352e373f1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-4500/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 -size 4984 diff --git a/checkpoints/checkpoint-500/.DS_Store b/checkpoints/checkpoint-500/.DS_Store deleted file mode 100644 index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..0000000000000000000000000000000000000000 Binary files a/checkpoints/checkpoint-500/.DS_Store and /dev/null differ diff --git a/checkpoints/checkpoint-500/adapter_config.json b/checkpoints/checkpoint-500/adapter_config.json deleted file mode 100644 index 2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-500/adapter_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "alpha_pattern": {}, - "auto_mapping": null, - "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": "gaussian", - "layer_replication": null, - "layers_pattern": null, - "layers_to_transform": null, - "loftq_config": {}, - "lora_alpha": 32, - "lora_dropout": 0.1, - "megatron_config": null, - "megatron_core": "megatron.core", - "modules_to_save": null, - "peft_type": "LORA", - "r": 8, - "rank_pattern": {}, - "revision": null, - "target_modules": [ - "v_proj", - "q_proj" - ], - "task_type": "CAUSAL_LM", - "use_dora": false, - "use_rslora": false -} \ No newline at end of file diff --git a/checkpoints/checkpoint-500/adapter_model.safetensors b/checkpoints/checkpoint-500/adapter_model.safetensors deleted file mode 100644 index e9303fa912a4ef3e12e8166a7375958634ce8f36..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-500/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fe875d0568aa37f2927a8a3eb8f6951cb2a06db4bb4acf3f08994191b8fcd074 -size 5919456 diff --git a/checkpoints/checkpoint-500/optimizer.pt b/checkpoints/checkpoint-500/optimizer.pt deleted file mode 100644 index df3daed252c5700e51cc06cd0106fb5befa94f45..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-500/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dcefad058bb98bc378f909e2924345eaa383cbd31ac20a7c96a0144fbdad481b -size 11930938 diff --git a/checkpoints/checkpoint-500/rng_state_0.pth b/checkpoints/checkpoint-500/rng_state_0.pth deleted file mode 100644 index 5e831911e803d34db0dee43e81dbe00dd8ce41a1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-500/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:809bd09d083f3cc7af22a0d5fa482e3b4a80ee095c2d25606491adff7d437298 -size 15024 diff --git a/checkpoints/checkpoint-500/rng_state_1.pth b/checkpoints/checkpoint-500/rng_state_1.pth deleted file mode 100644 index 5fc931d40d8b42b24ee9524277102c98e47a7f1c..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-500/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f59c5bba33ae6a197dd330c2dad3e367df715420869ffee03f041f6bf374bd47 -size 15024 diff --git a/checkpoints/checkpoint-500/rng_state_2.pth b/checkpoints/checkpoint-500/rng_state_2.pth deleted file mode 100644 index db1a2781c331b27ce633c771e2eb5ef32c951b3b..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-500/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:55e1bb7909b9abc9e4e3f3acf48720cbfc308fbfb27de6a9043a72cfc99abc32 -size 15024 diff --git a/checkpoints/checkpoint-500/rng_state_3.pth b/checkpoints/checkpoint-500/rng_state_3.pth deleted file mode 100644 index 58211359f16a7adad0b07c2acd96237202353fe7..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-500/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4ade95ac7fe747f684693248dcde4845f243707debd6487625345278fdd7ad55 -size 15024 diff --git a/checkpoints/checkpoint-500/scheduler.pt b/checkpoints/checkpoint-500/scheduler.pt deleted file mode 100644 index f50d6e23087c0212f07d2b373bd3914a857dbcf4..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-500/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9494fbf9d2b6d20c7d21fb5ce991f87947f9899a55c5171fe969b0e4974d2103 -size 1064 diff --git a/checkpoints/checkpoint-500/trainer_state.json b/checkpoints/checkpoint-500/trainer_state.json deleted file mode 100644 index b0d2b5be8b9988aeafaf4997b3b16453e990e766..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-500/trainer_state.json +++ /dev/null @@ -1,371 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.05079752108097125, - "eval_steps": 500, - "global_step": 500, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001015950421619425, - "grad_norm": 17.625, - "learning_rate": 5e-06, - "loss": 3.4264, - "step": 10 - }, - { - "epoch": 0.00203190084323885, - "grad_norm": 12.5625, - "learning_rate": 1e-05, - "loss": 3.432, - "step": 20 - }, - { - "epoch": 0.003047851264858275, - "grad_norm": 14.0625, - "learning_rate": 1.5e-05, - "loss": 3.23, - "step": 30 - }, - { - "epoch": 0.0040638016864777, - "grad_norm": 12.4375, - "learning_rate": 2e-05, - "loss": 2.9762, - "step": 40 - }, - { - "epoch": 0.005079752108097125, - "grad_norm": 10.0625, - "learning_rate": 2.5e-05, - "loss": 2.6173, - "step": 50 - }, - { - "epoch": 0.00609570252971655, - "grad_norm": 10.1875, - "learning_rate": 3e-05, - "loss": 2.2004, - "step": 60 - }, - { - "epoch": 0.007111652951335975, - "grad_norm": 7.03125, - "learning_rate": 3.5e-05, - "loss": 1.4176, - "step": 70 - }, - { - "epoch": 0.0081276033729554, - "grad_norm": 4.375, - "learning_rate": 4e-05, - "loss": 1.0122, - "step": 80 - }, - { - "epoch": 0.009143553794574825, - "grad_norm": 6.5625, - "learning_rate": 4.5e-05, - "loss": 0.9116, - "step": 90 - }, - { - "epoch": 0.01015950421619425, - "grad_norm": 5.28125, - "learning_rate": 5e-05, - "loss": 0.6832, - "step": 100 - }, - { - "epoch": 0.011175454637813675, - "grad_norm": 5.5, - "learning_rate": 4.9999870035728426e-05, - "loss": 0.7355, - "step": 110 - }, - { - "epoch": 0.0121914050594331, - "grad_norm": 5.1875, - "learning_rate": 4.9999480144264944e-05, - "loss": 0.6673, - "step": 120 - }, - { - "epoch": 0.013207355481052525, - "grad_norm": 4.5, - "learning_rate": 4.9998830329663314e-05, - "loss": 0.6792, - "step": 130 - }, - { - "epoch": 0.01422330590267195, - "grad_norm": 3.9375, - "learning_rate": 4.9997920598679756e-05, - "loss": 0.6207, - "step": 140 - }, - { - "epoch": 0.015239256324291375, - "grad_norm": 3.15625, - "learning_rate": 4.999675096077286e-05, - "loss": 0.483, - "step": 150 - }, - { - "epoch": 0.0162552067459108, - "grad_norm": 5.28125, - "learning_rate": 4.999532142810354e-05, - "loss": 0.5319, - "step": 160 - }, - { - "epoch": 0.017271157167530225, - "grad_norm": 4.59375, - "learning_rate": 4.999363201553483e-05, - "loss": 0.6052, - "step": 170 - }, - { - "epoch": 0.01828710758914965, - "grad_norm": 5.03125, - "learning_rate": 4.9991682740631794e-05, - "loss": 0.4258, - "step": 180 - }, - { - "epoch": 0.019303058010769075, - "grad_norm": 3.859375, - "learning_rate": 4.998947362366133e-05, - "loss": 0.4309, - "step": 190 - }, - { - "epoch": 0.0203190084323885, - "grad_norm": 3.328125, - "learning_rate": 4.998700468759193e-05, - "loss": 0.3957, - "step": 200 - }, - { - "epoch": 0.021334958854007924, - "grad_norm": 4.9375, - "learning_rate": 4.9984275958093475e-05, - "loss": 0.4777, - "step": 210 - }, - { - "epoch": 0.02235090927562735, - "grad_norm": 4.78125, - "learning_rate": 4.998128746353695e-05, - "loss": 0.3549, - "step": 220 - }, - { - "epoch": 0.023366859697246774, - "grad_norm": 4.0625, - "learning_rate": 4.997803923499417e-05, - "loss": 0.4447, - "step": 230 - }, - { - "epoch": 0.0243828101188662, - "grad_norm": 6.375, - "learning_rate": 4.99745313062374e-05, - "loss": 0.3808, - "step": 240 - }, - { - "epoch": 0.025398760540485624, - "grad_norm": 3.59375, - "learning_rate": 4.99707637137391e-05, - "loss": 0.3827, - "step": 250 - }, - { - "epoch": 0.02641471096210505, - "grad_norm": 3.015625, - "learning_rate": 4.996673649667145e-05, - "loss": 0.3694, - "step": 260 - }, - { - "epoch": 0.027430661383724474, - "grad_norm": 2.296875, - "learning_rate": 4.9962449696906e-05, - "loss": 0.3586, - "step": 270 - }, - { - "epoch": 0.0284466118053439, - "grad_norm": 4.125, - "learning_rate": 4.9957903359013214e-05, - "loss": 0.3832, - "step": 280 - }, - { - "epoch": 0.029462562226963324, - "grad_norm": 3.296875, - "learning_rate": 4.995309753026201e-05, - "loss": 0.328, - "step": 290 - }, - { - "epoch": 0.03047851264858275, - "grad_norm": 4.5, - "learning_rate": 4.994803226061927e-05, - "loss": 0.3667, - "step": 300 - }, - { - "epoch": 0.03149446307020217, - "grad_norm": 4.3125, - "learning_rate": 4.994270760274933e-05, - "loss": 0.3811, - "step": 310 - }, - { - "epoch": 0.0325104134918216, - "grad_norm": 3.421875, - "learning_rate": 4.99371236120134e-05, - "loss": 0.3065, - "step": 320 - }, - { - "epoch": 0.03352636391344102, - "grad_norm": 4.6875, - "learning_rate": 4.993128034646902e-05, - "loss": 0.4177, - "step": 330 - }, - { - "epoch": 0.03454231433506045, - "grad_norm": 3.046875, - "learning_rate": 4.992517786686947e-05, - "loss": 0.33, - "step": 340 - }, - { - "epoch": 0.03555826475667987, - "grad_norm": 1.8828125, - "learning_rate": 4.9918816236663077e-05, - "loss": 0.3287, - "step": 350 - }, - { - "epoch": 0.0365742151782993, - "grad_norm": 3.8125, - "learning_rate": 4.991219552199262e-05, - "loss": 0.2934, - "step": 360 - }, - { - "epoch": 0.03759016559991872, - "grad_norm": 4.28125, - "learning_rate": 4.99053157916946e-05, - "loss": 0.3176, - "step": 370 - }, - { - "epoch": 0.03860611602153815, - "grad_norm": 2.609375, - "learning_rate": 4.989817711729856e-05, - "loss": 0.3318, - "step": 380 - }, - { - "epoch": 0.03962206644315757, - "grad_norm": 2.375, - "learning_rate": 4.98907795730263e-05, - "loss": 0.3234, - "step": 390 - }, - { - "epoch": 0.040638016864777, - "grad_norm": 4.46875, - "learning_rate": 4.988312323579114e-05, - "loss": 0.267, - "step": 400 - }, - { - "epoch": 0.04165396728639642, - "grad_norm": 3.75, - "learning_rate": 4.98752081851971e-05, - "loss": 0.3081, - "step": 410 - }, - { - "epoch": 0.04266991770801585, - "grad_norm": 2.203125, - "learning_rate": 4.986703450353809e-05, - "loss": 0.2917, - "step": 420 - }, - { - "epoch": 0.04368586812963527, - "grad_norm": 1.6015625, - "learning_rate": 4.985860227579703e-05, - "loss": 0.2805, - "step": 430 - }, - { - "epoch": 0.0447018185512547, - "grad_norm": 3.140625, - "learning_rate": 4.984991158964499e-05, - "loss": 0.3534, - "step": 440 - }, - { - "epoch": 0.04571776897287412, - "grad_norm": 3.296875, - "learning_rate": 4.9840962535440265e-05, - "loss": 0.335, - "step": 450 - }, - { - "epoch": 0.04673371939449355, - "grad_norm": 3.25, - "learning_rate": 4.983175520622744e-05, - "loss": 0.2544, - "step": 460 - }, - { - "epoch": 0.04774966981611297, - "grad_norm": 2.25, - "learning_rate": 4.982228969773642e-05, - "loss": 0.3449, - "step": 470 - }, - { - "epoch": 0.0487656202377324, - "grad_norm": 4.9375, - "learning_rate": 4.9812566108381435e-05, - "loss": 0.2964, - "step": 480 - }, - { - "epoch": 0.04978157065935182, - "grad_norm": 1.5703125, - "learning_rate": 4.9802584539260035e-05, - "loss": 0.2799, - "step": 490 - }, - { - "epoch": 0.05079752108097125, - "grad_norm": 2.828125, - "learning_rate": 4.979234509415199e-05, - "loss": 0.3231, - "step": 500 - } - ], - "logging_steps": 10, - "max_steps": 9843, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "total_flos": 0.0, - "train_batch_size": 7, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoints/checkpoint-500/training_args.bin b/checkpoints/checkpoint-500/training_args.bin deleted file mode 100644 index 6bb0a4ec10ab29ca1942aecdfa4212d352e373f1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-500/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 -size 4984 diff --git a/checkpoints/checkpoint-5000/adapter_config.json b/checkpoints/checkpoint-5000/adapter_config.json deleted file mode 100644 index 2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-5000/adapter_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "alpha_pattern": {}, - "auto_mapping": null, - "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": "gaussian", - "layer_replication": null, - "layers_pattern": null, - "layers_to_transform": null, - "loftq_config": {}, - "lora_alpha": 32, - "lora_dropout": 0.1, - "megatron_config": null, - "megatron_core": "megatron.core", - "modules_to_save": null, - "peft_type": "LORA", - "r": 8, - "rank_pattern": {}, - "revision": null, - "target_modules": [ - "v_proj", - "q_proj" - ], - "task_type": "CAUSAL_LM", - "use_dora": false, - "use_rslora": false -} \ No newline at end of file diff --git a/checkpoints/checkpoint-5000/adapter_model.safetensors b/checkpoints/checkpoint-5000/adapter_model.safetensors deleted file mode 100644 index c5323954094d8a6496c38b30ffa0df0ac0cfffe8..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-5000/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8ae5f313d7b029213719465291718ddf1e014b263490e78fc2a5169bf0b5252a -size 5919456 diff --git a/checkpoints/checkpoint-5000/optimizer.pt b/checkpoints/checkpoint-5000/optimizer.pt deleted file mode 100644 index a79946ce5dd1275af1622625dd5f1f7e9f277b2d..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-5000/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:12db3a411b1a777bb2de716e77eb87fa0eb100990039bc8de878ad51ab65b732 -size 11930938 diff --git a/checkpoints/checkpoint-5000/rng_state_0.pth b/checkpoints/checkpoint-5000/rng_state_0.pth deleted file mode 100644 index 3af645b6e41cbac80e3d515e0208c0dfd154af79..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-5000/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:67bfaa3beff54b75efd8c9a44ed5971676113e2db1f831aa1c6af1ce6a9caa0b -size 15024 diff --git a/checkpoints/checkpoint-5000/rng_state_1.pth b/checkpoints/checkpoint-5000/rng_state_1.pth deleted file mode 100644 index 597c43e1460e8c299d649484b08ee0173163f16f..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-5000/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:41ed0b09eacb9d8a8c6e72cf8f95eee4f179b7f40a1ead1975ec33f39a10b112 -size 15024 diff --git a/checkpoints/checkpoint-5000/rng_state_2.pth b/checkpoints/checkpoint-5000/rng_state_2.pth deleted file mode 100644 index c826825016b292604ccf1c156229766246fb116e..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-5000/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b8d837f21251c93900eb66777d51bff1fbc27c649b7852fa3ff8933fb4214304 -size 15024 diff --git a/checkpoints/checkpoint-5000/rng_state_3.pth b/checkpoints/checkpoint-5000/rng_state_3.pth deleted file mode 100644 index c344167e5746676b6dde2a30375e5d12f6600233..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-5000/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:efef25c82d2ac74d69a7a158e6fb9e4ce251ed5ebd2235b890bf805ca5dc093e -size 15024 diff --git a/checkpoints/checkpoint-5000/scheduler.pt b/checkpoints/checkpoint-5000/scheduler.pt deleted file mode 100644 index c0604ddc2c73d6b2fa706115b186b8876b4db0f1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-5000/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ee42deb5c1610056c95ecf4b744f4223a6749192630e5e7100203f9bf0540ef4 -size 1064 diff --git a/checkpoints/checkpoint-5000/trainer_state.json b/checkpoints/checkpoint-5000/trainer_state.json deleted file mode 100644 index 360c1e39d3d8b26d00d06e5b1ec148375669ff0d..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-5000/trainer_state.json +++ /dev/null @@ -1,3521 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.5079752108097125, - "eval_steps": 500, - "global_step": 5000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001015950421619425, - "grad_norm": 17.625, - "learning_rate": 5e-06, - "loss": 3.4264, - "step": 10 - }, - { - "epoch": 0.00203190084323885, - "grad_norm": 12.5625, - "learning_rate": 1e-05, - "loss": 3.432, - "step": 20 - }, - { - "epoch": 0.003047851264858275, - "grad_norm": 14.0625, - "learning_rate": 1.5e-05, - "loss": 3.23, - "step": 30 - }, - { - "epoch": 0.0040638016864777, - "grad_norm": 12.4375, - "learning_rate": 2e-05, - "loss": 2.9762, - "step": 40 - }, - { - "epoch": 0.005079752108097125, - "grad_norm": 10.0625, - "learning_rate": 2.5e-05, - "loss": 2.6173, - "step": 50 - }, - { - "epoch": 0.00609570252971655, - "grad_norm": 10.1875, - "learning_rate": 3e-05, - "loss": 2.2004, - "step": 60 - }, - { - "epoch": 0.007111652951335975, - "grad_norm": 7.03125, - "learning_rate": 3.5e-05, - "loss": 1.4176, - "step": 70 - }, - { - "epoch": 0.0081276033729554, - "grad_norm": 4.375, - "learning_rate": 4e-05, - "loss": 1.0122, - "step": 80 - }, - { - "epoch": 0.009143553794574825, - "grad_norm": 6.5625, - "learning_rate": 4.5e-05, - "loss": 0.9116, - "step": 90 - }, - { - "epoch": 0.01015950421619425, - "grad_norm": 5.28125, - "learning_rate": 5e-05, - "loss": 0.6832, - "step": 100 - }, - { - "epoch": 0.011175454637813675, - "grad_norm": 5.5, - "learning_rate": 4.9999870035728426e-05, - "loss": 0.7355, - "step": 110 - }, - { - "epoch": 0.0121914050594331, - "grad_norm": 5.1875, - "learning_rate": 4.9999480144264944e-05, - "loss": 0.6673, - "step": 120 - }, - { - "epoch": 0.013207355481052525, - "grad_norm": 4.5, - "learning_rate": 4.9998830329663314e-05, - "loss": 0.6792, - "step": 130 - }, - { - "epoch": 0.01422330590267195, - "grad_norm": 3.9375, - "learning_rate": 4.9997920598679756e-05, - "loss": 0.6207, - "step": 140 - }, - { - "epoch": 0.015239256324291375, - "grad_norm": 3.15625, - "learning_rate": 4.999675096077286e-05, - "loss": 0.483, - "step": 150 - }, - { - "epoch": 0.0162552067459108, - "grad_norm": 5.28125, - "learning_rate": 4.999532142810354e-05, - "loss": 0.5319, - "step": 160 - }, - { - "epoch": 0.017271157167530225, - "grad_norm": 4.59375, - "learning_rate": 4.999363201553483e-05, - "loss": 0.6052, - "step": 170 - }, - { - "epoch": 0.01828710758914965, - "grad_norm": 5.03125, - "learning_rate": 4.9991682740631794e-05, - "loss": 0.4258, - "step": 180 - }, - { - "epoch": 0.019303058010769075, - "grad_norm": 3.859375, - "learning_rate": 4.998947362366133e-05, - "loss": 0.4309, - "step": 190 - }, - { - "epoch": 0.0203190084323885, - "grad_norm": 3.328125, - "learning_rate": 4.998700468759193e-05, - "loss": 0.3957, - "step": 200 - }, - { - "epoch": 0.021334958854007924, - "grad_norm": 4.9375, - "learning_rate": 4.9984275958093475e-05, - "loss": 0.4777, - "step": 210 - }, - { - "epoch": 0.02235090927562735, - "grad_norm": 4.78125, - "learning_rate": 4.998128746353695e-05, - "loss": 0.3549, - "step": 220 - }, - { - "epoch": 0.023366859697246774, - "grad_norm": 4.0625, - "learning_rate": 4.997803923499417e-05, - "loss": 0.4447, - "step": 230 - }, - { - "epoch": 0.0243828101188662, - "grad_norm": 6.375, - "learning_rate": 4.99745313062374e-05, - "loss": 0.3808, - "step": 240 - }, - { - "epoch": 0.025398760540485624, - "grad_norm": 3.59375, - "learning_rate": 4.99707637137391e-05, - "loss": 0.3827, - "step": 250 - }, - { - "epoch": 0.02641471096210505, - "grad_norm": 3.015625, - "learning_rate": 4.996673649667145e-05, - "loss": 0.3694, - "step": 260 - }, - { - "epoch": 0.027430661383724474, - "grad_norm": 2.296875, - "learning_rate": 4.9962449696906e-05, - "loss": 0.3586, - "step": 270 - }, - { - "epoch": 0.0284466118053439, - "grad_norm": 4.125, - "learning_rate": 4.9957903359013214e-05, - "loss": 0.3832, - "step": 280 - }, - { - "epoch": 0.029462562226963324, - "grad_norm": 3.296875, - "learning_rate": 4.995309753026201e-05, - "loss": 0.328, - "step": 290 - }, - { - "epoch": 0.03047851264858275, - "grad_norm": 4.5, - "learning_rate": 4.994803226061927e-05, - "loss": 0.3667, - "step": 300 - }, - { - "epoch": 0.03149446307020217, - "grad_norm": 4.3125, - "learning_rate": 4.994270760274933e-05, - "loss": 0.3811, - "step": 310 - }, - { - "epoch": 0.0325104134918216, - "grad_norm": 3.421875, - "learning_rate": 4.99371236120134e-05, - "loss": 0.3065, - "step": 320 - }, - { - "epoch": 0.03352636391344102, - "grad_norm": 4.6875, - "learning_rate": 4.993128034646902e-05, - "loss": 0.4177, - "step": 330 - }, - { - "epoch": 0.03454231433506045, - "grad_norm": 3.046875, - "learning_rate": 4.992517786686947e-05, - "loss": 0.33, - "step": 340 - }, - { - "epoch": 0.03555826475667987, - "grad_norm": 1.8828125, - "learning_rate": 4.9918816236663077e-05, - "loss": 0.3287, - "step": 350 - }, - { - "epoch": 0.0365742151782993, - "grad_norm": 3.8125, - "learning_rate": 4.991219552199262e-05, - "loss": 0.2934, - "step": 360 - }, - { - "epoch": 0.03759016559991872, - "grad_norm": 4.28125, - "learning_rate": 4.99053157916946e-05, - "loss": 0.3176, - "step": 370 - }, - { - "epoch": 0.03860611602153815, - "grad_norm": 2.609375, - "learning_rate": 4.989817711729856e-05, - "loss": 0.3318, - "step": 380 - }, - { - "epoch": 0.03962206644315757, - "grad_norm": 2.375, - "learning_rate": 4.98907795730263e-05, - "loss": 0.3234, - "step": 390 - }, - { - "epoch": 0.040638016864777, - "grad_norm": 4.46875, - "learning_rate": 4.988312323579114e-05, - "loss": 0.267, - "step": 400 - }, - { - "epoch": 0.04165396728639642, - "grad_norm": 3.75, - "learning_rate": 4.98752081851971e-05, - "loss": 0.3081, - "step": 410 - }, - { - "epoch": 0.04266991770801585, - "grad_norm": 2.203125, - "learning_rate": 4.986703450353809e-05, - "loss": 0.2917, - "step": 420 - }, - { - "epoch": 0.04368586812963527, - "grad_norm": 1.6015625, - "learning_rate": 4.985860227579703e-05, - "loss": 0.2805, - "step": 430 - }, - { - "epoch": 0.0447018185512547, - "grad_norm": 3.140625, - "learning_rate": 4.984991158964499e-05, - "loss": 0.3534, - "step": 440 - }, - { - "epoch": 0.04571776897287412, - "grad_norm": 3.296875, - "learning_rate": 4.9840962535440265e-05, - "loss": 0.335, - "step": 450 - }, - { - "epoch": 0.04673371939449355, - "grad_norm": 3.25, - "learning_rate": 4.983175520622744e-05, - "loss": 0.2544, - "step": 460 - }, - { - "epoch": 0.04774966981611297, - "grad_norm": 2.25, - "learning_rate": 4.982228969773642e-05, - "loss": 0.3449, - "step": 470 - }, - { - "epoch": 0.0487656202377324, - "grad_norm": 4.9375, - "learning_rate": 4.9812566108381435e-05, - "loss": 0.2964, - "step": 480 - }, - { - "epoch": 0.04978157065935182, - "grad_norm": 1.5703125, - "learning_rate": 4.9802584539260035e-05, - "loss": 0.2799, - "step": 490 - }, - { - "epoch": 0.05079752108097125, - "grad_norm": 2.828125, - "learning_rate": 4.979234509415199e-05, - "loss": 0.3231, - "step": 500 - }, - { - "epoch": 0.05181347150259067, - "grad_norm": 2.9375, - "learning_rate": 4.978184787951828e-05, - "loss": 0.2943, - "step": 510 - }, - { - "epoch": 0.0528294219242101, - "grad_norm": 2.34375, - "learning_rate": 4.977109300449992e-05, - "loss": 0.2705, - "step": 520 - }, - { - "epoch": 0.05384537234582952, - "grad_norm": 3.140625, - "learning_rate": 4.9760080580916876e-05, - "loss": 0.2998, - "step": 530 - }, - { - "epoch": 0.05486132276744895, - "grad_norm": 3.5625, - "learning_rate": 4.974881072326688e-05, - "loss": 0.2595, - "step": 540 - }, - { - "epoch": 0.05587727318906837, - "grad_norm": 4.25, - "learning_rate": 4.9737283548724236e-05, - "loss": 0.2803, - "step": 550 - }, - { - "epoch": 0.0568932236106878, - "grad_norm": 4.0625, - "learning_rate": 4.97254991771386e-05, - "loss": 0.3511, - "step": 560 - }, - { - "epoch": 0.05790917403230722, - "grad_norm": 2.515625, - "learning_rate": 4.971345773103377e-05, - "loss": 0.312, - "step": 570 - }, - { - "epoch": 0.05892512445392665, - "grad_norm": 3.21875, - "learning_rate": 4.9701159335606365e-05, - "loss": 0.2482, - "step": 580 - }, - { - "epoch": 0.05994107487554607, - "grad_norm": 5.5, - "learning_rate": 4.968860411872454e-05, - "loss": 0.2537, - "step": 590 - }, - { - "epoch": 0.0609570252971655, - "grad_norm": 3.546875, - "learning_rate": 4.967579221092666e-05, - "loss": 0.3125, - "step": 600 - }, - { - "epoch": 0.06197297571878492, - "grad_norm": 2.984375, - "learning_rate": 4.966272374541996e-05, - "loss": 0.2354, - "step": 610 - }, - { - "epoch": 0.06298892614040434, - "grad_norm": 3.6875, - "learning_rate": 4.964939885807912e-05, - "loss": 0.3213, - "step": 620 - }, - { - "epoch": 0.06400487656202378, - "grad_norm": 2.140625, - "learning_rate": 4.9635817687444876e-05, - "loss": 0.3003, - "step": 630 - }, - { - "epoch": 0.0650208269836432, - "grad_norm": 3.484375, - "learning_rate": 4.962198037472259e-05, - "loss": 0.2996, - "step": 640 - }, - { - "epoch": 0.06603677740526262, - "grad_norm": 3.21875, - "learning_rate": 4.9607887063780776e-05, - "loss": 0.2257, - "step": 650 - }, - { - "epoch": 0.06705272782688204, - "grad_norm": 5.375, - "learning_rate": 4.9593537901149564e-05, - "loss": 0.223, - "step": 660 - }, - { - "epoch": 0.06806867824850148, - "grad_norm": 4.1875, - "learning_rate": 4.957893303601924e-05, - "loss": 0.3407, - "step": 670 - }, - { - "epoch": 0.0690846286701209, - "grad_norm": 3.328125, - "learning_rate": 4.956407262023866e-05, - "loss": 0.2589, - "step": 680 - }, - { - "epoch": 0.07010057909174032, - "grad_norm": 2.953125, - "learning_rate": 4.954895680831367e-05, - "loss": 0.2949, - "step": 690 - }, - { - "epoch": 0.07111652951335974, - "grad_norm": 4.0625, - "learning_rate": 4.9533585757405506e-05, - "loss": 0.2995, - "step": 700 - }, - { - "epoch": 0.07213247993497918, - "grad_norm": 4.625, - "learning_rate": 4.951795962732917e-05, - "loss": 0.2894, - "step": 710 - }, - { - "epoch": 0.0731484303565986, - "grad_norm": 3.0, - "learning_rate": 4.9502078580551755e-05, - "loss": 0.3082, - "step": 720 - }, - { - "epoch": 0.07416438077821802, - "grad_norm": 3.65625, - "learning_rate": 4.9485942782190734e-05, - "loss": 0.2308, - "step": 730 - }, - { - "epoch": 0.07518033119983744, - "grad_norm": 4.78125, - "learning_rate": 4.9469552400012306e-05, - "loss": 0.2272, - "step": 740 - }, - { - "epoch": 0.07619628162145688, - "grad_norm": 4.25, - "learning_rate": 4.94529076044296e-05, - "loss": 0.2701, - "step": 750 - }, - { - "epoch": 0.0772122320430763, - "grad_norm": 3.140625, - "learning_rate": 4.94360085685009e-05, - "loss": 0.2686, - "step": 760 - }, - { - "epoch": 0.07822818246469572, - "grad_norm": 0.765625, - "learning_rate": 4.9418855467927894e-05, - "loss": 0.2051, - "step": 770 - }, - { - "epoch": 0.07924413288631514, - "grad_norm": 1.796875, - "learning_rate": 4.940144848105379e-05, - "loss": 0.2267, - "step": 780 - }, - { - "epoch": 0.08026008330793458, - "grad_norm": 4.5625, - "learning_rate": 4.93837877888615e-05, - "loss": 0.2597, - "step": 790 - }, - { - "epoch": 0.081276033729554, - "grad_norm": 3.03125, - "learning_rate": 4.9365873574971745e-05, - "loss": 0.3701, - "step": 800 - }, - { - "epoch": 0.08229198415117342, - "grad_norm": 4.5625, - "learning_rate": 4.9347706025641136e-05, - "loss": 0.2559, - "step": 810 - }, - { - "epoch": 0.08330793457279284, - "grad_norm": 3.90625, - "learning_rate": 4.9329285329760275e-05, - "loss": 0.2799, - "step": 820 - }, - { - "epoch": 0.08432388499441228, - "grad_norm": 3.140625, - "learning_rate": 4.9310611678851735e-05, - "loss": 0.2866, - "step": 830 - }, - { - "epoch": 0.0853398354160317, - "grad_norm": 2.46875, - "learning_rate": 4.929168526706811e-05, - "loss": 0.3105, - "step": 840 - }, - { - "epoch": 0.08635578583765112, - "grad_norm": 13.625, - "learning_rate": 4.927250629119e-05, - "loss": 0.2454, - "step": 850 - }, - { - "epoch": 0.08737173625927054, - "grad_norm": 3.921875, - "learning_rate": 4.9253074950623925e-05, - "loss": 0.2424, - "step": 860 - }, - { - "epoch": 0.08838768668088998, - "grad_norm": 2.90625, - "learning_rate": 4.9233391447400286e-05, - "loss": 0.2481, - "step": 870 - }, - { - "epoch": 0.0894036371025094, - "grad_norm": 2.96875, - "learning_rate": 4.921345598617125e-05, - "loss": 0.2231, - "step": 880 - }, - { - "epoch": 0.09041958752412882, - "grad_norm": 5.375, - "learning_rate": 4.9193268774208654e-05, - "loss": 0.3447, - "step": 890 - }, - { - "epoch": 0.09143553794574824, - "grad_norm": 2.0, - "learning_rate": 4.9172830021401785e-05, - "loss": 0.229, - "step": 900 - }, - { - "epoch": 0.09245148836736768, - "grad_norm": 3.1875, - "learning_rate": 4.9152139940255245e-05, - "loss": 0.2122, - "step": 910 - }, - { - "epoch": 0.0934674387889871, - "grad_norm": 3.40625, - "learning_rate": 4.913119874588677e-05, - "loss": 0.2386, - "step": 920 - }, - { - "epoch": 0.09448338921060652, - "grad_norm": 1.4609375, - "learning_rate": 4.911000665602489e-05, - "loss": 0.1944, - "step": 930 - }, - { - "epoch": 0.09549933963222594, - "grad_norm": 5.0625, - "learning_rate": 4.9088563891006786e-05, - "loss": 0.2038, - "step": 940 - }, - { - "epoch": 0.09651529005384538, - "grad_norm": 4.53125, - "learning_rate": 4.906687067377592e-05, - "loss": 0.3122, - "step": 950 - }, - { - "epoch": 0.0975312404754648, - "grad_norm": 2.84375, - "learning_rate": 4.904492722987976e-05, - "loss": 0.3157, - "step": 960 - }, - { - "epoch": 0.09854719089708422, - "grad_norm": 2.171875, - "learning_rate": 4.902273378746738e-05, - "loss": 0.3077, - "step": 970 - }, - { - "epoch": 0.09956314131870364, - "grad_norm": 2.84375, - "learning_rate": 4.9000290577287165e-05, - "loss": 0.2756, - "step": 980 - }, - { - "epoch": 0.10057909174032308, - "grad_norm": 0.99609375, - "learning_rate": 4.897759783268434e-05, - "loss": 0.2915, - "step": 990 - }, - { - "epoch": 0.1015950421619425, - "grad_norm": 3.53125, - "learning_rate": 4.895465578959859e-05, - "loss": 0.2052, - "step": 1000 - }, - { - "epoch": 0.10261099258356192, - "grad_norm": 4.0, - "learning_rate": 4.893146468656159e-05, - "loss": 0.2499, - "step": 1010 - }, - { - "epoch": 0.10362694300518134, - "grad_norm": 1.65625, - "learning_rate": 4.890802476469452e-05, - "loss": 0.278, - "step": 1020 - }, - { - "epoch": 0.10464289342680078, - "grad_norm": 3.625, - "learning_rate": 4.888433626770558e-05, - "loss": 0.2143, - "step": 1030 - }, - { - "epoch": 0.1056588438484202, - "grad_norm": 5.0625, - "learning_rate": 4.886039944188741e-05, - "loss": 0.2878, - "step": 1040 - }, - { - "epoch": 0.10667479427003962, - "grad_norm": 4.5, - "learning_rate": 4.883621453611461e-05, - "loss": 0.2744, - "step": 1050 - }, - { - "epoch": 0.10769074469165904, - "grad_norm": 4.5625, - "learning_rate": 4.881178180184106e-05, - "loss": 0.2734, - "step": 1060 - }, - { - "epoch": 0.10870669511327848, - "grad_norm": 3.125, - "learning_rate": 4.878710149309735e-05, - "loss": 0.3574, - "step": 1070 - }, - { - "epoch": 0.1097226455348979, - "grad_norm": 3.0625, - "learning_rate": 4.876217386648816e-05, - "loss": 0.2625, - "step": 1080 - }, - { - "epoch": 0.11073859595651732, - "grad_norm": 4.0625, - "learning_rate": 4.873699918118955e-05, - "loss": 0.2437, - "step": 1090 - }, - { - "epoch": 0.11175454637813674, - "grad_norm": 1.59375, - "learning_rate": 4.87115776989463e-05, - "loss": 0.2051, - "step": 1100 - }, - { - "epoch": 0.11277049679975618, - "grad_norm": 4.375, - "learning_rate": 4.8685909684069153e-05, - "loss": 0.1727, - "step": 1110 - }, - { - "epoch": 0.1137864472213756, - "grad_norm": 2.28125, - "learning_rate": 4.865999540343211e-05, - "loss": 0.2256, - "step": 1120 - }, - { - "epoch": 0.11480239764299502, - "grad_norm": 2.265625, - "learning_rate": 4.86338351264696e-05, - "loss": 0.3529, - "step": 1130 - }, - { - "epoch": 0.11581834806461444, - "grad_norm": 2.34375, - "learning_rate": 4.8607429125173754e-05, - "loss": 0.2113, - "step": 1140 - }, - { - "epoch": 0.11683429848623388, - "grad_norm": 0.7578125, - "learning_rate": 4.858077767409149e-05, - "loss": 0.2759, - "step": 1150 - }, - { - "epoch": 0.1178502489078533, - "grad_norm": 3.640625, - "learning_rate": 4.855388105032174e-05, - "loss": 0.2482, - "step": 1160 - }, - { - "epoch": 0.11886619932947272, - "grad_norm": 3.5, - "learning_rate": 4.852673953351249e-05, - "loss": 0.1865, - "step": 1170 - }, - { - "epoch": 0.11988214975109214, - "grad_norm": 3.75, - "learning_rate": 4.849935340585796e-05, - "loss": 0.2659, - "step": 1180 - }, - { - "epoch": 0.12089810017271158, - "grad_norm": 3.375, - "learning_rate": 4.8471722952095586e-05, - "loss": 0.1506, - "step": 1190 - }, - { - "epoch": 0.121914050594331, - "grad_norm": 3.34375, - "learning_rate": 4.844384845950312e-05, - "loss": 0.307, - "step": 1200 - }, - { - "epoch": 0.12293000101595042, - "grad_norm": 1.578125, - "learning_rate": 4.841573021789561e-05, - "loss": 0.1952, - "step": 1210 - }, - { - "epoch": 0.12394595143756984, - "grad_norm": 1.2890625, - "learning_rate": 4.838736851962239e-05, - "loss": 0.1779, - "step": 1220 - }, - { - "epoch": 0.12496190185918928, - "grad_norm": 1.265625, - "learning_rate": 4.835876365956408e-05, - "loss": 0.1235, - "step": 1230 - }, - { - "epoch": 0.12597785228080868, - "grad_norm": 1.9609375, - "learning_rate": 4.8329915935129436e-05, - "loss": 0.1876, - "step": 1240 - }, - { - "epoch": 0.12699380270242813, - "grad_norm": 1.6328125, - "learning_rate": 4.830082564625235e-05, - "loss": 0.2188, - "step": 1250 - }, - { - "epoch": 0.12800975312404755, - "grad_norm": 3.96875, - "learning_rate": 4.8271493095388684e-05, - "loss": 0.2622, - "step": 1260 - }, - { - "epoch": 0.12902570354566698, - "grad_norm": 3.765625, - "learning_rate": 4.824191858751312e-05, - "loss": 0.2724, - "step": 1270 - }, - { - "epoch": 0.1300416539672864, - "grad_norm": 5.59375, - "learning_rate": 4.821210243011601e-05, - "loss": 0.2413, - "step": 1280 - }, - { - "epoch": 0.13105760438890582, - "grad_norm": 3.34375, - "learning_rate": 4.818204493320016e-05, - "loss": 0.2618, - "step": 1290 - }, - { - "epoch": 0.13207355481052524, - "grad_norm": 2.78125, - "learning_rate": 4.8151746409277634e-05, - "loss": 0.2295, - "step": 1300 - }, - { - "epoch": 0.13308950523214466, - "grad_norm": 3.1875, - "learning_rate": 4.8121207173366484e-05, - "loss": 0.2733, - "step": 1310 - }, - { - "epoch": 0.13410545565376408, - "grad_norm": 2.28125, - "learning_rate": 4.809042754298746e-05, - "loss": 0.2311, - "step": 1320 - }, - { - "epoch": 0.13512140607538353, - "grad_norm": 2.171875, - "learning_rate": 4.805940783816075e-05, - "loss": 0.2059, - "step": 1330 - }, - { - "epoch": 0.13613735649700295, - "grad_norm": 2.796875, - "learning_rate": 4.8028148381402625e-05, - "loss": 0.2102, - "step": 1340 - }, - { - "epoch": 0.13715330691862238, - "grad_norm": 2.96875, - "learning_rate": 4.7996649497722084e-05, - "loss": 0.2708, - "step": 1350 - }, - { - "epoch": 0.1381692573402418, - "grad_norm": 2.4375, - "learning_rate": 4.7964911514617485e-05, - "loss": 0.2429, - "step": 1360 - }, - { - "epoch": 0.13918520776186122, - "grad_norm": 5.8125, - "learning_rate": 4.793293476207312e-05, - "loss": 0.2725, - "step": 1370 - }, - { - "epoch": 0.14020115818348064, - "grad_norm": 2.40625, - "learning_rate": 4.790071957255585e-05, - "loss": 0.2098, - "step": 1380 - }, - { - "epoch": 0.14121710860510006, - "grad_norm": 4.25, - "learning_rate": 4.786826628101154e-05, - "loss": 0.2101, - "step": 1390 - }, - { - "epoch": 0.14223305902671948, - "grad_norm": 2.578125, - "learning_rate": 4.783557522486167e-05, - "loss": 0.2624, - "step": 1400 - }, - { - "epoch": 0.14324900944833893, - "grad_norm": 3.125, - "learning_rate": 4.780264674399978e-05, - "loss": 0.2518, - "step": 1410 - }, - { - "epoch": 0.14426495986995835, - "grad_norm": 3.671875, - "learning_rate": 4.7769481180787966e-05, - "loss": 0.3112, - "step": 1420 - }, - { - "epoch": 0.14528091029157778, - "grad_norm": 3.984375, - "learning_rate": 4.773607888005327e-05, - "loss": 0.2747, - "step": 1430 - }, - { - "epoch": 0.1462968607131972, - "grad_norm": 3.234375, - "learning_rate": 4.770244018908416e-05, - "loss": 0.1572, - "step": 1440 - }, - { - "epoch": 0.14731281113481662, - "grad_norm": 4.09375, - "learning_rate": 4.766856545762687e-05, - "loss": 0.2148, - "step": 1450 - }, - { - "epoch": 0.14832876155643604, - "grad_norm": 1.6875, - "learning_rate": 4.763445503788178e-05, - "loss": 0.2531, - "step": 1460 - }, - { - "epoch": 0.14934471197805546, - "grad_norm": 2.375, - "learning_rate": 4.760010928449976e-05, - "loss": 0.199, - "step": 1470 - }, - { - "epoch": 0.15036066239967488, - "grad_norm": 4.6875, - "learning_rate": 4.7565528554578485e-05, - "loss": 0.2366, - "step": 1480 - }, - { - "epoch": 0.15137661282129433, - "grad_norm": 5.4375, - "learning_rate": 4.75307132076587e-05, - "loss": 0.1862, - "step": 1490 - }, - { - "epoch": 0.15239256324291375, - "grad_norm": 2.484375, - "learning_rate": 4.749566360572049e-05, - "loss": 0.2143, - "step": 1500 - }, - { - "epoch": 0.15340851366453317, - "grad_norm": 2.1875, - "learning_rate": 4.746038011317955e-05, - "loss": 0.1877, - "step": 1510 - }, - { - "epoch": 0.1544244640861526, - "grad_norm": 2.84375, - "learning_rate": 4.742486309688333e-05, - "loss": 0.2831, - "step": 1520 - }, - { - "epoch": 0.15544041450777202, - "grad_norm": 2.015625, - "learning_rate": 4.738911292610732e-05, - "loss": 0.1708, - "step": 1530 - }, - { - "epoch": 0.15645636492939144, - "grad_norm": 3.953125, - "learning_rate": 4.735312997255107e-05, - "loss": 0.192, - "step": 1540 - }, - { - "epoch": 0.15747231535101086, - "grad_norm": 2.09375, - "learning_rate": 4.7316914610334475e-05, - "loss": 0.2586, - "step": 1550 - }, - { - "epoch": 0.15848826577263028, - "grad_norm": 3.6875, - "learning_rate": 4.728046721599378e-05, - "loss": 0.2141, - "step": 1560 - }, - { - "epoch": 0.15950421619424973, - "grad_norm": 2.9375, - "learning_rate": 4.724378816847771e-05, - "loss": 0.193, - "step": 1570 - }, - { - "epoch": 0.16052016661586915, - "grad_norm": 1.5625, - "learning_rate": 4.720687784914352e-05, - "loss": 0.191, - "step": 1580 - }, - { - "epoch": 0.16153611703748857, - "grad_norm": 3.75, - "learning_rate": 4.716973664175304e-05, - "loss": 0.2172, - "step": 1590 - }, - { - "epoch": 0.162552067459108, - "grad_norm": 3.125, - "learning_rate": 4.7132364932468645e-05, - "loss": 0.2134, - "step": 1600 - }, - { - "epoch": 0.16356801788072742, - "grad_norm": 4.09375, - "learning_rate": 4.709476310984932e-05, - "loss": 0.2055, - "step": 1610 - }, - { - "epoch": 0.16458396830234684, - "grad_norm": 3.875, - "learning_rate": 4.705693156484652e-05, - "loss": 0.2136, - "step": 1620 - }, - { - "epoch": 0.16559991872396626, - "grad_norm": 1.1796875, - "learning_rate": 4.7018870690800196e-05, - "loss": 0.1471, - "step": 1630 - }, - { - "epoch": 0.16661586914558568, - "grad_norm": 2.5, - "learning_rate": 4.698058088343465e-05, - "loss": 0.2308, - "step": 1640 - }, - { - "epoch": 0.16763181956720513, - "grad_norm": 1.390625, - "learning_rate": 4.6942062540854425e-05, - "loss": 0.2456, - "step": 1650 - }, - { - "epoch": 0.16864776998882455, - "grad_norm": 3.125, - "learning_rate": 4.69033160635402e-05, - "loss": 0.2654, - "step": 1660 - }, - { - "epoch": 0.16966372041044397, - "grad_norm": 3.984375, - "learning_rate": 4.6864341854344587e-05, - "loss": 0.2226, - "step": 1670 - }, - { - "epoch": 0.1706796708320634, - "grad_norm": 2.328125, - "learning_rate": 4.682514031848795e-05, - "loss": 0.2438, - "step": 1680 - }, - { - "epoch": 0.17169562125368282, - "grad_norm": 3.078125, - "learning_rate": 4.678571186355423e-05, - "loss": 0.1889, - "step": 1690 - }, - { - "epoch": 0.17271157167530224, - "grad_norm": 3.328125, - "learning_rate": 4.6746056899486644e-05, - "loss": 0.2117, - "step": 1700 - }, - { - "epoch": 0.17372752209692166, - "grad_norm": 2.78125, - "learning_rate": 4.67061758385835e-05, - "loss": 0.1953, - "step": 1710 - }, - { - "epoch": 0.17474347251854108, - "grad_norm": 3.09375, - "learning_rate": 4.6666069095493816e-05, - "loss": 0.1844, - "step": 1720 - }, - { - "epoch": 0.17575942294016053, - "grad_norm": 3.234375, - "learning_rate": 4.662573708721309e-05, - "loss": 0.2774, - "step": 1730 - }, - { - "epoch": 0.17677537336177995, - "grad_norm": 4.03125, - "learning_rate": 4.658518023307894e-05, - "loss": 0.2527, - "step": 1740 - }, - { - "epoch": 0.17779132378339937, - "grad_norm": 3.21875, - "learning_rate": 4.654439895476671e-05, - "loss": 0.2164, - "step": 1750 - }, - { - "epoch": 0.1788072742050188, - "grad_norm": 2.390625, - "learning_rate": 4.6503393676285146e-05, - "loss": 0.2424, - "step": 1760 - }, - { - "epoch": 0.17982322462663822, - "grad_norm": 1.8359375, - "learning_rate": 4.646216482397192e-05, - "loss": 0.2428, - "step": 1770 - }, - { - "epoch": 0.18083917504825764, - "grad_norm": 2.796875, - "learning_rate": 4.6420712826489275e-05, - "loss": 0.2155, - "step": 1780 - }, - { - "epoch": 0.18185512546987706, - "grad_norm": 0.69921875, - "learning_rate": 4.6379038114819485e-05, - "loss": 0.1544, - "step": 1790 - }, - { - "epoch": 0.18287107589149648, - "grad_norm": 3.40625, - "learning_rate": 4.6337141122260444e-05, - "loss": 0.2029, - "step": 1800 - }, - { - "epoch": 0.18388702631311593, - "grad_norm": 2.359375, - "learning_rate": 4.629502228442112e-05, - "loss": 0.1489, - "step": 1810 - }, - { - "epoch": 0.18490297673473535, - "grad_norm": 1.4453125, - "learning_rate": 4.6252682039217045e-05, - "loss": 0.2101, - "step": 1820 - }, - { - "epoch": 0.18591892715635477, - "grad_norm": 2.71875, - "learning_rate": 4.621012082686573e-05, - "loss": 0.2076, - "step": 1830 - }, - { - "epoch": 0.1869348775779742, - "grad_norm": 3.0625, - "learning_rate": 4.616733908988216e-05, - "loss": 0.2719, - "step": 1840 - }, - { - "epoch": 0.18795082799959362, - "grad_norm": 1.953125, - "learning_rate": 4.612433727307409e-05, - "loss": 0.2105, - "step": 1850 - }, - { - "epoch": 0.18896677842121304, - "grad_norm": 3.46875, - "learning_rate": 4.608111582353751e-05, - "loss": 0.1877, - "step": 1860 - }, - { - "epoch": 0.18998272884283246, - "grad_norm": 2.546875, - "learning_rate": 4.603767519065197e-05, - "loss": 0.2238, - "step": 1870 - }, - { - "epoch": 0.19099867926445188, - "grad_norm": 1.5703125, - "learning_rate": 4.599401582607589e-05, - "loss": 0.243, - "step": 1880 - }, - { - "epoch": 0.19201462968607133, - "grad_norm": 2.4375, - "learning_rate": 4.595013818374185e-05, - "loss": 0.1867, - "step": 1890 - }, - { - "epoch": 0.19303058010769075, - "grad_norm": 2.203125, - "learning_rate": 4.5906042719851925e-05, - "loss": 0.1994, - "step": 1900 - }, - { - "epoch": 0.19404653052931017, - "grad_norm": 3.984375, - "learning_rate": 4.586172989287291e-05, - "loss": 0.1899, - "step": 1910 - }, - { - "epoch": 0.1950624809509296, - "grad_norm": 2.6875, - "learning_rate": 4.5817200163531534e-05, - "loss": 0.2528, - "step": 1920 - }, - { - "epoch": 0.19607843137254902, - "grad_norm": 2.71875, - "learning_rate": 4.577245399480972e-05, - "loss": 0.2336, - "step": 1930 - }, - { - "epoch": 0.19709438179416844, - "grad_norm": 2.640625, - "learning_rate": 4.5727491851939715e-05, - "loss": 0.2204, - "step": 1940 - }, - { - "epoch": 0.19811033221578786, - "grad_norm": 1.78125, - "learning_rate": 4.568231420239929e-05, - "loss": 0.1656, - "step": 1950 - }, - { - "epoch": 0.19912628263740728, - "grad_norm": 3.15625, - "learning_rate": 4.563692151590687e-05, - "loss": 0.2105, - "step": 1960 - }, - { - "epoch": 0.20014223305902673, - "grad_norm": 1.3671875, - "learning_rate": 4.5591314264416666e-05, - "loss": 0.1464, - "step": 1970 - }, - { - "epoch": 0.20115818348064615, - "grad_norm": 4.25, - "learning_rate": 4.554549292211371e-05, - "loss": 0.2103, - "step": 1980 - }, - { - "epoch": 0.20217413390226557, - "grad_norm": 2.984375, - "learning_rate": 4.549945796540901e-05, - "loss": 0.144, - "step": 1990 - }, - { - "epoch": 0.203190084323885, - "grad_norm": 1.859375, - "learning_rate": 4.545320987293453e-05, - "loss": 0.1963, - "step": 2000 - }, - { - "epoch": 0.20420603474550442, - "grad_norm": 1.078125, - "learning_rate": 4.540674912553824e-05, - "loss": 0.2115, - "step": 2010 - }, - { - "epoch": 0.20522198516712384, - "grad_norm": 4.25, - "learning_rate": 4.536007620627911e-05, - "loss": 0.1682, - "step": 2020 - }, - { - "epoch": 0.20623793558874326, - "grad_norm": 2.71875, - "learning_rate": 4.531319160042212e-05, - "loss": 0.1992, - "step": 2030 - }, - { - "epoch": 0.20725388601036268, - "grad_norm": 1.2890625, - "learning_rate": 4.5266095795433126e-05, - "loss": 0.1134, - "step": 2040 - }, - { - "epoch": 0.20826983643198213, - "grad_norm": 3.296875, - "learning_rate": 4.5218789280973925e-05, - "loss": 0.1474, - "step": 2050 - }, - { - "epoch": 0.20928578685360155, - "grad_norm": 1.9375, - "learning_rate": 4.5171272548897024e-05, - "loss": 0.1955, - "step": 2060 - }, - { - "epoch": 0.21030173727522097, - "grad_norm": 2.734375, - "learning_rate": 4.512354609324063e-05, - "loss": 0.2042, - "step": 2070 - }, - { - "epoch": 0.2113176876968404, - "grad_norm": 2.921875, - "learning_rate": 4.507561041022347e-05, - "loss": 0.2174, - "step": 2080 - }, - { - "epoch": 0.21233363811845982, - "grad_norm": 2.40625, - "learning_rate": 4.502746599823963e-05, - "loss": 0.2634, - "step": 2090 - }, - { - "epoch": 0.21334958854007924, - "grad_norm": 1.71875, - "learning_rate": 4.497911335785339e-05, - "loss": 0.1884, - "step": 2100 - }, - { - "epoch": 0.21436553896169866, - "grad_norm": 0.79296875, - "learning_rate": 4.4930552991794e-05, - "loss": 0.1872, - "step": 2110 - }, - { - "epoch": 0.21538148938331808, - "grad_norm": 3.171875, - "learning_rate": 4.4881785404950474e-05, - "loss": 0.2233, - "step": 2120 - }, - { - "epoch": 0.21639743980493753, - "grad_norm": 2.59375, - "learning_rate": 4.483281110436631e-05, - "loss": 0.2374, - "step": 2130 - }, - { - "epoch": 0.21741339022655695, - "grad_norm": 3.328125, - "learning_rate": 4.478363059923426e-05, - "loss": 0.2545, - "step": 2140 - }, - { - "epoch": 0.21842934064817637, - "grad_norm": 2.3125, - "learning_rate": 4.4734244400891014e-05, - "loss": 0.2063, - "step": 2150 - }, - { - "epoch": 0.2194452910697958, - "grad_norm": 3.40625, - "learning_rate": 4.4684653022811865e-05, - "loss": 0.1219, - "step": 2160 - }, - { - "epoch": 0.22046124149141522, - "grad_norm": 4.1875, - "learning_rate": 4.463485698060541e-05, - "loss": 0.2805, - "step": 2170 - }, - { - "epoch": 0.22147719191303464, - "grad_norm": 2.3125, - "learning_rate": 4.458485679200814e-05, - "loss": 0.1998, - "step": 2180 - }, - { - "epoch": 0.22249314233465406, - "grad_norm": 3.578125, - "learning_rate": 4.453465297687912e-05, - "loss": 0.2489, - "step": 2190 - }, - { - "epoch": 0.22350909275627348, - "grad_norm": 2.59375, - "learning_rate": 4.448424605719452e-05, - "loss": 0.2731, - "step": 2200 - }, - { - "epoch": 0.22452504317789293, - "grad_norm": 3.28125, - "learning_rate": 4.443363655704224e-05, - "loss": 0.2425, - "step": 2210 - }, - { - "epoch": 0.22554099359951235, - "grad_norm": 2.78125, - "learning_rate": 4.438282500261641e-05, - "loss": 0.2938, - "step": 2220 - }, - { - "epoch": 0.22655694402113177, - "grad_norm": 1.1953125, - "learning_rate": 4.433181192221197e-05, - "loss": 0.1728, - "step": 2230 - }, - { - "epoch": 0.2275728944427512, - "grad_norm": 1.34375, - "learning_rate": 4.4280597846219155e-05, - "loss": 0.216, - "step": 2240 - }, - { - "epoch": 0.22858884486437062, - "grad_norm": 1.8515625, - "learning_rate": 4.422918330711796e-05, - "loss": 0.1612, - "step": 2250 - }, - { - "epoch": 0.22960479528599004, - "grad_norm": 1.90625, - "learning_rate": 4.417756883947263e-05, - "loss": 0.107, - "step": 2260 - }, - { - "epoch": 0.23062074570760946, - "grad_norm": 3.375, - "learning_rate": 4.412575497992611e-05, - "loss": 0.1756, - "step": 2270 - }, - { - "epoch": 0.23163669612922888, - "grad_norm": 4.375, - "learning_rate": 4.407374226719445e-05, - "loss": 0.234, - "step": 2280 - }, - { - "epoch": 0.23265264655084833, - "grad_norm": 3.25, - "learning_rate": 4.402153124206119e-05, - "loss": 0.2144, - "step": 2290 - }, - { - "epoch": 0.23366859697246775, - "grad_norm": 1.703125, - "learning_rate": 4.396912244737173e-05, - "loss": 0.1696, - "step": 2300 - }, - { - "epoch": 0.23468454739408717, - "grad_norm": 2.84375, - "learning_rate": 4.391651642802778e-05, - "loss": 0.2506, - "step": 2310 - }, - { - "epoch": 0.2357004978157066, - "grad_norm": 4.5, - "learning_rate": 4.386371373098155e-05, - "loss": 0.1686, - "step": 2320 - }, - { - "epoch": 0.23671644823732602, - "grad_norm": 2.515625, - "learning_rate": 4.381071490523018e-05, - "loss": 0.2403, - "step": 2330 - }, - { - "epoch": 0.23773239865894544, - "grad_norm": 4.4375, - "learning_rate": 4.3757520501809955e-05, - "loss": 0.1611, - "step": 2340 - }, - { - "epoch": 0.23874834908056486, - "grad_norm": 1.609375, - "learning_rate": 4.370413107379065e-05, - "loss": 0.1698, - "step": 2350 - }, - { - "epoch": 0.23976429950218428, - "grad_norm": 4.96875, - "learning_rate": 4.36505471762697e-05, - "loss": 0.1928, - "step": 2360 - }, - { - "epoch": 0.24078024992380373, - "grad_norm": 0.8984375, - "learning_rate": 4.3596769366366474e-05, - "loss": 0.2035, - "step": 2370 - }, - { - "epoch": 0.24179620034542315, - "grad_norm": 5.75, - "learning_rate": 4.354279820321649e-05, - "loss": 0.16, - "step": 2380 - }, - { - "epoch": 0.24281215076704257, - "grad_norm": 1.9453125, - "learning_rate": 4.34886342479656e-05, - "loss": 0.1851, - "step": 2390 - }, - { - "epoch": 0.243828101188662, - "grad_norm": 1.015625, - "learning_rate": 4.34342780637641e-05, - "loss": 0.1726, - "step": 2400 - }, - { - "epoch": 0.24484405161028142, - "grad_norm": 4.59375, - "learning_rate": 4.337973021576095e-05, - "loss": 0.2847, - "step": 2410 - }, - { - "epoch": 0.24586000203190084, - "grad_norm": 1.03125, - "learning_rate": 4.3324991271097846e-05, - "loss": 0.2528, - "step": 2420 - }, - { - "epoch": 0.24687595245352026, - "grad_norm": 2.1875, - "learning_rate": 4.3270061798903374e-05, - "loss": 0.1573, - "step": 2430 - }, - { - "epoch": 0.24789190287513968, - "grad_norm": 0.98046875, - "learning_rate": 4.321494237028701e-05, - "loss": 0.1703, - "step": 2440 - }, - { - "epoch": 0.24890785329675913, - "grad_norm": 3.8125, - "learning_rate": 4.31596335583333e-05, - "loss": 0.2613, - "step": 2450 - }, - { - "epoch": 0.24992380371837855, - "grad_norm": 4.0625, - "learning_rate": 4.310413593809579e-05, - "loss": 0.22, - "step": 2460 - }, - { - "epoch": 0.250939754139998, - "grad_norm": 3.15625, - "learning_rate": 4.304845008659108e-05, - "loss": 0.1263, - "step": 2470 - }, - { - "epoch": 0.25195570456161737, - "grad_norm": 3.046875, - "learning_rate": 4.2992576582792895e-05, - "loss": 0.1639, - "step": 2480 - }, - { - "epoch": 0.2529716549832368, - "grad_norm": 9.8125, - "learning_rate": 4.293651600762595e-05, - "loss": 0.2681, - "step": 2490 - }, - { - "epoch": 0.25398760540485626, - "grad_norm": 3.734375, - "learning_rate": 4.288026894395999e-05, - "loss": 0.2292, - "step": 2500 - }, - { - "epoch": 0.25500355582647566, - "grad_norm": 0.455078125, - "learning_rate": 4.2823835976603723e-05, - "loss": 0.2324, - "step": 2510 - }, - { - "epoch": 0.2560195062480951, - "grad_norm": 5.625, - "learning_rate": 4.276721769229869e-05, - "loss": 0.1834, - "step": 2520 - }, - { - "epoch": 0.2570354566697145, - "grad_norm": 1.3671875, - "learning_rate": 4.271041467971323e-05, - "loss": 0.1826, - "step": 2530 - }, - { - "epoch": 0.25805140709133395, - "grad_norm": 5.0625, - "learning_rate": 4.265342752943632e-05, - "loss": 0.2463, - "step": 2540 - }, - { - "epoch": 0.25906735751295334, - "grad_norm": 2.859375, - "learning_rate": 4.2596256833971425e-05, - "loss": 0.2598, - "step": 2550 - }, - { - "epoch": 0.2600833079345728, - "grad_norm": 1.8515625, - "learning_rate": 4.2538903187730374e-05, - "loss": 0.1148, - "step": 2560 - }, - { - "epoch": 0.26109925835619224, - "grad_norm": 2.71875, - "learning_rate": 4.248136718702716e-05, - "loss": 0.2123, - "step": 2570 - }, - { - "epoch": 0.26211520877781164, - "grad_norm": 4.5625, - "learning_rate": 4.242364943007172e-05, - "loss": 0.2369, - "step": 2580 - }, - { - "epoch": 0.2631311591994311, - "grad_norm": 2.296875, - "learning_rate": 4.236575051696377e-05, - "loss": 0.261, - "step": 2590 - }, - { - "epoch": 0.2641471096210505, - "grad_norm": 2.75, - "learning_rate": 4.2307671049686514e-05, - "loss": 0.1564, - "step": 2600 - }, - { - "epoch": 0.26516306004266993, - "grad_norm": 3.5, - "learning_rate": 4.2249411632100396e-05, - "loss": 0.1563, - "step": 2610 - }, - { - "epoch": 0.2661790104642893, - "grad_norm": 2.84375, - "learning_rate": 4.219097286993684e-05, - "loss": 0.1697, - "step": 2620 - }, - { - "epoch": 0.26719496088590877, - "grad_norm": 2.125, - "learning_rate": 4.2132355370791946e-05, - "loss": 0.1844, - "step": 2630 - }, - { - "epoch": 0.26821091130752817, - "grad_norm": 4.03125, - "learning_rate": 4.2073559744120156e-05, - "loss": 0.2144, - "step": 2640 - }, - { - "epoch": 0.2692268617291476, - "grad_norm": 2.375, - "learning_rate": 4.201458660122793e-05, - "loss": 0.2013, - "step": 2650 - }, - { - "epoch": 0.27024281215076706, - "grad_norm": 3.625, - "learning_rate": 4.1955436555267393e-05, - "loss": 0.2166, - "step": 2660 - }, - { - "epoch": 0.27125876257238646, - "grad_norm": 0.328125, - "learning_rate": 4.189611022122997e-05, - "loss": 0.1934, - "step": 2670 - }, - { - "epoch": 0.2722747129940059, - "grad_norm": 2.75, - "learning_rate": 4.1836608215939944e-05, - "loss": 0.2157, - "step": 2680 - }, - { - "epoch": 0.2732906634156253, - "grad_norm": 3.5, - "learning_rate": 4.17769311580481e-05, - "loss": 0.18, - "step": 2690 - }, - { - "epoch": 0.27430661383724475, - "grad_norm": 2.109375, - "learning_rate": 4.171707966802528e-05, - "loss": 0.2178, - "step": 2700 - }, - { - "epoch": 0.27532256425886414, - "grad_norm": 4.65625, - "learning_rate": 4.16570543681559e-05, - "loss": 0.1896, - "step": 2710 - }, - { - "epoch": 0.2763385146804836, - "grad_norm": 4.8125, - "learning_rate": 4.159685588253151e-05, - "loss": 0.1322, - "step": 2720 - }, - { - "epoch": 0.27735446510210304, - "grad_norm": 3.9375, - "learning_rate": 4.153648483704429e-05, - "loss": 0.184, - "step": 2730 - }, - { - "epoch": 0.27837041552372244, - "grad_norm": 4.53125, - "learning_rate": 4.147594185938057e-05, - "loss": 0.2451, - "step": 2740 - }, - { - "epoch": 0.2793863659453419, - "grad_norm": 1.0390625, - "learning_rate": 4.141522757901426e-05, - "loss": 0.2367, - "step": 2750 - }, - { - "epoch": 0.2804023163669613, - "grad_norm": 3.375, - "learning_rate": 4.1354342627200345e-05, - "loss": 0.179, - "step": 2760 - }, - { - "epoch": 0.28141826678858073, - "grad_norm": 2.953125, - "learning_rate": 4.1293287636968286e-05, - "loss": 0.1396, - "step": 2770 - }, - { - "epoch": 0.2824342172102001, - "grad_norm": 2.546875, - "learning_rate": 4.1232063243115485e-05, - "loss": 0.1963, - "step": 2780 - }, - { - "epoch": 0.28345016763181957, - "grad_norm": 5.09375, - "learning_rate": 4.117067008220063e-05, - "loss": 0.2457, - "step": 2790 - }, - { - "epoch": 0.28446611805343897, - "grad_norm": 2.046875, - "learning_rate": 4.110910879253712e-05, - "loss": 0.2262, - "step": 2800 - }, - { - "epoch": 0.2854820684750584, - "grad_norm": 2.1875, - "learning_rate": 4.104738001418641e-05, - "loss": 0.2499, - "step": 2810 - }, - { - "epoch": 0.28649801889667786, - "grad_norm": 2.59375, - "learning_rate": 4.098548438895135e-05, - "loss": 0.1667, - "step": 2820 - }, - { - "epoch": 0.28751396931829726, - "grad_norm": 2.875, - "learning_rate": 4.092342256036954e-05, - "loss": 0.2288, - "step": 2830 - }, - { - "epoch": 0.2885299197399167, - "grad_norm": 3.015625, - "learning_rate": 4.086119517370659e-05, - "loss": 0.2038, - "step": 2840 - }, - { - "epoch": 0.2895458701615361, - "grad_norm": 3.53125, - "learning_rate": 4.0798802875949485e-05, - "loss": 0.181, - "step": 2850 - }, - { - "epoch": 0.29056182058315555, - "grad_norm": 2.296875, - "learning_rate": 4.073624631579975e-05, - "loss": 0.1886, - "step": 2860 - }, - { - "epoch": 0.29157777100477494, - "grad_norm": 3.609375, - "learning_rate": 4.067352614366685e-05, - "loss": 0.2053, - "step": 2870 - }, - { - "epoch": 0.2925937214263944, - "grad_norm": 2.328125, - "learning_rate": 4.061064301166128e-05, - "loss": 0.1409, - "step": 2880 - }, - { - "epoch": 0.29360967184801384, - "grad_norm": 4.9375, - "learning_rate": 4.054759757358787e-05, - "loss": 0.184, - "step": 2890 - }, - { - "epoch": 0.29462562226963324, - "grad_norm": 4.6875, - "learning_rate": 4.048439048493898e-05, - "loss": 0.2306, - "step": 2900 - }, - { - "epoch": 0.2956415726912527, - "grad_norm": 4.09375, - "learning_rate": 4.0421022402887676e-05, - "loss": 0.1914, - "step": 2910 - }, - { - "epoch": 0.2966575231128721, - "grad_norm": 2.3125, - "learning_rate": 4.035749398628088e-05, - "loss": 0.1653, - "step": 2920 - }, - { - "epoch": 0.29767347353449153, - "grad_norm": 2.515625, - "learning_rate": 4.029380589563256e-05, - "loss": 0.1941, - "step": 2930 - }, - { - "epoch": 0.2986894239561109, - "grad_norm": 1.78125, - "learning_rate": 4.02299587931168e-05, - "loss": 0.1117, - "step": 2940 - }, - { - "epoch": 0.29970537437773037, - "grad_norm": 0.8359375, - "learning_rate": 4.0165953342560974e-05, - "loss": 0.1605, - "step": 2950 - }, - { - "epoch": 0.30072132479934977, - "grad_norm": 3.046875, - "learning_rate": 4.010179020943884e-05, - "loss": 0.1726, - "step": 2960 - }, - { - "epoch": 0.3017372752209692, - "grad_norm": 3.453125, - "learning_rate": 4.003747006086357e-05, - "loss": 0.2208, - "step": 2970 - }, - { - "epoch": 0.30275322564258866, - "grad_norm": 2.515625, - "learning_rate": 3.9972993565580866e-05, - "loss": 0.1325, - "step": 2980 - }, - { - "epoch": 0.30376917606420806, - "grad_norm": 3.046875, - "learning_rate": 3.9908361393962e-05, - "loss": 0.2014, - "step": 2990 - }, - { - "epoch": 0.3047851264858275, - "grad_norm": 2.28125, - "learning_rate": 3.984357421799681e-05, - "loss": 0.165, - "step": 3000 - }, - { - "epoch": 0.3058010769074469, - "grad_norm": 5.09375, - "learning_rate": 3.9778632711286756e-05, - "loss": 0.212, - "step": 3010 - }, - { - "epoch": 0.30681702732906635, - "grad_norm": 4.25, - "learning_rate": 3.971353754903788e-05, - "loss": 0.2388, - "step": 3020 - }, - { - "epoch": 0.30783297775068574, - "grad_norm": 2.34375, - "learning_rate": 3.964828940805381e-05, - "loss": 0.2175, - "step": 3030 - }, - { - "epoch": 0.3088489281723052, - "grad_norm": 4.09375, - "learning_rate": 3.95828889667287e-05, - "loss": 0.2088, - "step": 3040 - }, - { - "epoch": 0.30986487859392464, - "grad_norm": 2.359375, - "learning_rate": 3.9517336905040244e-05, - "loss": 0.1913, - "step": 3050 - }, - { - "epoch": 0.31088082901554404, - "grad_norm": 1.1640625, - "learning_rate": 3.9451633904542483e-05, - "loss": 0.2185, - "step": 3060 - }, - { - "epoch": 0.3118967794371635, - "grad_norm": 2.59375, - "learning_rate": 3.9385780648358846e-05, - "loss": 0.2072, - "step": 3070 - }, - { - "epoch": 0.3129127298587829, - "grad_norm": 3.015625, - "learning_rate": 3.9319777821174955e-05, - "loss": 0.1902, - "step": 3080 - }, - { - "epoch": 0.31392868028040233, - "grad_norm": 2.375, - "learning_rate": 3.925362610923158e-05, - "loss": 0.259, - "step": 3090 - }, - { - "epoch": 0.3149446307020217, - "grad_norm": 4.65625, - "learning_rate": 3.918732620031742e-05, - "loss": 0.2026, - "step": 3100 - }, - { - "epoch": 0.31596058112364117, - "grad_norm": 2.1875, - "learning_rate": 3.912087878376205e-05, - "loss": 0.1478, - "step": 3110 - }, - { - "epoch": 0.31697653154526056, - "grad_norm": 2.34375, - "learning_rate": 3.905428455042865e-05, - "loss": 0.167, - "step": 3120 - }, - { - "epoch": 0.31799248196688, - "grad_norm": 2.390625, - "learning_rate": 3.898754419270693e-05, - "loss": 0.1629, - "step": 3130 - }, - { - "epoch": 0.31900843238849946, - "grad_norm": 1.546875, - "learning_rate": 3.892065840450583e-05, - "loss": 0.1308, - "step": 3140 - }, - { - "epoch": 0.32002438281011886, - "grad_norm": 4.625, - "learning_rate": 3.885362788124637e-05, - "loss": 0.2008, - "step": 3150 - }, - { - "epoch": 0.3210403332317383, - "grad_norm": 3.8125, - "learning_rate": 3.8786453319854396e-05, - "loss": 0.2225, - "step": 3160 - }, - { - "epoch": 0.3220562836533577, - "grad_norm": 3.015625, - "learning_rate": 3.8719135418753366e-05, - "loss": 0.2243, - "step": 3170 - }, - { - "epoch": 0.32307223407497715, - "grad_norm": 5.6875, - "learning_rate": 3.865167487785702e-05, - "loss": 0.1981, - "step": 3180 - }, - { - "epoch": 0.32408818449659654, - "grad_norm": 4.84375, - "learning_rate": 3.8584072398562164e-05, - "loss": 0.2031, - "step": 3190 - }, - { - "epoch": 0.325104134918216, - "grad_norm": 4.0625, - "learning_rate": 3.851632868374136e-05, - "loss": 0.1621, - "step": 3200 - }, - { - "epoch": 0.32612008533983544, - "grad_norm": 3.421875, - "learning_rate": 3.844844443773562e-05, - "loss": 0.1674, - "step": 3210 - }, - { - "epoch": 0.32713603576145484, - "grad_norm": 1.3671875, - "learning_rate": 3.8380420366347046e-05, - "loss": 0.1502, - "step": 3220 - }, - { - "epoch": 0.3281519861830743, - "grad_norm": 3.734375, - "learning_rate": 3.831225717683157e-05, - "loss": 0.1868, - "step": 3230 - }, - { - "epoch": 0.3291679366046937, - "grad_norm": 2.703125, - "learning_rate": 3.8243955577891534e-05, - "loss": 0.1818, - "step": 3240 - }, - { - "epoch": 0.3301838870263131, - "grad_norm": 3.796875, - "learning_rate": 3.8175516279668335e-05, - "loss": 0.2215, - "step": 3250 - }, - { - "epoch": 0.3311998374479325, - "grad_norm": 3.203125, - "learning_rate": 3.810693999373505e-05, - "loss": 0.2544, - "step": 3260 - }, - { - "epoch": 0.33221578786955197, - "grad_norm": 4.0, - "learning_rate": 3.8038227433089056e-05, - "loss": 0.1175, - "step": 3270 - }, - { - "epoch": 0.33323173829117136, - "grad_norm": 3.625, - "learning_rate": 3.796937931214458e-05, - "loss": 0.2213, - "step": 3280 - }, - { - "epoch": 0.3342476887127908, - "grad_norm": 1.7265625, - "learning_rate": 3.7900396346725296e-05, - "loss": 0.1711, - "step": 3290 - }, - { - "epoch": 0.33526363913441026, - "grad_norm": 3.140625, - "learning_rate": 3.783127925405686e-05, - "loss": 0.2628, - "step": 3300 - }, - { - "epoch": 0.33627958955602966, - "grad_norm": 2.1875, - "learning_rate": 3.77620287527595e-05, - "loss": 0.1671, - "step": 3310 - }, - { - "epoch": 0.3372955399776491, - "grad_norm": 5.28125, - "learning_rate": 3.769264556284048e-05, - "loss": 0.2109, - "step": 3320 - }, - { - "epoch": 0.3383114903992685, - "grad_norm": 2.875, - "learning_rate": 3.762313040568665e-05, - "loss": 0.1978, - "step": 3330 - }, - { - "epoch": 0.33932744082088795, - "grad_norm": 2.234375, - "learning_rate": 3.755348400405697e-05, - "loss": 0.1275, - "step": 3340 - }, - { - "epoch": 0.34034339124250734, - "grad_norm": 1.9453125, - "learning_rate": 3.7483707082074945e-05, - "loss": 0.1482, - "step": 3350 - }, - { - "epoch": 0.3413593416641268, - "grad_norm": 5.40625, - "learning_rate": 3.741380036522111e-05, - "loss": 0.1933, - "step": 3360 - }, - { - "epoch": 0.34237529208574624, - "grad_norm": 4.53125, - "learning_rate": 3.734376458032551e-05, - "loss": 0.1925, - "step": 3370 - }, - { - "epoch": 0.34339124250736563, - "grad_norm": 4.0625, - "learning_rate": 3.727360045556014e-05, - "loss": 0.2297, - "step": 3380 - }, - { - "epoch": 0.3444071929289851, - "grad_norm": 2.53125, - "learning_rate": 3.7203308720431336e-05, - "loss": 0.1704, - "step": 3390 - }, - { - "epoch": 0.3454231433506045, - "grad_norm": 1.859375, - "learning_rate": 3.7132890105772234e-05, - "loss": 0.258, - "step": 3400 - }, - { - "epoch": 0.3464390937722239, - "grad_norm": 3.90625, - "learning_rate": 3.706234534373515e-05, - "loss": 0.2376, - "step": 3410 - }, - { - "epoch": 0.3474550441938433, - "grad_norm": 1.1015625, - "learning_rate": 3.6991675167783985e-05, - "loss": 0.2403, - "step": 3420 - }, - { - "epoch": 0.34847099461546277, - "grad_norm": 1.1640625, - "learning_rate": 3.6920880312686556e-05, - "loss": 0.1642, - "step": 3430 - }, - { - "epoch": 0.34948694503708216, - "grad_norm": 2.875, - "learning_rate": 3.684996151450702e-05, - "loss": 0.1455, - "step": 3440 - }, - { - "epoch": 0.3505028954587016, - "grad_norm": 0.59765625, - "learning_rate": 3.6778919510598155e-05, - "loss": 0.2175, - "step": 3450 - }, - { - "epoch": 0.35151884588032106, - "grad_norm": 0.93359375, - "learning_rate": 3.670775503959376e-05, - "loss": 0.1858, - "step": 3460 - }, - { - "epoch": 0.35253479630194046, - "grad_norm": 4.1875, - "learning_rate": 3.6636468841400917e-05, - "loss": 0.1911, - "step": 3470 - }, - { - "epoch": 0.3535507467235599, - "grad_norm": 3.734375, - "learning_rate": 3.656506165719233e-05, - "loss": 0.2114, - "step": 3480 - }, - { - "epoch": 0.3545666971451793, - "grad_norm": 1.171875, - "learning_rate": 3.649353422939863e-05, - "loss": 0.1841, - "step": 3490 - }, - { - "epoch": 0.35558264756679875, - "grad_norm": 2.53125, - "learning_rate": 3.6421887301700615e-05, - "loss": 0.1505, - "step": 3500 - }, - { - "epoch": 0.35659859798841814, - "grad_norm": 4.9375, - "learning_rate": 3.6350121619021524e-05, - "loss": 0.2625, - "step": 3510 - }, - { - "epoch": 0.3576145484100376, - "grad_norm": 5.25, - "learning_rate": 3.627823792751936e-05, - "loss": 0.1676, - "step": 3520 - }, - { - "epoch": 0.35863049883165704, - "grad_norm": 1.09375, - "learning_rate": 3.620623697457905e-05, - "loss": 0.1963, - "step": 3530 - }, - { - "epoch": 0.35964644925327643, - "grad_norm": 4.03125, - "learning_rate": 3.613411950880468e-05, - "loss": 0.2048, - "step": 3540 - }, - { - "epoch": 0.3606623996748959, - "grad_norm": 4.40625, - "learning_rate": 3.606188628001178e-05, - "loss": 0.226, - "step": 3550 - }, - { - "epoch": 0.3616783500965153, - "grad_norm": 2.375, - "learning_rate": 3.598953803921947e-05, - "loss": 0.1884, - "step": 3560 - }, - { - "epoch": 0.3626943005181347, - "grad_norm": 3.21875, - "learning_rate": 3.591707553864266e-05, - "loss": 0.224, - "step": 3570 - }, - { - "epoch": 0.3637102509397541, - "grad_norm": 3.5625, - "learning_rate": 3.584449953168423e-05, - "loss": 0.1866, - "step": 3580 - }, - { - "epoch": 0.36472620136137357, - "grad_norm": 2.359375, - "learning_rate": 3.577181077292722e-05, - "loss": 0.1663, - "step": 3590 - }, - { - "epoch": 0.36574215178299296, - "grad_norm": 5.0, - "learning_rate": 3.569901001812696e-05, - "loss": 0.2032, - "step": 3600 - }, - { - "epoch": 0.3667581022046124, - "grad_norm": 1.953125, - "learning_rate": 3.562609802420321e-05, - "loss": 0.2395, - "step": 3610 - }, - { - "epoch": 0.36777405262623186, - "grad_norm": 3.796875, - "learning_rate": 3.555307554923229e-05, - "loss": 0.1799, - "step": 3620 - }, - { - "epoch": 0.36879000304785126, - "grad_norm": 4.4375, - "learning_rate": 3.547994335243925e-05, - "loss": 0.1771, - "step": 3630 - }, - { - "epoch": 0.3698059534694707, - "grad_norm": 1.890625, - "learning_rate": 3.540670219418989e-05, - "loss": 0.2123, - "step": 3640 - }, - { - "epoch": 0.3708219038910901, - "grad_norm": 4.03125, - "learning_rate": 3.53333528359829e-05, - "loss": 0.2159, - "step": 3650 - }, - { - "epoch": 0.37183785431270955, - "grad_norm": 3.265625, - "learning_rate": 3.525989604044198e-05, - "loss": 0.2749, - "step": 3660 - }, - { - "epoch": 0.37285380473432894, - "grad_norm": 1.4375, - "learning_rate": 3.5186332571307826e-05, - "loss": 0.1613, - "step": 3670 - }, - { - "epoch": 0.3738697551559484, - "grad_norm": 3.984375, - "learning_rate": 3.511266319343025e-05, - "loss": 0.1877, - "step": 3680 - }, - { - "epoch": 0.37488570557756784, - "grad_norm": 2.203125, - "learning_rate": 3.503888867276022e-05, - "loss": 0.2185, - "step": 3690 - }, - { - "epoch": 0.37590165599918723, - "grad_norm": 1.5078125, - "learning_rate": 3.4965009776341894e-05, - "loss": 0.2195, - "step": 3700 - }, - { - "epoch": 0.3769176064208067, - "grad_norm": 4.375, - "learning_rate": 3.489102727230461e-05, - "loss": 0.2344, - "step": 3710 - }, - { - "epoch": 0.3779335568424261, - "grad_norm": 2.984375, - "learning_rate": 3.481694192985496e-05, - "loss": 0.1863, - "step": 3720 - }, - { - "epoch": 0.3789495072640455, - "grad_norm": 1.1328125, - "learning_rate": 3.474275451926875e-05, - "loss": 0.1894, - "step": 3730 - }, - { - "epoch": 0.3799654576856649, - "grad_norm": 2.265625, - "learning_rate": 3.4668465811883e-05, - "loss": 0.2127, - "step": 3740 - }, - { - "epoch": 0.38098140810728437, - "grad_norm": 2.921875, - "learning_rate": 3.4594076580087914e-05, - "loss": 0.2125, - "step": 3750 - }, - { - "epoch": 0.38199735852890376, - "grad_norm": 2.390625, - "learning_rate": 3.451958759731889e-05, - "loss": 0.1801, - "step": 3760 - }, - { - "epoch": 0.3830133089505232, - "grad_norm": 3.046875, - "learning_rate": 3.4444999638048456e-05, - "loss": 0.1949, - "step": 3770 - }, - { - "epoch": 0.38402925937214266, - "grad_norm": 2.890625, - "learning_rate": 3.437031347777817e-05, - "loss": 0.2719, - "step": 3780 - }, - { - "epoch": 0.38504520979376206, - "grad_norm": 3.9375, - "learning_rate": 3.4295529893030634e-05, - "loss": 0.1697, - "step": 3790 - }, - { - "epoch": 0.3860611602153815, - "grad_norm": 2.0625, - "learning_rate": 3.422064966134138e-05, - "loss": 0.1557, - "step": 3800 - }, - { - "epoch": 0.3870771106370009, - "grad_norm": 2.234375, - "learning_rate": 3.4145673561250794e-05, - "loss": 0.2129, - "step": 3810 - }, - { - "epoch": 0.38809306105862035, - "grad_norm": 4.96875, - "learning_rate": 3.4070602372296e-05, - "loss": 0.2068, - "step": 3820 - }, - { - "epoch": 0.38910901148023974, - "grad_norm": 2.234375, - "learning_rate": 3.39954368750028e-05, - "loss": 0.1634, - "step": 3830 - }, - { - "epoch": 0.3901249619018592, - "grad_norm": 1.75, - "learning_rate": 3.392017785087752e-05, - "loss": 0.2299, - "step": 3840 - }, - { - "epoch": 0.39114091232347864, - "grad_norm": 3.90625, - "learning_rate": 3.38448260823989e-05, - "loss": 0.1585, - "step": 3850 - }, - { - "epoch": 0.39215686274509803, - "grad_norm": 2.8125, - "learning_rate": 3.376938235300996e-05, - "loss": 0.2382, - "step": 3860 - }, - { - "epoch": 0.3931728131667175, - "grad_norm": 5.375, - "learning_rate": 3.369384744710984e-05, - "loss": 0.1987, - "step": 3870 - }, - { - "epoch": 0.3941887635883369, - "grad_norm": 2.578125, - "learning_rate": 3.361822215004566e-05, - "loss": 0.2316, - "step": 3880 - }, - { - "epoch": 0.3952047140099563, - "grad_norm": 2.0, - "learning_rate": 3.354250724810436e-05, - "loss": 0.2019, - "step": 3890 - }, - { - "epoch": 0.3962206644315757, - "grad_norm": 2.3125, - "learning_rate": 3.34667035285045e-05, - "loss": 0.187, - "step": 3900 - }, - { - "epoch": 0.39723661485319517, - "grad_norm": 3.53125, - "learning_rate": 3.339081177938811e-05, - "loss": 0.2353, - "step": 3910 - }, - { - "epoch": 0.39825256527481456, - "grad_norm": 1.9609375, - "learning_rate": 3.331483278981244e-05, - "loss": 0.2078, - "step": 3920 - }, - { - "epoch": 0.399268515696434, - "grad_norm": 1.2109375, - "learning_rate": 3.323876734974183e-05, - "loss": 0.1761, - "step": 3930 - }, - { - "epoch": 0.40028446611805346, - "grad_norm": 4.0625, - "learning_rate": 3.316261625003943e-05, - "loss": 0.2081, - "step": 3940 - }, - { - "epoch": 0.40130041653967286, - "grad_norm": 1.953125, - "learning_rate": 3.308638028245902e-05, - "loss": 0.2087, - "step": 3950 - }, - { - "epoch": 0.4023163669612923, - "grad_norm": 2.390625, - "learning_rate": 3.301006023963676e-05, - "loss": 0.1579, - "step": 3960 - }, - { - "epoch": 0.4033323173829117, - "grad_norm": 3.53125, - "learning_rate": 3.293365691508295e-05, - "loss": 0.1904, - "step": 3970 - }, - { - "epoch": 0.40434826780453115, - "grad_norm": 3.0, - "learning_rate": 3.285717110317379e-05, - "loss": 0.1991, - "step": 3980 - }, - { - "epoch": 0.40536421822615054, - "grad_norm": 7.21875, - "learning_rate": 3.27806035991431e-05, - "loss": 0.1445, - "step": 3990 - }, - { - "epoch": 0.40638016864777, - "grad_norm": 1.0859375, - "learning_rate": 3.2703955199074075e-05, - "loss": 0.2393, - "step": 4000 - }, - { - "epoch": 0.40739611906938944, - "grad_norm": 4.5625, - "learning_rate": 3.262722669989098e-05, - "loss": 0.1789, - "step": 4010 - }, - { - "epoch": 0.40841206949100883, - "grad_norm": 3.09375, - "learning_rate": 3.255041889935092e-05, - "loss": 0.1511, - "step": 4020 - }, - { - "epoch": 0.4094280199126283, - "grad_norm": 1.90625, - "learning_rate": 3.247353259603547e-05, - "loss": 0.2066, - "step": 4030 - }, - { - "epoch": 0.4104439703342477, - "grad_norm": 2.28125, - "learning_rate": 3.239656858934242e-05, - "loss": 0.1564, - "step": 4040 - }, - { - "epoch": 0.4114599207558671, - "grad_norm": 2.609375, - "learning_rate": 3.231952767947746e-05, - "loss": 0.1503, - "step": 4050 - }, - { - "epoch": 0.4124758711774865, - "grad_norm": 1.4453125, - "learning_rate": 3.2242410667445844e-05, - "loss": 0.1633, - "step": 4060 - }, - { - "epoch": 0.41349182159910597, - "grad_norm": 3.015625, - "learning_rate": 3.2165218355044076e-05, - "loss": 0.1492, - "step": 4070 - }, - { - "epoch": 0.41450777202072536, - "grad_norm": 3.234375, - "learning_rate": 3.2087951544851566e-05, - "loss": 0.3051, - "step": 4080 - }, - { - "epoch": 0.4155237224423448, - "grad_norm": 2.9375, - "learning_rate": 3.20106110402223e-05, - "loss": 0.2229, - "step": 4090 - }, - { - "epoch": 0.41653967286396426, - "grad_norm": 3.171875, - "learning_rate": 3.1933197645276455e-05, - "loss": 0.2224, - "step": 4100 - }, - { - "epoch": 0.41755562328558365, - "grad_norm": 2.09375, - "learning_rate": 3.185571216489209e-05, - "loss": 0.1297, - "step": 4110 - }, - { - "epoch": 0.4185715737072031, - "grad_norm": 3.625, - "learning_rate": 3.177815540469669e-05, - "loss": 0.2074, - "step": 4120 - }, - { - "epoch": 0.4195875241288225, - "grad_norm": 2.296875, - "learning_rate": 3.1700528171058916e-05, - "loss": 0.1949, - "step": 4130 - }, - { - "epoch": 0.42060347455044195, - "grad_norm": 3.8125, - "learning_rate": 3.162283127108011e-05, - "loss": 0.1661, - "step": 4140 - }, - { - "epoch": 0.42161942497206134, - "grad_norm": 2.5, - "learning_rate": 3.154506551258594e-05, - "loss": 0.2275, - "step": 4150 - }, - { - "epoch": 0.4226353753936808, - "grad_norm": 2.96875, - "learning_rate": 3.146723170411804e-05, - "loss": 0.2242, - "step": 4160 - }, - { - "epoch": 0.42365132581530024, - "grad_norm": 6.625, - "learning_rate": 3.138933065492552e-05, - "loss": 0.1897, - "step": 4170 - }, - { - "epoch": 0.42466727623691963, - "grad_norm": 0.8515625, - "learning_rate": 3.131136317495665e-05, - "loss": 0.1629, - "step": 4180 - }, - { - "epoch": 0.4256832266585391, - "grad_norm": 0.94140625, - "learning_rate": 3.1233330074850364e-05, - "loss": 0.1535, - "step": 4190 - }, - { - "epoch": 0.4266991770801585, - "grad_norm": 2.6875, - "learning_rate": 3.115523216592786e-05, - "loss": 0.2494, - "step": 4200 - }, - { - "epoch": 0.4277151275017779, - "grad_norm": 2.578125, - "learning_rate": 3.107707026018417e-05, - "loss": 0.1705, - "step": 4210 - }, - { - "epoch": 0.4287310779233973, - "grad_norm": 3.0625, - "learning_rate": 3.09988451702797e-05, - "loss": 0.1507, - "step": 4220 - }, - { - "epoch": 0.42974702834501677, - "grad_norm": 2.421875, - "learning_rate": 3.0920557709531804e-05, - "loss": 0.3071, - "step": 4230 - }, - { - "epoch": 0.43076297876663616, - "grad_norm": 3.640625, - "learning_rate": 3.0842208691906306e-05, - "loss": 0.199, - "step": 4240 - }, - { - "epoch": 0.4317789291882556, - "grad_norm": 3.5625, - "learning_rate": 3.076379893200904e-05, - "loss": 0.1987, - "step": 4250 - }, - { - "epoch": 0.43279487960987506, - "grad_norm": 3.65625, - "learning_rate": 3.068532924507739e-05, - "loss": 0.1945, - "step": 4260 - }, - { - "epoch": 0.43381083003149445, - "grad_norm": 5.875, - "learning_rate": 3.060680044697183e-05, - "loss": 0.1937, - "step": 4270 - }, - { - "epoch": 0.4348267804531139, - "grad_norm": 2.859375, - "learning_rate": 3.052821335416739e-05, - "loss": 0.1643, - "step": 4280 - }, - { - "epoch": 0.4358427308747333, - "grad_norm": 3.296875, - "learning_rate": 3.0449568783745203e-05, - "loss": 0.1455, - "step": 4290 - }, - { - "epoch": 0.43685868129635275, - "grad_norm": 0.427734375, - "learning_rate": 3.0370867553384023e-05, - "loss": 0.1891, - "step": 4300 - }, - { - "epoch": 0.43787463171797214, - "grad_norm": 0.361328125, - "learning_rate": 3.029211048135171e-05, - "loss": 0.1377, - "step": 4310 - }, - { - "epoch": 0.4388905821395916, - "grad_norm": 1.8203125, - "learning_rate": 3.021329838649668e-05, - "loss": 0.2194, - "step": 4320 - }, - { - "epoch": 0.43990653256121104, - "grad_norm": 1.8828125, - "learning_rate": 3.0134432088239462e-05, - "loss": 0.1915, - "step": 4330 - }, - { - "epoch": 0.44092248298283043, - "grad_norm": 2.015625, - "learning_rate": 3.0055512406564146e-05, - "loss": 0.1794, - "step": 4340 - }, - { - "epoch": 0.4419384334044499, - "grad_norm": 2.546875, - "learning_rate": 2.9976540162009836e-05, - "loss": 0.2154, - "step": 4350 - }, - { - "epoch": 0.4429543838260693, - "grad_norm": 4.09375, - "learning_rate": 2.9897516175662155e-05, - "loss": 0.1861, - "step": 4360 - }, - { - "epoch": 0.4439703342476887, - "grad_norm": 3.953125, - "learning_rate": 2.9818441269144693e-05, - "loss": 0.1857, - "step": 4370 - }, - { - "epoch": 0.4449862846693081, - "grad_norm": 2.234375, - "learning_rate": 2.9739316264610452e-05, - "loss": 0.1493, - "step": 4380 - }, - { - "epoch": 0.44600223509092757, - "grad_norm": 1.109375, - "learning_rate": 2.966014198473332e-05, - "loss": 0.186, - "step": 4390 - }, - { - "epoch": 0.44701818551254696, - "grad_norm": 4.5625, - "learning_rate": 2.9580919252699502e-05, - "loss": 0.1963, - "step": 4400 - }, - { - "epoch": 0.4480341359341664, - "grad_norm": 7.3125, - "learning_rate": 2.9501648892198984e-05, - "loss": 0.2882, - "step": 4410 - }, - { - "epoch": 0.44905008635578586, - "grad_norm": 3.03125, - "learning_rate": 2.942233172741693e-05, - "loss": 0.2154, - "step": 4420 - }, - { - "epoch": 0.45006603677740525, - "grad_norm": 2.421875, - "learning_rate": 2.934296858302515e-05, - "loss": 0.2228, - "step": 4430 - }, - { - "epoch": 0.4510819871990247, - "grad_norm": 1.6015625, - "learning_rate": 2.9263560284173485e-05, - "loss": 0.1637, - "step": 4440 - }, - { - "epoch": 0.4520979376206441, - "grad_norm": 4.5, - "learning_rate": 2.91841076564813e-05, - "loss": 0.1396, - "step": 4450 - }, - { - "epoch": 0.45311388804226355, - "grad_norm": 1.9609375, - "learning_rate": 2.9104611526028808e-05, - "loss": 0.186, - "step": 4460 - }, - { - "epoch": 0.45412983846388294, - "grad_norm": 2.046875, - "learning_rate": 2.902507271934855e-05, - "loss": 0.1706, - "step": 4470 - }, - { - "epoch": 0.4551457888855024, - "grad_norm": 2.390625, - "learning_rate": 2.8945492063416768e-05, - "loss": 0.2191, - "step": 4480 - }, - { - "epoch": 0.45616173930712184, - "grad_norm": 2.734375, - "learning_rate": 2.8865870385644823e-05, - "loss": 0.1651, - "step": 4490 - }, - { - "epoch": 0.45717768972874123, - "grad_norm": 4.4375, - "learning_rate": 2.8786208513870583e-05, - "loss": 0.1907, - "step": 4500 - }, - { - "epoch": 0.4581936401503607, - "grad_norm": 1.9609375, - "learning_rate": 2.8706507276349815e-05, - "loss": 0.2256, - "step": 4510 - }, - { - "epoch": 0.4592095905719801, - "grad_norm": 3.375, - "learning_rate": 2.8626767501747588e-05, - "loss": 0.215, - "step": 4520 - }, - { - "epoch": 0.4602255409935995, - "grad_norm": 2.296875, - "learning_rate": 2.854699001912964e-05, - "loss": 0.2241, - "step": 4530 - }, - { - "epoch": 0.4612414914152189, - "grad_norm": 2.078125, - "learning_rate": 2.846717565795376e-05, - "loss": 0.1541, - "step": 4540 - }, - { - "epoch": 0.46225744183683837, - "grad_norm": 0.81640625, - "learning_rate": 2.8387325248061164e-05, - "loss": 0.1718, - "step": 4550 - }, - { - "epoch": 0.46327339225845776, - "grad_norm": 5.6875, - "learning_rate": 2.8307439619667897e-05, - "loss": 0.259, - "step": 4560 - }, - { - "epoch": 0.4642893426800772, - "grad_norm": 1.78125, - "learning_rate": 2.8227519603356157e-05, - "loss": 0.2205, - "step": 4570 - }, - { - "epoch": 0.46530529310169666, - "grad_norm": 4.78125, - "learning_rate": 2.8147566030065677e-05, - "loss": 0.2256, - "step": 4580 - }, - { - "epoch": 0.46632124352331605, - "grad_norm": 3.296875, - "learning_rate": 2.8067579731085085e-05, - "loss": 0.1671, - "step": 4590 - }, - { - "epoch": 0.4673371939449355, - "grad_norm": 3.265625, - "learning_rate": 2.7987561538043273e-05, - "loss": 0.2471, - "step": 4600 - }, - { - "epoch": 0.4683531443665549, - "grad_norm": 3.390625, - "learning_rate": 2.7907512282900727e-05, - "loss": 0.1749, - "step": 4610 - }, - { - "epoch": 0.46936909478817435, - "grad_norm": 3.140625, - "learning_rate": 2.782743279794091e-05, - "loss": 0.2276, - "step": 4620 - }, - { - "epoch": 0.47038504520979374, - "grad_norm": 2.921875, - "learning_rate": 2.7747323915761574e-05, - "loss": 0.1971, - "step": 4630 - }, - { - "epoch": 0.4714009956314132, - "grad_norm": 4.15625, - "learning_rate": 2.7667186469266122e-05, - "loss": 0.1951, - "step": 4640 - }, - { - "epoch": 0.47241694605303264, - "grad_norm": 2.953125, - "learning_rate": 2.7587021291654924e-05, - "loss": 0.2045, - "step": 4650 - }, - { - "epoch": 0.47343289647465203, - "grad_norm": 1.6640625, - "learning_rate": 2.750682921641672e-05, - "loss": 0.155, - "step": 4660 - }, - { - "epoch": 0.4744488468962715, - "grad_norm": 4.375, - "learning_rate": 2.7426611077319864e-05, - "loss": 0.2038, - "step": 4670 - }, - { - "epoch": 0.4754647973178909, - "grad_norm": 5.5, - "learning_rate": 2.734636770840372e-05, - "loss": 0.159, - "step": 4680 - }, - { - "epoch": 0.4764807477395103, - "grad_norm": 1.703125, - "learning_rate": 2.7266099943969976e-05, - "loss": 0.1566, - "step": 4690 - }, - { - "epoch": 0.4774966981611297, - "grad_norm": 0.81640625, - "learning_rate": 2.7185808618573943e-05, - "loss": 0.1927, - "step": 4700 - }, - { - "epoch": 0.47851264858274917, - "grad_norm": 0.81640625, - "learning_rate": 2.710549456701592e-05, - "loss": 0.1873, - "step": 4710 - }, - { - "epoch": 0.47952859900436856, - "grad_norm": 3.828125, - "learning_rate": 2.702515862433247e-05, - "loss": 0.2474, - "step": 4720 - }, - { - "epoch": 0.480544549425988, - "grad_norm": 1.1640625, - "learning_rate": 2.6944801625787795e-05, - "loss": 0.204, - "step": 4730 - }, - { - "epoch": 0.48156049984760746, - "grad_norm": 2.953125, - "learning_rate": 2.6864424406864984e-05, - "loss": 0.1758, - "step": 4740 - }, - { - "epoch": 0.48257645026922685, - "grad_norm": 3.265625, - "learning_rate": 2.6784027803257377e-05, - "loss": 0.161, - "step": 4750 - }, - { - "epoch": 0.4835924006908463, - "grad_norm": 2.046875, - "learning_rate": 2.6703612650859848e-05, - "loss": 0.1469, - "step": 4760 - }, - { - "epoch": 0.4846083511124657, - "grad_norm": 4.03125, - "learning_rate": 2.6623179785760148e-05, - "loss": 0.1858, - "step": 4770 - }, - { - "epoch": 0.48562430153408515, - "grad_norm": 2.65625, - "learning_rate": 2.6542730044230175e-05, - "loss": 0.176, - "step": 4780 - }, - { - "epoch": 0.48664025195570454, - "grad_norm": 2.59375, - "learning_rate": 2.6462264262717278e-05, - "loss": 0.1657, - "step": 4790 - }, - { - "epoch": 0.487656202377324, - "grad_norm": 4.78125, - "learning_rate": 2.6381783277835605e-05, - "loss": 0.2705, - "step": 4800 - }, - { - "epoch": 0.48867215279894344, - "grad_norm": 3.65625, - "learning_rate": 2.6301287926357355e-05, - "loss": 0.2252, - "step": 4810 - }, - { - "epoch": 0.48968810322056283, - "grad_norm": 0.734375, - "learning_rate": 2.622077904520411e-05, - "loss": 0.2141, - "step": 4820 - }, - { - "epoch": 0.4907040536421823, - "grad_norm": 5.15625, - "learning_rate": 2.6140257471438108e-05, - "loss": 0.1935, - "step": 4830 - }, - { - "epoch": 0.4917200040638017, - "grad_norm": 3.625, - "learning_rate": 2.6059724042253574e-05, - "loss": 0.2121, - "step": 4840 - }, - { - "epoch": 0.4927359544854211, - "grad_norm": 1.2890625, - "learning_rate": 2.5979179594967983e-05, - "loss": 0.1221, - "step": 4850 - }, - { - "epoch": 0.4937519049070405, - "grad_norm": 3.4375, - "learning_rate": 2.5898624967013367e-05, - "loss": 0.2208, - "step": 4860 - }, - { - "epoch": 0.49476785532865997, - "grad_norm": 2.40625, - "learning_rate": 2.5818060995927607e-05, - "loss": 0.1904, - "step": 4870 - }, - { - "epoch": 0.49578380575027936, - "grad_norm": 2.921875, - "learning_rate": 2.573748851934574e-05, - "loss": 0.1658, - "step": 4880 - }, - { - "epoch": 0.4967997561718988, - "grad_norm": 1.6640625, - "learning_rate": 2.5656908374991213e-05, - "loss": 0.1626, - "step": 4890 - }, - { - "epoch": 0.49781570659351826, - "grad_norm": 1.8046875, - "learning_rate": 2.557632140066721e-05, - "loss": 0.1905, - "step": 4900 - }, - { - "epoch": 0.49883165701513765, - "grad_norm": 4.875, - "learning_rate": 2.5495728434247917e-05, - "loss": 0.2591, - "step": 4910 - }, - { - "epoch": 0.4998476074367571, - "grad_norm": 1.4453125, - "learning_rate": 2.5415130313669845e-05, - "loss": 0.1359, - "step": 4920 - }, - { - "epoch": 0.5008635578583766, - "grad_norm": 2.109375, - "learning_rate": 2.5334527876923063e-05, - "loss": 0.2353, - "step": 4930 - }, - { - "epoch": 0.501879508279996, - "grad_norm": 3.546875, - "learning_rate": 2.5253921962042525e-05, - "loss": 0.2173, - "step": 4940 - }, - { - "epoch": 0.5028954587016153, - "grad_norm": 1.8125, - "learning_rate": 2.5173313407099373e-05, - "loss": 0.1631, - "step": 4950 - }, - { - "epoch": 0.5039114091232347, - "grad_norm": 2.671875, - "learning_rate": 2.5092703050192163e-05, - "loss": 0.1884, - "step": 4960 - }, - { - "epoch": 0.5049273595448542, - "grad_norm": 2.5625, - "learning_rate": 2.501209172943819e-05, - "loss": 0.217, - "step": 4970 - }, - { - "epoch": 0.5059433099664736, - "grad_norm": 4.375, - "learning_rate": 2.49314802829648e-05, - "loss": 0.1854, - "step": 4980 - }, - { - "epoch": 0.506959260388093, - "grad_norm": 2.3125, - "learning_rate": 2.4850869548900628e-05, - "loss": 0.2049, - "step": 4990 - }, - { - "epoch": 0.5079752108097125, - "grad_norm": 3.859375, - "learning_rate": 2.477026036536688e-05, - "loss": 0.2093, - "step": 5000 - } - ], - "logging_steps": 10, - "max_steps": 9843, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "total_flos": 0.0, - "train_batch_size": 7, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoints/checkpoint-5000/training_args.bin b/checkpoints/checkpoint-5000/training_args.bin deleted file mode 100644 index 6bb0a4ec10ab29ca1942aecdfa4212d352e373f1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-5000/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 -size 4984 diff --git a/checkpoints/checkpoint-5500/adapter_config.json b/checkpoints/checkpoint-5500/adapter_config.json deleted file mode 100644 index 2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-5500/adapter_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "alpha_pattern": {}, - "auto_mapping": null, - "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": "gaussian", - "layer_replication": null, - "layers_pattern": null, - "layers_to_transform": null, - "loftq_config": {}, - "lora_alpha": 32, - "lora_dropout": 0.1, - "megatron_config": null, - "megatron_core": "megatron.core", - "modules_to_save": null, - "peft_type": "LORA", - "r": 8, - "rank_pattern": {}, - "revision": null, - "target_modules": [ - "v_proj", - "q_proj" - ], - "task_type": "CAUSAL_LM", - "use_dora": false, - "use_rslora": false -} \ No newline at end of file diff --git a/checkpoints/checkpoint-5500/adapter_model.safetensors b/checkpoints/checkpoint-5500/adapter_model.safetensors deleted file mode 100644 index 21091e5fb953f81db93b5a0a1b0e35419d285ff5..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-5500/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bb810b92dc1e55cf94b723b80404b69163c821876b612daca7c18abd50e5cc29 -size 5919456 diff --git a/checkpoints/checkpoint-5500/optimizer.pt b/checkpoints/checkpoint-5500/optimizer.pt deleted file mode 100644 index 283b36631e755fd00b60aa6cd5641a403ac76671..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-5500/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cf105da9fd77aa95ac72217635110bc055276ac42b13c61c0ea8afe029df437c -size 11930938 diff --git a/checkpoints/checkpoint-5500/rng_state_0.pth b/checkpoints/checkpoint-5500/rng_state_0.pth deleted file mode 100644 index 7b48e0fe6cf94e78b55f9c79e0ea7aa19d768484..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-5500/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:70343f9cb54717109b4b086674f4317c77a6d5236e01a3f60ae87d47f6637943 -size 15024 diff --git a/checkpoints/checkpoint-5500/rng_state_1.pth b/checkpoints/checkpoint-5500/rng_state_1.pth deleted file mode 100644 index f870e3cd3d0baad3b6137f60d4d1eed197e6b688..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-5500/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:17e6a7bceffa2b144c23b4b2c0efacf64ed5a2f0808ff897d8e65e0e129c709d -size 15024 diff --git a/checkpoints/checkpoint-5500/rng_state_2.pth b/checkpoints/checkpoint-5500/rng_state_2.pth deleted file mode 100644 index a3f7b4465900221f00761bcc0d874e660643e438..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-5500/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:109af0ffa2fb7f677448e11bba29f896c314794923e50f5ab77002d63db44682 -size 15024 diff --git a/checkpoints/checkpoint-5500/rng_state_3.pth b/checkpoints/checkpoint-5500/rng_state_3.pth deleted file mode 100644 index 3912593f43081836761363a4b700970c467e59f3..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-5500/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:043288038a590b0ede4ede98760e44fe949c1db3d9630240030a93cb18c91259 -size 15024 diff --git a/checkpoints/checkpoint-5500/scheduler.pt b/checkpoints/checkpoint-5500/scheduler.pt deleted file mode 100644 index 8b047cded6249de59455f00473cd79106488c266..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-5500/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5608c457217d0ace38955805f4007ad8ac45872c2d7cde753cbd1a93ae8304bd -size 1064 diff --git a/checkpoints/checkpoint-5500/trainer_state.json b/checkpoints/checkpoint-5500/trainer_state.json deleted file mode 100644 index d651783d2a32611ed806f44e48a3ee9af386d8df..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-5500/trainer_state.json +++ /dev/null @@ -1,3871 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.5587727318906838, - "eval_steps": 500, - "global_step": 5500, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001015950421619425, - "grad_norm": 17.625, - "learning_rate": 5e-06, - "loss": 3.4264, - "step": 10 - }, - { - "epoch": 0.00203190084323885, - "grad_norm": 12.5625, - "learning_rate": 1e-05, - "loss": 3.432, - "step": 20 - }, - { - "epoch": 0.003047851264858275, - "grad_norm": 14.0625, - "learning_rate": 1.5e-05, - "loss": 3.23, - "step": 30 - }, - { - "epoch": 0.0040638016864777, - "grad_norm": 12.4375, - "learning_rate": 2e-05, - "loss": 2.9762, - "step": 40 - }, - { - "epoch": 0.005079752108097125, - "grad_norm": 10.0625, - "learning_rate": 2.5e-05, - "loss": 2.6173, - "step": 50 - }, - { - "epoch": 0.00609570252971655, - "grad_norm": 10.1875, - "learning_rate": 3e-05, - "loss": 2.2004, - "step": 60 - }, - { - "epoch": 0.007111652951335975, - "grad_norm": 7.03125, - "learning_rate": 3.5e-05, - "loss": 1.4176, - "step": 70 - }, - { - "epoch": 0.0081276033729554, - "grad_norm": 4.375, - "learning_rate": 4e-05, - "loss": 1.0122, - "step": 80 - }, - { - "epoch": 0.009143553794574825, - "grad_norm": 6.5625, - "learning_rate": 4.5e-05, - "loss": 0.9116, - "step": 90 - }, - { - "epoch": 0.01015950421619425, - "grad_norm": 5.28125, - "learning_rate": 5e-05, - "loss": 0.6832, - "step": 100 - }, - { - "epoch": 0.011175454637813675, - "grad_norm": 5.5, - "learning_rate": 4.9999870035728426e-05, - "loss": 0.7355, - "step": 110 - }, - { - "epoch": 0.0121914050594331, - "grad_norm": 5.1875, - "learning_rate": 4.9999480144264944e-05, - "loss": 0.6673, - "step": 120 - }, - { - "epoch": 0.013207355481052525, - "grad_norm": 4.5, - "learning_rate": 4.9998830329663314e-05, - "loss": 0.6792, - "step": 130 - }, - { - "epoch": 0.01422330590267195, - "grad_norm": 3.9375, - "learning_rate": 4.9997920598679756e-05, - "loss": 0.6207, - "step": 140 - }, - { - "epoch": 0.015239256324291375, - "grad_norm": 3.15625, - "learning_rate": 4.999675096077286e-05, - "loss": 0.483, - "step": 150 - }, - { - "epoch": 0.0162552067459108, - "grad_norm": 5.28125, - "learning_rate": 4.999532142810354e-05, - "loss": 0.5319, - "step": 160 - }, - { - "epoch": 0.017271157167530225, - "grad_norm": 4.59375, - "learning_rate": 4.999363201553483e-05, - "loss": 0.6052, - "step": 170 - }, - { - "epoch": 0.01828710758914965, - "grad_norm": 5.03125, - "learning_rate": 4.9991682740631794e-05, - "loss": 0.4258, - "step": 180 - }, - { - "epoch": 0.019303058010769075, - "grad_norm": 3.859375, - "learning_rate": 4.998947362366133e-05, - "loss": 0.4309, - "step": 190 - }, - { - "epoch": 0.0203190084323885, - "grad_norm": 3.328125, - "learning_rate": 4.998700468759193e-05, - "loss": 0.3957, - "step": 200 - }, - { - "epoch": 0.021334958854007924, - "grad_norm": 4.9375, - "learning_rate": 4.9984275958093475e-05, - "loss": 0.4777, - "step": 210 - }, - { - "epoch": 0.02235090927562735, - "grad_norm": 4.78125, - "learning_rate": 4.998128746353695e-05, - "loss": 0.3549, - "step": 220 - }, - { - "epoch": 0.023366859697246774, - "grad_norm": 4.0625, - "learning_rate": 4.997803923499417e-05, - "loss": 0.4447, - "step": 230 - }, - { - "epoch": 0.0243828101188662, - "grad_norm": 6.375, - "learning_rate": 4.99745313062374e-05, - "loss": 0.3808, - "step": 240 - }, - { - "epoch": 0.025398760540485624, - "grad_norm": 3.59375, - "learning_rate": 4.99707637137391e-05, - "loss": 0.3827, - "step": 250 - }, - { - "epoch": 0.02641471096210505, - "grad_norm": 3.015625, - "learning_rate": 4.996673649667145e-05, - "loss": 0.3694, - "step": 260 - }, - { - "epoch": 0.027430661383724474, - "grad_norm": 2.296875, - "learning_rate": 4.9962449696906e-05, - "loss": 0.3586, - "step": 270 - }, - { - "epoch": 0.0284466118053439, - "grad_norm": 4.125, - "learning_rate": 4.9957903359013214e-05, - "loss": 0.3832, - "step": 280 - }, - { - "epoch": 0.029462562226963324, - "grad_norm": 3.296875, - "learning_rate": 4.995309753026201e-05, - "loss": 0.328, - "step": 290 - }, - { - "epoch": 0.03047851264858275, - "grad_norm": 4.5, - "learning_rate": 4.994803226061927e-05, - "loss": 0.3667, - "step": 300 - }, - { - "epoch": 0.03149446307020217, - "grad_norm": 4.3125, - "learning_rate": 4.994270760274933e-05, - "loss": 0.3811, - "step": 310 - }, - { - "epoch": 0.0325104134918216, - "grad_norm": 3.421875, - "learning_rate": 4.99371236120134e-05, - "loss": 0.3065, - "step": 320 - }, - { - "epoch": 0.03352636391344102, - "grad_norm": 4.6875, - "learning_rate": 4.993128034646902e-05, - "loss": 0.4177, - "step": 330 - }, - { - "epoch": 0.03454231433506045, - "grad_norm": 3.046875, - "learning_rate": 4.992517786686947e-05, - "loss": 0.33, - "step": 340 - }, - { - "epoch": 0.03555826475667987, - "grad_norm": 1.8828125, - "learning_rate": 4.9918816236663077e-05, - "loss": 0.3287, - "step": 350 - }, - { - "epoch": 0.0365742151782993, - "grad_norm": 3.8125, - "learning_rate": 4.991219552199262e-05, - "loss": 0.2934, - "step": 360 - }, - { - "epoch": 0.03759016559991872, - "grad_norm": 4.28125, - "learning_rate": 4.99053157916946e-05, - "loss": 0.3176, - "step": 370 - }, - { - "epoch": 0.03860611602153815, - "grad_norm": 2.609375, - "learning_rate": 4.989817711729856e-05, - "loss": 0.3318, - "step": 380 - }, - { - "epoch": 0.03962206644315757, - "grad_norm": 2.375, - "learning_rate": 4.98907795730263e-05, - "loss": 0.3234, - "step": 390 - }, - { - "epoch": 0.040638016864777, - "grad_norm": 4.46875, - "learning_rate": 4.988312323579114e-05, - "loss": 0.267, - "step": 400 - }, - { - "epoch": 0.04165396728639642, - "grad_norm": 3.75, - "learning_rate": 4.98752081851971e-05, - "loss": 0.3081, - "step": 410 - }, - { - "epoch": 0.04266991770801585, - "grad_norm": 2.203125, - "learning_rate": 4.986703450353809e-05, - "loss": 0.2917, - "step": 420 - }, - { - "epoch": 0.04368586812963527, - "grad_norm": 1.6015625, - "learning_rate": 4.985860227579703e-05, - "loss": 0.2805, - "step": 430 - }, - { - "epoch": 0.0447018185512547, - "grad_norm": 3.140625, - "learning_rate": 4.984991158964499e-05, - "loss": 0.3534, - "step": 440 - }, - { - "epoch": 0.04571776897287412, - "grad_norm": 3.296875, - "learning_rate": 4.9840962535440265e-05, - "loss": 0.335, - "step": 450 - }, - { - "epoch": 0.04673371939449355, - "grad_norm": 3.25, - "learning_rate": 4.983175520622744e-05, - "loss": 0.2544, - "step": 460 - }, - { - "epoch": 0.04774966981611297, - "grad_norm": 2.25, - "learning_rate": 4.982228969773642e-05, - "loss": 0.3449, - "step": 470 - }, - { - "epoch": 0.0487656202377324, - "grad_norm": 4.9375, - "learning_rate": 4.9812566108381435e-05, - "loss": 0.2964, - "step": 480 - }, - { - "epoch": 0.04978157065935182, - "grad_norm": 1.5703125, - "learning_rate": 4.9802584539260035e-05, - "loss": 0.2799, - "step": 490 - }, - { - "epoch": 0.05079752108097125, - "grad_norm": 2.828125, - "learning_rate": 4.979234509415199e-05, - "loss": 0.3231, - "step": 500 - }, - { - "epoch": 0.05181347150259067, - "grad_norm": 2.9375, - "learning_rate": 4.978184787951828e-05, - "loss": 0.2943, - "step": 510 - }, - { - "epoch": 0.0528294219242101, - "grad_norm": 2.34375, - "learning_rate": 4.977109300449992e-05, - "loss": 0.2705, - "step": 520 - }, - { - "epoch": 0.05384537234582952, - "grad_norm": 3.140625, - "learning_rate": 4.9760080580916876e-05, - "loss": 0.2998, - "step": 530 - }, - { - "epoch": 0.05486132276744895, - "grad_norm": 3.5625, - "learning_rate": 4.974881072326688e-05, - "loss": 0.2595, - "step": 540 - }, - { - "epoch": 0.05587727318906837, - "grad_norm": 4.25, - "learning_rate": 4.9737283548724236e-05, - "loss": 0.2803, - "step": 550 - }, - { - "epoch": 0.0568932236106878, - "grad_norm": 4.0625, - "learning_rate": 4.97254991771386e-05, - "loss": 0.3511, - "step": 560 - }, - { - "epoch": 0.05790917403230722, - "grad_norm": 2.515625, - "learning_rate": 4.971345773103377e-05, - "loss": 0.312, - "step": 570 - }, - { - "epoch": 0.05892512445392665, - "grad_norm": 3.21875, - "learning_rate": 4.9701159335606365e-05, - "loss": 0.2482, - "step": 580 - }, - { - "epoch": 0.05994107487554607, - "grad_norm": 5.5, - "learning_rate": 4.968860411872454e-05, - "loss": 0.2537, - "step": 590 - }, - { - "epoch": 0.0609570252971655, - "grad_norm": 3.546875, - "learning_rate": 4.967579221092666e-05, - "loss": 0.3125, - "step": 600 - }, - { - "epoch": 0.06197297571878492, - "grad_norm": 2.984375, - "learning_rate": 4.966272374541996e-05, - "loss": 0.2354, - "step": 610 - }, - { - "epoch": 0.06298892614040434, - "grad_norm": 3.6875, - "learning_rate": 4.964939885807912e-05, - "loss": 0.3213, - "step": 620 - }, - { - "epoch": 0.06400487656202378, - "grad_norm": 2.140625, - "learning_rate": 4.9635817687444876e-05, - "loss": 0.3003, - "step": 630 - }, - { - "epoch": 0.0650208269836432, - "grad_norm": 3.484375, - "learning_rate": 4.962198037472259e-05, - "loss": 0.2996, - "step": 640 - }, - { - "epoch": 0.06603677740526262, - "grad_norm": 3.21875, - "learning_rate": 4.9607887063780776e-05, - "loss": 0.2257, - "step": 650 - }, - { - "epoch": 0.06705272782688204, - "grad_norm": 5.375, - "learning_rate": 4.9593537901149564e-05, - "loss": 0.223, - "step": 660 - }, - { - "epoch": 0.06806867824850148, - "grad_norm": 4.1875, - "learning_rate": 4.957893303601924e-05, - "loss": 0.3407, - "step": 670 - }, - { - "epoch": 0.0690846286701209, - "grad_norm": 3.328125, - "learning_rate": 4.956407262023866e-05, - "loss": 0.2589, - "step": 680 - }, - { - "epoch": 0.07010057909174032, - "grad_norm": 2.953125, - "learning_rate": 4.954895680831367e-05, - "loss": 0.2949, - "step": 690 - }, - { - "epoch": 0.07111652951335974, - "grad_norm": 4.0625, - "learning_rate": 4.9533585757405506e-05, - "loss": 0.2995, - "step": 700 - }, - { - "epoch": 0.07213247993497918, - "grad_norm": 4.625, - "learning_rate": 4.951795962732917e-05, - "loss": 0.2894, - "step": 710 - }, - { - "epoch": 0.0731484303565986, - "grad_norm": 3.0, - "learning_rate": 4.9502078580551755e-05, - "loss": 0.3082, - "step": 720 - }, - { - "epoch": 0.07416438077821802, - "grad_norm": 3.65625, - "learning_rate": 4.9485942782190734e-05, - "loss": 0.2308, - "step": 730 - }, - { - "epoch": 0.07518033119983744, - "grad_norm": 4.78125, - "learning_rate": 4.9469552400012306e-05, - "loss": 0.2272, - "step": 740 - }, - { - "epoch": 0.07619628162145688, - "grad_norm": 4.25, - "learning_rate": 4.94529076044296e-05, - "loss": 0.2701, - "step": 750 - }, - { - "epoch": 0.0772122320430763, - "grad_norm": 3.140625, - "learning_rate": 4.94360085685009e-05, - "loss": 0.2686, - "step": 760 - }, - { - "epoch": 0.07822818246469572, - "grad_norm": 0.765625, - "learning_rate": 4.9418855467927894e-05, - "loss": 0.2051, - "step": 770 - }, - { - "epoch": 0.07924413288631514, - "grad_norm": 1.796875, - "learning_rate": 4.940144848105379e-05, - "loss": 0.2267, - "step": 780 - }, - { - "epoch": 0.08026008330793458, - "grad_norm": 4.5625, - "learning_rate": 4.93837877888615e-05, - "loss": 0.2597, - "step": 790 - }, - { - "epoch": 0.081276033729554, - "grad_norm": 3.03125, - "learning_rate": 4.9365873574971745e-05, - "loss": 0.3701, - "step": 800 - }, - { - "epoch": 0.08229198415117342, - "grad_norm": 4.5625, - "learning_rate": 4.9347706025641136e-05, - "loss": 0.2559, - "step": 810 - }, - { - "epoch": 0.08330793457279284, - "grad_norm": 3.90625, - "learning_rate": 4.9329285329760275e-05, - "loss": 0.2799, - "step": 820 - }, - { - "epoch": 0.08432388499441228, - "grad_norm": 3.140625, - "learning_rate": 4.9310611678851735e-05, - "loss": 0.2866, - "step": 830 - }, - { - "epoch": 0.0853398354160317, - "grad_norm": 2.46875, - "learning_rate": 4.929168526706811e-05, - "loss": 0.3105, - "step": 840 - }, - { - "epoch": 0.08635578583765112, - "grad_norm": 13.625, - "learning_rate": 4.927250629119e-05, - "loss": 0.2454, - "step": 850 - }, - { - "epoch": 0.08737173625927054, - "grad_norm": 3.921875, - "learning_rate": 4.9253074950623925e-05, - "loss": 0.2424, - "step": 860 - }, - { - "epoch": 0.08838768668088998, - "grad_norm": 2.90625, - "learning_rate": 4.9233391447400286e-05, - "loss": 0.2481, - "step": 870 - }, - { - "epoch": 0.0894036371025094, - "grad_norm": 2.96875, - "learning_rate": 4.921345598617125e-05, - "loss": 0.2231, - "step": 880 - }, - { - "epoch": 0.09041958752412882, - "grad_norm": 5.375, - "learning_rate": 4.9193268774208654e-05, - "loss": 0.3447, - "step": 890 - }, - { - "epoch": 0.09143553794574824, - "grad_norm": 2.0, - "learning_rate": 4.9172830021401785e-05, - "loss": 0.229, - "step": 900 - }, - { - "epoch": 0.09245148836736768, - "grad_norm": 3.1875, - "learning_rate": 4.9152139940255245e-05, - "loss": 0.2122, - "step": 910 - }, - { - "epoch": 0.0934674387889871, - "grad_norm": 3.40625, - "learning_rate": 4.913119874588677e-05, - "loss": 0.2386, - "step": 920 - }, - { - "epoch": 0.09448338921060652, - "grad_norm": 1.4609375, - "learning_rate": 4.911000665602489e-05, - "loss": 0.1944, - "step": 930 - }, - { - "epoch": 0.09549933963222594, - "grad_norm": 5.0625, - "learning_rate": 4.9088563891006786e-05, - "loss": 0.2038, - "step": 940 - }, - { - "epoch": 0.09651529005384538, - "grad_norm": 4.53125, - "learning_rate": 4.906687067377592e-05, - "loss": 0.3122, - "step": 950 - }, - { - "epoch": 0.0975312404754648, - "grad_norm": 2.84375, - "learning_rate": 4.904492722987976e-05, - "loss": 0.3157, - "step": 960 - }, - { - "epoch": 0.09854719089708422, - "grad_norm": 2.171875, - "learning_rate": 4.902273378746738e-05, - "loss": 0.3077, - "step": 970 - }, - { - "epoch": 0.09956314131870364, - "grad_norm": 2.84375, - "learning_rate": 4.9000290577287165e-05, - "loss": 0.2756, - "step": 980 - }, - { - "epoch": 0.10057909174032308, - "grad_norm": 0.99609375, - "learning_rate": 4.897759783268434e-05, - "loss": 0.2915, - "step": 990 - }, - { - "epoch": 0.1015950421619425, - "grad_norm": 3.53125, - "learning_rate": 4.895465578959859e-05, - "loss": 0.2052, - "step": 1000 - }, - { - "epoch": 0.10261099258356192, - "grad_norm": 4.0, - "learning_rate": 4.893146468656159e-05, - "loss": 0.2499, - "step": 1010 - }, - { - "epoch": 0.10362694300518134, - "grad_norm": 1.65625, - "learning_rate": 4.890802476469452e-05, - "loss": 0.278, - "step": 1020 - }, - { - "epoch": 0.10464289342680078, - "grad_norm": 3.625, - "learning_rate": 4.888433626770558e-05, - "loss": 0.2143, - "step": 1030 - }, - { - "epoch": 0.1056588438484202, - "grad_norm": 5.0625, - "learning_rate": 4.886039944188741e-05, - "loss": 0.2878, - "step": 1040 - }, - { - "epoch": 0.10667479427003962, - "grad_norm": 4.5, - "learning_rate": 4.883621453611461e-05, - "loss": 0.2744, - "step": 1050 - }, - { - "epoch": 0.10769074469165904, - "grad_norm": 4.5625, - "learning_rate": 4.881178180184106e-05, - "loss": 0.2734, - "step": 1060 - }, - { - "epoch": 0.10870669511327848, - "grad_norm": 3.125, - "learning_rate": 4.878710149309735e-05, - "loss": 0.3574, - "step": 1070 - }, - { - "epoch": 0.1097226455348979, - "grad_norm": 3.0625, - "learning_rate": 4.876217386648816e-05, - "loss": 0.2625, - "step": 1080 - }, - { - "epoch": 0.11073859595651732, - "grad_norm": 4.0625, - "learning_rate": 4.873699918118955e-05, - "loss": 0.2437, - "step": 1090 - }, - { - "epoch": 0.11175454637813674, - "grad_norm": 1.59375, - "learning_rate": 4.87115776989463e-05, - "loss": 0.2051, - "step": 1100 - }, - { - "epoch": 0.11277049679975618, - "grad_norm": 4.375, - "learning_rate": 4.8685909684069153e-05, - "loss": 0.1727, - "step": 1110 - }, - { - "epoch": 0.1137864472213756, - "grad_norm": 2.28125, - "learning_rate": 4.865999540343211e-05, - "loss": 0.2256, - "step": 1120 - }, - { - "epoch": 0.11480239764299502, - "grad_norm": 2.265625, - "learning_rate": 4.86338351264696e-05, - "loss": 0.3529, - "step": 1130 - }, - { - "epoch": 0.11581834806461444, - "grad_norm": 2.34375, - "learning_rate": 4.8607429125173754e-05, - "loss": 0.2113, - "step": 1140 - }, - { - "epoch": 0.11683429848623388, - "grad_norm": 0.7578125, - "learning_rate": 4.858077767409149e-05, - "loss": 0.2759, - "step": 1150 - }, - { - "epoch": 0.1178502489078533, - "grad_norm": 3.640625, - "learning_rate": 4.855388105032174e-05, - "loss": 0.2482, - "step": 1160 - }, - { - "epoch": 0.11886619932947272, - "grad_norm": 3.5, - "learning_rate": 4.852673953351249e-05, - "loss": 0.1865, - "step": 1170 - }, - { - "epoch": 0.11988214975109214, - "grad_norm": 3.75, - "learning_rate": 4.849935340585796e-05, - "loss": 0.2659, - "step": 1180 - }, - { - "epoch": 0.12089810017271158, - "grad_norm": 3.375, - "learning_rate": 4.8471722952095586e-05, - "loss": 0.1506, - "step": 1190 - }, - { - "epoch": 0.121914050594331, - "grad_norm": 3.34375, - "learning_rate": 4.844384845950312e-05, - "loss": 0.307, - "step": 1200 - }, - { - "epoch": 0.12293000101595042, - "grad_norm": 1.578125, - "learning_rate": 4.841573021789561e-05, - "loss": 0.1952, - "step": 1210 - }, - { - "epoch": 0.12394595143756984, - "grad_norm": 1.2890625, - "learning_rate": 4.838736851962239e-05, - "loss": 0.1779, - "step": 1220 - }, - { - "epoch": 0.12496190185918928, - "grad_norm": 1.265625, - "learning_rate": 4.835876365956408e-05, - "loss": 0.1235, - "step": 1230 - }, - { - "epoch": 0.12597785228080868, - "grad_norm": 1.9609375, - "learning_rate": 4.8329915935129436e-05, - "loss": 0.1876, - "step": 1240 - }, - { - "epoch": 0.12699380270242813, - "grad_norm": 1.6328125, - "learning_rate": 4.830082564625235e-05, - "loss": 0.2188, - "step": 1250 - }, - { - "epoch": 0.12800975312404755, - "grad_norm": 3.96875, - "learning_rate": 4.8271493095388684e-05, - "loss": 0.2622, - "step": 1260 - }, - { - "epoch": 0.12902570354566698, - "grad_norm": 3.765625, - "learning_rate": 4.824191858751312e-05, - "loss": 0.2724, - "step": 1270 - }, - { - "epoch": 0.1300416539672864, - "grad_norm": 5.59375, - "learning_rate": 4.821210243011601e-05, - "loss": 0.2413, - "step": 1280 - }, - { - "epoch": 0.13105760438890582, - "grad_norm": 3.34375, - "learning_rate": 4.818204493320016e-05, - "loss": 0.2618, - "step": 1290 - }, - { - "epoch": 0.13207355481052524, - "grad_norm": 2.78125, - "learning_rate": 4.8151746409277634e-05, - "loss": 0.2295, - "step": 1300 - }, - { - "epoch": 0.13308950523214466, - "grad_norm": 3.1875, - "learning_rate": 4.8121207173366484e-05, - "loss": 0.2733, - "step": 1310 - }, - { - "epoch": 0.13410545565376408, - "grad_norm": 2.28125, - "learning_rate": 4.809042754298746e-05, - "loss": 0.2311, - "step": 1320 - }, - { - "epoch": 0.13512140607538353, - "grad_norm": 2.171875, - "learning_rate": 4.805940783816075e-05, - "loss": 0.2059, - "step": 1330 - }, - { - "epoch": 0.13613735649700295, - "grad_norm": 2.796875, - "learning_rate": 4.8028148381402625e-05, - "loss": 0.2102, - "step": 1340 - }, - { - "epoch": 0.13715330691862238, - "grad_norm": 2.96875, - "learning_rate": 4.7996649497722084e-05, - "loss": 0.2708, - "step": 1350 - }, - { - "epoch": 0.1381692573402418, - "grad_norm": 2.4375, - "learning_rate": 4.7964911514617485e-05, - "loss": 0.2429, - "step": 1360 - }, - { - "epoch": 0.13918520776186122, - "grad_norm": 5.8125, - "learning_rate": 4.793293476207312e-05, - "loss": 0.2725, - "step": 1370 - }, - { - "epoch": 0.14020115818348064, - "grad_norm": 2.40625, - "learning_rate": 4.790071957255585e-05, - "loss": 0.2098, - "step": 1380 - }, - { - "epoch": 0.14121710860510006, - "grad_norm": 4.25, - "learning_rate": 4.786826628101154e-05, - "loss": 0.2101, - "step": 1390 - }, - { - "epoch": 0.14223305902671948, - "grad_norm": 2.578125, - "learning_rate": 4.783557522486167e-05, - "loss": 0.2624, - "step": 1400 - }, - { - "epoch": 0.14324900944833893, - "grad_norm": 3.125, - "learning_rate": 4.780264674399978e-05, - "loss": 0.2518, - "step": 1410 - }, - { - "epoch": 0.14426495986995835, - "grad_norm": 3.671875, - "learning_rate": 4.7769481180787966e-05, - "loss": 0.3112, - "step": 1420 - }, - { - "epoch": 0.14528091029157778, - "grad_norm": 3.984375, - "learning_rate": 4.773607888005327e-05, - "loss": 0.2747, - "step": 1430 - }, - { - "epoch": 0.1462968607131972, - "grad_norm": 3.234375, - "learning_rate": 4.770244018908416e-05, - "loss": 0.1572, - "step": 1440 - }, - { - "epoch": 0.14731281113481662, - "grad_norm": 4.09375, - "learning_rate": 4.766856545762687e-05, - "loss": 0.2148, - "step": 1450 - }, - { - "epoch": 0.14832876155643604, - "grad_norm": 1.6875, - "learning_rate": 4.763445503788178e-05, - "loss": 0.2531, - "step": 1460 - }, - { - "epoch": 0.14934471197805546, - "grad_norm": 2.375, - "learning_rate": 4.760010928449976e-05, - "loss": 0.199, - "step": 1470 - }, - { - "epoch": 0.15036066239967488, - "grad_norm": 4.6875, - "learning_rate": 4.7565528554578485e-05, - "loss": 0.2366, - "step": 1480 - }, - { - "epoch": 0.15137661282129433, - "grad_norm": 5.4375, - "learning_rate": 4.75307132076587e-05, - "loss": 0.1862, - "step": 1490 - }, - { - "epoch": 0.15239256324291375, - "grad_norm": 2.484375, - "learning_rate": 4.749566360572049e-05, - "loss": 0.2143, - "step": 1500 - }, - { - "epoch": 0.15340851366453317, - "grad_norm": 2.1875, - "learning_rate": 4.746038011317955e-05, - "loss": 0.1877, - "step": 1510 - }, - { - "epoch": 0.1544244640861526, - "grad_norm": 2.84375, - "learning_rate": 4.742486309688333e-05, - "loss": 0.2831, - "step": 1520 - }, - { - "epoch": 0.15544041450777202, - "grad_norm": 2.015625, - "learning_rate": 4.738911292610732e-05, - "loss": 0.1708, - "step": 1530 - }, - { - "epoch": 0.15645636492939144, - "grad_norm": 3.953125, - "learning_rate": 4.735312997255107e-05, - "loss": 0.192, - "step": 1540 - }, - { - "epoch": 0.15747231535101086, - "grad_norm": 2.09375, - "learning_rate": 4.7316914610334475e-05, - "loss": 0.2586, - "step": 1550 - }, - { - "epoch": 0.15848826577263028, - "grad_norm": 3.6875, - "learning_rate": 4.728046721599378e-05, - "loss": 0.2141, - "step": 1560 - }, - { - "epoch": 0.15950421619424973, - "grad_norm": 2.9375, - "learning_rate": 4.724378816847771e-05, - "loss": 0.193, - "step": 1570 - }, - { - "epoch": 0.16052016661586915, - "grad_norm": 1.5625, - "learning_rate": 4.720687784914352e-05, - "loss": 0.191, - "step": 1580 - }, - { - "epoch": 0.16153611703748857, - "grad_norm": 3.75, - "learning_rate": 4.716973664175304e-05, - "loss": 0.2172, - "step": 1590 - }, - { - "epoch": 0.162552067459108, - "grad_norm": 3.125, - "learning_rate": 4.7132364932468645e-05, - "loss": 0.2134, - "step": 1600 - }, - { - "epoch": 0.16356801788072742, - "grad_norm": 4.09375, - "learning_rate": 4.709476310984932e-05, - "loss": 0.2055, - "step": 1610 - }, - { - "epoch": 0.16458396830234684, - "grad_norm": 3.875, - "learning_rate": 4.705693156484652e-05, - "loss": 0.2136, - "step": 1620 - }, - { - "epoch": 0.16559991872396626, - "grad_norm": 1.1796875, - "learning_rate": 4.7018870690800196e-05, - "loss": 0.1471, - "step": 1630 - }, - { - "epoch": 0.16661586914558568, - "grad_norm": 2.5, - "learning_rate": 4.698058088343465e-05, - "loss": 0.2308, - "step": 1640 - }, - { - "epoch": 0.16763181956720513, - "grad_norm": 1.390625, - "learning_rate": 4.6942062540854425e-05, - "loss": 0.2456, - "step": 1650 - }, - { - "epoch": 0.16864776998882455, - "grad_norm": 3.125, - "learning_rate": 4.69033160635402e-05, - "loss": 0.2654, - "step": 1660 - }, - { - "epoch": 0.16966372041044397, - "grad_norm": 3.984375, - "learning_rate": 4.6864341854344587e-05, - "loss": 0.2226, - "step": 1670 - }, - { - "epoch": 0.1706796708320634, - "grad_norm": 2.328125, - "learning_rate": 4.682514031848795e-05, - "loss": 0.2438, - "step": 1680 - }, - { - "epoch": 0.17169562125368282, - "grad_norm": 3.078125, - "learning_rate": 4.678571186355423e-05, - "loss": 0.1889, - "step": 1690 - }, - { - "epoch": 0.17271157167530224, - "grad_norm": 3.328125, - "learning_rate": 4.6746056899486644e-05, - "loss": 0.2117, - "step": 1700 - }, - { - "epoch": 0.17372752209692166, - "grad_norm": 2.78125, - "learning_rate": 4.67061758385835e-05, - "loss": 0.1953, - "step": 1710 - }, - { - "epoch": 0.17474347251854108, - "grad_norm": 3.09375, - "learning_rate": 4.6666069095493816e-05, - "loss": 0.1844, - "step": 1720 - }, - { - "epoch": 0.17575942294016053, - "grad_norm": 3.234375, - "learning_rate": 4.662573708721309e-05, - "loss": 0.2774, - "step": 1730 - }, - { - "epoch": 0.17677537336177995, - "grad_norm": 4.03125, - "learning_rate": 4.658518023307894e-05, - "loss": 0.2527, - "step": 1740 - }, - { - "epoch": 0.17779132378339937, - "grad_norm": 3.21875, - "learning_rate": 4.654439895476671e-05, - "loss": 0.2164, - "step": 1750 - }, - { - "epoch": 0.1788072742050188, - "grad_norm": 2.390625, - "learning_rate": 4.6503393676285146e-05, - "loss": 0.2424, - "step": 1760 - }, - { - "epoch": 0.17982322462663822, - "grad_norm": 1.8359375, - "learning_rate": 4.646216482397192e-05, - "loss": 0.2428, - "step": 1770 - }, - { - "epoch": 0.18083917504825764, - "grad_norm": 2.796875, - "learning_rate": 4.6420712826489275e-05, - "loss": 0.2155, - "step": 1780 - }, - { - "epoch": 0.18185512546987706, - "grad_norm": 0.69921875, - "learning_rate": 4.6379038114819485e-05, - "loss": 0.1544, - "step": 1790 - }, - { - "epoch": 0.18287107589149648, - "grad_norm": 3.40625, - "learning_rate": 4.6337141122260444e-05, - "loss": 0.2029, - "step": 1800 - }, - { - "epoch": 0.18388702631311593, - "grad_norm": 2.359375, - "learning_rate": 4.629502228442112e-05, - "loss": 0.1489, - "step": 1810 - }, - { - "epoch": 0.18490297673473535, - "grad_norm": 1.4453125, - "learning_rate": 4.6252682039217045e-05, - "loss": 0.2101, - "step": 1820 - }, - { - "epoch": 0.18591892715635477, - "grad_norm": 2.71875, - "learning_rate": 4.621012082686573e-05, - "loss": 0.2076, - "step": 1830 - }, - { - "epoch": 0.1869348775779742, - "grad_norm": 3.0625, - "learning_rate": 4.616733908988216e-05, - "loss": 0.2719, - "step": 1840 - }, - { - "epoch": 0.18795082799959362, - "grad_norm": 1.953125, - "learning_rate": 4.612433727307409e-05, - "loss": 0.2105, - "step": 1850 - }, - { - "epoch": 0.18896677842121304, - "grad_norm": 3.46875, - "learning_rate": 4.608111582353751e-05, - "loss": 0.1877, - "step": 1860 - }, - { - "epoch": 0.18998272884283246, - "grad_norm": 2.546875, - "learning_rate": 4.603767519065197e-05, - "loss": 0.2238, - "step": 1870 - }, - { - "epoch": 0.19099867926445188, - "grad_norm": 1.5703125, - "learning_rate": 4.599401582607589e-05, - "loss": 0.243, - "step": 1880 - }, - { - "epoch": 0.19201462968607133, - "grad_norm": 2.4375, - "learning_rate": 4.595013818374185e-05, - "loss": 0.1867, - "step": 1890 - }, - { - "epoch": 0.19303058010769075, - "grad_norm": 2.203125, - "learning_rate": 4.5906042719851925e-05, - "loss": 0.1994, - "step": 1900 - }, - { - "epoch": 0.19404653052931017, - "grad_norm": 3.984375, - "learning_rate": 4.586172989287291e-05, - "loss": 0.1899, - "step": 1910 - }, - { - "epoch": 0.1950624809509296, - "grad_norm": 2.6875, - "learning_rate": 4.5817200163531534e-05, - "loss": 0.2528, - "step": 1920 - }, - { - "epoch": 0.19607843137254902, - "grad_norm": 2.71875, - "learning_rate": 4.577245399480972e-05, - "loss": 0.2336, - "step": 1930 - }, - { - "epoch": 0.19709438179416844, - "grad_norm": 2.640625, - "learning_rate": 4.5727491851939715e-05, - "loss": 0.2204, - "step": 1940 - }, - { - "epoch": 0.19811033221578786, - "grad_norm": 1.78125, - "learning_rate": 4.568231420239929e-05, - "loss": 0.1656, - "step": 1950 - }, - { - "epoch": 0.19912628263740728, - "grad_norm": 3.15625, - "learning_rate": 4.563692151590687e-05, - "loss": 0.2105, - "step": 1960 - }, - { - "epoch": 0.20014223305902673, - "grad_norm": 1.3671875, - "learning_rate": 4.5591314264416666e-05, - "loss": 0.1464, - "step": 1970 - }, - { - "epoch": 0.20115818348064615, - "grad_norm": 4.25, - "learning_rate": 4.554549292211371e-05, - "loss": 0.2103, - "step": 1980 - }, - { - "epoch": 0.20217413390226557, - "grad_norm": 2.984375, - "learning_rate": 4.549945796540901e-05, - "loss": 0.144, - "step": 1990 - }, - { - "epoch": 0.203190084323885, - "grad_norm": 1.859375, - "learning_rate": 4.545320987293453e-05, - "loss": 0.1963, - "step": 2000 - }, - { - "epoch": 0.20420603474550442, - "grad_norm": 1.078125, - "learning_rate": 4.540674912553824e-05, - "loss": 0.2115, - "step": 2010 - }, - { - "epoch": 0.20522198516712384, - "grad_norm": 4.25, - "learning_rate": 4.536007620627911e-05, - "loss": 0.1682, - "step": 2020 - }, - { - "epoch": 0.20623793558874326, - "grad_norm": 2.71875, - "learning_rate": 4.531319160042212e-05, - "loss": 0.1992, - "step": 2030 - }, - { - "epoch": 0.20725388601036268, - "grad_norm": 1.2890625, - "learning_rate": 4.5266095795433126e-05, - "loss": 0.1134, - "step": 2040 - }, - { - "epoch": 0.20826983643198213, - "grad_norm": 3.296875, - "learning_rate": 4.5218789280973925e-05, - "loss": 0.1474, - "step": 2050 - }, - { - "epoch": 0.20928578685360155, - "grad_norm": 1.9375, - "learning_rate": 4.5171272548897024e-05, - "loss": 0.1955, - "step": 2060 - }, - { - "epoch": 0.21030173727522097, - "grad_norm": 2.734375, - "learning_rate": 4.512354609324063e-05, - "loss": 0.2042, - "step": 2070 - }, - { - "epoch": 0.2113176876968404, - "grad_norm": 2.921875, - "learning_rate": 4.507561041022347e-05, - "loss": 0.2174, - "step": 2080 - }, - { - "epoch": 0.21233363811845982, - "grad_norm": 2.40625, - "learning_rate": 4.502746599823963e-05, - "loss": 0.2634, - "step": 2090 - }, - { - "epoch": 0.21334958854007924, - "grad_norm": 1.71875, - "learning_rate": 4.497911335785339e-05, - "loss": 0.1884, - "step": 2100 - }, - { - "epoch": 0.21436553896169866, - "grad_norm": 0.79296875, - "learning_rate": 4.4930552991794e-05, - "loss": 0.1872, - "step": 2110 - }, - { - "epoch": 0.21538148938331808, - "grad_norm": 3.171875, - "learning_rate": 4.4881785404950474e-05, - "loss": 0.2233, - "step": 2120 - }, - { - "epoch": 0.21639743980493753, - "grad_norm": 2.59375, - "learning_rate": 4.483281110436631e-05, - "loss": 0.2374, - "step": 2130 - }, - { - "epoch": 0.21741339022655695, - "grad_norm": 3.328125, - "learning_rate": 4.478363059923426e-05, - "loss": 0.2545, - "step": 2140 - }, - { - "epoch": 0.21842934064817637, - "grad_norm": 2.3125, - "learning_rate": 4.4734244400891014e-05, - "loss": 0.2063, - "step": 2150 - }, - { - "epoch": 0.2194452910697958, - "grad_norm": 3.40625, - "learning_rate": 4.4684653022811865e-05, - "loss": 0.1219, - "step": 2160 - }, - { - "epoch": 0.22046124149141522, - "grad_norm": 4.1875, - "learning_rate": 4.463485698060541e-05, - "loss": 0.2805, - "step": 2170 - }, - { - "epoch": 0.22147719191303464, - "grad_norm": 2.3125, - "learning_rate": 4.458485679200814e-05, - "loss": 0.1998, - "step": 2180 - }, - { - "epoch": 0.22249314233465406, - "grad_norm": 3.578125, - "learning_rate": 4.453465297687912e-05, - "loss": 0.2489, - "step": 2190 - }, - { - "epoch": 0.22350909275627348, - "grad_norm": 2.59375, - "learning_rate": 4.448424605719452e-05, - "loss": 0.2731, - "step": 2200 - }, - { - "epoch": 0.22452504317789293, - "grad_norm": 3.28125, - "learning_rate": 4.443363655704224e-05, - "loss": 0.2425, - "step": 2210 - }, - { - "epoch": 0.22554099359951235, - "grad_norm": 2.78125, - "learning_rate": 4.438282500261641e-05, - "loss": 0.2938, - "step": 2220 - }, - { - "epoch": 0.22655694402113177, - "grad_norm": 1.1953125, - "learning_rate": 4.433181192221197e-05, - "loss": 0.1728, - "step": 2230 - }, - { - "epoch": 0.2275728944427512, - "grad_norm": 1.34375, - "learning_rate": 4.4280597846219155e-05, - "loss": 0.216, - "step": 2240 - }, - { - "epoch": 0.22858884486437062, - "grad_norm": 1.8515625, - "learning_rate": 4.422918330711796e-05, - "loss": 0.1612, - "step": 2250 - }, - { - "epoch": 0.22960479528599004, - "grad_norm": 1.90625, - "learning_rate": 4.417756883947263e-05, - "loss": 0.107, - "step": 2260 - }, - { - "epoch": 0.23062074570760946, - "grad_norm": 3.375, - "learning_rate": 4.412575497992611e-05, - "loss": 0.1756, - "step": 2270 - }, - { - "epoch": 0.23163669612922888, - "grad_norm": 4.375, - "learning_rate": 4.407374226719445e-05, - "loss": 0.234, - "step": 2280 - }, - { - "epoch": 0.23265264655084833, - "grad_norm": 3.25, - "learning_rate": 4.402153124206119e-05, - "loss": 0.2144, - "step": 2290 - }, - { - "epoch": 0.23366859697246775, - "grad_norm": 1.703125, - "learning_rate": 4.396912244737173e-05, - "loss": 0.1696, - "step": 2300 - }, - { - "epoch": 0.23468454739408717, - "grad_norm": 2.84375, - "learning_rate": 4.391651642802778e-05, - "loss": 0.2506, - "step": 2310 - }, - { - "epoch": 0.2357004978157066, - "grad_norm": 4.5, - "learning_rate": 4.386371373098155e-05, - "loss": 0.1686, - "step": 2320 - }, - { - "epoch": 0.23671644823732602, - "grad_norm": 2.515625, - "learning_rate": 4.381071490523018e-05, - "loss": 0.2403, - "step": 2330 - }, - { - "epoch": 0.23773239865894544, - "grad_norm": 4.4375, - "learning_rate": 4.3757520501809955e-05, - "loss": 0.1611, - "step": 2340 - }, - { - "epoch": 0.23874834908056486, - "grad_norm": 1.609375, - "learning_rate": 4.370413107379065e-05, - "loss": 0.1698, - "step": 2350 - }, - { - "epoch": 0.23976429950218428, - "grad_norm": 4.96875, - "learning_rate": 4.36505471762697e-05, - "loss": 0.1928, - "step": 2360 - }, - { - "epoch": 0.24078024992380373, - "grad_norm": 0.8984375, - "learning_rate": 4.3596769366366474e-05, - "loss": 0.2035, - "step": 2370 - }, - { - "epoch": 0.24179620034542315, - "grad_norm": 5.75, - "learning_rate": 4.354279820321649e-05, - "loss": 0.16, - "step": 2380 - }, - { - "epoch": 0.24281215076704257, - "grad_norm": 1.9453125, - "learning_rate": 4.34886342479656e-05, - "loss": 0.1851, - "step": 2390 - }, - { - "epoch": 0.243828101188662, - "grad_norm": 1.015625, - "learning_rate": 4.34342780637641e-05, - "loss": 0.1726, - "step": 2400 - }, - { - "epoch": 0.24484405161028142, - "grad_norm": 4.59375, - "learning_rate": 4.337973021576095e-05, - "loss": 0.2847, - "step": 2410 - }, - { - "epoch": 0.24586000203190084, - "grad_norm": 1.03125, - "learning_rate": 4.3324991271097846e-05, - "loss": 0.2528, - "step": 2420 - }, - { - "epoch": 0.24687595245352026, - "grad_norm": 2.1875, - "learning_rate": 4.3270061798903374e-05, - "loss": 0.1573, - "step": 2430 - }, - { - "epoch": 0.24789190287513968, - "grad_norm": 0.98046875, - "learning_rate": 4.321494237028701e-05, - "loss": 0.1703, - "step": 2440 - }, - { - "epoch": 0.24890785329675913, - "grad_norm": 3.8125, - "learning_rate": 4.31596335583333e-05, - "loss": 0.2613, - "step": 2450 - }, - { - "epoch": 0.24992380371837855, - "grad_norm": 4.0625, - "learning_rate": 4.310413593809579e-05, - "loss": 0.22, - "step": 2460 - }, - { - "epoch": 0.250939754139998, - "grad_norm": 3.15625, - "learning_rate": 4.304845008659108e-05, - "loss": 0.1263, - "step": 2470 - }, - { - "epoch": 0.25195570456161737, - "grad_norm": 3.046875, - "learning_rate": 4.2992576582792895e-05, - "loss": 0.1639, - "step": 2480 - }, - { - "epoch": 0.2529716549832368, - "grad_norm": 9.8125, - "learning_rate": 4.293651600762595e-05, - "loss": 0.2681, - "step": 2490 - }, - { - "epoch": 0.25398760540485626, - "grad_norm": 3.734375, - "learning_rate": 4.288026894395999e-05, - "loss": 0.2292, - "step": 2500 - }, - { - "epoch": 0.25500355582647566, - "grad_norm": 0.455078125, - "learning_rate": 4.2823835976603723e-05, - "loss": 0.2324, - "step": 2510 - }, - { - "epoch": 0.2560195062480951, - "grad_norm": 5.625, - "learning_rate": 4.276721769229869e-05, - "loss": 0.1834, - "step": 2520 - }, - { - "epoch": 0.2570354566697145, - "grad_norm": 1.3671875, - "learning_rate": 4.271041467971323e-05, - "loss": 0.1826, - "step": 2530 - }, - { - "epoch": 0.25805140709133395, - "grad_norm": 5.0625, - "learning_rate": 4.265342752943632e-05, - "loss": 0.2463, - "step": 2540 - }, - { - "epoch": 0.25906735751295334, - "grad_norm": 2.859375, - "learning_rate": 4.2596256833971425e-05, - "loss": 0.2598, - "step": 2550 - }, - { - "epoch": 0.2600833079345728, - "grad_norm": 1.8515625, - "learning_rate": 4.2538903187730374e-05, - "loss": 0.1148, - "step": 2560 - }, - { - "epoch": 0.26109925835619224, - "grad_norm": 2.71875, - "learning_rate": 4.248136718702716e-05, - "loss": 0.2123, - "step": 2570 - }, - { - "epoch": 0.26211520877781164, - "grad_norm": 4.5625, - "learning_rate": 4.242364943007172e-05, - "loss": 0.2369, - "step": 2580 - }, - { - "epoch": 0.2631311591994311, - "grad_norm": 2.296875, - "learning_rate": 4.236575051696377e-05, - "loss": 0.261, - "step": 2590 - }, - { - "epoch": 0.2641471096210505, - "grad_norm": 2.75, - "learning_rate": 4.2307671049686514e-05, - "loss": 0.1564, - "step": 2600 - }, - { - "epoch": 0.26516306004266993, - "grad_norm": 3.5, - "learning_rate": 4.2249411632100396e-05, - "loss": 0.1563, - "step": 2610 - }, - { - "epoch": 0.2661790104642893, - "grad_norm": 2.84375, - "learning_rate": 4.219097286993684e-05, - "loss": 0.1697, - "step": 2620 - }, - { - "epoch": 0.26719496088590877, - "grad_norm": 2.125, - "learning_rate": 4.2132355370791946e-05, - "loss": 0.1844, - "step": 2630 - }, - { - "epoch": 0.26821091130752817, - "grad_norm": 4.03125, - "learning_rate": 4.2073559744120156e-05, - "loss": 0.2144, - "step": 2640 - }, - { - "epoch": 0.2692268617291476, - "grad_norm": 2.375, - "learning_rate": 4.201458660122793e-05, - "loss": 0.2013, - "step": 2650 - }, - { - "epoch": 0.27024281215076706, - "grad_norm": 3.625, - "learning_rate": 4.1955436555267393e-05, - "loss": 0.2166, - "step": 2660 - }, - { - "epoch": 0.27125876257238646, - "grad_norm": 0.328125, - "learning_rate": 4.189611022122997e-05, - "loss": 0.1934, - "step": 2670 - }, - { - "epoch": 0.2722747129940059, - "grad_norm": 2.75, - "learning_rate": 4.1836608215939944e-05, - "loss": 0.2157, - "step": 2680 - }, - { - "epoch": 0.2732906634156253, - "grad_norm": 3.5, - "learning_rate": 4.17769311580481e-05, - "loss": 0.18, - "step": 2690 - }, - { - "epoch": 0.27430661383724475, - "grad_norm": 2.109375, - "learning_rate": 4.171707966802528e-05, - "loss": 0.2178, - "step": 2700 - }, - { - "epoch": 0.27532256425886414, - "grad_norm": 4.65625, - "learning_rate": 4.16570543681559e-05, - "loss": 0.1896, - "step": 2710 - }, - { - "epoch": 0.2763385146804836, - "grad_norm": 4.8125, - "learning_rate": 4.159685588253151e-05, - "loss": 0.1322, - "step": 2720 - }, - { - "epoch": 0.27735446510210304, - "grad_norm": 3.9375, - "learning_rate": 4.153648483704429e-05, - "loss": 0.184, - "step": 2730 - }, - { - "epoch": 0.27837041552372244, - "grad_norm": 4.53125, - "learning_rate": 4.147594185938057e-05, - "loss": 0.2451, - "step": 2740 - }, - { - "epoch": 0.2793863659453419, - "grad_norm": 1.0390625, - "learning_rate": 4.141522757901426e-05, - "loss": 0.2367, - "step": 2750 - }, - { - "epoch": 0.2804023163669613, - "grad_norm": 3.375, - "learning_rate": 4.1354342627200345e-05, - "loss": 0.179, - "step": 2760 - }, - { - "epoch": 0.28141826678858073, - "grad_norm": 2.953125, - "learning_rate": 4.1293287636968286e-05, - "loss": 0.1396, - "step": 2770 - }, - { - "epoch": 0.2824342172102001, - "grad_norm": 2.546875, - "learning_rate": 4.1232063243115485e-05, - "loss": 0.1963, - "step": 2780 - }, - { - "epoch": 0.28345016763181957, - "grad_norm": 5.09375, - "learning_rate": 4.117067008220063e-05, - "loss": 0.2457, - "step": 2790 - }, - { - "epoch": 0.28446611805343897, - "grad_norm": 2.046875, - "learning_rate": 4.110910879253712e-05, - "loss": 0.2262, - "step": 2800 - }, - { - "epoch": 0.2854820684750584, - "grad_norm": 2.1875, - "learning_rate": 4.104738001418641e-05, - "loss": 0.2499, - "step": 2810 - }, - { - "epoch": 0.28649801889667786, - "grad_norm": 2.59375, - "learning_rate": 4.098548438895135e-05, - "loss": 0.1667, - "step": 2820 - }, - { - "epoch": 0.28751396931829726, - "grad_norm": 2.875, - "learning_rate": 4.092342256036954e-05, - "loss": 0.2288, - "step": 2830 - }, - { - "epoch": 0.2885299197399167, - "grad_norm": 3.015625, - "learning_rate": 4.086119517370659e-05, - "loss": 0.2038, - "step": 2840 - }, - { - "epoch": 0.2895458701615361, - "grad_norm": 3.53125, - "learning_rate": 4.0798802875949485e-05, - "loss": 0.181, - "step": 2850 - }, - { - "epoch": 0.29056182058315555, - "grad_norm": 2.296875, - "learning_rate": 4.073624631579975e-05, - "loss": 0.1886, - "step": 2860 - }, - { - "epoch": 0.29157777100477494, - "grad_norm": 3.609375, - "learning_rate": 4.067352614366685e-05, - "loss": 0.2053, - "step": 2870 - }, - { - "epoch": 0.2925937214263944, - "grad_norm": 2.328125, - "learning_rate": 4.061064301166128e-05, - "loss": 0.1409, - "step": 2880 - }, - { - "epoch": 0.29360967184801384, - "grad_norm": 4.9375, - "learning_rate": 4.054759757358787e-05, - "loss": 0.184, - "step": 2890 - }, - { - "epoch": 0.29462562226963324, - "grad_norm": 4.6875, - "learning_rate": 4.048439048493898e-05, - "loss": 0.2306, - "step": 2900 - }, - { - "epoch": 0.2956415726912527, - "grad_norm": 4.09375, - "learning_rate": 4.0421022402887676e-05, - "loss": 0.1914, - "step": 2910 - }, - { - "epoch": 0.2966575231128721, - "grad_norm": 2.3125, - "learning_rate": 4.035749398628088e-05, - "loss": 0.1653, - "step": 2920 - }, - { - "epoch": 0.29767347353449153, - "grad_norm": 2.515625, - "learning_rate": 4.029380589563256e-05, - "loss": 0.1941, - "step": 2930 - }, - { - "epoch": 0.2986894239561109, - "grad_norm": 1.78125, - "learning_rate": 4.02299587931168e-05, - "loss": 0.1117, - "step": 2940 - }, - { - "epoch": 0.29970537437773037, - "grad_norm": 0.8359375, - "learning_rate": 4.0165953342560974e-05, - "loss": 0.1605, - "step": 2950 - }, - { - "epoch": 0.30072132479934977, - "grad_norm": 3.046875, - "learning_rate": 4.010179020943884e-05, - "loss": 0.1726, - "step": 2960 - }, - { - "epoch": 0.3017372752209692, - "grad_norm": 3.453125, - "learning_rate": 4.003747006086357e-05, - "loss": 0.2208, - "step": 2970 - }, - { - "epoch": 0.30275322564258866, - "grad_norm": 2.515625, - "learning_rate": 3.9972993565580866e-05, - "loss": 0.1325, - "step": 2980 - }, - { - "epoch": 0.30376917606420806, - "grad_norm": 3.046875, - "learning_rate": 3.9908361393962e-05, - "loss": 0.2014, - "step": 2990 - }, - { - "epoch": 0.3047851264858275, - "grad_norm": 2.28125, - "learning_rate": 3.984357421799681e-05, - "loss": 0.165, - "step": 3000 - }, - { - "epoch": 0.3058010769074469, - "grad_norm": 5.09375, - "learning_rate": 3.9778632711286756e-05, - "loss": 0.212, - "step": 3010 - }, - { - "epoch": 0.30681702732906635, - "grad_norm": 4.25, - "learning_rate": 3.971353754903788e-05, - "loss": 0.2388, - "step": 3020 - }, - { - "epoch": 0.30783297775068574, - "grad_norm": 2.34375, - "learning_rate": 3.964828940805381e-05, - "loss": 0.2175, - "step": 3030 - }, - { - "epoch": 0.3088489281723052, - "grad_norm": 4.09375, - "learning_rate": 3.95828889667287e-05, - "loss": 0.2088, - "step": 3040 - }, - { - "epoch": 0.30986487859392464, - "grad_norm": 2.359375, - "learning_rate": 3.9517336905040244e-05, - "loss": 0.1913, - "step": 3050 - }, - { - "epoch": 0.31088082901554404, - "grad_norm": 1.1640625, - "learning_rate": 3.9451633904542483e-05, - "loss": 0.2185, - "step": 3060 - }, - { - "epoch": 0.3118967794371635, - "grad_norm": 2.59375, - "learning_rate": 3.9385780648358846e-05, - "loss": 0.2072, - "step": 3070 - }, - { - "epoch": 0.3129127298587829, - "grad_norm": 3.015625, - "learning_rate": 3.9319777821174955e-05, - "loss": 0.1902, - "step": 3080 - }, - { - "epoch": 0.31392868028040233, - "grad_norm": 2.375, - "learning_rate": 3.925362610923158e-05, - "loss": 0.259, - "step": 3090 - }, - { - "epoch": 0.3149446307020217, - "grad_norm": 4.65625, - "learning_rate": 3.918732620031742e-05, - "loss": 0.2026, - "step": 3100 - }, - { - "epoch": 0.31596058112364117, - "grad_norm": 2.1875, - "learning_rate": 3.912087878376205e-05, - "loss": 0.1478, - "step": 3110 - }, - { - "epoch": 0.31697653154526056, - "grad_norm": 2.34375, - "learning_rate": 3.905428455042865e-05, - "loss": 0.167, - "step": 3120 - }, - { - "epoch": 0.31799248196688, - "grad_norm": 2.390625, - "learning_rate": 3.898754419270693e-05, - "loss": 0.1629, - "step": 3130 - }, - { - "epoch": 0.31900843238849946, - "grad_norm": 1.546875, - "learning_rate": 3.892065840450583e-05, - "loss": 0.1308, - "step": 3140 - }, - { - "epoch": 0.32002438281011886, - "grad_norm": 4.625, - "learning_rate": 3.885362788124637e-05, - "loss": 0.2008, - "step": 3150 - }, - { - "epoch": 0.3210403332317383, - "grad_norm": 3.8125, - "learning_rate": 3.8786453319854396e-05, - "loss": 0.2225, - "step": 3160 - }, - { - "epoch": 0.3220562836533577, - "grad_norm": 3.015625, - "learning_rate": 3.8719135418753366e-05, - "loss": 0.2243, - "step": 3170 - }, - { - "epoch": 0.32307223407497715, - "grad_norm": 5.6875, - "learning_rate": 3.865167487785702e-05, - "loss": 0.1981, - "step": 3180 - }, - { - "epoch": 0.32408818449659654, - "grad_norm": 4.84375, - "learning_rate": 3.8584072398562164e-05, - "loss": 0.2031, - "step": 3190 - }, - { - "epoch": 0.325104134918216, - "grad_norm": 4.0625, - "learning_rate": 3.851632868374136e-05, - "loss": 0.1621, - "step": 3200 - }, - { - "epoch": 0.32612008533983544, - "grad_norm": 3.421875, - "learning_rate": 3.844844443773562e-05, - "loss": 0.1674, - "step": 3210 - }, - { - "epoch": 0.32713603576145484, - "grad_norm": 1.3671875, - "learning_rate": 3.8380420366347046e-05, - "loss": 0.1502, - "step": 3220 - }, - { - "epoch": 0.3281519861830743, - "grad_norm": 3.734375, - "learning_rate": 3.831225717683157e-05, - "loss": 0.1868, - "step": 3230 - }, - { - "epoch": 0.3291679366046937, - "grad_norm": 2.703125, - "learning_rate": 3.8243955577891534e-05, - "loss": 0.1818, - "step": 3240 - }, - { - "epoch": 0.3301838870263131, - "grad_norm": 3.796875, - "learning_rate": 3.8175516279668335e-05, - "loss": 0.2215, - "step": 3250 - }, - { - "epoch": 0.3311998374479325, - "grad_norm": 3.203125, - "learning_rate": 3.810693999373505e-05, - "loss": 0.2544, - "step": 3260 - }, - { - "epoch": 0.33221578786955197, - "grad_norm": 4.0, - "learning_rate": 3.8038227433089056e-05, - "loss": 0.1175, - "step": 3270 - }, - { - "epoch": 0.33323173829117136, - "grad_norm": 3.625, - "learning_rate": 3.796937931214458e-05, - "loss": 0.2213, - "step": 3280 - }, - { - "epoch": 0.3342476887127908, - "grad_norm": 1.7265625, - "learning_rate": 3.7900396346725296e-05, - "loss": 0.1711, - "step": 3290 - }, - { - "epoch": 0.33526363913441026, - "grad_norm": 3.140625, - "learning_rate": 3.783127925405686e-05, - "loss": 0.2628, - "step": 3300 - }, - { - "epoch": 0.33627958955602966, - "grad_norm": 2.1875, - "learning_rate": 3.77620287527595e-05, - "loss": 0.1671, - "step": 3310 - }, - { - "epoch": 0.3372955399776491, - "grad_norm": 5.28125, - "learning_rate": 3.769264556284048e-05, - "loss": 0.2109, - "step": 3320 - }, - { - "epoch": 0.3383114903992685, - "grad_norm": 2.875, - "learning_rate": 3.762313040568665e-05, - "loss": 0.1978, - "step": 3330 - }, - { - "epoch": 0.33932744082088795, - "grad_norm": 2.234375, - "learning_rate": 3.755348400405697e-05, - "loss": 0.1275, - "step": 3340 - }, - { - "epoch": 0.34034339124250734, - "grad_norm": 1.9453125, - "learning_rate": 3.7483707082074945e-05, - "loss": 0.1482, - "step": 3350 - }, - { - "epoch": 0.3413593416641268, - "grad_norm": 5.40625, - "learning_rate": 3.741380036522111e-05, - "loss": 0.1933, - "step": 3360 - }, - { - "epoch": 0.34237529208574624, - "grad_norm": 4.53125, - "learning_rate": 3.734376458032551e-05, - "loss": 0.1925, - "step": 3370 - }, - { - "epoch": 0.34339124250736563, - "grad_norm": 4.0625, - "learning_rate": 3.727360045556014e-05, - "loss": 0.2297, - "step": 3380 - }, - { - "epoch": 0.3444071929289851, - "grad_norm": 2.53125, - "learning_rate": 3.7203308720431336e-05, - "loss": 0.1704, - "step": 3390 - }, - { - "epoch": 0.3454231433506045, - "grad_norm": 1.859375, - "learning_rate": 3.7132890105772234e-05, - "loss": 0.258, - "step": 3400 - }, - { - "epoch": 0.3464390937722239, - "grad_norm": 3.90625, - "learning_rate": 3.706234534373515e-05, - "loss": 0.2376, - "step": 3410 - }, - { - "epoch": 0.3474550441938433, - "grad_norm": 1.1015625, - "learning_rate": 3.6991675167783985e-05, - "loss": 0.2403, - "step": 3420 - }, - { - "epoch": 0.34847099461546277, - "grad_norm": 1.1640625, - "learning_rate": 3.6920880312686556e-05, - "loss": 0.1642, - "step": 3430 - }, - { - "epoch": 0.34948694503708216, - "grad_norm": 2.875, - "learning_rate": 3.684996151450702e-05, - "loss": 0.1455, - "step": 3440 - }, - { - "epoch": 0.3505028954587016, - "grad_norm": 0.59765625, - "learning_rate": 3.6778919510598155e-05, - "loss": 0.2175, - "step": 3450 - }, - { - "epoch": 0.35151884588032106, - "grad_norm": 0.93359375, - "learning_rate": 3.670775503959376e-05, - "loss": 0.1858, - "step": 3460 - }, - { - "epoch": 0.35253479630194046, - "grad_norm": 4.1875, - "learning_rate": 3.6636468841400917e-05, - "loss": 0.1911, - "step": 3470 - }, - { - "epoch": 0.3535507467235599, - "grad_norm": 3.734375, - "learning_rate": 3.656506165719233e-05, - "loss": 0.2114, - "step": 3480 - }, - { - "epoch": 0.3545666971451793, - "grad_norm": 1.171875, - "learning_rate": 3.649353422939863e-05, - "loss": 0.1841, - "step": 3490 - }, - { - "epoch": 0.35558264756679875, - "grad_norm": 2.53125, - "learning_rate": 3.6421887301700615e-05, - "loss": 0.1505, - "step": 3500 - }, - { - "epoch": 0.35659859798841814, - "grad_norm": 4.9375, - "learning_rate": 3.6350121619021524e-05, - "loss": 0.2625, - "step": 3510 - }, - { - "epoch": 0.3576145484100376, - "grad_norm": 5.25, - "learning_rate": 3.627823792751936e-05, - "loss": 0.1676, - "step": 3520 - }, - { - "epoch": 0.35863049883165704, - "grad_norm": 1.09375, - "learning_rate": 3.620623697457905e-05, - "loss": 0.1963, - "step": 3530 - }, - { - "epoch": 0.35964644925327643, - "grad_norm": 4.03125, - "learning_rate": 3.613411950880468e-05, - "loss": 0.2048, - "step": 3540 - }, - { - "epoch": 0.3606623996748959, - "grad_norm": 4.40625, - "learning_rate": 3.606188628001178e-05, - "loss": 0.226, - "step": 3550 - }, - { - "epoch": 0.3616783500965153, - "grad_norm": 2.375, - "learning_rate": 3.598953803921947e-05, - "loss": 0.1884, - "step": 3560 - }, - { - "epoch": 0.3626943005181347, - "grad_norm": 3.21875, - "learning_rate": 3.591707553864266e-05, - "loss": 0.224, - "step": 3570 - }, - { - "epoch": 0.3637102509397541, - "grad_norm": 3.5625, - "learning_rate": 3.584449953168423e-05, - "loss": 0.1866, - "step": 3580 - }, - { - "epoch": 0.36472620136137357, - "grad_norm": 2.359375, - "learning_rate": 3.577181077292722e-05, - "loss": 0.1663, - "step": 3590 - }, - { - "epoch": 0.36574215178299296, - "grad_norm": 5.0, - "learning_rate": 3.569901001812696e-05, - "loss": 0.2032, - "step": 3600 - }, - { - "epoch": 0.3667581022046124, - "grad_norm": 1.953125, - "learning_rate": 3.562609802420321e-05, - "loss": 0.2395, - "step": 3610 - }, - { - "epoch": 0.36777405262623186, - "grad_norm": 3.796875, - "learning_rate": 3.555307554923229e-05, - "loss": 0.1799, - "step": 3620 - }, - { - "epoch": 0.36879000304785126, - "grad_norm": 4.4375, - "learning_rate": 3.547994335243925e-05, - "loss": 0.1771, - "step": 3630 - }, - { - "epoch": 0.3698059534694707, - "grad_norm": 1.890625, - "learning_rate": 3.540670219418989e-05, - "loss": 0.2123, - "step": 3640 - }, - { - "epoch": 0.3708219038910901, - "grad_norm": 4.03125, - "learning_rate": 3.53333528359829e-05, - "loss": 0.2159, - "step": 3650 - }, - { - "epoch": 0.37183785431270955, - "grad_norm": 3.265625, - "learning_rate": 3.525989604044198e-05, - "loss": 0.2749, - "step": 3660 - }, - { - "epoch": 0.37285380473432894, - "grad_norm": 1.4375, - "learning_rate": 3.5186332571307826e-05, - "loss": 0.1613, - "step": 3670 - }, - { - "epoch": 0.3738697551559484, - "grad_norm": 3.984375, - "learning_rate": 3.511266319343025e-05, - "loss": 0.1877, - "step": 3680 - }, - { - "epoch": 0.37488570557756784, - "grad_norm": 2.203125, - "learning_rate": 3.503888867276022e-05, - "loss": 0.2185, - "step": 3690 - }, - { - "epoch": 0.37590165599918723, - "grad_norm": 1.5078125, - "learning_rate": 3.4965009776341894e-05, - "loss": 0.2195, - "step": 3700 - }, - { - "epoch": 0.3769176064208067, - "grad_norm": 4.375, - "learning_rate": 3.489102727230461e-05, - "loss": 0.2344, - "step": 3710 - }, - { - "epoch": 0.3779335568424261, - "grad_norm": 2.984375, - "learning_rate": 3.481694192985496e-05, - "loss": 0.1863, - "step": 3720 - }, - { - "epoch": 0.3789495072640455, - "grad_norm": 1.1328125, - "learning_rate": 3.474275451926875e-05, - "loss": 0.1894, - "step": 3730 - }, - { - "epoch": 0.3799654576856649, - "grad_norm": 2.265625, - "learning_rate": 3.4668465811883e-05, - "loss": 0.2127, - "step": 3740 - }, - { - "epoch": 0.38098140810728437, - "grad_norm": 2.921875, - "learning_rate": 3.4594076580087914e-05, - "loss": 0.2125, - "step": 3750 - }, - { - "epoch": 0.38199735852890376, - "grad_norm": 2.390625, - "learning_rate": 3.451958759731889e-05, - "loss": 0.1801, - "step": 3760 - }, - { - "epoch": 0.3830133089505232, - "grad_norm": 3.046875, - "learning_rate": 3.4444999638048456e-05, - "loss": 0.1949, - "step": 3770 - }, - { - "epoch": 0.38402925937214266, - "grad_norm": 2.890625, - "learning_rate": 3.437031347777817e-05, - "loss": 0.2719, - "step": 3780 - }, - { - "epoch": 0.38504520979376206, - "grad_norm": 3.9375, - "learning_rate": 3.4295529893030634e-05, - "loss": 0.1697, - "step": 3790 - }, - { - "epoch": 0.3860611602153815, - "grad_norm": 2.0625, - "learning_rate": 3.422064966134138e-05, - "loss": 0.1557, - "step": 3800 - }, - { - "epoch": 0.3870771106370009, - "grad_norm": 2.234375, - "learning_rate": 3.4145673561250794e-05, - "loss": 0.2129, - "step": 3810 - }, - { - "epoch": 0.38809306105862035, - "grad_norm": 4.96875, - "learning_rate": 3.4070602372296e-05, - "loss": 0.2068, - "step": 3820 - }, - { - "epoch": 0.38910901148023974, - "grad_norm": 2.234375, - "learning_rate": 3.39954368750028e-05, - "loss": 0.1634, - "step": 3830 - }, - { - "epoch": 0.3901249619018592, - "grad_norm": 1.75, - "learning_rate": 3.392017785087752e-05, - "loss": 0.2299, - "step": 3840 - }, - { - "epoch": 0.39114091232347864, - "grad_norm": 3.90625, - "learning_rate": 3.38448260823989e-05, - "loss": 0.1585, - "step": 3850 - }, - { - "epoch": 0.39215686274509803, - "grad_norm": 2.8125, - "learning_rate": 3.376938235300996e-05, - "loss": 0.2382, - "step": 3860 - }, - { - "epoch": 0.3931728131667175, - "grad_norm": 5.375, - "learning_rate": 3.369384744710984e-05, - "loss": 0.1987, - "step": 3870 - }, - { - "epoch": 0.3941887635883369, - "grad_norm": 2.578125, - "learning_rate": 3.361822215004566e-05, - "loss": 0.2316, - "step": 3880 - }, - { - "epoch": 0.3952047140099563, - "grad_norm": 2.0, - "learning_rate": 3.354250724810436e-05, - "loss": 0.2019, - "step": 3890 - }, - { - "epoch": 0.3962206644315757, - "grad_norm": 2.3125, - "learning_rate": 3.34667035285045e-05, - "loss": 0.187, - "step": 3900 - }, - { - "epoch": 0.39723661485319517, - "grad_norm": 3.53125, - "learning_rate": 3.339081177938811e-05, - "loss": 0.2353, - "step": 3910 - }, - { - "epoch": 0.39825256527481456, - "grad_norm": 1.9609375, - "learning_rate": 3.331483278981244e-05, - "loss": 0.2078, - "step": 3920 - }, - { - "epoch": 0.399268515696434, - "grad_norm": 1.2109375, - "learning_rate": 3.323876734974183e-05, - "loss": 0.1761, - "step": 3930 - }, - { - "epoch": 0.40028446611805346, - "grad_norm": 4.0625, - "learning_rate": 3.316261625003943e-05, - "loss": 0.2081, - "step": 3940 - }, - { - "epoch": 0.40130041653967286, - "grad_norm": 1.953125, - "learning_rate": 3.308638028245902e-05, - "loss": 0.2087, - "step": 3950 - }, - { - "epoch": 0.4023163669612923, - "grad_norm": 2.390625, - "learning_rate": 3.301006023963676e-05, - "loss": 0.1579, - "step": 3960 - }, - { - "epoch": 0.4033323173829117, - "grad_norm": 3.53125, - "learning_rate": 3.293365691508295e-05, - "loss": 0.1904, - "step": 3970 - }, - { - "epoch": 0.40434826780453115, - "grad_norm": 3.0, - "learning_rate": 3.285717110317379e-05, - "loss": 0.1991, - "step": 3980 - }, - { - "epoch": 0.40536421822615054, - "grad_norm": 7.21875, - "learning_rate": 3.27806035991431e-05, - "loss": 0.1445, - "step": 3990 - }, - { - "epoch": 0.40638016864777, - "grad_norm": 1.0859375, - "learning_rate": 3.2703955199074075e-05, - "loss": 0.2393, - "step": 4000 - }, - { - "epoch": 0.40739611906938944, - "grad_norm": 4.5625, - "learning_rate": 3.262722669989098e-05, - "loss": 0.1789, - "step": 4010 - }, - { - "epoch": 0.40841206949100883, - "grad_norm": 3.09375, - "learning_rate": 3.255041889935092e-05, - "loss": 0.1511, - "step": 4020 - }, - { - "epoch": 0.4094280199126283, - "grad_norm": 1.90625, - "learning_rate": 3.247353259603547e-05, - "loss": 0.2066, - "step": 4030 - }, - { - "epoch": 0.4104439703342477, - "grad_norm": 2.28125, - "learning_rate": 3.239656858934242e-05, - "loss": 0.1564, - "step": 4040 - }, - { - "epoch": 0.4114599207558671, - "grad_norm": 2.609375, - "learning_rate": 3.231952767947746e-05, - "loss": 0.1503, - "step": 4050 - }, - { - "epoch": 0.4124758711774865, - "grad_norm": 1.4453125, - "learning_rate": 3.2242410667445844e-05, - "loss": 0.1633, - "step": 4060 - }, - { - "epoch": 0.41349182159910597, - "grad_norm": 3.015625, - "learning_rate": 3.2165218355044076e-05, - "loss": 0.1492, - "step": 4070 - }, - { - "epoch": 0.41450777202072536, - "grad_norm": 3.234375, - "learning_rate": 3.2087951544851566e-05, - "loss": 0.3051, - "step": 4080 - }, - { - "epoch": 0.4155237224423448, - "grad_norm": 2.9375, - "learning_rate": 3.20106110402223e-05, - "loss": 0.2229, - "step": 4090 - }, - { - "epoch": 0.41653967286396426, - "grad_norm": 3.171875, - "learning_rate": 3.1933197645276455e-05, - "loss": 0.2224, - "step": 4100 - }, - { - "epoch": 0.41755562328558365, - "grad_norm": 2.09375, - "learning_rate": 3.185571216489209e-05, - "loss": 0.1297, - "step": 4110 - }, - { - "epoch": 0.4185715737072031, - "grad_norm": 3.625, - "learning_rate": 3.177815540469669e-05, - "loss": 0.2074, - "step": 4120 - }, - { - "epoch": 0.4195875241288225, - "grad_norm": 2.296875, - "learning_rate": 3.1700528171058916e-05, - "loss": 0.1949, - "step": 4130 - }, - { - "epoch": 0.42060347455044195, - "grad_norm": 3.8125, - "learning_rate": 3.162283127108011e-05, - "loss": 0.1661, - "step": 4140 - }, - { - "epoch": 0.42161942497206134, - "grad_norm": 2.5, - "learning_rate": 3.154506551258594e-05, - "loss": 0.2275, - "step": 4150 - }, - { - "epoch": 0.4226353753936808, - "grad_norm": 2.96875, - "learning_rate": 3.146723170411804e-05, - "loss": 0.2242, - "step": 4160 - }, - { - "epoch": 0.42365132581530024, - "grad_norm": 6.625, - "learning_rate": 3.138933065492552e-05, - "loss": 0.1897, - "step": 4170 - }, - { - "epoch": 0.42466727623691963, - "grad_norm": 0.8515625, - "learning_rate": 3.131136317495665e-05, - "loss": 0.1629, - "step": 4180 - }, - { - "epoch": 0.4256832266585391, - "grad_norm": 0.94140625, - "learning_rate": 3.1233330074850364e-05, - "loss": 0.1535, - "step": 4190 - }, - { - "epoch": 0.4266991770801585, - "grad_norm": 2.6875, - "learning_rate": 3.115523216592786e-05, - "loss": 0.2494, - "step": 4200 - }, - { - "epoch": 0.4277151275017779, - "grad_norm": 2.578125, - "learning_rate": 3.107707026018417e-05, - "loss": 0.1705, - "step": 4210 - }, - { - "epoch": 0.4287310779233973, - "grad_norm": 3.0625, - "learning_rate": 3.09988451702797e-05, - "loss": 0.1507, - "step": 4220 - }, - { - "epoch": 0.42974702834501677, - "grad_norm": 2.421875, - "learning_rate": 3.0920557709531804e-05, - "loss": 0.3071, - "step": 4230 - }, - { - "epoch": 0.43076297876663616, - "grad_norm": 3.640625, - "learning_rate": 3.0842208691906306e-05, - "loss": 0.199, - "step": 4240 - }, - { - "epoch": 0.4317789291882556, - "grad_norm": 3.5625, - "learning_rate": 3.076379893200904e-05, - "loss": 0.1987, - "step": 4250 - }, - { - "epoch": 0.43279487960987506, - "grad_norm": 3.65625, - "learning_rate": 3.068532924507739e-05, - "loss": 0.1945, - "step": 4260 - }, - { - "epoch": 0.43381083003149445, - "grad_norm": 5.875, - "learning_rate": 3.060680044697183e-05, - "loss": 0.1937, - "step": 4270 - }, - { - "epoch": 0.4348267804531139, - "grad_norm": 2.859375, - "learning_rate": 3.052821335416739e-05, - "loss": 0.1643, - "step": 4280 - }, - { - "epoch": 0.4358427308747333, - "grad_norm": 3.296875, - "learning_rate": 3.0449568783745203e-05, - "loss": 0.1455, - "step": 4290 - }, - { - "epoch": 0.43685868129635275, - "grad_norm": 0.427734375, - "learning_rate": 3.0370867553384023e-05, - "loss": 0.1891, - "step": 4300 - }, - { - "epoch": 0.43787463171797214, - "grad_norm": 0.361328125, - "learning_rate": 3.029211048135171e-05, - "loss": 0.1377, - "step": 4310 - }, - { - "epoch": 0.4388905821395916, - "grad_norm": 1.8203125, - "learning_rate": 3.021329838649668e-05, - "loss": 0.2194, - "step": 4320 - }, - { - "epoch": 0.43990653256121104, - "grad_norm": 1.8828125, - "learning_rate": 3.0134432088239462e-05, - "loss": 0.1915, - "step": 4330 - }, - { - "epoch": 0.44092248298283043, - "grad_norm": 2.015625, - "learning_rate": 3.0055512406564146e-05, - "loss": 0.1794, - "step": 4340 - }, - { - "epoch": 0.4419384334044499, - "grad_norm": 2.546875, - "learning_rate": 2.9976540162009836e-05, - "loss": 0.2154, - "step": 4350 - }, - { - "epoch": 0.4429543838260693, - "grad_norm": 4.09375, - "learning_rate": 2.9897516175662155e-05, - "loss": 0.1861, - "step": 4360 - }, - { - "epoch": 0.4439703342476887, - "grad_norm": 3.953125, - "learning_rate": 2.9818441269144693e-05, - "loss": 0.1857, - "step": 4370 - }, - { - "epoch": 0.4449862846693081, - "grad_norm": 2.234375, - "learning_rate": 2.9739316264610452e-05, - "loss": 0.1493, - "step": 4380 - }, - { - "epoch": 0.44600223509092757, - "grad_norm": 1.109375, - "learning_rate": 2.966014198473332e-05, - "loss": 0.186, - "step": 4390 - }, - { - "epoch": 0.44701818551254696, - "grad_norm": 4.5625, - "learning_rate": 2.9580919252699502e-05, - "loss": 0.1963, - "step": 4400 - }, - { - "epoch": 0.4480341359341664, - "grad_norm": 7.3125, - "learning_rate": 2.9501648892198984e-05, - "loss": 0.2882, - "step": 4410 - }, - { - "epoch": 0.44905008635578586, - "grad_norm": 3.03125, - "learning_rate": 2.942233172741693e-05, - "loss": 0.2154, - "step": 4420 - }, - { - "epoch": 0.45006603677740525, - "grad_norm": 2.421875, - "learning_rate": 2.934296858302515e-05, - "loss": 0.2228, - "step": 4430 - }, - { - "epoch": 0.4510819871990247, - "grad_norm": 1.6015625, - "learning_rate": 2.9263560284173485e-05, - "loss": 0.1637, - "step": 4440 - }, - { - "epoch": 0.4520979376206441, - "grad_norm": 4.5, - "learning_rate": 2.91841076564813e-05, - "loss": 0.1396, - "step": 4450 - }, - { - "epoch": 0.45311388804226355, - "grad_norm": 1.9609375, - "learning_rate": 2.9104611526028808e-05, - "loss": 0.186, - "step": 4460 - }, - { - "epoch": 0.45412983846388294, - "grad_norm": 2.046875, - "learning_rate": 2.902507271934855e-05, - "loss": 0.1706, - "step": 4470 - }, - { - "epoch": 0.4551457888855024, - "grad_norm": 2.390625, - "learning_rate": 2.8945492063416768e-05, - "loss": 0.2191, - "step": 4480 - }, - { - "epoch": 0.45616173930712184, - "grad_norm": 2.734375, - "learning_rate": 2.8865870385644823e-05, - "loss": 0.1651, - "step": 4490 - }, - { - "epoch": 0.45717768972874123, - "grad_norm": 4.4375, - "learning_rate": 2.8786208513870583e-05, - "loss": 0.1907, - "step": 4500 - }, - { - "epoch": 0.4581936401503607, - "grad_norm": 1.9609375, - "learning_rate": 2.8706507276349815e-05, - "loss": 0.2256, - "step": 4510 - }, - { - "epoch": 0.4592095905719801, - "grad_norm": 3.375, - "learning_rate": 2.8626767501747588e-05, - "loss": 0.215, - "step": 4520 - }, - { - "epoch": 0.4602255409935995, - "grad_norm": 2.296875, - "learning_rate": 2.854699001912964e-05, - "loss": 0.2241, - "step": 4530 - }, - { - "epoch": 0.4612414914152189, - "grad_norm": 2.078125, - "learning_rate": 2.846717565795376e-05, - "loss": 0.1541, - "step": 4540 - }, - { - "epoch": 0.46225744183683837, - "grad_norm": 0.81640625, - "learning_rate": 2.8387325248061164e-05, - "loss": 0.1718, - "step": 4550 - }, - { - "epoch": 0.46327339225845776, - "grad_norm": 5.6875, - "learning_rate": 2.8307439619667897e-05, - "loss": 0.259, - "step": 4560 - }, - { - "epoch": 0.4642893426800772, - "grad_norm": 1.78125, - "learning_rate": 2.8227519603356157e-05, - "loss": 0.2205, - "step": 4570 - }, - { - "epoch": 0.46530529310169666, - "grad_norm": 4.78125, - "learning_rate": 2.8147566030065677e-05, - "loss": 0.2256, - "step": 4580 - }, - { - "epoch": 0.46632124352331605, - "grad_norm": 3.296875, - "learning_rate": 2.8067579731085085e-05, - "loss": 0.1671, - "step": 4590 - }, - { - "epoch": 0.4673371939449355, - "grad_norm": 3.265625, - "learning_rate": 2.7987561538043273e-05, - "loss": 0.2471, - "step": 4600 - }, - { - "epoch": 0.4683531443665549, - "grad_norm": 3.390625, - "learning_rate": 2.7907512282900727e-05, - "loss": 0.1749, - "step": 4610 - }, - { - "epoch": 0.46936909478817435, - "grad_norm": 3.140625, - "learning_rate": 2.782743279794091e-05, - "loss": 0.2276, - "step": 4620 - }, - { - "epoch": 0.47038504520979374, - "grad_norm": 2.921875, - "learning_rate": 2.7747323915761574e-05, - "loss": 0.1971, - "step": 4630 - }, - { - "epoch": 0.4714009956314132, - "grad_norm": 4.15625, - "learning_rate": 2.7667186469266122e-05, - "loss": 0.1951, - "step": 4640 - }, - { - "epoch": 0.47241694605303264, - "grad_norm": 2.953125, - "learning_rate": 2.7587021291654924e-05, - "loss": 0.2045, - "step": 4650 - }, - { - "epoch": 0.47343289647465203, - "grad_norm": 1.6640625, - "learning_rate": 2.750682921641672e-05, - "loss": 0.155, - "step": 4660 - }, - { - "epoch": 0.4744488468962715, - "grad_norm": 4.375, - "learning_rate": 2.7426611077319864e-05, - "loss": 0.2038, - "step": 4670 - }, - { - "epoch": 0.4754647973178909, - "grad_norm": 5.5, - "learning_rate": 2.734636770840372e-05, - "loss": 0.159, - "step": 4680 - }, - { - "epoch": 0.4764807477395103, - "grad_norm": 1.703125, - "learning_rate": 2.7266099943969976e-05, - "loss": 0.1566, - "step": 4690 - }, - { - "epoch": 0.4774966981611297, - "grad_norm": 0.81640625, - "learning_rate": 2.7185808618573943e-05, - "loss": 0.1927, - "step": 4700 - }, - { - "epoch": 0.47851264858274917, - "grad_norm": 0.81640625, - "learning_rate": 2.710549456701592e-05, - "loss": 0.1873, - "step": 4710 - }, - { - "epoch": 0.47952859900436856, - "grad_norm": 3.828125, - "learning_rate": 2.702515862433247e-05, - "loss": 0.2474, - "step": 4720 - }, - { - "epoch": 0.480544549425988, - "grad_norm": 1.1640625, - "learning_rate": 2.6944801625787795e-05, - "loss": 0.204, - "step": 4730 - }, - { - "epoch": 0.48156049984760746, - "grad_norm": 2.953125, - "learning_rate": 2.6864424406864984e-05, - "loss": 0.1758, - "step": 4740 - }, - { - "epoch": 0.48257645026922685, - "grad_norm": 3.265625, - "learning_rate": 2.6784027803257377e-05, - "loss": 0.161, - "step": 4750 - }, - { - "epoch": 0.4835924006908463, - "grad_norm": 2.046875, - "learning_rate": 2.6703612650859848e-05, - "loss": 0.1469, - "step": 4760 - }, - { - "epoch": 0.4846083511124657, - "grad_norm": 4.03125, - "learning_rate": 2.6623179785760148e-05, - "loss": 0.1858, - "step": 4770 - }, - { - "epoch": 0.48562430153408515, - "grad_norm": 2.65625, - "learning_rate": 2.6542730044230175e-05, - "loss": 0.176, - "step": 4780 - }, - { - "epoch": 0.48664025195570454, - "grad_norm": 2.59375, - "learning_rate": 2.6462264262717278e-05, - "loss": 0.1657, - "step": 4790 - }, - { - "epoch": 0.487656202377324, - "grad_norm": 4.78125, - "learning_rate": 2.6381783277835605e-05, - "loss": 0.2705, - "step": 4800 - }, - { - "epoch": 0.48867215279894344, - "grad_norm": 3.65625, - "learning_rate": 2.6301287926357355e-05, - "loss": 0.2252, - "step": 4810 - }, - { - "epoch": 0.48968810322056283, - "grad_norm": 0.734375, - "learning_rate": 2.622077904520411e-05, - "loss": 0.2141, - "step": 4820 - }, - { - "epoch": 0.4907040536421823, - "grad_norm": 5.15625, - "learning_rate": 2.6140257471438108e-05, - "loss": 0.1935, - "step": 4830 - }, - { - "epoch": 0.4917200040638017, - "grad_norm": 3.625, - "learning_rate": 2.6059724042253574e-05, - "loss": 0.2121, - "step": 4840 - }, - { - "epoch": 0.4927359544854211, - "grad_norm": 1.2890625, - "learning_rate": 2.5979179594967983e-05, - "loss": 0.1221, - "step": 4850 - }, - { - "epoch": 0.4937519049070405, - "grad_norm": 3.4375, - "learning_rate": 2.5898624967013367e-05, - "loss": 0.2208, - "step": 4860 - }, - { - "epoch": 0.49476785532865997, - "grad_norm": 2.40625, - "learning_rate": 2.5818060995927607e-05, - "loss": 0.1904, - "step": 4870 - }, - { - "epoch": 0.49578380575027936, - "grad_norm": 2.921875, - "learning_rate": 2.573748851934574e-05, - "loss": 0.1658, - "step": 4880 - }, - { - "epoch": 0.4967997561718988, - "grad_norm": 1.6640625, - "learning_rate": 2.5656908374991213e-05, - "loss": 0.1626, - "step": 4890 - }, - { - "epoch": 0.49781570659351826, - "grad_norm": 1.8046875, - "learning_rate": 2.557632140066721e-05, - "loss": 0.1905, - "step": 4900 - }, - { - "epoch": 0.49883165701513765, - "grad_norm": 4.875, - "learning_rate": 2.5495728434247917e-05, - "loss": 0.2591, - "step": 4910 - }, - { - "epoch": 0.4998476074367571, - "grad_norm": 1.4453125, - "learning_rate": 2.5415130313669845e-05, - "loss": 0.1359, - "step": 4920 - }, - { - "epoch": 0.5008635578583766, - "grad_norm": 2.109375, - "learning_rate": 2.5334527876923063e-05, - "loss": 0.2353, - "step": 4930 - }, - { - "epoch": 0.501879508279996, - "grad_norm": 3.546875, - "learning_rate": 2.5253921962042525e-05, - "loss": 0.2173, - "step": 4940 - }, - { - "epoch": 0.5028954587016153, - "grad_norm": 1.8125, - "learning_rate": 2.5173313407099373e-05, - "loss": 0.1631, - "step": 4950 - }, - { - "epoch": 0.5039114091232347, - "grad_norm": 2.671875, - "learning_rate": 2.5092703050192163e-05, - "loss": 0.1884, - "step": 4960 - }, - { - "epoch": 0.5049273595448542, - "grad_norm": 2.5625, - "learning_rate": 2.501209172943819e-05, - "loss": 0.217, - "step": 4970 - }, - { - "epoch": 0.5059433099664736, - "grad_norm": 4.375, - "learning_rate": 2.49314802829648e-05, - "loss": 0.1854, - "step": 4980 - }, - { - "epoch": 0.506959260388093, - "grad_norm": 2.3125, - "learning_rate": 2.4850869548900628e-05, - "loss": 0.2049, - "step": 4990 - }, - { - "epoch": 0.5079752108097125, - "grad_norm": 3.859375, - "learning_rate": 2.477026036536688e-05, - "loss": 0.2093, - "step": 5000 - }, - { - "epoch": 0.5089911612313319, - "grad_norm": 1.09375, - "learning_rate": 2.4689653570468677e-05, - "loss": 0.164, - "step": 5010 - }, - { - "epoch": 0.5100071116529513, - "grad_norm": 3.40625, - "learning_rate": 2.460905000228628e-05, - "loss": 0.1649, - "step": 5020 - }, - { - "epoch": 0.5110230620745707, - "grad_norm": 3.546875, - "learning_rate": 2.4528450498866428e-05, - "loss": 0.1777, - "step": 5030 - }, - { - "epoch": 0.5120390124961902, - "grad_norm": 3.0, - "learning_rate": 2.444785589821356e-05, - "loss": 0.1505, - "step": 5040 - }, - { - "epoch": 0.5130549629178096, - "grad_norm": 1.6484375, - "learning_rate": 2.436726703828118e-05, - "loss": 0.2672, - "step": 5050 - }, - { - "epoch": 0.514070913339429, - "grad_norm": 4.34375, - "learning_rate": 2.428668475696308e-05, - "loss": 0.1756, - "step": 5060 - }, - { - "epoch": 0.5150868637610485, - "grad_norm": 2.78125, - "learning_rate": 2.420610989208465e-05, - "loss": 0.1655, - "step": 5070 - }, - { - "epoch": 0.5161028141826679, - "grad_norm": 1.4609375, - "learning_rate": 2.412554328139419e-05, - "loss": 0.1579, - "step": 5080 - }, - { - "epoch": 0.5171187646042873, - "grad_norm": 2.28125, - "learning_rate": 2.404498576255416e-05, - "loss": 0.1599, - "step": 5090 - }, - { - "epoch": 0.5181347150259067, - "grad_norm": 0.6484375, - "learning_rate": 2.3964438173132522e-05, - "loss": 0.1508, - "step": 5100 - }, - { - "epoch": 0.5191506654475262, - "grad_norm": 3.390625, - "learning_rate": 2.388390135059395e-05, - "loss": 0.1578, - "step": 5110 - }, - { - "epoch": 0.5201666158691456, - "grad_norm": 1.21875, - "learning_rate": 2.3803376132291226e-05, - "loss": 0.1374, - "step": 5120 - }, - { - "epoch": 0.521182566290765, - "grad_norm": 4.0625, - "learning_rate": 2.3722863355456436e-05, - "loss": 0.1854, - "step": 5130 - }, - { - "epoch": 0.5221985167123845, - "grad_norm": 4.71875, - "learning_rate": 2.364236385719236e-05, - "loss": 0.1391, - "step": 5140 - }, - { - "epoch": 0.5232144671340039, - "grad_norm": 3.296875, - "learning_rate": 2.356187847446366e-05, - "loss": 0.2106, - "step": 5150 - }, - { - "epoch": 0.5242304175556233, - "grad_norm": 3.296875, - "learning_rate": 2.348140804408829e-05, - "loss": 0.2383, - "step": 5160 - }, - { - "epoch": 0.5252463679772427, - "grad_norm": 3.359375, - "learning_rate": 2.3400953402728713e-05, - "loss": 0.1537, - "step": 5170 - }, - { - "epoch": 0.5262623183988622, - "grad_norm": 1.4921875, - "learning_rate": 2.332051538688322e-05, - "loss": 0.1841, - "step": 5180 - }, - { - "epoch": 0.5272782688204816, - "grad_norm": 3.25, - "learning_rate": 2.3240094832877287e-05, - "loss": 0.1855, - "step": 5190 - }, - { - "epoch": 0.528294219242101, - "grad_norm": 3.34375, - "learning_rate": 2.3159692576854793e-05, - "loss": 0.2625, - "step": 5200 - }, - { - "epoch": 0.5293101696637205, - "grad_norm": 3.6875, - "learning_rate": 2.3079309454769413e-05, - "loss": 0.1292, - "step": 5210 - }, - { - "epoch": 0.5303261200853399, - "grad_norm": 1.1171875, - "learning_rate": 2.2998946302375827e-05, - "loss": 0.1263, - "step": 5220 - }, - { - "epoch": 0.5313420705069593, - "grad_norm": 2.71875, - "learning_rate": 2.2918603955221148e-05, - "loss": 0.2296, - "step": 5230 - }, - { - "epoch": 0.5323580209285786, - "grad_norm": 2.015625, - "learning_rate": 2.283828324863613e-05, - "loss": 0.1231, - "step": 5240 - }, - { - "epoch": 0.5333739713501982, - "grad_norm": 3.671875, - "learning_rate": 2.2757985017726557e-05, - "loss": 0.1939, - "step": 5250 - }, - { - "epoch": 0.5343899217718175, - "grad_norm": 1.9765625, - "learning_rate": 2.2677710097364495e-05, - "loss": 0.168, - "step": 5260 - }, - { - "epoch": 0.5354058721934369, - "grad_norm": 2.609375, - "learning_rate": 2.259745932217969e-05, - "loss": 0.1883, - "step": 5270 - }, - { - "epoch": 0.5364218226150563, - "grad_norm": 2.8125, - "learning_rate": 2.2517233526550817e-05, - "loss": 0.1898, - "step": 5280 - }, - { - "epoch": 0.5374377730366758, - "grad_norm": 3.125, - "learning_rate": 2.2437033544596837e-05, - "loss": 0.1838, - "step": 5290 - }, - { - "epoch": 0.5384537234582952, - "grad_norm": 4.90625, - "learning_rate": 2.2356860210168336e-05, - "loss": 0.1553, - "step": 5300 - }, - { - "epoch": 0.5394696738799146, - "grad_norm": 3.171875, - "learning_rate": 2.2276714356838824e-05, - "loss": 0.2248, - "step": 5310 - }, - { - "epoch": 0.5404856243015341, - "grad_norm": 1.34375, - "learning_rate": 2.2196596817896118e-05, - "loss": 0.1421, - "step": 5320 - }, - { - "epoch": 0.5415015747231535, - "grad_norm": 3.28125, - "learning_rate": 2.2116508426333596e-05, - "loss": 0.1947, - "step": 5330 - }, - { - "epoch": 0.5425175251447729, - "grad_norm": 1.9296875, - "learning_rate": 2.2036450014841652e-05, - "loss": 0.2207, - "step": 5340 - }, - { - "epoch": 0.5435334755663923, - "grad_norm": 0.5703125, - "learning_rate": 2.19564224157989e-05, - "loss": 0.2208, - "step": 5350 - }, - { - "epoch": 0.5445494259880118, - "grad_norm": 7.5625, - "learning_rate": 2.1876426461263654e-05, - "loss": 0.1739, - "step": 5360 - }, - { - "epoch": 0.5455653764096312, - "grad_norm": 2.15625, - "learning_rate": 2.179646298296519e-05, - "loss": 0.1938, - "step": 5370 - }, - { - "epoch": 0.5465813268312506, - "grad_norm": 4.1875, - "learning_rate": 2.171653281229511e-05, - "loss": 0.1736, - "step": 5380 - }, - { - "epoch": 0.5475972772528701, - "grad_norm": 4.65625, - "learning_rate": 2.1636636780298732e-05, - "loss": 0.2167, - "step": 5390 - }, - { - "epoch": 0.5486132276744895, - "grad_norm": 1.84375, - "learning_rate": 2.1556775717666427e-05, - "loss": 0.1711, - "step": 5400 - }, - { - "epoch": 0.5496291780961089, - "grad_norm": 5.125, - "learning_rate": 2.147695045472499e-05, - "loss": 0.1789, - "step": 5410 - }, - { - "epoch": 0.5506451285177283, - "grad_norm": 3.859375, - "learning_rate": 2.1397161821428973e-05, - "loss": 0.2187, - "step": 5420 - }, - { - "epoch": 0.5516610789393478, - "grad_norm": 2.25, - "learning_rate": 2.131741064735212e-05, - "loss": 0.1367, - "step": 5430 - }, - { - "epoch": 0.5526770293609672, - "grad_norm": 4.65625, - "learning_rate": 2.1237697761678684e-05, - "loss": 0.1574, - "step": 5440 - }, - { - "epoch": 0.5536929797825866, - "grad_norm": 1.2265625, - "learning_rate": 2.1158023993194848e-05, - "loss": 0.1301, - "step": 5450 - }, - { - "epoch": 0.5547089302042061, - "grad_norm": 4.21875, - "learning_rate": 2.107839017028005e-05, - "loss": 0.2782, - "step": 5460 - }, - { - "epoch": 0.5557248806258255, - "grad_norm": 0.52734375, - "learning_rate": 2.0998797120898457e-05, - "loss": 0.2024, - "step": 5470 - }, - { - "epoch": 0.5567408310474449, - "grad_norm": 1.46875, - "learning_rate": 2.0919245672590277e-05, - "loss": 0.1755, - "step": 5480 - }, - { - "epoch": 0.5577567814690643, - "grad_norm": 2.140625, - "learning_rate": 2.083973665246318e-05, - "loss": 0.2058, - "step": 5490 - }, - { - "epoch": 0.5587727318906838, - "grad_norm": 1.5390625, - "learning_rate": 2.076027088718373e-05, - "loss": 0.2159, - "step": 5500 - } - ], - "logging_steps": 10, - "max_steps": 9843, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "total_flos": 0.0, - "train_batch_size": 7, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoints/checkpoint-5500/training_args.bin b/checkpoints/checkpoint-5500/training_args.bin deleted file mode 100644 index 6bb0a4ec10ab29ca1942aecdfa4212d352e373f1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-5500/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 -size 4984 diff --git a/checkpoints/checkpoint-6000/adapter_config.json b/checkpoints/checkpoint-6000/adapter_config.json deleted file mode 100644 index 2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-6000/adapter_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "alpha_pattern": {}, - "auto_mapping": null, - "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": "gaussian", - "layer_replication": null, - "layers_pattern": null, - "layers_to_transform": null, - "loftq_config": {}, - "lora_alpha": 32, - "lora_dropout": 0.1, - "megatron_config": null, - "megatron_core": "megatron.core", - "modules_to_save": null, - "peft_type": "LORA", - "r": 8, - "rank_pattern": {}, - "revision": null, - "target_modules": [ - "v_proj", - "q_proj" - ], - "task_type": "CAUSAL_LM", - "use_dora": false, - "use_rslora": false -} \ No newline at end of file diff --git a/checkpoints/checkpoint-6000/adapter_model.safetensors b/checkpoints/checkpoint-6000/adapter_model.safetensors deleted file mode 100644 index 4952b4876cdc8d06ed1f82e75e7acfb784d7c302..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-6000/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:69f4641ab691c0b74c8e8075b2ec23906859a6fe6d1279160c1d6f7281d2611e -size 5919456 diff --git a/checkpoints/checkpoint-6000/optimizer.pt b/checkpoints/checkpoint-6000/optimizer.pt deleted file mode 100644 index 8a6d428db6bdf0026167613f3f78e4cbb391cf2e..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-6000/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3ccafd81d481629f8476e5ad9a763e445cfca7e7ec2aba5cffe2ed9ccacdb684 -size 11930938 diff --git a/checkpoints/checkpoint-6000/rng_state_0.pth b/checkpoints/checkpoint-6000/rng_state_0.pth deleted file mode 100644 index 856f6e848cfe511c4d3682477fc96768fa71e17b..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-6000/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cf4e27664b653acfc21de70ae172ec0726ec640b898e117c9e038d403049764b -size 15024 diff --git a/checkpoints/checkpoint-6000/rng_state_1.pth b/checkpoints/checkpoint-6000/rng_state_1.pth deleted file mode 100644 index 91439d971c1906d490e43d5ab3edec52712a346c..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-6000/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ee19749a64079934bb3c47b0f06798649c23ef5b2d28a94a6161759a9b11f5ae -size 15024 diff --git a/checkpoints/checkpoint-6000/rng_state_2.pth b/checkpoints/checkpoint-6000/rng_state_2.pth deleted file mode 100644 index 6426756daceb7e8531a5a3124f838933e0855709..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-6000/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a1eb0b946551fdb70140918364d81243b8abd5ef8ab1df8ba0a040cd91d240e5 -size 15024 diff --git a/checkpoints/checkpoint-6000/rng_state_3.pth b/checkpoints/checkpoint-6000/rng_state_3.pth deleted file mode 100644 index 8c11f16945bb9db73e4cb93bc2cc5c8ebd465457..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-6000/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b5684284bf0bd122be00905a7d1688bf38e87d37239ed219e780f5b8d8f5f3eb -size 15024 diff --git a/checkpoints/checkpoint-6000/scheduler.pt b/checkpoints/checkpoint-6000/scheduler.pt deleted file mode 100644 index 6e228e34327b45950a62a960d14420ebe108d897..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-6000/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:202173950488d1b3f4ccd086089dbc7f02c295eb11ea39c10e13fac52ece8f3a -size 1064 diff --git a/checkpoints/checkpoint-6000/trainer_state.json b/checkpoints/checkpoint-6000/trainer_state.json deleted file mode 100644 index 8bfa9cc79f70409d1c63f3a37aa77adcf038e581..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-6000/trainer_state.json +++ /dev/null @@ -1,4221 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.609570252971655, - "eval_steps": 500, - "global_step": 6000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001015950421619425, - "grad_norm": 17.625, - "learning_rate": 5e-06, - "loss": 3.4264, - "step": 10 - }, - { - "epoch": 0.00203190084323885, - "grad_norm": 12.5625, - "learning_rate": 1e-05, - "loss": 3.432, - "step": 20 - }, - { - "epoch": 0.003047851264858275, - "grad_norm": 14.0625, - "learning_rate": 1.5e-05, - "loss": 3.23, - "step": 30 - }, - { - "epoch": 0.0040638016864777, - "grad_norm": 12.4375, - "learning_rate": 2e-05, - "loss": 2.9762, - "step": 40 - }, - { - "epoch": 0.005079752108097125, - "grad_norm": 10.0625, - "learning_rate": 2.5e-05, - "loss": 2.6173, - "step": 50 - }, - { - "epoch": 0.00609570252971655, - "grad_norm": 10.1875, - "learning_rate": 3e-05, - "loss": 2.2004, - "step": 60 - }, - { - "epoch": 0.007111652951335975, - "grad_norm": 7.03125, - "learning_rate": 3.5e-05, - "loss": 1.4176, - "step": 70 - }, - { - "epoch": 0.0081276033729554, - "grad_norm": 4.375, - "learning_rate": 4e-05, - "loss": 1.0122, - "step": 80 - }, - { - "epoch": 0.009143553794574825, - "grad_norm": 6.5625, - "learning_rate": 4.5e-05, - "loss": 0.9116, - "step": 90 - }, - { - "epoch": 0.01015950421619425, - "grad_norm": 5.28125, - "learning_rate": 5e-05, - "loss": 0.6832, - "step": 100 - }, - { - "epoch": 0.011175454637813675, - "grad_norm": 5.5, - "learning_rate": 4.9999870035728426e-05, - "loss": 0.7355, - "step": 110 - }, - { - "epoch": 0.0121914050594331, - "grad_norm": 5.1875, - "learning_rate": 4.9999480144264944e-05, - "loss": 0.6673, - "step": 120 - }, - { - "epoch": 0.013207355481052525, - "grad_norm": 4.5, - "learning_rate": 4.9998830329663314e-05, - "loss": 0.6792, - "step": 130 - }, - { - "epoch": 0.01422330590267195, - "grad_norm": 3.9375, - "learning_rate": 4.9997920598679756e-05, - "loss": 0.6207, - "step": 140 - }, - { - "epoch": 0.015239256324291375, - "grad_norm": 3.15625, - "learning_rate": 4.999675096077286e-05, - "loss": 0.483, - "step": 150 - }, - { - "epoch": 0.0162552067459108, - "grad_norm": 5.28125, - "learning_rate": 4.999532142810354e-05, - "loss": 0.5319, - "step": 160 - }, - { - "epoch": 0.017271157167530225, - "grad_norm": 4.59375, - "learning_rate": 4.999363201553483e-05, - "loss": 0.6052, - "step": 170 - }, - { - "epoch": 0.01828710758914965, - "grad_norm": 5.03125, - "learning_rate": 4.9991682740631794e-05, - "loss": 0.4258, - "step": 180 - }, - { - "epoch": 0.019303058010769075, - "grad_norm": 3.859375, - "learning_rate": 4.998947362366133e-05, - "loss": 0.4309, - "step": 190 - }, - { - "epoch": 0.0203190084323885, - "grad_norm": 3.328125, - "learning_rate": 4.998700468759193e-05, - "loss": 0.3957, - "step": 200 - }, - { - "epoch": 0.021334958854007924, - "grad_norm": 4.9375, - "learning_rate": 4.9984275958093475e-05, - "loss": 0.4777, - "step": 210 - }, - { - "epoch": 0.02235090927562735, - "grad_norm": 4.78125, - "learning_rate": 4.998128746353695e-05, - "loss": 0.3549, - "step": 220 - }, - { - "epoch": 0.023366859697246774, - "grad_norm": 4.0625, - "learning_rate": 4.997803923499417e-05, - "loss": 0.4447, - "step": 230 - }, - { - "epoch": 0.0243828101188662, - "grad_norm": 6.375, - "learning_rate": 4.99745313062374e-05, - "loss": 0.3808, - "step": 240 - }, - { - "epoch": 0.025398760540485624, - "grad_norm": 3.59375, - "learning_rate": 4.99707637137391e-05, - "loss": 0.3827, - "step": 250 - }, - { - "epoch": 0.02641471096210505, - "grad_norm": 3.015625, - "learning_rate": 4.996673649667145e-05, - "loss": 0.3694, - "step": 260 - }, - { - "epoch": 0.027430661383724474, - "grad_norm": 2.296875, - "learning_rate": 4.9962449696906e-05, - "loss": 0.3586, - "step": 270 - }, - { - "epoch": 0.0284466118053439, - "grad_norm": 4.125, - "learning_rate": 4.9957903359013214e-05, - "loss": 0.3832, - "step": 280 - }, - { - "epoch": 0.029462562226963324, - "grad_norm": 3.296875, - "learning_rate": 4.995309753026201e-05, - "loss": 0.328, - "step": 290 - }, - { - "epoch": 0.03047851264858275, - "grad_norm": 4.5, - "learning_rate": 4.994803226061927e-05, - "loss": 0.3667, - "step": 300 - }, - { - "epoch": 0.03149446307020217, - "grad_norm": 4.3125, - "learning_rate": 4.994270760274933e-05, - "loss": 0.3811, - "step": 310 - }, - { - "epoch": 0.0325104134918216, - "grad_norm": 3.421875, - "learning_rate": 4.99371236120134e-05, - "loss": 0.3065, - "step": 320 - }, - { - "epoch": 0.03352636391344102, - "grad_norm": 4.6875, - "learning_rate": 4.993128034646902e-05, - "loss": 0.4177, - "step": 330 - }, - { - "epoch": 0.03454231433506045, - "grad_norm": 3.046875, - "learning_rate": 4.992517786686947e-05, - "loss": 0.33, - "step": 340 - }, - { - "epoch": 0.03555826475667987, - "grad_norm": 1.8828125, - "learning_rate": 4.9918816236663077e-05, - "loss": 0.3287, - "step": 350 - }, - { - "epoch": 0.0365742151782993, - "grad_norm": 3.8125, - "learning_rate": 4.991219552199262e-05, - "loss": 0.2934, - "step": 360 - }, - { - "epoch": 0.03759016559991872, - "grad_norm": 4.28125, - "learning_rate": 4.99053157916946e-05, - "loss": 0.3176, - "step": 370 - }, - { - "epoch": 0.03860611602153815, - "grad_norm": 2.609375, - "learning_rate": 4.989817711729856e-05, - "loss": 0.3318, - "step": 380 - }, - { - "epoch": 0.03962206644315757, - "grad_norm": 2.375, - "learning_rate": 4.98907795730263e-05, - "loss": 0.3234, - "step": 390 - }, - { - "epoch": 0.040638016864777, - "grad_norm": 4.46875, - "learning_rate": 4.988312323579114e-05, - "loss": 0.267, - "step": 400 - }, - { - "epoch": 0.04165396728639642, - "grad_norm": 3.75, - "learning_rate": 4.98752081851971e-05, - "loss": 0.3081, - "step": 410 - }, - { - "epoch": 0.04266991770801585, - "grad_norm": 2.203125, - "learning_rate": 4.986703450353809e-05, - "loss": 0.2917, - "step": 420 - }, - { - "epoch": 0.04368586812963527, - "grad_norm": 1.6015625, - "learning_rate": 4.985860227579703e-05, - "loss": 0.2805, - "step": 430 - }, - { - "epoch": 0.0447018185512547, - "grad_norm": 3.140625, - "learning_rate": 4.984991158964499e-05, - "loss": 0.3534, - "step": 440 - }, - { - "epoch": 0.04571776897287412, - "grad_norm": 3.296875, - "learning_rate": 4.9840962535440265e-05, - "loss": 0.335, - "step": 450 - }, - { - "epoch": 0.04673371939449355, - "grad_norm": 3.25, - "learning_rate": 4.983175520622744e-05, - "loss": 0.2544, - "step": 460 - }, - { - "epoch": 0.04774966981611297, - "grad_norm": 2.25, - "learning_rate": 4.982228969773642e-05, - "loss": 0.3449, - "step": 470 - }, - { - "epoch": 0.0487656202377324, - "grad_norm": 4.9375, - "learning_rate": 4.9812566108381435e-05, - "loss": 0.2964, - "step": 480 - }, - { - "epoch": 0.04978157065935182, - "grad_norm": 1.5703125, - "learning_rate": 4.9802584539260035e-05, - "loss": 0.2799, - "step": 490 - }, - { - "epoch": 0.05079752108097125, - "grad_norm": 2.828125, - "learning_rate": 4.979234509415199e-05, - "loss": 0.3231, - "step": 500 - }, - { - "epoch": 0.05181347150259067, - "grad_norm": 2.9375, - "learning_rate": 4.978184787951828e-05, - "loss": 0.2943, - "step": 510 - }, - { - "epoch": 0.0528294219242101, - "grad_norm": 2.34375, - "learning_rate": 4.977109300449992e-05, - "loss": 0.2705, - "step": 520 - }, - { - "epoch": 0.05384537234582952, - "grad_norm": 3.140625, - "learning_rate": 4.9760080580916876e-05, - "loss": 0.2998, - "step": 530 - }, - { - "epoch": 0.05486132276744895, - "grad_norm": 3.5625, - "learning_rate": 4.974881072326688e-05, - "loss": 0.2595, - "step": 540 - }, - { - "epoch": 0.05587727318906837, - "grad_norm": 4.25, - "learning_rate": 4.9737283548724236e-05, - "loss": 0.2803, - "step": 550 - }, - { - "epoch": 0.0568932236106878, - "grad_norm": 4.0625, - "learning_rate": 4.97254991771386e-05, - "loss": 0.3511, - "step": 560 - }, - { - "epoch": 0.05790917403230722, - "grad_norm": 2.515625, - "learning_rate": 4.971345773103377e-05, - "loss": 0.312, - "step": 570 - }, - { - "epoch": 0.05892512445392665, - "grad_norm": 3.21875, - "learning_rate": 4.9701159335606365e-05, - "loss": 0.2482, - "step": 580 - }, - { - "epoch": 0.05994107487554607, - "grad_norm": 5.5, - "learning_rate": 4.968860411872454e-05, - "loss": 0.2537, - "step": 590 - }, - { - "epoch": 0.0609570252971655, - "grad_norm": 3.546875, - "learning_rate": 4.967579221092666e-05, - "loss": 0.3125, - "step": 600 - }, - { - "epoch": 0.06197297571878492, - "grad_norm": 2.984375, - "learning_rate": 4.966272374541996e-05, - "loss": 0.2354, - "step": 610 - }, - { - "epoch": 0.06298892614040434, - "grad_norm": 3.6875, - "learning_rate": 4.964939885807912e-05, - "loss": 0.3213, - "step": 620 - }, - { - "epoch": 0.06400487656202378, - "grad_norm": 2.140625, - "learning_rate": 4.9635817687444876e-05, - "loss": 0.3003, - "step": 630 - }, - { - "epoch": 0.0650208269836432, - "grad_norm": 3.484375, - "learning_rate": 4.962198037472259e-05, - "loss": 0.2996, - "step": 640 - }, - { - "epoch": 0.06603677740526262, - "grad_norm": 3.21875, - "learning_rate": 4.9607887063780776e-05, - "loss": 0.2257, - "step": 650 - }, - { - "epoch": 0.06705272782688204, - "grad_norm": 5.375, - "learning_rate": 4.9593537901149564e-05, - "loss": 0.223, - "step": 660 - }, - { - "epoch": 0.06806867824850148, - "grad_norm": 4.1875, - "learning_rate": 4.957893303601924e-05, - "loss": 0.3407, - "step": 670 - }, - { - "epoch": 0.0690846286701209, - "grad_norm": 3.328125, - "learning_rate": 4.956407262023866e-05, - "loss": 0.2589, - "step": 680 - }, - { - "epoch": 0.07010057909174032, - "grad_norm": 2.953125, - "learning_rate": 4.954895680831367e-05, - "loss": 0.2949, - "step": 690 - }, - { - "epoch": 0.07111652951335974, - "grad_norm": 4.0625, - "learning_rate": 4.9533585757405506e-05, - "loss": 0.2995, - "step": 700 - }, - { - "epoch": 0.07213247993497918, - "grad_norm": 4.625, - "learning_rate": 4.951795962732917e-05, - "loss": 0.2894, - "step": 710 - }, - { - "epoch": 0.0731484303565986, - "grad_norm": 3.0, - "learning_rate": 4.9502078580551755e-05, - "loss": 0.3082, - "step": 720 - }, - { - "epoch": 0.07416438077821802, - "grad_norm": 3.65625, - "learning_rate": 4.9485942782190734e-05, - "loss": 0.2308, - "step": 730 - }, - { - "epoch": 0.07518033119983744, - "grad_norm": 4.78125, - "learning_rate": 4.9469552400012306e-05, - "loss": 0.2272, - "step": 740 - }, - { - "epoch": 0.07619628162145688, - "grad_norm": 4.25, - "learning_rate": 4.94529076044296e-05, - "loss": 0.2701, - "step": 750 - }, - { - "epoch": 0.0772122320430763, - "grad_norm": 3.140625, - "learning_rate": 4.94360085685009e-05, - "loss": 0.2686, - "step": 760 - }, - { - "epoch": 0.07822818246469572, - "grad_norm": 0.765625, - "learning_rate": 4.9418855467927894e-05, - "loss": 0.2051, - "step": 770 - }, - { - "epoch": 0.07924413288631514, - "grad_norm": 1.796875, - "learning_rate": 4.940144848105379e-05, - "loss": 0.2267, - "step": 780 - }, - { - "epoch": 0.08026008330793458, - "grad_norm": 4.5625, - "learning_rate": 4.93837877888615e-05, - "loss": 0.2597, - "step": 790 - }, - { - "epoch": 0.081276033729554, - "grad_norm": 3.03125, - "learning_rate": 4.9365873574971745e-05, - "loss": 0.3701, - "step": 800 - }, - { - "epoch": 0.08229198415117342, - "grad_norm": 4.5625, - "learning_rate": 4.9347706025641136e-05, - "loss": 0.2559, - "step": 810 - }, - { - "epoch": 0.08330793457279284, - "grad_norm": 3.90625, - "learning_rate": 4.9329285329760275e-05, - "loss": 0.2799, - "step": 820 - }, - { - "epoch": 0.08432388499441228, - "grad_norm": 3.140625, - "learning_rate": 4.9310611678851735e-05, - "loss": 0.2866, - "step": 830 - }, - { - "epoch": 0.0853398354160317, - "grad_norm": 2.46875, - "learning_rate": 4.929168526706811e-05, - "loss": 0.3105, - "step": 840 - }, - { - "epoch": 0.08635578583765112, - "grad_norm": 13.625, - "learning_rate": 4.927250629119e-05, - "loss": 0.2454, - "step": 850 - }, - { - "epoch": 0.08737173625927054, - "grad_norm": 3.921875, - "learning_rate": 4.9253074950623925e-05, - "loss": 0.2424, - "step": 860 - }, - { - "epoch": 0.08838768668088998, - "grad_norm": 2.90625, - "learning_rate": 4.9233391447400286e-05, - "loss": 0.2481, - "step": 870 - }, - { - "epoch": 0.0894036371025094, - "grad_norm": 2.96875, - "learning_rate": 4.921345598617125e-05, - "loss": 0.2231, - "step": 880 - }, - { - "epoch": 0.09041958752412882, - "grad_norm": 5.375, - "learning_rate": 4.9193268774208654e-05, - "loss": 0.3447, - "step": 890 - }, - { - "epoch": 0.09143553794574824, - "grad_norm": 2.0, - "learning_rate": 4.9172830021401785e-05, - "loss": 0.229, - "step": 900 - }, - { - "epoch": 0.09245148836736768, - "grad_norm": 3.1875, - "learning_rate": 4.9152139940255245e-05, - "loss": 0.2122, - "step": 910 - }, - { - "epoch": 0.0934674387889871, - "grad_norm": 3.40625, - "learning_rate": 4.913119874588677e-05, - "loss": 0.2386, - "step": 920 - }, - { - "epoch": 0.09448338921060652, - "grad_norm": 1.4609375, - "learning_rate": 4.911000665602489e-05, - "loss": 0.1944, - "step": 930 - }, - { - "epoch": 0.09549933963222594, - "grad_norm": 5.0625, - "learning_rate": 4.9088563891006786e-05, - "loss": 0.2038, - "step": 940 - }, - { - "epoch": 0.09651529005384538, - "grad_norm": 4.53125, - "learning_rate": 4.906687067377592e-05, - "loss": 0.3122, - "step": 950 - }, - { - "epoch": 0.0975312404754648, - "grad_norm": 2.84375, - "learning_rate": 4.904492722987976e-05, - "loss": 0.3157, - "step": 960 - }, - { - "epoch": 0.09854719089708422, - "grad_norm": 2.171875, - "learning_rate": 4.902273378746738e-05, - "loss": 0.3077, - "step": 970 - }, - { - "epoch": 0.09956314131870364, - "grad_norm": 2.84375, - "learning_rate": 4.9000290577287165e-05, - "loss": 0.2756, - "step": 980 - }, - { - "epoch": 0.10057909174032308, - "grad_norm": 0.99609375, - "learning_rate": 4.897759783268434e-05, - "loss": 0.2915, - "step": 990 - }, - { - "epoch": 0.1015950421619425, - "grad_norm": 3.53125, - "learning_rate": 4.895465578959859e-05, - "loss": 0.2052, - "step": 1000 - }, - { - "epoch": 0.10261099258356192, - "grad_norm": 4.0, - "learning_rate": 4.893146468656159e-05, - "loss": 0.2499, - "step": 1010 - }, - { - "epoch": 0.10362694300518134, - "grad_norm": 1.65625, - "learning_rate": 4.890802476469452e-05, - "loss": 0.278, - "step": 1020 - }, - { - "epoch": 0.10464289342680078, - "grad_norm": 3.625, - "learning_rate": 4.888433626770558e-05, - "loss": 0.2143, - "step": 1030 - }, - { - "epoch": 0.1056588438484202, - "grad_norm": 5.0625, - "learning_rate": 4.886039944188741e-05, - "loss": 0.2878, - "step": 1040 - }, - { - "epoch": 0.10667479427003962, - "grad_norm": 4.5, - "learning_rate": 4.883621453611461e-05, - "loss": 0.2744, - "step": 1050 - }, - { - "epoch": 0.10769074469165904, - "grad_norm": 4.5625, - "learning_rate": 4.881178180184106e-05, - "loss": 0.2734, - "step": 1060 - }, - { - "epoch": 0.10870669511327848, - "grad_norm": 3.125, - "learning_rate": 4.878710149309735e-05, - "loss": 0.3574, - "step": 1070 - }, - { - "epoch": 0.1097226455348979, - "grad_norm": 3.0625, - "learning_rate": 4.876217386648816e-05, - "loss": 0.2625, - "step": 1080 - }, - { - "epoch": 0.11073859595651732, - "grad_norm": 4.0625, - "learning_rate": 4.873699918118955e-05, - "loss": 0.2437, - "step": 1090 - }, - { - "epoch": 0.11175454637813674, - "grad_norm": 1.59375, - "learning_rate": 4.87115776989463e-05, - "loss": 0.2051, - "step": 1100 - }, - { - "epoch": 0.11277049679975618, - "grad_norm": 4.375, - "learning_rate": 4.8685909684069153e-05, - "loss": 0.1727, - "step": 1110 - }, - { - "epoch": 0.1137864472213756, - "grad_norm": 2.28125, - "learning_rate": 4.865999540343211e-05, - "loss": 0.2256, - "step": 1120 - }, - { - "epoch": 0.11480239764299502, - "grad_norm": 2.265625, - "learning_rate": 4.86338351264696e-05, - "loss": 0.3529, - "step": 1130 - }, - { - "epoch": 0.11581834806461444, - "grad_norm": 2.34375, - "learning_rate": 4.8607429125173754e-05, - "loss": 0.2113, - "step": 1140 - }, - { - "epoch": 0.11683429848623388, - "grad_norm": 0.7578125, - "learning_rate": 4.858077767409149e-05, - "loss": 0.2759, - "step": 1150 - }, - { - "epoch": 0.1178502489078533, - "grad_norm": 3.640625, - "learning_rate": 4.855388105032174e-05, - "loss": 0.2482, - "step": 1160 - }, - { - "epoch": 0.11886619932947272, - "grad_norm": 3.5, - "learning_rate": 4.852673953351249e-05, - "loss": 0.1865, - "step": 1170 - }, - { - "epoch": 0.11988214975109214, - "grad_norm": 3.75, - "learning_rate": 4.849935340585796e-05, - "loss": 0.2659, - "step": 1180 - }, - { - "epoch": 0.12089810017271158, - "grad_norm": 3.375, - "learning_rate": 4.8471722952095586e-05, - "loss": 0.1506, - "step": 1190 - }, - { - "epoch": 0.121914050594331, - "grad_norm": 3.34375, - "learning_rate": 4.844384845950312e-05, - "loss": 0.307, - "step": 1200 - }, - { - "epoch": 0.12293000101595042, - "grad_norm": 1.578125, - "learning_rate": 4.841573021789561e-05, - "loss": 0.1952, - "step": 1210 - }, - { - "epoch": 0.12394595143756984, - "grad_norm": 1.2890625, - "learning_rate": 4.838736851962239e-05, - "loss": 0.1779, - "step": 1220 - }, - { - "epoch": 0.12496190185918928, - "grad_norm": 1.265625, - "learning_rate": 4.835876365956408e-05, - "loss": 0.1235, - "step": 1230 - }, - { - "epoch": 0.12597785228080868, - "grad_norm": 1.9609375, - "learning_rate": 4.8329915935129436e-05, - "loss": 0.1876, - "step": 1240 - }, - { - "epoch": 0.12699380270242813, - "grad_norm": 1.6328125, - "learning_rate": 4.830082564625235e-05, - "loss": 0.2188, - "step": 1250 - }, - { - "epoch": 0.12800975312404755, - "grad_norm": 3.96875, - "learning_rate": 4.8271493095388684e-05, - "loss": 0.2622, - "step": 1260 - }, - { - "epoch": 0.12902570354566698, - "grad_norm": 3.765625, - "learning_rate": 4.824191858751312e-05, - "loss": 0.2724, - "step": 1270 - }, - { - "epoch": 0.1300416539672864, - "grad_norm": 5.59375, - "learning_rate": 4.821210243011601e-05, - "loss": 0.2413, - "step": 1280 - }, - { - "epoch": 0.13105760438890582, - "grad_norm": 3.34375, - "learning_rate": 4.818204493320016e-05, - "loss": 0.2618, - "step": 1290 - }, - { - "epoch": 0.13207355481052524, - "grad_norm": 2.78125, - "learning_rate": 4.8151746409277634e-05, - "loss": 0.2295, - "step": 1300 - }, - { - "epoch": 0.13308950523214466, - "grad_norm": 3.1875, - "learning_rate": 4.8121207173366484e-05, - "loss": 0.2733, - "step": 1310 - }, - { - "epoch": 0.13410545565376408, - "grad_norm": 2.28125, - "learning_rate": 4.809042754298746e-05, - "loss": 0.2311, - "step": 1320 - }, - { - "epoch": 0.13512140607538353, - "grad_norm": 2.171875, - "learning_rate": 4.805940783816075e-05, - "loss": 0.2059, - "step": 1330 - }, - { - "epoch": 0.13613735649700295, - "grad_norm": 2.796875, - "learning_rate": 4.8028148381402625e-05, - "loss": 0.2102, - "step": 1340 - }, - { - "epoch": 0.13715330691862238, - "grad_norm": 2.96875, - "learning_rate": 4.7996649497722084e-05, - "loss": 0.2708, - "step": 1350 - }, - { - "epoch": 0.1381692573402418, - "grad_norm": 2.4375, - "learning_rate": 4.7964911514617485e-05, - "loss": 0.2429, - "step": 1360 - }, - { - "epoch": 0.13918520776186122, - "grad_norm": 5.8125, - "learning_rate": 4.793293476207312e-05, - "loss": 0.2725, - "step": 1370 - }, - { - "epoch": 0.14020115818348064, - "grad_norm": 2.40625, - "learning_rate": 4.790071957255585e-05, - "loss": 0.2098, - "step": 1380 - }, - { - "epoch": 0.14121710860510006, - "grad_norm": 4.25, - "learning_rate": 4.786826628101154e-05, - "loss": 0.2101, - "step": 1390 - }, - { - "epoch": 0.14223305902671948, - "grad_norm": 2.578125, - "learning_rate": 4.783557522486167e-05, - "loss": 0.2624, - "step": 1400 - }, - { - "epoch": 0.14324900944833893, - "grad_norm": 3.125, - "learning_rate": 4.780264674399978e-05, - "loss": 0.2518, - "step": 1410 - }, - { - "epoch": 0.14426495986995835, - "grad_norm": 3.671875, - "learning_rate": 4.7769481180787966e-05, - "loss": 0.3112, - "step": 1420 - }, - { - "epoch": 0.14528091029157778, - "grad_norm": 3.984375, - "learning_rate": 4.773607888005327e-05, - "loss": 0.2747, - "step": 1430 - }, - { - "epoch": 0.1462968607131972, - "grad_norm": 3.234375, - "learning_rate": 4.770244018908416e-05, - "loss": 0.1572, - "step": 1440 - }, - { - "epoch": 0.14731281113481662, - "grad_norm": 4.09375, - "learning_rate": 4.766856545762687e-05, - "loss": 0.2148, - "step": 1450 - }, - { - "epoch": 0.14832876155643604, - "grad_norm": 1.6875, - "learning_rate": 4.763445503788178e-05, - "loss": 0.2531, - "step": 1460 - }, - { - "epoch": 0.14934471197805546, - "grad_norm": 2.375, - "learning_rate": 4.760010928449976e-05, - "loss": 0.199, - "step": 1470 - }, - { - "epoch": 0.15036066239967488, - "grad_norm": 4.6875, - "learning_rate": 4.7565528554578485e-05, - "loss": 0.2366, - "step": 1480 - }, - { - "epoch": 0.15137661282129433, - "grad_norm": 5.4375, - "learning_rate": 4.75307132076587e-05, - "loss": 0.1862, - "step": 1490 - }, - { - "epoch": 0.15239256324291375, - "grad_norm": 2.484375, - "learning_rate": 4.749566360572049e-05, - "loss": 0.2143, - "step": 1500 - }, - { - "epoch": 0.15340851366453317, - "grad_norm": 2.1875, - "learning_rate": 4.746038011317955e-05, - "loss": 0.1877, - "step": 1510 - }, - { - "epoch": 0.1544244640861526, - "grad_norm": 2.84375, - "learning_rate": 4.742486309688333e-05, - "loss": 0.2831, - "step": 1520 - }, - { - "epoch": 0.15544041450777202, - "grad_norm": 2.015625, - "learning_rate": 4.738911292610732e-05, - "loss": 0.1708, - "step": 1530 - }, - { - "epoch": 0.15645636492939144, - "grad_norm": 3.953125, - "learning_rate": 4.735312997255107e-05, - "loss": 0.192, - "step": 1540 - }, - { - "epoch": 0.15747231535101086, - "grad_norm": 2.09375, - "learning_rate": 4.7316914610334475e-05, - "loss": 0.2586, - "step": 1550 - }, - { - "epoch": 0.15848826577263028, - "grad_norm": 3.6875, - "learning_rate": 4.728046721599378e-05, - "loss": 0.2141, - "step": 1560 - }, - { - "epoch": 0.15950421619424973, - "grad_norm": 2.9375, - "learning_rate": 4.724378816847771e-05, - "loss": 0.193, - "step": 1570 - }, - { - "epoch": 0.16052016661586915, - "grad_norm": 1.5625, - "learning_rate": 4.720687784914352e-05, - "loss": 0.191, - "step": 1580 - }, - { - "epoch": 0.16153611703748857, - "grad_norm": 3.75, - "learning_rate": 4.716973664175304e-05, - "loss": 0.2172, - "step": 1590 - }, - { - "epoch": 0.162552067459108, - "grad_norm": 3.125, - "learning_rate": 4.7132364932468645e-05, - "loss": 0.2134, - "step": 1600 - }, - { - "epoch": 0.16356801788072742, - "grad_norm": 4.09375, - "learning_rate": 4.709476310984932e-05, - "loss": 0.2055, - "step": 1610 - }, - { - "epoch": 0.16458396830234684, - "grad_norm": 3.875, - "learning_rate": 4.705693156484652e-05, - "loss": 0.2136, - "step": 1620 - }, - { - "epoch": 0.16559991872396626, - "grad_norm": 1.1796875, - "learning_rate": 4.7018870690800196e-05, - "loss": 0.1471, - "step": 1630 - }, - { - "epoch": 0.16661586914558568, - "grad_norm": 2.5, - "learning_rate": 4.698058088343465e-05, - "loss": 0.2308, - "step": 1640 - }, - { - "epoch": 0.16763181956720513, - "grad_norm": 1.390625, - "learning_rate": 4.6942062540854425e-05, - "loss": 0.2456, - "step": 1650 - }, - { - "epoch": 0.16864776998882455, - "grad_norm": 3.125, - "learning_rate": 4.69033160635402e-05, - "loss": 0.2654, - "step": 1660 - }, - { - "epoch": 0.16966372041044397, - "grad_norm": 3.984375, - "learning_rate": 4.6864341854344587e-05, - "loss": 0.2226, - "step": 1670 - }, - { - "epoch": 0.1706796708320634, - "grad_norm": 2.328125, - "learning_rate": 4.682514031848795e-05, - "loss": 0.2438, - "step": 1680 - }, - { - "epoch": 0.17169562125368282, - "grad_norm": 3.078125, - "learning_rate": 4.678571186355423e-05, - "loss": 0.1889, - "step": 1690 - }, - { - "epoch": 0.17271157167530224, - "grad_norm": 3.328125, - "learning_rate": 4.6746056899486644e-05, - "loss": 0.2117, - "step": 1700 - }, - { - "epoch": 0.17372752209692166, - "grad_norm": 2.78125, - "learning_rate": 4.67061758385835e-05, - "loss": 0.1953, - "step": 1710 - }, - { - "epoch": 0.17474347251854108, - "grad_norm": 3.09375, - "learning_rate": 4.6666069095493816e-05, - "loss": 0.1844, - "step": 1720 - }, - { - "epoch": 0.17575942294016053, - "grad_norm": 3.234375, - "learning_rate": 4.662573708721309e-05, - "loss": 0.2774, - "step": 1730 - }, - { - "epoch": 0.17677537336177995, - "grad_norm": 4.03125, - "learning_rate": 4.658518023307894e-05, - "loss": 0.2527, - "step": 1740 - }, - { - "epoch": 0.17779132378339937, - "grad_norm": 3.21875, - "learning_rate": 4.654439895476671e-05, - "loss": 0.2164, - "step": 1750 - }, - { - "epoch": 0.1788072742050188, - "grad_norm": 2.390625, - "learning_rate": 4.6503393676285146e-05, - "loss": 0.2424, - "step": 1760 - }, - { - "epoch": 0.17982322462663822, - "grad_norm": 1.8359375, - "learning_rate": 4.646216482397192e-05, - "loss": 0.2428, - "step": 1770 - }, - { - "epoch": 0.18083917504825764, - "grad_norm": 2.796875, - "learning_rate": 4.6420712826489275e-05, - "loss": 0.2155, - "step": 1780 - }, - { - "epoch": 0.18185512546987706, - "grad_norm": 0.69921875, - "learning_rate": 4.6379038114819485e-05, - "loss": 0.1544, - "step": 1790 - }, - { - "epoch": 0.18287107589149648, - "grad_norm": 3.40625, - "learning_rate": 4.6337141122260444e-05, - "loss": 0.2029, - "step": 1800 - }, - { - "epoch": 0.18388702631311593, - "grad_norm": 2.359375, - "learning_rate": 4.629502228442112e-05, - "loss": 0.1489, - "step": 1810 - }, - { - "epoch": 0.18490297673473535, - "grad_norm": 1.4453125, - "learning_rate": 4.6252682039217045e-05, - "loss": 0.2101, - "step": 1820 - }, - { - "epoch": 0.18591892715635477, - "grad_norm": 2.71875, - "learning_rate": 4.621012082686573e-05, - "loss": 0.2076, - "step": 1830 - }, - { - "epoch": 0.1869348775779742, - "grad_norm": 3.0625, - "learning_rate": 4.616733908988216e-05, - "loss": 0.2719, - "step": 1840 - }, - { - "epoch": 0.18795082799959362, - "grad_norm": 1.953125, - "learning_rate": 4.612433727307409e-05, - "loss": 0.2105, - "step": 1850 - }, - { - "epoch": 0.18896677842121304, - "grad_norm": 3.46875, - "learning_rate": 4.608111582353751e-05, - "loss": 0.1877, - "step": 1860 - }, - { - "epoch": 0.18998272884283246, - "grad_norm": 2.546875, - "learning_rate": 4.603767519065197e-05, - "loss": 0.2238, - "step": 1870 - }, - { - "epoch": 0.19099867926445188, - "grad_norm": 1.5703125, - "learning_rate": 4.599401582607589e-05, - "loss": 0.243, - "step": 1880 - }, - { - "epoch": 0.19201462968607133, - "grad_norm": 2.4375, - "learning_rate": 4.595013818374185e-05, - "loss": 0.1867, - "step": 1890 - }, - { - "epoch": 0.19303058010769075, - "grad_norm": 2.203125, - "learning_rate": 4.5906042719851925e-05, - "loss": 0.1994, - "step": 1900 - }, - { - "epoch": 0.19404653052931017, - "grad_norm": 3.984375, - "learning_rate": 4.586172989287291e-05, - "loss": 0.1899, - "step": 1910 - }, - { - "epoch": 0.1950624809509296, - "grad_norm": 2.6875, - "learning_rate": 4.5817200163531534e-05, - "loss": 0.2528, - "step": 1920 - }, - { - "epoch": 0.19607843137254902, - "grad_norm": 2.71875, - "learning_rate": 4.577245399480972e-05, - "loss": 0.2336, - "step": 1930 - }, - { - "epoch": 0.19709438179416844, - "grad_norm": 2.640625, - "learning_rate": 4.5727491851939715e-05, - "loss": 0.2204, - "step": 1940 - }, - { - "epoch": 0.19811033221578786, - "grad_norm": 1.78125, - "learning_rate": 4.568231420239929e-05, - "loss": 0.1656, - "step": 1950 - }, - { - "epoch": 0.19912628263740728, - "grad_norm": 3.15625, - "learning_rate": 4.563692151590687e-05, - "loss": 0.2105, - "step": 1960 - }, - { - "epoch": 0.20014223305902673, - "grad_norm": 1.3671875, - "learning_rate": 4.5591314264416666e-05, - "loss": 0.1464, - "step": 1970 - }, - { - "epoch": 0.20115818348064615, - "grad_norm": 4.25, - "learning_rate": 4.554549292211371e-05, - "loss": 0.2103, - "step": 1980 - }, - { - "epoch": 0.20217413390226557, - "grad_norm": 2.984375, - "learning_rate": 4.549945796540901e-05, - "loss": 0.144, - "step": 1990 - }, - { - "epoch": 0.203190084323885, - "grad_norm": 1.859375, - "learning_rate": 4.545320987293453e-05, - "loss": 0.1963, - "step": 2000 - }, - { - "epoch": 0.20420603474550442, - "grad_norm": 1.078125, - "learning_rate": 4.540674912553824e-05, - "loss": 0.2115, - "step": 2010 - }, - { - "epoch": 0.20522198516712384, - "grad_norm": 4.25, - "learning_rate": 4.536007620627911e-05, - "loss": 0.1682, - "step": 2020 - }, - { - "epoch": 0.20623793558874326, - "grad_norm": 2.71875, - "learning_rate": 4.531319160042212e-05, - "loss": 0.1992, - "step": 2030 - }, - { - "epoch": 0.20725388601036268, - "grad_norm": 1.2890625, - "learning_rate": 4.5266095795433126e-05, - "loss": 0.1134, - "step": 2040 - }, - { - "epoch": 0.20826983643198213, - "grad_norm": 3.296875, - "learning_rate": 4.5218789280973925e-05, - "loss": 0.1474, - "step": 2050 - }, - { - "epoch": 0.20928578685360155, - "grad_norm": 1.9375, - "learning_rate": 4.5171272548897024e-05, - "loss": 0.1955, - "step": 2060 - }, - { - "epoch": 0.21030173727522097, - "grad_norm": 2.734375, - "learning_rate": 4.512354609324063e-05, - "loss": 0.2042, - "step": 2070 - }, - { - "epoch": 0.2113176876968404, - "grad_norm": 2.921875, - "learning_rate": 4.507561041022347e-05, - "loss": 0.2174, - "step": 2080 - }, - { - "epoch": 0.21233363811845982, - "grad_norm": 2.40625, - "learning_rate": 4.502746599823963e-05, - "loss": 0.2634, - "step": 2090 - }, - { - "epoch": 0.21334958854007924, - "grad_norm": 1.71875, - "learning_rate": 4.497911335785339e-05, - "loss": 0.1884, - "step": 2100 - }, - { - "epoch": 0.21436553896169866, - "grad_norm": 0.79296875, - "learning_rate": 4.4930552991794e-05, - "loss": 0.1872, - "step": 2110 - }, - { - "epoch": 0.21538148938331808, - "grad_norm": 3.171875, - "learning_rate": 4.4881785404950474e-05, - "loss": 0.2233, - "step": 2120 - }, - { - "epoch": 0.21639743980493753, - "grad_norm": 2.59375, - "learning_rate": 4.483281110436631e-05, - "loss": 0.2374, - "step": 2130 - }, - { - "epoch": 0.21741339022655695, - "grad_norm": 3.328125, - "learning_rate": 4.478363059923426e-05, - "loss": 0.2545, - "step": 2140 - }, - { - "epoch": 0.21842934064817637, - "grad_norm": 2.3125, - "learning_rate": 4.4734244400891014e-05, - "loss": 0.2063, - "step": 2150 - }, - { - "epoch": 0.2194452910697958, - "grad_norm": 3.40625, - "learning_rate": 4.4684653022811865e-05, - "loss": 0.1219, - "step": 2160 - }, - { - "epoch": 0.22046124149141522, - "grad_norm": 4.1875, - "learning_rate": 4.463485698060541e-05, - "loss": 0.2805, - "step": 2170 - }, - { - "epoch": 0.22147719191303464, - "grad_norm": 2.3125, - "learning_rate": 4.458485679200814e-05, - "loss": 0.1998, - "step": 2180 - }, - { - "epoch": 0.22249314233465406, - "grad_norm": 3.578125, - "learning_rate": 4.453465297687912e-05, - "loss": 0.2489, - "step": 2190 - }, - { - "epoch": 0.22350909275627348, - "grad_norm": 2.59375, - "learning_rate": 4.448424605719452e-05, - "loss": 0.2731, - "step": 2200 - }, - { - "epoch": 0.22452504317789293, - "grad_norm": 3.28125, - "learning_rate": 4.443363655704224e-05, - "loss": 0.2425, - "step": 2210 - }, - { - "epoch": 0.22554099359951235, - "grad_norm": 2.78125, - "learning_rate": 4.438282500261641e-05, - "loss": 0.2938, - "step": 2220 - }, - { - "epoch": 0.22655694402113177, - "grad_norm": 1.1953125, - "learning_rate": 4.433181192221197e-05, - "loss": 0.1728, - "step": 2230 - }, - { - "epoch": 0.2275728944427512, - "grad_norm": 1.34375, - "learning_rate": 4.4280597846219155e-05, - "loss": 0.216, - "step": 2240 - }, - { - "epoch": 0.22858884486437062, - "grad_norm": 1.8515625, - "learning_rate": 4.422918330711796e-05, - "loss": 0.1612, - "step": 2250 - }, - { - "epoch": 0.22960479528599004, - "grad_norm": 1.90625, - "learning_rate": 4.417756883947263e-05, - "loss": 0.107, - "step": 2260 - }, - { - "epoch": 0.23062074570760946, - "grad_norm": 3.375, - "learning_rate": 4.412575497992611e-05, - "loss": 0.1756, - "step": 2270 - }, - { - "epoch": 0.23163669612922888, - "grad_norm": 4.375, - "learning_rate": 4.407374226719445e-05, - "loss": 0.234, - "step": 2280 - }, - { - "epoch": 0.23265264655084833, - "grad_norm": 3.25, - "learning_rate": 4.402153124206119e-05, - "loss": 0.2144, - "step": 2290 - }, - { - "epoch": 0.23366859697246775, - "grad_norm": 1.703125, - "learning_rate": 4.396912244737173e-05, - "loss": 0.1696, - "step": 2300 - }, - { - "epoch": 0.23468454739408717, - "grad_norm": 2.84375, - "learning_rate": 4.391651642802778e-05, - "loss": 0.2506, - "step": 2310 - }, - { - "epoch": 0.2357004978157066, - "grad_norm": 4.5, - "learning_rate": 4.386371373098155e-05, - "loss": 0.1686, - "step": 2320 - }, - { - "epoch": 0.23671644823732602, - "grad_norm": 2.515625, - "learning_rate": 4.381071490523018e-05, - "loss": 0.2403, - "step": 2330 - }, - { - "epoch": 0.23773239865894544, - "grad_norm": 4.4375, - "learning_rate": 4.3757520501809955e-05, - "loss": 0.1611, - "step": 2340 - }, - { - "epoch": 0.23874834908056486, - "grad_norm": 1.609375, - "learning_rate": 4.370413107379065e-05, - "loss": 0.1698, - "step": 2350 - }, - { - "epoch": 0.23976429950218428, - "grad_norm": 4.96875, - "learning_rate": 4.36505471762697e-05, - "loss": 0.1928, - "step": 2360 - }, - { - "epoch": 0.24078024992380373, - "grad_norm": 0.8984375, - "learning_rate": 4.3596769366366474e-05, - "loss": 0.2035, - "step": 2370 - }, - { - "epoch": 0.24179620034542315, - "grad_norm": 5.75, - "learning_rate": 4.354279820321649e-05, - "loss": 0.16, - "step": 2380 - }, - { - "epoch": 0.24281215076704257, - "grad_norm": 1.9453125, - "learning_rate": 4.34886342479656e-05, - "loss": 0.1851, - "step": 2390 - }, - { - "epoch": 0.243828101188662, - "grad_norm": 1.015625, - "learning_rate": 4.34342780637641e-05, - "loss": 0.1726, - "step": 2400 - }, - { - "epoch": 0.24484405161028142, - "grad_norm": 4.59375, - "learning_rate": 4.337973021576095e-05, - "loss": 0.2847, - "step": 2410 - }, - { - "epoch": 0.24586000203190084, - "grad_norm": 1.03125, - "learning_rate": 4.3324991271097846e-05, - "loss": 0.2528, - "step": 2420 - }, - { - "epoch": 0.24687595245352026, - "grad_norm": 2.1875, - "learning_rate": 4.3270061798903374e-05, - "loss": 0.1573, - "step": 2430 - }, - { - "epoch": 0.24789190287513968, - "grad_norm": 0.98046875, - "learning_rate": 4.321494237028701e-05, - "loss": 0.1703, - "step": 2440 - }, - { - "epoch": 0.24890785329675913, - "grad_norm": 3.8125, - "learning_rate": 4.31596335583333e-05, - "loss": 0.2613, - "step": 2450 - }, - { - "epoch": 0.24992380371837855, - "grad_norm": 4.0625, - "learning_rate": 4.310413593809579e-05, - "loss": 0.22, - "step": 2460 - }, - { - "epoch": 0.250939754139998, - "grad_norm": 3.15625, - "learning_rate": 4.304845008659108e-05, - "loss": 0.1263, - "step": 2470 - }, - { - "epoch": 0.25195570456161737, - "grad_norm": 3.046875, - "learning_rate": 4.2992576582792895e-05, - "loss": 0.1639, - "step": 2480 - }, - { - "epoch": 0.2529716549832368, - "grad_norm": 9.8125, - "learning_rate": 4.293651600762595e-05, - "loss": 0.2681, - "step": 2490 - }, - { - "epoch": 0.25398760540485626, - "grad_norm": 3.734375, - "learning_rate": 4.288026894395999e-05, - "loss": 0.2292, - "step": 2500 - }, - { - "epoch": 0.25500355582647566, - "grad_norm": 0.455078125, - "learning_rate": 4.2823835976603723e-05, - "loss": 0.2324, - "step": 2510 - }, - { - "epoch": 0.2560195062480951, - "grad_norm": 5.625, - "learning_rate": 4.276721769229869e-05, - "loss": 0.1834, - "step": 2520 - }, - { - "epoch": 0.2570354566697145, - "grad_norm": 1.3671875, - "learning_rate": 4.271041467971323e-05, - "loss": 0.1826, - "step": 2530 - }, - { - "epoch": 0.25805140709133395, - "grad_norm": 5.0625, - "learning_rate": 4.265342752943632e-05, - "loss": 0.2463, - "step": 2540 - }, - { - "epoch": 0.25906735751295334, - "grad_norm": 2.859375, - "learning_rate": 4.2596256833971425e-05, - "loss": 0.2598, - "step": 2550 - }, - { - "epoch": 0.2600833079345728, - "grad_norm": 1.8515625, - "learning_rate": 4.2538903187730374e-05, - "loss": 0.1148, - "step": 2560 - }, - { - "epoch": 0.26109925835619224, - "grad_norm": 2.71875, - "learning_rate": 4.248136718702716e-05, - "loss": 0.2123, - "step": 2570 - }, - { - "epoch": 0.26211520877781164, - "grad_norm": 4.5625, - "learning_rate": 4.242364943007172e-05, - "loss": 0.2369, - "step": 2580 - }, - { - "epoch": 0.2631311591994311, - "grad_norm": 2.296875, - "learning_rate": 4.236575051696377e-05, - "loss": 0.261, - "step": 2590 - }, - { - "epoch": 0.2641471096210505, - "grad_norm": 2.75, - "learning_rate": 4.2307671049686514e-05, - "loss": 0.1564, - "step": 2600 - }, - { - "epoch": 0.26516306004266993, - "grad_norm": 3.5, - "learning_rate": 4.2249411632100396e-05, - "loss": 0.1563, - "step": 2610 - }, - { - "epoch": 0.2661790104642893, - "grad_norm": 2.84375, - "learning_rate": 4.219097286993684e-05, - "loss": 0.1697, - "step": 2620 - }, - { - "epoch": 0.26719496088590877, - "grad_norm": 2.125, - "learning_rate": 4.2132355370791946e-05, - "loss": 0.1844, - "step": 2630 - }, - { - "epoch": 0.26821091130752817, - "grad_norm": 4.03125, - "learning_rate": 4.2073559744120156e-05, - "loss": 0.2144, - "step": 2640 - }, - { - "epoch": 0.2692268617291476, - "grad_norm": 2.375, - "learning_rate": 4.201458660122793e-05, - "loss": 0.2013, - "step": 2650 - }, - { - "epoch": 0.27024281215076706, - "grad_norm": 3.625, - "learning_rate": 4.1955436555267393e-05, - "loss": 0.2166, - "step": 2660 - }, - { - "epoch": 0.27125876257238646, - "grad_norm": 0.328125, - "learning_rate": 4.189611022122997e-05, - "loss": 0.1934, - "step": 2670 - }, - { - "epoch": 0.2722747129940059, - "grad_norm": 2.75, - "learning_rate": 4.1836608215939944e-05, - "loss": 0.2157, - "step": 2680 - }, - { - "epoch": 0.2732906634156253, - "grad_norm": 3.5, - "learning_rate": 4.17769311580481e-05, - "loss": 0.18, - "step": 2690 - }, - { - "epoch": 0.27430661383724475, - "grad_norm": 2.109375, - "learning_rate": 4.171707966802528e-05, - "loss": 0.2178, - "step": 2700 - }, - { - "epoch": 0.27532256425886414, - "grad_norm": 4.65625, - "learning_rate": 4.16570543681559e-05, - "loss": 0.1896, - "step": 2710 - }, - { - "epoch": 0.2763385146804836, - "grad_norm": 4.8125, - "learning_rate": 4.159685588253151e-05, - "loss": 0.1322, - "step": 2720 - }, - { - "epoch": 0.27735446510210304, - "grad_norm": 3.9375, - "learning_rate": 4.153648483704429e-05, - "loss": 0.184, - "step": 2730 - }, - { - "epoch": 0.27837041552372244, - "grad_norm": 4.53125, - "learning_rate": 4.147594185938057e-05, - "loss": 0.2451, - "step": 2740 - }, - { - "epoch": 0.2793863659453419, - "grad_norm": 1.0390625, - "learning_rate": 4.141522757901426e-05, - "loss": 0.2367, - "step": 2750 - }, - { - "epoch": 0.2804023163669613, - "grad_norm": 3.375, - "learning_rate": 4.1354342627200345e-05, - "loss": 0.179, - "step": 2760 - }, - { - "epoch": 0.28141826678858073, - "grad_norm": 2.953125, - "learning_rate": 4.1293287636968286e-05, - "loss": 0.1396, - "step": 2770 - }, - { - "epoch": 0.2824342172102001, - "grad_norm": 2.546875, - "learning_rate": 4.1232063243115485e-05, - "loss": 0.1963, - "step": 2780 - }, - { - "epoch": 0.28345016763181957, - "grad_norm": 5.09375, - "learning_rate": 4.117067008220063e-05, - "loss": 0.2457, - "step": 2790 - }, - { - "epoch": 0.28446611805343897, - "grad_norm": 2.046875, - "learning_rate": 4.110910879253712e-05, - "loss": 0.2262, - "step": 2800 - }, - { - "epoch": 0.2854820684750584, - "grad_norm": 2.1875, - "learning_rate": 4.104738001418641e-05, - "loss": 0.2499, - "step": 2810 - }, - { - "epoch": 0.28649801889667786, - "grad_norm": 2.59375, - "learning_rate": 4.098548438895135e-05, - "loss": 0.1667, - "step": 2820 - }, - { - "epoch": 0.28751396931829726, - "grad_norm": 2.875, - "learning_rate": 4.092342256036954e-05, - "loss": 0.2288, - "step": 2830 - }, - { - "epoch": 0.2885299197399167, - "grad_norm": 3.015625, - "learning_rate": 4.086119517370659e-05, - "loss": 0.2038, - "step": 2840 - }, - { - "epoch": 0.2895458701615361, - "grad_norm": 3.53125, - "learning_rate": 4.0798802875949485e-05, - "loss": 0.181, - "step": 2850 - }, - { - "epoch": 0.29056182058315555, - "grad_norm": 2.296875, - "learning_rate": 4.073624631579975e-05, - "loss": 0.1886, - "step": 2860 - }, - { - "epoch": 0.29157777100477494, - "grad_norm": 3.609375, - "learning_rate": 4.067352614366685e-05, - "loss": 0.2053, - "step": 2870 - }, - { - "epoch": 0.2925937214263944, - "grad_norm": 2.328125, - "learning_rate": 4.061064301166128e-05, - "loss": 0.1409, - "step": 2880 - }, - { - "epoch": 0.29360967184801384, - "grad_norm": 4.9375, - "learning_rate": 4.054759757358787e-05, - "loss": 0.184, - "step": 2890 - }, - { - "epoch": 0.29462562226963324, - "grad_norm": 4.6875, - "learning_rate": 4.048439048493898e-05, - "loss": 0.2306, - "step": 2900 - }, - { - "epoch": 0.2956415726912527, - "grad_norm": 4.09375, - "learning_rate": 4.0421022402887676e-05, - "loss": 0.1914, - "step": 2910 - }, - { - "epoch": 0.2966575231128721, - "grad_norm": 2.3125, - "learning_rate": 4.035749398628088e-05, - "loss": 0.1653, - "step": 2920 - }, - { - "epoch": 0.29767347353449153, - "grad_norm": 2.515625, - "learning_rate": 4.029380589563256e-05, - "loss": 0.1941, - "step": 2930 - }, - { - "epoch": 0.2986894239561109, - "grad_norm": 1.78125, - "learning_rate": 4.02299587931168e-05, - "loss": 0.1117, - "step": 2940 - }, - { - "epoch": 0.29970537437773037, - "grad_norm": 0.8359375, - "learning_rate": 4.0165953342560974e-05, - "loss": 0.1605, - "step": 2950 - }, - { - "epoch": 0.30072132479934977, - "grad_norm": 3.046875, - "learning_rate": 4.010179020943884e-05, - "loss": 0.1726, - "step": 2960 - }, - { - "epoch": 0.3017372752209692, - "grad_norm": 3.453125, - "learning_rate": 4.003747006086357e-05, - "loss": 0.2208, - "step": 2970 - }, - { - "epoch": 0.30275322564258866, - "grad_norm": 2.515625, - "learning_rate": 3.9972993565580866e-05, - "loss": 0.1325, - "step": 2980 - }, - { - "epoch": 0.30376917606420806, - "grad_norm": 3.046875, - "learning_rate": 3.9908361393962e-05, - "loss": 0.2014, - "step": 2990 - }, - { - "epoch": 0.3047851264858275, - "grad_norm": 2.28125, - "learning_rate": 3.984357421799681e-05, - "loss": 0.165, - "step": 3000 - }, - { - "epoch": 0.3058010769074469, - "grad_norm": 5.09375, - "learning_rate": 3.9778632711286756e-05, - "loss": 0.212, - "step": 3010 - }, - { - "epoch": 0.30681702732906635, - "grad_norm": 4.25, - "learning_rate": 3.971353754903788e-05, - "loss": 0.2388, - "step": 3020 - }, - { - "epoch": 0.30783297775068574, - "grad_norm": 2.34375, - "learning_rate": 3.964828940805381e-05, - "loss": 0.2175, - "step": 3030 - }, - { - "epoch": 0.3088489281723052, - "grad_norm": 4.09375, - "learning_rate": 3.95828889667287e-05, - "loss": 0.2088, - "step": 3040 - }, - { - "epoch": 0.30986487859392464, - "grad_norm": 2.359375, - "learning_rate": 3.9517336905040244e-05, - "loss": 0.1913, - "step": 3050 - }, - { - "epoch": 0.31088082901554404, - "grad_norm": 1.1640625, - "learning_rate": 3.9451633904542483e-05, - "loss": 0.2185, - "step": 3060 - }, - { - "epoch": 0.3118967794371635, - "grad_norm": 2.59375, - "learning_rate": 3.9385780648358846e-05, - "loss": 0.2072, - "step": 3070 - }, - { - "epoch": 0.3129127298587829, - "grad_norm": 3.015625, - "learning_rate": 3.9319777821174955e-05, - "loss": 0.1902, - "step": 3080 - }, - { - "epoch": 0.31392868028040233, - "grad_norm": 2.375, - "learning_rate": 3.925362610923158e-05, - "loss": 0.259, - "step": 3090 - }, - { - "epoch": 0.3149446307020217, - "grad_norm": 4.65625, - "learning_rate": 3.918732620031742e-05, - "loss": 0.2026, - "step": 3100 - }, - { - "epoch": 0.31596058112364117, - "grad_norm": 2.1875, - "learning_rate": 3.912087878376205e-05, - "loss": 0.1478, - "step": 3110 - }, - { - "epoch": 0.31697653154526056, - "grad_norm": 2.34375, - "learning_rate": 3.905428455042865e-05, - "loss": 0.167, - "step": 3120 - }, - { - "epoch": 0.31799248196688, - "grad_norm": 2.390625, - "learning_rate": 3.898754419270693e-05, - "loss": 0.1629, - "step": 3130 - }, - { - "epoch": 0.31900843238849946, - "grad_norm": 1.546875, - "learning_rate": 3.892065840450583e-05, - "loss": 0.1308, - "step": 3140 - }, - { - "epoch": 0.32002438281011886, - "grad_norm": 4.625, - "learning_rate": 3.885362788124637e-05, - "loss": 0.2008, - "step": 3150 - }, - { - "epoch": 0.3210403332317383, - "grad_norm": 3.8125, - "learning_rate": 3.8786453319854396e-05, - "loss": 0.2225, - "step": 3160 - }, - { - "epoch": 0.3220562836533577, - "grad_norm": 3.015625, - "learning_rate": 3.8719135418753366e-05, - "loss": 0.2243, - "step": 3170 - }, - { - "epoch": 0.32307223407497715, - "grad_norm": 5.6875, - "learning_rate": 3.865167487785702e-05, - "loss": 0.1981, - "step": 3180 - }, - { - "epoch": 0.32408818449659654, - "grad_norm": 4.84375, - "learning_rate": 3.8584072398562164e-05, - "loss": 0.2031, - "step": 3190 - }, - { - "epoch": 0.325104134918216, - "grad_norm": 4.0625, - "learning_rate": 3.851632868374136e-05, - "loss": 0.1621, - "step": 3200 - }, - { - "epoch": 0.32612008533983544, - "grad_norm": 3.421875, - "learning_rate": 3.844844443773562e-05, - "loss": 0.1674, - "step": 3210 - }, - { - "epoch": 0.32713603576145484, - "grad_norm": 1.3671875, - "learning_rate": 3.8380420366347046e-05, - "loss": 0.1502, - "step": 3220 - }, - { - "epoch": 0.3281519861830743, - "grad_norm": 3.734375, - "learning_rate": 3.831225717683157e-05, - "loss": 0.1868, - "step": 3230 - }, - { - "epoch": 0.3291679366046937, - "grad_norm": 2.703125, - "learning_rate": 3.8243955577891534e-05, - "loss": 0.1818, - "step": 3240 - }, - { - "epoch": 0.3301838870263131, - "grad_norm": 3.796875, - "learning_rate": 3.8175516279668335e-05, - "loss": 0.2215, - "step": 3250 - }, - { - "epoch": 0.3311998374479325, - "grad_norm": 3.203125, - "learning_rate": 3.810693999373505e-05, - "loss": 0.2544, - "step": 3260 - }, - { - "epoch": 0.33221578786955197, - "grad_norm": 4.0, - "learning_rate": 3.8038227433089056e-05, - "loss": 0.1175, - "step": 3270 - }, - { - "epoch": 0.33323173829117136, - "grad_norm": 3.625, - "learning_rate": 3.796937931214458e-05, - "loss": 0.2213, - "step": 3280 - }, - { - "epoch": 0.3342476887127908, - "grad_norm": 1.7265625, - "learning_rate": 3.7900396346725296e-05, - "loss": 0.1711, - "step": 3290 - }, - { - "epoch": 0.33526363913441026, - "grad_norm": 3.140625, - "learning_rate": 3.783127925405686e-05, - "loss": 0.2628, - "step": 3300 - }, - { - "epoch": 0.33627958955602966, - "grad_norm": 2.1875, - "learning_rate": 3.77620287527595e-05, - "loss": 0.1671, - "step": 3310 - }, - { - "epoch": 0.3372955399776491, - "grad_norm": 5.28125, - "learning_rate": 3.769264556284048e-05, - "loss": 0.2109, - "step": 3320 - }, - { - "epoch": 0.3383114903992685, - "grad_norm": 2.875, - "learning_rate": 3.762313040568665e-05, - "loss": 0.1978, - "step": 3330 - }, - { - "epoch": 0.33932744082088795, - "grad_norm": 2.234375, - "learning_rate": 3.755348400405697e-05, - "loss": 0.1275, - "step": 3340 - }, - { - "epoch": 0.34034339124250734, - "grad_norm": 1.9453125, - "learning_rate": 3.7483707082074945e-05, - "loss": 0.1482, - "step": 3350 - }, - { - "epoch": 0.3413593416641268, - "grad_norm": 5.40625, - "learning_rate": 3.741380036522111e-05, - "loss": 0.1933, - "step": 3360 - }, - { - "epoch": 0.34237529208574624, - "grad_norm": 4.53125, - "learning_rate": 3.734376458032551e-05, - "loss": 0.1925, - "step": 3370 - }, - { - "epoch": 0.34339124250736563, - "grad_norm": 4.0625, - "learning_rate": 3.727360045556014e-05, - "loss": 0.2297, - "step": 3380 - }, - { - "epoch": 0.3444071929289851, - "grad_norm": 2.53125, - "learning_rate": 3.7203308720431336e-05, - "loss": 0.1704, - "step": 3390 - }, - { - "epoch": 0.3454231433506045, - "grad_norm": 1.859375, - "learning_rate": 3.7132890105772234e-05, - "loss": 0.258, - "step": 3400 - }, - { - "epoch": 0.3464390937722239, - "grad_norm": 3.90625, - "learning_rate": 3.706234534373515e-05, - "loss": 0.2376, - "step": 3410 - }, - { - "epoch": 0.3474550441938433, - "grad_norm": 1.1015625, - "learning_rate": 3.6991675167783985e-05, - "loss": 0.2403, - "step": 3420 - }, - { - "epoch": 0.34847099461546277, - "grad_norm": 1.1640625, - "learning_rate": 3.6920880312686556e-05, - "loss": 0.1642, - "step": 3430 - }, - { - "epoch": 0.34948694503708216, - "grad_norm": 2.875, - "learning_rate": 3.684996151450702e-05, - "loss": 0.1455, - "step": 3440 - }, - { - "epoch": 0.3505028954587016, - "grad_norm": 0.59765625, - "learning_rate": 3.6778919510598155e-05, - "loss": 0.2175, - "step": 3450 - }, - { - "epoch": 0.35151884588032106, - "grad_norm": 0.93359375, - "learning_rate": 3.670775503959376e-05, - "loss": 0.1858, - "step": 3460 - }, - { - "epoch": 0.35253479630194046, - "grad_norm": 4.1875, - "learning_rate": 3.6636468841400917e-05, - "loss": 0.1911, - "step": 3470 - }, - { - "epoch": 0.3535507467235599, - "grad_norm": 3.734375, - "learning_rate": 3.656506165719233e-05, - "loss": 0.2114, - "step": 3480 - }, - { - "epoch": 0.3545666971451793, - "grad_norm": 1.171875, - "learning_rate": 3.649353422939863e-05, - "loss": 0.1841, - "step": 3490 - }, - { - "epoch": 0.35558264756679875, - "grad_norm": 2.53125, - "learning_rate": 3.6421887301700615e-05, - "loss": 0.1505, - "step": 3500 - }, - { - "epoch": 0.35659859798841814, - "grad_norm": 4.9375, - "learning_rate": 3.6350121619021524e-05, - "loss": 0.2625, - "step": 3510 - }, - { - "epoch": 0.3576145484100376, - "grad_norm": 5.25, - "learning_rate": 3.627823792751936e-05, - "loss": 0.1676, - "step": 3520 - }, - { - "epoch": 0.35863049883165704, - "grad_norm": 1.09375, - "learning_rate": 3.620623697457905e-05, - "loss": 0.1963, - "step": 3530 - }, - { - "epoch": 0.35964644925327643, - "grad_norm": 4.03125, - "learning_rate": 3.613411950880468e-05, - "loss": 0.2048, - "step": 3540 - }, - { - "epoch": 0.3606623996748959, - "grad_norm": 4.40625, - "learning_rate": 3.606188628001178e-05, - "loss": 0.226, - "step": 3550 - }, - { - "epoch": 0.3616783500965153, - "grad_norm": 2.375, - "learning_rate": 3.598953803921947e-05, - "loss": 0.1884, - "step": 3560 - }, - { - "epoch": 0.3626943005181347, - "grad_norm": 3.21875, - "learning_rate": 3.591707553864266e-05, - "loss": 0.224, - "step": 3570 - }, - { - "epoch": 0.3637102509397541, - "grad_norm": 3.5625, - "learning_rate": 3.584449953168423e-05, - "loss": 0.1866, - "step": 3580 - }, - { - "epoch": 0.36472620136137357, - "grad_norm": 2.359375, - "learning_rate": 3.577181077292722e-05, - "loss": 0.1663, - "step": 3590 - }, - { - "epoch": 0.36574215178299296, - "grad_norm": 5.0, - "learning_rate": 3.569901001812696e-05, - "loss": 0.2032, - "step": 3600 - }, - { - "epoch": 0.3667581022046124, - "grad_norm": 1.953125, - "learning_rate": 3.562609802420321e-05, - "loss": 0.2395, - "step": 3610 - }, - { - "epoch": 0.36777405262623186, - "grad_norm": 3.796875, - "learning_rate": 3.555307554923229e-05, - "loss": 0.1799, - "step": 3620 - }, - { - "epoch": 0.36879000304785126, - "grad_norm": 4.4375, - "learning_rate": 3.547994335243925e-05, - "loss": 0.1771, - "step": 3630 - }, - { - "epoch": 0.3698059534694707, - "grad_norm": 1.890625, - "learning_rate": 3.540670219418989e-05, - "loss": 0.2123, - "step": 3640 - }, - { - "epoch": 0.3708219038910901, - "grad_norm": 4.03125, - "learning_rate": 3.53333528359829e-05, - "loss": 0.2159, - "step": 3650 - }, - { - "epoch": 0.37183785431270955, - "grad_norm": 3.265625, - "learning_rate": 3.525989604044198e-05, - "loss": 0.2749, - "step": 3660 - }, - { - "epoch": 0.37285380473432894, - "grad_norm": 1.4375, - "learning_rate": 3.5186332571307826e-05, - "loss": 0.1613, - "step": 3670 - }, - { - "epoch": 0.3738697551559484, - "grad_norm": 3.984375, - "learning_rate": 3.511266319343025e-05, - "loss": 0.1877, - "step": 3680 - }, - { - "epoch": 0.37488570557756784, - "grad_norm": 2.203125, - "learning_rate": 3.503888867276022e-05, - "loss": 0.2185, - "step": 3690 - }, - { - "epoch": 0.37590165599918723, - "grad_norm": 1.5078125, - "learning_rate": 3.4965009776341894e-05, - "loss": 0.2195, - "step": 3700 - }, - { - "epoch": 0.3769176064208067, - "grad_norm": 4.375, - "learning_rate": 3.489102727230461e-05, - "loss": 0.2344, - "step": 3710 - }, - { - "epoch": 0.3779335568424261, - "grad_norm": 2.984375, - "learning_rate": 3.481694192985496e-05, - "loss": 0.1863, - "step": 3720 - }, - { - "epoch": 0.3789495072640455, - "grad_norm": 1.1328125, - "learning_rate": 3.474275451926875e-05, - "loss": 0.1894, - "step": 3730 - }, - { - "epoch": 0.3799654576856649, - "grad_norm": 2.265625, - "learning_rate": 3.4668465811883e-05, - "loss": 0.2127, - "step": 3740 - }, - { - "epoch": 0.38098140810728437, - "grad_norm": 2.921875, - "learning_rate": 3.4594076580087914e-05, - "loss": 0.2125, - "step": 3750 - }, - { - "epoch": 0.38199735852890376, - "grad_norm": 2.390625, - "learning_rate": 3.451958759731889e-05, - "loss": 0.1801, - "step": 3760 - }, - { - "epoch": 0.3830133089505232, - "grad_norm": 3.046875, - "learning_rate": 3.4444999638048456e-05, - "loss": 0.1949, - "step": 3770 - }, - { - "epoch": 0.38402925937214266, - "grad_norm": 2.890625, - "learning_rate": 3.437031347777817e-05, - "loss": 0.2719, - "step": 3780 - }, - { - "epoch": 0.38504520979376206, - "grad_norm": 3.9375, - "learning_rate": 3.4295529893030634e-05, - "loss": 0.1697, - "step": 3790 - }, - { - "epoch": 0.3860611602153815, - "grad_norm": 2.0625, - "learning_rate": 3.422064966134138e-05, - "loss": 0.1557, - "step": 3800 - }, - { - "epoch": 0.3870771106370009, - "grad_norm": 2.234375, - "learning_rate": 3.4145673561250794e-05, - "loss": 0.2129, - "step": 3810 - }, - { - "epoch": 0.38809306105862035, - "grad_norm": 4.96875, - "learning_rate": 3.4070602372296e-05, - "loss": 0.2068, - "step": 3820 - }, - { - "epoch": 0.38910901148023974, - "grad_norm": 2.234375, - "learning_rate": 3.39954368750028e-05, - "loss": 0.1634, - "step": 3830 - }, - { - "epoch": 0.3901249619018592, - "grad_norm": 1.75, - "learning_rate": 3.392017785087752e-05, - "loss": 0.2299, - "step": 3840 - }, - { - "epoch": 0.39114091232347864, - "grad_norm": 3.90625, - "learning_rate": 3.38448260823989e-05, - "loss": 0.1585, - "step": 3850 - }, - { - "epoch": 0.39215686274509803, - "grad_norm": 2.8125, - "learning_rate": 3.376938235300996e-05, - "loss": 0.2382, - "step": 3860 - }, - { - "epoch": 0.3931728131667175, - "grad_norm": 5.375, - "learning_rate": 3.369384744710984e-05, - "loss": 0.1987, - "step": 3870 - }, - { - "epoch": 0.3941887635883369, - "grad_norm": 2.578125, - "learning_rate": 3.361822215004566e-05, - "loss": 0.2316, - "step": 3880 - }, - { - "epoch": 0.3952047140099563, - "grad_norm": 2.0, - "learning_rate": 3.354250724810436e-05, - "loss": 0.2019, - "step": 3890 - }, - { - "epoch": 0.3962206644315757, - "grad_norm": 2.3125, - "learning_rate": 3.34667035285045e-05, - "loss": 0.187, - "step": 3900 - }, - { - "epoch": 0.39723661485319517, - "grad_norm": 3.53125, - "learning_rate": 3.339081177938811e-05, - "loss": 0.2353, - "step": 3910 - }, - { - "epoch": 0.39825256527481456, - "grad_norm": 1.9609375, - "learning_rate": 3.331483278981244e-05, - "loss": 0.2078, - "step": 3920 - }, - { - "epoch": 0.399268515696434, - "grad_norm": 1.2109375, - "learning_rate": 3.323876734974183e-05, - "loss": 0.1761, - "step": 3930 - }, - { - "epoch": 0.40028446611805346, - "grad_norm": 4.0625, - "learning_rate": 3.316261625003943e-05, - "loss": 0.2081, - "step": 3940 - }, - { - "epoch": 0.40130041653967286, - "grad_norm": 1.953125, - "learning_rate": 3.308638028245902e-05, - "loss": 0.2087, - "step": 3950 - }, - { - "epoch": 0.4023163669612923, - "grad_norm": 2.390625, - "learning_rate": 3.301006023963676e-05, - "loss": 0.1579, - "step": 3960 - }, - { - "epoch": 0.4033323173829117, - "grad_norm": 3.53125, - "learning_rate": 3.293365691508295e-05, - "loss": 0.1904, - "step": 3970 - }, - { - "epoch": 0.40434826780453115, - "grad_norm": 3.0, - "learning_rate": 3.285717110317379e-05, - "loss": 0.1991, - "step": 3980 - }, - { - "epoch": 0.40536421822615054, - "grad_norm": 7.21875, - "learning_rate": 3.27806035991431e-05, - "loss": 0.1445, - "step": 3990 - }, - { - "epoch": 0.40638016864777, - "grad_norm": 1.0859375, - "learning_rate": 3.2703955199074075e-05, - "loss": 0.2393, - "step": 4000 - }, - { - "epoch": 0.40739611906938944, - "grad_norm": 4.5625, - "learning_rate": 3.262722669989098e-05, - "loss": 0.1789, - "step": 4010 - }, - { - "epoch": 0.40841206949100883, - "grad_norm": 3.09375, - "learning_rate": 3.255041889935092e-05, - "loss": 0.1511, - "step": 4020 - }, - { - "epoch": 0.4094280199126283, - "grad_norm": 1.90625, - "learning_rate": 3.247353259603547e-05, - "loss": 0.2066, - "step": 4030 - }, - { - "epoch": 0.4104439703342477, - "grad_norm": 2.28125, - "learning_rate": 3.239656858934242e-05, - "loss": 0.1564, - "step": 4040 - }, - { - "epoch": 0.4114599207558671, - "grad_norm": 2.609375, - "learning_rate": 3.231952767947746e-05, - "loss": 0.1503, - "step": 4050 - }, - { - "epoch": 0.4124758711774865, - "grad_norm": 1.4453125, - "learning_rate": 3.2242410667445844e-05, - "loss": 0.1633, - "step": 4060 - }, - { - "epoch": 0.41349182159910597, - "grad_norm": 3.015625, - "learning_rate": 3.2165218355044076e-05, - "loss": 0.1492, - "step": 4070 - }, - { - "epoch": 0.41450777202072536, - "grad_norm": 3.234375, - "learning_rate": 3.2087951544851566e-05, - "loss": 0.3051, - "step": 4080 - }, - { - "epoch": 0.4155237224423448, - "grad_norm": 2.9375, - "learning_rate": 3.20106110402223e-05, - "loss": 0.2229, - "step": 4090 - }, - { - "epoch": 0.41653967286396426, - "grad_norm": 3.171875, - "learning_rate": 3.1933197645276455e-05, - "loss": 0.2224, - "step": 4100 - }, - { - "epoch": 0.41755562328558365, - "grad_norm": 2.09375, - "learning_rate": 3.185571216489209e-05, - "loss": 0.1297, - "step": 4110 - }, - { - "epoch": 0.4185715737072031, - "grad_norm": 3.625, - "learning_rate": 3.177815540469669e-05, - "loss": 0.2074, - "step": 4120 - }, - { - "epoch": 0.4195875241288225, - "grad_norm": 2.296875, - "learning_rate": 3.1700528171058916e-05, - "loss": 0.1949, - "step": 4130 - }, - { - "epoch": 0.42060347455044195, - "grad_norm": 3.8125, - "learning_rate": 3.162283127108011e-05, - "loss": 0.1661, - "step": 4140 - }, - { - "epoch": 0.42161942497206134, - "grad_norm": 2.5, - "learning_rate": 3.154506551258594e-05, - "loss": 0.2275, - "step": 4150 - }, - { - "epoch": 0.4226353753936808, - "grad_norm": 2.96875, - "learning_rate": 3.146723170411804e-05, - "loss": 0.2242, - "step": 4160 - }, - { - "epoch": 0.42365132581530024, - "grad_norm": 6.625, - "learning_rate": 3.138933065492552e-05, - "loss": 0.1897, - "step": 4170 - }, - { - "epoch": 0.42466727623691963, - "grad_norm": 0.8515625, - "learning_rate": 3.131136317495665e-05, - "loss": 0.1629, - "step": 4180 - }, - { - "epoch": 0.4256832266585391, - "grad_norm": 0.94140625, - "learning_rate": 3.1233330074850364e-05, - "loss": 0.1535, - "step": 4190 - }, - { - "epoch": 0.4266991770801585, - "grad_norm": 2.6875, - "learning_rate": 3.115523216592786e-05, - "loss": 0.2494, - "step": 4200 - }, - { - "epoch": 0.4277151275017779, - "grad_norm": 2.578125, - "learning_rate": 3.107707026018417e-05, - "loss": 0.1705, - "step": 4210 - }, - { - "epoch": 0.4287310779233973, - "grad_norm": 3.0625, - "learning_rate": 3.09988451702797e-05, - "loss": 0.1507, - "step": 4220 - }, - { - "epoch": 0.42974702834501677, - "grad_norm": 2.421875, - "learning_rate": 3.0920557709531804e-05, - "loss": 0.3071, - "step": 4230 - }, - { - "epoch": 0.43076297876663616, - "grad_norm": 3.640625, - "learning_rate": 3.0842208691906306e-05, - "loss": 0.199, - "step": 4240 - }, - { - "epoch": 0.4317789291882556, - "grad_norm": 3.5625, - "learning_rate": 3.076379893200904e-05, - "loss": 0.1987, - "step": 4250 - }, - { - "epoch": 0.43279487960987506, - "grad_norm": 3.65625, - "learning_rate": 3.068532924507739e-05, - "loss": 0.1945, - "step": 4260 - }, - { - "epoch": 0.43381083003149445, - "grad_norm": 5.875, - "learning_rate": 3.060680044697183e-05, - "loss": 0.1937, - "step": 4270 - }, - { - "epoch": 0.4348267804531139, - "grad_norm": 2.859375, - "learning_rate": 3.052821335416739e-05, - "loss": 0.1643, - "step": 4280 - }, - { - "epoch": 0.4358427308747333, - "grad_norm": 3.296875, - "learning_rate": 3.0449568783745203e-05, - "loss": 0.1455, - "step": 4290 - }, - { - "epoch": 0.43685868129635275, - "grad_norm": 0.427734375, - "learning_rate": 3.0370867553384023e-05, - "loss": 0.1891, - "step": 4300 - }, - { - "epoch": 0.43787463171797214, - "grad_norm": 0.361328125, - "learning_rate": 3.029211048135171e-05, - "loss": 0.1377, - "step": 4310 - }, - { - "epoch": 0.4388905821395916, - "grad_norm": 1.8203125, - "learning_rate": 3.021329838649668e-05, - "loss": 0.2194, - "step": 4320 - }, - { - "epoch": 0.43990653256121104, - "grad_norm": 1.8828125, - "learning_rate": 3.0134432088239462e-05, - "loss": 0.1915, - "step": 4330 - }, - { - "epoch": 0.44092248298283043, - "grad_norm": 2.015625, - "learning_rate": 3.0055512406564146e-05, - "loss": 0.1794, - "step": 4340 - }, - { - "epoch": 0.4419384334044499, - "grad_norm": 2.546875, - "learning_rate": 2.9976540162009836e-05, - "loss": 0.2154, - "step": 4350 - }, - { - "epoch": 0.4429543838260693, - "grad_norm": 4.09375, - "learning_rate": 2.9897516175662155e-05, - "loss": 0.1861, - "step": 4360 - }, - { - "epoch": 0.4439703342476887, - "grad_norm": 3.953125, - "learning_rate": 2.9818441269144693e-05, - "loss": 0.1857, - "step": 4370 - }, - { - "epoch": 0.4449862846693081, - "grad_norm": 2.234375, - "learning_rate": 2.9739316264610452e-05, - "loss": 0.1493, - "step": 4380 - }, - { - "epoch": 0.44600223509092757, - "grad_norm": 1.109375, - "learning_rate": 2.966014198473332e-05, - "loss": 0.186, - "step": 4390 - }, - { - "epoch": 0.44701818551254696, - "grad_norm": 4.5625, - "learning_rate": 2.9580919252699502e-05, - "loss": 0.1963, - "step": 4400 - }, - { - "epoch": 0.4480341359341664, - "grad_norm": 7.3125, - "learning_rate": 2.9501648892198984e-05, - "loss": 0.2882, - "step": 4410 - }, - { - "epoch": 0.44905008635578586, - "grad_norm": 3.03125, - "learning_rate": 2.942233172741693e-05, - "loss": 0.2154, - "step": 4420 - }, - { - "epoch": 0.45006603677740525, - "grad_norm": 2.421875, - "learning_rate": 2.934296858302515e-05, - "loss": 0.2228, - "step": 4430 - }, - { - "epoch": 0.4510819871990247, - "grad_norm": 1.6015625, - "learning_rate": 2.9263560284173485e-05, - "loss": 0.1637, - "step": 4440 - }, - { - "epoch": 0.4520979376206441, - "grad_norm": 4.5, - "learning_rate": 2.91841076564813e-05, - "loss": 0.1396, - "step": 4450 - }, - { - "epoch": 0.45311388804226355, - "grad_norm": 1.9609375, - "learning_rate": 2.9104611526028808e-05, - "loss": 0.186, - "step": 4460 - }, - { - "epoch": 0.45412983846388294, - "grad_norm": 2.046875, - "learning_rate": 2.902507271934855e-05, - "loss": 0.1706, - "step": 4470 - }, - { - "epoch": 0.4551457888855024, - "grad_norm": 2.390625, - "learning_rate": 2.8945492063416768e-05, - "loss": 0.2191, - "step": 4480 - }, - { - "epoch": 0.45616173930712184, - "grad_norm": 2.734375, - "learning_rate": 2.8865870385644823e-05, - "loss": 0.1651, - "step": 4490 - }, - { - "epoch": 0.45717768972874123, - "grad_norm": 4.4375, - "learning_rate": 2.8786208513870583e-05, - "loss": 0.1907, - "step": 4500 - }, - { - "epoch": 0.4581936401503607, - "grad_norm": 1.9609375, - "learning_rate": 2.8706507276349815e-05, - "loss": 0.2256, - "step": 4510 - }, - { - "epoch": 0.4592095905719801, - "grad_norm": 3.375, - "learning_rate": 2.8626767501747588e-05, - "loss": 0.215, - "step": 4520 - }, - { - "epoch": 0.4602255409935995, - "grad_norm": 2.296875, - "learning_rate": 2.854699001912964e-05, - "loss": 0.2241, - "step": 4530 - }, - { - "epoch": 0.4612414914152189, - "grad_norm": 2.078125, - "learning_rate": 2.846717565795376e-05, - "loss": 0.1541, - "step": 4540 - }, - { - "epoch": 0.46225744183683837, - "grad_norm": 0.81640625, - "learning_rate": 2.8387325248061164e-05, - "loss": 0.1718, - "step": 4550 - }, - { - "epoch": 0.46327339225845776, - "grad_norm": 5.6875, - "learning_rate": 2.8307439619667897e-05, - "loss": 0.259, - "step": 4560 - }, - { - "epoch": 0.4642893426800772, - "grad_norm": 1.78125, - "learning_rate": 2.8227519603356157e-05, - "loss": 0.2205, - "step": 4570 - }, - { - "epoch": 0.46530529310169666, - "grad_norm": 4.78125, - "learning_rate": 2.8147566030065677e-05, - "loss": 0.2256, - "step": 4580 - }, - { - "epoch": 0.46632124352331605, - "grad_norm": 3.296875, - "learning_rate": 2.8067579731085085e-05, - "loss": 0.1671, - "step": 4590 - }, - { - "epoch": 0.4673371939449355, - "grad_norm": 3.265625, - "learning_rate": 2.7987561538043273e-05, - "loss": 0.2471, - "step": 4600 - }, - { - "epoch": 0.4683531443665549, - "grad_norm": 3.390625, - "learning_rate": 2.7907512282900727e-05, - "loss": 0.1749, - "step": 4610 - }, - { - "epoch": 0.46936909478817435, - "grad_norm": 3.140625, - "learning_rate": 2.782743279794091e-05, - "loss": 0.2276, - "step": 4620 - }, - { - "epoch": 0.47038504520979374, - "grad_norm": 2.921875, - "learning_rate": 2.7747323915761574e-05, - "loss": 0.1971, - "step": 4630 - }, - { - "epoch": 0.4714009956314132, - "grad_norm": 4.15625, - "learning_rate": 2.7667186469266122e-05, - "loss": 0.1951, - "step": 4640 - }, - { - "epoch": 0.47241694605303264, - "grad_norm": 2.953125, - "learning_rate": 2.7587021291654924e-05, - "loss": 0.2045, - "step": 4650 - }, - { - "epoch": 0.47343289647465203, - "grad_norm": 1.6640625, - "learning_rate": 2.750682921641672e-05, - "loss": 0.155, - "step": 4660 - }, - { - "epoch": 0.4744488468962715, - "grad_norm": 4.375, - "learning_rate": 2.7426611077319864e-05, - "loss": 0.2038, - "step": 4670 - }, - { - "epoch": 0.4754647973178909, - "grad_norm": 5.5, - "learning_rate": 2.734636770840372e-05, - "loss": 0.159, - "step": 4680 - }, - { - "epoch": 0.4764807477395103, - "grad_norm": 1.703125, - "learning_rate": 2.7266099943969976e-05, - "loss": 0.1566, - "step": 4690 - }, - { - "epoch": 0.4774966981611297, - "grad_norm": 0.81640625, - "learning_rate": 2.7185808618573943e-05, - "loss": 0.1927, - "step": 4700 - }, - { - "epoch": 0.47851264858274917, - "grad_norm": 0.81640625, - "learning_rate": 2.710549456701592e-05, - "loss": 0.1873, - "step": 4710 - }, - { - "epoch": 0.47952859900436856, - "grad_norm": 3.828125, - "learning_rate": 2.702515862433247e-05, - "loss": 0.2474, - "step": 4720 - }, - { - "epoch": 0.480544549425988, - "grad_norm": 1.1640625, - "learning_rate": 2.6944801625787795e-05, - "loss": 0.204, - "step": 4730 - }, - { - "epoch": 0.48156049984760746, - "grad_norm": 2.953125, - "learning_rate": 2.6864424406864984e-05, - "loss": 0.1758, - "step": 4740 - }, - { - "epoch": 0.48257645026922685, - "grad_norm": 3.265625, - "learning_rate": 2.6784027803257377e-05, - "loss": 0.161, - "step": 4750 - }, - { - "epoch": 0.4835924006908463, - "grad_norm": 2.046875, - "learning_rate": 2.6703612650859848e-05, - "loss": 0.1469, - "step": 4760 - }, - { - "epoch": 0.4846083511124657, - "grad_norm": 4.03125, - "learning_rate": 2.6623179785760148e-05, - "loss": 0.1858, - "step": 4770 - }, - { - "epoch": 0.48562430153408515, - "grad_norm": 2.65625, - "learning_rate": 2.6542730044230175e-05, - "loss": 0.176, - "step": 4780 - }, - { - "epoch": 0.48664025195570454, - "grad_norm": 2.59375, - "learning_rate": 2.6462264262717278e-05, - "loss": 0.1657, - "step": 4790 - }, - { - "epoch": 0.487656202377324, - "grad_norm": 4.78125, - "learning_rate": 2.6381783277835605e-05, - "loss": 0.2705, - "step": 4800 - }, - { - "epoch": 0.48867215279894344, - "grad_norm": 3.65625, - "learning_rate": 2.6301287926357355e-05, - "loss": 0.2252, - "step": 4810 - }, - { - "epoch": 0.48968810322056283, - "grad_norm": 0.734375, - "learning_rate": 2.622077904520411e-05, - "loss": 0.2141, - "step": 4820 - }, - { - "epoch": 0.4907040536421823, - "grad_norm": 5.15625, - "learning_rate": 2.6140257471438108e-05, - "loss": 0.1935, - "step": 4830 - }, - { - "epoch": 0.4917200040638017, - "grad_norm": 3.625, - "learning_rate": 2.6059724042253574e-05, - "loss": 0.2121, - "step": 4840 - }, - { - "epoch": 0.4927359544854211, - "grad_norm": 1.2890625, - "learning_rate": 2.5979179594967983e-05, - "loss": 0.1221, - "step": 4850 - }, - { - "epoch": 0.4937519049070405, - "grad_norm": 3.4375, - "learning_rate": 2.5898624967013367e-05, - "loss": 0.2208, - "step": 4860 - }, - { - "epoch": 0.49476785532865997, - "grad_norm": 2.40625, - "learning_rate": 2.5818060995927607e-05, - "loss": 0.1904, - "step": 4870 - }, - { - "epoch": 0.49578380575027936, - "grad_norm": 2.921875, - "learning_rate": 2.573748851934574e-05, - "loss": 0.1658, - "step": 4880 - }, - { - "epoch": 0.4967997561718988, - "grad_norm": 1.6640625, - "learning_rate": 2.5656908374991213e-05, - "loss": 0.1626, - "step": 4890 - }, - { - "epoch": 0.49781570659351826, - "grad_norm": 1.8046875, - "learning_rate": 2.557632140066721e-05, - "loss": 0.1905, - "step": 4900 - }, - { - "epoch": 0.49883165701513765, - "grad_norm": 4.875, - "learning_rate": 2.5495728434247917e-05, - "loss": 0.2591, - "step": 4910 - }, - { - "epoch": 0.4998476074367571, - "grad_norm": 1.4453125, - "learning_rate": 2.5415130313669845e-05, - "loss": 0.1359, - "step": 4920 - }, - { - "epoch": 0.5008635578583766, - "grad_norm": 2.109375, - "learning_rate": 2.5334527876923063e-05, - "loss": 0.2353, - "step": 4930 - }, - { - "epoch": 0.501879508279996, - "grad_norm": 3.546875, - "learning_rate": 2.5253921962042525e-05, - "loss": 0.2173, - "step": 4940 - }, - { - "epoch": 0.5028954587016153, - "grad_norm": 1.8125, - "learning_rate": 2.5173313407099373e-05, - "loss": 0.1631, - "step": 4950 - }, - { - "epoch": 0.5039114091232347, - "grad_norm": 2.671875, - "learning_rate": 2.5092703050192163e-05, - "loss": 0.1884, - "step": 4960 - }, - { - "epoch": 0.5049273595448542, - "grad_norm": 2.5625, - "learning_rate": 2.501209172943819e-05, - "loss": 0.217, - "step": 4970 - }, - { - "epoch": 0.5059433099664736, - "grad_norm": 4.375, - "learning_rate": 2.49314802829648e-05, - "loss": 0.1854, - "step": 4980 - }, - { - "epoch": 0.506959260388093, - "grad_norm": 2.3125, - "learning_rate": 2.4850869548900628e-05, - "loss": 0.2049, - "step": 4990 - }, - { - "epoch": 0.5079752108097125, - "grad_norm": 3.859375, - "learning_rate": 2.477026036536688e-05, - "loss": 0.2093, - "step": 5000 - }, - { - "epoch": 0.5089911612313319, - "grad_norm": 1.09375, - "learning_rate": 2.4689653570468677e-05, - "loss": 0.164, - "step": 5010 - }, - { - "epoch": 0.5100071116529513, - "grad_norm": 3.40625, - "learning_rate": 2.460905000228628e-05, - "loss": 0.1649, - "step": 5020 - }, - { - "epoch": 0.5110230620745707, - "grad_norm": 3.546875, - "learning_rate": 2.4528450498866428e-05, - "loss": 0.1777, - "step": 5030 - }, - { - "epoch": 0.5120390124961902, - "grad_norm": 3.0, - "learning_rate": 2.444785589821356e-05, - "loss": 0.1505, - "step": 5040 - }, - { - "epoch": 0.5130549629178096, - "grad_norm": 1.6484375, - "learning_rate": 2.436726703828118e-05, - "loss": 0.2672, - "step": 5050 - }, - { - "epoch": 0.514070913339429, - "grad_norm": 4.34375, - "learning_rate": 2.428668475696308e-05, - "loss": 0.1756, - "step": 5060 - }, - { - "epoch": 0.5150868637610485, - "grad_norm": 2.78125, - "learning_rate": 2.420610989208465e-05, - "loss": 0.1655, - "step": 5070 - }, - { - "epoch": 0.5161028141826679, - "grad_norm": 1.4609375, - "learning_rate": 2.412554328139419e-05, - "loss": 0.1579, - "step": 5080 - }, - { - "epoch": 0.5171187646042873, - "grad_norm": 2.28125, - "learning_rate": 2.404498576255416e-05, - "loss": 0.1599, - "step": 5090 - }, - { - "epoch": 0.5181347150259067, - "grad_norm": 0.6484375, - "learning_rate": 2.3964438173132522e-05, - "loss": 0.1508, - "step": 5100 - }, - { - "epoch": 0.5191506654475262, - "grad_norm": 3.390625, - "learning_rate": 2.388390135059395e-05, - "loss": 0.1578, - "step": 5110 - }, - { - "epoch": 0.5201666158691456, - "grad_norm": 1.21875, - "learning_rate": 2.3803376132291226e-05, - "loss": 0.1374, - "step": 5120 - }, - { - "epoch": 0.521182566290765, - "grad_norm": 4.0625, - "learning_rate": 2.3722863355456436e-05, - "loss": 0.1854, - "step": 5130 - }, - { - "epoch": 0.5221985167123845, - "grad_norm": 4.71875, - "learning_rate": 2.364236385719236e-05, - "loss": 0.1391, - "step": 5140 - }, - { - "epoch": 0.5232144671340039, - "grad_norm": 3.296875, - "learning_rate": 2.356187847446366e-05, - "loss": 0.2106, - "step": 5150 - }, - { - "epoch": 0.5242304175556233, - "grad_norm": 3.296875, - "learning_rate": 2.348140804408829e-05, - "loss": 0.2383, - "step": 5160 - }, - { - "epoch": 0.5252463679772427, - "grad_norm": 3.359375, - "learning_rate": 2.3400953402728713e-05, - "loss": 0.1537, - "step": 5170 - }, - { - "epoch": 0.5262623183988622, - "grad_norm": 1.4921875, - "learning_rate": 2.332051538688322e-05, - "loss": 0.1841, - "step": 5180 - }, - { - "epoch": 0.5272782688204816, - "grad_norm": 3.25, - "learning_rate": 2.3240094832877287e-05, - "loss": 0.1855, - "step": 5190 - }, - { - "epoch": 0.528294219242101, - "grad_norm": 3.34375, - "learning_rate": 2.3159692576854793e-05, - "loss": 0.2625, - "step": 5200 - }, - { - "epoch": 0.5293101696637205, - "grad_norm": 3.6875, - "learning_rate": 2.3079309454769413e-05, - "loss": 0.1292, - "step": 5210 - }, - { - "epoch": 0.5303261200853399, - "grad_norm": 1.1171875, - "learning_rate": 2.2998946302375827e-05, - "loss": 0.1263, - "step": 5220 - }, - { - "epoch": 0.5313420705069593, - "grad_norm": 2.71875, - "learning_rate": 2.2918603955221148e-05, - "loss": 0.2296, - "step": 5230 - }, - { - "epoch": 0.5323580209285786, - "grad_norm": 2.015625, - "learning_rate": 2.283828324863613e-05, - "loss": 0.1231, - "step": 5240 - }, - { - "epoch": 0.5333739713501982, - "grad_norm": 3.671875, - "learning_rate": 2.2757985017726557e-05, - "loss": 0.1939, - "step": 5250 - }, - { - "epoch": 0.5343899217718175, - "grad_norm": 1.9765625, - "learning_rate": 2.2677710097364495e-05, - "loss": 0.168, - "step": 5260 - }, - { - "epoch": 0.5354058721934369, - "grad_norm": 2.609375, - "learning_rate": 2.259745932217969e-05, - "loss": 0.1883, - "step": 5270 - }, - { - "epoch": 0.5364218226150563, - "grad_norm": 2.8125, - "learning_rate": 2.2517233526550817e-05, - "loss": 0.1898, - "step": 5280 - }, - { - "epoch": 0.5374377730366758, - "grad_norm": 3.125, - "learning_rate": 2.2437033544596837e-05, - "loss": 0.1838, - "step": 5290 - }, - { - "epoch": 0.5384537234582952, - "grad_norm": 4.90625, - "learning_rate": 2.2356860210168336e-05, - "loss": 0.1553, - "step": 5300 - }, - { - "epoch": 0.5394696738799146, - "grad_norm": 3.171875, - "learning_rate": 2.2276714356838824e-05, - "loss": 0.2248, - "step": 5310 - }, - { - "epoch": 0.5404856243015341, - "grad_norm": 1.34375, - "learning_rate": 2.2196596817896118e-05, - "loss": 0.1421, - "step": 5320 - }, - { - "epoch": 0.5415015747231535, - "grad_norm": 3.28125, - "learning_rate": 2.2116508426333596e-05, - "loss": 0.1947, - "step": 5330 - }, - { - "epoch": 0.5425175251447729, - "grad_norm": 1.9296875, - "learning_rate": 2.2036450014841652e-05, - "loss": 0.2207, - "step": 5340 - }, - { - "epoch": 0.5435334755663923, - "grad_norm": 0.5703125, - "learning_rate": 2.19564224157989e-05, - "loss": 0.2208, - "step": 5350 - }, - { - "epoch": 0.5445494259880118, - "grad_norm": 7.5625, - "learning_rate": 2.1876426461263654e-05, - "loss": 0.1739, - "step": 5360 - }, - { - "epoch": 0.5455653764096312, - "grad_norm": 2.15625, - "learning_rate": 2.179646298296519e-05, - "loss": 0.1938, - "step": 5370 - }, - { - "epoch": 0.5465813268312506, - "grad_norm": 4.1875, - "learning_rate": 2.171653281229511e-05, - "loss": 0.1736, - "step": 5380 - }, - { - "epoch": 0.5475972772528701, - "grad_norm": 4.65625, - "learning_rate": 2.1636636780298732e-05, - "loss": 0.2167, - "step": 5390 - }, - { - "epoch": 0.5486132276744895, - "grad_norm": 1.84375, - "learning_rate": 2.1556775717666427e-05, - "loss": 0.1711, - "step": 5400 - }, - { - "epoch": 0.5496291780961089, - "grad_norm": 5.125, - "learning_rate": 2.147695045472499e-05, - "loss": 0.1789, - "step": 5410 - }, - { - "epoch": 0.5506451285177283, - "grad_norm": 3.859375, - "learning_rate": 2.1397161821428973e-05, - "loss": 0.2187, - "step": 5420 - }, - { - "epoch": 0.5516610789393478, - "grad_norm": 2.25, - "learning_rate": 2.131741064735212e-05, - "loss": 0.1367, - "step": 5430 - }, - { - "epoch": 0.5526770293609672, - "grad_norm": 4.65625, - "learning_rate": 2.1237697761678684e-05, - "loss": 0.1574, - "step": 5440 - }, - { - "epoch": 0.5536929797825866, - "grad_norm": 1.2265625, - "learning_rate": 2.1158023993194848e-05, - "loss": 0.1301, - "step": 5450 - }, - { - "epoch": 0.5547089302042061, - "grad_norm": 4.21875, - "learning_rate": 2.107839017028005e-05, - "loss": 0.2782, - "step": 5460 - }, - { - "epoch": 0.5557248806258255, - "grad_norm": 0.52734375, - "learning_rate": 2.0998797120898457e-05, - "loss": 0.2024, - "step": 5470 - }, - { - "epoch": 0.5567408310474449, - "grad_norm": 1.46875, - "learning_rate": 2.0919245672590277e-05, - "loss": 0.1755, - "step": 5480 - }, - { - "epoch": 0.5577567814690643, - "grad_norm": 2.140625, - "learning_rate": 2.083973665246318e-05, - "loss": 0.2058, - "step": 5490 - }, - { - "epoch": 0.5587727318906838, - "grad_norm": 1.5390625, - "learning_rate": 2.076027088718373e-05, - "loss": 0.2159, - "step": 5500 - }, - { - "epoch": 0.5597886823123032, - "grad_norm": 1.9921875, - "learning_rate": 2.0680849202968743e-05, - "loss": 0.2139, - "step": 5510 - }, - { - "epoch": 0.5608046327339226, - "grad_norm": 2.4375, - "learning_rate": 2.060147242557674e-05, - "loss": 0.183, - "step": 5520 - }, - { - "epoch": 0.5618205831555421, - "grad_norm": 5.5, - "learning_rate": 2.0522141380299308e-05, - "loss": 0.1673, - "step": 5530 - }, - { - "epoch": 0.5628365335771615, - "grad_norm": 4.25, - "learning_rate": 2.044285689195258e-05, - "loss": 0.1674, - "step": 5540 - }, - { - "epoch": 0.5638524839987809, - "grad_norm": 2.109375, - "learning_rate": 2.0363619784868604e-05, - "loss": 0.1531, - "step": 5550 - }, - { - "epoch": 0.5648684344204002, - "grad_norm": 2.59375, - "learning_rate": 2.0284430882886836e-05, - "loss": 0.1665, - "step": 5560 - }, - { - "epoch": 0.5658843848420197, - "grad_norm": 3.984375, - "learning_rate": 2.020529100934549e-05, - "loss": 0.1717, - "step": 5570 - }, - { - "epoch": 0.5669003352636391, - "grad_norm": 1.6015625, - "learning_rate": 2.012620098707306e-05, - "loss": 0.1167, - "step": 5580 - }, - { - "epoch": 0.5679162856852585, - "grad_norm": 6.0625, - "learning_rate": 2.004716163837972e-05, - "loss": 0.2084, - "step": 5590 - }, - { - "epoch": 0.5689322361068779, - "grad_norm": 2.5625, - "learning_rate": 1.996817378504876e-05, - "loss": 0.1939, - "step": 5600 - }, - { - "epoch": 0.5699481865284974, - "grad_norm": 3.109375, - "learning_rate": 1.9889238248328108e-05, - "loss": 0.1241, - "step": 5610 - }, - { - "epoch": 0.5709641369501168, - "grad_norm": 4.875, - "learning_rate": 1.981035584892171e-05, - "loss": 0.1865, - "step": 5620 - }, - { - "epoch": 0.5719800873717362, - "grad_norm": 2.984375, - "learning_rate": 1.9731527406981072e-05, - "loss": 0.1639, - "step": 5630 - }, - { - "epoch": 0.5729960377933557, - "grad_norm": 4.4375, - "learning_rate": 1.9652753742096655e-05, - "loss": 0.2019, - "step": 5640 - }, - { - "epoch": 0.5740119882149751, - "grad_norm": 4.3125, - "learning_rate": 1.9574035673289432e-05, - "loss": 0.1829, - "step": 5650 - }, - { - "epoch": 0.5750279386365945, - "grad_norm": 3.203125, - "learning_rate": 1.9495374019002312e-05, - "loss": 0.2267, - "step": 5660 - }, - { - "epoch": 0.5760438890582139, - "grad_norm": 1.765625, - "learning_rate": 1.9416769597091673e-05, - "loss": 0.1411, - "step": 5670 - }, - { - "epoch": 0.5770598394798334, - "grad_norm": 2.640625, - "learning_rate": 1.9338223224818818e-05, - "loss": 0.1476, - "step": 5680 - }, - { - "epoch": 0.5780757899014528, - "grad_norm": 4.84375, - "learning_rate": 1.9259735718841524e-05, - "loss": 0.1417, - "step": 5690 - }, - { - "epoch": 0.5790917403230722, - "grad_norm": 2.421875, - "learning_rate": 1.918130789520551e-05, - "loss": 0.1592, - "step": 5700 - }, - { - "epoch": 0.5801076907446917, - "grad_norm": 2.984375, - "learning_rate": 1.9102940569335963e-05, - "loss": 0.161, - "step": 5710 - }, - { - "epoch": 0.5811236411663111, - "grad_norm": 1.0234375, - "learning_rate": 1.9024634556029093e-05, - "loss": 0.1614, - "step": 5720 - }, - { - "epoch": 0.5821395915879305, - "grad_norm": 2.90625, - "learning_rate": 1.89463906694436e-05, - "loss": 0.1505, - "step": 5730 - }, - { - "epoch": 0.5831555420095499, - "grad_norm": 2.875, - "learning_rate": 1.8868209723092286e-05, - "loss": 0.1674, - "step": 5740 - }, - { - "epoch": 0.5841714924311694, - "grad_norm": 0.408203125, - "learning_rate": 1.8790092529833508e-05, - "loss": 0.1468, - "step": 5750 - }, - { - "epoch": 0.5851874428527888, - "grad_norm": 5.1875, - "learning_rate": 1.871203990186281e-05, - "loss": 0.1903, - "step": 5760 - }, - { - "epoch": 0.5862033932744082, - "grad_norm": 0.5546875, - "learning_rate": 1.8634052650704415e-05, - "loss": 0.2644, - "step": 5770 - }, - { - "epoch": 0.5872193436960277, - "grad_norm": 3.203125, - "learning_rate": 1.8556131587202848e-05, - "loss": 0.1968, - "step": 5780 - }, - { - "epoch": 0.5882352941176471, - "grad_norm": 2.484375, - "learning_rate": 1.8478277521514424e-05, - "loss": 0.2249, - "step": 5790 - }, - { - "epoch": 0.5892512445392665, - "grad_norm": 4.0, - "learning_rate": 1.8400491263098906e-05, - "loss": 0.1881, - "step": 5800 - }, - { - "epoch": 0.5902671949608859, - "grad_norm": 1.90625, - "learning_rate": 1.832277362071106e-05, - "loss": 0.1352, - "step": 5810 - }, - { - "epoch": 0.5912831453825054, - "grad_norm": 2.765625, - "learning_rate": 1.824512540239221e-05, - "loss": 0.2737, - "step": 5820 - }, - { - "epoch": 0.5922990958041248, - "grad_norm": 2.609375, - "learning_rate": 1.81675474154619e-05, - "loss": 0.1566, - "step": 5830 - }, - { - "epoch": 0.5933150462257442, - "grad_norm": 2.6875, - "learning_rate": 1.8090040466509444e-05, - "loss": 0.1999, - "step": 5840 - }, - { - "epoch": 0.5943309966473637, - "grad_norm": 2.609375, - "learning_rate": 1.8012605361385592e-05, - "loss": 0.2372, - "step": 5850 - }, - { - "epoch": 0.5953469470689831, - "grad_norm": 8.125, - "learning_rate": 1.7935242905194087e-05, - "loss": 0.2411, - "step": 5860 - }, - { - "epoch": 0.5963628974906025, - "grad_norm": 3.46875, - "learning_rate": 1.785795390228336e-05, - "loss": 0.138, - "step": 5870 - }, - { - "epoch": 0.5973788479122218, - "grad_norm": 2.3125, - "learning_rate": 1.7780739156238125e-05, - "loss": 0.1867, - "step": 5880 - }, - { - "epoch": 0.5983947983338413, - "grad_norm": 4.0625, - "learning_rate": 1.770359946987105e-05, - "loss": 0.2091, - "step": 5890 - }, - { - "epoch": 0.5994107487554607, - "grad_norm": 5.21875, - "learning_rate": 1.7626535645214378e-05, - "loss": 0.2091, - "step": 5900 - }, - { - "epoch": 0.6004266991770801, - "grad_norm": 3.15625, - "learning_rate": 1.7549548483511614e-05, - "loss": 0.1927, - "step": 5910 - }, - { - "epoch": 0.6014426495986995, - "grad_norm": 4.71875, - "learning_rate": 1.7472638785209198e-05, - "loss": 0.1893, - "step": 5920 - }, - { - "epoch": 0.602458600020319, - "grad_norm": 3.015625, - "learning_rate": 1.7395807349948145e-05, - "loss": 0.1557, - "step": 5930 - }, - { - "epoch": 0.6034745504419384, - "grad_norm": 2.9375, - "learning_rate": 1.73190549765558e-05, - "loss": 0.1717, - "step": 5940 - }, - { - "epoch": 0.6044905008635578, - "grad_norm": 3.109375, - "learning_rate": 1.724238246303745e-05, - "loss": 0.1879, - "step": 5950 - }, - { - "epoch": 0.6055064512851773, - "grad_norm": 3.875, - "learning_rate": 1.71657906065681e-05, - "loss": 0.1908, - "step": 5960 - }, - { - "epoch": 0.6065224017067967, - "grad_norm": 5.09375, - "learning_rate": 1.7089280203484115e-05, - "loss": 0.1712, - "step": 5970 - }, - { - "epoch": 0.6075383521284161, - "grad_norm": 3.015625, - "learning_rate": 1.701285204927502e-05, - "loss": 0.1454, - "step": 5980 - }, - { - "epoch": 0.6085543025500355, - "grad_norm": 3.265625, - "learning_rate": 1.693650693857515e-05, - "loss": 0.2283, - "step": 5990 - }, - { - "epoch": 0.609570252971655, - "grad_norm": 3.40625, - "learning_rate": 1.6860245665155466e-05, - "loss": 0.2188, - "step": 6000 - } - ], - "logging_steps": 10, - "max_steps": 9843, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "total_flos": 0.0, - "train_batch_size": 7, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoints/checkpoint-6000/training_args.bin b/checkpoints/checkpoint-6000/training_args.bin deleted file mode 100644 index 6bb0a4ec10ab29ca1942aecdfa4212d352e373f1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-6000/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 -size 4984 diff --git a/checkpoints/checkpoint-6500/adapter_config.json b/checkpoints/checkpoint-6500/adapter_config.json deleted file mode 100644 index 2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-6500/adapter_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "alpha_pattern": {}, - "auto_mapping": null, - "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": "gaussian", - "layer_replication": null, - "layers_pattern": null, - "layers_to_transform": null, - "loftq_config": {}, - "lora_alpha": 32, - "lora_dropout": 0.1, - "megatron_config": null, - "megatron_core": "megatron.core", - "modules_to_save": null, - "peft_type": "LORA", - "r": 8, - "rank_pattern": {}, - "revision": null, - "target_modules": [ - "v_proj", - "q_proj" - ], - "task_type": "CAUSAL_LM", - "use_dora": false, - "use_rslora": false -} \ No newline at end of file diff --git a/checkpoints/checkpoint-6500/adapter_model.safetensors b/checkpoints/checkpoint-6500/adapter_model.safetensors deleted file mode 100644 index ea7be9ee67cb55626a3d8d3ef209fc4f1bbf148f..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-6500/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:14ab2e0853dd44a4b03a9d15251deba57b15ed3261fc5e75ea0c78d8f9481486 -size 5919456 diff --git a/checkpoints/checkpoint-6500/optimizer.pt b/checkpoints/checkpoint-6500/optimizer.pt deleted file mode 100644 index 5ce7862baad54ec5623f4b4c5b4996f11f14e8c1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-6500/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7047b3d5af563c786060cdd116349c114053ee2307bebbbf5f5214200bcde1ce -size 11930938 diff --git a/checkpoints/checkpoint-6500/rng_state_0.pth b/checkpoints/checkpoint-6500/rng_state_0.pth deleted file mode 100644 index 0e8e09c2f4a00e6486fc03f4d8a2e8a9eb1caac7..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-6500/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a591de0feafd38f3dfb34a5639b5f07fc69364d8918cb41bee6dd6766d4ef1d2 -size 15024 diff --git a/checkpoints/checkpoint-6500/rng_state_1.pth b/checkpoints/checkpoint-6500/rng_state_1.pth deleted file mode 100644 index 0b9a1717d7d349f476f13e5192f06dc234ca82ea..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-6500/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:da06f3a5ac82888a33cdffca90c924defae45ba6fd9ff5004219e2c9f4170e79 -size 15024 diff --git a/checkpoints/checkpoint-6500/rng_state_2.pth b/checkpoints/checkpoint-6500/rng_state_2.pth deleted file mode 100644 index 504d07665b63dfacbdb864145313ac5c3fee5a57..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-6500/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4d391446f8c14431e9c25329555ff1b3c6971fcfbfabf701b23c665e748c381f -size 15024 diff --git a/checkpoints/checkpoint-6500/rng_state_3.pth b/checkpoints/checkpoint-6500/rng_state_3.pth deleted file mode 100644 index fcf06f5ee914106315184abc1f4e43a3e0dc6a3e..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-6500/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:34b3a168af7b46a8a4885487b4c51f6f868f8ac39e0a20191646e4671ae739ea -size 15024 diff --git a/checkpoints/checkpoint-6500/scheduler.pt b/checkpoints/checkpoint-6500/scheduler.pt deleted file mode 100644 index bed16b1cc961349a026d0a7300f4c19a4a4cb3b3..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-6500/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9bdea953796bedc1929c6f94c7498a5dbf5c6f421163832ce66f06c7e038f707 -size 1064 diff --git a/checkpoints/checkpoint-6500/trainer_state.json b/checkpoints/checkpoint-6500/trainer_state.json deleted file mode 100644 index 1e3fe36bbb17bb1b607af62a6682ab9d45672e47..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-6500/trainer_state.json +++ /dev/null @@ -1,4571 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.6603677740526263, - "eval_steps": 500, - "global_step": 6500, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001015950421619425, - "grad_norm": 17.625, - "learning_rate": 5e-06, - "loss": 3.4264, - "step": 10 - }, - { - "epoch": 0.00203190084323885, - "grad_norm": 12.5625, - "learning_rate": 1e-05, - "loss": 3.432, - "step": 20 - }, - { - "epoch": 0.003047851264858275, - "grad_norm": 14.0625, - "learning_rate": 1.5e-05, - "loss": 3.23, - "step": 30 - }, - { - "epoch": 0.0040638016864777, - "grad_norm": 12.4375, - "learning_rate": 2e-05, - "loss": 2.9762, - "step": 40 - }, - { - "epoch": 0.005079752108097125, - "grad_norm": 10.0625, - "learning_rate": 2.5e-05, - "loss": 2.6173, - "step": 50 - }, - { - "epoch": 0.00609570252971655, - "grad_norm": 10.1875, - "learning_rate": 3e-05, - "loss": 2.2004, - "step": 60 - }, - { - "epoch": 0.007111652951335975, - "grad_norm": 7.03125, - "learning_rate": 3.5e-05, - "loss": 1.4176, - "step": 70 - }, - { - "epoch": 0.0081276033729554, - "grad_norm": 4.375, - "learning_rate": 4e-05, - "loss": 1.0122, - "step": 80 - }, - { - "epoch": 0.009143553794574825, - "grad_norm": 6.5625, - "learning_rate": 4.5e-05, - "loss": 0.9116, - "step": 90 - }, - { - "epoch": 0.01015950421619425, - "grad_norm": 5.28125, - "learning_rate": 5e-05, - "loss": 0.6832, - "step": 100 - }, - { - "epoch": 0.011175454637813675, - "grad_norm": 5.5, - "learning_rate": 4.9999870035728426e-05, - "loss": 0.7355, - "step": 110 - }, - { - "epoch": 0.0121914050594331, - "grad_norm": 5.1875, - "learning_rate": 4.9999480144264944e-05, - "loss": 0.6673, - "step": 120 - }, - { - "epoch": 0.013207355481052525, - "grad_norm": 4.5, - "learning_rate": 4.9998830329663314e-05, - "loss": 0.6792, - "step": 130 - }, - { - "epoch": 0.01422330590267195, - "grad_norm": 3.9375, - "learning_rate": 4.9997920598679756e-05, - "loss": 0.6207, - "step": 140 - }, - { - "epoch": 0.015239256324291375, - "grad_norm": 3.15625, - "learning_rate": 4.999675096077286e-05, - "loss": 0.483, - "step": 150 - }, - { - "epoch": 0.0162552067459108, - "grad_norm": 5.28125, - "learning_rate": 4.999532142810354e-05, - "loss": 0.5319, - "step": 160 - }, - { - "epoch": 0.017271157167530225, - "grad_norm": 4.59375, - "learning_rate": 4.999363201553483e-05, - "loss": 0.6052, - "step": 170 - }, - { - "epoch": 0.01828710758914965, - "grad_norm": 5.03125, - "learning_rate": 4.9991682740631794e-05, - "loss": 0.4258, - "step": 180 - }, - { - "epoch": 0.019303058010769075, - "grad_norm": 3.859375, - "learning_rate": 4.998947362366133e-05, - "loss": 0.4309, - "step": 190 - }, - { - "epoch": 0.0203190084323885, - "grad_norm": 3.328125, - "learning_rate": 4.998700468759193e-05, - "loss": 0.3957, - "step": 200 - }, - { - "epoch": 0.021334958854007924, - "grad_norm": 4.9375, - "learning_rate": 4.9984275958093475e-05, - "loss": 0.4777, - "step": 210 - }, - { - "epoch": 0.02235090927562735, - "grad_norm": 4.78125, - "learning_rate": 4.998128746353695e-05, - "loss": 0.3549, - "step": 220 - }, - { - "epoch": 0.023366859697246774, - "grad_norm": 4.0625, - "learning_rate": 4.997803923499417e-05, - "loss": 0.4447, - "step": 230 - }, - { - "epoch": 0.0243828101188662, - "grad_norm": 6.375, - "learning_rate": 4.99745313062374e-05, - "loss": 0.3808, - "step": 240 - }, - { - "epoch": 0.025398760540485624, - "grad_norm": 3.59375, - "learning_rate": 4.99707637137391e-05, - "loss": 0.3827, - "step": 250 - }, - { - "epoch": 0.02641471096210505, - "grad_norm": 3.015625, - "learning_rate": 4.996673649667145e-05, - "loss": 0.3694, - "step": 260 - }, - { - "epoch": 0.027430661383724474, - "grad_norm": 2.296875, - "learning_rate": 4.9962449696906e-05, - "loss": 0.3586, - "step": 270 - }, - { - "epoch": 0.0284466118053439, - "grad_norm": 4.125, - "learning_rate": 4.9957903359013214e-05, - "loss": 0.3832, - "step": 280 - }, - { - "epoch": 0.029462562226963324, - "grad_norm": 3.296875, - "learning_rate": 4.995309753026201e-05, - "loss": 0.328, - "step": 290 - }, - { - "epoch": 0.03047851264858275, - "grad_norm": 4.5, - "learning_rate": 4.994803226061927e-05, - "loss": 0.3667, - "step": 300 - }, - { - "epoch": 0.03149446307020217, - "grad_norm": 4.3125, - "learning_rate": 4.994270760274933e-05, - "loss": 0.3811, - "step": 310 - }, - { - "epoch": 0.0325104134918216, - "grad_norm": 3.421875, - "learning_rate": 4.99371236120134e-05, - "loss": 0.3065, - "step": 320 - }, - { - "epoch": 0.03352636391344102, - "grad_norm": 4.6875, - "learning_rate": 4.993128034646902e-05, - "loss": 0.4177, - "step": 330 - }, - { - "epoch": 0.03454231433506045, - "grad_norm": 3.046875, - "learning_rate": 4.992517786686947e-05, - "loss": 0.33, - "step": 340 - }, - { - "epoch": 0.03555826475667987, - "grad_norm": 1.8828125, - "learning_rate": 4.9918816236663077e-05, - "loss": 0.3287, - "step": 350 - }, - { - "epoch": 0.0365742151782993, - "grad_norm": 3.8125, - "learning_rate": 4.991219552199262e-05, - "loss": 0.2934, - "step": 360 - }, - { - "epoch": 0.03759016559991872, - "grad_norm": 4.28125, - "learning_rate": 4.99053157916946e-05, - "loss": 0.3176, - "step": 370 - }, - { - "epoch": 0.03860611602153815, - "grad_norm": 2.609375, - "learning_rate": 4.989817711729856e-05, - "loss": 0.3318, - "step": 380 - }, - { - "epoch": 0.03962206644315757, - "grad_norm": 2.375, - "learning_rate": 4.98907795730263e-05, - "loss": 0.3234, - "step": 390 - }, - { - "epoch": 0.040638016864777, - "grad_norm": 4.46875, - "learning_rate": 4.988312323579114e-05, - "loss": 0.267, - "step": 400 - }, - { - "epoch": 0.04165396728639642, - "grad_norm": 3.75, - "learning_rate": 4.98752081851971e-05, - "loss": 0.3081, - "step": 410 - }, - { - "epoch": 0.04266991770801585, - "grad_norm": 2.203125, - "learning_rate": 4.986703450353809e-05, - "loss": 0.2917, - "step": 420 - }, - { - "epoch": 0.04368586812963527, - "grad_norm": 1.6015625, - "learning_rate": 4.985860227579703e-05, - "loss": 0.2805, - "step": 430 - }, - { - "epoch": 0.0447018185512547, - "grad_norm": 3.140625, - "learning_rate": 4.984991158964499e-05, - "loss": 0.3534, - "step": 440 - }, - { - "epoch": 0.04571776897287412, - "grad_norm": 3.296875, - "learning_rate": 4.9840962535440265e-05, - "loss": 0.335, - "step": 450 - }, - { - "epoch": 0.04673371939449355, - "grad_norm": 3.25, - "learning_rate": 4.983175520622744e-05, - "loss": 0.2544, - "step": 460 - }, - { - "epoch": 0.04774966981611297, - "grad_norm": 2.25, - "learning_rate": 4.982228969773642e-05, - "loss": 0.3449, - "step": 470 - }, - { - "epoch": 0.0487656202377324, - "grad_norm": 4.9375, - "learning_rate": 4.9812566108381435e-05, - "loss": 0.2964, - "step": 480 - }, - { - "epoch": 0.04978157065935182, - "grad_norm": 1.5703125, - "learning_rate": 4.9802584539260035e-05, - "loss": 0.2799, - "step": 490 - }, - { - "epoch": 0.05079752108097125, - "grad_norm": 2.828125, - "learning_rate": 4.979234509415199e-05, - "loss": 0.3231, - "step": 500 - }, - { - "epoch": 0.05181347150259067, - "grad_norm": 2.9375, - "learning_rate": 4.978184787951828e-05, - "loss": 0.2943, - "step": 510 - }, - { - "epoch": 0.0528294219242101, - "grad_norm": 2.34375, - "learning_rate": 4.977109300449992e-05, - "loss": 0.2705, - "step": 520 - }, - { - "epoch": 0.05384537234582952, - "grad_norm": 3.140625, - "learning_rate": 4.9760080580916876e-05, - "loss": 0.2998, - "step": 530 - }, - { - "epoch": 0.05486132276744895, - "grad_norm": 3.5625, - "learning_rate": 4.974881072326688e-05, - "loss": 0.2595, - "step": 540 - }, - { - "epoch": 0.05587727318906837, - "grad_norm": 4.25, - "learning_rate": 4.9737283548724236e-05, - "loss": 0.2803, - "step": 550 - }, - { - "epoch": 0.0568932236106878, - "grad_norm": 4.0625, - "learning_rate": 4.97254991771386e-05, - "loss": 0.3511, - "step": 560 - }, - { - "epoch": 0.05790917403230722, - "grad_norm": 2.515625, - "learning_rate": 4.971345773103377e-05, - "loss": 0.312, - "step": 570 - }, - { - "epoch": 0.05892512445392665, - "grad_norm": 3.21875, - "learning_rate": 4.9701159335606365e-05, - "loss": 0.2482, - "step": 580 - }, - { - "epoch": 0.05994107487554607, - "grad_norm": 5.5, - "learning_rate": 4.968860411872454e-05, - "loss": 0.2537, - "step": 590 - }, - { - "epoch": 0.0609570252971655, - "grad_norm": 3.546875, - "learning_rate": 4.967579221092666e-05, - "loss": 0.3125, - "step": 600 - }, - { - "epoch": 0.06197297571878492, - "grad_norm": 2.984375, - "learning_rate": 4.966272374541996e-05, - "loss": 0.2354, - "step": 610 - }, - { - "epoch": 0.06298892614040434, - "grad_norm": 3.6875, - "learning_rate": 4.964939885807912e-05, - "loss": 0.3213, - "step": 620 - }, - { - "epoch": 0.06400487656202378, - "grad_norm": 2.140625, - "learning_rate": 4.9635817687444876e-05, - "loss": 0.3003, - "step": 630 - }, - { - "epoch": 0.0650208269836432, - "grad_norm": 3.484375, - "learning_rate": 4.962198037472259e-05, - "loss": 0.2996, - "step": 640 - }, - { - "epoch": 0.06603677740526262, - "grad_norm": 3.21875, - "learning_rate": 4.9607887063780776e-05, - "loss": 0.2257, - "step": 650 - }, - { - "epoch": 0.06705272782688204, - "grad_norm": 5.375, - "learning_rate": 4.9593537901149564e-05, - "loss": 0.223, - "step": 660 - }, - { - "epoch": 0.06806867824850148, - "grad_norm": 4.1875, - "learning_rate": 4.957893303601924e-05, - "loss": 0.3407, - "step": 670 - }, - { - "epoch": 0.0690846286701209, - "grad_norm": 3.328125, - "learning_rate": 4.956407262023866e-05, - "loss": 0.2589, - "step": 680 - }, - { - "epoch": 0.07010057909174032, - "grad_norm": 2.953125, - "learning_rate": 4.954895680831367e-05, - "loss": 0.2949, - "step": 690 - }, - { - "epoch": 0.07111652951335974, - "grad_norm": 4.0625, - "learning_rate": 4.9533585757405506e-05, - "loss": 0.2995, - "step": 700 - }, - { - "epoch": 0.07213247993497918, - "grad_norm": 4.625, - "learning_rate": 4.951795962732917e-05, - "loss": 0.2894, - "step": 710 - }, - { - "epoch": 0.0731484303565986, - "grad_norm": 3.0, - "learning_rate": 4.9502078580551755e-05, - "loss": 0.3082, - "step": 720 - }, - { - "epoch": 0.07416438077821802, - "grad_norm": 3.65625, - "learning_rate": 4.9485942782190734e-05, - "loss": 0.2308, - "step": 730 - }, - { - "epoch": 0.07518033119983744, - "grad_norm": 4.78125, - "learning_rate": 4.9469552400012306e-05, - "loss": 0.2272, - "step": 740 - }, - { - "epoch": 0.07619628162145688, - "grad_norm": 4.25, - "learning_rate": 4.94529076044296e-05, - "loss": 0.2701, - "step": 750 - }, - { - "epoch": 0.0772122320430763, - "grad_norm": 3.140625, - "learning_rate": 4.94360085685009e-05, - "loss": 0.2686, - "step": 760 - }, - { - "epoch": 0.07822818246469572, - "grad_norm": 0.765625, - "learning_rate": 4.9418855467927894e-05, - "loss": 0.2051, - "step": 770 - }, - { - "epoch": 0.07924413288631514, - "grad_norm": 1.796875, - "learning_rate": 4.940144848105379e-05, - "loss": 0.2267, - "step": 780 - }, - { - "epoch": 0.08026008330793458, - "grad_norm": 4.5625, - "learning_rate": 4.93837877888615e-05, - "loss": 0.2597, - "step": 790 - }, - { - "epoch": 0.081276033729554, - "grad_norm": 3.03125, - "learning_rate": 4.9365873574971745e-05, - "loss": 0.3701, - "step": 800 - }, - { - "epoch": 0.08229198415117342, - "grad_norm": 4.5625, - "learning_rate": 4.9347706025641136e-05, - "loss": 0.2559, - "step": 810 - }, - { - "epoch": 0.08330793457279284, - "grad_norm": 3.90625, - "learning_rate": 4.9329285329760275e-05, - "loss": 0.2799, - "step": 820 - }, - { - "epoch": 0.08432388499441228, - "grad_norm": 3.140625, - "learning_rate": 4.9310611678851735e-05, - "loss": 0.2866, - "step": 830 - }, - { - "epoch": 0.0853398354160317, - "grad_norm": 2.46875, - "learning_rate": 4.929168526706811e-05, - "loss": 0.3105, - "step": 840 - }, - { - "epoch": 0.08635578583765112, - "grad_norm": 13.625, - "learning_rate": 4.927250629119e-05, - "loss": 0.2454, - "step": 850 - }, - { - "epoch": 0.08737173625927054, - "grad_norm": 3.921875, - "learning_rate": 4.9253074950623925e-05, - "loss": 0.2424, - "step": 860 - }, - { - "epoch": 0.08838768668088998, - "grad_norm": 2.90625, - "learning_rate": 4.9233391447400286e-05, - "loss": 0.2481, - "step": 870 - }, - { - "epoch": 0.0894036371025094, - "grad_norm": 2.96875, - "learning_rate": 4.921345598617125e-05, - "loss": 0.2231, - "step": 880 - }, - { - "epoch": 0.09041958752412882, - "grad_norm": 5.375, - "learning_rate": 4.9193268774208654e-05, - "loss": 0.3447, - "step": 890 - }, - { - "epoch": 0.09143553794574824, - "grad_norm": 2.0, - "learning_rate": 4.9172830021401785e-05, - "loss": 0.229, - "step": 900 - }, - { - "epoch": 0.09245148836736768, - "grad_norm": 3.1875, - "learning_rate": 4.9152139940255245e-05, - "loss": 0.2122, - "step": 910 - }, - { - "epoch": 0.0934674387889871, - "grad_norm": 3.40625, - "learning_rate": 4.913119874588677e-05, - "loss": 0.2386, - "step": 920 - }, - { - "epoch": 0.09448338921060652, - "grad_norm": 1.4609375, - "learning_rate": 4.911000665602489e-05, - "loss": 0.1944, - "step": 930 - }, - { - "epoch": 0.09549933963222594, - "grad_norm": 5.0625, - "learning_rate": 4.9088563891006786e-05, - "loss": 0.2038, - "step": 940 - }, - { - "epoch": 0.09651529005384538, - "grad_norm": 4.53125, - "learning_rate": 4.906687067377592e-05, - "loss": 0.3122, - "step": 950 - }, - { - "epoch": 0.0975312404754648, - "grad_norm": 2.84375, - "learning_rate": 4.904492722987976e-05, - "loss": 0.3157, - "step": 960 - }, - { - "epoch": 0.09854719089708422, - "grad_norm": 2.171875, - "learning_rate": 4.902273378746738e-05, - "loss": 0.3077, - "step": 970 - }, - { - "epoch": 0.09956314131870364, - "grad_norm": 2.84375, - "learning_rate": 4.9000290577287165e-05, - "loss": 0.2756, - "step": 980 - }, - { - "epoch": 0.10057909174032308, - "grad_norm": 0.99609375, - "learning_rate": 4.897759783268434e-05, - "loss": 0.2915, - "step": 990 - }, - { - "epoch": 0.1015950421619425, - "grad_norm": 3.53125, - "learning_rate": 4.895465578959859e-05, - "loss": 0.2052, - "step": 1000 - }, - { - "epoch": 0.10261099258356192, - "grad_norm": 4.0, - "learning_rate": 4.893146468656159e-05, - "loss": 0.2499, - "step": 1010 - }, - { - "epoch": 0.10362694300518134, - "grad_norm": 1.65625, - "learning_rate": 4.890802476469452e-05, - "loss": 0.278, - "step": 1020 - }, - { - "epoch": 0.10464289342680078, - "grad_norm": 3.625, - "learning_rate": 4.888433626770558e-05, - "loss": 0.2143, - "step": 1030 - }, - { - "epoch": 0.1056588438484202, - "grad_norm": 5.0625, - "learning_rate": 4.886039944188741e-05, - "loss": 0.2878, - "step": 1040 - }, - { - "epoch": 0.10667479427003962, - "grad_norm": 4.5, - "learning_rate": 4.883621453611461e-05, - "loss": 0.2744, - "step": 1050 - }, - { - "epoch": 0.10769074469165904, - "grad_norm": 4.5625, - "learning_rate": 4.881178180184106e-05, - "loss": 0.2734, - "step": 1060 - }, - { - "epoch": 0.10870669511327848, - "grad_norm": 3.125, - "learning_rate": 4.878710149309735e-05, - "loss": 0.3574, - "step": 1070 - }, - { - "epoch": 0.1097226455348979, - "grad_norm": 3.0625, - "learning_rate": 4.876217386648816e-05, - "loss": 0.2625, - "step": 1080 - }, - { - "epoch": 0.11073859595651732, - "grad_norm": 4.0625, - "learning_rate": 4.873699918118955e-05, - "loss": 0.2437, - "step": 1090 - }, - { - "epoch": 0.11175454637813674, - "grad_norm": 1.59375, - "learning_rate": 4.87115776989463e-05, - "loss": 0.2051, - "step": 1100 - }, - { - "epoch": 0.11277049679975618, - "grad_norm": 4.375, - "learning_rate": 4.8685909684069153e-05, - "loss": 0.1727, - "step": 1110 - }, - { - "epoch": 0.1137864472213756, - "grad_norm": 2.28125, - "learning_rate": 4.865999540343211e-05, - "loss": 0.2256, - "step": 1120 - }, - { - "epoch": 0.11480239764299502, - "grad_norm": 2.265625, - "learning_rate": 4.86338351264696e-05, - "loss": 0.3529, - "step": 1130 - }, - { - "epoch": 0.11581834806461444, - "grad_norm": 2.34375, - "learning_rate": 4.8607429125173754e-05, - "loss": 0.2113, - "step": 1140 - }, - { - "epoch": 0.11683429848623388, - "grad_norm": 0.7578125, - "learning_rate": 4.858077767409149e-05, - "loss": 0.2759, - "step": 1150 - }, - { - "epoch": 0.1178502489078533, - "grad_norm": 3.640625, - "learning_rate": 4.855388105032174e-05, - "loss": 0.2482, - "step": 1160 - }, - { - "epoch": 0.11886619932947272, - "grad_norm": 3.5, - "learning_rate": 4.852673953351249e-05, - "loss": 0.1865, - "step": 1170 - }, - { - "epoch": 0.11988214975109214, - "grad_norm": 3.75, - "learning_rate": 4.849935340585796e-05, - "loss": 0.2659, - "step": 1180 - }, - { - "epoch": 0.12089810017271158, - "grad_norm": 3.375, - "learning_rate": 4.8471722952095586e-05, - "loss": 0.1506, - "step": 1190 - }, - { - "epoch": 0.121914050594331, - "grad_norm": 3.34375, - "learning_rate": 4.844384845950312e-05, - "loss": 0.307, - "step": 1200 - }, - { - "epoch": 0.12293000101595042, - "grad_norm": 1.578125, - "learning_rate": 4.841573021789561e-05, - "loss": 0.1952, - "step": 1210 - }, - { - "epoch": 0.12394595143756984, - "grad_norm": 1.2890625, - "learning_rate": 4.838736851962239e-05, - "loss": 0.1779, - "step": 1220 - }, - { - "epoch": 0.12496190185918928, - "grad_norm": 1.265625, - "learning_rate": 4.835876365956408e-05, - "loss": 0.1235, - "step": 1230 - }, - { - "epoch": 0.12597785228080868, - "grad_norm": 1.9609375, - "learning_rate": 4.8329915935129436e-05, - "loss": 0.1876, - "step": 1240 - }, - { - "epoch": 0.12699380270242813, - "grad_norm": 1.6328125, - "learning_rate": 4.830082564625235e-05, - "loss": 0.2188, - "step": 1250 - }, - { - "epoch": 0.12800975312404755, - "grad_norm": 3.96875, - "learning_rate": 4.8271493095388684e-05, - "loss": 0.2622, - "step": 1260 - }, - { - "epoch": 0.12902570354566698, - "grad_norm": 3.765625, - "learning_rate": 4.824191858751312e-05, - "loss": 0.2724, - "step": 1270 - }, - { - "epoch": 0.1300416539672864, - "grad_norm": 5.59375, - "learning_rate": 4.821210243011601e-05, - "loss": 0.2413, - "step": 1280 - }, - { - "epoch": 0.13105760438890582, - "grad_norm": 3.34375, - "learning_rate": 4.818204493320016e-05, - "loss": 0.2618, - "step": 1290 - }, - { - "epoch": 0.13207355481052524, - "grad_norm": 2.78125, - "learning_rate": 4.8151746409277634e-05, - "loss": 0.2295, - "step": 1300 - }, - { - "epoch": 0.13308950523214466, - "grad_norm": 3.1875, - "learning_rate": 4.8121207173366484e-05, - "loss": 0.2733, - "step": 1310 - }, - { - "epoch": 0.13410545565376408, - "grad_norm": 2.28125, - "learning_rate": 4.809042754298746e-05, - "loss": 0.2311, - "step": 1320 - }, - { - "epoch": 0.13512140607538353, - "grad_norm": 2.171875, - "learning_rate": 4.805940783816075e-05, - "loss": 0.2059, - "step": 1330 - }, - { - "epoch": 0.13613735649700295, - "grad_norm": 2.796875, - "learning_rate": 4.8028148381402625e-05, - "loss": 0.2102, - "step": 1340 - }, - { - "epoch": 0.13715330691862238, - "grad_norm": 2.96875, - "learning_rate": 4.7996649497722084e-05, - "loss": 0.2708, - "step": 1350 - }, - { - "epoch": 0.1381692573402418, - "grad_norm": 2.4375, - "learning_rate": 4.7964911514617485e-05, - "loss": 0.2429, - "step": 1360 - }, - { - "epoch": 0.13918520776186122, - "grad_norm": 5.8125, - "learning_rate": 4.793293476207312e-05, - "loss": 0.2725, - "step": 1370 - }, - { - "epoch": 0.14020115818348064, - "grad_norm": 2.40625, - "learning_rate": 4.790071957255585e-05, - "loss": 0.2098, - "step": 1380 - }, - { - "epoch": 0.14121710860510006, - "grad_norm": 4.25, - "learning_rate": 4.786826628101154e-05, - "loss": 0.2101, - "step": 1390 - }, - { - "epoch": 0.14223305902671948, - "grad_norm": 2.578125, - "learning_rate": 4.783557522486167e-05, - "loss": 0.2624, - "step": 1400 - }, - { - "epoch": 0.14324900944833893, - "grad_norm": 3.125, - "learning_rate": 4.780264674399978e-05, - "loss": 0.2518, - "step": 1410 - }, - { - "epoch": 0.14426495986995835, - "grad_norm": 3.671875, - "learning_rate": 4.7769481180787966e-05, - "loss": 0.3112, - "step": 1420 - }, - { - "epoch": 0.14528091029157778, - "grad_norm": 3.984375, - "learning_rate": 4.773607888005327e-05, - "loss": 0.2747, - "step": 1430 - }, - { - "epoch": 0.1462968607131972, - "grad_norm": 3.234375, - "learning_rate": 4.770244018908416e-05, - "loss": 0.1572, - "step": 1440 - }, - { - "epoch": 0.14731281113481662, - "grad_norm": 4.09375, - "learning_rate": 4.766856545762687e-05, - "loss": 0.2148, - "step": 1450 - }, - { - "epoch": 0.14832876155643604, - "grad_norm": 1.6875, - "learning_rate": 4.763445503788178e-05, - "loss": 0.2531, - "step": 1460 - }, - { - "epoch": 0.14934471197805546, - "grad_norm": 2.375, - "learning_rate": 4.760010928449976e-05, - "loss": 0.199, - "step": 1470 - }, - { - "epoch": 0.15036066239967488, - "grad_norm": 4.6875, - "learning_rate": 4.7565528554578485e-05, - "loss": 0.2366, - "step": 1480 - }, - { - "epoch": 0.15137661282129433, - "grad_norm": 5.4375, - "learning_rate": 4.75307132076587e-05, - "loss": 0.1862, - "step": 1490 - }, - { - "epoch": 0.15239256324291375, - "grad_norm": 2.484375, - "learning_rate": 4.749566360572049e-05, - "loss": 0.2143, - "step": 1500 - }, - { - "epoch": 0.15340851366453317, - "grad_norm": 2.1875, - "learning_rate": 4.746038011317955e-05, - "loss": 0.1877, - "step": 1510 - }, - { - "epoch": 0.1544244640861526, - "grad_norm": 2.84375, - "learning_rate": 4.742486309688333e-05, - "loss": 0.2831, - "step": 1520 - }, - { - "epoch": 0.15544041450777202, - "grad_norm": 2.015625, - "learning_rate": 4.738911292610732e-05, - "loss": 0.1708, - "step": 1530 - }, - { - "epoch": 0.15645636492939144, - "grad_norm": 3.953125, - "learning_rate": 4.735312997255107e-05, - "loss": 0.192, - "step": 1540 - }, - { - "epoch": 0.15747231535101086, - "grad_norm": 2.09375, - "learning_rate": 4.7316914610334475e-05, - "loss": 0.2586, - "step": 1550 - }, - { - "epoch": 0.15848826577263028, - "grad_norm": 3.6875, - "learning_rate": 4.728046721599378e-05, - "loss": 0.2141, - "step": 1560 - }, - { - "epoch": 0.15950421619424973, - "grad_norm": 2.9375, - "learning_rate": 4.724378816847771e-05, - "loss": 0.193, - "step": 1570 - }, - { - "epoch": 0.16052016661586915, - "grad_norm": 1.5625, - "learning_rate": 4.720687784914352e-05, - "loss": 0.191, - "step": 1580 - }, - { - "epoch": 0.16153611703748857, - "grad_norm": 3.75, - "learning_rate": 4.716973664175304e-05, - "loss": 0.2172, - "step": 1590 - }, - { - "epoch": 0.162552067459108, - "grad_norm": 3.125, - "learning_rate": 4.7132364932468645e-05, - "loss": 0.2134, - "step": 1600 - }, - { - "epoch": 0.16356801788072742, - "grad_norm": 4.09375, - "learning_rate": 4.709476310984932e-05, - "loss": 0.2055, - "step": 1610 - }, - { - "epoch": 0.16458396830234684, - "grad_norm": 3.875, - "learning_rate": 4.705693156484652e-05, - "loss": 0.2136, - "step": 1620 - }, - { - "epoch": 0.16559991872396626, - "grad_norm": 1.1796875, - "learning_rate": 4.7018870690800196e-05, - "loss": 0.1471, - "step": 1630 - }, - { - "epoch": 0.16661586914558568, - "grad_norm": 2.5, - "learning_rate": 4.698058088343465e-05, - "loss": 0.2308, - "step": 1640 - }, - { - "epoch": 0.16763181956720513, - "grad_norm": 1.390625, - "learning_rate": 4.6942062540854425e-05, - "loss": 0.2456, - "step": 1650 - }, - { - "epoch": 0.16864776998882455, - "grad_norm": 3.125, - "learning_rate": 4.69033160635402e-05, - "loss": 0.2654, - "step": 1660 - }, - { - "epoch": 0.16966372041044397, - "grad_norm": 3.984375, - "learning_rate": 4.6864341854344587e-05, - "loss": 0.2226, - "step": 1670 - }, - { - "epoch": 0.1706796708320634, - "grad_norm": 2.328125, - "learning_rate": 4.682514031848795e-05, - "loss": 0.2438, - "step": 1680 - }, - { - "epoch": 0.17169562125368282, - "grad_norm": 3.078125, - "learning_rate": 4.678571186355423e-05, - "loss": 0.1889, - "step": 1690 - }, - { - "epoch": 0.17271157167530224, - "grad_norm": 3.328125, - "learning_rate": 4.6746056899486644e-05, - "loss": 0.2117, - "step": 1700 - }, - { - "epoch": 0.17372752209692166, - "grad_norm": 2.78125, - "learning_rate": 4.67061758385835e-05, - "loss": 0.1953, - "step": 1710 - }, - { - "epoch": 0.17474347251854108, - "grad_norm": 3.09375, - "learning_rate": 4.6666069095493816e-05, - "loss": 0.1844, - "step": 1720 - }, - { - "epoch": 0.17575942294016053, - "grad_norm": 3.234375, - "learning_rate": 4.662573708721309e-05, - "loss": 0.2774, - "step": 1730 - }, - { - "epoch": 0.17677537336177995, - "grad_norm": 4.03125, - "learning_rate": 4.658518023307894e-05, - "loss": 0.2527, - "step": 1740 - }, - { - "epoch": 0.17779132378339937, - "grad_norm": 3.21875, - "learning_rate": 4.654439895476671e-05, - "loss": 0.2164, - "step": 1750 - }, - { - "epoch": 0.1788072742050188, - "grad_norm": 2.390625, - "learning_rate": 4.6503393676285146e-05, - "loss": 0.2424, - "step": 1760 - }, - { - "epoch": 0.17982322462663822, - "grad_norm": 1.8359375, - "learning_rate": 4.646216482397192e-05, - "loss": 0.2428, - "step": 1770 - }, - { - "epoch": 0.18083917504825764, - "grad_norm": 2.796875, - "learning_rate": 4.6420712826489275e-05, - "loss": 0.2155, - "step": 1780 - }, - { - "epoch": 0.18185512546987706, - "grad_norm": 0.69921875, - "learning_rate": 4.6379038114819485e-05, - "loss": 0.1544, - "step": 1790 - }, - { - "epoch": 0.18287107589149648, - "grad_norm": 3.40625, - "learning_rate": 4.6337141122260444e-05, - "loss": 0.2029, - "step": 1800 - }, - { - "epoch": 0.18388702631311593, - "grad_norm": 2.359375, - "learning_rate": 4.629502228442112e-05, - "loss": 0.1489, - "step": 1810 - }, - { - "epoch": 0.18490297673473535, - "grad_norm": 1.4453125, - "learning_rate": 4.6252682039217045e-05, - "loss": 0.2101, - "step": 1820 - }, - { - "epoch": 0.18591892715635477, - "grad_norm": 2.71875, - "learning_rate": 4.621012082686573e-05, - "loss": 0.2076, - "step": 1830 - }, - { - "epoch": 0.1869348775779742, - "grad_norm": 3.0625, - "learning_rate": 4.616733908988216e-05, - "loss": 0.2719, - "step": 1840 - }, - { - "epoch": 0.18795082799959362, - "grad_norm": 1.953125, - "learning_rate": 4.612433727307409e-05, - "loss": 0.2105, - "step": 1850 - }, - { - "epoch": 0.18896677842121304, - "grad_norm": 3.46875, - "learning_rate": 4.608111582353751e-05, - "loss": 0.1877, - "step": 1860 - }, - { - "epoch": 0.18998272884283246, - "grad_norm": 2.546875, - "learning_rate": 4.603767519065197e-05, - "loss": 0.2238, - "step": 1870 - }, - { - "epoch": 0.19099867926445188, - "grad_norm": 1.5703125, - "learning_rate": 4.599401582607589e-05, - "loss": 0.243, - "step": 1880 - }, - { - "epoch": 0.19201462968607133, - "grad_norm": 2.4375, - "learning_rate": 4.595013818374185e-05, - "loss": 0.1867, - "step": 1890 - }, - { - "epoch": 0.19303058010769075, - "grad_norm": 2.203125, - "learning_rate": 4.5906042719851925e-05, - "loss": 0.1994, - "step": 1900 - }, - { - "epoch": 0.19404653052931017, - "grad_norm": 3.984375, - "learning_rate": 4.586172989287291e-05, - "loss": 0.1899, - "step": 1910 - }, - { - "epoch": 0.1950624809509296, - "grad_norm": 2.6875, - "learning_rate": 4.5817200163531534e-05, - "loss": 0.2528, - "step": 1920 - }, - { - "epoch": 0.19607843137254902, - "grad_norm": 2.71875, - "learning_rate": 4.577245399480972e-05, - "loss": 0.2336, - "step": 1930 - }, - { - "epoch": 0.19709438179416844, - "grad_norm": 2.640625, - "learning_rate": 4.5727491851939715e-05, - "loss": 0.2204, - "step": 1940 - }, - { - "epoch": 0.19811033221578786, - "grad_norm": 1.78125, - "learning_rate": 4.568231420239929e-05, - "loss": 0.1656, - "step": 1950 - }, - { - "epoch": 0.19912628263740728, - "grad_norm": 3.15625, - "learning_rate": 4.563692151590687e-05, - "loss": 0.2105, - "step": 1960 - }, - { - "epoch": 0.20014223305902673, - "grad_norm": 1.3671875, - "learning_rate": 4.5591314264416666e-05, - "loss": 0.1464, - "step": 1970 - }, - { - "epoch": 0.20115818348064615, - "grad_norm": 4.25, - "learning_rate": 4.554549292211371e-05, - "loss": 0.2103, - "step": 1980 - }, - { - "epoch": 0.20217413390226557, - "grad_norm": 2.984375, - "learning_rate": 4.549945796540901e-05, - "loss": 0.144, - "step": 1990 - }, - { - "epoch": 0.203190084323885, - "grad_norm": 1.859375, - "learning_rate": 4.545320987293453e-05, - "loss": 0.1963, - "step": 2000 - }, - { - "epoch": 0.20420603474550442, - "grad_norm": 1.078125, - "learning_rate": 4.540674912553824e-05, - "loss": 0.2115, - "step": 2010 - }, - { - "epoch": 0.20522198516712384, - "grad_norm": 4.25, - "learning_rate": 4.536007620627911e-05, - "loss": 0.1682, - "step": 2020 - }, - { - "epoch": 0.20623793558874326, - "grad_norm": 2.71875, - "learning_rate": 4.531319160042212e-05, - "loss": 0.1992, - "step": 2030 - }, - { - "epoch": 0.20725388601036268, - "grad_norm": 1.2890625, - "learning_rate": 4.5266095795433126e-05, - "loss": 0.1134, - "step": 2040 - }, - { - "epoch": 0.20826983643198213, - "grad_norm": 3.296875, - "learning_rate": 4.5218789280973925e-05, - "loss": 0.1474, - "step": 2050 - }, - { - "epoch": 0.20928578685360155, - "grad_norm": 1.9375, - "learning_rate": 4.5171272548897024e-05, - "loss": 0.1955, - "step": 2060 - }, - { - "epoch": 0.21030173727522097, - "grad_norm": 2.734375, - "learning_rate": 4.512354609324063e-05, - "loss": 0.2042, - "step": 2070 - }, - { - "epoch": 0.2113176876968404, - "grad_norm": 2.921875, - "learning_rate": 4.507561041022347e-05, - "loss": 0.2174, - "step": 2080 - }, - { - "epoch": 0.21233363811845982, - "grad_norm": 2.40625, - "learning_rate": 4.502746599823963e-05, - "loss": 0.2634, - "step": 2090 - }, - { - "epoch": 0.21334958854007924, - "grad_norm": 1.71875, - "learning_rate": 4.497911335785339e-05, - "loss": 0.1884, - "step": 2100 - }, - { - "epoch": 0.21436553896169866, - "grad_norm": 0.79296875, - "learning_rate": 4.4930552991794e-05, - "loss": 0.1872, - "step": 2110 - }, - { - "epoch": 0.21538148938331808, - "grad_norm": 3.171875, - "learning_rate": 4.4881785404950474e-05, - "loss": 0.2233, - "step": 2120 - }, - { - "epoch": 0.21639743980493753, - "grad_norm": 2.59375, - "learning_rate": 4.483281110436631e-05, - "loss": 0.2374, - "step": 2130 - }, - { - "epoch": 0.21741339022655695, - "grad_norm": 3.328125, - "learning_rate": 4.478363059923426e-05, - "loss": 0.2545, - "step": 2140 - }, - { - "epoch": 0.21842934064817637, - "grad_norm": 2.3125, - "learning_rate": 4.4734244400891014e-05, - "loss": 0.2063, - "step": 2150 - }, - { - "epoch": 0.2194452910697958, - "grad_norm": 3.40625, - "learning_rate": 4.4684653022811865e-05, - "loss": 0.1219, - "step": 2160 - }, - { - "epoch": 0.22046124149141522, - "grad_norm": 4.1875, - "learning_rate": 4.463485698060541e-05, - "loss": 0.2805, - "step": 2170 - }, - { - "epoch": 0.22147719191303464, - "grad_norm": 2.3125, - "learning_rate": 4.458485679200814e-05, - "loss": 0.1998, - "step": 2180 - }, - { - "epoch": 0.22249314233465406, - "grad_norm": 3.578125, - "learning_rate": 4.453465297687912e-05, - "loss": 0.2489, - "step": 2190 - }, - { - "epoch": 0.22350909275627348, - "grad_norm": 2.59375, - "learning_rate": 4.448424605719452e-05, - "loss": 0.2731, - "step": 2200 - }, - { - "epoch": 0.22452504317789293, - "grad_norm": 3.28125, - "learning_rate": 4.443363655704224e-05, - "loss": 0.2425, - "step": 2210 - }, - { - "epoch": 0.22554099359951235, - "grad_norm": 2.78125, - "learning_rate": 4.438282500261641e-05, - "loss": 0.2938, - "step": 2220 - }, - { - "epoch": 0.22655694402113177, - "grad_norm": 1.1953125, - "learning_rate": 4.433181192221197e-05, - "loss": 0.1728, - "step": 2230 - }, - { - "epoch": 0.2275728944427512, - "grad_norm": 1.34375, - "learning_rate": 4.4280597846219155e-05, - "loss": 0.216, - "step": 2240 - }, - { - "epoch": 0.22858884486437062, - "grad_norm": 1.8515625, - "learning_rate": 4.422918330711796e-05, - "loss": 0.1612, - "step": 2250 - }, - { - "epoch": 0.22960479528599004, - "grad_norm": 1.90625, - "learning_rate": 4.417756883947263e-05, - "loss": 0.107, - "step": 2260 - }, - { - "epoch": 0.23062074570760946, - "grad_norm": 3.375, - "learning_rate": 4.412575497992611e-05, - "loss": 0.1756, - "step": 2270 - }, - { - "epoch": 0.23163669612922888, - "grad_norm": 4.375, - "learning_rate": 4.407374226719445e-05, - "loss": 0.234, - "step": 2280 - }, - { - "epoch": 0.23265264655084833, - "grad_norm": 3.25, - "learning_rate": 4.402153124206119e-05, - "loss": 0.2144, - "step": 2290 - }, - { - "epoch": 0.23366859697246775, - "grad_norm": 1.703125, - "learning_rate": 4.396912244737173e-05, - "loss": 0.1696, - "step": 2300 - }, - { - "epoch": 0.23468454739408717, - "grad_norm": 2.84375, - "learning_rate": 4.391651642802778e-05, - "loss": 0.2506, - "step": 2310 - }, - { - "epoch": 0.2357004978157066, - "grad_norm": 4.5, - "learning_rate": 4.386371373098155e-05, - "loss": 0.1686, - "step": 2320 - }, - { - "epoch": 0.23671644823732602, - "grad_norm": 2.515625, - "learning_rate": 4.381071490523018e-05, - "loss": 0.2403, - "step": 2330 - }, - { - "epoch": 0.23773239865894544, - "grad_norm": 4.4375, - "learning_rate": 4.3757520501809955e-05, - "loss": 0.1611, - "step": 2340 - }, - { - "epoch": 0.23874834908056486, - "grad_norm": 1.609375, - "learning_rate": 4.370413107379065e-05, - "loss": 0.1698, - "step": 2350 - }, - { - "epoch": 0.23976429950218428, - "grad_norm": 4.96875, - "learning_rate": 4.36505471762697e-05, - "loss": 0.1928, - "step": 2360 - }, - { - "epoch": 0.24078024992380373, - "grad_norm": 0.8984375, - "learning_rate": 4.3596769366366474e-05, - "loss": 0.2035, - "step": 2370 - }, - { - "epoch": 0.24179620034542315, - "grad_norm": 5.75, - "learning_rate": 4.354279820321649e-05, - "loss": 0.16, - "step": 2380 - }, - { - "epoch": 0.24281215076704257, - "grad_norm": 1.9453125, - "learning_rate": 4.34886342479656e-05, - "loss": 0.1851, - "step": 2390 - }, - { - "epoch": 0.243828101188662, - "grad_norm": 1.015625, - "learning_rate": 4.34342780637641e-05, - "loss": 0.1726, - "step": 2400 - }, - { - "epoch": 0.24484405161028142, - "grad_norm": 4.59375, - "learning_rate": 4.337973021576095e-05, - "loss": 0.2847, - "step": 2410 - }, - { - "epoch": 0.24586000203190084, - "grad_norm": 1.03125, - "learning_rate": 4.3324991271097846e-05, - "loss": 0.2528, - "step": 2420 - }, - { - "epoch": 0.24687595245352026, - "grad_norm": 2.1875, - "learning_rate": 4.3270061798903374e-05, - "loss": 0.1573, - "step": 2430 - }, - { - "epoch": 0.24789190287513968, - "grad_norm": 0.98046875, - "learning_rate": 4.321494237028701e-05, - "loss": 0.1703, - "step": 2440 - }, - { - "epoch": 0.24890785329675913, - "grad_norm": 3.8125, - "learning_rate": 4.31596335583333e-05, - "loss": 0.2613, - "step": 2450 - }, - { - "epoch": 0.24992380371837855, - "grad_norm": 4.0625, - "learning_rate": 4.310413593809579e-05, - "loss": 0.22, - "step": 2460 - }, - { - "epoch": 0.250939754139998, - "grad_norm": 3.15625, - "learning_rate": 4.304845008659108e-05, - "loss": 0.1263, - "step": 2470 - }, - { - "epoch": 0.25195570456161737, - "grad_norm": 3.046875, - "learning_rate": 4.2992576582792895e-05, - "loss": 0.1639, - "step": 2480 - }, - { - "epoch": 0.2529716549832368, - "grad_norm": 9.8125, - "learning_rate": 4.293651600762595e-05, - "loss": 0.2681, - "step": 2490 - }, - { - "epoch": 0.25398760540485626, - "grad_norm": 3.734375, - "learning_rate": 4.288026894395999e-05, - "loss": 0.2292, - "step": 2500 - }, - { - "epoch": 0.25500355582647566, - "grad_norm": 0.455078125, - "learning_rate": 4.2823835976603723e-05, - "loss": 0.2324, - "step": 2510 - }, - { - "epoch": 0.2560195062480951, - "grad_norm": 5.625, - "learning_rate": 4.276721769229869e-05, - "loss": 0.1834, - "step": 2520 - }, - { - "epoch": 0.2570354566697145, - "grad_norm": 1.3671875, - "learning_rate": 4.271041467971323e-05, - "loss": 0.1826, - "step": 2530 - }, - { - "epoch": 0.25805140709133395, - "grad_norm": 5.0625, - "learning_rate": 4.265342752943632e-05, - "loss": 0.2463, - "step": 2540 - }, - { - "epoch": 0.25906735751295334, - "grad_norm": 2.859375, - "learning_rate": 4.2596256833971425e-05, - "loss": 0.2598, - "step": 2550 - }, - { - "epoch": 0.2600833079345728, - "grad_norm": 1.8515625, - "learning_rate": 4.2538903187730374e-05, - "loss": 0.1148, - "step": 2560 - }, - { - "epoch": 0.26109925835619224, - "grad_norm": 2.71875, - "learning_rate": 4.248136718702716e-05, - "loss": 0.2123, - "step": 2570 - }, - { - "epoch": 0.26211520877781164, - "grad_norm": 4.5625, - "learning_rate": 4.242364943007172e-05, - "loss": 0.2369, - "step": 2580 - }, - { - "epoch": 0.2631311591994311, - "grad_norm": 2.296875, - "learning_rate": 4.236575051696377e-05, - "loss": 0.261, - "step": 2590 - }, - { - "epoch": 0.2641471096210505, - "grad_norm": 2.75, - "learning_rate": 4.2307671049686514e-05, - "loss": 0.1564, - "step": 2600 - }, - { - "epoch": 0.26516306004266993, - "grad_norm": 3.5, - "learning_rate": 4.2249411632100396e-05, - "loss": 0.1563, - "step": 2610 - }, - { - "epoch": 0.2661790104642893, - "grad_norm": 2.84375, - "learning_rate": 4.219097286993684e-05, - "loss": 0.1697, - "step": 2620 - }, - { - "epoch": 0.26719496088590877, - "grad_norm": 2.125, - "learning_rate": 4.2132355370791946e-05, - "loss": 0.1844, - "step": 2630 - }, - { - "epoch": 0.26821091130752817, - "grad_norm": 4.03125, - "learning_rate": 4.2073559744120156e-05, - "loss": 0.2144, - "step": 2640 - }, - { - "epoch": 0.2692268617291476, - "grad_norm": 2.375, - "learning_rate": 4.201458660122793e-05, - "loss": 0.2013, - "step": 2650 - }, - { - "epoch": 0.27024281215076706, - "grad_norm": 3.625, - "learning_rate": 4.1955436555267393e-05, - "loss": 0.2166, - "step": 2660 - }, - { - "epoch": 0.27125876257238646, - "grad_norm": 0.328125, - "learning_rate": 4.189611022122997e-05, - "loss": 0.1934, - "step": 2670 - }, - { - "epoch": 0.2722747129940059, - "grad_norm": 2.75, - "learning_rate": 4.1836608215939944e-05, - "loss": 0.2157, - "step": 2680 - }, - { - "epoch": 0.2732906634156253, - "grad_norm": 3.5, - "learning_rate": 4.17769311580481e-05, - "loss": 0.18, - "step": 2690 - }, - { - "epoch": 0.27430661383724475, - "grad_norm": 2.109375, - "learning_rate": 4.171707966802528e-05, - "loss": 0.2178, - "step": 2700 - }, - { - "epoch": 0.27532256425886414, - "grad_norm": 4.65625, - "learning_rate": 4.16570543681559e-05, - "loss": 0.1896, - "step": 2710 - }, - { - "epoch": 0.2763385146804836, - "grad_norm": 4.8125, - "learning_rate": 4.159685588253151e-05, - "loss": 0.1322, - "step": 2720 - }, - { - "epoch": 0.27735446510210304, - "grad_norm": 3.9375, - "learning_rate": 4.153648483704429e-05, - "loss": 0.184, - "step": 2730 - }, - { - "epoch": 0.27837041552372244, - "grad_norm": 4.53125, - "learning_rate": 4.147594185938057e-05, - "loss": 0.2451, - "step": 2740 - }, - { - "epoch": 0.2793863659453419, - "grad_norm": 1.0390625, - "learning_rate": 4.141522757901426e-05, - "loss": 0.2367, - "step": 2750 - }, - { - "epoch": 0.2804023163669613, - "grad_norm": 3.375, - "learning_rate": 4.1354342627200345e-05, - "loss": 0.179, - "step": 2760 - }, - { - "epoch": 0.28141826678858073, - "grad_norm": 2.953125, - "learning_rate": 4.1293287636968286e-05, - "loss": 0.1396, - "step": 2770 - }, - { - "epoch": 0.2824342172102001, - "grad_norm": 2.546875, - "learning_rate": 4.1232063243115485e-05, - "loss": 0.1963, - "step": 2780 - }, - { - "epoch": 0.28345016763181957, - "grad_norm": 5.09375, - "learning_rate": 4.117067008220063e-05, - "loss": 0.2457, - "step": 2790 - }, - { - "epoch": 0.28446611805343897, - "grad_norm": 2.046875, - "learning_rate": 4.110910879253712e-05, - "loss": 0.2262, - "step": 2800 - }, - { - "epoch": 0.2854820684750584, - "grad_norm": 2.1875, - "learning_rate": 4.104738001418641e-05, - "loss": 0.2499, - "step": 2810 - }, - { - "epoch": 0.28649801889667786, - "grad_norm": 2.59375, - "learning_rate": 4.098548438895135e-05, - "loss": 0.1667, - "step": 2820 - }, - { - "epoch": 0.28751396931829726, - "grad_norm": 2.875, - "learning_rate": 4.092342256036954e-05, - "loss": 0.2288, - "step": 2830 - }, - { - "epoch": 0.2885299197399167, - "grad_norm": 3.015625, - "learning_rate": 4.086119517370659e-05, - "loss": 0.2038, - "step": 2840 - }, - { - "epoch": 0.2895458701615361, - "grad_norm": 3.53125, - "learning_rate": 4.0798802875949485e-05, - "loss": 0.181, - "step": 2850 - }, - { - "epoch": 0.29056182058315555, - "grad_norm": 2.296875, - "learning_rate": 4.073624631579975e-05, - "loss": 0.1886, - "step": 2860 - }, - { - "epoch": 0.29157777100477494, - "grad_norm": 3.609375, - "learning_rate": 4.067352614366685e-05, - "loss": 0.2053, - "step": 2870 - }, - { - "epoch": 0.2925937214263944, - "grad_norm": 2.328125, - "learning_rate": 4.061064301166128e-05, - "loss": 0.1409, - "step": 2880 - }, - { - "epoch": 0.29360967184801384, - "grad_norm": 4.9375, - "learning_rate": 4.054759757358787e-05, - "loss": 0.184, - "step": 2890 - }, - { - "epoch": 0.29462562226963324, - "grad_norm": 4.6875, - "learning_rate": 4.048439048493898e-05, - "loss": 0.2306, - "step": 2900 - }, - { - "epoch": 0.2956415726912527, - "grad_norm": 4.09375, - "learning_rate": 4.0421022402887676e-05, - "loss": 0.1914, - "step": 2910 - }, - { - "epoch": 0.2966575231128721, - "grad_norm": 2.3125, - "learning_rate": 4.035749398628088e-05, - "loss": 0.1653, - "step": 2920 - }, - { - "epoch": 0.29767347353449153, - "grad_norm": 2.515625, - "learning_rate": 4.029380589563256e-05, - "loss": 0.1941, - "step": 2930 - }, - { - "epoch": 0.2986894239561109, - "grad_norm": 1.78125, - "learning_rate": 4.02299587931168e-05, - "loss": 0.1117, - "step": 2940 - }, - { - "epoch": 0.29970537437773037, - "grad_norm": 0.8359375, - "learning_rate": 4.0165953342560974e-05, - "loss": 0.1605, - "step": 2950 - }, - { - "epoch": 0.30072132479934977, - "grad_norm": 3.046875, - "learning_rate": 4.010179020943884e-05, - "loss": 0.1726, - "step": 2960 - }, - { - "epoch": 0.3017372752209692, - "grad_norm": 3.453125, - "learning_rate": 4.003747006086357e-05, - "loss": 0.2208, - "step": 2970 - }, - { - "epoch": 0.30275322564258866, - "grad_norm": 2.515625, - "learning_rate": 3.9972993565580866e-05, - "loss": 0.1325, - "step": 2980 - }, - { - "epoch": 0.30376917606420806, - "grad_norm": 3.046875, - "learning_rate": 3.9908361393962e-05, - "loss": 0.2014, - "step": 2990 - }, - { - "epoch": 0.3047851264858275, - "grad_norm": 2.28125, - "learning_rate": 3.984357421799681e-05, - "loss": 0.165, - "step": 3000 - }, - { - "epoch": 0.3058010769074469, - "grad_norm": 5.09375, - "learning_rate": 3.9778632711286756e-05, - "loss": 0.212, - "step": 3010 - }, - { - "epoch": 0.30681702732906635, - "grad_norm": 4.25, - "learning_rate": 3.971353754903788e-05, - "loss": 0.2388, - "step": 3020 - }, - { - "epoch": 0.30783297775068574, - "grad_norm": 2.34375, - "learning_rate": 3.964828940805381e-05, - "loss": 0.2175, - "step": 3030 - }, - { - "epoch": 0.3088489281723052, - "grad_norm": 4.09375, - "learning_rate": 3.95828889667287e-05, - "loss": 0.2088, - "step": 3040 - }, - { - "epoch": 0.30986487859392464, - "grad_norm": 2.359375, - "learning_rate": 3.9517336905040244e-05, - "loss": 0.1913, - "step": 3050 - }, - { - "epoch": 0.31088082901554404, - "grad_norm": 1.1640625, - "learning_rate": 3.9451633904542483e-05, - "loss": 0.2185, - "step": 3060 - }, - { - "epoch": 0.3118967794371635, - "grad_norm": 2.59375, - "learning_rate": 3.9385780648358846e-05, - "loss": 0.2072, - "step": 3070 - }, - { - "epoch": 0.3129127298587829, - "grad_norm": 3.015625, - "learning_rate": 3.9319777821174955e-05, - "loss": 0.1902, - "step": 3080 - }, - { - "epoch": 0.31392868028040233, - "grad_norm": 2.375, - "learning_rate": 3.925362610923158e-05, - "loss": 0.259, - "step": 3090 - }, - { - "epoch": 0.3149446307020217, - "grad_norm": 4.65625, - "learning_rate": 3.918732620031742e-05, - "loss": 0.2026, - "step": 3100 - }, - { - "epoch": 0.31596058112364117, - "grad_norm": 2.1875, - "learning_rate": 3.912087878376205e-05, - "loss": 0.1478, - "step": 3110 - }, - { - "epoch": 0.31697653154526056, - "grad_norm": 2.34375, - "learning_rate": 3.905428455042865e-05, - "loss": 0.167, - "step": 3120 - }, - { - "epoch": 0.31799248196688, - "grad_norm": 2.390625, - "learning_rate": 3.898754419270693e-05, - "loss": 0.1629, - "step": 3130 - }, - { - "epoch": 0.31900843238849946, - "grad_norm": 1.546875, - "learning_rate": 3.892065840450583e-05, - "loss": 0.1308, - "step": 3140 - }, - { - "epoch": 0.32002438281011886, - "grad_norm": 4.625, - "learning_rate": 3.885362788124637e-05, - "loss": 0.2008, - "step": 3150 - }, - { - "epoch": 0.3210403332317383, - "grad_norm": 3.8125, - "learning_rate": 3.8786453319854396e-05, - "loss": 0.2225, - "step": 3160 - }, - { - "epoch": 0.3220562836533577, - "grad_norm": 3.015625, - "learning_rate": 3.8719135418753366e-05, - "loss": 0.2243, - "step": 3170 - }, - { - "epoch": 0.32307223407497715, - "grad_norm": 5.6875, - "learning_rate": 3.865167487785702e-05, - "loss": 0.1981, - "step": 3180 - }, - { - "epoch": 0.32408818449659654, - "grad_norm": 4.84375, - "learning_rate": 3.8584072398562164e-05, - "loss": 0.2031, - "step": 3190 - }, - { - "epoch": 0.325104134918216, - "grad_norm": 4.0625, - "learning_rate": 3.851632868374136e-05, - "loss": 0.1621, - "step": 3200 - }, - { - "epoch": 0.32612008533983544, - "grad_norm": 3.421875, - "learning_rate": 3.844844443773562e-05, - "loss": 0.1674, - "step": 3210 - }, - { - "epoch": 0.32713603576145484, - "grad_norm": 1.3671875, - "learning_rate": 3.8380420366347046e-05, - "loss": 0.1502, - "step": 3220 - }, - { - "epoch": 0.3281519861830743, - "grad_norm": 3.734375, - "learning_rate": 3.831225717683157e-05, - "loss": 0.1868, - "step": 3230 - }, - { - "epoch": 0.3291679366046937, - "grad_norm": 2.703125, - "learning_rate": 3.8243955577891534e-05, - "loss": 0.1818, - "step": 3240 - }, - { - "epoch": 0.3301838870263131, - "grad_norm": 3.796875, - "learning_rate": 3.8175516279668335e-05, - "loss": 0.2215, - "step": 3250 - }, - { - "epoch": 0.3311998374479325, - "grad_norm": 3.203125, - "learning_rate": 3.810693999373505e-05, - "loss": 0.2544, - "step": 3260 - }, - { - "epoch": 0.33221578786955197, - "grad_norm": 4.0, - "learning_rate": 3.8038227433089056e-05, - "loss": 0.1175, - "step": 3270 - }, - { - "epoch": 0.33323173829117136, - "grad_norm": 3.625, - "learning_rate": 3.796937931214458e-05, - "loss": 0.2213, - "step": 3280 - }, - { - "epoch": 0.3342476887127908, - "grad_norm": 1.7265625, - "learning_rate": 3.7900396346725296e-05, - "loss": 0.1711, - "step": 3290 - }, - { - "epoch": 0.33526363913441026, - "grad_norm": 3.140625, - "learning_rate": 3.783127925405686e-05, - "loss": 0.2628, - "step": 3300 - }, - { - "epoch": 0.33627958955602966, - "grad_norm": 2.1875, - "learning_rate": 3.77620287527595e-05, - "loss": 0.1671, - "step": 3310 - }, - { - "epoch": 0.3372955399776491, - "grad_norm": 5.28125, - "learning_rate": 3.769264556284048e-05, - "loss": 0.2109, - "step": 3320 - }, - { - "epoch": 0.3383114903992685, - "grad_norm": 2.875, - "learning_rate": 3.762313040568665e-05, - "loss": 0.1978, - "step": 3330 - }, - { - "epoch": 0.33932744082088795, - "grad_norm": 2.234375, - "learning_rate": 3.755348400405697e-05, - "loss": 0.1275, - "step": 3340 - }, - { - "epoch": 0.34034339124250734, - "grad_norm": 1.9453125, - "learning_rate": 3.7483707082074945e-05, - "loss": 0.1482, - "step": 3350 - }, - { - "epoch": 0.3413593416641268, - "grad_norm": 5.40625, - "learning_rate": 3.741380036522111e-05, - "loss": 0.1933, - "step": 3360 - }, - { - "epoch": 0.34237529208574624, - "grad_norm": 4.53125, - "learning_rate": 3.734376458032551e-05, - "loss": 0.1925, - "step": 3370 - }, - { - "epoch": 0.34339124250736563, - "grad_norm": 4.0625, - "learning_rate": 3.727360045556014e-05, - "loss": 0.2297, - "step": 3380 - }, - { - "epoch": 0.3444071929289851, - "grad_norm": 2.53125, - "learning_rate": 3.7203308720431336e-05, - "loss": 0.1704, - "step": 3390 - }, - { - "epoch": 0.3454231433506045, - "grad_norm": 1.859375, - "learning_rate": 3.7132890105772234e-05, - "loss": 0.258, - "step": 3400 - }, - { - "epoch": 0.3464390937722239, - "grad_norm": 3.90625, - "learning_rate": 3.706234534373515e-05, - "loss": 0.2376, - "step": 3410 - }, - { - "epoch": 0.3474550441938433, - "grad_norm": 1.1015625, - "learning_rate": 3.6991675167783985e-05, - "loss": 0.2403, - "step": 3420 - }, - { - "epoch": 0.34847099461546277, - "grad_norm": 1.1640625, - "learning_rate": 3.6920880312686556e-05, - "loss": 0.1642, - "step": 3430 - }, - { - "epoch": 0.34948694503708216, - "grad_norm": 2.875, - "learning_rate": 3.684996151450702e-05, - "loss": 0.1455, - "step": 3440 - }, - { - "epoch": 0.3505028954587016, - "grad_norm": 0.59765625, - "learning_rate": 3.6778919510598155e-05, - "loss": 0.2175, - "step": 3450 - }, - { - "epoch": 0.35151884588032106, - "grad_norm": 0.93359375, - "learning_rate": 3.670775503959376e-05, - "loss": 0.1858, - "step": 3460 - }, - { - "epoch": 0.35253479630194046, - "grad_norm": 4.1875, - "learning_rate": 3.6636468841400917e-05, - "loss": 0.1911, - "step": 3470 - }, - { - "epoch": 0.3535507467235599, - "grad_norm": 3.734375, - "learning_rate": 3.656506165719233e-05, - "loss": 0.2114, - "step": 3480 - }, - { - "epoch": 0.3545666971451793, - "grad_norm": 1.171875, - "learning_rate": 3.649353422939863e-05, - "loss": 0.1841, - "step": 3490 - }, - { - "epoch": 0.35558264756679875, - "grad_norm": 2.53125, - "learning_rate": 3.6421887301700615e-05, - "loss": 0.1505, - "step": 3500 - }, - { - "epoch": 0.35659859798841814, - "grad_norm": 4.9375, - "learning_rate": 3.6350121619021524e-05, - "loss": 0.2625, - "step": 3510 - }, - { - "epoch": 0.3576145484100376, - "grad_norm": 5.25, - "learning_rate": 3.627823792751936e-05, - "loss": 0.1676, - "step": 3520 - }, - { - "epoch": 0.35863049883165704, - "grad_norm": 1.09375, - "learning_rate": 3.620623697457905e-05, - "loss": 0.1963, - "step": 3530 - }, - { - "epoch": 0.35964644925327643, - "grad_norm": 4.03125, - "learning_rate": 3.613411950880468e-05, - "loss": 0.2048, - "step": 3540 - }, - { - "epoch": 0.3606623996748959, - "grad_norm": 4.40625, - "learning_rate": 3.606188628001178e-05, - "loss": 0.226, - "step": 3550 - }, - { - "epoch": 0.3616783500965153, - "grad_norm": 2.375, - "learning_rate": 3.598953803921947e-05, - "loss": 0.1884, - "step": 3560 - }, - { - "epoch": 0.3626943005181347, - "grad_norm": 3.21875, - "learning_rate": 3.591707553864266e-05, - "loss": 0.224, - "step": 3570 - }, - { - "epoch": 0.3637102509397541, - "grad_norm": 3.5625, - "learning_rate": 3.584449953168423e-05, - "loss": 0.1866, - "step": 3580 - }, - { - "epoch": 0.36472620136137357, - "grad_norm": 2.359375, - "learning_rate": 3.577181077292722e-05, - "loss": 0.1663, - "step": 3590 - }, - { - "epoch": 0.36574215178299296, - "grad_norm": 5.0, - "learning_rate": 3.569901001812696e-05, - "loss": 0.2032, - "step": 3600 - }, - { - "epoch": 0.3667581022046124, - "grad_norm": 1.953125, - "learning_rate": 3.562609802420321e-05, - "loss": 0.2395, - "step": 3610 - }, - { - "epoch": 0.36777405262623186, - "grad_norm": 3.796875, - "learning_rate": 3.555307554923229e-05, - "loss": 0.1799, - "step": 3620 - }, - { - "epoch": 0.36879000304785126, - "grad_norm": 4.4375, - "learning_rate": 3.547994335243925e-05, - "loss": 0.1771, - "step": 3630 - }, - { - "epoch": 0.3698059534694707, - "grad_norm": 1.890625, - "learning_rate": 3.540670219418989e-05, - "loss": 0.2123, - "step": 3640 - }, - { - "epoch": 0.3708219038910901, - "grad_norm": 4.03125, - "learning_rate": 3.53333528359829e-05, - "loss": 0.2159, - "step": 3650 - }, - { - "epoch": 0.37183785431270955, - "grad_norm": 3.265625, - "learning_rate": 3.525989604044198e-05, - "loss": 0.2749, - "step": 3660 - }, - { - "epoch": 0.37285380473432894, - "grad_norm": 1.4375, - "learning_rate": 3.5186332571307826e-05, - "loss": 0.1613, - "step": 3670 - }, - { - "epoch": 0.3738697551559484, - "grad_norm": 3.984375, - "learning_rate": 3.511266319343025e-05, - "loss": 0.1877, - "step": 3680 - }, - { - "epoch": 0.37488570557756784, - "grad_norm": 2.203125, - "learning_rate": 3.503888867276022e-05, - "loss": 0.2185, - "step": 3690 - }, - { - "epoch": 0.37590165599918723, - "grad_norm": 1.5078125, - "learning_rate": 3.4965009776341894e-05, - "loss": 0.2195, - "step": 3700 - }, - { - "epoch": 0.3769176064208067, - "grad_norm": 4.375, - "learning_rate": 3.489102727230461e-05, - "loss": 0.2344, - "step": 3710 - }, - { - "epoch": 0.3779335568424261, - "grad_norm": 2.984375, - "learning_rate": 3.481694192985496e-05, - "loss": 0.1863, - "step": 3720 - }, - { - "epoch": 0.3789495072640455, - "grad_norm": 1.1328125, - "learning_rate": 3.474275451926875e-05, - "loss": 0.1894, - "step": 3730 - }, - { - "epoch": 0.3799654576856649, - "grad_norm": 2.265625, - "learning_rate": 3.4668465811883e-05, - "loss": 0.2127, - "step": 3740 - }, - { - "epoch": 0.38098140810728437, - "grad_norm": 2.921875, - "learning_rate": 3.4594076580087914e-05, - "loss": 0.2125, - "step": 3750 - }, - { - "epoch": 0.38199735852890376, - "grad_norm": 2.390625, - "learning_rate": 3.451958759731889e-05, - "loss": 0.1801, - "step": 3760 - }, - { - "epoch": 0.3830133089505232, - "grad_norm": 3.046875, - "learning_rate": 3.4444999638048456e-05, - "loss": 0.1949, - "step": 3770 - }, - { - "epoch": 0.38402925937214266, - "grad_norm": 2.890625, - "learning_rate": 3.437031347777817e-05, - "loss": 0.2719, - "step": 3780 - }, - { - "epoch": 0.38504520979376206, - "grad_norm": 3.9375, - "learning_rate": 3.4295529893030634e-05, - "loss": 0.1697, - "step": 3790 - }, - { - "epoch": 0.3860611602153815, - "grad_norm": 2.0625, - "learning_rate": 3.422064966134138e-05, - "loss": 0.1557, - "step": 3800 - }, - { - "epoch": 0.3870771106370009, - "grad_norm": 2.234375, - "learning_rate": 3.4145673561250794e-05, - "loss": 0.2129, - "step": 3810 - }, - { - "epoch": 0.38809306105862035, - "grad_norm": 4.96875, - "learning_rate": 3.4070602372296e-05, - "loss": 0.2068, - "step": 3820 - }, - { - "epoch": 0.38910901148023974, - "grad_norm": 2.234375, - "learning_rate": 3.39954368750028e-05, - "loss": 0.1634, - "step": 3830 - }, - { - "epoch": 0.3901249619018592, - "grad_norm": 1.75, - "learning_rate": 3.392017785087752e-05, - "loss": 0.2299, - "step": 3840 - }, - { - "epoch": 0.39114091232347864, - "grad_norm": 3.90625, - "learning_rate": 3.38448260823989e-05, - "loss": 0.1585, - "step": 3850 - }, - { - "epoch": 0.39215686274509803, - "grad_norm": 2.8125, - "learning_rate": 3.376938235300996e-05, - "loss": 0.2382, - "step": 3860 - }, - { - "epoch": 0.3931728131667175, - "grad_norm": 5.375, - "learning_rate": 3.369384744710984e-05, - "loss": 0.1987, - "step": 3870 - }, - { - "epoch": 0.3941887635883369, - "grad_norm": 2.578125, - "learning_rate": 3.361822215004566e-05, - "loss": 0.2316, - "step": 3880 - }, - { - "epoch": 0.3952047140099563, - "grad_norm": 2.0, - "learning_rate": 3.354250724810436e-05, - "loss": 0.2019, - "step": 3890 - }, - { - "epoch": 0.3962206644315757, - "grad_norm": 2.3125, - "learning_rate": 3.34667035285045e-05, - "loss": 0.187, - "step": 3900 - }, - { - "epoch": 0.39723661485319517, - "grad_norm": 3.53125, - "learning_rate": 3.339081177938811e-05, - "loss": 0.2353, - "step": 3910 - }, - { - "epoch": 0.39825256527481456, - "grad_norm": 1.9609375, - "learning_rate": 3.331483278981244e-05, - "loss": 0.2078, - "step": 3920 - }, - { - "epoch": 0.399268515696434, - "grad_norm": 1.2109375, - "learning_rate": 3.323876734974183e-05, - "loss": 0.1761, - "step": 3930 - }, - { - "epoch": 0.40028446611805346, - "grad_norm": 4.0625, - "learning_rate": 3.316261625003943e-05, - "loss": 0.2081, - "step": 3940 - }, - { - "epoch": 0.40130041653967286, - "grad_norm": 1.953125, - "learning_rate": 3.308638028245902e-05, - "loss": 0.2087, - "step": 3950 - }, - { - "epoch": 0.4023163669612923, - "grad_norm": 2.390625, - "learning_rate": 3.301006023963676e-05, - "loss": 0.1579, - "step": 3960 - }, - { - "epoch": 0.4033323173829117, - "grad_norm": 3.53125, - "learning_rate": 3.293365691508295e-05, - "loss": 0.1904, - "step": 3970 - }, - { - "epoch": 0.40434826780453115, - "grad_norm": 3.0, - "learning_rate": 3.285717110317379e-05, - "loss": 0.1991, - "step": 3980 - }, - { - "epoch": 0.40536421822615054, - "grad_norm": 7.21875, - "learning_rate": 3.27806035991431e-05, - "loss": 0.1445, - "step": 3990 - }, - { - "epoch": 0.40638016864777, - "grad_norm": 1.0859375, - "learning_rate": 3.2703955199074075e-05, - "loss": 0.2393, - "step": 4000 - }, - { - "epoch": 0.40739611906938944, - "grad_norm": 4.5625, - "learning_rate": 3.262722669989098e-05, - "loss": 0.1789, - "step": 4010 - }, - { - "epoch": 0.40841206949100883, - "grad_norm": 3.09375, - "learning_rate": 3.255041889935092e-05, - "loss": 0.1511, - "step": 4020 - }, - { - "epoch": 0.4094280199126283, - "grad_norm": 1.90625, - "learning_rate": 3.247353259603547e-05, - "loss": 0.2066, - "step": 4030 - }, - { - "epoch": 0.4104439703342477, - "grad_norm": 2.28125, - "learning_rate": 3.239656858934242e-05, - "loss": 0.1564, - "step": 4040 - }, - { - "epoch": 0.4114599207558671, - "grad_norm": 2.609375, - "learning_rate": 3.231952767947746e-05, - "loss": 0.1503, - "step": 4050 - }, - { - "epoch": 0.4124758711774865, - "grad_norm": 1.4453125, - "learning_rate": 3.2242410667445844e-05, - "loss": 0.1633, - "step": 4060 - }, - { - "epoch": 0.41349182159910597, - "grad_norm": 3.015625, - "learning_rate": 3.2165218355044076e-05, - "loss": 0.1492, - "step": 4070 - }, - { - "epoch": 0.41450777202072536, - "grad_norm": 3.234375, - "learning_rate": 3.2087951544851566e-05, - "loss": 0.3051, - "step": 4080 - }, - { - "epoch": 0.4155237224423448, - "grad_norm": 2.9375, - "learning_rate": 3.20106110402223e-05, - "loss": 0.2229, - "step": 4090 - }, - { - "epoch": 0.41653967286396426, - "grad_norm": 3.171875, - "learning_rate": 3.1933197645276455e-05, - "loss": 0.2224, - "step": 4100 - }, - { - "epoch": 0.41755562328558365, - "grad_norm": 2.09375, - "learning_rate": 3.185571216489209e-05, - "loss": 0.1297, - "step": 4110 - }, - { - "epoch": 0.4185715737072031, - "grad_norm": 3.625, - "learning_rate": 3.177815540469669e-05, - "loss": 0.2074, - "step": 4120 - }, - { - "epoch": 0.4195875241288225, - "grad_norm": 2.296875, - "learning_rate": 3.1700528171058916e-05, - "loss": 0.1949, - "step": 4130 - }, - { - "epoch": 0.42060347455044195, - "grad_norm": 3.8125, - "learning_rate": 3.162283127108011e-05, - "loss": 0.1661, - "step": 4140 - }, - { - "epoch": 0.42161942497206134, - "grad_norm": 2.5, - "learning_rate": 3.154506551258594e-05, - "loss": 0.2275, - "step": 4150 - }, - { - "epoch": 0.4226353753936808, - "grad_norm": 2.96875, - "learning_rate": 3.146723170411804e-05, - "loss": 0.2242, - "step": 4160 - }, - { - "epoch": 0.42365132581530024, - "grad_norm": 6.625, - "learning_rate": 3.138933065492552e-05, - "loss": 0.1897, - "step": 4170 - }, - { - "epoch": 0.42466727623691963, - "grad_norm": 0.8515625, - "learning_rate": 3.131136317495665e-05, - "loss": 0.1629, - "step": 4180 - }, - { - "epoch": 0.4256832266585391, - "grad_norm": 0.94140625, - "learning_rate": 3.1233330074850364e-05, - "loss": 0.1535, - "step": 4190 - }, - { - "epoch": 0.4266991770801585, - "grad_norm": 2.6875, - "learning_rate": 3.115523216592786e-05, - "loss": 0.2494, - "step": 4200 - }, - { - "epoch": 0.4277151275017779, - "grad_norm": 2.578125, - "learning_rate": 3.107707026018417e-05, - "loss": 0.1705, - "step": 4210 - }, - { - "epoch": 0.4287310779233973, - "grad_norm": 3.0625, - "learning_rate": 3.09988451702797e-05, - "loss": 0.1507, - "step": 4220 - }, - { - "epoch": 0.42974702834501677, - "grad_norm": 2.421875, - "learning_rate": 3.0920557709531804e-05, - "loss": 0.3071, - "step": 4230 - }, - { - "epoch": 0.43076297876663616, - "grad_norm": 3.640625, - "learning_rate": 3.0842208691906306e-05, - "loss": 0.199, - "step": 4240 - }, - { - "epoch": 0.4317789291882556, - "grad_norm": 3.5625, - "learning_rate": 3.076379893200904e-05, - "loss": 0.1987, - "step": 4250 - }, - { - "epoch": 0.43279487960987506, - "grad_norm": 3.65625, - "learning_rate": 3.068532924507739e-05, - "loss": 0.1945, - "step": 4260 - }, - { - "epoch": 0.43381083003149445, - "grad_norm": 5.875, - "learning_rate": 3.060680044697183e-05, - "loss": 0.1937, - "step": 4270 - }, - { - "epoch": 0.4348267804531139, - "grad_norm": 2.859375, - "learning_rate": 3.052821335416739e-05, - "loss": 0.1643, - "step": 4280 - }, - { - "epoch": 0.4358427308747333, - "grad_norm": 3.296875, - "learning_rate": 3.0449568783745203e-05, - "loss": 0.1455, - "step": 4290 - }, - { - "epoch": 0.43685868129635275, - "grad_norm": 0.427734375, - "learning_rate": 3.0370867553384023e-05, - "loss": 0.1891, - "step": 4300 - }, - { - "epoch": 0.43787463171797214, - "grad_norm": 0.361328125, - "learning_rate": 3.029211048135171e-05, - "loss": 0.1377, - "step": 4310 - }, - { - "epoch": 0.4388905821395916, - "grad_norm": 1.8203125, - "learning_rate": 3.021329838649668e-05, - "loss": 0.2194, - "step": 4320 - }, - { - "epoch": 0.43990653256121104, - "grad_norm": 1.8828125, - "learning_rate": 3.0134432088239462e-05, - "loss": 0.1915, - "step": 4330 - }, - { - "epoch": 0.44092248298283043, - "grad_norm": 2.015625, - "learning_rate": 3.0055512406564146e-05, - "loss": 0.1794, - "step": 4340 - }, - { - "epoch": 0.4419384334044499, - "grad_norm": 2.546875, - "learning_rate": 2.9976540162009836e-05, - "loss": 0.2154, - "step": 4350 - }, - { - "epoch": 0.4429543838260693, - "grad_norm": 4.09375, - "learning_rate": 2.9897516175662155e-05, - "loss": 0.1861, - "step": 4360 - }, - { - "epoch": 0.4439703342476887, - "grad_norm": 3.953125, - "learning_rate": 2.9818441269144693e-05, - "loss": 0.1857, - "step": 4370 - }, - { - "epoch": 0.4449862846693081, - "grad_norm": 2.234375, - "learning_rate": 2.9739316264610452e-05, - "loss": 0.1493, - "step": 4380 - }, - { - "epoch": 0.44600223509092757, - "grad_norm": 1.109375, - "learning_rate": 2.966014198473332e-05, - "loss": 0.186, - "step": 4390 - }, - { - "epoch": 0.44701818551254696, - "grad_norm": 4.5625, - "learning_rate": 2.9580919252699502e-05, - "loss": 0.1963, - "step": 4400 - }, - { - "epoch": 0.4480341359341664, - "grad_norm": 7.3125, - "learning_rate": 2.9501648892198984e-05, - "loss": 0.2882, - "step": 4410 - }, - { - "epoch": 0.44905008635578586, - "grad_norm": 3.03125, - "learning_rate": 2.942233172741693e-05, - "loss": 0.2154, - "step": 4420 - }, - { - "epoch": 0.45006603677740525, - "grad_norm": 2.421875, - "learning_rate": 2.934296858302515e-05, - "loss": 0.2228, - "step": 4430 - }, - { - "epoch": 0.4510819871990247, - "grad_norm": 1.6015625, - "learning_rate": 2.9263560284173485e-05, - "loss": 0.1637, - "step": 4440 - }, - { - "epoch": 0.4520979376206441, - "grad_norm": 4.5, - "learning_rate": 2.91841076564813e-05, - "loss": 0.1396, - "step": 4450 - }, - { - "epoch": 0.45311388804226355, - "grad_norm": 1.9609375, - "learning_rate": 2.9104611526028808e-05, - "loss": 0.186, - "step": 4460 - }, - { - "epoch": 0.45412983846388294, - "grad_norm": 2.046875, - "learning_rate": 2.902507271934855e-05, - "loss": 0.1706, - "step": 4470 - }, - { - "epoch": 0.4551457888855024, - "grad_norm": 2.390625, - "learning_rate": 2.8945492063416768e-05, - "loss": 0.2191, - "step": 4480 - }, - { - "epoch": 0.45616173930712184, - "grad_norm": 2.734375, - "learning_rate": 2.8865870385644823e-05, - "loss": 0.1651, - "step": 4490 - }, - { - "epoch": 0.45717768972874123, - "grad_norm": 4.4375, - "learning_rate": 2.8786208513870583e-05, - "loss": 0.1907, - "step": 4500 - }, - { - "epoch": 0.4581936401503607, - "grad_norm": 1.9609375, - "learning_rate": 2.8706507276349815e-05, - "loss": 0.2256, - "step": 4510 - }, - { - "epoch": 0.4592095905719801, - "grad_norm": 3.375, - "learning_rate": 2.8626767501747588e-05, - "loss": 0.215, - "step": 4520 - }, - { - "epoch": 0.4602255409935995, - "grad_norm": 2.296875, - "learning_rate": 2.854699001912964e-05, - "loss": 0.2241, - "step": 4530 - }, - { - "epoch": 0.4612414914152189, - "grad_norm": 2.078125, - "learning_rate": 2.846717565795376e-05, - "loss": 0.1541, - "step": 4540 - }, - { - "epoch": 0.46225744183683837, - "grad_norm": 0.81640625, - "learning_rate": 2.8387325248061164e-05, - "loss": 0.1718, - "step": 4550 - }, - { - "epoch": 0.46327339225845776, - "grad_norm": 5.6875, - "learning_rate": 2.8307439619667897e-05, - "loss": 0.259, - "step": 4560 - }, - { - "epoch": 0.4642893426800772, - "grad_norm": 1.78125, - "learning_rate": 2.8227519603356157e-05, - "loss": 0.2205, - "step": 4570 - }, - { - "epoch": 0.46530529310169666, - "grad_norm": 4.78125, - "learning_rate": 2.8147566030065677e-05, - "loss": 0.2256, - "step": 4580 - }, - { - "epoch": 0.46632124352331605, - "grad_norm": 3.296875, - "learning_rate": 2.8067579731085085e-05, - "loss": 0.1671, - "step": 4590 - }, - { - "epoch": 0.4673371939449355, - "grad_norm": 3.265625, - "learning_rate": 2.7987561538043273e-05, - "loss": 0.2471, - "step": 4600 - }, - { - "epoch": 0.4683531443665549, - "grad_norm": 3.390625, - "learning_rate": 2.7907512282900727e-05, - "loss": 0.1749, - "step": 4610 - }, - { - "epoch": 0.46936909478817435, - "grad_norm": 3.140625, - "learning_rate": 2.782743279794091e-05, - "loss": 0.2276, - "step": 4620 - }, - { - "epoch": 0.47038504520979374, - "grad_norm": 2.921875, - "learning_rate": 2.7747323915761574e-05, - "loss": 0.1971, - "step": 4630 - }, - { - "epoch": 0.4714009956314132, - "grad_norm": 4.15625, - "learning_rate": 2.7667186469266122e-05, - "loss": 0.1951, - "step": 4640 - }, - { - "epoch": 0.47241694605303264, - "grad_norm": 2.953125, - "learning_rate": 2.7587021291654924e-05, - "loss": 0.2045, - "step": 4650 - }, - { - "epoch": 0.47343289647465203, - "grad_norm": 1.6640625, - "learning_rate": 2.750682921641672e-05, - "loss": 0.155, - "step": 4660 - }, - { - "epoch": 0.4744488468962715, - "grad_norm": 4.375, - "learning_rate": 2.7426611077319864e-05, - "loss": 0.2038, - "step": 4670 - }, - { - "epoch": 0.4754647973178909, - "grad_norm": 5.5, - "learning_rate": 2.734636770840372e-05, - "loss": 0.159, - "step": 4680 - }, - { - "epoch": 0.4764807477395103, - "grad_norm": 1.703125, - "learning_rate": 2.7266099943969976e-05, - "loss": 0.1566, - "step": 4690 - }, - { - "epoch": 0.4774966981611297, - "grad_norm": 0.81640625, - "learning_rate": 2.7185808618573943e-05, - "loss": 0.1927, - "step": 4700 - }, - { - "epoch": 0.47851264858274917, - "grad_norm": 0.81640625, - "learning_rate": 2.710549456701592e-05, - "loss": 0.1873, - "step": 4710 - }, - { - "epoch": 0.47952859900436856, - "grad_norm": 3.828125, - "learning_rate": 2.702515862433247e-05, - "loss": 0.2474, - "step": 4720 - }, - { - "epoch": 0.480544549425988, - "grad_norm": 1.1640625, - "learning_rate": 2.6944801625787795e-05, - "loss": 0.204, - "step": 4730 - }, - { - "epoch": 0.48156049984760746, - "grad_norm": 2.953125, - "learning_rate": 2.6864424406864984e-05, - "loss": 0.1758, - "step": 4740 - }, - { - "epoch": 0.48257645026922685, - "grad_norm": 3.265625, - "learning_rate": 2.6784027803257377e-05, - "loss": 0.161, - "step": 4750 - }, - { - "epoch": 0.4835924006908463, - "grad_norm": 2.046875, - "learning_rate": 2.6703612650859848e-05, - "loss": 0.1469, - "step": 4760 - }, - { - "epoch": 0.4846083511124657, - "grad_norm": 4.03125, - "learning_rate": 2.6623179785760148e-05, - "loss": 0.1858, - "step": 4770 - }, - { - "epoch": 0.48562430153408515, - "grad_norm": 2.65625, - "learning_rate": 2.6542730044230175e-05, - "loss": 0.176, - "step": 4780 - }, - { - "epoch": 0.48664025195570454, - "grad_norm": 2.59375, - "learning_rate": 2.6462264262717278e-05, - "loss": 0.1657, - "step": 4790 - }, - { - "epoch": 0.487656202377324, - "grad_norm": 4.78125, - "learning_rate": 2.6381783277835605e-05, - "loss": 0.2705, - "step": 4800 - }, - { - "epoch": 0.48867215279894344, - "grad_norm": 3.65625, - "learning_rate": 2.6301287926357355e-05, - "loss": 0.2252, - "step": 4810 - }, - { - "epoch": 0.48968810322056283, - "grad_norm": 0.734375, - "learning_rate": 2.622077904520411e-05, - "loss": 0.2141, - "step": 4820 - }, - { - "epoch": 0.4907040536421823, - "grad_norm": 5.15625, - "learning_rate": 2.6140257471438108e-05, - "loss": 0.1935, - "step": 4830 - }, - { - "epoch": 0.4917200040638017, - "grad_norm": 3.625, - "learning_rate": 2.6059724042253574e-05, - "loss": 0.2121, - "step": 4840 - }, - { - "epoch": 0.4927359544854211, - "grad_norm": 1.2890625, - "learning_rate": 2.5979179594967983e-05, - "loss": 0.1221, - "step": 4850 - }, - { - "epoch": 0.4937519049070405, - "grad_norm": 3.4375, - "learning_rate": 2.5898624967013367e-05, - "loss": 0.2208, - "step": 4860 - }, - { - "epoch": 0.49476785532865997, - "grad_norm": 2.40625, - "learning_rate": 2.5818060995927607e-05, - "loss": 0.1904, - "step": 4870 - }, - { - "epoch": 0.49578380575027936, - "grad_norm": 2.921875, - "learning_rate": 2.573748851934574e-05, - "loss": 0.1658, - "step": 4880 - }, - { - "epoch": 0.4967997561718988, - "grad_norm": 1.6640625, - "learning_rate": 2.5656908374991213e-05, - "loss": 0.1626, - "step": 4890 - }, - { - "epoch": 0.49781570659351826, - "grad_norm": 1.8046875, - "learning_rate": 2.557632140066721e-05, - "loss": 0.1905, - "step": 4900 - }, - { - "epoch": 0.49883165701513765, - "grad_norm": 4.875, - "learning_rate": 2.5495728434247917e-05, - "loss": 0.2591, - "step": 4910 - }, - { - "epoch": 0.4998476074367571, - "grad_norm": 1.4453125, - "learning_rate": 2.5415130313669845e-05, - "loss": 0.1359, - "step": 4920 - }, - { - "epoch": 0.5008635578583766, - "grad_norm": 2.109375, - "learning_rate": 2.5334527876923063e-05, - "loss": 0.2353, - "step": 4930 - }, - { - "epoch": 0.501879508279996, - "grad_norm": 3.546875, - "learning_rate": 2.5253921962042525e-05, - "loss": 0.2173, - "step": 4940 - }, - { - "epoch": 0.5028954587016153, - "grad_norm": 1.8125, - "learning_rate": 2.5173313407099373e-05, - "loss": 0.1631, - "step": 4950 - }, - { - "epoch": 0.5039114091232347, - "grad_norm": 2.671875, - "learning_rate": 2.5092703050192163e-05, - "loss": 0.1884, - "step": 4960 - }, - { - "epoch": 0.5049273595448542, - "grad_norm": 2.5625, - "learning_rate": 2.501209172943819e-05, - "loss": 0.217, - "step": 4970 - }, - { - "epoch": 0.5059433099664736, - "grad_norm": 4.375, - "learning_rate": 2.49314802829648e-05, - "loss": 0.1854, - "step": 4980 - }, - { - "epoch": 0.506959260388093, - "grad_norm": 2.3125, - "learning_rate": 2.4850869548900628e-05, - "loss": 0.2049, - "step": 4990 - }, - { - "epoch": 0.5079752108097125, - "grad_norm": 3.859375, - "learning_rate": 2.477026036536688e-05, - "loss": 0.2093, - "step": 5000 - }, - { - "epoch": 0.5089911612313319, - "grad_norm": 1.09375, - "learning_rate": 2.4689653570468677e-05, - "loss": 0.164, - "step": 5010 - }, - { - "epoch": 0.5100071116529513, - "grad_norm": 3.40625, - "learning_rate": 2.460905000228628e-05, - "loss": 0.1649, - "step": 5020 - }, - { - "epoch": 0.5110230620745707, - "grad_norm": 3.546875, - "learning_rate": 2.4528450498866428e-05, - "loss": 0.1777, - "step": 5030 - }, - { - "epoch": 0.5120390124961902, - "grad_norm": 3.0, - "learning_rate": 2.444785589821356e-05, - "loss": 0.1505, - "step": 5040 - }, - { - "epoch": 0.5130549629178096, - "grad_norm": 1.6484375, - "learning_rate": 2.436726703828118e-05, - "loss": 0.2672, - "step": 5050 - }, - { - "epoch": 0.514070913339429, - "grad_norm": 4.34375, - "learning_rate": 2.428668475696308e-05, - "loss": 0.1756, - "step": 5060 - }, - { - "epoch": 0.5150868637610485, - "grad_norm": 2.78125, - "learning_rate": 2.420610989208465e-05, - "loss": 0.1655, - "step": 5070 - }, - { - "epoch": 0.5161028141826679, - "grad_norm": 1.4609375, - "learning_rate": 2.412554328139419e-05, - "loss": 0.1579, - "step": 5080 - }, - { - "epoch": 0.5171187646042873, - "grad_norm": 2.28125, - "learning_rate": 2.404498576255416e-05, - "loss": 0.1599, - "step": 5090 - }, - { - "epoch": 0.5181347150259067, - "grad_norm": 0.6484375, - "learning_rate": 2.3964438173132522e-05, - "loss": 0.1508, - "step": 5100 - }, - { - "epoch": 0.5191506654475262, - "grad_norm": 3.390625, - "learning_rate": 2.388390135059395e-05, - "loss": 0.1578, - "step": 5110 - }, - { - "epoch": 0.5201666158691456, - "grad_norm": 1.21875, - "learning_rate": 2.3803376132291226e-05, - "loss": 0.1374, - "step": 5120 - }, - { - "epoch": 0.521182566290765, - "grad_norm": 4.0625, - "learning_rate": 2.3722863355456436e-05, - "loss": 0.1854, - "step": 5130 - }, - { - "epoch": 0.5221985167123845, - "grad_norm": 4.71875, - "learning_rate": 2.364236385719236e-05, - "loss": 0.1391, - "step": 5140 - }, - { - "epoch": 0.5232144671340039, - "grad_norm": 3.296875, - "learning_rate": 2.356187847446366e-05, - "loss": 0.2106, - "step": 5150 - }, - { - "epoch": 0.5242304175556233, - "grad_norm": 3.296875, - "learning_rate": 2.348140804408829e-05, - "loss": 0.2383, - "step": 5160 - }, - { - "epoch": 0.5252463679772427, - "grad_norm": 3.359375, - "learning_rate": 2.3400953402728713e-05, - "loss": 0.1537, - "step": 5170 - }, - { - "epoch": 0.5262623183988622, - "grad_norm": 1.4921875, - "learning_rate": 2.332051538688322e-05, - "loss": 0.1841, - "step": 5180 - }, - { - "epoch": 0.5272782688204816, - "grad_norm": 3.25, - "learning_rate": 2.3240094832877287e-05, - "loss": 0.1855, - "step": 5190 - }, - { - "epoch": 0.528294219242101, - "grad_norm": 3.34375, - "learning_rate": 2.3159692576854793e-05, - "loss": 0.2625, - "step": 5200 - }, - { - "epoch": 0.5293101696637205, - "grad_norm": 3.6875, - "learning_rate": 2.3079309454769413e-05, - "loss": 0.1292, - "step": 5210 - }, - { - "epoch": 0.5303261200853399, - "grad_norm": 1.1171875, - "learning_rate": 2.2998946302375827e-05, - "loss": 0.1263, - "step": 5220 - }, - { - "epoch": 0.5313420705069593, - "grad_norm": 2.71875, - "learning_rate": 2.2918603955221148e-05, - "loss": 0.2296, - "step": 5230 - }, - { - "epoch": 0.5323580209285786, - "grad_norm": 2.015625, - "learning_rate": 2.283828324863613e-05, - "loss": 0.1231, - "step": 5240 - }, - { - "epoch": 0.5333739713501982, - "grad_norm": 3.671875, - "learning_rate": 2.2757985017726557e-05, - "loss": 0.1939, - "step": 5250 - }, - { - "epoch": 0.5343899217718175, - "grad_norm": 1.9765625, - "learning_rate": 2.2677710097364495e-05, - "loss": 0.168, - "step": 5260 - }, - { - "epoch": 0.5354058721934369, - "grad_norm": 2.609375, - "learning_rate": 2.259745932217969e-05, - "loss": 0.1883, - "step": 5270 - }, - { - "epoch": 0.5364218226150563, - "grad_norm": 2.8125, - "learning_rate": 2.2517233526550817e-05, - "loss": 0.1898, - "step": 5280 - }, - { - "epoch": 0.5374377730366758, - "grad_norm": 3.125, - "learning_rate": 2.2437033544596837e-05, - "loss": 0.1838, - "step": 5290 - }, - { - "epoch": 0.5384537234582952, - "grad_norm": 4.90625, - "learning_rate": 2.2356860210168336e-05, - "loss": 0.1553, - "step": 5300 - }, - { - "epoch": 0.5394696738799146, - "grad_norm": 3.171875, - "learning_rate": 2.2276714356838824e-05, - "loss": 0.2248, - "step": 5310 - }, - { - "epoch": 0.5404856243015341, - "grad_norm": 1.34375, - "learning_rate": 2.2196596817896118e-05, - "loss": 0.1421, - "step": 5320 - }, - { - "epoch": 0.5415015747231535, - "grad_norm": 3.28125, - "learning_rate": 2.2116508426333596e-05, - "loss": 0.1947, - "step": 5330 - }, - { - "epoch": 0.5425175251447729, - "grad_norm": 1.9296875, - "learning_rate": 2.2036450014841652e-05, - "loss": 0.2207, - "step": 5340 - }, - { - "epoch": 0.5435334755663923, - "grad_norm": 0.5703125, - "learning_rate": 2.19564224157989e-05, - "loss": 0.2208, - "step": 5350 - }, - { - "epoch": 0.5445494259880118, - "grad_norm": 7.5625, - "learning_rate": 2.1876426461263654e-05, - "loss": 0.1739, - "step": 5360 - }, - { - "epoch": 0.5455653764096312, - "grad_norm": 2.15625, - "learning_rate": 2.179646298296519e-05, - "loss": 0.1938, - "step": 5370 - }, - { - "epoch": 0.5465813268312506, - "grad_norm": 4.1875, - "learning_rate": 2.171653281229511e-05, - "loss": 0.1736, - "step": 5380 - }, - { - "epoch": 0.5475972772528701, - "grad_norm": 4.65625, - "learning_rate": 2.1636636780298732e-05, - "loss": 0.2167, - "step": 5390 - }, - { - "epoch": 0.5486132276744895, - "grad_norm": 1.84375, - "learning_rate": 2.1556775717666427e-05, - "loss": 0.1711, - "step": 5400 - }, - { - "epoch": 0.5496291780961089, - "grad_norm": 5.125, - "learning_rate": 2.147695045472499e-05, - "loss": 0.1789, - "step": 5410 - }, - { - "epoch": 0.5506451285177283, - "grad_norm": 3.859375, - "learning_rate": 2.1397161821428973e-05, - "loss": 0.2187, - "step": 5420 - }, - { - "epoch": 0.5516610789393478, - "grad_norm": 2.25, - "learning_rate": 2.131741064735212e-05, - "loss": 0.1367, - "step": 5430 - }, - { - "epoch": 0.5526770293609672, - "grad_norm": 4.65625, - "learning_rate": 2.1237697761678684e-05, - "loss": 0.1574, - "step": 5440 - }, - { - "epoch": 0.5536929797825866, - "grad_norm": 1.2265625, - "learning_rate": 2.1158023993194848e-05, - "loss": 0.1301, - "step": 5450 - }, - { - "epoch": 0.5547089302042061, - "grad_norm": 4.21875, - "learning_rate": 2.107839017028005e-05, - "loss": 0.2782, - "step": 5460 - }, - { - "epoch": 0.5557248806258255, - "grad_norm": 0.52734375, - "learning_rate": 2.0998797120898457e-05, - "loss": 0.2024, - "step": 5470 - }, - { - "epoch": 0.5567408310474449, - "grad_norm": 1.46875, - "learning_rate": 2.0919245672590277e-05, - "loss": 0.1755, - "step": 5480 - }, - { - "epoch": 0.5577567814690643, - "grad_norm": 2.140625, - "learning_rate": 2.083973665246318e-05, - "loss": 0.2058, - "step": 5490 - }, - { - "epoch": 0.5587727318906838, - "grad_norm": 1.5390625, - "learning_rate": 2.076027088718373e-05, - "loss": 0.2159, - "step": 5500 - }, - { - "epoch": 0.5597886823123032, - "grad_norm": 1.9921875, - "learning_rate": 2.0680849202968743e-05, - "loss": 0.2139, - "step": 5510 - }, - { - "epoch": 0.5608046327339226, - "grad_norm": 2.4375, - "learning_rate": 2.060147242557674e-05, - "loss": 0.183, - "step": 5520 - }, - { - "epoch": 0.5618205831555421, - "grad_norm": 5.5, - "learning_rate": 2.0522141380299308e-05, - "loss": 0.1673, - "step": 5530 - }, - { - "epoch": 0.5628365335771615, - "grad_norm": 4.25, - "learning_rate": 2.044285689195258e-05, - "loss": 0.1674, - "step": 5540 - }, - { - "epoch": 0.5638524839987809, - "grad_norm": 2.109375, - "learning_rate": 2.0363619784868604e-05, - "loss": 0.1531, - "step": 5550 - }, - { - "epoch": 0.5648684344204002, - "grad_norm": 2.59375, - "learning_rate": 2.0284430882886836e-05, - "loss": 0.1665, - "step": 5560 - }, - { - "epoch": 0.5658843848420197, - "grad_norm": 3.984375, - "learning_rate": 2.020529100934549e-05, - "loss": 0.1717, - "step": 5570 - }, - { - "epoch": 0.5669003352636391, - "grad_norm": 1.6015625, - "learning_rate": 2.012620098707306e-05, - "loss": 0.1167, - "step": 5580 - }, - { - "epoch": 0.5679162856852585, - "grad_norm": 6.0625, - "learning_rate": 2.004716163837972e-05, - "loss": 0.2084, - "step": 5590 - }, - { - "epoch": 0.5689322361068779, - "grad_norm": 2.5625, - "learning_rate": 1.996817378504876e-05, - "loss": 0.1939, - "step": 5600 - }, - { - "epoch": 0.5699481865284974, - "grad_norm": 3.109375, - "learning_rate": 1.9889238248328108e-05, - "loss": 0.1241, - "step": 5610 - }, - { - "epoch": 0.5709641369501168, - "grad_norm": 4.875, - "learning_rate": 1.981035584892171e-05, - "loss": 0.1865, - "step": 5620 - }, - { - "epoch": 0.5719800873717362, - "grad_norm": 2.984375, - "learning_rate": 1.9731527406981072e-05, - "loss": 0.1639, - "step": 5630 - }, - { - "epoch": 0.5729960377933557, - "grad_norm": 4.4375, - "learning_rate": 1.9652753742096655e-05, - "loss": 0.2019, - "step": 5640 - }, - { - "epoch": 0.5740119882149751, - "grad_norm": 4.3125, - "learning_rate": 1.9574035673289432e-05, - "loss": 0.1829, - "step": 5650 - }, - { - "epoch": 0.5750279386365945, - "grad_norm": 3.203125, - "learning_rate": 1.9495374019002312e-05, - "loss": 0.2267, - "step": 5660 - }, - { - "epoch": 0.5760438890582139, - "grad_norm": 1.765625, - "learning_rate": 1.9416769597091673e-05, - "loss": 0.1411, - "step": 5670 - }, - { - "epoch": 0.5770598394798334, - "grad_norm": 2.640625, - "learning_rate": 1.9338223224818818e-05, - "loss": 0.1476, - "step": 5680 - }, - { - "epoch": 0.5780757899014528, - "grad_norm": 4.84375, - "learning_rate": 1.9259735718841524e-05, - "loss": 0.1417, - "step": 5690 - }, - { - "epoch": 0.5790917403230722, - "grad_norm": 2.421875, - "learning_rate": 1.918130789520551e-05, - "loss": 0.1592, - "step": 5700 - }, - { - "epoch": 0.5801076907446917, - "grad_norm": 2.984375, - "learning_rate": 1.9102940569335963e-05, - "loss": 0.161, - "step": 5710 - }, - { - "epoch": 0.5811236411663111, - "grad_norm": 1.0234375, - "learning_rate": 1.9024634556029093e-05, - "loss": 0.1614, - "step": 5720 - }, - { - "epoch": 0.5821395915879305, - "grad_norm": 2.90625, - "learning_rate": 1.89463906694436e-05, - "loss": 0.1505, - "step": 5730 - }, - { - "epoch": 0.5831555420095499, - "grad_norm": 2.875, - "learning_rate": 1.8868209723092286e-05, - "loss": 0.1674, - "step": 5740 - }, - { - "epoch": 0.5841714924311694, - "grad_norm": 0.408203125, - "learning_rate": 1.8790092529833508e-05, - "loss": 0.1468, - "step": 5750 - }, - { - "epoch": 0.5851874428527888, - "grad_norm": 5.1875, - "learning_rate": 1.871203990186281e-05, - "loss": 0.1903, - "step": 5760 - }, - { - "epoch": 0.5862033932744082, - "grad_norm": 0.5546875, - "learning_rate": 1.8634052650704415e-05, - "loss": 0.2644, - "step": 5770 - }, - { - "epoch": 0.5872193436960277, - "grad_norm": 3.203125, - "learning_rate": 1.8556131587202848e-05, - "loss": 0.1968, - "step": 5780 - }, - { - "epoch": 0.5882352941176471, - "grad_norm": 2.484375, - "learning_rate": 1.8478277521514424e-05, - "loss": 0.2249, - "step": 5790 - }, - { - "epoch": 0.5892512445392665, - "grad_norm": 4.0, - "learning_rate": 1.8400491263098906e-05, - "loss": 0.1881, - "step": 5800 - }, - { - "epoch": 0.5902671949608859, - "grad_norm": 1.90625, - "learning_rate": 1.832277362071106e-05, - "loss": 0.1352, - "step": 5810 - }, - { - "epoch": 0.5912831453825054, - "grad_norm": 2.765625, - "learning_rate": 1.824512540239221e-05, - "loss": 0.2737, - "step": 5820 - }, - { - "epoch": 0.5922990958041248, - "grad_norm": 2.609375, - "learning_rate": 1.81675474154619e-05, - "loss": 0.1566, - "step": 5830 - }, - { - "epoch": 0.5933150462257442, - "grad_norm": 2.6875, - "learning_rate": 1.8090040466509444e-05, - "loss": 0.1999, - "step": 5840 - }, - { - "epoch": 0.5943309966473637, - "grad_norm": 2.609375, - "learning_rate": 1.8012605361385592e-05, - "loss": 0.2372, - "step": 5850 - }, - { - "epoch": 0.5953469470689831, - "grad_norm": 8.125, - "learning_rate": 1.7935242905194087e-05, - "loss": 0.2411, - "step": 5860 - }, - { - "epoch": 0.5963628974906025, - "grad_norm": 3.46875, - "learning_rate": 1.785795390228336e-05, - "loss": 0.138, - "step": 5870 - }, - { - "epoch": 0.5973788479122218, - "grad_norm": 2.3125, - "learning_rate": 1.7780739156238125e-05, - "loss": 0.1867, - "step": 5880 - }, - { - "epoch": 0.5983947983338413, - "grad_norm": 4.0625, - "learning_rate": 1.770359946987105e-05, - "loss": 0.2091, - "step": 5890 - }, - { - "epoch": 0.5994107487554607, - "grad_norm": 5.21875, - "learning_rate": 1.7626535645214378e-05, - "loss": 0.2091, - "step": 5900 - }, - { - "epoch": 0.6004266991770801, - "grad_norm": 3.15625, - "learning_rate": 1.7549548483511614e-05, - "loss": 0.1927, - "step": 5910 - }, - { - "epoch": 0.6014426495986995, - "grad_norm": 4.71875, - "learning_rate": 1.7472638785209198e-05, - "loss": 0.1893, - "step": 5920 - }, - { - "epoch": 0.602458600020319, - "grad_norm": 3.015625, - "learning_rate": 1.7395807349948145e-05, - "loss": 0.1557, - "step": 5930 - }, - { - "epoch": 0.6034745504419384, - "grad_norm": 2.9375, - "learning_rate": 1.73190549765558e-05, - "loss": 0.1717, - "step": 5940 - }, - { - "epoch": 0.6044905008635578, - "grad_norm": 3.109375, - "learning_rate": 1.724238246303745e-05, - "loss": 0.1879, - "step": 5950 - }, - { - "epoch": 0.6055064512851773, - "grad_norm": 3.875, - "learning_rate": 1.71657906065681e-05, - "loss": 0.1908, - "step": 5960 - }, - { - "epoch": 0.6065224017067967, - "grad_norm": 5.09375, - "learning_rate": 1.7089280203484115e-05, - "loss": 0.1712, - "step": 5970 - }, - { - "epoch": 0.6075383521284161, - "grad_norm": 3.015625, - "learning_rate": 1.701285204927502e-05, - "loss": 0.1454, - "step": 5980 - }, - { - "epoch": 0.6085543025500355, - "grad_norm": 3.265625, - "learning_rate": 1.693650693857515e-05, - "loss": 0.2283, - "step": 5990 - }, - { - "epoch": 0.609570252971655, - "grad_norm": 3.40625, - "learning_rate": 1.6860245665155466e-05, - "loss": 0.2188, - "step": 6000 - }, - { - "epoch": 0.6105862033932744, - "grad_norm": 2.5625, - "learning_rate": 1.678406902191521e-05, - "loss": 0.1605, - "step": 6010 - }, - { - "epoch": 0.6116021538148938, - "grad_norm": 0.6796875, - "learning_rate": 1.670797780087374e-05, - "loss": 0.1472, - "step": 6020 - }, - { - "epoch": 0.6126181042365133, - "grad_norm": 2.234375, - "learning_rate": 1.6631972793162288e-05, - "loss": 0.1676, - "step": 6030 - }, - { - "epoch": 0.6136340546581327, - "grad_norm": 1.25, - "learning_rate": 1.6556054789015662e-05, - "loss": 0.1508, - "step": 6040 - }, - { - "epoch": 0.6146500050797521, - "grad_norm": 4.78125, - "learning_rate": 1.6480224577764132e-05, - "loss": 0.1981, - "step": 6050 - }, - { - "epoch": 0.6156659555013715, - "grad_norm": 3.46875, - "learning_rate": 1.6404482947825137e-05, - "loss": 0.2514, - "step": 6060 - }, - { - "epoch": 0.616681905922991, - "grad_norm": 1.265625, - "learning_rate": 1.6328830686695154e-05, - "loss": 0.2397, - "step": 6070 - }, - { - "epoch": 0.6176978563446104, - "grad_norm": 1.953125, - "learning_rate": 1.625326858094144e-05, - "loss": 0.1523, - "step": 6080 - }, - { - "epoch": 0.6187138067662298, - "grad_norm": 3.484375, - "learning_rate": 1.6177797416193953e-05, - "loss": 0.218, - "step": 6090 - }, - { - "epoch": 0.6197297571878493, - "grad_norm": 3.484375, - "learning_rate": 1.6102417977137052e-05, - "loss": 0.1476, - "step": 6100 - }, - { - "epoch": 0.6207457076094687, - "grad_norm": 4.90625, - "learning_rate": 1.602713104750147e-05, - "loss": 0.1818, - "step": 6110 - }, - { - "epoch": 0.6217616580310881, - "grad_norm": 4.375, - "learning_rate": 1.5951937410056087e-05, - "loss": 0.2061, - "step": 6120 - }, - { - "epoch": 0.6227776084527075, - "grad_norm": 6.3125, - "learning_rate": 1.587683784659979e-05, - "loss": 0.1566, - "step": 6130 - }, - { - "epoch": 0.623793558874327, - "grad_norm": 2.828125, - "learning_rate": 1.58018331379534e-05, - "loss": 0.1376, - "step": 6140 - }, - { - "epoch": 0.6248095092959464, - "grad_norm": 2.40625, - "learning_rate": 1.572692406395149e-05, - "loss": 0.1655, - "step": 6150 - }, - { - "epoch": 0.6258254597175658, - "grad_norm": 4.34375, - "learning_rate": 1.5652111403434338e-05, - "loss": 0.2363, - "step": 6160 - }, - { - "epoch": 0.6268414101391853, - "grad_norm": 2.453125, - "learning_rate": 1.5577395934239757e-05, - "loss": 0.2464, - "step": 6170 - }, - { - "epoch": 0.6278573605608047, - "grad_norm": 2.53125, - "learning_rate": 1.5502778433195085e-05, - "loss": 0.1898, - "step": 6180 - }, - { - "epoch": 0.628873310982424, - "grad_norm": 2.28125, - "learning_rate": 1.5428259676109048e-05, - "loss": 0.1804, - "step": 6190 - }, - { - "epoch": 0.6298892614040434, - "grad_norm": 4.3125, - "learning_rate": 1.5353840437763732e-05, - "loss": 0.1409, - "step": 6200 - }, - { - "epoch": 0.630905211825663, - "grad_norm": 2.5625, - "learning_rate": 1.5279521491906496e-05, - "loss": 0.2449, - "step": 6210 - }, - { - "epoch": 0.6319211622472823, - "grad_norm": 3.0625, - "learning_rate": 1.520530361124195e-05, - "loss": 0.2103, - "step": 6220 - }, - { - "epoch": 0.6329371126689017, - "grad_norm": 2.609375, - "learning_rate": 1.5131187567423937e-05, - "loss": 0.2156, - "step": 6230 - }, - { - "epoch": 0.6339530630905211, - "grad_norm": 2.703125, - "learning_rate": 1.5057174131047446e-05, - "loss": 0.161, - "step": 6240 - }, - { - "epoch": 0.6349690135121406, - "grad_norm": 3.265625, - "learning_rate": 1.4983264071640679e-05, - "loss": 0.1757, - "step": 6250 - }, - { - "epoch": 0.63598496393376, - "grad_norm": 3.15625, - "learning_rate": 1.490945815765699e-05, - "loss": 0.2011, - "step": 6260 - }, - { - "epoch": 0.6370009143553794, - "grad_norm": 5.375, - "learning_rate": 1.4835757156466945e-05, - "loss": 0.1658, - "step": 6270 - }, - { - "epoch": 0.6380168647769989, - "grad_norm": 2.984375, - "learning_rate": 1.4762161834350271e-05, - "loss": 0.1754, - "step": 6280 - }, - { - "epoch": 0.6390328151986183, - "grad_norm": 2.015625, - "learning_rate": 1.4688672956487987e-05, - "loss": 0.1427, - "step": 6290 - }, - { - "epoch": 0.6400487656202377, - "grad_norm": 3.78125, - "learning_rate": 1.4615291286954352e-05, - "loss": 0.1517, - "step": 6300 - }, - { - "epoch": 0.6410647160418571, - "grad_norm": 2.859375, - "learning_rate": 1.4542017588709005e-05, - "loss": 0.2348, - "step": 6310 - }, - { - "epoch": 0.6420806664634766, - "grad_norm": 2.421875, - "learning_rate": 1.4468852623588961e-05, - "loss": 0.2089, - "step": 6320 - }, - { - "epoch": 0.643096616885096, - "grad_norm": 2.15625, - "learning_rate": 1.4395797152300719e-05, - "loss": 0.1702, - "step": 6330 - }, - { - "epoch": 0.6441125673067154, - "grad_norm": 1.53125, - "learning_rate": 1.4322851934412382e-05, - "loss": 0.1017, - "step": 6340 - }, - { - "epoch": 0.6451285177283349, - "grad_norm": 1.90625, - "learning_rate": 1.4250017728345716e-05, - "loss": 0.1813, - "step": 6350 - }, - { - "epoch": 0.6461444681499543, - "grad_norm": 2.015625, - "learning_rate": 1.4177295291368292e-05, - "loss": 0.1095, - "step": 6360 - }, - { - "epoch": 0.6471604185715737, - "grad_norm": 2.625, - "learning_rate": 1.410468537958558e-05, - "loss": 0.2259, - "step": 6370 - }, - { - "epoch": 0.6481763689931931, - "grad_norm": 3.5, - "learning_rate": 1.4032188747933136e-05, - "loss": 0.1595, - "step": 6380 - }, - { - "epoch": 0.6491923194148126, - "grad_norm": 5.21875, - "learning_rate": 1.39598061501687e-05, - "loss": 0.2226, - "step": 6390 - }, - { - "epoch": 0.650208269836432, - "grad_norm": 5.34375, - "learning_rate": 1.388753833886442e-05, - "loss": 0.2132, - "step": 6400 - }, - { - "epoch": 0.6512242202580514, - "grad_norm": 3.640625, - "learning_rate": 1.3815386065398945e-05, - "loss": 0.1227, - "step": 6410 - }, - { - "epoch": 0.6522401706796709, - "grad_norm": 1.0, - "learning_rate": 1.3743350079949705e-05, - "loss": 0.1755, - "step": 6420 - }, - { - "epoch": 0.6532561211012903, - "grad_norm": 2.359375, - "learning_rate": 1.3671431131485057e-05, - "loss": 0.1552, - "step": 6430 - }, - { - "epoch": 0.6542720715229097, - "grad_norm": 5.3125, - "learning_rate": 1.3599629967756483e-05, - "loss": 0.1917, - "step": 6440 - }, - { - "epoch": 0.6552880219445291, - "grad_norm": 4.625, - "learning_rate": 1.3527947335290877e-05, - "loss": 0.1812, - "step": 6450 - }, - { - "epoch": 0.6563039723661486, - "grad_norm": 1.234375, - "learning_rate": 1.3456383979382708e-05, - "loss": 0.1896, - "step": 6460 - }, - { - "epoch": 0.657319922787768, - "grad_norm": 3.984375, - "learning_rate": 1.3384940644086352e-05, - "loss": 0.1484, - "step": 6470 - }, - { - "epoch": 0.6583358732093874, - "grad_norm": 2.40625, - "learning_rate": 1.3313618072208268e-05, - "loss": 0.1334, - "step": 6480 - }, - { - "epoch": 0.6593518236310069, - "grad_norm": 4.375, - "learning_rate": 1.3242417005299357e-05, - "loss": 0.1351, - "step": 6490 - }, - { - "epoch": 0.6603677740526263, - "grad_norm": 2.640625, - "learning_rate": 1.31713381836472e-05, - "loss": 0.1717, - "step": 6500 - } - ], - "logging_steps": 10, - "max_steps": 9843, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "total_flos": 0.0, - "train_batch_size": 7, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoints/checkpoint-6500/training_args.bin b/checkpoints/checkpoint-6500/training_args.bin deleted file mode 100644 index 6bb0a4ec10ab29ca1942aecdfa4212d352e373f1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-6500/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 -size 4984 diff --git a/checkpoints/checkpoint-7000/adapter_config.json b/checkpoints/checkpoint-7000/adapter_config.json deleted file mode 100644 index 2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-7000/adapter_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "alpha_pattern": {}, - "auto_mapping": null, - "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": "gaussian", - "layer_replication": null, - "layers_pattern": null, - "layers_to_transform": null, - "loftq_config": {}, - "lora_alpha": 32, - "lora_dropout": 0.1, - "megatron_config": null, - "megatron_core": "megatron.core", - "modules_to_save": null, - "peft_type": "LORA", - "r": 8, - "rank_pattern": {}, - "revision": null, - "target_modules": [ - "v_proj", - "q_proj" - ], - "task_type": "CAUSAL_LM", - "use_dora": false, - "use_rslora": false -} \ No newline at end of file diff --git a/checkpoints/checkpoint-7000/adapter_model.safetensors b/checkpoints/checkpoint-7000/adapter_model.safetensors deleted file mode 100644 index f035c3b02bb414b55a51e0686d93c6e41268b717..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-7000/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dcfbb907ba164ea8f4886b499272e552c75b66b5f57c2758a433e516afc4d4ce -size 5919456 diff --git a/checkpoints/checkpoint-7000/optimizer.pt b/checkpoints/checkpoint-7000/optimizer.pt deleted file mode 100644 index e2b419628c564de6327192effa738d6dcc841ee6..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-7000/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:18252d03a633e94bef652fd14050f4f4c005fa3e1408f904a8582bde8cf6615c -size 11930938 diff --git a/checkpoints/checkpoint-7000/rng_state_0.pth b/checkpoints/checkpoint-7000/rng_state_0.pth deleted file mode 100644 index 8a64a4f50d135d9aab30a3efe7dd0f870f273099..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-7000/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:189e349a3350f8ede7259e8b9c3805a87a95924562b5e53a021c94a808a1c148 -size 15024 diff --git a/checkpoints/checkpoint-7000/rng_state_1.pth b/checkpoints/checkpoint-7000/rng_state_1.pth deleted file mode 100644 index 3669897a1580ca7339f57897968f1cd06fa56704..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-7000/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cb75d1d5d631f8db237b129d4d42263dc618eca39919111a64bdee685ea49d66 -size 15024 diff --git a/checkpoints/checkpoint-7000/rng_state_2.pth b/checkpoints/checkpoint-7000/rng_state_2.pth deleted file mode 100644 index 26649e78cb314240d04745c505919161570ff823..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-7000/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:35edf2986023cc71305d54b71c695198c4e470f12b08318d2bdd928b6f0040bb -size 15024 diff --git a/checkpoints/checkpoint-7000/rng_state_3.pth b/checkpoints/checkpoint-7000/rng_state_3.pth deleted file mode 100644 index 18f1294c99a3714188a7fd11fee212fba2a544af..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-7000/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f6aeb7ee84fba724996f3c432c15837e87dd1f98cb887991d0b5902eadb092ea -size 15024 diff --git a/checkpoints/checkpoint-7000/scheduler.pt b/checkpoints/checkpoint-7000/scheduler.pt deleted file mode 100644 index 38eb3798e21f4a386d944a1c7b7f72bbd3680866..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-7000/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c38f0818bc728352543f333573634bed4254978229e01cce08ff84863c12dc13 -size 1064 diff --git a/checkpoints/checkpoint-7000/trainer_state.json b/checkpoints/checkpoint-7000/trainer_state.json deleted file mode 100644 index 2be0b7f759106ae9ab905f59d43ffd30cde0054b..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-7000/trainer_state.json +++ /dev/null @@ -1,4921 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.7111652951335975, - "eval_steps": 500, - "global_step": 7000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001015950421619425, - "grad_norm": 17.625, - "learning_rate": 5e-06, - "loss": 3.4264, - "step": 10 - }, - { - "epoch": 0.00203190084323885, - "grad_norm": 12.5625, - "learning_rate": 1e-05, - "loss": 3.432, - "step": 20 - }, - { - "epoch": 0.003047851264858275, - "grad_norm": 14.0625, - "learning_rate": 1.5e-05, - "loss": 3.23, - "step": 30 - }, - { - "epoch": 0.0040638016864777, - "grad_norm": 12.4375, - "learning_rate": 2e-05, - "loss": 2.9762, - "step": 40 - }, - { - "epoch": 0.005079752108097125, - "grad_norm": 10.0625, - "learning_rate": 2.5e-05, - "loss": 2.6173, - "step": 50 - }, - { - "epoch": 0.00609570252971655, - "grad_norm": 10.1875, - "learning_rate": 3e-05, - "loss": 2.2004, - "step": 60 - }, - { - "epoch": 0.007111652951335975, - "grad_norm": 7.03125, - "learning_rate": 3.5e-05, - "loss": 1.4176, - "step": 70 - }, - { - "epoch": 0.0081276033729554, - "grad_norm": 4.375, - "learning_rate": 4e-05, - "loss": 1.0122, - "step": 80 - }, - { - "epoch": 0.009143553794574825, - "grad_norm": 6.5625, - "learning_rate": 4.5e-05, - "loss": 0.9116, - "step": 90 - }, - { - "epoch": 0.01015950421619425, - "grad_norm": 5.28125, - "learning_rate": 5e-05, - "loss": 0.6832, - "step": 100 - }, - { - "epoch": 0.011175454637813675, - "grad_norm": 5.5, - "learning_rate": 4.9999870035728426e-05, - "loss": 0.7355, - "step": 110 - }, - { - "epoch": 0.0121914050594331, - "grad_norm": 5.1875, - "learning_rate": 4.9999480144264944e-05, - "loss": 0.6673, - "step": 120 - }, - { - "epoch": 0.013207355481052525, - "grad_norm": 4.5, - "learning_rate": 4.9998830329663314e-05, - "loss": 0.6792, - "step": 130 - }, - { - "epoch": 0.01422330590267195, - "grad_norm": 3.9375, - "learning_rate": 4.9997920598679756e-05, - "loss": 0.6207, - "step": 140 - }, - { - "epoch": 0.015239256324291375, - "grad_norm": 3.15625, - "learning_rate": 4.999675096077286e-05, - "loss": 0.483, - "step": 150 - }, - { - "epoch": 0.0162552067459108, - "grad_norm": 5.28125, - "learning_rate": 4.999532142810354e-05, - "loss": 0.5319, - "step": 160 - }, - { - "epoch": 0.017271157167530225, - "grad_norm": 4.59375, - "learning_rate": 4.999363201553483e-05, - "loss": 0.6052, - "step": 170 - }, - { - "epoch": 0.01828710758914965, - "grad_norm": 5.03125, - "learning_rate": 4.9991682740631794e-05, - "loss": 0.4258, - "step": 180 - }, - { - "epoch": 0.019303058010769075, - "grad_norm": 3.859375, - "learning_rate": 4.998947362366133e-05, - "loss": 0.4309, - "step": 190 - }, - { - "epoch": 0.0203190084323885, - "grad_norm": 3.328125, - "learning_rate": 4.998700468759193e-05, - "loss": 0.3957, - "step": 200 - }, - { - "epoch": 0.021334958854007924, - "grad_norm": 4.9375, - "learning_rate": 4.9984275958093475e-05, - "loss": 0.4777, - "step": 210 - }, - { - "epoch": 0.02235090927562735, - "grad_norm": 4.78125, - "learning_rate": 4.998128746353695e-05, - "loss": 0.3549, - "step": 220 - }, - { - "epoch": 0.023366859697246774, - "grad_norm": 4.0625, - "learning_rate": 4.997803923499417e-05, - "loss": 0.4447, - "step": 230 - }, - { - "epoch": 0.0243828101188662, - "grad_norm": 6.375, - "learning_rate": 4.99745313062374e-05, - "loss": 0.3808, - "step": 240 - }, - { - "epoch": 0.025398760540485624, - "grad_norm": 3.59375, - "learning_rate": 4.99707637137391e-05, - "loss": 0.3827, - "step": 250 - }, - { - "epoch": 0.02641471096210505, - "grad_norm": 3.015625, - "learning_rate": 4.996673649667145e-05, - "loss": 0.3694, - "step": 260 - }, - { - "epoch": 0.027430661383724474, - "grad_norm": 2.296875, - "learning_rate": 4.9962449696906e-05, - "loss": 0.3586, - "step": 270 - }, - { - "epoch": 0.0284466118053439, - "grad_norm": 4.125, - "learning_rate": 4.9957903359013214e-05, - "loss": 0.3832, - "step": 280 - }, - { - "epoch": 0.029462562226963324, - "grad_norm": 3.296875, - "learning_rate": 4.995309753026201e-05, - "loss": 0.328, - "step": 290 - }, - { - "epoch": 0.03047851264858275, - "grad_norm": 4.5, - "learning_rate": 4.994803226061927e-05, - "loss": 0.3667, - "step": 300 - }, - { - "epoch": 0.03149446307020217, - "grad_norm": 4.3125, - "learning_rate": 4.994270760274933e-05, - "loss": 0.3811, - "step": 310 - }, - { - "epoch": 0.0325104134918216, - "grad_norm": 3.421875, - "learning_rate": 4.99371236120134e-05, - "loss": 0.3065, - "step": 320 - }, - { - "epoch": 0.03352636391344102, - "grad_norm": 4.6875, - "learning_rate": 4.993128034646902e-05, - "loss": 0.4177, - "step": 330 - }, - { - "epoch": 0.03454231433506045, - "grad_norm": 3.046875, - "learning_rate": 4.992517786686947e-05, - "loss": 0.33, - "step": 340 - }, - { - "epoch": 0.03555826475667987, - "grad_norm": 1.8828125, - "learning_rate": 4.9918816236663077e-05, - "loss": 0.3287, - "step": 350 - }, - { - "epoch": 0.0365742151782993, - "grad_norm": 3.8125, - "learning_rate": 4.991219552199262e-05, - "loss": 0.2934, - "step": 360 - }, - { - "epoch": 0.03759016559991872, - "grad_norm": 4.28125, - "learning_rate": 4.99053157916946e-05, - "loss": 0.3176, - "step": 370 - }, - { - "epoch": 0.03860611602153815, - "grad_norm": 2.609375, - "learning_rate": 4.989817711729856e-05, - "loss": 0.3318, - "step": 380 - }, - { - "epoch": 0.03962206644315757, - "grad_norm": 2.375, - "learning_rate": 4.98907795730263e-05, - "loss": 0.3234, - "step": 390 - }, - { - "epoch": 0.040638016864777, - "grad_norm": 4.46875, - "learning_rate": 4.988312323579114e-05, - "loss": 0.267, - "step": 400 - }, - { - "epoch": 0.04165396728639642, - "grad_norm": 3.75, - "learning_rate": 4.98752081851971e-05, - "loss": 0.3081, - "step": 410 - }, - { - "epoch": 0.04266991770801585, - "grad_norm": 2.203125, - "learning_rate": 4.986703450353809e-05, - "loss": 0.2917, - "step": 420 - }, - { - "epoch": 0.04368586812963527, - "grad_norm": 1.6015625, - "learning_rate": 4.985860227579703e-05, - "loss": 0.2805, - "step": 430 - }, - { - "epoch": 0.0447018185512547, - "grad_norm": 3.140625, - "learning_rate": 4.984991158964499e-05, - "loss": 0.3534, - "step": 440 - }, - { - "epoch": 0.04571776897287412, - "grad_norm": 3.296875, - "learning_rate": 4.9840962535440265e-05, - "loss": 0.335, - "step": 450 - }, - { - "epoch": 0.04673371939449355, - "grad_norm": 3.25, - "learning_rate": 4.983175520622744e-05, - "loss": 0.2544, - "step": 460 - }, - { - "epoch": 0.04774966981611297, - "grad_norm": 2.25, - "learning_rate": 4.982228969773642e-05, - "loss": 0.3449, - "step": 470 - }, - { - "epoch": 0.0487656202377324, - "grad_norm": 4.9375, - "learning_rate": 4.9812566108381435e-05, - "loss": 0.2964, - "step": 480 - }, - { - "epoch": 0.04978157065935182, - "grad_norm": 1.5703125, - "learning_rate": 4.9802584539260035e-05, - "loss": 0.2799, - "step": 490 - }, - { - "epoch": 0.05079752108097125, - "grad_norm": 2.828125, - "learning_rate": 4.979234509415199e-05, - "loss": 0.3231, - "step": 500 - }, - { - "epoch": 0.05181347150259067, - "grad_norm": 2.9375, - "learning_rate": 4.978184787951828e-05, - "loss": 0.2943, - "step": 510 - }, - { - "epoch": 0.0528294219242101, - "grad_norm": 2.34375, - "learning_rate": 4.977109300449992e-05, - "loss": 0.2705, - "step": 520 - }, - { - "epoch": 0.05384537234582952, - "grad_norm": 3.140625, - "learning_rate": 4.9760080580916876e-05, - "loss": 0.2998, - "step": 530 - }, - { - "epoch": 0.05486132276744895, - "grad_norm": 3.5625, - "learning_rate": 4.974881072326688e-05, - "loss": 0.2595, - "step": 540 - }, - { - "epoch": 0.05587727318906837, - "grad_norm": 4.25, - "learning_rate": 4.9737283548724236e-05, - "loss": 0.2803, - "step": 550 - }, - { - "epoch": 0.0568932236106878, - "grad_norm": 4.0625, - "learning_rate": 4.97254991771386e-05, - "loss": 0.3511, - "step": 560 - }, - { - "epoch": 0.05790917403230722, - "grad_norm": 2.515625, - "learning_rate": 4.971345773103377e-05, - "loss": 0.312, - "step": 570 - }, - { - "epoch": 0.05892512445392665, - "grad_norm": 3.21875, - "learning_rate": 4.9701159335606365e-05, - "loss": 0.2482, - "step": 580 - }, - { - "epoch": 0.05994107487554607, - "grad_norm": 5.5, - "learning_rate": 4.968860411872454e-05, - "loss": 0.2537, - "step": 590 - }, - { - "epoch": 0.0609570252971655, - "grad_norm": 3.546875, - "learning_rate": 4.967579221092666e-05, - "loss": 0.3125, - "step": 600 - }, - { - "epoch": 0.06197297571878492, - "grad_norm": 2.984375, - "learning_rate": 4.966272374541996e-05, - "loss": 0.2354, - "step": 610 - }, - { - "epoch": 0.06298892614040434, - "grad_norm": 3.6875, - "learning_rate": 4.964939885807912e-05, - "loss": 0.3213, - "step": 620 - }, - { - "epoch": 0.06400487656202378, - "grad_norm": 2.140625, - "learning_rate": 4.9635817687444876e-05, - "loss": 0.3003, - "step": 630 - }, - { - "epoch": 0.0650208269836432, - "grad_norm": 3.484375, - "learning_rate": 4.962198037472259e-05, - "loss": 0.2996, - "step": 640 - }, - { - "epoch": 0.06603677740526262, - "grad_norm": 3.21875, - "learning_rate": 4.9607887063780776e-05, - "loss": 0.2257, - "step": 650 - }, - { - "epoch": 0.06705272782688204, - "grad_norm": 5.375, - "learning_rate": 4.9593537901149564e-05, - "loss": 0.223, - "step": 660 - }, - { - "epoch": 0.06806867824850148, - "grad_norm": 4.1875, - "learning_rate": 4.957893303601924e-05, - "loss": 0.3407, - "step": 670 - }, - { - "epoch": 0.0690846286701209, - "grad_norm": 3.328125, - "learning_rate": 4.956407262023866e-05, - "loss": 0.2589, - "step": 680 - }, - { - "epoch": 0.07010057909174032, - "grad_norm": 2.953125, - "learning_rate": 4.954895680831367e-05, - "loss": 0.2949, - "step": 690 - }, - { - "epoch": 0.07111652951335974, - "grad_norm": 4.0625, - "learning_rate": 4.9533585757405506e-05, - "loss": 0.2995, - "step": 700 - }, - { - "epoch": 0.07213247993497918, - "grad_norm": 4.625, - "learning_rate": 4.951795962732917e-05, - "loss": 0.2894, - "step": 710 - }, - { - "epoch": 0.0731484303565986, - "grad_norm": 3.0, - "learning_rate": 4.9502078580551755e-05, - "loss": 0.3082, - "step": 720 - }, - { - "epoch": 0.07416438077821802, - "grad_norm": 3.65625, - "learning_rate": 4.9485942782190734e-05, - "loss": 0.2308, - "step": 730 - }, - { - "epoch": 0.07518033119983744, - "grad_norm": 4.78125, - "learning_rate": 4.9469552400012306e-05, - "loss": 0.2272, - "step": 740 - }, - { - "epoch": 0.07619628162145688, - "grad_norm": 4.25, - "learning_rate": 4.94529076044296e-05, - "loss": 0.2701, - "step": 750 - }, - { - "epoch": 0.0772122320430763, - "grad_norm": 3.140625, - "learning_rate": 4.94360085685009e-05, - "loss": 0.2686, - "step": 760 - }, - { - "epoch": 0.07822818246469572, - "grad_norm": 0.765625, - "learning_rate": 4.9418855467927894e-05, - "loss": 0.2051, - "step": 770 - }, - { - "epoch": 0.07924413288631514, - "grad_norm": 1.796875, - "learning_rate": 4.940144848105379e-05, - "loss": 0.2267, - "step": 780 - }, - { - "epoch": 0.08026008330793458, - "grad_norm": 4.5625, - "learning_rate": 4.93837877888615e-05, - "loss": 0.2597, - "step": 790 - }, - { - "epoch": 0.081276033729554, - "grad_norm": 3.03125, - "learning_rate": 4.9365873574971745e-05, - "loss": 0.3701, - "step": 800 - }, - { - "epoch": 0.08229198415117342, - "grad_norm": 4.5625, - "learning_rate": 4.9347706025641136e-05, - "loss": 0.2559, - "step": 810 - }, - { - "epoch": 0.08330793457279284, - "grad_norm": 3.90625, - "learning_rate": 4.9329285329760275e-05, - "loss": 0.2799, - "step": 820 - }, - { - "epoch": 0.08432388499441228, - "grad_norm": 3.140625, - "learning_rate": 4.9310611678851735e-05, - "loss": 0.2866, - "step": 830 - }, - { - "epoch": 0.0853398354160317, - "grad_norm": 2.46875, - "learning_rate": 4.929168526706811e-05, - "loss": 0.3105, - "step": 840 - }, - { - "epoch": 0.08635578583765112, - "grad_norm": 13.625, - "learning_rate": 4.927250629119e-05, - "loss": 0.2454, - "step": 850 - }, - { - "epoch": 0.08737173625927054, - "grad_norm": 3.921875, - "learning_rate": 4.9253074950623925e-05, - "loss": 0.2424, - "step": 860 - }, - { - "epoch": 0.08838768668088998, - "grad_norm": 2.90625, - "learning_rate": 4.9233391447400286e-05, - "loss": 0.2481, - "step": 870 - }, - { - "epoch": 0.0894036371025094, - "grad_norm": 2.96875, - "learning_rate": 4.921345598617125e-05, - "loss": 0.2231, - "step": 880 - }, - { - "epoch": 0.09041958752412882, - "grad_norm": 5.375, - "learning_rate": 4.9193268774208654e-05, - "loss": 0.3447, - "step": 890 - }, - { - "epoch": 0.09143553794574824, - "grad_norm": 2.0, - "learning_rate": 4.9172830021401785e-05, - "loss": 0.229, - "step": 900 - }, - { - "epoch": 0.09245148836736768, - "grad_norm": 3.1875, - "learning_rate": 4.9152139940255245e-05, - "loss": 0.2122, - "step": 910 - }, - { - "epoch": 0.0934674387889871, - "grad_norm": 3.40625, - "learning_rate": 4.913119874588677e-05, - "loss": 0.2386, - "step": 920 - }, - { - "epoch": 0.09448338921060652, - "grad_norm": 1.4609375, - "learning_rate": 4.911000665602489e-05, - "loss": 0.1944, - "step": 930 - }, - { - "epoch": 0.09549933963222594, - "grad_norm": 5.0625, - "learning_rate": 4.9088563891006786e-05, - "loss": 0.2038, - "step": 940 - }, - { - "epoch": 0.09651529005384538, - "grad_norm": 4.53125, - "learning_rate": 4.906687067377592e-05, - "loss": 0.3122, - "step": 950 - }, - { - "epoch": 0.0975312404754648, - "grad_norm": 2.84375, - "learning_rate": 4.904492722987976e-05, - "loss": 0.3157, - "step": 960 - }, - { - "epoch": 0.09854719089708422, - "grad_norm": 2.171875, - "learning_rate": 4.902273378746738e-05, - "loss": 0.3077, - "step": 970 - }, - { - "epoch": 0.09956314131870364, - "grad_norm": 2.84375, - "learning_rate": 4.9000290577287165e-05, - "loss": 0.2756, - "step": 980 - }, - { - "epoch": 0.10057909174032308, - "grad_norm": 0.99609375, - "learning_rate": 4.897759783268434e-05, - "loss": 0.2915, - "step": 990 - }, - { - "epoch": 0.1015950421619425, - "grad_norm": 3.53125, - "learning_rate": 4.895465578959859e-05, - "loss": 0.2052, - "step": 1000 - }, - { - "epoch": 0.10261099258356192, - "grad_norm": 4.0, - "learning_rate": 4.893146468656159e-05, - "loss": 0.2499, - "step": 1010 - }, - { - "epoch": 0.10362694300518134, - "grad_norm": 1.65625, - "learning_rate": 4.890802476469452e-05, - "loss": 0.278, - "step": 1020 - }, - { - "epoch": 0.10464289342680078, - "grad_norm": 3.625, - "learning_rate": 4.888433626770558e-05, - "loss": 0.2143, - "step": 1030 - }, - { - "epoch": 0.1056588438484202, - "grad_norm": 5.0625, - "learning_rate": 4.886039944188741e-05, - "loss": 0.2878, - "step": 1040 - }, - { - "epoch": 0.10667479427003962, - "grad_norm": 4.5, - "learning_rate": 4.883621453611461e-05, - "loss": 0.2744, - "step": 1050 - }, - { - "epoch": 0.10769074469165904, - "grad_norm": 4.5625, - "learning_rate": 4.881178180184106e-05, - "loss": 0.2734, - "step": 1060 - }, - { - "epoch": 0.10870669511327848, - "grad_norm": 3.125, - "learning_rate": 4.878710149309735e-05, - "loss": 0.3574, - "step": 1070 - }, - { - "epoch": 0.1097226455348979, - "grad_norm": 3.0625, - "learning_rate": 4.876217386648816e-05, - "loss": 0.2625, - "step": 1080 - }, - { - "epoch": 0.11073859595651732, - "grad_norm": 4.0625, - "learning_rate": 4.873699918118955e-05, - "loss": 0.2437, - "step": 1090 - }, - { - "epoch": 0.11175454637813674, - "grad_norm": 1.59375, - "learning_rate": 4.87115776989463e-05, - "loss": 0.2051, - "step": 1100 - }, - { - "epoch": 0.11277049679975618, - "grad_norm": 4.375, - "learning_rate": 4.8685909684069153e-05, - "loss": 0.1727, - "step": 1110 - }, - { - "epoch": 0.1137864472213756, - "grad_norm": 2.28125, - "learning_rate": 4.865999540343211e-05, - "loss": 0.2256, - "step": 1120 - }, - { - "epoch": 0.11480239764299502, - "grad_norm": 2.265625, - "learning_rate": 4.86338351264696e-05, - "loss": 0.3529, - "step": 1130 - }, - { - "epoch": 0.11581834806461444, - "grad_norm": 2.34375, - "learning_rate": 4.8607429125173754e-05, - "loss": 0.2113, - "step": 1140 - }, - { - "epoch": 0.11683429848623388, - "grad_norm": 0.7578125, - "learning_rate": 4.858077767409149e-05, - "loss": 0.2759, - "step": 1150 - }, - { - "epoch": 0.1178502489078533, - "grad_norm": 3.640625, - "learning_rate": 4.855388105032174e-05, - "loss": 0.2482, - "step": 1160 - }, - { - "epoch": 0.11886619932947272, - "grad_norm": 3.5, - "learning_rate": 4.852673953351249e-05, - "loss": 0.1865, - "step": 1170 - }, - { - "epoch": 0.11988214975109214, - "grad_norm": 3.75, - "learning_rate": 4.849935340585796e-05, - "loss": 0.2659, - "step": 1180 - }, - { - "epoch": 0.12089810017271158, - "grad_norm": 3.375, - "learning_rate": 4.8471722952095586e-05, - "loss": 0.1506, - "step": 1190 - }, - { - "epoch": 0.121914050594331, - "grad_norm": 3.34375, - "learning_rate": 4.844384845950312e-05, - "loss": 0.307, - "step": 1200 - }, - { - "epoch": 0.12293000101595042, - "grad_norm": 1.578125, - "learning_rate": 4.841573021789561e-05, - "loss": 0.1952, - "step": 1210 - }, - { - "epoch": 0.12394595143756984, - "grad_norm": 1.2890625, - "learning_rate": 4.838736851962239e-05, - "loss": 0.1779, - "step": 1220 - }, - { - "epoch": 0.12496190185918928, - "grad_norm": 1.265625, - "learning_rate": 4.835876365956408e-05, - "loss": 0.1235, - "step": 1230 - }, - { - "epoch": 0.12597785228080868, - "grad_norm": 1.9609375, - "learning_rate": 4.8329915935129436e-05, - "loss": 0.1876, - "step": 1240 - }, - { - "epoch": 0.12699380270242813, - "grad_norm": 1.6328125, - "learning_rate": 4.830082564625235e-05, - "loss": 0.2188, - "step": 1250 - }, - { - "epoch": 0.12800975312404755, - "grad_norm": 3.96875, - "learning_rate": 4.8271493095388684e-05, - "loss": 0.2622, - "step": 1260 - }, - { - "epoch": 0.12902570354566698, - "grad_norm": 3.765625, - "learning_rate": 4.824191858751312e-05, - "loss": 0.2724, - "step": 1270 - }, - { - "epoch": 0.1300416539672864, - "grad_norm": 5.59375, - "learning_rate": 4.821210243011601e-05, - "loss": 0.2413, - "step": 1280 - }, - { - "epoch": 0.13105760438890582, - "grad_norm": 3.34375, - "learning_rate": 4.818204493320016e-05, - "loss": 0.2618, - "step": 1290 - }, - { - "epoch": 0.13207355481052524, - "grad_norm": 2.78125, - "learning_rate": 4.8151746409277634e-05, - "loss": 0.2295, - "step": 1300 - }, - { - "epoch": 0.13308950523214466, - "grad_norm": 3.1875, - "learning_rate": 4.8121207173366484e-05, - "loss": 0.2733, - "step": 1310 - }, - { - "epoch": 0.13410545565376408, - "grad_norm": 2.28125, - "learning_rate": 4.809042754298746e-05, - "loss": 0.2311, - "step": 1320 - }, - { - "epoch": 0.13512140607538353, - "grad_norm": 2.171875, - "learning_rate": 4.805940783816075e-05, - "loss": 0.2059, - "step": 1330 - }, - { - "epoch": 0.13613735649700295, - "grad_norm": 2.796875, - "learning_rate": 4.8028148381402625e-05, - "loss": 0.2102, - "step": 1340 - }, - { - "epoch": 0.13715330691862238, - "grad_norm": 2.96875, - "learning_rate": 4.7996649497722084e-05, - "loss": 0.2708, - "step": 1350 - }, - { - "epoch": 0.1381692573402418, - "grad_norm": 2.4375, - "learning_rate": 4.7964911514617485e-05, - "loss": 0.2429, - "step": 1360 - }, - { - "epoch": 0.13918520776186122, - "grad_norm": 5.8125, - "learning_rate": 4.793293476207312e-05, - "loss": 0.2725, - "step": 1370 - }, - { - "epoch": 0.14020115818348064, - "grad_norm": 2.40625, - "learning_rate": 4.790071957255585e-05, - "loss": 0.2098, - "step": 1380 - }, - { - "epoch": 0.14121710860510006, - "grad_norm": 4.25, - "learning_rate": 4.786826628101154e-05, - "loss": 0.2101, - "step": 1390 - }, - { - "epoch": 0.14223305902671948, - "grad_norm": 2.578125, - "learning_rate": 4.783557522486167e-05, - "loss": 0.2624, - "step": 1400 - }, - { - "epoch": 0.14324900944833893, - "grad_norm": 3.125, - "learning_rate": 4.780264674399978e-05, - "loss": 0.2518, - "step": 1410 - }, - { - "epoch": 0.14426495986995835, - "grad_norm": 3.671875, - "learning_rate": 4.7769481180787966e-05, - "loss": 0.3112, - "step": 1420 - }, - { - "epoch": 0.14528091029157778, - "grad_norm": 3.984375, - "learning_rate": 4.773607888005327e-05, - "loss": 0.2747, - "step": 1430 - }, - { - "epoch": 0.1462968607131972, - "grad_norm": 3.234375, - "learning_rate": 4.770244018908416e-05, - "loss": 0.1572, - "step": 1440 - }, - { - "epoch": 0.14731281113481662, - "grad_norm": 4.09375, - "learning_rate": 4.766856545762687e-05, - "loss": 0.2148, - "step": 1450 - }, - { - "epoch": 0.14832876155643604, - "grad_norm": 1.6875, - "learning_rate": 4.763445503788178e-05, - "loss": 0.2531, - "step": 1460 - }, - { - "epoch": 0.14934471197805546, - "grad_norm": 2.375, - "learning_rate": 4.760010928449976e-05, - "loss": 0.199, - "step": 1470 - }, - { - "epoch": 0.15036066239967488, - "grad_norm": 4.6875, - "learning_rate": 4.7565528554578485e-05, - "loss": 0.2366, - "step": 1480 - }, - { - "epoch": 0.15137661282129433, - "grad_norm": 5.4375, - "learning_rate": 4.75307132076587e-05, - "loss": 0.1862, - "step": 1490 - }, - { - "epoch": 0.15239256324291375, - "grad_norm": 2.484375, - "learning_rate": 4.749566360572049e-05, - "loss": 0.2143, - "step": 1500 - }, - { - "epoch": 0.15340851366453317, - "grad_norm": 2.1875, - "learning_rate": 4.746038011317955e-05, - "loss": 0.1877, - "step": 1510 - }, - { - "epoch": 0.1544244640861526, - "grad_norm": 2.84375, - "learning_rate": 4.742486309688333e-05, - "loss": 0.2831, - "step": 1520 - }, - { - "epoch": 0.15544041450777202, - "grad_norm": 2.015625, - "learning_rate": 4.738911292610732e-05, - "loss": 0.1708, - "step": 1530 - }, - { - "epoch": 0.15645636492939144, - "grad_norm": 3.953125, - "learning_rate": 4.735312997255107e-05, - "loss": 0.192, - "step": 1540 - }, - { - "epoch": 0.15747231535101086, - "grad_norm": 2.09375, - "learning_rate": 4.7316914610334475e-05, - "loss": 0.2586, - "step": 1550 - }, - { - "epoch": 0.15848826577263028, - "grad_norm": 3.6875, - "learning_rate": 4.728046721599378e-05, - "loss": 0.2141, - "step": 1560 - }, - { - "epoch": 0.15950421619424973, - "grad_norm": 2.9375, - "learning_rate": 4.724378816847771e-05, - "loss": 0.193, - "step": 1570 - }, - { - "epoch": 0.16052016661586915, - "grad_norm": 1.5625, - "learning_rate": 4.720687784914352e-05, - "loss": 0.191, - "step": 1580 - }, - { - "epoch": 0.16153611703748857, - "grad_norm": 3.75, - "learning_rate": 4.716973664175304e-05, - "loss": 0.2172, - "step": 1590 - }, - { - "epoch": 0.162552067459108, - "grad_norm": 3.125, - "learning_rate": 4.7132364932468645e-05, - "loss": 0.2134, - "step": 1600 - }, - { - "epoch": 0.16356801788072742, - "grad_norm": 4.09375, - "learning_rate": 4.709476310984932e-05, - "loss": 0.2055, - "step": 1610 - }, - { - "epoch": 0.16458396830234684, - "grad_norm": 3.875, - "learning_rate": 4.705693156484652e-05, - "loss": 0.2136, - "step": 1620 - }, - { - "epoch": 0.16559991872396626, - "grad_norm": 1.1796875, - "learning_rate": 4.7018870690800196e-05, - "loss": 0.1471, - "step": 1630 - }, - { - "epoch": 0.16661586914558568, - "grad_norm": 2.5, - "learning_rate": 4.698058088343465e-05, - "loss": 0.2308, - "step": 1640 - }, - { - "epoch": 0.16763181956720513, - "grad_norm": 1.390625, - "learning_rate": 4.6942062540854425e-05, - "loss": 0.2456, - "step": 1650 - }, - { - "epoch": 0.16864776998882455, - "grad_norm": 3.125, - "learning_rate": 4.69033160635402e-05, - "loss": 0.2654, - "step": 1660 - }, - { - "epoch": 0.16966372041044397, - "grad_norm": 3.984375, - "learning_rate": 4.6864341854344587e-05, - "loss": 0.2226, - "step": 1670 - }, - { - "epoch": 0.1706796708320634, - "grad_norm": 2.328125, - "learning_rate": 4.682514031848795e-05, - "loss": 0.2438, - "step": 1680 - }, - { - "epoch": 0.17169562125368282, - "grad_norm": 3.078125, - "learning_rate": 4.678571186355423e-05, - "loss": 0.1889, - "step": 1690 - }, - { - "epoch": 0.17271157167530224, - "grad_norm": 3.328125, - "learning_rate": 4.6746056899486644e-05, - "loss": 0.2117, - "step": 1700 - }, - { - "epoch": 0.17372752209692166, - "grad_norm": 2.78125, - "learning_rate": 4.67061758385835e-05, - "loss": 0.1953, - "step": 1710 - }, - { - "epoch": 0.17474347251854108, - "grad_norm": 3.09375, - "learning_rate": 4.6666069095493816e-05, - "loss": 0.1844, - "step": 1720 - }, - { - "epoch": 0.17575942294016053, - "grad_norm": 3.234375, - "learning_rate": 4.662573708721309e-05, - "loss": 0.2774, - "step": 1730 - }, - { - "epoch": 0.17677537336177995, - "grad_norm": 4.03125, - "learning_rate": 4.658518023307894e-05, - "loss": 0.2527, - "step": 1740 - }, - { - "epoch": 0.17779132378339937, - "grad_norm": 3.21875, - "learning_rate": 4.654439895476671e-05, - "loss": 0.2164, - "step": 1750 - }, - { - "epoch": 0.1788072742050188, - "grad_norm": 2.390625, - "learning_rate": 4.6503393676285146e-05, - "loss": 0.2424, - "step": 1760 - }, - { - "epoch": 0.17982322462663822, - "grad_norm": 1.8359375, - "learning_rate": 4.646216482397192e-05, - "loss": 0.2428, - "step": 1770 - }, - { - "epoch": 0.18083917504825764, - "grad_norm": 2.796875, - "learning_rate": 4.6420712826489275e-05, - "loss": 0.2155, - "step": 1780 - }, - { - "epoch": 0.18185512546987706, - "grad_norm": 0.69921875, - "learning_rate": 4.6379038114819485e-05, - "loss": 0.1544, - "step": 1790 - }, - { - "epoch": 0.18287107589149648, - "grad_norm": 3.40625, - "learning_rate": 4.6337141122260444e-05, - "loss": 0.2029, - "step": 1800 - }, - { - "epoch": 0.18388702631311593, - "grad_norm": 2.359375, - "learning_rate": 4.629502228442112e-05, - "loss": 0.1489, - "step": 1810 - }, - { - "epoch": 0.18490297673473535, - "grad_norm": 1.4453125, - "learning_rate": 4.6252682039217045e-05, - "loss": 0.2101, - "step": 1820 - }, - { - "epoch": 0.18591892715635477, - "grad_norm": 2.71875, - "learning_rate": 4.621012082686573e-05, - "loss": 0.2076, - "step": 1830 - }, - { - "epoch": 0.1869348775779742, - "grad_norm": 3.0625, - "learning_rate": 4.616733908988216e-05, - "loss": 0.2719, - "step": 1840 - }, - { - "epoch": 0.18795082799959362, - "grad_norm": 1.953125, - "learning_rate": 4.612433727307409e-05, - "loss": 0.2105, - "step": 1850 - }, - { - "epoch": 0.18896677842121304, - "grad_norm": 3.46875, - "learning_rate": 4.608111582353751e-05, - "loss": 0.1877, - "step": 1860 - }, - { - "epoch": 0.18998272884283246, - "grad_norm": 2.546875, - "learning_rate": 4.603767519065197e-05, - "loss": 0.2238, - "step": 1870 - }, - { - "epoch": 0.19099867926445188, - "grad_norm": 1.5703125, - "learning_rate": 4.599401582607589e-05, - "loss": 0.243, - "step": 1880 - }, - { - "epoch": 0.19201462968607133, - "grad_norm": 2.4375, - "learning_rate": 4.595013818374185e-05, - "loss": 0.1867, - "step": 1890 - }, - { - "epoch": 0.19303058010769075, - "grad_norm": 2.203125, - "learning_rate": 4.5906042719851925e-05, - "loss": 0.1994, - "step": 1900 - }, - { - "epoch": 0.19404653052931017, - "grad_norm": 3.984375, - "learning_rate": 4.586172989287291e-05, - "loss": 0.1899, - "step": 1910 - }, - { - "epoch": 0.1950624809509296, - "grad_norm": 2.6875, - "learning_rate": 4.5817200163531534e-05, - "loss": 0.2528, - "step": 1920 - }, - { - "epoch": 0.19607843137254902, - "grad_norm": 2.71875, - "learning_rate": 4.577245399480972e-05, - "loss": 0.2336, - "step": 1930 - }, - { - "epoch": 0.19709438179416844, - "grad_norm": 2.640625, - "learning_rate": 4.5727491851939715e-05, - "loss": 0.2204, - "step": 1940 - }, - { - "epoch": 0.19811033221578786, - "grad_norm": 1.78125, - "learning_rate": 4.568231420239929e-05, - "loss": 0.1656, - "step": 1950 - }, - { - "epoch": 0.19912628263740728, - "grad_norm": 3.15625, - "learning_rate": 4.563692151590687e-05, - "loss": 0.2105, - "step": 1960 - }, - { - "epoch": 0.20014223305902673, - "grad_norm": 1.3671875, - "learning_rate": 4.5591314264416666e-05, - "loss": 0.1464, - "step": 1970 - }, - { - "epoch": 0.20115818348064615, - "grad_norm": 4.25, - "learning_rate": 4.554549292211371e-05, - "loss": 0.2103, - "step": 1980 - }, - { - "epoch": 0.20217413390226557, - "grad_norm": 2.984375, - "learning_rate": 4.549945796540901e-05, - "loss": 0.144, - "step": 1990 - }, - { - "epoch": 0.203190084323885, - "grad_norm": 1.859375, - "learning_rate": 4.545320987293453e-05, - "loss": 0.1963, - "step": 2000 - }, - { - "epoch": 0.20420603474550442, - "grad_norm": 1.078125, - "learning_rate": 4.540674912553824e-05, - "loss": 0.2115, - "step": 2010 - }, - { - "epoch": 0.20522198516712384, - "grad_norm": 4.25, - "learning_rate": 4.536007620627911e-05, - "loss": 0.1682, - "step": 2020 - }, - { - "epoch": 0.20623793558874326, - "grad_norm": 2.71875, - "learning_rate": 4.531319160042212e-05, - "loss": 0.1992, - "step": 2030 - }, - { - "epoch": 0.20725388601036268, - "grad_norm": 1.2890625, - "learning_rate": 4.5266095795433126e-05, - "loss": 0.1134, - "step": 2040 - }, - { - "epoch": 0.20826983643198213, - "grad_norm": 3.296875, - "learning_rate": 4.5218789280973925e-05, - "loss": 0.1474, - "step": 2050 - }, - { - "epoch": 0.20928578685360155, - "grad_norm": 1.9375, - "learning_rate": 4.5171272548897024e-05, - "loss": 0.1955, - "step": 2060 - }, - { - "epoch": 0.21030173727522097, - "grad_norm": 2.734375, - "learning_rate": 4.512354609324063e-05, - "loss": 0.2042, - "step": 2070 - }, - { - "epoch": 0.2113176876968404, - "grad_norm": 2.921875, - "learning_rate": 4.507561041022347e-05, - "loss": 0.2174, - "step": 2080 - }, - { - "epoch": 0.21233363811845982, - "grad_norm": 2.40625, - "learning_rate": 4.502746599823963e-05, - "loss": 0.2634, - "step": 2090 - }, - { - "epoch": 0.21334958854007924, - "grad_norm": 1.71875, - "learning_rate": 4.497911335785339e-05, - "loss": 0.1884, - "step": 2100 - }, - { - "epoch": 0.21436553896169866, - "grad_norm": 0.79296875, - "learning_rate": 4.4930552991794e-05, - "loss": 0.1872, - "step": 2110 - }, - { - "epoch": 0.21538148938331808, - "grad_norm": 3.171875, - "learning_rate": 4.4881785404950474e-05, - "loss": 0.2233, - "step": 2120 - }, - { - "epoch": 0.21639743980493753, - "grad_norm": 2.59375, - "learning_rate": 4.483281110436631e-05, - "loss": 0.2374, - "step": 2130 - }, - { - "epoch": 0.21741339022655695, - "grad_norm": 3.328125, - "learning_rate": 4.478363059923426e-05, - "loss": 0.2545, - "step": 2140 - }, - { - "epoch": 0.21842934064817637, - "grad_norm": 2.3125, - "learning_rate": 4.4734244400891014e-05, - "loss": 0.2063, - "step": 2150 - }, - { - "epoch": 0.2194452910697958, - "grad_norm": 3.40625, - "learning_rate": 4.4684653022811865e-05, - "loss": 0.1219, - "step": 2160 - }, - { - "epoch": 0.22046124149141522, - "grad_norm": 4.1875, - "learning_rate": 4.463485698060541e-05, - "loss": 0.2805, - "step": 2170 - }, - { - "epoch": 0.22147719191303464, - "grad_norm": 2.3125, - "learning_rate": 4.458485679200814e-05, - "loss": 0.1998, - "step": 2180 - }, - { - "epoch": 0.22249314233465406, - "grad_norm": 3.578125, - "learning_rate": 4.453465297687912e-05, - "loss": 0.2489, - "step": 2190 - }, - { - "epoch": 0.22350909275627348, - "grad_norm": 2.59375, - "learning_rate": 4.448424605719452e-05, - "loss": 0.2731, - "step": 2200 - }, - { - "epoch": 0.22452504317789293, - "grad_norm": 3.28125, - "learning_rate": 4.443363655704224e-05, - "loss": 0.2425, - "step": 2210 - }, - { - "epoch": 0.22554099359951235, - "grad_norm": 2.78125, - "learning_rate": 4.438282500261641e-05, - "loss": 0.2938, - "step": 2220 - }, - { - "epoch": 0.22655694402113177, - "grad_norm": 1.1953125, - "learning_rate": 4.433181192221197e-05, - "loss": 0.1728, - "step": 2230 - }, - { - "epoch": 0.2275728944427512, - "grad_norm": 1.34375, - "learning_rate": 4.4280597846219155e-05, - "loss": 0.216, - "step": 2240 - }, - { - "epoch": 0.22858884486437062, - "grad_norm": 1.8515625, - "learning_rate": 4.422918330711796e-05, - "loss": 0.1612, - "step": 2250 - }, - { - "epoch": 0.22960479528599004, - "grad_norm": 1.90625, - "learning_rate": 4.417756883947263e-05, - "loss": 0.107, - "step": 2260 - }, - { - "epoch": 0.23062074570760946, - "grad_norm": 3.375, - "learning_rate": 4.412575497992611e-05, - "loss": 0.1756, - "step": 2270 - }, - { - "epoch": 0.23163669612922888, - "grad_norm": 4.375, - "learning_rate": 4.407374226719445e-05, - "loss": 0.234, - "step": 2280 - }, - { - "epoch": 0.23265264655084833, - "grad_norm": 3.25, - "learning_rate": 4.402153124206119e-05, - "loss": 0.2144, - "step": 2290 - }, - { - "epoch": 0.23366859697246775, - "grad_norm": 1.703125, - "learning_rate": 4.396912244737173e-05, - "loss": 0.1696, - "step": 2300 - }, - { - "epoch": 0.23468454739408717, - "grad_norm": 2.84375, - "learning_rate": 4.391651642802778e-05, - "loss": 0.2506, - "step": 2310 - }, - { - "epoch": 0.2357004978157066, - "grad_norm": 4.5, - "learning_rate": 4.386371373098155e-05, - "loss": 0.1686, - "step": 2320 - }, - { - "epoch": 0.23671644823732602, - "grad_norm": 2.515625, - "learning_rate": 4.381071490523018e-05, - "loss": 0.2403, - "step": 2330 - }, - { - "epoch": 0.23773239865894544, - "grad_norm": 4.4375, - "learning_rate": 4.3757520501809955e-05, - "loss": 0.1611, - "step": 2340 - }, - { - "epoch": 0.23874834908056486, - "grad_norm": 1.609375, - "learning_rate": 4.370413107379065e-05, - "loss": 0.1698, - "step": 2350 - }, - { - "epoch": 0.23976429950218428, - "grad_norm": 4.96875, - "learning_rate": 4.36505471762697e-05, - "loss": 0.1928, - "step": 2360 - }, - { - "epoch": 0.24078024992380373, - "grad_norm": 0.8984375, - "learning_rate": 4.3596769366366474e-05, - "loss": 0.2035, - "step": 2370 - }, - { - "epoch": 0.24179620034542315, - "grad_norm": 5.75, - "learning_rate": 4.354279820321649e-05, - "loss": 0.16, - "step": 2380 - }, - { - "epoch": 0.24281215076704257, - "grad_norm": 1.9453125, - "learning_rate": 4.34886342479656e-05, - "loss": 0.1851, - "step": 2390 - }, - { - "epoch": 0.243828101188662, - "grad_norm": 1.015625, - "learning_rate": 4.34342780637641e-05, - "loss": 0.1726, - "step": 2400 - }, - { - "epoch": 0.24484405161028142, - "grad_norm": 4.59375, - "learning_rate": 4.337973021576095e-05, - "loss": 0.2847, - "step": 2410 - }, - { - "epoch": 0.24586000203190084, - "grad_norm": 1.03125, - "learning_rate": 4.3324991271097846e-05, - "loss": 0.2528, - "step": 2420 - }, - { - "epoch": 0.24687595245352026, - "grad_norm": 2.1875, - "learning_rate": 4.3270061798903374e-05, - "loss": 0.1573, - "step": 2430 - }, - { - "epoch": 0.24789190287513968, - "grad_norm": 0.98046875, - "learning_rate": 4.321494237028701e-05, - "loss": 0.1703, - "step": 2440 - }, - { - "epoch": 0.24890785329675913, - "grad_norm": 3.8125, - "learning_rate": 4.31596335583333e-05, - "loss": 0.2613, - "step": 2450 - }, - { - "epoch": 0.24992380371837855, - "grad_norm": 4.0625, - "learning_rate": 4.310413593809579e-05, - "loss": 0.22, - "step": 2460 - }, - { - "epoch": 0.250939754139998, - "grad_norm": 3.15625, - "learning_rate": 4.304845008659108e-05, - "loss": 0.1263, - "step": 2470 - }, - { - "epoch": 0.25195570456161737, - "grad_norm": 3.046875, - "learning_rate": 4.2992576582792895e-05, - "loss": 0.1639, - "step": 2480 - }, - { - "epoch": 0.2529716549832368, - "grad_norm": 9.8125, - "learning_rate": 4.293651600762595e-05, - "loss": 0.2681, - "step": 2490 - }, - { - "epoch": 0.25398760540485626, - "grad_norm": 3.734375, - "learning_rate": 4.288026894395999e-05, - "loss": 0.2292, - "step": 2500 - }, - { - "epoch": 0.25500355582647566, - "grad_norm": 0.455078125, - "learning_rate": 4.2823835976603723e-05, - "loss": 0.2324, - "step": 2510 - }, - { - "epoch": 0.2560195062480951, - "grad_norm": 5.625, - "learning_rate": 4.276721769229869e-05, - "loss": 0.1834, - "step": 2520 - }, - { - "epoch": 0.2570354566697145, - "grad_norm": 1.3671875, - "learning_rate": 4.271041467971323e-05, - "loss": 0.1826, - "step": 2530 - }, - { - "epoch": 0.25805140709133395, - "grad_norm": 5.0625, - "learning_rate": 4.265342752943632e-05, - "loss": 0.2463, - "step": 2540 - }, - { - "epoch": 0.25906735751295334, - "grad_norm": 2.859375, - "learning_rate": 4.2596256833971425e-05, - "loss": 0.2598, - "step": 2550 - }, - { - "epoch": 0.2600833079345728, - "grad_norm": 1.8515625, - "learning_rate": 4.2538903187730374e-05, - "loss": 0.1148, - "step": 2560 - }, - { - "epoch": 0.26109925835619224, - "grad_norm": 2.71875, - "learning_rate": 4.248136718702716e-05, - "loss": 0.2123, - "step": 2570 - }, - { - "epoch": 0.26211520877781164, - "grad_norm": 4.5625, - "learning_rate": 4.242364943007172e-05, - "loss": 0.2369, - "step": 2580 - }, - { - "epoch": 0.2631311591994311, - "grad_norm": 2.296875, - "learning_rate": 4.236575051696377e-05, - "loss": 0.261, - "step": 2590 - }, - { - "epoch": 0.2641471096210505, - "grad_norm": 2.75, - "learning_rate": 4.2307671049686514e-05, - "loss": 0.1564, - "step": 2600 - }, - { - "epoch": 0.26516306004266993, - "grad_norm": 3.5, - "learning_rate": 4.2249411632100396e-05, - "loss": 0.1563, - "step": 2610 - }, - { - "epoch": 0.2661790104642893, - "grad_norm": 2.84375, - "learning_rate": 4.219097286993684e-05, - "loss": 0.1697, - "step": 2620 - }, - { - "epoch": 0.26719496088590877, - "grad_norm": 2.125, - "learning_rate": 4.2132355370791946e-05, - "loss": 0.1844, - "step": 2630 - }, - { - "epoch": 0.26821091130752817, - "grad_norm": 4.03125, - "learning_rate": 4.2073559744120156e-05, - "loss": 0.2144, - "step": 2640 - }, - { - "epoch": 0.2692268617291476, - "grad_norm": 2.375, - "learning_rate": 4.201458660122793e-05, - "loss": 0.2013, - "step": 2650 - }, - { - "epoch": 0.27024281215076706, - "grad_norm": 3.625, - "learning_rate": 4.1955436555267393e-05, - "loss": 0.2166, - "step": 2660 - }, - { - "epoch": 0.27125876257238646, - "grad_norm": 0.328125, - "learning_rate": 4.189611022122997e-05, - "loss": 0.1934, - "step": 2670 - }, - { - "epoch": 0.2722747129940059, - "grad_norm": 2.75, - "learning_rate": 4.1836608215939944e-05, - "loss": 0.2157, - "step": 2680 - }, - { - "epoch": 0.2732906634156253, - "grad_norm": 3.5, - "learning_rate": 4.17769311580481e-05, - "loss": 0.18, - "step": 2690 - }, - { - "epoch": 0.27430661383724475, - "grad_norm": 2.109375, - "learning_rate": 4.171707966802528e-05, - "loss": 0.2178, - "step": 2700 - }, - { - "epoch": 0.27532256425886414, - "grad_norm": 4.65625, - "learning_rate": 4.16570543681559e-05, - "loss": 0.1896, - "step": 2710 - }, - { - "epoch": 0.2763385146804836, - "grad_norm": 4.8125, - "learning_rate": 4.159685588253151e-05, - "loss": 0.1322, - "step": 2720 - }, - { - "epoch": 0.27735446510210304, - "grad_norm": 3.9375, - "learning_rate": 4.153648483704429e-05, - "loss": 0.184, - "step": 2730 - }, - { - "epoch": 0.27837041552372244, - "grad_norm": 4.53125, - "learning_rate": 4.147594185938057e-05, - "loss": 0.2451, - "step": 2740 - }, - { - "epoch": 0.2793863659453419, - "grad_norm": 1.0390625, - "learning_rate": 4.141522757901426e-05, - "loss": 0.2367, - "step": 2750 - }, - { - "epoch": 0.2804023163669613, - "grad_norm": 3.375, - "learning_rate": 4.1354342627200345e-05, - "loss": 0.179, - "step": 2760 - }, - { - "epoch": 0.28141826678858073, - "grad_norm": 2.953125, - "learning_rate": 4.1293287636968286e-05, - "loss": 0.1396, - "step": 2770 - }, - { - "epoch": 0.2824342172102001, - "grad_norm": 2.546875, - "learning_rate": 4.1232063243115485e-05, - "loss": 0.1963, - "step": 2780 - }, - { - "epoch": 0.28345016763181957, - "grad_norm": 5.09375, - "learning_rate": 4.117067008220063e-05, - "loss": 0.2457, - "step": 2790 - }, - { - "epoch": 0.28446611805343897, - "grad_norm": 2.046875, - "learning_rate": 4.110910879253712e-05, - "loss": 0.2262, - "step": 2800 - }, - { - "epoch": 0.2854820684750584, - "grad_norm": 2.1875, - "learning_rate": 4.104738001418641e-05, - "loss": 0.2499, - "step": 2810 - }, - { - "epoch": 0.28649801889667786, - "grad_norm": 2.59375, - "learning_rate": 4.098548438895135e-05, - "loss": 0.1667, - "step": 2820 - }, - { - "epoch": 0.28751396931829726, - "grad_norm": 2.875, - "learning_rate": 4.092342256036954e-05, - "loss": 0.2288, - "step": 2830 - }, - { - "epoch": 0.2885299197399167, - "grad_norm": 3.015625, - "learning_rate": 4.086119517370659e-05, - "loss": 0.2038, - "step": 2840 - }, - { - "epoch": 0.2895458701615361, - "grad_norm": 3.53125, - "learning_rate": 4.0798802875949485e-05, - "loss": 0.181, - "step": 2850 - }, - { - "epoch": 0.29056182058315555, - "grad_norm": 2.296875, - "learning_rate": 4.073624631579975e-05, - "loss": 0.1886, - "step": 2860 - }, - { - "epoch": 0.29157777100477494, - "grad_norm": 3.609375, - "learning_rate": 4.067352614366685e-05, - "loss": 0.2053, - "step": 2870 - }, - { - "epoch": 0.2925937214263944, - "grad_norm": 2.328125, - "learning_rate": 4.061064301166128e-05, - "loss": 0.1409, - "step": 2880 - }, - { - "epoch": 0.29360967184801384, - "grad_norm": 4.9375, - "learning_rate": 4.054759757358787e-05, - "loss": 0.184, - "step": 2890 - }, - { - "epoch": 0.29462562226963324, - "grad_norm": 4.6875, - "learning_rate": 4.048439048493898e-05, - "loss": 0.2306, - "step": 2900 - }, - { - "epoch": 0.2956415726912527, - "grad_norm": 4.09375, - "learning_rate": 4.0421022402887676e-05, - "loss": 0.1914, - "step": 2910 - }, - { - "epoch": 0.2966575231128721, - "grad_norm": 2.3125, - "learning_rate": 4.035749398628088e-05, - "loss": 0.1653, - "step": 2920 - }, - { - "epoch": 0.29767347353449153, - "grad_norm": 2.515625, - "learning_rate": 4.029380589563256e-05, - "loss": 0.1941, - "step": 2930 - }, - { - "epoch": 0.2986894239561109, - "grad_norm": 1.78125, - "learning_rate": 4.02299587931168e-05, - "loss": 0.1117, - "step": 2940 - }, - { - "epoch": 0.29970537437773037, - "grad_norm": 0.8359375, - "learning_rate": 4.0165953342560974e-05, - "loss": 0.1605, - "step": 2950 - }, - { - "epoch": 0.30072132479934977, - "grad_norm": 3.046875, - "learning_rate": 4.010179020943884e-05, - "loss": 0.1726, - "step": 2960 - }, - { - "epoch": 0.3017372752209692, - "grad_norm": 3.453125, - "learning_rate": 4.003747006086357e-05, - "loss": 0.2208, - "step": 2970 - }, - { - "epoch": 0.30275322564258866, - "grad_norm": 2.515625, - "learning_rate": 3.9972993565580866e-05, - "loss": 0.1325, - "step": 2980 - }, - { - "epoch": 0.30376917606420806, - "grad_norm": 3.046875, - "learning_rate": 3.9908361393962e-05, - "loss": 0.2014, - "step": 2990 - }, - { - "epoch": 0.3047851264858275, - "grad_norm": 2.28125, - "learning_rate": 3.984357421799681e-05, - "loss": 0.165, - "step": 3000 - }, - { - "epoch": 0.3058010769074469, - "grad_norm": 5.09375, - "learning_rate": 3.9778632711286756e-05, - "loss": 0.212, - "step": 3010 - }, - { - "epoch": 0.30681702732906635, - "grad_norm": 4.25, - "learning_rate": 3.971353754903788e-05, - "loss": 0.2388, - "step": 3020 - }, - { - "epoch": 0.30783297775068574, - "grad_norm": 2.34375, - "learning_rate": 3.964828940805381e-05, - "loss": 0.2175, - "step": 3030 - }, - { - "epoch": 0.3088489281723052, - "grad_norm": 4.09375, - "learning_rate": 3.95828889667287e-05, - "loss": 0.2088, - "step": 3040 - }, - { - "epoch": 0.30986487859392464, - "grad_norm": 2.359375, - "learning_rate": 3.9517336905040244e-05, - "loss": 0.1913, - "step": 3050 - }, - { - "epoch": 0.31088082901554404, - "grad_norm": 1.1640625, - "learning_rate": 3.9451633904542483e-05, - "loss": 0.2185, - "step": 3060 - }, - { - "epoch": 0.3118967794371635, - "grad_norm": 2.59375, - "learning_rate": 3.9385780648358846e-05, - "loss": 0.2072, - "step": 3070 - }, - { - "epoch": 0.3129127298587829, - "grad_norm": 3.015625, - "learning_rate": 3.9319777821174955e-05, - "loss": 0.1902, - "step": 3080 - }, - { - "epoch": 0.31392868028040233, - "grad_norm": 2.375, - "learning_rate": 3.925362610923158e-05, - "loss": 0.259, - "step": 3090 - }, - { - "epoch": 0.3149446307020217, - "grad_norm": 4.65625, - "learning_rate": 3.918732620031742e-05, - "loss": 0.2026, - "step": 3100 - }, - { - "epoch": 0.31596058112364117, - "grad_norm": 2.1875, - "learning_rate": 3.912087878376205e-05, - "loss": 0.1478, - "step": 3110 - }, - { - "epoch": 0.31697653154526056, - "grad_norm": 2.34375, - "learning_rate": 3.905428455042865e-05, - "loss": 0.167, - "step": 3120 - }, - { - "epoch": 0.31799248196688, - "grad_norm": 2.390625, - "learning_rate": 3.898754419270693e-05, - "loss": 0.1629, - "step": 3130 - }, - { - "epoch": 0.31900843238849946, - "grad_norm": 1.546875, - "learning_rate": 3.892065840450583e-05, - "loss": 0.1308, - "step": 3140 - }, - { - "epoch": 0.32002438281011886, - "grad_norm": 4.625, - "learning_rate": 3.885362788124637e-05, - "loss": 0.2008, - "step": 3150 - }, - { - "epoch": 0.3210403332317383, - "grad_norm": 3.8125, - "learning_rate": 3.8786453319854396e-05, - "loss": 0.2225, - "step": 3160 - }, - { - "epoch": 0.3220562836533577, - "grad_norm": 3.015625, - "learning_rate": 3.8719135418753366e-05, - "loss": 0.2243, - "step": 3170 - }, - { - "epoch": 0.32307223407497715, - "grad_norm": 5.6875, - "learning_rate": 3.865167487785702e-05, - "loss": 0.1981, - "step": 3180 - }, - { - "epoch": 0.32408818449659654, - "grad_norm": 4.84375, - "learning_rate": 3.8584072398562164e-05, - "loss": 0.2031, - "step": 3190 - }, - { - "epoch": 0.325104134918216, - "grad_norm": 4.0625, - "learning_rate": 3.851632868374136e-05, - "loss": 0.1621, - "step": 3200 - }, - { - "epoch": 0.32612008533983544, - "grad_norm": 3.421875, - "learning_rate": 3.844844443773562e-05, - "loss": 0.1674, - "step": 3210 - }, - { - "epoch": 0.32713603576145484, - "grad_norm": 1.3671875, - "learning_rate": 3.8380420366347046e-05, - "loss": 0.1502, - "step": 3220 - }, - { - "epoch": 0.3281519861830743, - "grad_norm": 3.734375, - "learning_rate": 3.831225717683157e-05, - "loss": 0.1868, - "step": 3230 - }, - { - "epoch": 0.3291679366046937, - "grad_norm": 2.703125, - "learning_rate": 3.8243955577891534e-05, - "loss": 0.1818, - "step": 3240 - }, - { - "epoch": 0.3301838870263131, - "grad_norm": 3.796875, - "learning_rate": 3.8175516279668335e-05, - "loss": 0.2215, - "step": 3250 - }, - { - "epoch": 0.3311998374479325, - "grad_norm": 3.203125, - "learning_rate": 3.810693999373505e-05, - "loss": 0.2544, - "step": 3260 - }, - { - "epoch": 0.33221578786955197, - "grad_norm": 4.0, - "learning_rate": 3.8038227433089056e-05, - "loss": 0.1175, - "step": 3270 - }, - { - "epoch": 0.33323173829117136, - "grad_norm": 3.625, - "learning_rate": 3.796937931214458e-05, - "loss": 0.2213, - "step": 3280 - }, - { - "epoch": 0.3342476887127908, - "grad_norm": 1.7265625, - "learning_rate": 3.7900396346725296e-05, - "loss": 0.1711, - "step": 3290 - }, - { - "epoch": 0.33526363913441026, - "grad_norm": 3.140625, - "learning_rate": 3.783127925405686e-05, - "loss": 0.2628, - "step": 3300 - }, - { - "epoch": 0.33627958955602966, - "grad_norm": 2.1875, - "learning_rate": 3.77620287527595e-05, - "loss": 0.1671, - "step": 3310 - }, - { - "epoch": 0.3372955399776491, - "grad_norm": 5.28125, - "learning_rate": 3.769264556284048e-05, - "loss": 0.2109, - "step": 3320 - }, - { - "epoch": 0.3383114903992685, - "grad_norm": 2.875, - "learning_rate": 3.762313040568665e-05, - "loss": 0.1978, - "step": 3330 - }, - { - "epoch": 0.33932744082088795, - "grad_norm": 2.234375, - "learning_rate": 3.755348400405697e-05, - "loss": 0.1275, - "step": 3340 - }, - { - "epoch": 0.34034339124250734, - "grad_norm": 1.9453125, - "learning_rate": 3.7483707082074945e-05, - "loss": 0.1482, - "step": 3350 - }, - { - "epoch": 0.3413593416641268, - "grad_norm": 5.40625, - "learning_rate": 3.741380036522111e-05, - "loss": 0.1933, - "step": 3360 - }, - { - "epoch": 0.34237529208574624, - "grad_norm": 4.53125, - "learning_rate": 3.734376458032551e-05, - "loss": 0.1925, - "step": 3370 - }, - { - "epoch": 0.34339124250736563, - "grad_norm": 4.0625, - "learning_rate": 3.727360045556014e-05, - "loss": 0.2297, - "step": 3380 - }, - { - "epoch": 0.3444071929289851, - "grad_norm": 2.53125, - "learning_rate": 3.7203308720431336e-05, - "loss": 0.1704, - "step": 3390 - }, - { - "epoch": 0.3454231433506045, - "grad_norm": 1.859375, - "learning_rate": 3.7132890105772234e-05, - "loss": 0.258, - "step": 3400 - }, - { - "epoch": 0.3464390937722239, - "grad_norm": 3.90625, - "learning_rate": 3.706234534373515e-05, - "loss": 0.2376, - "step": 3410 - }, - { - "epoch": 0.3474550441938433, - "grad_norm": 1.1015625, - "learning_rate": 3.6991675167783985e-05, - "loss": 0.2403, - "step": 3420 - }, - { - "epoch": 0.34847099461546277, - "grad_norm": 1.1640625, - "learning_rate": 3.6920880312686556e-05, - "loss": 0.1642, - "step": 3430 - }, - { - "epoch": 0.34948694503708216, - "grad_norm": 2.875, - "learning_rate": 3.684996151450702e-05, - "loss": 0.1455, - "step": 3440 - }, - { - "epoch": 0.3505028954587016, - "grad_norm": 0.59765625, - "learning_rate": 3.6778919510598155e-05, - "loss": 0.2175, - "step": 3450 - }, - { - "epoch": 0.35151884588032106, - "grad_norm": 0.93359375, - "learning_rate": 3.670775503959376e-05, - "loss": 0.1858, - "step": 3460 - }, - { - "epoch": 0.35253479630194046, - "grad_norm": 4.1875, - "learning_rate": 3.6636468841400917e-05, - "loss": 0.1911, - "step": 3470 - }, - { - "epoch": 0.3535507467235599, - "grad_norm": 3.734375, - "learning_rate": 3.656506165719233e-05, - "loss": 0.2114, - "step": 3480 - }, - { - "epoch": 0.3545666971451793, - "grad_norm": 1.171875, - "learning_rate": 3.649353422939863e-05, - "loss": 0.1841, - "step": 3490 - }, - { - "epoch": 0.35558264756679875, - "grad_norm": 2.53125, - "learning_rate": 3.6421887301700615e-05, - "loss": 0.1505, - "step": 3500 - }, - { - "epoch": 0.35659859798841814, - "grad_norm": 4.9375, - "learning_rate": 3.6350121619021524e-05, - "loss": 0.2625, - "step": 3510 - }, - { - "epoch": 0.3576145484100376, - "grad_norm": 5.25, - "learning_rate": 3.627823792751936e-05, - "loss": 0.1676, - "step": 3520 - }, - { - "epoch": 0.35863049883165704, - "grad_norm": 1.09375, - "learning_rate": 3.620623697457905e-05, - "loss": 0.1963, - "step": 3530 - }, - { - "epoch": 0.35964644925327643, - "grad_norm": 4.03125, - "learning_rate": 3.613411950880468e-05, - "loss": 0.2048, - "step": 3540 - }, - { - "epoch": 0.3606623996748959, - "grad_norm": 4.40625, - "learning_rate": 3.606188628001178e-05, - "loss": 0.226, - "step": 3550 - }, - { - "epoch": 0.3616783500965153, - "grad_norm": 2.375, - "learning_rate": 3.598953803921947e-05, - "loss": 0.1884, - "step": 3560 - }, - { - "epoch": 0.3626943005181347, - "grad_norm": 3.21875, - "learning_rate": 3.591707553864266e-05, - "loss": 0.224, - "step": 3570 - }, - { - "epoch": 0.3637102509397541, - "grad_norm": 3.5625, - "learning_rate": 3.584449953168423e-05, - "loss": 0.1866, - "step": 3580 - }, - { - "epoch": 0.36472620136137357, - "grad_norm": 2.359375, - "learning_rate": 3.577181077292722e-05, - "loss": 0.1663, - "step": 3590 - }, - { - "epoch": 0.36574215178299296, - "grad_norm": 5.0, - "learning_rate": 3.569901001812696e-05, - "loss": 0.2032, - "step": 3600 - }, - { - "epoch": 0.3667581022046124, - "grad_norm": 1.953125, - "learning_rate": 3.562609802420321e-05, - "loss": 0.2395, - "step": 3610 - }, - { - "epoch": 0.36777405262623186, - "grad_norm": 3.796875, - "learning_rate": 3.555307554923229e-05, - "loss": 0.1799, - "step": 3620 - }, - { - "epoch": 0.36879000304785126, - "grad_norm": 4.4375, - "learning_rate": 3.547994335243925e-05, - "loss": 0.1771, - "step": 3630 - }, - { - "epoch": 0.3698059534694707, - "grad_norm": 1.890625, - "learning_rate": 3.540670219418989e-05, - "loss": 0.2123, - "step": 3640 - }, - { - "epoch": 0.3708219038910901, - "grad_norm": 4.03125, - "learning_rate": 3.53333528359829e-05, - "loss": 0.2159, - "step": 3650 - }, - { - "epoch": 0.37183785431270955, - "grad_norm": 3.265625, - "learning_rate": 3.525989604044198e-05, - "loss": 0.2749, - "step": 3660 - }, - { - "epoch": 0.37285380473432894, - "grad_norm": 1.4375, - "learning_rate": 3.5186332571307826e-05, - "loss": 0.1613, - "step": 3670 - }, - { - "epoch": 0.3738697551559484, - "grad_norm": 3.984375, - "learning_rate": 3.511266319343025e-05, - "loss": 0.1877, - "step": 3680 - }, - { - "epoch": 0.37488570557756784, - "grad_norm": 2.203125, - "learning_rate": 3.503888867276022e-05, - "loss": 0.2185, - "step": 3690 - }, - { - "epoch": 0.37590165599918723, - "grad_norm": 1.5078125, - "learning_rate": 3.4965009776341894e-05, - "loss": 0.2195, - "step": 3700 - }, - { - "epoch": 0.3769176064208067, - "grad_norm": 4.375, - "learning_rate": 3.489102727230461e-05, - "loss": 0.2344, - "step": 3710 - }, - { - "epoch": 0.3779335568424261, - "grad_norm": 2.984375, - "learning_rate": 3.481694192985496e-05, - "loss": 0.1863, - "step": 3720 - }, - { - "epoch": 0.3789495072640455, - "grad_norm": 1.1328125, - "learning_rate": 3.474275451926875e-05, - "loss": 0.1894, - "step": 3730 - }, - { - "epoch": 0.3799654576856649, - "grad_norm": 2.265625, - "learning_rate": 3.4668465811883e-05, - "loss": 0.2127, - "step": 3740 - }, - { - "epoch": 0.38098140810728437, - "grad_norm": 2.921875, - "learning_rate": 3.4594076580087914e-05, - "loss": 0.2125, - "step": 3750 - }, - { - "epoch": 0.38199735852890376, - "grad_norm": 2.390625, - "learning_rate": 3.451958759731889e-05, - "loss": 0.1801, - "step": 3760 - }, - { - "epoch": 0.3830133089505232, - "grad_norm": 3.046875, - "learning_rate": 3.4444999638048456e-05, - "loss": 0.1949, - "step": 3770 - }, - { - "epoch": 0.38402925937214266, - "grad_norm": 2.890625, - "learning_rate": 3.437031347777817e-05, - "loss": 0.2719, - "step": 3780 - }, - { - "epoch": 0.38504520979376206, - "grad_norm": 3.9375, - "learning_rate": 3.4295529893030634e-05, - "loss": 0.1697, - "step": 3790 - }, - { - "epoch": 0.3860611602153815, - "grad_norm": 2.0625, - "learning_rate": 3.422064966134138e-05, - "loss": 0.1557, - "step": 3800 - }, - { - "epoch": 0.3870771106370009, - "grad_norm": 2.234375, - "learning_rate": 3.4145673561250794e-05, - "loss": 0.2129, - "step": 3810 - }, - { - "epoch": 0.38809306105862035, - "grad_norm": 4.96875, - "learning_rate": 3.4070602372296e-05, - "loss": 0.2068, - "step": 3820 - }, - { - "epoch": 0.38910901148023974, - "grad_norm": 2.234375, - "learning_rate": 3.39954368750028e-05, - "loss": 0.1634, - "step": 3830 - }, - { - "epoch": 0.3901249619018592, - "grad_norm": 1.75, - "learning_rate": 3.392017785087752e-05, - "loss": 0.2299, - "step": 3840 - }, - { - "epoch": 0.39114091232347864, - "grad_norm": 3.90625, - "learning_rate": 3.38448260823989e-05, - "loss": 0.1585, - "step": 3850 - }, - { - "epoch": 0.39215686274509803, - "grad_norm": 2.8125, - "learning_rate": 3.376938235300996e-05, - "loss": 0.2382, - "step": 3860 - }, - { - "epoch": 0.3931728131667175, - "grad_norm": 5.375, - "learning_rate": 3.369384744710984e-05, - "loss": 0.1987, - "step": 3870 - }, - { - "epoch": 0.3941887635883369, - "grad_norm": 2.578125, - "learning_rate": 3.361822215004566e-05, - "loss": 0.2316, - "step": 3880 - }, - { - "epoch": 0.3952047140099563, - "grad_norm": 2.0, - "learning_rate": 3.354250724810436e-05, - "loss": 0.2019, - "step": 3890 - }, - { - "epoch": 0.3962206644315757, - "grad_norm": 2.3125, - "learning_rate": 3.34667035285045e-05, - "loss": 0.187, - "step": 3900 - }, - { - "epoch": 0.39723661485319517, - "grad_norm": 3.53125, - "learning_rate": 3.339081177938811e-05, - "loss": 0.2353, - "step": 3910 - }, - { - "epoch": 0.39825256527481456, - "grad_norm": 1.9609375, - "learning_rate": 3.331483278981244e-05, - "loss": 0.2078, - "step": 3920 - }, - { - "epoch": 0.399268515696434, - "grad_norm": 1.2109375, - "learning_rate": 3.323876734974183e-05, - "loss": 0.1761, - "step": 3930 - }, - { - "epoch": 0.40028446611805346, - "grad_norm": 4.0625, - "learning_rate": 3.316261625003943e-05, - "loss": 0.2081, - "step": 3940 - }, - { - "epoch": 0.40130041653967286, - "grad_norm": 1.953125, - "learning_rate": 3.308638028245902e-05, - "loss": 0.2087, - "step": 3950 - }, - { - "epoch": 0.4023163669612923, - "grad_norm": 2.390625, - "learning_rate": 3.301006023963676e-05, - "loss": 0.1579, - "step": 3960 - }, - { - "epoch": 0.4033323173829117, - "grad_norm": 3.53125, - "learning_rate": 3.293365691508295e-05, - "loss": 0.1904, - "step": 3970 - }, - { - "epoch": 0.40434826780453115, - "grad_norm": 3.0, - "learning_rate": 3.285717110317379e-05, - "loss": 0.1991, - "step": 3980 - }, - { - "epoch": 0.40536421822615054, - "grad_norm": 7.21875, - "learning_rate": 3.27806035991431e-05, - "loss": 0.1445, - "step": 3990 - }, - { - "epoch": 0.40638016864777, - "grad_norm": 1.0859375, - "learning_rate": 3.2703955199074075e-05, - "loss": 0.2393, - "step": 4000 - }, - { - "epoch": 0.40739611906938944, - "grad_norm": 4.5625, - "learning_rate": 3.262722669989098e-05, - "loss": 0.1789, - "step": 4010 - }, - { - "epoch": 0.40841206949100883, - "grad_norm": 3.09375, - "learning_rate": 3.255041889935092e-05, - "loss": 0.1511, - "step": 4020 - }, - { - "epoch": 0.4094280199126283, - "grad_norm": 1.90625, - "learning_rate": 3.247353259603547e-05, - "loss": 0.2066, - "step": 4030 - }, - { - "epoch": 0.4104439703342477, - "grad_norm": 2.28125, - "learning_rate": 3.239656858934242e-05, - "loss": 0.1564, - "step": 4040 - }, - { - "epoch": 0.4114599207558671, - "grad_norm": 2.609375, - "learning_rate": 3.231952767947746e-05, - "loss": 0.1503, - "step": 4050 - }, - { - "epoch": 0.4124758711774865, - "grad_norm": 1.4453125, - "learning_rate": 3.2242410667445844e-05, - "loss": 0.1633, - "step": 4060 - }, - { - "epoch": 0.41349182159910597, - "grad_norm": 3.015625, - "learning_rate": 3.2165218355044076e-05, - "loss": 0.1492, - "step": 4070 - }, - { - "epoch": 0.41450777202072536, - "grad_norm": 3.234375, - "learning_rate": 3.2087951544851566e-05, - "loss": 0.3051, - "step": 4080 - }, - { - "epoch": 0.4155237224423448, - "grad_norm": 2.9375, - "learning_rate": 3.20106110402223e-05, - "loss": 0.2229, - "step": 4090 - }, - { - "epoch": 0.41653967286396426, - "grad_norm": 3.171875, - "learning_rate": 3.1933197645276455e-05, - "loss": 0.2224, - "step": 4100 - }, - { - "epoch": 0.41755562328558365, - "grad_norm": 2.09375, - "learning_rate": 3.185571216489209e-05, - "loss": 0.1297, - "step": 4110 - }, - { - "epoch": 0.4185715737072031, - "grad_norm": 3.625, - "learning_rate": 3.177815540469669e-05, - "loss": 0.2074, - "step": 4120 - }, - { - "epoch": 0.4195875241288225, - "grad_norm": 2.296875, - "learning_rate": 3.1700528171058916e-05, - "loss": 0.1949, - "step": 4130 - }, - { - "epoch": 0.42060347455044195, - "grad_norm": 3.8125, - "learning_rate": 3.162283127108011e-05, - "loss": 0.1661, - "step": 4140 - }, - { - "epoch": 0.42161942497206134, - "grad_norm": 2.5, - "learning_rate": 3.154506551258594e-05, - "loss": 0.2275, - "step": 4150 - }, - { - "epoch": 0.4226353753936808, - "grad_norm": 2.96875, - "learning_rate": 3.146723170411804e-05, - "loss": 0.2242, - "step": 4160 - }, - { - "epoch": 0.42365132581530024, - "grad_norm": 6.625, - "learning_rate": 3.138933065492552e-05, - "loss": 0.1897, - "step": 4170 - }, - { - "epoch": 0.42466727623691963, - "grad_norm": 0.8515625, - "learning_rate": 3.131136317495665e-05, - "loss": 0.1629, - "step": 4180 - }, - { - "epoch": 0.4256832266585391, - "grad_norm": 0.94140625, - "learning_rate": 3.1233330074850364e-05, - "loss": 0.1535, - "step": 4190 - }, - { - "epoch": 0.4266991770801585, - "grad_norm": 2.6875, - "learning_rate": 3.115523216592786e-05, - "loss": 0.2494, - "step": 4200 - }, - { - "epoch": 0.4277151275017779, - "grad_norm": 2.578125, - "learning_rate": 3.107707026018417e-05, - "loss": 0.1705, - "step": 4210 - }, - { - "epoch": 0.4287310779233973, - "grad_norm": 3.0625, - "learning_rate": 3.09988451702797e-05, - "loss": 0.1507, - "step": 4220 - }, - { - "epoch": 0.42974702834501677, - "grad_norm": 2.421875, - "learning_rate": 3.0920557709531804e-05, - "loss": 0.3071, - "step": 4230 - }, - { - "epoch": 0.43076297876663616, - "grad_norm": 3.640625, - "learning_rate": 3.0842208691906306e-05, - "loss": 0.199, - "step": 4240 - }, - { - "epoch": 0.4317789291882556, - "grad_norm": 3.5625, - "learning_rate": 3.076379893200904e-05, - "loss": 0.1987, - "step": 4250 - }, - { - "epoch": 0.43279487960987506, - "grad_norm": 3.65625, - "learning_rate": 3.068532924507739e-05, - "loss": 0.1945, - "step": 4260 - }, - { - "epoch": 0.43381083003149445, - "grad_norm": 5.875, - "learning_rate": 3.060680044697183e-05, - "loss": 0.1937, - "step": 4270 - }, - { - "epoch": 0.4348267804531139, - "grad_norm": 2.859375, - "learning_rate": 3.052821335416739e-05, - "loss": 0.1643, - "step": 4280 - }, - { - "epoch": 0.4358427308747333, - "grad_norm": 3.296875, - "learning_rate": 3.0449568783745203e-05, - "loss": 0.1455, - "step": 4290 - }, - { - "epoch": 0.43685868129635275, - "grad_norm": 0.427734375, - "learning_rate": 3.0370867553384023e-05, - "loss": 0.1891, - "step": 4300 - }, - { - "epoch": 0.43787463171797214, - "grad_norm": 0.361328125, - "learning_rate": 3.029211048135171e-05, - "loss": 0.1377, - "step": 4310 - }, - { - "epoch": 0.4388905821395916, - "grad_norm": 1.8203125, - "learning_rate": 3.021329838649668e-05, - "loss": 0.2194, - "step": 4320 - }, - { - "epoch": 0.43990653256121104, - "grad_norm": 1.8828125, - "learning_rate": 3.0134432088239462e-05, - "loss": 0.1915, - "step": 4330 - }, - { - "epoch": 0.44092248298283043, - "grad_norm": 2.015625, - "learning_rate": 3.0055512406564146e-05, - "loss": 0.1794, - "step": 4340 - }, - { - "epoch": 0.4419384334044499, - "grad_norm": 2.546875, - "learning_rate": 2.9976540162009836e-05, - "loss": 0.2154, - "step": 4350 - }, - { - "epoch": 0.4429543838260693, - "grad_norm": 4.09375, - "learning_rate": 2.9897516175662155e-05, - "loss": 0.1861, - "step": 4360 - }, - { - "epoch": 0.4439703342476887, - "grad_norm": 3.953125, - "learning_rate": 2.9818441269144693e-05, - "loss": 0.1857, - "step": 4370 - }, - { - "epoch": 0.4449862846693081, - "grad_norm": 2.234375, - "learning_rate": 2.9739316264610452e-05, - "loss": 0.1493, - "step": 4380 - }, - { - "epoch": 0.44600223509092757, - "grad_norm": 1.109375, - "learning_rate": 2.966014198473332e-05, - "loss": 0.186, - "step": 4390 - }, - { - "epoch": 0.44701818551254696, - "grad_norm": 4.5625, - "learning_rate": 2.9580919252699502e-05, - "loss": 0.1963, - "step": 4400 - }, - { - "epoch": 0.4480341359341664, - "grad_norm": 7.3125, - "learning_rate": 2.9501648892198984e-05, - "loss": 0.2882, - "step": 4410 - }, - { - "epoch": 0.44905008635578586, - "grad_norm": 3.03125, - "learning_rate": 2.942233172741693e-05, - "loss": 0.2154, - "step": 4420 - }, - { - "epoch": 0.45006603677740525, - "grad_norm": 2.421875, - "learning_rate": 2.934296858302515e-05, - "loss": 0.2228, - "step": 4430 - }, - { - "epoch": 0.4510819871990247, - "grad_norm": 1.6015625, - "learning_rate": 2.9263560284173485e-05, - "loss": 0.1637, - "step": 4440 - }, - { - "epoch": 0.4520979376206441, - "grad_norm": 4.5, - "learning_rate": 2.91841076564813e-05, - "loss": 0.1396, - "step": 4450 - }, - { - "epoch": 0.45311388804226355, - "grad_norm": 1.9609375, - "learning_rate": 2.9104611526028808e-05, - "loss": 0.186, - "step": 4460 - }, - { - "epoch": 0.45412983846388294, - "grad_norm": 2.046875, - "learning_rate": 2.902507271934855e-05, - "loss": 0.1706, - "step": 4470 - }, - { - "epoch": 0.4551457888855024, - "grad_norm": 2.390625, - "learning_rate": 2.8945492063416768e-05, - "loss": 0.2191, - "step": 4480 - }, - { - "epoch": 0.45616173930712184, - "grad_norm": 2.734375, - "learning_rate": 2.8865870385644823e-05, - "loss": 0.1651, - "step": 4490 - }, - { - "epoch": 0.45717768972874123, - "grad_norm": 4.4375, - "learning_rate": 2.8786208513870583e-05, - "loss": 0.1907, - "step": 4500 - }, - { - "epoch": 0.4581936401503607, - "grad_norm": 1.9609375, - "learning_rate": 2.8706507276349815e-05, - "loss": 0.2256, - "step": 4510 - }, - { - "epoch": 0.4592095905719801, - "grad_norm": 3.375, - "learning_rate": 2.8626767501747588e-05, - "loss": 0.215, - "step": 4520 - }, - { - "epoch": 0.4602255409935995, - "grad_norm": 2.296875, - "learning_rate": 2.854699001912964e-05, - "loss": 0.2241, - "step": 4530 - }, - { - "epoch": 0.4612414914152189, - "grad_norm": 2.078125, - "learning_rate": 2.846717565795376e-05, - "loss": 0.1541, - "step": 4540 - }, - { - "epoch": 0.46225744183683837, - "grad_norm": 0.81640625, - "learning_rate": 2.8387325248061164e-05, - "loss": 0.1718, - "step": 4550 - }, - { - "epoch": 0.46327339225845776, - "grad_norm": 5.6875, - "learning_rate": 2.8307439619667897e-05, - "loss": 0.259, - "step": 4560 - }, - { - "epoch": 0.4642893426800772, - "grad_norm": 1.78125, - "learning_rate": 2.8227519603356157e-05, - "loss": 0.2205, - "step": 4570 - }, - { - "epoch": 0.46530529310169666, - "grad_norm": 4.78125, - "learning_rate": 2.8147566030065677e-05, - "loss": 0.2256, - "step": 4580 - }, - { - "epoch": 0.46632124352331605, - "grad_norm": 3.296875, - "learning_rate": 2.8067579731085085e-05, - "loss": 0.1671, - "step": 4590 - }, - { - "epoch": 0.4673371939449355, - "grad_norm": 3.265625, - "learning_rate": 2.7987561538043273e-05, - "loss": 0.2471, - "step": 4600 - }, - { - "epoch": 0.4683531443665549, - "grad_norm": 3.390625, - "learning_rate": 2.7907512282900727e-05, - "loss": 0.1749, - "step": 4610 - }, - { - "epoch": 0.46936909478817435, - "grad_norm": 3.140625, - "learning_rate": 2.782743279794091e-05, - "loss": 0.2276, - "step": 4620 - }, - { - "epoch": 0.47038504520979374, - "grad_norm": 2.921875, - "learning_rate": 2.7747323915761574e-05, - "loss": 0.1971, - "step": 4630 - }, - { - "epoch": 0.4714009956314132, - "grad_norm": 4.15625, - "learning_rate": 2.7667186469266122e-05, - "loss": 0.1951, - "step": 4640 - }, - { - "epoch": 0.47241694605303264, - "grad_norm": 2.953125, - "learning_rate": 2.7587021291654924e-05, - "loss": 0.2045, - "step": 4650 - }, - { - "epoch": 0.47343289647465203, - "grad_norm": 1.6640625, - "learning_rate": 2.750682921641672e-05, - "loss": 0.155, - "step": 4660 - }, - { - "epoch": 0.4744488468962715, - "grad_norm": 4.375, - "learning_rate": 2.7426611077319864e-05, - "loss": 0.2038, - "step": 4670 - }, - { - "epoch": 0.4754647973178909, - "grad_norm": 5.5, - "learning_rate": 2.734636770840372e-05, - "loss": 0.159, - "step": 4680 - }, - { - "epoch": 0.4764807477395103, - "grad_norm": 1.703125, - "learning_rate": 2.7266099943969976e-05, - "loss": 0.1566, - "step": 4690 - }, - { - "epoch": 0.4774966981611297, - "grad_norm": 0.81640625, - "learning_rate": 2.7185808618573943e-05, - "loss": 0.1927, - "step": 4700 - }, - { - "epoch": 0.47851264858274917, - "grad_norm": 0.81640625, - "learning_rate": 2.710549456701592e-05, - "loss": 0.1873, - "step": 4710 - }, - { - "epoch": 0.47952859900436856, - "grad_norm": 3.828125, - "learning_rate": 2.702515862433247e-05, - "loss": 0.2474, - "step": 4720 - }, - { - "epoch": 0.480544549425988, - "grad_norm": 1.1640625, - "learning_rate": 2.6944801625787795e-05, - "loss": 0.204, - "step": 4730 - }, - { - "epoch": 0.48156049984760746, - "grad_norm": 2.953125, - "learning_rate": 2.6864424406864984e-05, - "loss": 0.1758, - "step": 4740 - }, - { - "epoch": 0.48257645026922685, - "grad_norm": 3.265625, - "learning_rate": 2.6784027803257377e-05, - "loss": 0.161, - "step": 4750 - }, - { - "epoch": 0.4835924006908463, - "grad_norm": 2.046875, - "learning_rate": 2.6703612650859848e-05, - "loss": 0.1469, - "step": 4760 - }, - { - "epoch": 0.4846083511124657, - "grad_norm": 4.03125, - "learning_rate": 2.6623179785760148e-05, - "loss": 0.1858, - "step": 4770 - }, - { - "epoch": 0.48562430153408515, - "grad_norm": 2.65625, - "learning_rate": 2.6542730044230175e-05, - "loss": 0.176, - "step": 4780 - }, - { - "epoch": 0.48664025195570454, - "grad_norm": 2.59375, - "learning_rate": 2.6462264262717278e-05, - "loss": 0.1657, - "step": 4790 - }, - { - "epoch": 0.487656202377324, - "grad_norm": 4.78125, - "learning_rate": 2.6381783277835605e-05, - "loss": 0.2705, - "step": 4800 - }, - { - "epoch": 0.48867215279894344, - "grad_norm": 3.65625, - "learning_rate": 2.6301287926357355e-05, - "loss": 0.2252, - "step": 4810 - }, - { - "epoch": 0.48968810322056283, - "grad_norm": 0.734375, - "learning_rate": 2.622077904520411e-05, - "loss": 0.2141, - "step": 4820 - }, - { - "epoch": 0.4907040536421823, - "grad_norm": 5.15625, - "learning_rate": 2.6140257471438108e-05, - "loss": 0.1935, - "step": 4830 - }, - { - "epoch": 0.4917200040638017, - "grad_norm": 3.625, - "learning_rate": 2.6059724042253574e-05, - "loss": 0.2121, - "step": 4840 - }, - { - "epoch": 0.4927359544854211, - "grad_norm": 1.2890625, - "learning_rate": 2.5979179594967983e-05, - "loss": 0.1221, - "step": 4850 - }, - { - "epoch": 0.4937519049070405, - "grad_norm": 3.4375, - "learning_rate": 2.5898624967013367e-05, - "loss": 0.2208, - "step": 4860 - }, - { - "epoch": 0.49476785532865997, - "grad_norm": 2.40625, - "learning_rate": 2.5818060995927607e-05, - "loss": 0.1904, - "step": 4870 - }, - { - "epoch": 0.49578380575027936, - "grad_norm": 2.921875, - "learning_rate": 2.573748851934574e-05, - "loss": 0.1658, - "step": 4880 - }, - { - "epoch": 0.4967997561718988, - "grad_norm": 1.6640625, - "learning_rate": 2.5656908374991213e-05, - "loss": 0.1626, - "step": 4890 - }, - { - "epoch": 0.49781570659351826, - "grad_norm": 1.8046875, - "learning_rate": 2.557632140066721e-05, - "loss": 0.1905, - "step": 4900 - }, - { - "epoch": 0.49883165701513765, - "grad_norm": 4.875, - "learning_rate": 2.5495728434247917e-05, - "loss": 0.2591, - "step": 4910 - }, - { - "epoch": 0.4998476074367571, - "grad_norm": 1.4453125, - "learning_rate": 2.5415130313669845e-05, - "loss": 0.1359, - "step": 4920 - }, - { - "epoch": 0.5008635578583766, - "grad_norm": 2.109375, - "learning_rate": 2.5334527876923063e-05, - "loss": 0.2353, - "step": 4930 - }, - { - "epoch": 0.501879508279996, - "grad_norm": 3.546875, - "learning_rate": 2.5253921962042525e-05, - "loss": 0.2173, - "step": 4940 - }, - { - "epoch": 0.5028954587016153, - "grad_norm": 1.8125, - "learning_rate": 2.5173313407099373e-05, - "loss": 0.1631, - "step": 4950 - }, - { - "epoch": 0.5039114091232347, - "grad_norm": 2.671875, - "learning_rate": 2.5092703050192163e-05, - "loss": 0.1884, - "step": 4960 - }, - { - "epoch": 0.5049273595448542, - "grad_norm": 2.5625, - "learning_rate": 2.501209172943819e-05, - "loss": 0.217, - "step": 4970 - }, - { - "epoch": 0.5059433099664736, - "grad_norm": 4.375, - "learning_rate": 2.49314802829648e-05, - "loss": 0.1854, - "step": 4980 - }, - { - "epoch": 0.506959260388093, - "grad_norm": 2.3125, - "learning_rate": 2.4850869548900628e-05, - "loss": 0.2049, - "step": 4990 - }, - { - "epoch": 0.5079752108097125, - "grad_norm": 3.859375, - "learning_rate": 2.477026036536688e-05, - "loss": 0.2093, - "step": 5000 - }, - { - "epoch": 0.5089911612313319, - "grad_norm": 1.09375, - "learning_rate": 2.4689653570468677e-05, - "loss": 0.164, - "step": 5010 - }, - { - "epoch": 0.5100071116529513, - "grad_norm": 3.40625, - "learning_rate": 2.460905000228628e-05, - "loss": 0.1649, - "step": 5020 - }, - { - "epoch": 0.5110230620745707, - "grad_norm": 3.546875, - "learning_rate": 2.4528450498866428e-05, - "loss": 0.1777, - "step": 5030 - }, - { - "epoch": 0.5120390124961902, - "grad_norm": 3.0, - "learning_rate": 2.444785589821356e-05, - "loss": 0.1505, - "step": 5040 - }, - { - "epoch": 0.5130549629178096, - "grad_norm": 1.6484375, - "learning_rate": 2.436726703828118e-05, - "loss": 0.2672, - "step": 5050 - }, - { - "epoch": 0.514070913339429, - "grad_norm": 4.34375, - "learning_rate": 2.428668475696308e-05, - "loss": 0.1756, - "step": 5060 - }, - { - "epoch": 0.5150868637610485, - "grad_norm": 2.78125, - "learning_rate": 2.420610989208465e-05, - "loss": 0.1655, - "step": 5070 - }, - { - "epoch": 0.5161028141826679, - "grad_norm": 1.4609375, - "learning_rate": 2.412554328139419e-05, - "loss": 0.1579, - "step": 5080 - }, - { - "epoch": 0.5171187646042873, - "grad_norm": 2.28125, - "learning_rate": 2.404498576255416e-05, - "loss": 0.1599, - "step": 5090 - }, - { - "epoch": 0.5181347150259067, - "grad_norm": 0.6484375, - "learning_rate": 2.3964438173132522e-05, - "loss": 0.1508, - "step": 5100 - }, - { - "epoch": 0.5191506654475262, - "grad_norm": 3.390625, - "learning_rate": 2.388390135059395e-05, - "loss": 0.1578, - "step": 5110 - }, - { - "epoch": 0.5201666158691456, - "grad_norm": 1.21875, - "learning_rate": 2.3803376132291226e-05, - "loss": 0.1374, - "step": 5120 - }, - { - "epoch": 0.521182566290765, - "grad_norm": 4.0625, - "learning_rate": 2.3722863355456436e-05, - "loss": 0.1854, - "step": 5130 - }, - { - "epoch": 0.5221985167123845, - "grad_norm": 4.71875, - "learning_rate": 2.364236385719236e-05, - "loss": 0.1391, - "step": 5140 - }, - { - "epoch": 0.5232144671340039, - "grad_norm": 3.296875, - "learning_rate": 2.356187847446366e-05, - "loss": 0.2106, - "step": 5150 - }, - { - "epoch": 0.5242304175556233, - "grad_norm": 3.296875, - "learning_rate": 2.348140804408829e-05, - "loss": 0.2383, - "step": 5160 - }, - { - "epoch": 0.5252463679772427, - "grad_norm": 3.359375, - "learning_rate": 2.3400953402728713e-05, - "loss": 0.1537, - "step": 5170 - }, - { - "epoch": 0.5262623183988622, - "grad_norm": 1.4921875, - "learning_rate": 2.332051538688322e-05, - "loss": 0.1841, - "step": 5180 - }, - { - "epoch": 0.5272782688204816, - "grad_norm": 3.25, - "learning_rate": 2.3240094832877287e-05, - "loss": 0.1855, - "step": 5190 - }, - { - "epoch": 0.528294219242101, - "grad_norm": 3.34375, - "learning_rate": 2.3159692576854793e-05, - "loss": 0.2625, - "step": 5200 - }, - { - "epoch": 0.5293101696637205, - "grad_norm": 3.6875, - "learning_rate": 2.3079309454769413e-05, - "loss": 0.1292, - "step": 5210 - }, - { - "epoch": 0.5303261200853399, - "grad_norm": 1.1171875, - "learning_rate": 2.2998946302375827e-05, - "loss": 0.1263, - "step": 5220 - }, - { - "epoch": 0.5313420705069593, - "grad_norm": 2.71875, - "learning_rate": 2.2918603955221148e-05, - "loss": 0.2296, - "step": 5230 - }, - { - "epoch": 0.5323580209285786, - "grad_norm": 2.015625, - "learning_rate": 2.283828324863613e-05, - "loss": 0.1231, - "step": 5240 - }, - { - "epoch": 0.5333739713501982, - "grad_norm": 3.671875, - "learning_rate": 2.2757985017726557e-05, - "loss": 0.1939, - "step": 5250 - }, - { - "epoch": 0.5343899217718175, - "grad_norm": 1.9765625, - "learning_rate": 2.2677710097364495e-05, - "loss": 0.168, - "step": 5260 - }, - { - "epoch": 0.5354058721934369, - "grad_norm": 2.609375, - "learning_rate": 2.259745932217969e-05, - "loss": 0.1883, - "step": 5270 - }, - { - "epoch": 0.5364218226150563, - "grad_norm": 2.8125, - "learning_rate": 2.2517233526550817e-05, - "loss": 0.1898, - "step": 5280 - }, - { - "epoch": 0.5374377730366758, - "grad_norm": 3.125, - "learning_rate": 2.2437033544596837e-05, - "loss": 0.1838, - "step": 5290 - }, - { - "epoch": 0.5384537234582952, - "grad_norm": 4.90625, - "learning_rate": 2.2356860210168336e-05, - "loss": 0.1553, - "step": 5300 - }, - { - "epoch": 0.5394696738799146, - "grad_norm": 3.171875, - "learning_rate": 2.2276714356838824e-05, - "loss": 0.2248, - "step": 5310 - }, - { - "epoch": 0.5404856243015341, - "grad_norm": 1.34375, - "learning_rate": 2.2196596817896118e-05, - "loss": 0.1421, - "step": 5320 - }, - { - "epoch": 0.5415015747231535, - "grad_norm": 3.28125, - "learning_rate": 2.2116508426333596e-05, - "loss": 0.1947, - "step": 5330 - }, - { - "epoch": 0.5425175251447729, - "grad_norm": 1.9296875, - "learning_rate": 2.2036450014841652e-05, - "loss": 0.2207, - "step": 5340 - }, - { - "epoch": 0.5435334755663923, - "grad_norm": 0.5703125, - "learning_rate": 2.19564224157989e-05, - "loss": 0.2208, - "step": 5350 - }, - { - "epoch": 0.5445494259880118, - "grad_norm": 7.5625, - "learning_rate": 2.1876426461263654e-05, - "loss": 0.1739, - "step": 5360 - }, - { - "epoch": 0.5455653764096312, - "grad_norm": 2.15625, - "learning_rate": 2.179646298296519e-05, - "loss": 0.1938, - "step": 5370 - }, - { - "epoch": 0.5465813268312506, - "grad_norm": 4.1875, - "learning_rate": 2.171653281229511e-05, - "loss": 0.1736, - "step": 5380 - }, - { - "epoch": 0.5475972772528701, - "grad_norm": 4.65625, - "learning_rate": 2.1636636780298732e-05, - "loss": 0.2167, - "step": 5390 - }, - { - "epoch": 0.5486132276744895, - "grad_norm": 1.84375, - "learning_rate": 2.1556775717666427e-05, - "loss": 0.1711, - "step": 5400 - }, - { - "epoch": 0.5496291780961089, - "grad_norm": 5.125, - "learning_rate": 2.147695045472499e-05, - "loss": 0.1789, - "step": 5410 - }, - { - "epoch": 0.5506451285177283, - "grad_norm": 3.859375, - "learning_rate": 2.1397161821428973e-05, - "loss": 0.2187, - "step": 5420 - }, - { - "epoch": 0.5516610789393478, - "grad_norm": 2.25, - "learning_rate": 2.131741064735212e-05, - "loss": 0.1367, - "step": 5430 - }, - { - "epoch": 0.5526770293609672, - "grad_norm": 4.65625, - "learning_rate": 2.1237697761678684e-05, - "loss": 0.1574, - "step": 5440 - }, - { - "epoch": 0.5536929797825866, - "grad_norm": 1.2265625, - "learning_rate": 2.1158023993194848e-05, - "loss": 0.1301, - "step": 5450 - }, - { - "epoch": 0.5547089302042061, - "grad_norm": 4.21875, - "learning_rate": 2.107839017028005e-05, - "loss": 0.2782, - "step": 5460 - }, - { - "epoch": 0.5557248806258255, - "grad_norm": 0.52734375, - "learning_rate": 2.0998797120898457e-05, - "loss": 0.2024, - "step": 5470 - }, - { - "epoch": 0.5567408310474449, - "grad_norm": 1.46875, - "learning_rate": 2.0919245672590277e-05, - "loss": 0.1755, - "step": 5480 - }, - { - "epoch": 0.5577567814690643, - "grad_norm": 2.140625, - "learning_rate": 2.083973665246318e-05, - "loss": 0.2058, - "step": 5490 - }, - { - "epoch": 0.5587727318906838, - "grad_norm": 1.5390625, - "learning_rate": 2.076027088718373e-05, - "loss": 0.2159, - "step": 5500 - }, - { - "epoch": 0.5597886823123032, - "grad_norm": 1.9921875, - "learning_rate": 2.0680849202968743e-05, - "loss": 0.2139, - "step": 5510 - }, - { - "epoch": 0.5608046327339226, - "grad_norm": 2.4375, - "learning_rate": 2.060147242557674e-05, - "loss": 0.183, - "step": 5520 - }, - { - "epoch": 0.5618205831555421, - "grad_norm": 5.5, - "learning_rate": 2.0522141380299308e-05, - "loss": 0.1673, - "step": 5530 - }, - { - "epoch": 0.5628365335771615, - "grad_norm": 4.25, - "learning_rate": 2.044285689195258e-05, - "loss": 0.1674, - "step": 5540 - }, - { - "epoch": 0.5638524839987809, - "grad_norm": 2.109375, - "learning_rate": 2.0363619784868604e-05, - "loss": 0.1531, - "step": 5550 - }, - { - "epoch": 0.5648684344204002, - "grad_norm": 2.59375, - "learning_rate": 2.0284430882886836e-05, - "loss": 0.1665, - "step": 5560 - }, - { - "epoch": 0.5658843848420197, - "grad_norm": 3.984375, - "learning_rate": 2.020529100934549e-05, - "loss": 0.1717, - "step": 5570 - }, - { - "epoch": 0.5669003352636391, - "grad_norm": 1.6015625, - "learning_rate": 2.012620098707306e-05, - "loss": 0.1167, - "step": 5580 - }, - { - "epoch": 0.5679162856852585, - "grad_norm": 6.0625, - "learning_rate": 2.004716163837972e-05, - "loss": 0.2084, - "step": 5590 - }, - { - "epoch": 0.5689322361068779, - "grad_norm": 2.5625, - "learning_rate": 1.996817378504876e-05, - "loss": 0.1939, - "step": 5600 - }, - { - "epoch": 0.5699481865284974, - "grad_norm": 3.109375, - "learning_rate": 1.9889238248328108e-05, - "loss": 0.1241, - "step": 5610 - }, - { - "epoch": 0.5709641369501168, - "grad_norm": 4.875, - "learning_rate": 1.981035584892171e-05, - "loss": 0.1865, - "step": 5620 - }, - { - "epoch": 0.5719800873717362, - "grad_norm": 2.984375, - "learning_rate": 1.9731527406981072e-05, - "loss": 0.1639, - "step": 5630 - }, - { - "epoch": 0.5729960377933557, - "grad_norm": 4.4375, - "learning_rate": 1.9652753742096655e-05, - "loss": 0.2019, - "step": 5640 - }, - { - "epoch": 0.5740119882149751, - "grad_norm": 4.3125, - "learning_rate": 1.9574035673289432e-05, - "loss": 0.1829, - "step": 5650 - }, - { - "epoch": 0.5750279386365945, - "grad_norm": 3.203125, - "learning_rate": 1.9495374019002312e-05, - "loss": 0.2267, - "step": 5660 - }, - { - "epoch": 0.5760438890582139, - "grad_norm": 1.765625, - "learning_rate": 1.9416769597091673e-05, - "loss": 0.1411, - "step": 5670 - }, - { - "epoch": 0.5770598394798334, - "grad_norm": 2.640625, - "learning_rate": 1.9338223224818818e-05, - "loss": 0.1476, - "step": 5680 - }, - { - "epoch": 0.5780757899014528, - "grad_norm": 4.84375, - "learning_rate": 1.9259735718841524e-05, - "loss": 0.1417, - "step": 5690 - }, - { - "epoch": 0.5790917403230722, - "grad_norm": 2.421875, - "learning_rate": 1.918130789520551e-05, - "loss": 0.1592, - "step": 5700 - }, - { - "epoch": 0.5801076907446917, - "grad_norm": 2.984375, - "learning_rate": 1.9102940569335963e-05, - "loss": 0.161, - "step": 5710 - }, - { - "epoch": 0.5811236411663111, - "grad_norm": 1.0234375, - "learning_rate": 1.9024634556029093e-05, - "loss": 0.1614, - "step": 5720 - }, - { - "epoch": 0.5821395915879305, - "grad_norm": 2.90625, - "learning_rate": 1.89463906694436e-05, - "loss": 0.1505, - "step": 5730 - }, - { - "epoch": 0.5831555420095499, - "grad_norm": 2.875, - "learning_rate": 1.8868209723092286e-05, - "loss": 0.1674, - "step": 5740 - }, - { - "epoch": 0.5841714924311694, - "grad_norm": 0.408203125, - "learning_rate": 1.8790092529833508e-05, - "loss": 0.1468, - "step": 5750 - }, - { - "epoch": 0.5851874428527888, - "grad_norm": 5.1875, - "learning_rate": 1.871203990186281e-05, - "loss": 0.1903, - "step": 5760 - }, - { - "epoch": 0.5862033932744082, - "grad_norm": 0.5546875, - "learning_rate": 1.8634052650704415e-05, - "loss": 0.2644, - "step": 5770 - }, - { - "epoch": 0.5872193436960277, - "grad_norm": 3.203125, - "learning_rate": 1.8556131587202848e-05, - "loss": 0.1968, - "step": 5780 - }, - { - "epoch": 0.5882352941176471, - "grad_norm": 2.484375, - "learning_rate": 1.8478277521514424e-05, - "loss": 0.2249, - "step": 5790 - }, - { - "epoch": 0.5892512445392665, - "grad_norm": 4.0, - "learning_rate": 1.8400491263098906e-05, - "loss": 0.1881, - "step": 5800 - }, - { - "epoch": 0.5902671949608859, - "grad_norm": 1.90625, - "learning_rate": 1.832277362071106e-05, - "loss": 0.1352, - "step": 5810 - }, - { - "epoch": 0.5912831453825054, - "grad_norm": 2.765625, - "learning_rate": 1.824512540239221e-05, - "loss": 0.2737, - "step": 5820 - }, - { - "epoch": 0.5922990958041248, - "grad_norm": 2.609375, - "learning_rate": 1.81675474154619e-05, - "loss": 0.1566, - "step": 5830 - }, - { - "epoch": 0.5933150462257442, - "grad_norm": 2.6875, - "learning_rate": 1.8090040466509444e-05, - "loss": 0.1999, - "step": 5840 - }, - { - "epoch": 0.5943309966473637, - "grad_norm": 2.609375, - "learning_rate": 1.8012605361385592e-05, - "loss": 0.2372, - "step": 5850 - }, - { - "epoch": 0.5953469470689831, - "grad_norm": 8.125, - "learning_rate": 1.7935242905194087e-05, - "loss": 0.2411, - "step": 5860 - }, - { - "epoch": 0.5963628974906025, - "grad_norm": 3.46875, - "learning_rate": 1.785795390228336e-05, - "loss": 0.138, - "step": 5870 - }, - { - "epoch": 0.5973788479122218, - "grad_norm": 2.3125, - "learning_rate": 1.7780739156238125e-05, - "loss": 0.1867, - "step": 5880 - }, - { - "epoch": 0.5983947983338413, - "grad_norm": 4.0625, - "learning_rate": 1.770359946987105e-05, - "loss": 0.2091, - "step": 5890 - }, - { - "epoch": 0.5994107487554607, - "grad_norm": 5.21875, - "learning_rate": 1.7626535645214378e-05, - "loss": 0.2091, - "step": 5900 - }, - { - "epoch": 0.6004266991770801, - "grad_norm": 3.15625, - "learning_rate": 1.7549548483511614e-05, - "loss": 0.1927, - "step": 5910 - }, - { - "epoch": 0.6014426495986995, - "grad_norm": 4.71875, - "learning_rate": 1.7472638785209198e-05, - "loss": 0.1893, - "step": 5920 - }, - { - "epoch": 0.602458600020319, - "grad_norm": 3.015625, - "learning_rate": 1.7395807349948145e-05, - "loss": 0.1557, - "step": 5930 - }, - { - "epoch": 0.6034745504419384, - "grad_norm": 2.9375, - "learning_rate": 1.73190549765558e-05, - "loss": 0.1717, - "step": 5940 - }, - { - "epoch": 0.6044905008635578, - "grad_norm": 3.109375, - "learning_rate": 1.724238246303745e-05, - "loss": 0.1879, - "step": 5950 - }, - { - "epoch": 0.6055064512851773, - "grad_norm": 3.875, - "learning_rate": 1.71657906065681e-05, - "loss": 0.1908, - "step": 5960 - }, - { - "epoch": 0.6065224017067967, - "grad_norm": 5.09375, - "learning_rate": 1.7089280203484115e-05, - "loss": 0.1712, - "step": 5970 - }, - { - "epoch": 0.6075383521284161, - "grad_norm": 3.015625, - "learning_rate": 1.701285204927502e-05, - "loss": 0.1454, - "step": 5980 - }, - { - "epoch": 0.6085543025500355, - "grad_norm": 3.265625, - "learning_rate": 1.693650693857515e-05, - "loss": 0.2283, - "step": 5990 - }, - { - "epoch": 0.609570252971655, - "grad_norm": 3.40625, - "learning_rate": 1.6860245665155466e-05, - "loss": 0.2188, - "step": 6000 - }, - { - "epoch": 0.6105862033932744, - "grad_norm": 2.5625, - "learning_rate": 1.678406902191521e-05, - "loss": 0.1605, - "step": 6010 - }, - { - "epoch": 0.6116021538148938, - "grad_norm": 0.6796875, - "learning_rate": 1.670797780087374e-05, - "loss": 0.1472, - "step": 6020 - }, - { - "epoch": 0.6126181042365133, - "grad_norm": 2.234375, - "learning_rate": 1.6631972793162288e-05, - "loss": 0.1676, - "step": 6030 - }, - { - "epoch": 0.6136340546581327, - "grad_norm": 1.25, - "learning_rate": 1.6556054789015662e-05, - "loss": 0.1508, - "step": 6040 - }, - { - "epoch": 0.6146500050797521, - "grad_norm": 4.78125, - "learning_rate": 1.6480224577764132e-05, - "loss": 0.1981, - "step": 6050 - }, - { - "epoch": 0.6156659555013715, - "grad_norm": 3.46875, - "learning_rate": 1.6404482947825137e-05, - "loss": 0.2514, - "step": 6060 - }, - { - "epoch": 0.616681905922991, - "grad_norm": 1.265625, - "learning_rate": 1.6328830686695154e-05, - "loss": 0.2397, - "step": 6070 - }, - { - "epoch": 0.6176978563446104, - "grad_norm": 1.953125, - "learning_rate": 1.625326858094144e-05, - "loss": 0.1523, - "step": 6080 - }, - { - "epoch": 0.6187138067662298, - "grad_norm": 3.484375, - "learning_rate": 1.6177797416193953e-05, - "loss": 0.218, - "step": 6090 - }, - { - "epoch": 0.6197297571878493, - "grad_norm": 3.484375, - "learning_rate": 1.6102417977137052e-05, - "loss": 0.1476, - "step": 6100 - }, - { - "epoch": 0.6207457076094687, - "grad_norm": 4.90625, - "learning_rate": 1.602713104750147e-05, - "loss": 0.1818, - "step": 6110 - }, - { - "epoch": 0.6217616580310881, - "grad_norm": 4.375, - "learning_rate": 1.5951937410056087e-05, - "loss": 0.2061, - "step": 6120 - }, - { - "epoch": 0.6227776084527075, - "grad_norm": 6.3125, - "learning_rate": 1.587683784659979e-05, - "loss": 0.1566, - "step": 6130 - }, - { - "epoch": 0.623793558874327, - "grad_norm": 2.828125, - "learning_rate": 1.58018331379534e-05, - "loss": 0.1376, - "step": 6140 - }, - { - "epoch": 0.6248095092959464, - "grad_norm": 2.40625, - "learning_rate": 1.572692406395149e-05, - "loss": 0.1655, - "step": 6150 - }, - { - "epoch": 0.6258254597175658, - "grad_norm": 4.34375, - "learning_rate": 1.5652111403434338e-05, - "loss": 0.2363, - "step": 6160 - }, - { - "epoch": 0.6268414101391853, - "grad_norm": 2.453125, - "learning_rate": 1.5577395934239757e-05, - "loss": 0.2464, - "step": 6170 - }, - { - "epoch": 0.6278573605608047, - "grad_norm": 2.53125, - "learning_rate": 1.5502778433195085e-05, - "loss": 0.1898, - "step": 6180 - }, - { - "epoch": 0.628873310982424, - "grad_norm": 2.28125, - "learning_rate": 1.5428259676109048e-05, - "loss": 0.1804, - "step": 6190 - }, - { - "epoch": 0.6298892614040434, - "grad_norm": 4.3125, - "learning_rate": 1.5353840437763732e-05, - "loss": 0.1409, - "step": 6200 - }, - { - "epoch": 0.630905211825663, - "grad_norm": 2.5625, - "learning_rate": 1.5279521491906496e-05, - "loss": 0.2449, - "step": 6210 - }, - { - "epoch": 0.6319211622472823, - "grad_norm": 3.0625, - "learning_rate": 1.520530361124195e-05, - "loss": 0.2103, - "step": 6220 - }, - { - "epoch": 0.6329371126689017, - "grad_norm": 2.609375, - "learning_rate": 1.5131187567423937e-05, - "loss": 0.2156, - "step": 6230 - }, - { - "epoch": 0.6339530630905211, - "grad_norm": 2.703125, - "learning_rate": 1.5057174131047446e-05, - "loss": 0.161, - "step": 6240 - }, - { - "epoch": 0.6349690135121406, - "grad_norm": 3.265625, - "learning_rate": 1.4983264071640679e-05, - "loss": 0.1757, - "step": 6250 - }, - { - "epoch": 0.63598496393376, - "grad_norm": 3.15625, - "learning_rate": 1.490945815765699e-05, - "loss": 0.2011, - "step": 6260 - }, - { - "epoch": 0.6370009143553794, - "grad_norm": 5.375, - "learning_rate": 1.4835757156466945e-05, - "loss": 0.1658, - "step": 6270 - }, - { - "epoch": 0.6380168647769989, - "grad_norm": 2.984375, - "learning_rate": 1.4762161834350271e-05, - "loss": 0.1754, - "step": 6280 - }, - { - "epoch": 0.6390328151986183, - "grad_norm": 2.015625, - "learning_rate": 1.4688672956487987e-05, - "loss": 0.1427, - "step": 6290 - }, - { - "epoch": 0.6400487656202377, - "grad_norm": 3.78125, - "learning_rate": 1.4615291286954352e-05, - "loss": 0.1517, - "step": 6300 - }, - { - "epoch": 0.6410647160418571, - "grad_norm": 2.859375, - "learning_rate": 1.4542017588709005e-05, - "loss": 0.2348, - "step": 6310 - }, - { - "epoch": 0.6420806664634766, - "grad_norm": 2.421875, - "learning_rate": 1.4468852623588961e-05, - "loss": 0.2089, - "step": 6320 - }, - { - "epoch": 0.643096616885096, - "grad_norm": 2.15625, - "learning_rate": 1.4395797152300719e-05, - "loss": 0.1702, - "step": 6330 - }, - { - "epoch": 0.6441125673067154, - "grad_norm": 1.53125, - "learning_rate": 1.4322851934412382e-05, - "loss": 0.1017, - "step": 6340 - }, - { - "epoch": 0.6451285177283349, - "grad_norm": 1.90625, - "learning_rate": 1.4250017728345716e-05, - "loss": 0.1813, - "step": 6350 - }, - { - "epoch": 0.6461444681499543, - "grad_norm": 2.015625, - "learning_rate": 1.4177295291368292e-05, - "loss": 0.1095, - "step": 6360 - }, - { - "epoch": 0.6471604185715737, - "grad_norm": 2.625, - "learning_rate": 1.410468537958558e-05, - "loss": 0.2259, - "step": 6370 - }, - { - "epoch": 0.6481763689931931, - "grad_norm": 3.5, - "learning_rate": 1.4032188747933136e-05, - "loss": 0.1595, - "step": 6380 - }, - { - "epoch": 0.6491923194148126, - "grad_norm": 5.21875, - "learning_rate": 1.39598061501687e-05, - "loss": 0.2226, - "step": 6390 - }, - { - "epoch": 0.650208269836432, - "grad_norm": 5.34375, - "learning_rate": 1.388753833886442e-05, - "loss": 0.2132, - "step": 6400 - }, - { - "epoch": 0.6512242202580514, - "grad_norm": 3.640625, - "learning_rate": 1.3815386065398945e-05, - "loss": 0.1227, - "step": 6410 - }, - { - "epoch": 0.6522401706796709, - "grad_norm": 1.0, - "learning_rate": 1.3743350079949705e-05, - "loss": 0.1755, - "step": 6420 - }, - { - "epoch": 0.6532561211012903, - "grad_norm": 2.359375, - "learning_rate": 1.3671431131485057e-05, - "loss": 0.1552, - "step": 6430 - }, - { - "epoch": 0.6542720715229097, - "grad_norm": 5.3125, - "learning_rate": 1.3599629967756483e-05, - "loss": 0.1917, - "step": 6440 - }, - { - "epoch": 0.6552880219445291, - "grad_norm": 4.625, - "learning_rate": 1.3527947335290877e-05, - "loss": 0.1812, - "step": 6450 - }, - { - "epoch": 0.6563039723661486, - "grad_norm": 1.234375, - "learning_rate": 1.3456383979382708e-05, - "loss": 0.1896, - "step": 6460 - }, - { - "epoch": 0.657319922787768, - "grad_norm": 3.984375, - "learning_rate": 1.3384940644086352e-05, - "loss": 0.1484, - "step": 6470 - }, - { - "epoch": 0.6583358732093874, - "grad_norm": 2.40625, - "learning_rate": 1.3313618072208268e-05, - "loss": 0.1334, - "step": 6480 - }, - { - "epoch": 0.6593518236310069, - "grad_norm": 4.375, - "learning_rate": 1.3242417005299357e-05, - "loss": 0.1351, - "step": 6490 - }, - { - "epoch": 0.6603677740526263, - "grad_norm": 2.640625, - "learning_rate": 1.31713381836472e-05, - "loss": 0.1717, - "step": 6500 - }, - { - "epoch": 0.6613837244742456, - "grad_norm": 2.640625, - "learning_rate": 1.3100382346268392e-05, - "loss": 0.1867, - "step": 6510 - }, - { - "epoch": 0.662399674895865, - "grad_norm": 1.734375, - "learning_rate": 1.3029550230900812e-05, - "loss": 0.1997, - "step": 6520 - }, - { - "epoch": 0.6634156253174845, - "grad_norm": 3.609375, - "learning_rate": 1.2958842573996016e-05, - "loss": 0.1969, - "step": 6530 - }, - { - "epoch": 0.6644315757391039, - "grad_norm": 3.578125, - "learning_rate": 1.2888260110711525e-05, - "loss": 0.1469, - "step": 6540 - }, - { - "epoch": 0.6654475261607233, - "grad_norm": 1.3515625, - "learning_rate": 1.2817803574903212e-05, - "loss": 0.1524, - "step": 6550 - }, - { - "epoch": 0.6664634765823427, - "grad_norm": 2.109375, - "learning_rate": 1.2747473699117668e-05, - "loss": 0.159, - "step": 6560 - }, - { - "epoch": 0.6674794270039622, - "grad_norm": 1.53125, - "learning_rate": 1.267727121458458e-05, - "loss": 0.1999, - "step": 6570 - }, - { - "epoch": 0.6684953774255816, - "grad_norm": 1.7265625, - "learning_rate": 1.2607196851209137e-05, - "loss": 0.2216, - "step": 6580 - }, - { - "epoch": 0.669511327847201, - "grad_norm": 3.125, - "learning_rate": 1.2537251337564412e-05, - "loss": 0.1607, - "step": 6590 - }, - { - "epoch": 0.6705272782688205, - "grad_norm": 2.421875, - "learning_rate": 1.2467435400883839e-05, - "loss": 0.2187, - "step": 6600 - }, - { - "epoch": 0.6715432286904399, - "grad_norm": 1.5078125, - "learning_rate": 1.239774976705359e-05, - "loss": 0.1753, - "step": 6610 - }, - { - "epoch": 0.6725591791120593, - "grad_norm": 1.140625, - "learning_rate": 1.2328195160605092e-05, - "loss": 0.194, - "step": 6620 - }, - { - "epoch": 0.6735751295336787, - "grad_norm": 4.9375, - "learning_rate": 1.225877230470743e-05, - "loss": 0.1485, - "step": 6630 - }, - { - "epoch": 0.6745910799552982, - "grad_norm": 3.65625, - "learning_rate": 1.218948192115988e-05, - "loss": 0.1847, - "step": 6640 - }, - { - "epoch": 0.6756070303769176, - "grad_norm": 3.875, - "learning_rate": 1.21203247303844e-05, - "loss": 0.1874, - "step": 6650 - }, - { - "epoch": 0.676622980798537, - "grad_norm": 2.65625, - "learning_rate": 1.2051301451418073e-05, - "loss": 0.2377, - "step": 6660 - }, - { - "epoch": 0.6776389312201565, - "grad_norm": 2.09375, - "learning_rate": 1.198241280190574e-05, - "loss": 0.1508, - "step": 6670 - }, - { - "epoch": 0.6786548816417759, - "grad_norm": 2.203125, - "learning_rate": 1.1913659498092431e-05, - "loss": 0.1537, - "step": 6680 - }, - { - "epoch": 0.6796708320633953, - "grad_norm": 2.484375, - "learning_rate": 1.184504225481601e-05, - "loss": 0.2339, - "step": 6690 - }, - { - "epoch": 0.6806867824850147, - "grad_norm": 5.625, - "learning_rate": 1.177656178549966e-05, - "loss": 0.2102, - "step": 6700 - }, - { - "epoch": 0.6817027329066342, - "grad_norm": 2.5, - "learning_rate": 1.1708218802144536e-05, - "loss": 0.1435, - "step": 6710 - }, - { - "epoch": 0.6827186833282536, - "grad_norm": 3.84375, - "learning_rate": 1.1640014015322323e-05, - "loss": 0.1823, - "step": 6720 - }, - { - "epoch": 0.683734633749873, - "grad_norm": 2.359375, - "learning_rate": 1.1571948134167862e-05, - "loss": 0.1154, - "step": 6730 - }, - { - "epoch": 0.6847505841714925, - "grad_norm": 2.90625, - "learning_rate": 1.1504021866371761e-05, - "loss": 0.2105, - "step": 6740 - }, - { - "epoch": 0.6857665345931119, - "grad_norm": 5.46875, - "learning_rate": 1.143623591817304e-05, - "loss": 0.1317, - "step": 6750 - }, - { - "epoch": 0.6867824850147313, - "grad_norm": 3.34375, - "learning_rate": 1.1368590994351835e-05, - "loss": 0.1406, - "step": 6760 - }, - { - "epoch": 0.6877984354363507, - "grad_norm": 3.78125, - "learning_rate": 1.130108779822198e-05, - "loss": 0.1425, - "step": 6770 - }, - { - "epoch": 0.6888143858579702, - "grad_norm": 0.77734375, - "learning_rate": 1.1233727031623783e-05, - "loss": 0.1623, - "step": 6780 - }, - { - "epoch": 0.6898303362795896, - "grad_norm": 4.625, - "learning_rate": 1.1166509394916682e-05, - "loss": 0.1591, - "step": 6790 - }, - { - "epoch": 0.690846286701209, - "grad_norm": 3.84375, - "learning_rate": 1.1099435586971982e-05, - "loss": 0.1758, - "step": 6800 - }, - { - "epoch": 0.6918622371228285, - "grad_norm": 2.4375, - "learning_rate": 1.1032506305165555e-05, - "loss": 0.1018, - "step": 6810 - }, - { - "epoch": 0.6928781875444479, - "grad_norm": 3.203125, - "learning_rate": 1.0965722245370641e-05, - "loss": 0.1485, - "step": 6820 - }, - { - "epoch": 0.6938941379660672, - "grad_norm": 0.7109375, - "learning_rate": 1.0899084101950561e-05, - "loss": 0.1762, - "step": 6830 - }, - { - "epoch": 0.6949100883876866, - "grad_norm": 1.9765625, - "learning_rate": 1.0832592567751555e-05, - "loss": 0.1402, - "step": 6840 - }, - { - "epoch": 0.6959260388093061, - "grad_norm": 1.4609375, - "learning_rate": 1.0766248334095505e-05, - "loss": 0.2278, - "step": 6850 - }, - { - "epoch": 0.6969419892309255, - "grad_norm": 3.953125, - "learning_rate": 1.0700052090772828e-05, - "loss": 0.1969, - "step": 6860 - }, - { - "epoch": 0.6979579396525449, - "grad_norm": 2.453125, - "learning_rate": 1.0634004526035249e-05, - "loss": 0.2073, - "step": 6870 - }, - { - "epoch": 0.6989738900741643, - "grad_norm": 1.6171875, - "learning_rate": 1.0568106326588645e-05, - "loss": 0.1902, - "step": 6880 - }, - { - "epoch": 0.6999898404957838, - "grad_norm": 1.2734375, - "learning_rate": 1.0502358177585953e-05, - "loss": 0.2165, - "step": 6890 - }, - { - "epoch": 0.7010057909174032, - "grad_norm": 1.671875, - "learning_rate": 1.0436760762619977e-05, - "loss": 0.1952, - "step": 6900 - }, - { - "epoch": 0.7020217413390226, - "grad_norm": 2.8125, - "learning_rate": 1.0371314763716347e-05, - "loss": 0.1422, - "step": 6910 - }, - { - "epoch": 0.7030376917606421, - "grad_norm": 2.53125, - "learning_rate": 1.0306020861326388e-05, - "loss": 0.0961, - "step": 6920 - }, - { - "epoch": 0.7040536421822615, - "grad_norm": 3.046875, - "learning_rate": 1.0240879734320068e-05, - "loss": 0.1542, - "step": 6930 - }, - { - "epoch": 0.7050695926038809, - "grad_norm": 2.859375, - "learning_rate": 1.0175892059978901e-05, - "loss": 0.1748, - "step": 6940 - }, - { - "epoch": 0.7060855430255003, - "grad_norm": 2.671875, - "learning_rate": 1.0111058513988958e-05, - "loss": 0.0819, - "step": 6950 - }, - { - "epoch": 0.7071014934471198, - "grad_norm": 3.5625, - "learning_rate": 1.0046379770433803e-05, - "loss": 0.1933, - "step": 6960 - }, - { - "epoch": 0.7081174438687392, - "grad_norm": 2.859375, - "learning_rate": 9.98185650178749e-06, - "loss": 0.1891, - "step": 6970 - }, - { - "epoch": 0.7091333942903586, - "grad_norm": 3.15625, - "learning_rate": 9.917489378907591e-06, - "loss": 0.2102, - "step": 6980 - }, - { - "epoch": 0.7101493447119781, - "grad_norm": 6.40625, - "learning_rate": 9.853279071028212e-06, - "loss": 0.1714, - "step": 6990 - }, - { - "epoch": 0.7111652951335975, - "grad_norm": 2.375, - "learning_rate": 9.78922624575303e-06, - "loss": 0.1299, - "step": 7000 - } - ], - "logging_steps": 10, - "max_steps": 9843, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "total_flos": 0.0, - "train_batch_size": 7, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoints/checkpoint-7000/training_args.bin b/checkpoints/checkpoint-7000/training_args.bin deleted file mode 100644 index 6bb0a4ec10ab29ca1942aecdfa4212d352e373f1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-7000/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 -size 4984 diff --git a/checkpoints/checkpoint-7500/adapter_config.json b/checkpoints/checkpoint-7500/adapter_config.json deleted file mode 100644 index 2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-7500/adapter_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "alpha_pattern": {}, - "auto_mapping": null, - "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": "gaussian", - "layer_replication": null, - "layers_pattern": null, - "layers_to_transform": null, - "loftq_config": {}, - "lora_alpha": 32, - "lora_dropout": 0.1, - "megatron_config": null, - "megatron_core": "megatron.core", - "modules_to_save": null, - "peft_type": "LORA", - "r": 8, - "rank_pattern": {}, - "revision": null, - "target_modules": [ - "v_proj", - "q_proj" - ], - "task_type": "CAUSAL_LM", - "use_dora": false, - "use_rslora": false -} \ No newline at end of file diff --git a/checkpoints/checkpoint-7500/adapter_model.safetensors b/checkpoints/checkpoint-7500/adapter_model.safetensors deleted file mode 100644 index 8b566b27331cac4819b9e695fbf784bff8e79cd5..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-7500/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eea7fbe808952b90c42a26bb6a156e294c6b01b74f4561e32845a867894829fc -size 5919456 diff --git a/checkpoints/checkpoint-7500/optimizer.pt b/checkpoints/checkpoint-7500/optimizer.pt deleted file mode 100644 index fd01feb285dc960cdf690295976b4050319066c1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-7500/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5dd7bb44df6f3a93ddba0c17a7309fbe69b5d9611641b66779f565c2707e80dd -size 11930938 diff --git a/checkpoints/checkpoint-7500/rng_state_0.pth b/checkpoints/checkpoint-7500/rng_state_0.pth deleted file mode 100644 index 0b99b3affc0b088b35c4713d5dbf363b6fb09e01..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-7500/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:81a176e4f417232286c66488b5554a046fc1af84e1b5eff446ad37a4dc31c907 -size 15024 diff --git a/checkpoints/checkpoint-7500/rng_state_1.pth b/checkpoints/checkpoint-7500/rng_state_1.pth deleted file mode 100644 index 9e56badfa0d1b294971878e842fe14a8595016f1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-7500/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3899c82e743501e2db6cd0e409a1b646a2eae511ad1e21aa77bc3f897edbf07a -size 15024 diff --git a/checkpoints/checkpoint-7500/rng_state_2.pth b/checkpoints/checkpoint-7500/rng_state_2.pth deleted file mode 100644 index d5c75385d7ded955107873c0bccd2a63bd21f7ce..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-7500/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e12f41676d53322341924bbe6951161bec80e7d0e4adb780e12732cfa714e98d -size 15024 diff --git a/checkpoints/checkpoint-7500/rng_state_3.pth b/checkpoints/checkpoint-7500/rng_state_3.pth deleted file mode 100644 index 0f0ce3c448fcfd7f4444003295f128c63d9e46cd..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-7500/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b87c6fdf947fac25cf632f98a25e1c5f87369325d38ab0c3dfe29dea4f62eb75 -size 15024 diff --git a/checkpoints/checkpoint-7500/scheduler.pt b/checkpoints/checkpoint-7500/scheduler.pt deleted file mode 100644 index 5a8d5e25619fe365bd3104d2c93dae8be1ab1f87..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-7500/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:01aa4970faf173b76d7b990defe083ac27fc2fbe404a2ecca54f2098c3a5e177 -size 1064 diff --git a/checkpoints/checkpoint-7500/trainer_state.json b/checkpoints/checkpoint-7500/trainer_state.json deleted file mode 100644 index ebe79f3a10531feb2f6b9e25d7e9001aa29a39fc..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-7500/trainer_state.json +++ /dev/null @@ -1,5271 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.7619628162145687, - "eval_steps": 500, - "global_step": 7500, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001015950421619425, - "grad_norm": 17.625, - "learning_rate": 5e-06, - "loss": 3.4264, - "step": 10 - }, - { - "epoch": 0.00203190084323885, - "grad_norm": 12.5625, - "learning_rate": 1e-05, - "loss": 3.432, - "step": 20 - }, - { - "epoch": 0.003047851264858275, - "grad_norm": 14.0625, - "learning_rate": 1.5e-05, - "loss": 3.23, - "step": 30 - }, - { - "epoch": 0.0040638016864777, - "grad_norm": 12.4375, - "learning_rate": 2e-05, - "loss": 2.9762, - "step": 40 - }, - { - "epoch": 0.005079752108097125, - "grad_norm": 10.0625, - "learning_rate": 2.5e-05, - "loss": 2.6173, - "step": 50 - }, - { - "epoch": 0.00609570252971655, - "grad_norm": 10.1875, - "learning_rate": 3e-05, - "loss": 2.2004, - "step": 60 - }, - { - "epoch": 0.007111652951335975, - "grad_norm": 7.03125, - "learning_rate": 3.5e-05, - "loss": 1.4176, - "step": 70 - }, - { - "epoch": 0.0081276033729554, - "grad_norm": 4.375, - "learning_rate": 4e-05, - "loss": 1.0122, - "step": 80 - }, - { - "epoch": 0.009143553794574825, - "grad_norm": 6.5625, - "learning_rate": 4.5e-05, - "loss": 0.9116, - "step": 90 - }, - { - "epoch": 0.01015950421619425, - "grad_norm": 5.28125, - "learning_rate": 5e-05, - "loss": 0.6832, - "step": 100 - }, - { - "epoch": 0.011175454637813675, - "grad_norm": 5.5, - "learning_rate": 4.9999870035728426e-05, - "loss": 0.7355, - "step": 110 - }, - { - "epoch": 0.0121914050594331, - "grad_norm": 5.1875, - "learning_rate": 4.9999480144264944e-05, - "loss": 0.6673, - "step": 120 - }, - { - "epoch": 0.013207355481052525, - "grad_norm": 4.5, - "learning_rate": 4.9998830329663314e-05, - "loss": 0.6792, - "step": 130 - }, - { - "epoch": 0.01422330590267195, - "grad_norm": 3.9375, - "learning_rate": 4.9997920598679756e-05, - "loss": 0.6207, - "step": 140 - }, - { - "epoch": 0.015239256324291375, - "grad_norm": 3.15625, - "learning_rate": 4.999675096077286e-05, - "loss": 0.483, - "step": 150 - }, - { - "epoch": 0.0162552067459108, - "grad_norm": 5.28125, - "learning_rate": 4.999532142810354e-05, - "loss": 0.5319, - "step": 160 - }, - { - "epoch": 0.017271157167530225, - "grad_norm": 4.59375, - "learning_rate": 4.999363201553483e-05, - "loss": 0.6052, - "step": 170 - }, - { - "epoch": 0.01828710758914965, - "grad_norm": 5.03125, - "learning_rate": 4.9991682740631794e-05, - "loss": 0.4258, - "step": 180 - }, - { - "epoch": 0.019303058010769075, - "grad_norm": 3.859375, - "learning_rate": 4.998947362366133e-05, - "loss": 0.4309, - "step": 190 - }, - { - "epoch": 0.0203190084323885, - "grad_norm": 3.328125, - "learning_rate": 4.998700468759193e-05, - "loss": 0.3957, - "step": 200 - }, - { - "epoch": 0.021334958854007924, - "grad_norm": 4.9375, - "learning_rate": 4.9984275958093475e-05, - "loss": 0.4777, - "step": 210 - }, - { - "epoch": 0.02235090927562735, - "grad_norm": 4.78125, - "learning_rate": 4.998128746353695e-05, - "loss": 0.3549, - "step": 220 - }, - { - "epoch": 0.023366859697246774, - "grad_norm": 4.0625, - "learning_rate": 4.997803923499417e-05, - "loss": 0.4447, - "step": 230 - }, - { - "epoch": 0.0243828101188662, - "grad_norm": 6.375, - "learning_rate": 4.99745313062374e-05, - "loss": 0.3808, - "step": 240 - }, - { - "epoch": 0.025398760540485624, - "grad_norm": 3.59375, - "learning_rate": 4.99707637137391e-05, - "loss": 0.3827, - "step": 250 - }, - { - "epoch": 0.02641471096210505, - "grad_norm": 3.015625, - "learning_rate": 4.996673649667145e-05, - "loss": 0.3694, - "step": 260 - }, - { - "epoch": 0.027430661383724474, - "grad_norm": 2.296875, - "learning_rate": 4.9962449696906e-05, - "loss": 0.3586, - "step": 270 - }, - { - "epoch": 0.0284466118053439, - "grad_norm": 4.125, - "learning_rate": 4.9957903359013214e-05, - "loss": 0.3832, - "step": 280 - }, - { - "epoch": 0.029462562226963324, - "grad_norm": 3.296875, - "learning_rate": 4.995309753026201e-05, - "loss": 0.328, - "step": 290 - }, - { - "epoch": 0.03047851264858275, - "grad_norm": 4.5, - "learning_rate": 4.994803226061927e-05, - "loss": 0.3667, - "step": 300 - }, - { - "epoch": 0.03149446307020217, - "grad_norm": 4.3125, - "learning_rate": 4.994270760274933e-05, - "loss": 0.3811, - "step": 310 - }, - { - "epoch": 0.0325104134918216, - "grad_norm": 3.421875, - "learning_rate": 4.99371236120134e-05, - "loss": 0.3065, - "step": 320 - }, - { - "epoch": 0.03352636391344102, - "grad_norm": 4.6875, - "learning_rate": 4.993128034646902e-05, - "loss": 0.4177, - "step": 330 - }, - { - "epoch": 0.03454231433506045, - "grad_norm": 3.046875, - "learning_rate": 4.992517786686947e-05, - "loss": 0.33, - "step": 340 - }, - { - "epoch": 0.03555826475667987, - "grad_norm": 1.8828125, - "learning_rate": 4.9918816236663077e-05, - "loss": 0.3287, - "step": 350 - }, - { - "epoch": 0.0365742151782993, - "grad_norm": 3.8125, - "learning_rate": 4.991219552199262e-05, - "loss": 0.2934, - "step": 360 - }, - { - "epoch": 0.03759016559991872, - "grad_norm": 4.28125, - "learning_rate": 4.99053157916946e-05, - "loss": 0.3176, - "step": 370 - }, - { - "epoch": 0.03860611602153815, - "grad_norm": 2.609375, - "learning_rate": 4.989817711729856e-05, - "loss": 0.3318, - "step": 380 - }, - { - "epoch": 0.03962206644315757, - "grad_norm": 2.375, - "learning_rate": 4.98907795730263e-05, - "loss": 0.3234, - "step": 390 - }, - { - "epoch": 0.040638016864777, - "grad_norm": 4.46875, - "learning_rate": 4.988312323579114e-05, - "loss": 0.267, - "step": 400 - }, - { - "epoch": 0.04165396728639642, - "grad_norm": 3.75, - "learning_rate": 4.98752081851971e-05, - "loss": 0.3081, - "step": 410 - }, - { - "epoch": 0.04266991770801585, - "grad_norm": 2.203125, - "learning_rate": 4.986703450353809e-05, - "loss": 0.2917, - "step": 420 - }, - { - "epoch": 0.04368586812963527, - "grad_norm": 1.6015625, - "learning_rate": 4.985860227579703e-05, - "loss": 0.2805, - "step": 430 - }, - { - "epoch": 0.0447018185512547, - "grad_norm": 3.140625, - "learning_rate": 4.984991158964499e-05, - "loss": 0.3534, - "step": 440 - }, - { - "epoch": 0.04571776897287412, - "grad_norm": 3.296875, - "learning_rate": 4.9840962535440265e-05, - "loss": 0.335, - "step": 450 - }, - { - "epoch": 0.04673371939449355, - "grad_norm": 3.25, - "learning_rate": 4.983175520622744e-05, - "loss": 0.2544, - "step": 460 - }, - { - "epoch": 0.04774966981611297, - "grad_norm": 2.25, - "learning_rate": 4.982228969773642e-05, - "loss": 0.3449, - "step": 470 - }, - { - "epoch": 0.0487656202377324, - "grad_norm": 4.9375, - "learning_rate": 4.9812566108381435e-05, - "loss": 0.2964, - "step": 480 - }, - { - "epoch": 0.04978157065935182, - "grad_norm": 1.5703125, - "learning_rate": 4.9802584539260035e-05, - "loss": 0.2799, - "step": 490 - }, - { - "epoch": 0.05079752108097125, - "grad_norm": 2.828125, - "learning_rate": 4.979234509415199e-05, - "loss": 0.3231, - "step": 500 - }, - { - "epoch": 0.05181347150259067, - "grad_norm": 2.9375, - "learning_rate": 4.978184787951828e-05, - "loss": 0.2943, - "step": 510 - }, - { - "epoch": 0.0528294219242101, - "grad_norm": 2.34375, - "learning_rate": 4.977109300449992e-05, - "loss": 0.2705, - "step": 520 - }, - { - "epoch": 0.05384537234582952, - "grad_norm": 3.140625, - "learning_rate": 4.9760080580916876e-05, - "loss": 0.2998, - "step": 530 - }, - { - "epoch": 0.05486132276744895, - "grad_norm": 3.5625, - "learning_rate": 4.974881072326688e-05, - "loss": 0.2595, - "step": 540 - }, - { - "epoch": 0.05587727318906837, - "grad_norm": 4.25, - "learning_rate": 4.9737283548724236e-05, - "loss": 0.2803, - "step": 550 - }, - { - "epoch": 0.0568932236106878, - "grad_norm": 4.0625, - "learning_rate": 4.97254991771386e-05, - "loss": 0.3511, - "step": 560 - }, - { - "epoch": 0.05790917403230722, - "grad_norm": 2.515625, - "learning_rate": 4.971345773103377e-05, - "loss": 0.312, - "step": 570 - }, - { - "epoch": 0.05892512445392665, - "grad_norm": 3.21875, - "learning_rate": 4.9701159335606365e-05, - "loss": 0.2482, - "step": 580 - }, - { - "epoch": 0.05994107487554607, - "grad_norm": 5.5, - "learning_rate": 4.968860411872454e-05, - "loss": 0.2537, - "step": 590 - }, - { - "epoch": 0.0609570252971655, - "grad_norm": 3.546875, - "learning_rate": 4.967579221092666e-05, - "loss": 0.3125, - "step": 600 - }, - { - "epoch": 0.06197297571878492, - "grad_norm": 2.984375, - "learning_rate": 4.966272374541996e-05, - "loss": 0.2354, - "step": 610 - }, - { - "epoch": 0.06298892614040434, - "grad_norm": 3.6875, - "learning_rate": 4.964939885807912e-05, - "loss": 0.3213, - "step": 620 - }, - { - "epoch": 0.06400487656202378, - "grad_norm": 2.140625, - "learning_rate": 4.9635817687444876e-05, - "loss": 0.3003, - "step": 630 - }, - { - "epoch": 0.0650208269836432, - "grad_norm": 3.484375, - "learning_rate": 4.962198037472259e-05, - "loss": 0.2996, - "step": 640 - }, - { - "epoch": 0.06603677740526262, - "grad_norm": 3.21875, - "learning_rate": 4.9607887063780776e-05, - "loss": 0.2257, - "step": 650 - }, - { - "epoch": 0.06705272782688204, - "grad_norm": 5.375, - "learning_rate": 4.9593537901149564e-05, - "loss": 0.223, - "step": 660 - }, - { - "epoch": 0.06806867824850148, - "grad_norm": 4.1875, - "learning_rate": 4.957893303601924e-05, - "loss": 0.3407, - "step": 670 - }, - { - "epoch": 0.0690846286701209, - "grad_norm": 3.328125, - "learning_rate": 4.956407262023866e-05, - "loss": 0.2589, - "step": 680 - }, - { - "epoch": 0.07010057909174032, - "grad_norm": 2.953125, - "learning_rate": 4.954895680831367e-05, - "loss": 0.2949, - "step": 690 - }, - { - "epoch": 0.07111652951335974, - "grad_norm": 4.0625, - "learning_rate": 4.9533585757405506e-05, - "loss": 0.2995, - "step": 700 - }, - { - "epoch": 0.07213247993497918, - "grad_norm": 4.625, - "learning_rate": 4.951795962732917e-05, - "loss": 0.2894, - "step": 710 - }, - { - "epoch": 0.0731484303565986, - "grad_norm": 3.0, - "learning_rate": 4.9502078580551755e-05, - "loss": 0.3082, - "step": 720 - }, - { - "epoch": 0.07416438077821802, - "grad_norm": 3.65625, - "learning_rate": 4.9485942782190734e-05, - "loss": 0.2308, - "step": 730 - }, - { - "epoch": 0.07518033119983744, - "grad_norm": 4.78125, - "learning_rate": 4.9469552400012306e-05, - "loss": 0.2272, - "step": 740 - }, - { - "epoch": 0.07619628162145688, - "grad_norm": 4.25, - "learning_rate": 4.94529076044296e-05, - "loss": 0.2701, - "step": 750 - }, - { - "epoch": 0.0772122320430763, - "grad_norm": 3.140625, - "learning_rate": 4.94360085685009e-05, - "loss": 0.2686, - "step": 760 - }, - { - "epoch": 0.07822818246469572, - "grad_norm": 0.765625, - "learning_rate": 4.9418855467927894e-05, - "loss": 0.2051, - "step": 770 - }, - { - "epoch": 0.07924413288631514, - "grad_norm": 1.796875, - "learning_rate": 4.940144848105379e-05, - "loss": 0.2267, - "step": 780 - }, - { - "epoch": 0.08026008330793458, - "grad_norm": 4.5625, - "learning_rate": 4.93837877888615e-05, - "loss": 0.2597, - "step": 790 - }, - { - "epoch": 0.081276033729554, - "grad_norm": 3.03125, - "learning_rate": 4.9365873574971745e-05, - "loss": 0.3701, - "step": 800 - }, - { - "epoch": 0.08229198415117342, - "grad_norm": 4.5625, - "learning_rate": 4.9347706025641136e-05, - "loss": 0.2559, - "step": 810 - }, - { - "epoch": 0.08330793457279284, - "grad_norm": 3.90625, - "learning_rate": 4.9329285329760275e-05, - "loss": 0.2799, - "step": 820 - }, - { - "epoch": 0.08432388499441228, - "grad_norm": 3.140625, - "learning_rate": 4.9310611678851735e-05, - "loss": 0.2866, - "step": 830 - }, - { - "epoch": 0.0853398354160317, - "grad_norm": 2.46875, - "learning_rate": 4.929168526706811e-05, - "loss": 0.3105, - "step": 840 - }, - { - "epoch": 0.08635578583765112, - "grad_norm": 13.625, - "learning_rate": 4.927250629119e-05, - "loss": 0.2454, - "step": 850 - }, - { - "epoch": 0.08737173625927054, - "grad_norm": 3.921875, - "learning_rate": 4.9253074950623925e-05, - "loss": 0.2424, - "step": 860 - }, - { - "epoch": 0.08838768668088998, - "grad_norm": 2.90625, - "learning_rate": 4.9233391447400286e-05, - "loss": 0.2481, - "step": 870 - }, - { - "epoch": 0.0894036371025094, - "grad_norm": 2.96875, - "learning_rate": 4.921345598617125e-05, - "loss": 0.2231, - "step": 880 - }, - { - "epoch": 0.09041958752412882, - "grad_norm": 5.375, - "learning_rate": 4.9193268774208654e-05, - "loss": 0.3447, - "step": 890 - }, - { - "epoch": 0.09143553794574824, - "grad_norm": 2.0, - "learning_rate": 4.9172830021401785e-05, - "loss": 0.229, - "step": 900 - }, - { - "epoch": 0.09245148836736768, - "grad_norm": 3.1875, - "learning_rate": 4.9152139940255245e-05, - "loss": 0.2122, - "step": 910 - }, - { - "epoch": 0.0934674387889871, - "grad_norm": 3.40625, - "learning_rate": 4.913119874588677e-05, - "loss": 0.2386, - "step": 920 - }, - { - "epoch": 0.09448338921060652, - "grad_norm": 1.4609375, - "learning_rate": 4.911000665602489e-05, - "loss": 0.1944, - "step": 930 - }, - { - "epoch": 0.09549933963222594, - "grad_norm": 5.0625, - "learning_rate": 4.9088563891006786e-05, - "loss": 0.2038, - "step": 940 - }, - { - "epoch": 0.09651529005384538, - "grad_norm": 4.53125, - "learning_rate": 4.906687067377592e-05, - "loss": 0.3122, - "step": 950 - }, - { - "epoch": 0.0975312404754648, - "grad_norm": 2.84375, - "learning_rate": 4.904492722987976e-05, - "loss": 0.3157, - "step": 960 - }, - { - "epoch": 0.09854719089708422, - "grad_norm": 2.171875, - "learning_rate": 4.902273378746738e-05, - "loss": 0.3077, - "step": 970 - }, - { - "epoch": 0.09956314131870364, - "grad_norm": 2.84375, - "learning_rate": 4.9000290577287165e-05, - "loss": 0.2756, - "step": 980 - }, - { - "epoch": 0.10057909174032308, - "grad_norm": 0.99609375, - "learning_rate": 4.897759783268434e-05, - "loss": 0.2915, - "step": 990 - }, - { - "epoch": 0.1015950421619425, - "grad_norm": 3.53125, - "learning_rate": 4.895465578959859e-05, - "loss": 0.2052, - "step": 1000 - }, - { - "epoch": 0.10261099258356192, - "grad_norm": 4.0, - "learning_rate": 4.893146468656159e-05, - "loss": 0.2499, - "step": 1010 - }, - { - "epoch": 0.10362694300518134, - "grad_norm": 1.65625, - "learning_rate": 4.890802476469452e-05, - "loss": 0.278, - "step": 1020 - }, - { - "epoch": 0.10464289342680078, - "grad_norm": 3.625, - "learning_rate": 4.888433626770558e-05, - "loss": 0.2143, - "step": 1030 - }, - { - "epoch": 0.1056588438484202, - "grad_norm": 5.0625, - "learning_rate": 4.886039944188741e-05, - "loss": 0.2878, - "step": 1040 - }, - { - "epoch": 0.10667479427003962, - "grad_norm": 4.5, - "learning_rate": 4.883621453611461e-05, - "loss": 0.2744, - "step": 1050 - }, - { - "epoch": 0.10769074469165904, - "grad_norm": 4.5625, - "learning_rate": 4.881178180184106e-05, - "loss": 0.2734, - "step": 1060 - }, - { - "epoch": 0.10870669511327848, - "grad_norm": 3.125, - "learning_rate": 4.878710149309735e-05, - "loss": 0.3574, - "step": 1070 - }, - { - "epoch": 0.1097226455348979, - "grad_norm": 3.0625, - "learning_rate": 4.876217386648816e-05, - "loss": 0.2625, - "step": 1080 - }, - { - "epoch": 0.11073859595651732, - "grad_norm": 4.0625, - "learning_rate": 4.873699918118955e-05, - "loss": 0.2437, - "step": 1090 - }, - { - "epoch": 0.11175454637813674, - "grad_norm": 1.59375, - "learning_rate": 4.87115776989463e-05, - "loss": 0.2051, - "step": 1100 - }, - { - "epoch": 0.11277049679975618, - "grad_norm": 4.375, - "learning_rate": 4.8685909684069153e-05, - "loss": 0.1727, - "step": 1110 - }, - { - "epoch": 0.1137864472213756, - "grad_norm": 2.28125, - "learning_rate": 4.865999540343211e-05, - "loss": 0.2256, - "step": 1120 - }, - { - "epoch": 0.11480239764299502, - "grad_norm": 2.265625, - "learning_rate": 4.86338351264696e-05, - "loss": 0.3529, - "step": 1130 - }, - { - "epoch": 0.11581834806461444, - "grad_norm": 2.34375, - "learning_rate": 4.8607429125173754e-05, - "loss": 0.2113, - "step": 1140 - }, - { - "epoch": 0.11683429848623388, - "grad_norm": 0.7578125, - "learning_rate": 4.858077767409149e-05, - "loss": 0.2759, - "step": 1150 - }, - { - "epoch": 0.1178502489078533, - "grad_norm": 3.640625, - "learning_rate": 4.855388105032174e-05, - "loss": 0.2482, - "step": 1160 - }, - { - "epoch": 0.11886619932947272, - "grad_norm": 3.5, - "learning_rate": 4.852673953351249e-05, - "loss": 0.1865, - "step": 1170 - }, - { - "epoch": 0.11988214975109214, - "grad_norm": 3.75, - "learning_rate": 4.849935340585796e-05, - "loss": 0.2659, - "step": 1180 - }, - { - "epoch": 0.12089810017271158, - "grad_norm": 3.375, - "learning_rate": 4.8471722952095586e-05, - "loss": 0.1506, - "step": 1190 - }, - { - "epoch": 0.121914050594331, - "grad_norm": 3.34375, - "learning_rate": 4.844384845950312e-05, - "loss": 0.307, - "step": 1200 - }, - { - "epoch": 0.12293000101595042, - "grad_norm": 1.578125, - "learning_rate": 4.841573021789561e-05, - "loss": 0.1952, - "step": 1210 - }, - { - "epoch": 0.12394595143756984, - "grad_norm": 1.2890625, - "learning_rate": 4.838736851962239e-05, - "loss": 0.1779, - "step": 1220 - }, - { - "epoch": 0.12496190185918928, - "grad_norm": 1.265625, - "learning_rate": 4.835876365956408e-05, - "loss": 0.1235, - "step": 1230 - }, - { - "epoch": 0.12597785228080868, - "grad_norm": 1.9609375, - "learning_rate": 4.8329915935129436e-05, - "loss": 0.1876, - "step": 1240 - }, - { - "epoch": 0.12699380270242813, - "grad_norm": 1.6328125, - "learning_rate": 4.830082564625235e-05, - "loss": 0.2188, - "step": 1250 - }, - { - "epoch": 0.12800975312404755, - "grad_norm": 3.96875, - "learning_rate": 4.8271493095388684e-05, - "loss": 0.2622, - "step": 1260 - }, - { - "epoch": 0.12902570354566698, - "grad_norm": 3.765625, - "learning_rate": 4.824191858751312e-05, - "loss": 0.2724, - "step": 1270 - }, - { - "epoch": 0.1300416539672864, - "grad_norm": 5.59375, - "learning_rate": 4.821210243011601e-05, - "loss": 0.2413, - "step": 1280 - }, - { - "epoch": 0.13105760438890582, - "grad_norm": 3.34375, - "learning_rate": 4.818204493320016e-05, - "loss": 0.2618, - "step": 1290 - }, - { - "epoch": 0.13207355481052524, - "grad_norm": 2.78125, - "learning_rate": 4.8151746409277634e-05, - "loss": 0.2295, - "step": 1300 - }, - { - "epoch": 0.13308950523214466, - "grad_norm": 3.1875, - "learning_rate": 4.8121207173366484e-05, - "loss": 0.2733, - "step": 1310 - }, - { - "epoch": 0.13410545565376408, - "grad_norm": 2.28125, - "learning_rate": 4.809042754298746e-05, - "loss": 0.2311, - "step": 1320 - }, - { - "epoch": 0.13512140607538353, - "grad_norm": 2.171875, - "learning_rate": 4.805940783816075e-05, - "loss": 0.2059, - "step": 1330 - }, - { - "epoch": 0.13613735649700295, - "grad_norm": 2.796875, - "learning_rate": 4.8028148381402625e-05, - "loss": 0.2102, - "step": 1340 - }, - { - "epoch": 0.13715330691862238, - "grad_norm": 2.96875, - "learning_rate": 4.7996649497722084e-05, - "loss": 0.2708, - "step": 1350 - }, - { - "epoch": 0.1381692573402418, - "grad_norm": 2.4375, - "learning_rate": 4.7964911514617485e-05, - "loss": 0.2429, - "step": 1360 - }, - { - "epoch": 0.13918520776186122, - "grad_norm": 5.8125, - "learning_rate": 4.793293476207312e-05, - "loss": 0.2725, - "step": 1370 - }, - { - "epoch": 0.14020115818348064, - "grad_norm": 2.40625, - "learning_rate": 4.790071957255585e-05, - "loss": 0.2098, - "step": 1380 - }, - { - "epoch": 0.14121710860510006, - "grad_norm": 4.25, - "learning_rate": 4.786826628101154e-05, - "loss": 0.2101, - "step": 1390 - }, - { - "epoch": 0.14223305902671948, - "grad_norm": 2.578125, - "learning_rate": 4.783557522486167e-05, - "loss": 0.2624, - "step": 1400 - }, - { - "epoch": 0.14324900944833893, - "grad_norm": 3.125, - "learning_rate": 4.780264674399978e-05, - "loss": 0.2518, - "step": 1410 - }, - { - "epoch": 0.14426495986995835, - "grad_norm": 3.671875, - "learning_rate": 4.7769481180787966e-05, - "loss": 0.3112, - "step": 1420 - }, - { - "epoch": 0.14528091029157778, - "grad_norm": 3.984375, - "learning_rate": 4.773607888005327e-05, - "loss": 0.2747, - "step": 1430 - }, - { - "epoch": 0.1462968607131972, - "grad_norm": 3.234375, - "learning_rate": 4.770244018908416e-05, - "loss": 0.1572, - "step": 1440 - }, - { - "epoch": 0.14731281113481662, - "grad_norm": 4.09375, - "learning_rate": 4.766856545762687e-05, - "loss": 0.2148, - "step": 1450 - }, - { - "epoch": 0.14832876155643604, - "grad_norm": 1.6875, - "learning_rate": 4.763445503788178e-05, - "loss": 0.2531, - "step": 1460 - }, - { - "epoch": 0.14934471197805546, - "grad_norm": 2.375, - "learning_rate": 4.760010928449976e-05, - "loss": 0.199, - "step": 1470 - }, - { - "epoch": 0.15036066239967488, - "grad_norm": 4.6875, - "learning_rate": 4.7565528554578485e-05, - "loss": 0.2366, - "step": 1480 - }, - { - "epoch": 0.15137661282129433, - "grad_norm": 5.4375, - "learning_rate": 4.75307132076587e-05, - "loss": 0.1862, - "step": 1490 - }, - { - "epoch": 0.15239256324291375, - "grad_norm": 2.484375, - "learning_rate": 4.749566360572049e-05, - "loss": 0.2143, - "step": 1500 - }, - { - "epoch": 0.15340851366453317, - "grad_norm": 2.1875, - "learning_rate": 4.746038011317955e-05, - "loss": 0.1877, - "step": 1510 - }, - { - "epoch": 0.1544244640861526, - "grad_norm": 2.84375, - "learning_rate": 4.742486309688333e-05, - "loss": 0.2831, - "step": 1520 - }, - { - "epoch": 0.15544041450777202, - "grad_norm": 2.015625, - "learning_rate": 4.738911292610732e-05, - "loss": 0.1708, - "step": 1530 - }, - { - "epoch": 0.15645636492939144, - "grad_norm": 3.953125, - "learning_rate": 4.735312997255107e-05, - "loss": 0.192, - "step": 1540 - }, - { - "epoch": 0.15747231535101086, - "grad_norm": 2.09375, - "learning_rate": 4.7316914610334475e-05, - "loss": 0.2586, - "step": 1550 - }, - { - "epoch": 0.15848826577263028, - "grad_norm": 3.6875, - "learning_rate": 4.728046721599378e-05, - "loss": 0.2141, - "step": 1560 - }, - { - "epoch": 0.15950421619424973, - "grad_norm": 2.9375, - "learning_rate": 4.724378816847771e-05, - "loss": 0.193, - "step": 1570 - }, - { - "epoch": 0.16052016661586915, - "grad_norm": 1.5625, - "learning_rate": 4.720687784914352e-05, - "loss": 0.191, - "step": 1580 - }, - { - "epoch": 0.16153611703748857, - "grad_norm": 3.75, - "learning_rate": 4.716973664175304e-05, - "loss": 0.2172, - "step": 1590 - }, - { - "epoch": 0.162552067459108, - "grad_norm": 3.125, - "learning_rate": 4.7132364932468645e-05, - "loss": 0.2134, - "step": 1600 - }, - { - "epoch": 0.16356801788072742, - "grad_norm": 4.09375, - "learning_rate": 4.709476310984932e-05, - "loss": 0.2055, - "step": 1610 - }, - { - "epoch": 0.16458396830234684, - "grad_norm": 3.875, - "learning_rate": 4.705693156484652e-05, - "loss": 0.2136, - "step": 1620 - }, - { - "epoch": 0.16559991872396626, - "grad_norm": 1.1796875, - "learning_rate": 4.7018870690800196e-05, - "loss": 0.1471, - "step": 1630 - }, - { - "epoch": 0.16661586914558568, - "grad_norm": 2.5, - "learning_rate": 4.698058088343465e-05, - "loss": 0.2308, - "step": 1640 - }, - { - "epoch": 0.16763181956720513, - "grad_norm": 1.390625, - "learning_rate": 4.6942062540854425e-05, - "loss": 0.2456, - "step": 1650 - }, - { - "epoch": 0.16864776998882455, - "grad_norm": 3.125, - "learning_rate": 4.69033160635402e-05, - "loss": 0.2654, - "step": 1660 - }, - { - "epoch": 0.16966372041044397, - "grad_norm": 3.984375, - "learning_rate": 4.6864341854344587e-05, - "loss": 0.2226, - "step": 1670 - }, - { - "epoch": 0.1706796708320634, - "grad_norm": 2.328125, - "learning_rate": 4.682514031848795e-05, - "loss": 0.2438, - "step": 1680 - }, - { - "epoch": 0.17169562125368282, - "grad_norm": 3.078125, - "learning_rate": 4.678571186355423e-05, - "loss": 0.1889, - "step": 1690 - }, - { - "epoch": 0.17271157167530224, - "grad_norm": 3.328125, - "learning_rate": 4.6746056899486644e-05, - "loss": 0.2117, - "step": 1700 - }, - { - "epoch": 0.17372752209692166, - "grad_norm": 2.78125, - "learning_rate": 4.67061758385835e-05, - "loss": 0.1953, - "step": 1710 - }, - { - "epoch": 0.17474347251854108, - "grad_norm": 3.09375, - "learning_rate": 4.6666069095493816e-05, - "loss": 0.1844, - "step": 1720 - }, - { - "epoch": 0.17575942294016053, - "grad_norm": 3.234375, - "learning_rate": 4.662573708721309e-05, - "loss": 0.2774, - "step": 1730 - }, - { - "epoch": 0.17677537336177995, - "grad_norm": 4.03125, - "learning_rate": 4.658518023307894e-05, - "loss": 0.2527, - "step": 1740 - }, - { - "epoch": 0.17779132378339937, - "grad_norm": 3.21875, - "learning_rate": 4.654439895476671e-05, - "loss": 0.2164, - "step": 1750 - }, - { - "epoch": 0.1788072742050188, - "grad_norm": 2.390625, - "learning_rate": 4.6503393676285146e-05, - "loss": 0.2424, - "step": 1760 - }, - { - "epoch": 0.17982322462663822, - "grad_norm": 1.8359375, - "learning_rate": 4.646216482397192e-05, - "loss": 0.2428, - "step": 1770 - }, - { - "epoch": 0.18083917504825764, - "grad_norm": 2.796875, - "learning_rate": 4.6420712826489275e-05, - "loss": 0.2155, - "step": 1780 - }, - { - "epoch": 0.18185512546987706, - "grad_norm": 0.69921875, - "learning_rate": 4.6379038114819485e-05, - "loss": 0.1544, - "step": 1790 - }, - { - "epoch": 0.18287107589149648, - "grad_norm": 3.40625, - "learning_rate": 4.6337141122260444e-05, - "loss": 0.2029, - "step": 1800 - }, - { - "epoch": 0.18388702631311593, - "grad_norm": 2.359375, - "learning_rate": 4.629502228442112e-05, - "loss": 0.1489, - "step": 1810 - }, - { - "epoch": 0.18490297673473535, - "grad_norm": 1.4453125, - "learning_rate": 4.6252682039217045e-05, - "loss": 0.2101, - "step": 1820 - }, - { - "epoch": 0.18591892715635477, - "grad_norm": 2.71875, - "learning_rate": 4.621012082686573e-05, - "loss": 0.2076, - "step": 1830 - }, - { - "epoch": 0.1869348775779742, - "grad_norm": 3.0625, - "learning_rate": 4.616733908988216e-05, - "loss": 0.2719, - "step": 1840 - }, - { - "epoch": 0.18795082799959362, - "grad_norm": 1.953125, - "learning_rate": 4.612433727307409e-05, - "loss": 0.2105, - "step": 1850 - }, - { - "epoch": 0.18896677842121304, - "grad_norm": 3.46875, - "learning_rate": 4.608111582353751e-05, - "loss": 0.1877, - "step": 1860 - }, - { - "epoch": 0.18998272884283246, - "grad_norm": 2.546875, - "learning_rate": 4.603767519065197e-05, - "loss": 0.2238, - "step": 1870 - }, - { - "epoch": 0.19099867926445188, - "grad_norm": 1.5703125, - "learning_rate": 4.599401582607589e-05, - "loss": 0.243, - "step": 1880 - }, - { - "epoch": 0.19201462968607133, - "grad_norm": 2.4375, - "learning_rate": 4.595013818374185e-05, - "loss": 0.1867, - "step": 1890 - }, - { - "epoch": 0.19303058010769075, - "grad_norm": 2.203125, - "learning_rate": 4.5906042719851925e-05, - "loss": 0.1994, - "step": 1900 - }, - { - "epoch": 0.19404653052931017, - "grad_norm": 3.984375, - "learning_rate": 4.586172989287291e-05, - "loss": 0.1899, - "step": 1910 - }, - { - "epoch": 0.1950624809509296, - "grad_norm": 2.6875, - "learning_rate": 4.5817200163531534e-05, - "loss": 0.2528, - "step": 1920 - }, - { - "epoch": 0.19607843137254902, - "grad_norm": 2.71875, - "learning_rate": 4.577245399480972e-05, - "loss": 0.2336, - "step": 1930 - }, - { - "epoch": 0.19709438179416844, - "grad_norm": 2.640625, - "learning_rate": 4.5727491851939715e-05, - "loss": 0.2204, - "step": 1940 - }, - { - "epoch": 0.19811033221578786, - "grad_norm": 1.78125, - "learning_rate": 4.568231420239929e-05, - "loss": 0.1656, - "step": 1950 - }, - { - "epoch": 0.19912628263740728, - "grad_norm": 3.15625, - "learning_rate": 4.563692151590687e-05, - "loss": 0.2105, - "step": 1960 - }, - { - "epoch": 0.20014223305902673, - "grad_norm": 1.3671875, - "learning_rate": 4.5591314264416666e-05, - "loss": 0.1464, - "step": 1970 - }, - { - "epoch": 0.20115818348064615, - "grad_norm": 4.25, - "learning_rate": 4.554549292211371e-05, - "loss": 0.2103, - "step": 1980 - }, - { - "epoch": 0.20217413390226557, - "grad_norm": 2.984375, - "learning_rate": 4.549945796540901e-05, - "loss": 0.144, - "step": 1990 - }, - { - "epoch": 0.203190084323885, - "grad_norm": 1.859375, - "learning_rate": 4.545320987293453e-05, - "loss": 0.1963, - "step": 2000 - }, - { - "epoch": 0.20420603474550442, - "grad_norm": 1.078125, - "learning_rate": 4.540674912553824e-05, - "loss": 0.2115, - "step": 2010 - }, - { - "epoch": 0.20522198516712384, - "grad_norm": 4.25, - "learning_rate": 4.536007620627911e-05, - "loss": 0.1682, - "step": 2020 - }, - { - "epoch": 0.20623793558874326, - "grad_norm": 2.71875, - "learning_rate": 4.531319160042212e-05, - "loss": 0.1992, - "step": 2030 - }, - { - "epoch": 0.20725388601036268, - "grad_norm": 1.2890625, - "learning_rate": 4.5266095795433126e-05, - "loss": 0.1134, - "step": 2040 - }, - { - "epoch": 0.20826983643198213, - "grad_norm": 3.296875, - "learning_rate": 4.5218789280973925e-05, - "loss": 0.1474, - "step": 2050 - }, - { - "epoch": 0.20928578685360155, - "grad_norm": 1.9375, - "learning_rate": 4.5171272548897024e-05, - "loss": 0.1955, - "step": 2060 - }, - { - "epoch": 0.21030173727522097, - "grad_norm": 2.734375, - "learning_rate": 4.512354609324063e-05, - "loss": 0.2042, - "step": 2070 - }, - { - "epoch": 0.2113176876968404, - "grad_norm": 2.921875, - "learning_rate": 4.507561041022347e-05, - "loss": 0.2174, - "step": 2080 - }, - { - "epoch": 0.21233363811845982, - "grad_norm": 2.40625, - "learning_rate": 4.502746599823963e-05, - "loss": 0.2634, - "step": 2090 - }, - { - "epoch": 0.21334958854007924, - "grad_norm": 1.71875, - "learning_rate": 4.497911335785339e-05, - "loss": 0.1884, - "step": 2100 - }, - { - "epoch": 0.21436553896169866, - "grad_norm": 0.79296875, - "learning_rate": 4.4930552991794e-05, - "loss": 0.1872, - "step": 2110 - }, - { - "epoch": 0.21538148938331808, - "grad_norm": 3.171875, - "learning_rate": 4.4881785404950474e-05, - "loss": 0.2233, - "step": 2120 - }, - { - "epoch": 0.21639743980493753, - "grad_norm": 2.59375, - "learning_rate": 4.483281110436631e-05, - "loss": 0.2374, - "step": 2130 - }, - { - "epoch": 0.21741339022655695, - "grad_norm": 3.328125, - "learning_rate": 4.478363059923426e-05, - "loss": 0.2545, - "step": 2140 - }, - { - "epoch": 0.21842934064817637, - "grad_norm": 2.3125, - "learning_rate": 4.4734244400891014e-05, - "loss": 0.2063, - "step": 2150 - }, - { - "epoch": 0.2194452910697958, - "grad_norm": 3.40625, - "learning_rate": 4.4684653022811865e-05, - "loss": 0.1219, - "step": 2160 - }, - { - "epoch": 0.22046124149141522, - "grad_norm": 4.1875, - "learning_rate": 4.463485698060541e-05, - "loss": 0.2805, - "step": 2170 - }, - { - "epoch": 0.22147719191303464, - "grad_norm": 2.3125, - "learning_rate": 4.458485679200814e-05, - "loss": 0.1998, - "step": 2180 - }, - { - "epoch": 0.22249314233465406, - "grad_norm": 3.578125, - "learning_rate": 4.453465297687912e-05, - "loss": 0.2489, - "step": 2190 - }, - { - "epoch": 0.22350909275627348, - "grad_norm": 2.59375, - "learning_rate": 4.448424605719452e-05, - "loss": 0.2731, - "step": 2200 - }, - { - "epoch": 0.22452504317789293, - "grad_norm": 3.28125, - "learning_rate": 4.443363655704224e-05, - "loss": 0.2425, - "step": 2210 - }, - { - "epoch": 0.22554099359951235, - "grad_norm": 2.78125, - "learning_rate": 4.438282500261641e-05, - "loss": 0.2938, - "step": 2220 - }, - { - "epoch": 0.22655694402113177, - "grad_norm": 1.1953125, - "learning_rate": 4.433181192221197e-05, - "loss": 0.1728, - "step": 2230 - }, - { - "epoch": 0.2275728944427512, - "grad_norm": 1.34375, - "learning_rate": 4.4280597846219155e-05, - "loss": 0.216, - "step": 2240 - }, - { - "epoch": 0.22858884486437062, - "grad_norm": 1.8515625, - "learning_rate": 4.422918330711796e-05, - "loss": 0.1612, - "step": 2250 - }, - { - "epoch": 0.22960479528599004, - "grad_norm": 1.90625, - "learning_rate": 4.417756883947263e-05, - "loss": 0.107, - "step": 2260 - }, - { - "epoch": 0.23062074570760946, - "grad_norm": 3.375, - "learning_rate": 4.412575497992611e-05, - "loss": 0.1756, - "step": 2270 - }, - { - "epoch": 0.23163669612922888, - "grad_norm": 4.375, - "learning_rate": 4.407374226719445e-05, - "loss": 0.234, - "step": 2280 - }, - { - "epoch": 0.23265264655084833, - "grad_norm": 3.25, - "learning_rate": 4.402153124206119e-05, - "loss": 0.2144, - "step": 2290 - }, - { - "epoch": 0.23366859697246775, - "grad_norm": 1.703125, - "learning_rate": 4.396912244737173e-05, - "loss": 0.1696, - "step": 2300 - }, - { - "epoch": 0.23468454739408717, - "grad_norm": 2.84375, - "learning_rate": 4.391651642802778e-05, - "loss": 0.2506, - "step": 2310 - }, - { - "epoch": 0.2357004978157066, - "grad_norm": 4.5, - "learning_rate": 4.386371373098155e-05, - "loss": 0.1686, - "step": 2320 - }, - { - "epoch": 0.23671644823732602, - "grad_norm": 2.515625, - "learning_rate": 4.381071490523018e-05, - "loss": 0.2403, - "step": 2330 - }, - { - "epoch": 0.23773239865894544, - "grad_norm": 4.4375, - "learning_rate": 4.3757520501809955e-05, - "loss": 0.1611, - "step": 2340 - }, - { - "epoch": 0.23874834908056486, - "grad_norm": 1.609375, - "learning_rate": 4.370413107379065e-05, - "loss": 0.1698, - "step": 2350 - }, - { - "epoch": 0.23976429950218428, - "grad_norm": 4.96875, - "learning_rate": 4.36505471762697e-05, - "loss": 0.1928, - "step": 2360 - }, - { - "epoch": 0.24078024992380373, - "grad_norm": 0.8984375, - "learning_rate": 4.3596769366366474e-05, - "loss": 0.2035, - "step": 2370 - }, - { - "epoch": 0.24179620034542315, - "grad_norm": 5.75, - "learning_rate": 4.354279820321649e-05, - "loss": 0.16, - "step": 2380 - }, - { - "epoch": 0.24281215076704257, - "grad_norm": 1.9453125, - "learning_rate": 4.34886342479656e-05, - "loss": 0.1851, - "step": 2390 - }, - { - "epoch": 0.243828101188662, - "grad_norm": 1.015625, - "learning_rate": 4.34342780637641e-05, - "loss": 0.1726, - "step": 2400 - }, - { - "epoch": 0.24484405161028142, - "grad_norm": 4.59375, - "learning_rate": 4.337973021576095e-05, - "loss": 0.2847, - "step": 2410 - }, - { - "epoch": 0.24586000203190084, - "grad_norm": 1.03125, - "learning_rate": 4.3324991271097846e-05, - "loss": 0.2528, - "step": 2420 - }, - { - "epoch": 0.24687595245352026, - "grad_norm": 2.1875, - "learning_rate": 4.3270061798903374e-05, - "loss": 0.1573, - "step": 2430 - }, - { - "epoch": 0.24789190287513968, - "grad_norm": 0.98046875, - "learning_rate": 4.321494237028701e-05, - "loss": 0.1703, - "step": 2440 - }, - { - "epoch": 0.24890785329675913, - "grad_norm": 3.8125, - "learning_rate": 4.31596335583333e-05, - "loss": 0.2613, - "step": 2450 - }, - { - "epoch": 0.24992380371837855, - "grad_norm": 4.0625, - "learning_rate": 4.310413593809579e-05, - "loss": 0.22, - "step": 2460 - }, - { - "epoch": 0.250939754139998, - "grad_norm": 3.15625, - "learning_rate": 4.304845008659108e-05, - "loss": 0.1263, - "step": 2470 - }, - { - "epoch": 0.25195570456161737, - "grad_norm": 3.046875, - "learning_rate": 4.2992576582792895e-05, - "loss": 0.1639, - "step": 2480 - }, - { - "epoch": 0.2529716549832368, - "grad_norm": 9.8125, - "learning_rate": 4.293651600762595e-05, - "loss": 0.2681, - "step": 2490 - }, - { - "epoch": 0.25398760540485626, - "grad_norm": 3.734375, - "learning_rate": 4.288026894395999e-05, - "loss": 0.2292, - "step": 2500 - }, - { - "epoch": 0.25500355582647566, - "grad_norm": 0.455078125, - "learning_rate": 4.2823835976603723e-05, - "loss": 0.2324, - "step": 2510 - }, - { - "epoch": 0.2560195062480951, - "grad_norm": 5.625, - "learning_rate": 4.276721769229869e-05, - "loss": 0.1834, - "step": 2520 - }, - { - "epoch": 0.2570354566697145, - "grad_norm": 1.3671875, - "learning_rate": 4.271041467971323e-05, - "loss": 0.1826, - "step": 2530 - }, - { - "epoch": 0.25805140709133395, - "grad_norm": 5.0625, - "learning_rate": 4.265342752943632e-05, - "loss": 0.2463, - "step": 2540 - }, - { - "epoch": 0.25906735751295334, - "grad_norm": 2.859375, - "learning_rate": 4.2596256833971425e-05, - "loss": 0.2598, - "step": 2550 - }, - { - "epoch": 0.2600833079345728, - "grad_norm": 1.8515625, - "learning_rate": 4.2538903187730374e-05, - "loss": 0.1148, - "step": 2560 - }, - { - "epoch": 0.26109925835619224, - "grad_norm": 2.71875, - "learning_rate": 4.248136718702716e-05, - "loss": 0.2123, - "step": 2570 - }, - { - "epoch": 0.26211520877781164, - "grad_norm": 4.5625, - "learning_rate": 4.242364943007172e-05, - "loss": 0.2369, - "step": 2580 - }, - { - "epoch": 0.2631311591994311, - "grad_norm": 2.296875, - "learning_rate": 4.236575051696377e-05, - "loss": 0.261, - "step": 2590 - }, - { - "epoch": 0.2641471096210505, - "grad_norm": 2.75, - "learning_rate": 4.2307671049686514e-05, - "loss": 0.1564, - "step": 2600 - }, - { - "epoch": 0.26516306004266993, - "grad_norm": 3.5, - "learning_rate": 4.2249411632100396e-05, - "loss": 0.1563, - "step": 2610 - }, - { - "epoch": 0.2661790104642893, - "grad_norm": 2.84375, - "learning_rate": 4.219097286993684e-05, - "loss": 0.1697, - "step": 2620 - }, - { - "epoch": 0.26719496088590877, - "grad_norm": 2.125, - "learning_rate": 4.2132355370791946e-05, - "loss": 0.1844, - "step": 2630 - }, - { - "epoch": 0.26821091130752817, - "grad_norm": 4.03125, - "learning_rate": 4.2073559744120156e-05, - "loss": 0.2144, - "step": 2640 - }, - { - "epoch": 0.2692268617291476, - "grad_norm": 2.375, - "learning_rate": 4.201458660122793e-05, - "loss": 0.2013, - "step": 2650 - }, - { - "epoch": 0.27024281215076706, - "grad_norm": 3.625, - "learning_rate": 4.1955436555267393e-05, - "loss": 0.2166, - "step": 2660 - }, - { - "epoch": 0.27125876257238646, - "grad_norm": 0.328125, - "learning_rate": 4.189611022122997e-05, - "loss": 0.1934, - "step": 2670 - }, - { - "epoch": 0.2722747129940059, - "grad_norm": 2.75, - "learning_rate": 4.1836608215939944e-05, - "loss": 0.2157, - "step": 2680 - }, - { - "epoch": 0.2732906634156253, - "grad_norm": 3.5, - "learning_rate": 4.17769311580481e-05, - "loss": 0.18, - "step": 2690 - }, - { - "epoch": 0.27430661383724475, - "grad_norm": 2.109375, - "learning_rate": 4.171707966802528e-05, - "loss": 0.2178, - "step": 2700 - }, - { - "epoch": 0.27532256425886414, - "grad_norm": 4.65625, - "learning_rate": 4.16570543681559e-05, - "loss": 0.1896, - "step": 2710 - }, - { - "epoch": 0.2763385146804836, - "grad_norm": 4.8125, - "learning_rate": 4.159685588253151e-05, - "loss": 0.1322, - "step": 2720 - }, - { - "epoch": 0.27735446510210304, - "grad_norm": 3.9375, - "learning_rate": 4.153648483704429e-05, - "loss": 0.184, - "step": 2730 - }, - { - "epoch": 0.27837041552372244, - "grad_norm": 4.53125, - "learning_rate": 4.147594185938057e-05, - "loss": 0.2451, - "step": 2740 - }, - { - "epoch": 0.2793863659453419, - "grad_norm": 1.0390625, - "learning_rate": 4.141522757901426e-05, - "loss": 0.2367, - "step": 2750 - }, - { - "epoch": 0.2804023163669613, - "grad_norm": 3.375, - "learning_rate": 4.1354342627200345e-05, - "loss": 0.179, - "step": 2760 - }, - { - "epoch": 0.28141826678858073, - "grad_norm": 2.953125, - "learning_rate": 4.1293287636968286e-05, - "loss": 0.1396, - "step": 2770 - }, - { - "epoch": 0.2824342172102001, - "grad_norm": 2.546875, - "learning_rate": 4.1232063243115485e-05, - "loss": 0.1963, - "step": 2780 - }, - { - "epoch": 0.28345016763181957, - "grad_norm": 5.09375, - "learning_rate": 4.117067008220063e-05, - "loss": 0.2457, - "step": 2790 - }, - { - "epoch": 0.28446611805343897, - "grad_norm": 2.046875, - "learning_rate": 4.110910879253712e-05, - "loss": 0.2262, - "step": 2800 - }, - { - "epoch": 0.2854820684750584, - "grad_norm": 2.1875, - "learning_rate": 4.104738001418641e-05, - "loss": 0.2499, - "step": 2810 - }, - { - "epoch": 0.28649801889667786, - "grad_norm": 2.59375, - "learning_rate": 4.098548438895135e-05, - "loss": 0.1667, - "step": 2820 - }, - { - "epoch": 0.28751396931829726, - "grad_norm": 2.875, - "learning_rate": 4.092342256036954e-05, - "loss": 0.2288, - "step": 2830 - }, - { - "epoch": 0.2885299197399167, - "grad_norm": 3.015625, - "learning_rate": 4.086119517370659e-05, - "loss": 0.2038, - "step": 2840 - }, - { - "epoch": 0.2895458701615361, - "grad_norm": 3.53125, - "learning_rate": 4.0798802875949485e-05, - "loss": 0.181, - "step": 2850 - }, - { - "epoch": 0.29056182058315555, - "grad_norm": 2.296875, - "learning_rate": 4.073624631579975e-05, - "loss": 0.1886, - "step": 2860 - }, - { - "epoch": 0.29157777100477494, - "grad_norm": 3.609375, - "learning_rate": 4.067352614366685e-05, - "loss": 0.2053, - "step": 2870 - }, - { - "epoch": 0.2925937214263944, - "grad_norm": 2.328125, - "learning_rate": 4.061064301166128e-05, - "loss": 0.1409, - "step": 2880 - }, - { - "epoch": 0.29360967184801384, - "grad_norm": 4.9375, - "learning_rate": 4.054759757358787e-05, - "loss": 0.184, - "step": 2890 - }, - { - "epoch": 0.29462562226963324, - "grad_norm": 4.6875, - "learning_rate": 4.048439048493898e-05, - "loss": 0.2306, - "step": 2900 - }, - { - "epoch": 0.2956415726912527, - "grad_norm": 4.09375, - "learning_rate": 4.0421022402887676e-05, - "loss": 0.1914, - "step": 2910 - }, - { - "epoch": 0.2966575231128721, - "grad_norm": 2.3125, - "learning_rate": 4.035749398628088e-05, - "loss": 0.1653, - "step": 2920 - }, - { - "epoch": 0.29767347353449153, - "grad_norm": 2.515625, - "learning_rate": 4.029380589563256e-05, - "loss": 0.1941, - "step": 2930 - }, - { - "epoch": 0.2986894239561109, - "grad_norm": 1.78125, - "learning_rate": 4.02299587931168e-05, - "loss": 0.1117, - "step": 2940 - }, - { - "epoch": 0.29970537437773037, - "grad_norm": 0.8359375, - "learning_rate": 4.0165953342560974e-05, - "loss": 0.1605, - "step": 2950 - }, - { - "epoch": 0.30072132479934977, - "grad_norm": 3.046875, - "learning_rate": 4.010179020943884e-05, - "loss": 0.1726, - "step": 2960 - }, - { - "epoch": 0.3017372752209692, - "grad_norm": 3.453125, - "learning_rate": 4.003747006086357e-05, - "loss": 0.2208, - "step": 2970 - }, - { - "epoch": 0.30275322564258866, - "grad_norm": 2.515625, - "learning_rate": 3.9972993565580866e-05, - "loss": 0.1325, - "step": 2980 - }, - { - "epoch": 0.30376917606420806, - "grad_norm": 3.046875, - "learning_rate": 3.9908361393962e-05, - "loss": 0.2014, - "step": 2990 - }, - { - "epoch": 0.3047851264858275, - "grad_norm": 2.28125, - "learning_rate": 3.984357421799681e-05, - "loss": 0.165, - "step": 3000 - }, - { - "epoch": 0.3058010769074469, - "grad_norm": 5.09375, - "learning_rate": 3.9778632711286756e-05, - "loss": 0.212, - "step": 3010 - }, - { - "epoch": 0.30681702732906635, - "grad_norm": 4.25, - "learning_rate": 3.971353754903788e-05, - "loss": 0.2388, - "step": 3020 - }, - { - "epoch": 0.30783297775068574, - "grad_norm": 2.34375, - "learning_rate": 3.964828940805381e-05, - "loss": 0.2175, - "step": 3030 - }, - { - "epoch": 0.3088489281723052, - "grad_norm": 4.09375, - "learning_rate": 3.95828889667287e-05, - "loss": 0.2088, - "step": 3040 - }, - { - "epoch": 0.30986487859392464, - "grad_norm": 2.359375, - "learning_rate": 3.9517336905040244e-05, - "loss": 0.1913, - "step": 3050 - }, - { - "epoch": 0.31088082901554404, - "grad_norm": 1.1640625, - "learning_rate": 3.9451633904542483e-05, - "loss": 0.2185, - "step": 3060 - }, - { - "epoch": 0.3118967794371635, - "grad_norm": 2.59375, - "learning_rate": 3.9385780648358846e-05, - "loss": 0.2072, - "step": 3070 - }, - { - "epoch": 0.3129127298587829, - "grad_norm": 3.015625, - "learning_rate": 3.9319777821174955e-05, - "loss": 0.1902, - "step": 3080 - }, - { - "epoch": 0.31392868028040233, - "grad_norm": 2.375, - "learning_rate": 3.925362610923158e-05, - "loss": 0.259, - "step": 3090 - }, - { - "epoch": 0.3149446307020217, - "grad_norm": 4.65625, - "learning_rate": 3.918732620031742e-05, - "loss": 0.2026, - "step": 3100 - }, - { - "epoch": 0.31596058112364117, - "grad_norm": 2.1875, - "learning_rate": 3.912087878376205e-05, - "loss": 0.1478, - "step": 3110 - }, - { - "epoch": 0.31697653154526056, - "grad_norm": 2.34375, - "learning_rate": 3.905428455042865e-05, - "loss": 0.167, - "step": 3120 - }, - { - "epoch": 0.31799248196688, - "grad_norm": 2.390625, - "learning_rate": 3.898754419270693e-05, - "loss": 0.1629, - "step": 3130 - }, - { - "epoch": 0.31900843238849946, - "grad_norm": 1.546875, - "learning_rate": 3.892065840450583e-05, - "loss": 0.1308, - "step": 3140 - }, - { - "epoch": 0.32002438281011886, - "grad_norm": 4.625, - "learning_rate": 3.885362788124637e-05, - "loss": 0.2008, - "step": 3150 - }, - { - "epoch": 0.3210403332317383, - "grad_norm": 3.8125, - "learning_rate": 3.8786453319854396e-05, - "loss": 0.2225, - "step": 3160 - }, - { - "epoch": 0.3220562836533577, - "grad_norm": 3.015625, - "learning_rate": 3.8719135418753366e-05, - "loss": 0.2243, - "step": 3170 - }, - { - "epoch": 0.32307223407497715, - "grad_norm": 5.6875, - "learning_rate": 3.865167487785702e-05, - "loss": 0.1981, - "step": 3180 - }, - { - "epoch": 0.32408818449659654, - "grad_norm": 4.84375, - "learning_rate": 3.8584072398562164e-05, - "loss": 0.2031, - "step": 3190 - }, - { - "epoch": 0.325104134918216, - "grad_norm": 4.0625, - "learning_rate": 3.851632868374136e-05, - "loss": 0.1621, - "step": 3200 - }, - { - "epoch": 0.32612008533983544, - "grad_norm": 3.421875, - "learning_rate": 3.844844443773562e-05, - "loss": 0.1674, - "step": 3210 - }, - { - "epoch": 0.32713603576145484, - "grad_norm": 1.3671875, - "learning_rate": 3.8380420366347046e-05, - "loss": 0.1502, - "step": 3220 - }, - { - "epoch": 0.3281519861830743, - "grad_norm": 3.734375, - "learning_rate": 3.831225717683157e-05, - "loss": 0.1868, - "step": 3230 - }, - { - "epoch": 0.3291679366046937, - "grad_norm": 2.703125, - "learning_rate": 3.8243955577891534e-05, - "loss": 0.1818, - "step": 3240 - }, - { - "epoch": 0.3301838870263131, - "grad_norm": 3.796875, - "learning_rate": 3.8175516279668335e-05, - "loss": 0.2215, - "step": 3250 - }, - { - "epoch": 0.3311998374479325, - "grad_norm": 3.203125, - "learning_rate": 3.810693999373505e-05, - "loss": 0.2544, - "step": 3260 - }, - { - "epoch": 0.33221578786955197, - "grad_norm": 4.0, - "learning_rate": 3.8038227433089056e-05, - "loss": 0.1175, - "step": 3270 - }, - { - "epoch": 0.33323173829117136, - "grad_norm": 3.625, - "learning_rate": 3.796937931214458e-05, - "loss": 0.2213, - "step": 3280 - }, - { - "epoch": 0.3342476887127908, - "grad_norm": 1.7265625, - "learning_rate": 3.7900396346725296e-05, - "loss": 0.1711, - "step": 3290 - }, - { - "epoch": 0.33526363913441026, - "grad_norm": 3.140625, - "learning_rate": 3.783127925405686e-05, - "loss": 0.2628, - "step": 3300 - }, - { - "epoch": 0.33627958955602966, - "grad_norm": 2.1875, - "learning_rate": 3.77620287527595e-05, - "loss": 0.1671, - "step": 3310 - }, - { - "epoch": 0.3372955399776491, - "grad_norm": 5.28125, - "learning_rate": 3.769264556284048e-05, - "loss": 0.2109, - "step": 3320 - }, - { - "epoch": 0.3383114903992685, - "grad_norm": 2.875, - "learning_rate": 3.762313040568665e-05, - "loss": 0.1978, - "step": 3330 - }, - { - "epoch": 0.33932744082088795, - "grad_norm": 2.234375, - "learning_rate": 3.755348400405697e-05, - "loss": 0.1275, - "step": 3340 - }, - { - "epoch": 0.34034339124250734, - "grad_norm": 1.9453125, - "learning_rate": 3.7483707082074945e-05, - "loss": 0.1482, - "step": 3350 - }, - { - "epoch": 0.3413593416641268, - "grad_norm": 5.40625, - "learning_rate": 3.741380036522111e-05, - "loss": 0.1933, - "step": 3360 - }, - { - "epoch": 0.34237529208574624, - "grad_norm": 4.53125, - "learning_rate": 3.734376458032551e-05, - "loss": 0.1925, - "step": 3370 - }, - { - "epoch": 0.34339124250736563, - "grad_norm": 4.0625, - "learning_rate": 3.727360045556014e-05, - "loss": 0.2297, - "step": 3380 - }, - { - "epoch": 0.3444071929289851, - "grad_norm": 2.53125, - "learning_rate": 3.7203308720431336e-05, - "loss": 0.1704, - "step": 3390 - }, - { - "epoch": 0.3454231433506045, - "grad_norm": 1.859375, - "learning_rate": 3.7132890105772234e-05, - "loss": 0.258, - "step": 3400 - }, - { - "epoch": 0.3464390937722239, - "grad_norm": 3.90625, - "learning_rate": 3.706234534373515e-05, - "loss": 0.2376, - "step": 3410 - }, - { - "epoch": 0.3474550441938433, - "grad_norm": 1.1015625, - "learning_rate": 3.6991675167783985e-05, - "loss": 0.2403, - "step": 3420 - }, - { - "epoch": 0.34847099461546277, - "grad_norm": 1.1640625, - "learning_rate": 3.6920880312686556e-05, - "loss": 0.1642, - "step": 3430 - }, - { - "epoch": 0.34948694503708216, - "grad_norm": 2.875, - "learning_rate": 3.684996151450702e-05, - "loss": 0.1455, - "step": 3440 - }, - { - "epoch": 0.3505028954587016, - "grad_norm": 0.59765625, - "learning_rate": 3.6778919510598155e-05, - "loss": 0.2175, - "step": 3450 - }, - { - "epoch": 0.35151884588032106, - "grad_norm": 0.93359375, - "learning_rate": 3.670775503959376e-05, - "loss": 0.1858, - "step": 3460 - }, - { - "epoch": 0.35253479630194046, - "grad_norm": 4.1875, - "learning_rate": 3.6636468841400917e-05, - "loss": 0.1911, - "step": 3470 - }, - { - "epoch": 0.3535507467235599, - "grad_norm": 3.734375, - "learning_rate": 3.656506165719233e-05, - "loss": 0.2114, - "step": 3480 - }, - { - "epoch": 0.3545666971451793, - "grad_norm": 1.171875, - "learning_rate": 3.649353422939863e-05, - "loss": 0.1841, - "step": 3490 - }, - { - "epoch": 0.35558264756679875, - "grad_norm": 2.53125, - "learning_rate": 3.6421887301700615e-05, - "loss": 0.1505, - "step": 3500 - }, - { - "epoch": 0.35659859798841814, - "grad_norm": 4.9375, - "learning_rate": 3.6350121619021524e-05, - "loss": 0.2625, - "step": 3510 - }, - { - "epoch": 0.3576145484100376, - "grad_norm": 5.25, - "learning_rate": 3.627823792751936e-05, - "loss": 0.1676, - "step": 3520 - }, - { - "epoch": 0.35863049883165704, - "grad_norm": 1.09375, - "learning_rate": 3.620623697457905e-05, - "loss": 0.1963, - "step": 3530 - }, - { - "epoch": 0.35964644925327643, - "grad_norm": 4.03125, - "learning_rate": 3.613411950880468e-05, - "loss": 0.2048, - "step": 3540 - }, - { - "epoch": 0.3606623996748959, - "grad_norm": 4.40625, - "learning_rate": 3.606188628001178e-05, - "loss": 0.226, - "step": 3550 - }, - { - "epoch": 0.3616783500965153, - "grad_norm": 2.375, - "learning_rate": 3.598953803921947e-05, - "loss": 0.1884, - "step": 3560 - }, - { - "epoch": 0.3626943005181347, - "grad_norm": 3.21875, - "learning_rate": 3.591707553864266e-05, - "loss": 0.224, - "step": 3570 - }, - { - "epoch": 0.3637102509397541, - "grad_norm": 3.5625, - "learning_rate": 3.584449953168423e-05, - "loss": 0.1866, - "step": 3580 - }, - { - "epoch": 0.36472620136137357, - "grad_norm": 2.359375, - "learning_rate": 3.577181077292722e-05, - "loss": 0.1663, - "step": 3590 - }, - { - "epoch": 0.36574215178299296, - "grad_norm": 5.0, - "learning_rate": 3.569901001812696e-05, - "loss": 0.2032, - "step": 3600 - }, - { - "epoch": 0.3667581022046124, - "grad_norm": 1.953125, - "learning_rate": 3.562609802420321e-05, - "loss": 0.2395, - "step": 3610 - }, - { - "epoch": 0.36777405262623186, - "grad_norm": 3.796875, - "learning_rate": 3.555307554923229e-05, - "loss": 0.1799, - "step": 3620 - }, - { - "epoch": 0.36879000304785126, - "grad_norm": 4.4375, - "learning_rate": 3.547994335243925e-05, - "loss": 0.1771, - "step": 3630 - }, - { - "epoch": 0.3698059534694707, - "grad_norm": 1.890625, - "learning_rate": 3.540670219418989e-05, - "loss": 0.2123, - "step": 3640 - }, - { - "epoch": 0.3708219038910901, - "grad_norm": 4.03125, - "learning_rate": 3.53333528359829e-05, - "loss": 0.2159, - "step": 3650 - }, - { - "epoch": 0.37183785431270955, - "grad_norm": 3.265625, - "learning_rate": 3.525989604044198e-05, - "loss": 0.2749, - "step": 3660 - }, - { - "epoch": 0.37285380473432894, - "grad_norm": 1.4375, - "learning_rate": 3.5186332571307826e-05, - "loss": 0.1613, - "step": 3670 - }, - { - "epoch": 0.3738697551559484, - "grad_norm": 3.984375, - "learning_rate": 3.511266319343025e-05, - "loss": 0.1877, - "step": 3680 - }, - { - "epoch": 0.37488570557756784, - "grad_norm": 2.203125, - "learning_rate": 3.503888867276022e-05, - "loss": 0.2185, - "step": 3690 - }, - { - "epoch": 0.37590165599918723, - "grad_norm": 1.5078125, - "learning_rate": 3.4965009776341894e-05, - "loss": 0.2195, - "step": 3700 - }, - { - "epoch": 0.3769176064208067, - "grad_norm": 4.375, - "learning_rate": 3.489102727230461e-05, - "loss": 0.2344, - "step": 3710 - }, - { - "epoch": 0.3779335568424261, - "grad_norm": 2.984375, - "learning_rate": 3.481694192985496e-05, - "loss": 0.1863, - "step": 3720 - }, - { - "epoch": 0.3789495072640455, - "grad_norm": 1.1328125, - "learning_rate": 3.474275451926875e-05, - "loss": 0.1894, - "step": 3730 - }, - { - "epoch": 0.3799654576856649, - "grad_norm": 2.265625, - "learning_rate": 3.4668465811883e-05, - "loss": 0.2127, - "step": 3740 - }, - { - "epoch": 0.38098140810728437, - "grad_norm": 2.921875, - "learning_rate": 3.4594076580087914e-05, - "loss": 0.2125, - "step": 3750 - }, - { - "epoch": 0.38199735852890376, - "grad_norm": 2.390625, - "learning_rate": 3.451958759731889e-05, - "loss": 0.1801, - "step": 3760 - }, - { - "epoch": 0.3830133089505232, - "grad_norm": 3.046875, - "learning_rate": 3.4444999638048456e-05, - "loss": 0.1949, - "step": 3770 - }, - { - "epoch": 0.38402925937214266, - "grad_norm": 2.890625, - "learning_rate": 3.437031347777817e-05, - "loss": 0.2719, - "step": 3780 - }, - { - "epoch": 0.38504520979376206, - "grad_norm": 3.9375, - "learning_rate": 3.4295529893030634e-05, - "loss": 0.1697, - "step": 3790 - }, - { - "epoch": 0.3860611602153815, - "grad_norm": 2.0625, - "learning_rate": 3.422064966134138e-05, - "loss": 0.1557, - "step": 3800 - }, - { - "epoch": 0.3870771106370009, - "grad_norm": 2.234375, - "learning_rate": 3.4145673561250794e-05, - "loss": 0.2129, - "step": 3810 - }, - { - "epoch": 0.38809306105862035, - "grad_norm": 4.96875, - "learning_rate": 3.4070602372296e-05, - "loss": 0.2068, - "step": 3820 - }, - { - "epoch": 0.38910901148023974, - "grad_norm": 2.234375, - "learning_rate": 3.39954368750028e-05, - "loss": 0.1634, - "step": 3830 - }, - { - "epoch": 0.3901249619018592, - "grad_norm": 1.75, - "learning_rate": 3.392017785087752e-05, - "loss": 0.2299, - "step": 3840 - }, - { - "epoch": 0.39114091232347864, - "grad_norm": 3.90625, - "learning_rate": 3.38448260823989e-05, - "loss": 0.1585, - "step": 3850 - }, - { - "epoch": 0.39215686274509803, - "grad_norm": 2.8125, - "learning_rate": 3.376938235300996e-05, - "loss": 0.2382, - "step": 3860 - }, - { - "epoch": 0.3931728131667175, - "grad_norm": 5.375, - "learning_rate": 3.369384744710984e-05, - "loss": 0.1987, - "step": 3870 - }, - { - "epoch": 0.3941887635883369, - "grad_norm": 2.578125, - "learning_rate": 3.361822215004566e-05, - "loss": 0.2316, - "step": 3880 - }, - { - "epoch": 0.3952047140099563, - "grad_norm": 2.0, - "learning_rate": 3.354250724810436e-05, - "loss": 0.2019, - "step": 3890 - }, - { - "epoch": 0.3962206644315757, - "grad_norm": 2.3125, - "learning_rate": 3.34667035285045e-05, - "loss": 0.187, - "step": 3900 - }, - { - "epoch": 0.39723661485319517, - "grad_norm": 3.53125, - "learning_rate": 3.339081177938811e-05, - "loss": 0.2353, - "step": 3910 - }, - { - "epoch": 0.39825256527481456, - "grad_norm": 1.9609375, - "learning_rate": 3.331483278981244e-05, - "loss": 0.2078, - "step": 3920 - }, - { - "epoch": 0.399268515696434, - "grad_norm": 1.2109375, - "learning_rate": 3.323876734974183e-05, - "loss": 0.1761, - "step": 3930 - }, - { - "epoch": 0.40028446611805346, - "grad_norm": 4.0625, - "learning_rate": 3.316261625003943e-05, - "loss": 0.2081, - "step": 3940 - }, - { - "epoch": 0.40130041653967286, - "grad_norm": 1.953125, - "learning_rate": 3.308638028245902e-05, - "loss": 0.2087, - "step": 3950 - }, - { - "epoch": 0.4023163669612923, - "grad_norm": 2.390625, - "learning_rate": 3.301006023963676e-05, - "loss": 0.1579, - "step": 3960 - }, - { - "epoch": 0.4033323173829117, - "grad_norm": 3.53125, - "learning_rate": 3.293365691508295e-05, - "loss": 0.1904, - "step": 3970 - }, - { - "epoch": 0.40434826780453115, - "grad_norm": 3.0, - "learning_rate": 3.285717110317379e-05, - "loss": 0.1991, - "step": 3980 - }, - { - "epoch": 0.40536421822615054, - "grad_norm": 7.21875, - "learning_rate": 3.27806035991431e-05, - "loss": 0.1445, - "step": 3990 - }, - { - "epoch": 0.40638016864777, - "grad_norm": 1.0859375, - "learning_rate": 3.2703955199074075e-05, - "loss": 0.2393, - "step": 4000 - }, - { - "epoch": 0.40739611906938944, - "grad_norm": 4.5625, - "learning_rate": 3.262722669989098e-05, - "loss": 0.1789, - "step": 4010 - }, - { - "epoch": 0.40841206949100883, - "grad_norm": 3.09375, - "learning_rate": 3.255041889935092e-05, - "loss": 0.1511, - "step": 4020 - }, - { - "epoch": 0.4094280199126283, - "grad_norm": 1.90625, - "learning_rate": 3.247353259603547e-05, - "loss": 0.2066, - "step": 4030 - }, - { - "epoch": 0.4104439703342477, - "grad_norm": 2.28125, - "learning_rate": 3.239656858934242e-05, - "loss": 0.1564, - "step": 4040 - }, - { - "epoch": 0.4114599207558671, - "grad_norm": 2.609375, - "learning_rate": 3.231952767947746e-05, - "loss": 0.1503, - "step": 4050 - }, - { - "epoch": 0.4124758711774865, - "grad_norm": 1.4453125, - "learning_rate": 3.2242410667445844e-05, - "loss": 0.1633, - "step": 4060 - }, - { - "epoch": 0.41349182159910597, - "grad_norm": 3.015625, - "learning_rate": 3.2165218355044076e-05, - "loss": 0.1492, - "step": 4070 - }, - { - "epoch": 0.41450777202072536, - "grad_norm": 3.234375, - "learning_rate": 3.2087951544851566e-05, - "loss": 0.3051, - "step": 4080 - }, - { - "epoch": 0.4155237224423448, - "grad_norm": 2.9375, - "learning_rate": 3.20106110402223e-05, - "loss": 0.2229, - "step": 4090 - }, - { - "epoch": 0.41653967286396426, - "grad_norm": 3.171875, - "learning_rate": 3.1933197645276455e-05, - "loss": 0.2224, - "step": 4100 - }, - { - "epoch": 0.41755562328558365, - "grad_norm": 2.09375, - "learning_rate": 3.185571216489209e-05, - "loss": 0.1297, - "step": 4110 - }, - { - "epoch": 0.4185715737072031, - "grad_norm": 3.625, - "learning_rate": 3.177815540469669e-05, - "loss": 0.2074, - "step": 4120 - }, - { - "epoch": 0.4195875241288225, - "grad_norm": 2.296875, - "learning_rate": 3.1700528171058916e-05, - "loss": 0.1949, - "step": 4130 - }, - { - "epoch": 0.42060347455044195, - "grad_norm": 3.8125, - "learning_rate": 3.162283127108011e-05, - "loss": 0.1661, - "step": 4140 - }, - { - "epoch": 0.42161942497206134, - "grad_norm": 2.5, - "learning_rate": 3.154506551258594e-05, - "loss": 0.2275, - "step": 4150 - }, - { - "epoch": 0.4226353753936808, - "grad_norm": 2.96875, - "learning_rate": 3.146723170411804e-05, - "loss": 0.2242, - "step": 4160 - }, - { - "epoch": 0.42365132581530024, - "grad_norm": 6.625, - "learning_rate": 3.138933065492552e-05, - "loss": 0.1897, - "step": 4170 - }, - { - "epoch": 0.42466727623691963, - "grad_norm": 0.8515625, - "learning_rate": 3.131136317495665e-05, - "loss": 0.1629, - "step": 4180 - }, - { - "epoch": 0.4256832266585391, - "grad_norm": 0.94140625, - "learning_rate": 3.1233330074850364e-05, - "loss": 0.1535, - "step": 4190 - }, - { - "epoch": 0.4266991770801585, - "grad_norm": 2.6875, - "learning_rate": 3.115523216592786e-05, - "loss": 0.2494, - "step": 4200 - }, - { - "epoch": 0.4277151275017779, - "grad_norm": 2.578125, - "learning_rate": 3.107707026018417e-05, - "loss": 0.1705, - "step": 4210 - }, - { - "epoch": 0.4287310779233973, - "grad_norm": 3.0625, - "learning_rate": 3.09988451702797e-05, - "loss": 0.1507, - "step": 4220 - }, - { - "epoch": 0.42974702834501677, - "grad_norm": 2.421875, - "learning_rate": 3.0920557709531804e-05, - "loss": 0.3071, - "step": 4230 - }, - { - "epoch": 0.43076297876663616, - "grad_norm": 3.640625, - "learning_rate": 3.0842208691906306e-05, - "loss": 0.199, - "step": 4240 - }, - { - "epoch": 0.4317789291882556, - "grad_norm": 3.5625, - "learning_rate": 3.076379893200904e-05, - "loss": 0.1987, - "step": 4250 - }, - { - "epoch": 0.43279487960987506, - "grad_norm": 3.65625, - "learning_rate": 3.068532924507739e-05, - "loss": 0.1945, - "step": 4260 - }, - { - "epoch": 0.43381083003149445, - "grad_norm": 5.875, - "learning_rate": 3.060680044697183e-05, - "loss": 0.1937, - "step": 4270 - }, - { - "epoch": 0.4348267804531139, - "grad_norm": 2.859375, - "learning_rate": 3.052821335416739e-05, - "loss": 0.1643, - "step": 4280 - }, - { - "epoch": 0.4358427308747333, - "grad_norm": 3.296875, - "learning_rate": 3.0449568783745203e-05, - "loss": 0.1455, - "step": 4290 - }, - { - "epoch": 0.43685868129635275, - "grad_norm": 0.427734375, - "learning_rate": 3.0370867553384023e-05, - "loss": 0.1891, - "step": 4300 - }, - { - "epoch": 0.43787463171797214, - "grad_norm": 0.361328125, - "learning_rate": 3.029211048135171e-05, - "loss": 0.1377, - "step": 4310 - }, - { - "epoch": 0.4388905821395916, - "grad_norm": 1.8203125, - "learning_rate": 3.021329838649668e-05, - "loss": 0.2194, - "step": 4320 - }, - { - "epoch": 0.43990653256121104, - "grad_norm": 1.8828125, - "learning_rate": 3.0134432088239462e-05, - "loss": 0.1915, - "step": 4330 - }, - { - "epoch": 0.44092248298283043, - "grad_norm": 2.015625, - "learning_rate": 3.0055512406564146e-05, - "loss": 0.1794, - "step": 4340 - }, - { - "epoch": 0.4419384334044499, - "grad_norm": 2.546875, - "learning_rate": 2.9976540162009836e-05, - "loss": 0.2154, - "step": 4350 - }, - { - "epoch": 0.4429543838260693, - "grad_norm": 4.09375, - "learning_rate": 2.9897516175662155e-05, - "loss": 0.1861, - "step": 4360 - }, - { - "epoch": 0.4439703342476887, - "grad_norm": 3.953125, - "learning_rate": 2.9818441269144693e-05, - "loss": 0.1857, - "step": 4370 - }, - { - "epoch": 0.4449862846693081, - "grad_norm": 2.234375, - "learning_rate": 2.9739316264610452e-05, - "loss": 0.1493, - "step": 4380 - }, - { - "epoch": 0.44600223509092757, - "grad_norm": 1.109375, - "learning_rate": 2.966014198473332e-05, - "loss": 0.186, - "step": 4390 - }, - { - "epoch": 0.44701818551254696, - "grad_norm": 4.5625, - "learning_rate": 2.9580919252699502e-05, - "loss": 0.1963, - "step": 4400 - }, - { - "epoch": 0.4480341359341664, - "grad_norm": 7.3125, - "learning_rate": 2.9501648892198984e-05, - "loss": 0.2882, - "step": 4410 - }, - { - "epoch": 0.44905008635578586, - "grad_norm": 3.03125, - "learning_rate": 2.942233172741693e-05, - "loss": 0.2154, - "step": 4420 - }, - { - "epoch": 0.45006603677740525, - "grad_norm": 2.421875, - "learning_rate": 2.934296858302515e-05, - "loss": 0.2228, - "step": 4430 - }, - { - "epoch": 0.4510819871990247, - "grad_norm": 1.6015625, - "learning_rate": 2.9263560284173485e-05, - "loss": 0.1637, - "step": 4440 - }, - { - "epoch": 0.4520979376206441, - "grad_norm": 4.5, - "learning_rate": 2.91841076564813e-05, - "loss": 0.1396, - "step": 4450 - }, - { - "epoch": 0.45311388804226355, - "grad_norm": 1.9609375, - "learning_rate": 2.9104611526028808e-05, - "loss": 0.186, - "step": 4460 - }, - { - "epoch": 0.45412983846388294, - "grad_norm": 2.046875, - "learning_rate": 2.902507271934855e-05, - "loss": 0.1706, - "step": 4470 - }, - { - "epoch": 0.4551457888855024, - "grad_norm": 2.390625, - "learning_rate": 2.8945492063416768e-05, - "loss": 0.2191, - "step": 4480 - }, - { - "epoch": 0.45616173930712184, - "grad_norm": 2.734375, - "learning_rate": 2.8865870385644823e-05, - "loss": 0.1651, - "step": 4490 - }, - { - "epoch": 0.45717768972874123, - "grad_norm": 4.4375, - "learning_rate": 2.8786208513870583e-05, - "loss": 0.1907, - "step": 4500 - }, - { - "epoch": 0.4581936401503607, - "grad_norm": 1.9609375, - "learning_rate": 2.8706507276349815e-05, - "loss": 0.2256, - "step": 4510 - }, - { - "epoch": 0.4592095905719801, - "grad_norm": 3.375, - "learning_rate": 2.8626767501747588e-05, - "loss": 0.215, - "step": 4520 - }, - { - "epoch": 0.4602255409935995, - "grad_norm": 2.296875, - "learning_rate": 2.854699001912964e-05, - "loss": 0.2241, - "step": 4530 - }, - { - "epoch": 0.4612414914152189, - "grad_norm": 2.078125, - "learning_rate": 2.846717565795376e-05, - "loss": 0.1541, - "step": 4540 - }, - { - "epoch": 0.46225744183683837, - "grad_norm": 0.81640625, - "learning_rate": 2.8387325248061164e-05, - "loss": 0.1718, - "step": 4550 - }, - { - "epoch": 0.46327339225845776, - "grad_norm": 5.6875, - "learning_rate": 2.8307439619667897e-05, - "loss": 0.259, - "step": 4560 - }, - { - "epoch": 0.4642893426800772, - "grad_norm": 1.78125, - "learning_rate": 2.8227519603356157e-05, - "loss": 0.2205, - "step": 4570 - }, - { - "epoch": 0.46530529310169666, - "grad_norm": 4.78125, - "learning_rate": 2.8147566030065677e-05, - "loss": 0.2256, - "step": 4580 - }, - { - "epoch": 0.46632124352331605, - "grad_norm": 3.296875, - "learning_rate": 2.8067579731085085e-05, - "loss": 0.1671, - "step": 4590 - }, - { - "epoch": 0.4673371939449355, - "grad_norm": 3.265625, - "learning_rate": 2.7987561538043273e-05, - "loss": 0.2471, - "step": 4600 - }, - { - "epoch": 0.4683531443665549, - "grad_norm": 3.390625, - "learning_rate": 2.7907512282900727e-05, - "loss": 0.1749, - "step": 4610 - }, - { - "epoch": 0.46936909478817435, - "grad_norm": 3.140625, - "learning_rate": 2.782743279794091e-05, - "loss": 0.2276, - "step": 4620 - }, - { - "epoch": 0.47038504520979374, - "grad_norm": 2.921875, - "learning_rate": 2.7747323915761574e-05, - "loss": 0.1971, - "step": 4630 - }, - { - "epoch": 0.4714009956314132, - "grad_norm": 4.15625, - "learning_rate": 2.7667186469266122e-05, - "loss": 0.1951, - "step": 4640 - }, - { - "epoch": 0.47241694605303264, - "grad_norm": 2.953125, - "learning_rate": 2.7587021291654924e-05, - "loss": 0.2045, - "step": 4650 - }, - { - "epoch": 0.47343289647465203, - "grad_norm": 1.6640625, - "learning_rate": 2.750682921641672e-05, - "loss": 0.155, - "step": 4660 - }, - { - "epoch": 0.4744488468962715, - "grad_norm": 4.375, - "learning_rate": 2.7426611077319864e-05, - "loss": 0.2038, - "step": 4670 - }, - { - "epoch": 0.4754647973178909, - "grad_norm": 5.5, - "learning_rate": 2.734636770840372e-05, - "loss": 0.159, - "step": 4680 - }, - { - "epoch": 0.4764807477395103, - "grad_norm": 1.703125, - "learning_rate": 2.7266099943969976e-05, - "loss": 0.1566, - "step": 4690 - }, - { - "epoch": 0.4774966981611297, - "grad_norm": 0.81640625, - "learning_rate": 2.7185808618573943e-05, - "loss": 0.1927, - "step": 4700 - }, - { - "epoch": 0.47851264858274917, - "grad_norm": 0.81640625, - "learning_rate": 2.710549456701592e-05, - "loss": 0.1873, - "step": 4710 - }, - { - "epoch": 0.47952859900436856, - "grad_norm": 3.828125, - "learning_rate": 2.702515862433247e-05, - "loss": 0.2474, - "step": 4720 - }, - { - "epoch": 0.480544549425988, - "grad_norm": 1.1640625, - "learning_rate": 2.6944801625787795e-05, - "loss": 0.204, - "step": 4730 - }, - { - "epoch": 0.48156049984760746, - "grad_norm": 2.953125, - "learning_rate": 2.6864424406864984e-05, - "loss": 0.1758, - "step": 4740 - }, - { - "epoch": 0.48257645026922685, - "grad_norm": 3.265625, - "learning_rate": 2.6784027803257377e-05, - "loss": 0.161, - "step": 4750 - }, - { - "epoch": 0.4835924006908463, - "grad_norm": 2.046875, - "learning_rate": 2.6703612650859848e-05, - "loss": 0.1469, - "step": 4760 - }, - { - "epoch": 0.4846083511124657, - "grad_norm": 4.03125, - "learning_rate": 2.6623179785760148e-05, - "loss": 0.1858, - "step": 4770 - }, - { - "epoch": 0.48562430153408515, - "grad_norm": 2.65625, - "learning_rate": 2.6542730044230175e-05, - "loss": 0.176, - "step": 4780 - }, - { - "epoch": 0.48664025195570454, - "grad_norm": 2.59375, - "learning_rate": 2.6462264262717278e-05, - "loss": 0.1657, - "step": 4790 - }, - { - "epoch": 0.487656202377324, - "grad_norm": 4.78125, - "learning_rate": 2.6381783277835605e-05, - "loss": 0.2705, - "step": 4800 - }, - { - "epoch": 0.48867215279894344, - "grad_norm": 3.65625, - "learning_rate": 2.6301287926357355e-05, - "loss": 0.2252, - "step": 4810 - }, - { - "epoch": 0.48968810322056283, - "grad_norm": 0.734375, - "learning_rate": 2.622077904520411e-05, - "loss": 0.2141, - "step": 4820 - }, - { - "epoch": 0.4907040536421823, - "grad_norm": 5.15625, - "learning_rate": 2.6140257471438108e-05, - "loss": 0.1935, - "step": 4830 - }, - { - "epoch": 0.4917200040638017, - "grad_norm": 3.625, - "learning_rate": 2.6059724042253574e-05, - "loss": 0.2121, - "step": 4840 - }, - { - "epoch": 0.4927359544854211, - "grad_norm": 1.2890625, - "learning_rate": 2.5979179594967983e-05, - "loss": 0.1221, - "step": 4850 - }, - { - "epoch": 0.4937519049070405, - "grad_norm": 3.4375, - "learning_rate": 2.5898624967013367e-05, - "loss": 0.2208, - "step": 4860 - }, - { - "epoch": 0.49476785532865997, - "grad_norm": 2.40625, - "learning_rate": 2.5818060995927607e-05, - "loss": 0.1904, - "step": 4870 - }, - { - "epoch": 0.49578380575027936, - "grad_norm": 2.921875, - "learning_rate": 2.573748851934574e-05, - "loss": 0.1658, - "step": 4880 - }, - { - "epoch": 0.4967997561718988, - "grad_norm": 1.6640625, - "learning_rate": 2.5656908374991213e-05, - "loss": 0.1626, - "step": 4890 - }, - { - "epoch": 0.49781570659351826, - "grad_norm": 1.8046875, - "learning_rate": 2.557632140066721e-05, - "loss": 0.1905, - "step": 4900 - }, - { - "epoch": 0.49883165701513765, - "grad_norm": 4.875, - "learning_rate": 2.5495728434247917e-05, - "loss": 0.2591, - "step": 4910 - }, - { - "epoch": 0.4998476074367571, - "grad_norm": 1.4453125, - "learning_rate": 2.5415130313669845e-05, - "loss": 0.1359, - "step": 4920 - }, - { - "epoch": 0.5008635578583766, - "grad_norm": 2.109375, - "learning_rate": 2.5334527876923063e-05, - "loss": 0.2353, - "step": 4930 - }, - { - "epoch": 0.501879508279996, - "grad_norm": 3.546875, - "learning_rate": 2.5253921962042525e-05, - "loss": 0.2173, - "step": 4940 - }, - { - "epoch": 0.5028954587016153, - "grad_norm": 1.8125, - "learning_rate": 2.5173313407099373e-05, - "loss": 0.1631, - "step": 4950 - }, - { - "epoch": 0.5039114091232347, - "grad_norm": 2.671875, - "learning_rate": 2.5092703050192163e-05, - "loss": 0.1884, - "step": 4960 - }, - { - "epoch": 0.5049273595448542, - "grad_norm": 2.5625, - "learning_rate": 2.501209172943819e-05, - "loss": 0.217, - "step": 4970 - }, - { - "epoch": 0.5059433099664736, - "grad_norm": 4.375, - "learning_rate": 2.49314802829648e-05, - "loss": 0.1854, - "step": 4980 - }, - { - "epoch": 0.506959260388093, - "grad_norm": 2.3125, - "learning_rate": 2.4850869548900628e-05, - "loss": 0.2049, - "step": 4990 - }, - { - "epoch": 0.5079752108097125, - "grad_norm": 3.859375, - "learning_rate": 2.477026036536688e-05, - "loss": 0.2093, - "step": 5000 - }, - { - "epoch": 0.5089911612313319, - "grad_norm": 1.09375, - "learning_rate": 2.4689653570468677e-05, - "loss": 0.164, - "step": 5010 - }, - { - "epoch": 0.5100071116529513, - "grad_norm": 3.40625, - "learning_rate": 2.460905000228628e-05, - "loss": 0.1649, - "step": 5020 - }, - { - "epoch": 0.5110230620745707, - "grad_norm": 3.546875, - "learning_rate": 2.4528450498866428e-05, - "loss": 0.1777, - "step": 5030 - }, - { - "epoch": 0.5120390124961902, - "grad_norm": 3.0, - "learning_rate": 2.444785589821356e-05, - "loss": 0.1505, - "step": 5040 - }, - { - "epoch": 0.5130549629178096, - "grad_norm": 1.6484375, - "learning_rate": 2.436726703828118e-05, - "loss": 0.2672, - "step": 5050 - }, - { - "epoch": 0.514070913339429, - "grad_norm": 4.34375, - "learning_rate": 2.428668475696308e-05, - "loss": 0.1756, - "step": 5060 - }, - { - "epoch": 0.5150868637610485, - "grad_norm": 2.78125, - "learning_rate": 2.420610989208465e-05, - "loss": 0.1655, - "step": 5070 - }, - { - "epoch": 0.5161028141826679, - "grad_norm": 1.4609375, - "learning_rate": 2.412554328139419e-05, - "loss": 0.1579, - "step": 5080 - }, - { - "epoch": 0.5171187646042873, - "grad_norm": 2.28125, - "learning_rate": 2.404498576255416e-05, - "loss": 0.1599, - "step": 5090 - }, - { - "epoch": 0.5181347150259067, - "grad_norm": 0.6484375, - "learning_rate": 2.3964438173132522e-05, - "loss": 0.1508, - "step": 5100 - }, - { - "epoch": 0.5191506654475262, - "grad_norm": 3.390625, - "learning_rate": 2.388390135059395e-05, - "loss": 0.1578, - "step": 5110 - }, - { - "epoch": 0.5201666158691456, - "grad_norm": 1.21875, - "learning_rate": 2.3803376132291226e-05, - "loss": 0.1374, - "step": 5120 - }, - { - "epoch": 0.521182566290765, - "grad_norm": 4.0625, - "learning_rate": 2.3722863355456436e-05, - "loss": 0.1854, - "step": 5130 - }, - { - "epoch": 0.5221985167123845, - "grad_norm": 4.71875, - "learning_rate": 2.364236385719236e-05, - "loss": 0.1391, - "step": 5140 - }, - { - "epoch": 0.5232144671340039, - "grad_norm": 3.296875, - "learning_rate": 2.356187847446366e-05, - "loss": 0.2106, - "step": 5150 - }, - { - "epoch": 0.5242304175556233, - "grad_norm": 3.296875, - "learning_rate": 2.348140804408829e-05, - "loss": 0.2383, - "step": 5160 - }, - { - "epoch": 0.5252463679772427, - "grad_norm": 3.359375, - "learning_rate": 2.3400953402728713e-05, - "loss": 0.1537, - "step": 5170 - }, - { - "epoch": 0.5262623183988622, - "grad_norm": 1.4921875, - "learning_rate": 2.332051538688322e-05, - "loss": 0.1841, - "step": 5180 - }, - { - "epoch": 0.5272782688204816, - "grad_norm": 3.25, - "learning_rate": 2.3240094832877287e-05, - "loss": 0.1855, - "step": 5190 - }, - { - "epoch": 0.528294219242101, - "grad_norm": 3.34375, - "learning_rate": 2.3159692576854793e-05, - "loss": 0.2625, - "step": 5200 - }, - { - "epoch": 0.5293101696637205, - "grad_norm": 3.6875, - "learning_rate": 2.3079309454769413e-05, - "loss": 0.1292, - "step": 5210 - }, - { - "epoch": 0.5303261200853399, - "grad_norm": 1.1171875, - "learning_rate": 2.2998946302375827e-05, - "loss": 0.1263, - "step": 5220 - }, - { - "epoch": 0.5313420705069593, - "grad_norm": 2.71875, - "learning_rate": 2.2918603955221148e-05, - "loss": 0.2296, - "step": 5230 - }, - { - "epoch": 0.5323580209285786, - "grad_norm": 2.015625, - "learning_rate": 2.283828324863613e-05, - "loss": 0.1231, - "step": 5240 - }, - { - "epoch": 0.5333739713501982, - "grad_norm": 3.671875, - "learning_rate": 2.2757985017726557e-05, - "loss": 0.1939, - "step": 5250 - }, - { - "epoch": 0.5343899217718175, - "grad_norm": 1.9765625, - "learning_rate": 2.2677710097364495e-05, - "loss": 0.168, - "step": 5260 - }, - { - "epoch": 0.5354058721934369, - "grad_norm": 2.609375, - "learning_rate": 2.259745932217969e-05, - "loss": 0.1883, - "step": 5270 - }, - { - "epoch": 0.5364218226150563, - "grad_norm": 2.8125, - "learning_rate": 2.2517233526550817e-05, - "loss": 0.1898, - "step": 5280 - }, - { - "epoch": 0.5374377730366758, - "grad_norm": 3.125, - "learning_rate": 2.2437033544596837e-05, - "loss": 0.1838, - "step": 5290 - }, - { - "epoch": 0.5384537234582952, - "grad_norm": 4.90625, - "learning_rate": 2.2356860210168336e-05, - "loss": 0.1553, - "step": 5300 - }, - { - "epoch": 0.5394696738799146, - "grad_norm": 3.171875, - "learning_rate": 2.2276714356838824e-05, - "loss": 0.2248, - "step": 5310 - }, - { - "epoch": 0.5404856243015341, - "grad_norm": 1.34375, - "learning_rate": 2.2196596817896118e-05, - "loss": 0.1421, - "step": 5320 - }, - { - "epoch": 0.5415015747231535, - "grad_norm": 3.28125, - "learning_rate": 2.2116508426333596e-05, - "loss": 0.1947, - "step": 5330 - }, - { - "epoch": 0.5425175251447729, - "grad_norm": 1.9296875, - "learning_rate": 2.2036450014841652e-05, - "loss": 0.2207, - "step": 5340 - }, - { - "epoch": 0.5435334755663923, - "grad_norm": 0.5703125, - "learning_rate": 2.19564224157989e-05, - "loss": 0.2208, - "step": 5350 - }, - { - "epoch": 0.5445494259880118, - "grad_norm": 7.5625, - "learning_rate": 2.1876426461263654e-05, - "loss": 0.1739, - "step": 5360 - }, - { - "epoch": 0.5455653764096312, - "grad_norm": 2.15625, - "learning_rate": 2.179646298296519e-05, - "loss": 0.1938, - "step": 5370 - }, - { - "epoch": 0.5465813268312506, - "grad_norm": 4.1875, - "learning_rate": 2.171653281229511e-05, - "loss": 0.1736, - "step": 5380 - }, - { - "epoch": 0.5475972772528701, - "grad_norm": 4.65625, - "learning_rate": 2.1636636780298732e-05, - "loss": 0.2167, - "step": 5390 - }, - { - "epoch": 0.5486132276744895, - "grad_norm": 1.84375, - "learning_rate": 2.1556775717666427e-05, - "loss": 0.1711, - "step": 5400 - }, - { - "epoch": 0.5496291780961089, - "grad_norm": 5.125, - "learning_rate": 2.147695045472499e-05, - "loss": 0.1789, - "step": 5410 - }, - { - "epoch": 0.5506451285177283, - "grad_norm": 3.859375, - "learning_rate": 2.1397161821428973e-05, - "loss": 0.2187, - "step": 5420 - }, - { - "epoch": 0.5516610789393478, - "grad_norm": 2.25, - "learning_rate": 2.131741064735212e-05, - "loss": 0.1367, - "step": 5430 - }, - { - "epoch": 0.5526770293609672, - "grad_norm": 4.65625, - "learning_rate": 2.1237697761678684e-05, - "loss": 0.1574, - "step": 5440 - }, - { - "epoch": 0.5536929797825866, - "grad_norm": 1.2265625, - "learning_rate": 2.1158023993194848e-05, - "loss": 0.1301, - "step": 5450 - }, - { - "epoch": 0.5547089302042061, - "grad_norm": 4.21875, - "learning_rate": 2.107839017028005e-05, - "loss": 0.2782, - "step": 5460 - }, - { - "epoch": 0.5557248806258255, - "grad_norm": 0.52734375, - "learning_rate": 2.0998797120898457e-05, - "loss": 0.2024, - "step": 5470 - }, - { - "epoch": 0.5567408310474449, - "grad_norm": 1.46875, - "learning_rate": 2.0919245672590277e-05, - "loss": 0.1755, - "step": 5480 - }, - { - "epoch": 0.5577567814690643, - "grad_norm": 2.140625, - "learning_rate": 2.083973665246318e-05, - "loss": 0.2058, - "step": 5490 - }, - { - "epoch": 0.5587727318906838, - "grad_norm": 1.5390625, - "learning_rate": 2.076027088718373e-05, - "loss": 0.2159, - "step": 5500 - }, - { - "epoch": 0.5597886823123032, - "grad_norm": 1.9921875, - "learning_rate": 2.0680849202968743e-05, - "loss": 0.2139, - "step": 5510 - }, - { - "epoch": 0.5608046327339226, - "grad_norm": 2.4375, - "learning_rate": 2.060147242557674e-05, - "loss": 0.183, - "step": 5520 - }, - { - "epoch": 0.5618205831555421, - "grad_norm": 5.5, - "learning_rate": 2.0522141380299308e-05, - "loss": 0.1673, - "step": 5530 - }, - { - "epoch": 0.5628365335771615, - "grad_norm": 4.25, - "learning_rate": 2.044285689195258e-05, - "loss": 0.1674, - "step": 5540 - }, - { - "epoch": 0.5638524839987809, - "grad_norm": 2.109375, - "learning_rate": 2.0363619784868604e-05, - "loss": 0.1531, - "step": 5550 - }, - { - "epoch": 0.5648684344204002, - "grad_norm": 2.59375, - "learning_rate": 2.0284430882886836e-05, - "loss": 0.1665, - "step": 5560 - }, - { - "epoch": 0.5658843848420197, - "grad_norm": 3.984375, - "learning_rate": 2.020529100934549e-05, - "loss": 0.1717, - "step": 5570 - }, - { - "epoch": 0.5669003352636391, - "grad_norm": 1.6015625, - "learning_rate": 2.012620098707306e-05, - "loss": 0.1167, - "step": 5580 - }, - { - "epoch": 0.5679162856852585, - "grad_norm": 6.0625, - "learning_rate": 2.004716163837972e-05, - "loss": 0.2084, - "step": 5590 - }, - { - "epoch": 0.5689322361068779, - "grad_norm": 2.5625, - "learning_rate": 1.996817378504876e-05, - "loss": 0.1939, - "step": 5600 - }, - { - "epoch": 0.5699481865284974, - "grad_norm": 3.109375, - "learning_rate": 1.9889238248328108e-05, - "loss": 0.1241, - "step": 5610 - }, - { - "epoch": 0.5709641369501168, - "grad_norm": 4.875, - "learning_rate": 1.981035584892171e-05, - "loss": 0.1865, - "step": 5620 - }, - { - "epoch": 0.5719800873717362, - "grad_norm": 2.984375, - "learning_rate": 1.9731527406981072e-05, - "loss": 0.1639, - "step": 5630 - }, - { - "epoch": 0.5729960377933557, - "grad_norm": 4.4375, - "learning_rate": 1.9652753742096655e-05, - "loss": 0.2019, - "step": 5640 - }, - { - "epoch": 0.5740119882149751, - "grad_norm": 4.3125, - "learning_rate": 1.9574035673289432e-05, - "loss": 0.1829, - "step": 5650 - }, - { - "epoch": 0.5750279386365945, - "grad_norm": 3.203125, - "learning_rate": 1.9495374019002312e-05, - "loss": 0.2267, - "step": 5660 - }, - { - "epoch": 0.5760438890582139, - "grad_norm": 1.765625, - "learning_rate": 1.9416769597091673e-05, - "loss": 0.1411, - "step": 5670 - }, - { - "epoch": 0.5770598394798334, - "grad_norm": 2.640625, - "learning_rate": 1.9338223224818818e-05, - "loss": 0.1476, - "step": 5680 - }, - { - "epoch": 0.5780757899014528, - "grad_norm": 4.84375, - "learning_rate": 1.9259735718841524e-05, - "loss": 0.1417, - "step": 5690 - }, - { - "epoch": 0.5790917403230722, - "grad_norm": 2.421875, - "learning_rate": 1.918130789520551e-05, - "loss": 0.1592, - "step": 5700 - }, - { - "epoch": 0.5801076907446917, - "grad_norm": 2.984375, - "learning_rate": 1.9102940569335963e-05, - "loss": 0.161, - "step": 5710 - }, - { - "epoch": 0.5811236411663111, - "grad_norm": 1.0234375, - "learning_rate": 1.9024634556029093e-05, - "loss": 0.1614, - "step": 5720 - }, - { - "epoch": 0.5821395915879305, - "grad_norm": 2.90625, - "learning_rate": 1.89463906694436e-05, - "loss": 0.1505, - "step": 5730 - }, - { - "epoch": 0.5831555420095499, - "grad_norm": 2.875, - "learning_rate": 1.8868209723092286e-05, - "loss": 0.1674, - "step": 5740 - }, - { - "epoch": 0.5841714924311694, - "grad_norm": 0.408203125, - "learning_rate": 1.8790092529833508e-05, - "loss": 0.1468, - "step": 5750 - }, - { - "epoch": 0.5851874428527888, - "grad_norm": 5.1875, - "learning_rate": 1.871203990186281e-05, - "loss": 0.1903, - "step": 5760 - }, - { - "epoch": 0.5862033932744082, - "grad_norm": 0.5546875, - "learning_rate": 1.8634052650704415e-05, - "loss": 0.2644, - "step": 5770 - }, - { - "epoch": 0.5872193436960277, - "grad_norm": 3.203125, - "learning_rate": 1.8556131587202848e-05, - "loss": 0.1968, - "step": 5780 - }, - { - "epoch": 0.5882352941176471, - "grad_norm": 2.484375, - "learning_rate": 1.8478277521514424e-05, - "loss": 0.2249, - "step": 5790 - }, - { - "epoch": 0.5892512445392665, - "grad_norm": 4.0, - "learning_rate": 1.8400491263098906e-05, - "loss": 0.1881, - "step": 5800 - }, - { - "epoch": 0.5902671949608859, - "grad_norm": 1.90625, - "learning_rate": 1.832277362071106e-05, - "loss": 0.1352, - "step": 5810 - }, - { - "epoch": 0.5912831453825054, - "grad_norm": 2.765625, - "learning_rate": 1.824512540239221e-05, - "loss": 0.2737, - "step": 5820 - }, - { - "epoch": 0.5922990958041248, - "grad_norm": 2.609375, - "learning_rate": 1.81675474154619e-05, - "loss": 0.1566, - "step": 5830 - }, - { - "epoch": 0.5933150462257442, - "grad_norm": 2.6875, - "learning_rate": 1.8090040466509444e-05, - "loss": 0.1999, - "step": 5840 - }, - { - "epoch": 0.5943309966473637, - "grad_norm": 2.609375, - "learning_rate": 1.8012605361385592e-05, - "loss": 0.2372, - "step": 5850 - }, - { - "epoch": 0.5953469470689831, - "grad_norm": 8.125, - "learning_rate": 1.7935242905194087e-05, - "loss": 0.2411, - "step": 5860 - }, - { - "epoch": 0.5963628974906025, - "grad_norm": 3.46875, - "learning_rate": 1.785795390228336e-05, - "loss": 0.138, - "step": 5870 - }, - { - "epoch": 0.5973788479122218, - "grad_norm": 2.3125, - "learning_rate": 1.7780739156238125e-05, - "loss": 0.1867, - "step": 5880 - }, - { - "epoch": 0.5983947983338413, - "grad_norm": 4.0625, - "learning_rate": 1.770359946987105e-05, - "loss": 0.2091, - "step": 5890 - }, - { - "epoch": 0.5994107487554607, - "grad_norm": 5.21875, - "learning_rate": 1.7626535645214378e-05, - "loss": 0.2091, - "step": 5900 - }, - { - "epoch": 0.6004266991770801, - "grad_norm": 3.15625, - "learning_rate": 1.7549548483511614e-05, - "loss": 0.1927, - "step": 5910 - }, - { - "epoch": 0.6014426495986995, - "grad_norm": 4.71875, - "learning_rate": 1.7472638785209198e-05, - "loss": 0.1893, - "step": 5920 - }, - { - "epoch": 0.602458600020319, - "grad_norm": 3.015625, - "learning_rate": 1.7395807349948145e-05, - "loss": 0.1557, - "step": 5930 - }, - { - "epoch": 0.6034745504419384, - "grad_norm": 2.9375, - "learning_rate": 1.73190549765558e-05, - "loss": 0.1717, - "step": 5940 - }, - { - "epoch": 0.6044905008635578, - "grad_norm": 3.109375, - "learning_rate": 1.724238246303745e-05, - "loss": 0.1879, - "step": 5950 - }, - { - "epoch": 0.6055064512851773, - "grad_norm": 3.875, - "learning_rate": 1.71657906065681e-05, - "loss": 0.1908, - "step": 5960 - }, - { - "epoch": 0.6065224017067967, - "grad_norm": 5.09375, - "learning_rate": 1.7089280203484115e-05, - "loss": 0.1712, - "step": 5970 - }, - { - "epoch": 0.6075383521284161, - "grad_norm": 3.015625, - "learning_rate": 1.701285204927502e-05, - "loss": 0.1454, - "step": 5980 - }, - { - "epoch": 0.6085543025500355, - "grad_norm": 3.265625, - "learning_rate": 1.693650693857515e-05, - "loss": 0.2283, - "step": 5990 - }, - { - "epoch": 0.609570252971655, - "grad_norm": 3.40625, - "learning_rate": 1.6860245665155466e-05, - "loss": 0.2188, - "step": 6000 - }, - { - "epoch": 0.6105862033932744, - "grad_norm": 2.5625, - "learning_rate": 1.678406902191521e-05, - "loss": 0.1605, - "step": 6010 - }, - { - "epoch": 0.6116021538148938, - "grad_norm": 0.6796875, - "learning_rate": 1.670797780087374e-05, - "loss": 0.1472, - "step": 6020 - }, - { - "epoch": 0.6126181042365133, - "grad_norm": 2.234375, - "learning_rate": 1.6631972793162288e-05, - "loss": 0.1676, - "step": 6030 - }, - { - "epoch": 0.6136340546581327, - "grad_norm": 1.25, - "learning_rate": 1.6556054789015662e-05, - "loss": 0.1508, - "step": 6040 - }, - { - "epoch": 0.6146500050797521, - "grad_norm": 4.78125, - "learning_rate": 1.6480224577764132e-05, - "loss": 0.1981, - "step": 6050 - }, - { - "epoch": 0.6156659555013715, - "grad_norm": 3.46875, - "learning_rate": 1.6404482947825137e-05, - "loss": 0.2514, - "step": 6060 - }, - { - "epoch": 0.616681905922991, - "grad_norm": 1.265625, - "learning_rate": 1.6328830686695154e-05, - "loss": 0.2397, - "step": 6070 - }, - { - "epoch": 0.6176978563446104, - "grad_norm": 1.953125, - "learning_rate": 1.625326858094144e-05, - "loss": 0.1523, - "step": 6080 - }, - { - "epoch": 0.6187138067662298, - "grad_norm": 3.484375, - "learning_rate": 1.6177797416193953e-05, - "loss": 0.218, - "step": 6090 - }, - { - "epoch": 0.6197297571878493, - "grad_norm": 3.484375, - "learning_rate": 1.6102417977137052e-05, - "loss": 0.1476, - "step": 6100 - }, - { - "epoch": 0.6207457076094687, - "grad_norm": 4.90625, - "learning_rate": 1.602713104750147e-05, - "loss": 0.1818, - "step": 6110 - }, - { - "epoch": 0.6217616580310881, - "grad_norm": 4.375, - "learning_rate": 1.5951937410056087e-05, - "loss": 0.2061, - "step": 6120 - }, - { - "epoch": 0.6227776084527075, - "grad_norm": 6.3125, - "learning_rate": 1.587683784659979e-05, - "loss": 0.1566, - "step": 6130 - }, - { - "epoch": 0.623793558874327, - "grad_norm": 2.828125, - "learning_rate": 1.58018331379534e-05, - "loss": 0.1376, - "step": 6140 - }, - { - "epoch": 0.6248095092959464, - "grad_norm": 2.40625, - "learning_rate": 1.572692406395149e-05, - "loss": 0.1655, - "step": 6150 - }, - { - "epoch": 0.6258254597175658, - "grad_norm": 4.34375, - "learning_rate": 1.5652111403434338e-05, - "loss": 0.2363, - "step": 6160 - }, - { - "epoch": 0.6268414101391853, - "grad_norm": 2.453125, - "learning_rate": 1.5577395934239757e-05, - "loss": 0.2464, - "step": 6170 - }, - { - "epoch": 0.6278573605608047, - "grad_norm": 2.53125, - "learning_rate": 1.5502778433195085e-05, - "loss": 0.1898, - "step": 6180 - }, - { - "epoch": 0.628873310982424, - "grad_norm": 2.28125, - "learning_rate": 1.5428259676109048e-05, - "loss": 0.1804, - "step": 6190 - }, - { - "epoch": 0.6298892614040434, - "grad_norm": 4.3125, - "learning_rate": 1.5353840437763732e-05, - "loss": 0.1409, - "step": 6200 - }, - { - "epoch": 0.630905211825663, - "grad_norm": 2.5625, - "learning_rate": 1.5279521491906496e-05, - "loss": 0.2449, - "step": 6210 - }, - { - "epoch": 0.6319211622472823, - "grad_norm": 3.0625, - "learning_rate": 1.520530361124195e-05, - "loss": 0.2103, - "step": 6220 - }, - { - "epoch": 0.6329371126689017, - "grad_norm": 2.609375, - "learning_rate": 1.5131187567423937e-05, - "loss": 0.2156, - "step": 6230 - }, - { - "epoch": 0.6339530630905211, - "grad_norm": 2.703125, - "learning_rate": 1.5057174131047446e-05, - "loss": 0.161, - "step": 6240 - }, - { - "epoch": 0.6349690135121406, - "grad_norm": 3.265625, - "learning_rate": 1.4983264071640679e-05, - "loss": 0.1757, - "step": 6250 - }, - { - "epoch": 0.63598496393376, - "grad_norm": 3.15625, - "learning_rate": 1.490945815765699e-05, - "loss": 0.2011, - "step": 6260 - }, - { - "epoch": 0.6370009143553794, - "grad_norm": 5.375, - "learning_rate": 1.4835757156466945e-05, - "loss": 0.1658, - "step": 6270 - }, - { - "epoch": 0.6380168647769989, - "grad_norm": 2.984375, - "learning_rate": 1.4762161834350271e-05, - "loss": 0.1754, - "step": 6280 - }, - { - "epoch": 0.6390328151986183, - "grad_norm": 2.015625, - "learning_rate": 1.4688672956487987e-05, - "loss": 0.1427, - "step": 6290 - }, - { - "epoch": 0.6400487656202377, - "grad_norm": 3.78125, - "learning_rate": 1.4615291286954352e-05, - "loss": 0.1517, - "step": 6300 - }, - { - "epoch": 0.6410647160418571, - "grad_norm": 2.859375, - "learning_rate": 1.4542017588709005e-05, - "loss": 0.2348, - "step": 6310 - }, - { - "epoch": 0.6420806664634766, - "grad_norm": 2.421875, - "learning_rate": 1.4468852623588961e-05, - "loss": 0.2089, - "step": 6320 - }, - { - "epoch": 0.643096616885096, - "grad_norm": 2.15625, - "learning_rate": 1.4395797152300719e-05, - "loss": 0.1702, - "step": 6330 - }, - { - "epoch": 0.6441125673067154, - "grad_norm": 1.53125, - "learning_rate": 1.4322851934412382e-05, - "loss": 0.1017, - "step": 6340 - }, - { - "epoch": 0.6451285177283349, - "grad_norm": 1.90625, - "learning_rate": 1.4250017728345716e-05, - "loss": 0.1813, - "step": 6350 - }, - { - "epoch": 0.6461444681499543, - "grad_norm": 2.015625, - "learning_rate": 1.4177295291368292e-05, - "loss": 0.1095, - "step": 6360 - }, - { - "epoch": 0.6471604185715737, - "grad_norm": 2.625, - "learning_rate": 1.410468537958558e-05, - "loss": 0.2259, - "step": 6370 - }, - { - "epoch": 0.6481763689931931, - "grad_norm": 3.5, - "learning_rate": 1.4032188747933136e-05, - "loss": 0.1595, - "step": 6380 - }, - { - "epoch": 0.6491923194148126, - "grad_norm": 5.21875, - "learning_rate": 1.39598061501687e-05, - "loss": 0.2226, - "step": 6390 - }, - { - "epoch": 0.650208269836432, - "grad_norm": 5.34375, - "learning_rate": 1.388753833886442e-05, - "loss": 0.2132, - "step": 6400 - }, - { - "epoch": 0.6512242202580514, - "grad_norm": 3.640625, - "learning_rate": 1.3815386065398945e-05, - "loss": 0.1227, - "step": 6410 - }, - { - "epoch": 0.6522401706796709, - "grad_norm": 1.0, - "learning_rate": 1.3743350079949705e-05, - "loss": 0.1755, - "step": 6420 - }, - { - "epoch": 0.6532561211012903, - "grad_norm": 2.359375, - "learning_rate": 1.3671431131485057e-05, - "loss": 0.1552, - "step": 6430 - }, - { - "epoch": 0.6542720715229097, - "grad_norm": 5.3125, - "learning_rate": 1.3599629967756483e-05, - "loss": 0.1917, - "step": 6440 - }, - { - "epoch": 0.6552880219445291, - "grad_norm": 4.625, - "learning_rate": 1.3527947335290877e-05, - "loss": 0.1812, - "step": 6450 - }, - { - "epoch": 0.6563039723661486, - "grad_norm": 1.234375, - "learning_rate": 1.3456383979382708e-05, - "loss": 0.1896, - "step": 6460 - }, - { - "epoch": 0.657319922787768, - "grad_norm": 3.984375, - "learning_rate": 1.3384940644086352e-05, - "loss": 0.1484, - "step": 6470 - }, - { - "epoch": 0.6583358732093874, - "grad_norm": 2.40625, - "learning_rate": 1.3313618072208268e-05, - "loss": 0.1334, - "step": 6480 - }, - { - "epoch": 0.6593518236310069, - "grad_norm": 4.375, - "learning_rate": 1.3242417005299357e-05, - "loss": 0.1351, - "step": 6490 - }, - { - "epoch": 0.6603677740526263, - "grad_norm": 2.640625, - "learning_rate": 1.31713381836472e-05, - "loss": 0.1717, - "step": 6500 - }, - { - "epoch": 0.6613837244742456, - "grad_norm": 2.640625, - "learning_rate": 1.3100382346268392e-05, - "loss": 0.1867, - "step": 6510 - }, - { - "epoch": 0.662399674895865, - "grad_norm": 1.734375, - "learning_rate": 1.3029550230900812e-05, - "loss": 0.1997, - "step": 6520 - }, - { - "epoch": 0.6634156253174845, - "grad_norm": 3.609375, - "learning_rate": 1.2958842573996016e-05, - "loss": 0.1969, - "step": 6530 - }, - { - "epoch": 0.6644315757391039, - "grad_norm": 3.578125, - "learning_rate": 1.2888260110711525e-05, - "loss": 0.1469, - "step": 6540 - }, - { - "epoch": 0.6654475261607233, - "grad_norm": 1.3515625, - "learning_rate": 1.2817803574903212e-05, - "loss": 0.1524, - "step": 6550 - }, - { - "epoch": 0.6664634765823427, - "grad_norm": 2.109375, - "learning_rate": 1.2747473699117668e-05, - "loss": 0.159, - "step": 6560 - }, - { - "epoch": 0.6674794270039622, - "grad_norm": 1.53125, - "learning_rate": 1.267727121458458e-05, - "loss": 0.1999, - "step": 6570 - }, - { - "epoch": 0.6684953774255816, - "grad_norm": 1.7265625, - "learning_rate": 1.2607196851209137e-05, - "loss": 0.2216, - "step": 6580 - }, - { - "epoch": 0.669511327847201, - "grad_norm": 3.125, - "learning_rate": 1.2537251337564412e-05, - "loss": 0.1607, - "step": 6590 - }, - { - "epoch": 0.6705272782688205, - "grad_norm": 2.421875, - "learning_rate": 1.2467435400883839e-05, - "loss": 0.2187, - "step": 6600 - }, - { - "epoch": 0.6715432286904399, - "grad_norm": 1.5078125, - "learning_rate": 1.239774976705359e-05, - "loss": 0.1753, - "step": 6610 - }, - { - "epoch": 0.6725591791120593, - "grad_norm": 1.140625, - "learning_rate": 1.2328195160605092e-05, - "loss": 0.194, - "step": 6620 - }, - { - "epoch": 0.6735751295336787, - "grad_norm": 4.9375, - "learning_rate": 1.225877230470743e-05, - "loss": 0.1485, - "step": 6630 - }, - { - "epoch": 0.6745910799552982, - "grad_norm": 3.65625, - "learning_rate": 1.218948192115988e-05, - "loss": 0.1847, - "step": 6640 - }, - { - "epoch": 0.6756070303769176, - "grad_norm": 3.875, - "learning_rate": 1.21203247303844e-05, - "loss": 0.1874, - "step": 6650 - }, - { - "epoch": 0.676622980798537, - "grad_norm": 2.65625, - "learning_rate": 1.2051301451418073e-05, - "loss": 0.2377, - "step": 6660 - }, - { - "epoch": 0.6776389312201565, - "grad_norm": 2.09375, - "learning_rate": 1.198241280190574e-05, - "loss": 0.1508, - "step": 6670 - }, - { - "epoch": 0.6786548816417759, - "grad_norm": 2.203125, - "learning_rate": 1.1913659498092431e-05, - "loss": 0.1537, - "step": 6680 - }, - { - "epoch": 0.6796708320633953, - "grad_norm": 2.484375, - "learning_rate": 1.184504225481601e-05, - "loss": 0.2339, - "step": 6690 - }, - { - "epoch": 0.6806867824850147, - "grad_norm": 5.625, - "learning_rate": 1.177656178549966e-05, - "loss": 0.2102, - "step": 6700 - }, - { - "epoch": 0.6817027329066342, - "grad_norm": 2.5, - "learning_rate": 1.1708218802144536e-05, - "loss": 0.1435, - "step": 6710 - }, - { - "epoch": 0.6827186833282536, - "grad_norm": 3.84375, - "learning_rate": 1.1640014015322323e-05, - "loss": 0.1823, - "step": 6720 - }, - { - "epoch": 0.683734633749873, - "grad_norm": 2.359375, - "learning_rate": 1.1571948134167862e-05, - "loss": 0.1154, - "step": 6730 - }, - { - "epoch": 0.6847505841714925, - "grad_norm": 2.90625, - "learning_rate": 1.1504021866371761e-05, - "loss": 0.2105, - "step": 6740 - }, - { - "epoch": 0.6857665345931119, - "grad_norm": 5.46875, - "learning_rate": 1.143623591817304e-05, - "loss": 0.1317, - "step": 6750 - }, - { - "epoch": 0.6867824850147313, - "grad_norm": 3.34375, - "learning_rate": 1.1368590994351835e-05, - "loss": 0.1406, - "step": 6760 - }, - { - "epoch": 0.6877984354363507, - "grad_norm": 3.78125, - "learning_rate": 1.130108779822198e-05, - "loss": 0.1425, - "step": 6770 - }, - { - "epoch": 0.6888143858579702, - "grad_norm": 0.77734375, - "learning_rate": 1.1233727031623783e-05, - "loss": 0.1623, - "step": 6780 - }, - { - "epoch": 0.6898303362795896, - "grad_norm": 4.625, - "learning_rate": 1.1166509394916682e-05, - "loss": 0.1591, - "step": 6790 - }, - { - "epoch": 0.690846286701209, - "grad_norm": 3.84375, - "learning_rate": 1.1099435586971982e-05, - "loss": 0.1758, - "step": 6800 - }, - { - "epoch": 0.6918622371228285, - "grad_norm": 2.4375, - "learning_rate": 1.1032506305165555e-05, - "loss": 0.1018, - "step": 6810 - }, - { - "epoch": 0.6928781875444479, - "grad_norm": 3.203125, - "learning_rate": 1.0965722245370641e-05, - "loss": 0.1485, - "step": 6820 - }, - { - "epoch": 0.6938941379660672, - "grad_norm": 0.7109375, - "learning_rate": 1.0899084101950561e-05, - "loss": 0.1762, - "step": 6830 - }, - { - "epoch": 0.6949100883876866, - "grad_norm": 1.9765625, - "learning_rate": 1.0832592567751555e-05, - "loss": 0.1402, - "step": 6840 - }, - { - "epoch": 0.6959260388093061, - "grad_norm": 1.4609375, - "learning_rate": 1.0766248334095505e-05, - "loss": 0.2278, - "step": 6850 - }, - { - "epoch": 0.6969419892309255, - "grad_norm": 3.953125, - "learning_rate": 1.0700052090772828e-05, - "loss": 0.1969, - "step": 6860 - }, - { - "epoch": 0.6979579396525449, - "grad_norm": 2.453125, - "learning_rate": 1.0634004526035249e-05, - "loss": 0.2073, - "step": 6870 - }, - { - "epoch": 0.6989738900741643, - "grad_norm": 1.6171875, - "learning_rate": 1.0568106326588645e-05, - "loss": 0.1902, - "step": 6880 - }, - { - "epoch": 0.6999898404957838, - "grad_norm": 1.2734375, - "learning_rate": 1.0502358177585953e-05, - "loss": 0.2165, - "step": 6890 - }, - { - "epoch": 0.7010057909174032, - "grad_norm": 1.671875, - "learning_rate": 1.0436760762619977e-05, - "loss": 0.1952, - "step": 6900 - }, - { - "epoch": 0.7020217413390226, - "grad_norm": 2.8125, - "learning_rate": 1.0371314763716347e-05, - "loss": 0.1422, - "step": 6910 - }, - { - "epoch": 0.7030376917606421, - "grad_norm": 2.53125, - "learning_rate": 1.0306020861326388e-05, - "loss": 0.0961, - "step": 6920 - }, - { - "epoch": 0.7040536421822615, - "grad_norm": 3.046875, - "learning_rate": 1.0240879734320068e-05, - "loss": 0.1542, - "step": 6930 - }, - { - "epoch": 0.7050695926038809, - "grad_norm": 2.859375, - "learning_rate": 1.0175892059978901e-05, - "loss": 0.1748, - "step": 6940 - }, - { - "epoch": 0.7060855430255003, - "grad_norm": 2.671875, - "learning_rate": 1.0111058513988958e-05, - "loss": 0.0819, - "step": 6950 - }, - { - "epoch": 0.7071014934471198, - "grad_norm": 3.5625, - "learning_rate": 1.0046379770433803e-05, - "loss": 0.1933, - "step": 6960 - }, - { - "epoch": 0.7081174438687392, - "grad_norm": 2.859375, - "learning_rate": 9.98185650178749e-06, - "loss": 0.1891, - "step": 6970 - }, - { - "epoch": 0.7091333942903586, - "grad_norm": 3.15625, - "learning_rate": 9.917489378907591e-06, - "loss": 0.2102, - "step": 6980 - }, - { - "epoch": 0.7101493447119781, - "grad_norm": 6.40625, - "learning_rate": 9.853279071028212e-06, - "loss": 0.1714, - "step": 6990 - }, - { - "epoch": 0.7111652951335975, - "grad_norm": 2.375, - "learning_rate": 9.78922624575303e-06, - "loss": 0.1299, - "step": 7000 - }, - { - "epoch": 0.7121812455552169, - "grad_norm": 2.078125, - "learning_rate": 9.72533156904833e-06, - "loss": 0.1914, - "step": 7010 - }, - { - "epoch": 0.7131971959768363, - "grad_norm": 3.859375, - "learning_rate": 9.661595705236137e-06, - "loss": 0.2377, - "step": 7020 - }, - { - "epoch": 0.7142131463984558, - "grad_norm": 1.171875, - "learning_rate": 9.598019316987244e-06, - "loss": 0.1851, - "step": 7030 - }, - { - "epoch": 0.7152290968200752, - "grad_norm": 1.078125, - "learning_rate": 9.53460306531439e-06, - "loss": 0.2661, - "step": 7040 - }, - { - "epoch": 0.7162450472416946, - "grad_norm": 1.6484375, - "learning_rate": 9.471347609565311e-06, - "loss": 0.1669, - "step": 7050 - }, - { - "epoch": 0.7172609976633141, - "grad_norm": 4.59375, - "learning_rate": 9.408253607415957e-06, - "loss": 0.2487, - "step": 7060 - }, - { - "epoch": 0.7182769480849335, - "grad_norm": 3.09375, - "learning_rate": 9.345321714863614e-06, - "loss": 0.186, - "step": 7070 - }, - { - "epoch": 0.7192928985065529, - "grad_norm": 6.0625, - "learning_rate": 9.282552586220075e-06, - "loss": 0.2249, - "step": 7080 - }, - { - "epoch": 0.7203088489281723, - "grad_norm": 1.5703125, - "learning_rate": 9.219946874104885e-06, - "loss": 0.1255, - "step": 7090 - }, - { - "epoch": 0.7213247993497918, - "grad_norm": 1.9453125, - "learning_rate": 9.157505229438481e-06, - "loss": 0.1999, - "step": 7100 - }, - { - "epoch": 0.7223407497714112, - "grad_norm": 5.1875, - "learning_rate": 9.095228301435518e-06, - "loss": 0.199, - "step": 7110 - }, - { - "epoch": 0.7233567001930306, - "grad_norm": 2.078125, - "learning_rate": 9.03311673759802e-06, - "loss": 0.2182, - "step": 7120 - }, - { - "epoch": 0.7243726506146501, - "grad_norm": 6.46875, - "learning_rate": 8.971171183708733e-06, - "loss": 0.1573, - "step": 7130 - }, - { - "epoch": 0.7253886010362695, - "grad_norm": 3.015625, - "learning_rate": 8.909392283824353e-06, - "loss": 0.2044, - "step": 7140 - }, - { - "epoch": 0.7264045514578888, - "grad_norm": 2.921875, - "learning_rate": 8.847780680268872e-06, - "loss": 0.11, - "step": 7150 - }, - { - "epoch": 0.7274205018795082, - "grad_norm": 2.96875, - "learning_rate": 8.786337013626853e-06, - "loss": 0.1897, - "step": 7160 - }, - { - "epoch": 0.7284364523011277, - "grad_norm": 1.7578125, - "learning_rate": 8.725061922736799e-06, - "loss": 0.153, - "step": 7170 - }, - { - "epoch": 0.7294524027227471, - "grad_norm": 1.609375, - "learning_rate": 8.663956044684532e-06, - "loss": 0.1746, - "step": 7180 - }, - { - "epoch": 0.7304683531443665, - "grad_norm": 1.9375, - "learning_rate": 8.603020014796507e-06, - "loss": 0.2284, - "step": 7190 - }, - { - "epoch": 0.7314843035659859, - "grad_norm": 1.515625, - "learning_rate": 8.542254466633273e-06, - "loss": 0.1186, - "step": 7200 - }, - { - "epoch": 0.7325002539876054, - "grad_norm": 1.671875, - "learning_rate": 8.481660031982844e-06, - "loss": 0.1971, - "step": 7210 - }, - { - "epoch": 0.7335162044092248, - "grad_norm": 1.453125, - "learning_rate": 8.421237340854157e-06, - "loss": 0.196, - "step": 7220 - }, - { - "epoch": 0.7345321548308442, - "grad_norm": 0.65234375, - "learning_rate": 8.360987021470479e-06, - "loss": 0.1724, - "step": 7230 - }, - { - "epoch": 0.7355481052524637, - "grad_norm": 2.84375, - "learning_rate": 8.300909700262929e-06, - "loss": 0.175, - "step": 7240 - }, - { - "epoch": 0.7365640556740831, - "grad_norm": 3.109375, - "learning_rate": 8.241006001863924e-06, - "loss": 0.2276, - "step": 7250 - }, - { - "epoch": 0.7375800060957025, - "grad_norm": 4.8125, - "learning_rate": 8.181276549100714e-06, - "loss": 0.2029, - "step": 7260 - }, - { - "epoch": 0.7385959565173219, - "grad_norm": 4.03125, - "learning_rate": 8.12172196298887e-06, - "loss": 0.175, - "step": 7270 - }, - { - "epoch": 0.7396119069389414, - "grad_norm": 3.046875, - "learning_rate": 8.062342862725878e-06, - "loss": 0.1662, - "step": 7280 - }, - { - "epoch": 0.7406278573605608, - "grad_norm": 3.375, - "learning_rate": 8.003139865684662e-06, - "loss": 0.1616, - "step": 7290 - }, - { - "epoch": 0.7416438077821802, - "grad_norm": 2.5625, - "learning_rate": 7.944113587407157e-06, - "loss": 0.2448, - "step": 7300 - }, - { - "epoch": 0.7426597582037997, - "grad_norm": 4.125, - "learning_rate": 7.885264641597961e-06, - "loss": 0.1618, - "step": 7310 - }, - { - "epoch": 0.7436757086254191, - "grad_norm": 3.5, - "learning_rate": 7.826593640117889e-06, - "loss": 0.1134, - "step": 7320 - }, - { - "epoch": 0.7446916590470385, - "grad_norm": 2.6875, - "learning_rate": 7.76810119297767e-06, - "loss": 0.1795, - "step": 7330 - }, - { - "epoch": 0.7457076094686579, - "grad_norm": 4.34375, - "learning_rate": 7.709787908331556e-06, - "loss": 0.2736, - "step": 7340 - }, - { - "epoch": 0.7467235598902774, - "grad_norm": 1.21875, - "learning_rate": 7.651654392471038e-06, - "loss": 0.139, - "step": 7350 - }, - { - "epoch": 0.7477395103118968, - "grad_norm": 3.578125, - "learning_rate": 7.593701249818521e-06, - "loss": 0.2023, - "step": 7360 - }, - { - "epoch": 0.7487554607335162, - "grad_norm": 2.15625, - "learning_rate": 7.535929082921048e-06, - "loss": 0.1702, - "step": 7370 - }, - { - "epoch": 0.7497714111551357, - "grad_norm": 1.96875, - "learning_rate": 7.47833849244402e-06, - "loss": 0.1835, - "step": 7380 - }, - { - "epoch": 0.7507873615767551, - "grad_norm": 2.796875, - "learning_rate": 7.420930077164959e-06, - "loss": 0.1713, - "step": 7390 - }, - { - "epoch": 0.7518033119983745, - "grad_norm": 4.46875, - "learning_rate": 7.363704433967311e-06, - "loss": 0.1906, - "step": 7400 - }, - { - "epoch": 0.7528192624199939, - "grad_norm": 1.75, - "learning_rate": 7.306662157834185e-06, - "loss": 0.1421, - "step": 7410 - }, - { - "epoch": 0.7538352128416134, - "grad_norm": 1.140625, - "learning_rate": 7.2498038418422145e-06, - "loss": 0.1793, - "step": 7420 - }, - { - "epoch": 0.7548511632632328, - "grad_norm": 2.578125, - "learning_rate": 7.193130077155374e-06, - "loss": 0.1603, - "step": 7430 - }, - { - "epoch": 0.7558671136848522, - "grad_norm": 4.3125, - "learning_rate": 7.13664145301883e-06, - "loss": 0.2169, - "step": 7440 - }, - { - "epoch": 0.7568830641064717, - "grad_norm": 3.078125, - "learning_rate": 7.0803385567528025e-06, - "loss": 0.1685, - "step": 7450 - }, - { - "epoch": 0.757899014528091, - "grad_norm": 3.5625, - "learning_rate": 7.024221973746495e-06, - "loss": 0.2282, - "step": 7460 - }, - { - "epoch": 0.7589149649497104, - "grad_norm": 2.265625, - "learning_rate": 6.968292287451961e-06, - "loss": 0.1786, - "step": 7470 - }, - { - "epoch": 0.7599309153713298, - "grad_norm": 4.71875, - "learning_rate": 6.912550079378091e-06, - "loss": 0.1811, - "step": 7480 - }, - { - "epoch": 0.7609468657929493, - "grad_norm": 2.328125, - "learning_rate": 6.856995929084506e-06, - "loss": 0.1747, - "step": 7490 - }, - { - "epoch": 0.7619628162145687, - "grad_norm": 5.21875, - "learning_rate": 6.801630414175589e-06, - "loss": 0.2028, - "step": 7500 - } - ], - "logging_steps": 10, - "max_steps": 9843, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "total_flos": 0.0, - "train_batch_size": 7, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoints/checkpoint-7500/training_args.bin b/checkpoints/checkpoint-7500/training_args.bin deleted file mode 100644 index 6bb0a4ec10ab29ca1942aecdfa4212d352e373f1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-7500/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 -size 4984 diff --git a/checkpoints/checkpoint-8000/adapter_config.json b/checkpoints/checkpoint-8000/adapter_config.json deleted file mode 100644 index 2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-8000/adapter_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "alpha_pattern": {}, - "auto_mapping": null, - "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": "gaussian", - "layer_replication": null, - "layers_pattern": null, - "layers_to_transform": null, - "loftq_config": {}, - "lora_alpha": 32, - "lora_dropout": 0.1, - "megatron_config": null, - "megatron_core": "megatron.core", - "modules_to_save": null, - "peft_type": "LORA", - "r": 8, - "rank_pattern": {}, - "revision": null, - "target_modules": [ - "v_proj", - "q_proj" - ], - "task_type": "CAUSAL_LM", - "use_dora": false, - "use_rslora": false -} \ No newline at end of file diff --git a/checkpoints/checkpoint-8000/adapter_model.safetensors b/checkpoints/checkpoint-8000/adapter_model.safetensors deleted file mode 100644 index a3e74da16eac9d652cb024c1c96567e92d1eb6b2..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-8000/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:34815b44876635d03a12d793111d30ddb5f1ba50b8380dcfca1bf6c33938f840 -size 5919456 diff --git a/checkpoints/checkpoint-8000/optimizer.pt b/checkpoints/checkpoint-8000/optimizer.pt deleted file mode 100644 index 756d0512f4d3fe7c92a0b851ef0cade6d0d5d29c..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-8000/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5f76507605c949d2190ab2abb8d665fe079ec56e9d7d3261ffaa91dcc3e884b3 -size 11930938 diff --git a/checkpoints/checkpoint-8000/rng_state_0.pth b/checkpoints/checkpoint-8000/rng_state_0.pth deleted file mode 100644 index 820bae5762cf93cad79c7772fa2a022119a232d9..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-8000/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8f2044a5599a741319339b0e82e423a0b7bec9a103bb74fa855a2b26076b2bbe -size 15024 diff --git a/checkpoints/checkpoint-8000/rng_state_1.pth b/checkpoints/checkpoint-8000/rng_state_1.pth deleted file mode 100644 index 1cab1e079612b70518a476c16ab1414d88aff839..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-8000/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:274ebe0b40c3a495361afcc0dde20f0438b9b75835963738c84e733228f7478e -size 15024 diff --git a/checkpoints/checkpoint-8000/rng_state_2.pth b/checkpoints/checkpoint-8000/rng_state_2.pth deleted file mode 100644 index 00aef9053f81e7da4386c3746977735f10ec06c9..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-8000/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cd18781ccda8ced5e8f822443d08381a84f46d37b63c3de7993f25d8a7b0dac0 -size 15024 diff --git a/checkpoints/checkpoint-8000/rng_state_3.pth b/checkpoints/checkpoint-8000/rng_state_3.pth deleted file mode 100644 index 928e4beb3f21210417e2df6c6c1dcd68a708f0ee..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-8000/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2a1dc1515069dbf95fe772f9cfe866820d39bf12f126251b05b193492bf8026b -size 15024 diff --git a/checkpoints/checkpoint-8000/scheduler.pt b/checkpoints/checkpoint-8000/scheduler.pt deleted file mode 100644 index 3e3e70b0e2f0324728f6d70898373ef2914d119a..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-8000/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d390c33d5dd3d95d913d2faf47a8c88adcdfd442ecba2876f37ca1daeb1d2bf4 -size 1064 diff --git a/checkpoints/checkpoint-8000/trainer_state.json b/checkpoints/checkpoint-8000/trainer_state.json deleted file mode 100644 index 9cd92bbfdbf0eb2401ba4a0d81d37b675b3a870f..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-8000/trainer_state.json +++ /dev/null @@ -1,5621 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.81276033729554, - "eval_steps": 500, - "global_step": 8000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001015950421619425, - "grad_norm": 17.625, - "learning_rate": 5e-06, - "loss": 3.4264, - "step": 10 - }, - { - "epoch": 0.00203190084323885, - "grad_norm": 12.5625, - "learning_rate": 1e-05, - "loss": 3.432, - "step": 20 - }, - { - "epoch": 0.003047851264858275, - "grad_norm": 14.0625, - "learning_rate": 1.5e-05, - "loss": 3.23, - "step": 30 - }, - { - "epoch": 0.0040638016864777, - "grad_norm": 12.4375, - "learning_rate": 2e-05, - "loss": 2.9762, - "step": 40 - }, - { - "epoch": 0.005079752108097125, - "grad_norm": 10.0625, - "learning_rate": 2.5e-05, - "loss": 2.6173, - "step": 50 - }, - { - "epoch": 0.00609570252971655, - "grad_norm": 10.1875, - "learning_rate": 3e-05, - "loss": 2.2004, - "step": 60 - }, - { - "epoch": 0.007111652951335975, - "grad_norm": 7.03125, - "learning_rate": 3.5e-05, - "loss": 1.4176, - "step": 70 - }, - { - "epoch": 0.0081276033729554, - "grad_norm": 4.375, - "learning_rate": 4e-05, - "loss": 1.0122, - "step": 80 - }, - { - "epoch": 0.009143553794574825, - "grad_norm": 6.5625, - "learning_rate": 4.5e-05, - "loss": 0.9116, - "step": 90 - }, - { - "epoch": 0.01015950421619425, - "grad_norm": 5.28125, - "learning_rate": 5e-05, - "loss": 0.6832, - "step": 100 - }, - { - "epoch": 0.011175454637813675, - "grad_norm": 5.5, - "learning_rate": 4.9999870035728426e-05, - "loss": 0.7355, - "step": 110 - }, - { - "epoch": 0.0121914050594331, - "grad_norm": 5.1875, - "learning_rate": 4.9999480144264944e-05, - "loss": 0.6673, - "step": 120 - }, - { - "epoch": 0.013207355481052525, - "grad_norm": 4.5, - "learning_rate": 4.9998830329663314e-05, - "loss": 0.6792, - "step": 130 - }, - { - "epoch": 0.01422330590267195, - "grad_norm": 3.9375, - "learning_rate": 4.9997920598679756e-05, - "loss": 0.6207, - "step": 140 - }, - { - "epoch": 0.015239256324291375, - "grad_norm": 3.15625, - "learning_rate": 4.999675096077286e-05, - "loss": 0.483, - "step": 150 - }, - { - "epoch": 0.0162552067459108, - "grad_norm": 5.28125, - "learning_rate": 4.999532142810354e-05, - "loss": 0.5319, - "step": 160 - }, - { - "epoch": 0.017271157167530225, - "grad_norm": 4.59375, - "learning_rate": 4.999363201553483e-05, - "loss": 0.6052, - "step": 170 - }, - { - "epoch": 0.01828710758914965, - "grad_norm": 5.03125, - "learning_rate": 4.9991682740631794e-05, - "loss": 0.4258, - "step": 180 - }, - { - "epoch": 0.019303058010769075, - "grad_norm": 3.859375, - "learning_rate": 4.998947362366133e-05, - "loss": 0.4309, - "step": 190 - }, - { - "epoch": 0.0203190084323885, - "grad_norm": 3.328125, - "learning_rate": 4.998700468759193e-05, - "loss": 0.3957, - "step": 200 - }, - { - "epoch": 0.021334958854007924, - "grad_norm": 4.9375, - "learning_rate": 4.9984275958093475e-05, - "loss": 0.4777, - "step": 210 - }, - { - "epoch": 0.02235090927562735, - "grad_norm": 4.78125, - "learning_rate": 4.998128746353695e-05, - "loss": 0.3549, - "step": 220 - }, - { - "epoch": 0.023366859697246774, - "grad_norm": 4.0625, - "learning_rate": 4.997803923499417e-05, - "loss": 0.4447, - "step": 230 - }, - { - "epoch": 0.0243828101188662, - "grad_norm": 6.375, - "learning_rate": 4.99745313062374e-05, - "loss": 0.3808, - "step": 240 - }, - { - "epoch": 0.025398760540485624, - "grad_norm": 3.59375, - "learning_rate": 4.99707637137391e-05, - "loss": 0.3827, - "step": 250 - }, - { - "epoch": 0.02641471096210505, - "grad_norm": 3.015625, - "learning_rate": 4.996673649667145e-05, - "loss": 0.3694, - "step": 260 - }, - { - "epoch": 0.027430661383724474, - "grad_norm": 2.296875, - "learning_rate": 4.9962449696906e-05, - "loss": 0.3586, - "step": 270 - }, - { - "epoch": 0.0284466118053439, - "grad_norm": 4.125, - "learning_rate": 4.9957903359013214e-05, - "loss": 0.3832, - "step": 280 - }, - { - "epoch": 0.029462562226963324, - "grad_norm": 3.296875, - "learning_rate": 4.995309753026201e-05, - "loss": 0.328, - "step": 290 - }, - { - "epoch": 0.03047851264858275, - "grad_norm": 4.5, - "learning_rate": 4.994803226061927e-05, - "loss": 0.3667, - "step": 300 - }, - { - "epoch": 0.03149446307020217, - "grad_norm": 4.3125, - "learning_rate": 4.994270760274933e-05, - "loss": 0.3811, - "step": 310 - }, - { - "epoch": 0.0325104134918216, - "grad_norm": 3.421875, - "learning_rate": 4.99371236120134e-05, - "loss": 0.3065, - "step": 320 - }, - { - "epoch": 0.03352636391344102, - "grad_norm": 4.6875, - "learning_rate": 4.993128034646902e-05, - "loss": 0.4177, - "step": 330 - }, - { - "epoch": 0.03454231433506045, - "grad_norm": 3.046875, - "learning_rate": 4.992517786686947e-05, - "loss": 0.33, - "step": 340 - }, - { - "epoch": 0.03555826475667987, - "grad_norm": 1.8828125, - "learning_rate": 4.9918816236663077e-05, - "loss": 0.3287, - "step": 350 - }, - { - "epoch": 0.0365742151782993, - "grad_norm": 3.8125, - "learning_rate": 4.991219552199262e-05, - "loss": 0.2934, - "step": 360 - }, - { - "epoch": 0.03759016559991872, - "grad_norm": 4.28125, - "learning_rate": 4.99053157916946e-05, - "loss": 0.3176, - "step": 370 - }, - { - "epoch": 0.03860611602153815, - "grad_norm": 2.609375, - "learning_rate": 4.989817711729856e-05, - "loss": 0.3318, - "step": 380 - }, - { - "epoch": 0.03962206644315757, - "grad_norm": 2.375, - "learning_rate": 4.98907795730263e-05, - "loss": 0.3234, - "step": 390 - }, - { - "epoch": 0.040638016864777, - "grad_norm": 4.46875, - "learning_rate": 4.988312323579114e-05, - "loss": 0.267, - "step": 400 - }, - { - "epoch": 0.04165396728639642, - "grad_norm": 3.75, - "learning_rate": 4.98752081851971e-05, - "loss": 0.3081, - "step": 410 - }, - { - "epoch": 0.04266991770801585, - "grad_norm": 2.203125, - "learning_rate": 4.986703450353809e-05, - "loss": 0.2917, - "step": 420 - }, - { - "epoch": 0.04368586812963527, - "grad_norm": 1.6015625, - "learning_rate": 4.985860227579703e-05, - "loss": 0.2805, - "step": 430 - }, - { - "epoch": 0.0447018185512547, - "grad_norm": 3.140625, - "learning_rate": 4.984991158964499e-05, - "loss": 0.3534, - "step": 440 - }, - { - "epoch": 0.04571776897287412, - "grad_norm": 3.296875, - "learning_rate": 4.9840962535440265e-05, - "loss": 0.335, - "step": 450 - }, - { - "epoch": 0.04673371939449355, - "grad_norm": 3.25, - "learning_rate": 4.983175520622744e-05, - "loss": 0.2544, - "step": 460 - }, - { - "epoch": 0.04774966981611297, - "grad_norm": 2.25, - "learning_rate": 4.982228969773642e-05, - "loss": 0.3449, - "step": 470 - }, - { - "epoch": 0.0487656202377324, - "grad_norm": 4.9375, - "learning_rate": 4.9812566108381435e-05, - "loss": 0.2964, - "step": 480 - }, - { - "epoch": 0.04978157065935182, - "grad_norm": 1.5703125, - "learning_rate": 4.9802584539260035e-05, - "loss": 0.2799, - "step": 490 - }, - { - "epoch": 0.05079752108097125, - "grad_norm": 2.828125, - "learning_rate": 4.979234509415199e-05, - "loss": 0.3231, - "step": 500 - }, - { - "epoch": 0.05181347150259067, - "grad_norm": 2.9375, - "learning_rate": 4.978184787951828e-05, - "loss": 0.2943, - "step": 510 - }, - { - "epoch": 0.0528294219242101, - "grad_norm": 2.34375, - "learning_rate": 4.977109300449992e-05, - "loss": 0.2705, - "step": 520 - }, - { - "epoch": 0.05384537234582952, - "grad_norm": 3.140625, - "learning_rate": 4.9760080580916876e-05, - "loss": 0.2998, - "step": 530 - }, - { - "epoch": 0.05486132276744895, - "grad_norm": 3.5625, - "learning_rate": 4.974881072326688e-05, - "loss": 0.2595, - "step": 540 - }, - { - "epoch": 0.05587727318906837, - "grad_norm": 4.25, - "learning_rate": 4.9737283548724236e-05, - "loss": 0.2803, - "step": 550 - }, - { - "epoch": 0.0568932236106878, - "grad_norm": 4.0625, - "learning_rate": 4.97254991771386e-05, - "loss": 0.3511, - "step": 560 - }, - { - "epoch": 0.05790917403230722, - "grad_norm": 2.515625, - "learning_rate": 4.971345773103377e-05, - "loss": 0.312, - "step": 570 - }, - { - "epoch": 0.05892512445392665, - "grad_norm": 3.21875, - "learning_rate": 4.9701159335606365e-05, - "loss": 0.2482, - "step": 580 - }, - { - "epoch": 0.05994107487554607, - "grad_norm": 5.5, - "learning_rate": 4.968860411872454e-05, - "loss": 0.2537, - "step": 590 - }, - { - "epoch": 0.0609570252971655, - "grad_norm": 3.546875, - "learning_rate": 4.967579221092666e-05, - "loss": 0.3125, - "step": 600 - }, - { - "epoch": 0.06197297571878492, - "grad_norm": 2.984375, - "learning_rate": 4.966272374541996e-05, - "loss": 0.2354, - "step": 610 - }, - { - "epoch": 0.06298892614040434, - "grad_norm": 3.6875, - "learning_rate": 4.964939885807912e-05, - "loss": 0.3213, - "step": 620 - }, - { - "epoch": 0.06400487656202378, - "grad_norm": 2.140625, - "learning_rate": 4.9635817687444876e-05, - "loss": 0.3003, - "step": 630 - }, - { - "epoch": 0.0650208269836432, - "grad_norm": 3.484375, - "learning_rate": 4.962198037472259e-05, - "loss": 0.2996, - "step": 640 - }, - { - "epoch": 0.06603677740526262, - "grad_norm": 3.21875, - "learning_rate": 4.9607887063780776e-05, - "loss": 0.2257, - "step": 650 - }, - { - "epoch": 0.06705272782688204, - "grad_norm": 5.375, - "learning_rate": 4.9593537901149564e-05, - "loss": 0.223, - "step": 660 - }, - { - "epoch": 0.06806867824850148, - "grad_norm": 4.1875, - "learning_rate": 4.957893303601924e-05, - "loss": 0.3407, - "step": 670 - }, - { - "epoch": 0.0690846286701209, - "grad_norm": 3.328125, - "learning_rate": 4.956407262023866e-05, - "loss": 0.2589, - "step": 680 - }, - { - "epoch": 0.07010057909174032, - "grad_norm": 2.953125, - "learning_rate": 4.954895680831367e-05, - "loss": 0.2949, - "step": 690 - }, - { - "epoch": 0.07111652951335974, - "grad_norm": 4.0625, - "learning_rate": 4.9533585757405506e-05, - "loss": 0.2995, - "step": 700 - }, - { - "epoch": 0.07213247993497918, - "grad_norm": 4.625, - "learning_rate": 4.951795962732917e-05, - "loss": 0.2894, - "step": 710 - }, - { - "epoch": 0.0731484303565986, - "grad_norm": 3.0, - "learning_rate": 4.9502078580551755e-05, - "loss": 0.3082, - "step": 720 - }, - { - "epoch": 0.07416438077821802, - "grad_norm": 3.65625, - "learning_rate": 4.9485942782190734e-05, - "loss": 0.2308, - "step": 730 - }, - { - "epoch": 0.07518033119983744, - "grad_norm": 4.78125, - "learning_rate": 4.9469552400012306e-05, - "loss": 0.2272, - "step": 740 - }, - { - "epoch": 0.07619628162145688, - "grad_norm": 4.25, - "learning_rate": 4.94529076044296e-05, - "loss": 0.2701, - "step": 750 - }, - { - "epoch": 0.0772122320430763, - "grad_norm": 3.140625, - "learning_rate": 4.94360085685009e-05, - "loss": 0.2686, - "step": 760 - }, - { - "epoch": 0.07822818246469572, - "grad_norm": 0.765625, - "learning_rate": 4.9418855467927894e-05, - "loss": 0.2051, - "step": 770 - }, - { - "epoch": 0.07924413288631514, - "grad_norm": 1.796875, - "learning_rate": 4.940144848105379e-05, - "loss": 0.2267, - "step": 780 - }, - { - "epoch": 0.08026008330793458, - "grad_norm": 4.5625, - "learning_rate": 4.93837877888615e-05, - "loss": 0.2597, - "step": 790 - }, - { - "epoch": 0.081276033729554, - "grad_norm": 3.03125, - "learning_rate": 4.9365873574971745e-05, - "loss": 0.3701, - "step": 800 - }, - { - "epoch": 0.08229198415117342, - "grad_norm": 4.5625, - "learning_rate": 4.9347706025641136e-05, - "loss": 0.2559, - "step": 810 - }, - { - "epoch": 0.08330793457279284, - "grad_norm": 3.90625, - "learning_rate": 4.9329285329760275e-05, - "loss": 0.2799, - "step": 820 - }, - { - "epoch": 0.08432388499441228, - "grad_norm": 3.140625, - "learning_rate": 4.9310611678851735e-05, - "loss": 0.2866, - "step": 830 - }, - { - "epoch": 0.0853398354160317, - "grad_norm": 2.46875, - "learning_rate": 4.929168526706811e-05, - "loss": 0.3105, - "step": 840 - }, - { - "epoch": 0.08635578583765112, - "grad_norm": 13.625, - "learning_rate": 4.927250629119e-05, - "loss": 0.2454, - "step": 850 - }, - { - "epoch": 0.08737173625927054, - "grad_norm": 3.921875, - "learning_rate": 4.9253074950623925e-05, - "loss": 0.2424, - "step": 860 - }, - { - "epoch": 0.08838768668088998, - "grad_norm": 2.90625, - "learning_rate": 4.9233391447400286e-05, - "loss": 0.2481, - "step": 870 - }, - { - "epoch": 0.0894036371025094, - "grad_norm": 2.96875, - "learning_rate": 4.921345598617125e-05, - "loss": 0.2231, - "step": 880 - }, - { - "epoch": 0.09041958752412882, - "grad_norm": 5.375, - "learning_rate": 4.9193268774208654e-05, - "loss": 0.3447, - "step": 890 - }, - { - "epoch": 0.09143553794574824, - "grad_norm": 2.0, - "learning_rate": 4.9172830021401785e-05, - "loss": 0.229, - "step": 900 - }, - { - "epoch": 0.09245148836736768, - "grad_norm": 3.1875, - "learning_rate": 4.9152139940255245e-05, - "loss": 0.2122, - "step": 910 - }, - { - "epoch": 0.0934674387889871, - "grad_norm": 3.40625, - "learning_rate": 4.913119874588677e-05, - "loss": 0.2386, - "step": 920 - }, - { - "epoch": 0.09448338921060652, - "grad_norm": 1.4609375, - "learning_rate": 4.911000665602489e-05, - "loss": 0.1944, - "step": 930 - }, - { - "epoch": 0.09549933963222594, - "grad_norm": 5.0625, - "learning_rate": 4.9088563891006786e-05, - "loss": 0.2038, - "step": 940 - }, - { - "epoch": 0.09651529005384538, - "grad_norm": 4.53125, - "learning_rate": 4.906687067377592e-05, - "loss": 0.3122, - "step": 950 - }, - { - "epoch": 0.0975312404754648, - "grad_norm": 2.84375, - "learning_rate": 4.904492722987976e-05, - "loss": 0.3157, - "step": 960 - }, - { - "epoch": 0.09854719089708422, - "grad_norm": 2.171875, - "learning_rate": 4.902273378746738e-05, - "loss": 0.3077, - "step": 970 - }, - { - "epoch": 0.09956314131870364, - "grad_norm": 2.84375, - "learning_rate": 4.9000290577287165e-05, - "loss": 0.2756, - "step": 980 - }, - { - "epoch": 0.10057909174032308, - "grad_norm": 0.99609375, - "learning_rate": 4.897759783268434e-05, - "loss": 0.2915, - "step": 990 - }, - { - "epoch": 0.1015950421619425, - "grad_norm": 3.53125, - "learning_rate": 4.895465578959859e-05, - "loss": 0.2052, - "step": 1000 - }, - { - "epoch": 0.10261099258356192, - "grad_norm": 4.0, - "learning_rate": 4.893146468656159e-05, - "loss": 0.2499, - "step": 1010 - }, - { - "epoch": 0.10362694300518134, - "grad_norm": 1.65625, - "learning_rate": 4.890802476469452e-05, - "loss": 0.278, - "step": 1020 - }, - { - "epoch": 0.10464289342680078, - "grad_norm": 3.625, - "learning_rate": 4.888433626770558e-05, - "loss": 0.2143, - "step": 1030 - }, - { - "epoch": 0.1056588438484202, - "grad_norm": 5.0625, - "learning_rate": 4.886039944188741e-05, - "loss": 0.2878, - "step": 1040 - }, - { - "epoch": 0.10667479427003962, - "grad_norm": 4.5, - "learning_rate": 4.883621453611461e-05, - "loss": 0.2744, - "step": 1050 - }, - { - "epoch": 0.10769074469165904, - "grad_norm": 4.5625, - "learning_rate": 4.881178180184106e-05, - "loss": 0.2734, - "step": 1060 - }, - { - "epoch": 0.10870669511327848, - "grad_norm": 3.125, - "learning_rate": 4.878710149309735e-05, - "loss": 0.3574, - "step": 1070 - }, - { - "epoch": 0.1097226455348979, - "grad_norm": 3.0625, - "learning_rate": 4.876217386648816e-05, - "loss": 0.2625, - "step": 1080 - }, - { - "epoch": 0.11073859595651732, - "grad_norm": 4.0625, - "learning_rate": 4.873699918118955e-05, - "loss": 0.2437, - "step": 1090 - }, - { - "epoch": 0.11175454637813674, - "grad_norm": 1.59375, - "learning_rate": 4.87115776989463e-05, - "loss": 0.2051, - "step": 1100 - }, - { - "epoch": 0.11277049679975618, - "grad_norm": 4.375, - "learning_rate": 4.8685909684069153e-05, - "loss": 0.1727, - "step": 1110 - }, - { - "epoch": 0.1137864472213756, - "grad_norm": 2.28125, - "learning_rate": 4.865999540343211e-05, - "loss": 0.2256, - "step": 1120 - }, - { - "epoch": 0.11480239764299502, - "grad_norm": 2.265625, - "learning_rate": 4.86338351264696e-05, - "loss": 0.3529, - "step": 1130 - }, - { - "epoch": 0.11581834806461444, - "grad_norm": 2.34375, - "learning_rate": 4.8607429125173754e-05, - "loss": 0.2113, - "step": 1140 - }, - { - "epoch": 0.11683429848623388, - "grad_norm": 0.7578125, - "learning_rate": 4.858077767409149e-05, - "loss": 0.2759, - "step": 1150 - }, - { - "epoch": 0.1178502489078533, - "grad_norm": 3.640625, - "learning_rate": 4.855388105032174e-05, - "loss": 0.2482, - "step": 1160 - }, - { - "epoch": 0.11886619932947272, - "grad_norm": 3.5, - "learning_rate": 4.852673953351249e-05, - "loss": 0.1865, - "step": 1170 - }, - { - "epoch": 0.11988214975109214, - "grad_norm": 3.75, - "learning_rate": 4.849935340585796e-05, - "loss": 0.2659, - "step": 1180 - }, - { - "epoch": 0.12089810017271158, - "grad_norm": 3.375, - "learning_rate": 4.8471722952095586e-05, - "loss": 0.1506, - "step": 1190 - }, - { - "epoch": 0.121914050594331, - "grad_norm": 3.34375, - "learning_rate": 4.844384845950312e-05, - "loss": 0.307, - "step": 1200 - }, - { - "epoch": 0.12293000101595042, - "grad_norm": 1.578125, - "learning_rate": 4.841573021789561e-05, - "loss": 0.1952, - "step": 1210 - }, - { - "epoch": 0.12394595143756984, - "grad_norm": 1.2890625, - "learning_rate": 4.838736851962239e-05, - "loss": 0.1779, - "step": 1220 - }, - { - "epoch": 0.12496190185918928, - "grad_norm": 1.265625, - "learning_rate": 4.835876365956408e-05, - "loss": 0.1235, - "step": 1230 - }, - { - "epoch": 0.12597785228080868, - "grad_norm": 1.9609375, - "learning_rate": 4.8329915935129436e-05, - "loss": 0.1876, - "step": 1240 - }, - { - "epoch": 0.12699380270242813, - "grad_norm": 1.6328125, - "learning_rate": 4.830082564625235e-05, - "loss": 0.2188, - "step": 1250 - }, - { - "epoch": 0.12800975312404755, - "grad_norm": 3.96875, - "learning_rate": 4.8271493095388684e-05, - "loss": 0.2622, - "step": 1260 - }, - { - "epoch": 0.12902570354566698, - "grad_norm": 3.765625, - "learning_rate": 4.824191858751312e-05, - "loss": 0.2724, - "step": 1270 - }, - { - "epoch": 0.1300416539672864, - "grad_norm": 5.59375, - "learning_rate": 4.821210243011601e-05, - "loss": 0.2413, - "step": 1280 - }, - { - "epoch": 0.13105760438890582, - "grad_norm": 3.34375, - "learning_rate": 4.818204493320016e-05, - "loss": 0.2618, - "step": 1290 - }, - { - "epoch": 0.13207355481052524, - "grad_norm": 2.78125, - "learning_rate": 4.8151746409277634e-05, - "loss": 0.2295, - "step": 1300 - }, - { - "epoch": 0.13308950523214466, - "grad_norm": 3.1875, - "learning_rate": 4.8121207173366484e-05, - "loss": 0.2733, - "step": 1310 - }, - { - "epoch": 0.13410545565376408, - "grad_norm": 2.28125, - "learning_rate": 4.809042754298746e-05, - "loss": 0.2311, - "step": 1320 - }, - { - "epoch": 0.13512140607538353, - "grad_norm": 2.171875, - "learning_rate": 4.805940783816075e-05, - "loss": 0.2059, - "step": 1330 - }, - { - "epoch": 0.13613735649700295, - "grad_norm": 2.796875, - "learning_rate": 4.8028148381402625e-05, - "loss": 0.2102, - "step": 1340 - }, - { - "epoch": 0.13715330691862238, - "grad_norm": 2.96875, - "learning_rate": 4.7996649497722084e-05, - "loss": 0.2708, - "step": 1350 - }, - { - "epoch": 0.1381692573402418, - "grad_norm": 2.4375, - "learning_rate": 4.7964911514617485e-05, - "loss": 0.2429, - "step": 1360 - }, - { - "epoch": 0.13918520776186122, - "grad_norm": 5.8125, - "learning_rate": 4.793293476207312e-05, - "loss": 0.2725, - "step": 1370 - }, - { - "epoch": 0.14020115818348064, - "grad_norm": 2.40625, - "learning_rate": 4.790071957255585e-05, - "loss": 0.2098, - "step": 1380 - }, - { - "epoch": 0.14121710860510006, - "grad_norm": 4.25, - "learning_rate": 4.786826628101154e-05, - "loss": 0.2101, - "step": 1390 - }, - { - "epoch": 0.14223305902671948, - "grad_norm": 2.578125, - "learning_rate": 4.783557522486167e-05, - "loss": 0.2624, - "step": 1400 - }, - { - "epoch": 0.14324900944833893, - "grad_norm": 3.125, - "learning_rate": 4.780264674399978e-05, - "loss": 0.2518, - "step": 1410 - }, - { - "epoch": 0.14426495986995835, - "grad_norm": 3.671875, - "learning_rate": 4.7769481180787966e-05, - "loss": 0.3112, - "step": 1420 - }, - { - "epoch": 0.14528091029157778, - "grad_norm": 3.984375, - "learning_rate": 4.773607888005327e-05, - "loss": 0.2747, - "step": 1430 - }, - { - "epoch": 0.1462968607131972, - "grad_norm": 3.234375, - "learning_rate": 4.770244018908416e-05, - "loss": 0.1572, - "step": 1440 - }, - { - "epoch": 0.14731281113481662, - "grad_norm": 4.09375, - "learning_rate": 4.766856545762687e-05, - "loss": 0.2148, - "step": 1450 - }, - { - "epoch": 0.14832876155643604, - "grad_norm": 1.6875, - "learning_rate": 4.763445503788178e-05, - "loss": 0.2531, - "step": 1460 - }, - { - "epoch": 0.14934471197805546, - "grad_norm": 2.375, - "learning_rate": 4.760010928449976e-05, - "loss": 0.199, - "step": 1470 - }, - { - "epoch": 0.15036066239967488, - "grad_norm": 4.6875, - "learning_rate": 4.7565528554578485e-05, - "loss": 0.2366, - "step": 1480 - }, - { - "epoch": 0.15137661282129433, - "grad_norm": 5.4375, - "learning_rate": 4.75307132076587e-05, - "loss": 0.1862, - "step": 1490 - }, - { - "epoch": 0.15239256324291375, - "grad_norm": 2.484375, - "learning_rate": 4.749566360572049e-05, - "loss": 0.2143, - "step": 1500 - }, - { - "epoch": 0.15340851366453317, - "grad_norm": 2.1875, - "learning_rate": 4.746038011317955e-05, - "loss": 0.1877, - "step": 1510 - }, - { - "epoch": 0.1544244640861526, - "grad_norm": 2.84375, - "learning_rate": 4.742486309688333e-05, - "loss": 0.2831, - "step": 1520 - }, - { - "epoch": 0.15544041450777202, - "grad_norm": 2.015625, - "learning_rate": 4.738911292610732e-05, - "loss": 0.1708, - "step": 1530 - }, - { - "epoch": 0.15645636492939144, - "grad_norm": 3.953125, - "learning_rate": 4.735312997255107e-05, - "loss": 0.192, - "step": 1540 - }, - { - "epoch": 0.15747231535101086, - "grad_norm": 2.09375, - "learning_rate": 4.7316914610334475e-05, - "loss": 0.2586, - "step": 1550 - }, - { - "epoch": 0.15848826577263028, - "grad_norm": 3.6875, - "learning_rate": 4.728046721599378e-05, - "loss": 0.2141, - "step": 1560 - }, - { - "epoch": 0.15950421619424973, - "grad_norm": 2.9375, - "learning_rate": 4.724378816847771e-05, - "loss": 0.193, - "step": 1570 - }, - { - "epoch": 0.16052016661586915, - "grad_norm": 1.5625, - "learning_rate": 4.720687784914352e-05, - "loss": 0.191, - "step": 1580 - }, - { - "epoch": 0.16153611703748857, - "grad_norm": 3.75, - "learning_rate": 4.716973664175304e-05, - "loss": 0.2172, - "step": 1590 - }, - { - "epoch": 0.162552067459108, - "grad_norm": 3.125, - "learning_rate": 4.7132364932468645e-05, - "loss": 0.2134, - "step": 1600 - }, - { - "epoch": 0.16356801788072742, - "grad_norm": 4.09375, - "learning_rate": 4.709476310984932e-05, - "loss": 0.2055, - "step": 1610 - }, - { - "epoch": 0.16458396830234684, - "grad_norm": 3.875, - "learning_rate": 4.705693156484652e-05, - "loss": 0.2136, - "step": 1620 - }, - { - "epoch": 0.16559991872396626, - "grad_norm": 1.1796875, - "learning_rate": 4.7018870690800196e-05, - "loss": 0.1471, - "step": 1630 - }, - { - "epoch": 0.16661586914558568, - "grad_norm": 2.5, - "learning_rate": 4.698058088343465e-05, - "loss": 0.2308, - "step": 1640 - }, - { - "epoch": 0.16763181956720513, - "grad_norm": 1.390625, - "learning_rate": 4.6942062540854425e-05, - "loss": 0.2456, - "step": 1650 - }, - { - "epoch": 0.16864776998882455, - "grad_norm": 3.125, - "learning_rate": 4.69033160635402e-05, - "loss": 0.2654, - "step": 1660 - }, - { - "epoch": 0.16966372041044397, - "grad_norm": 3.984375, - "learning_rate": 4.6864341854344587e-05, - "loss": 0.2226, - "step": 1670 - }, - { - "epoch": 0.1706796708320634, - "grad_norm": 2.328125, - "learning_rate": 4.682514031848795e-05, - "loss": 0.2438, - "step": 1680 - }, - { - "epoch": 0.17169562125368282, - "grad_norm": 3.078125, - "learning_rate": 4.678571186355423e-05, - "loss": 0.1889, - "step": 1690 - }, - { - "epoch": 0.17271157167530224, - "grad_norm": 3.328125, - "learning_rate": 4.6746056899486644e-05, - "loss": 0.2117, - "step": 1700 - }, - { - "epoch": 0.17372752209692166, - "grad_norm": 2.78125, - "learning_rate": 4.67061758385835e-05, - "loss": 0.1953, - "step": 1710 - }, - { - "epoch": 0.17474347251854108, - "grad_norm": 3.09375, - "learning_rate": 4.6666069095493816e-05, - "loss": 0.1844, - "step": 1720 - }, - { - "epoch": 0.17575942294016053, - "grad_norm": 3.234375, - "learning_rate": 4.662573708721309e-05, - "loss": 0.2774, - "step": 1730 - }, - { - "epoch": 0.17677537336177995, - "grad_norm": 4.03125, - "learning_rate": 4.658518023307894e-05, - "loss": 0.2527, - "step": 1740 - }, - { - "epoch": 0.17779132378339937, - "grad_norm": 3.21875, - "learning_rate": 4.654439895476671e-05, - "loss": 0.2164, - "step": 1750 - }, - { - "epoch": 0.1788072742050188, - "grad_norm": 2.390625, - "learning_rate": 4.6503393676285146e-05, - "loss": 0.2424, - "step": 1760 - }, - { - "epoch": 0.17982322462663822, - "grad_norm": 1.8359375, - "learning_rate": 4.646216482397192e-05, - "loss": 0.2428, - "step": 1770 - }, - { - "epoch": 0.18083917504825764, - "grad_norm": 2.796875, - "learning_rate": 4.6420712826489275e-05, - "loss": 0.2155, - "step": 1780 - }, - { - "epoch": 0.18185512546987706, - "grad_norm": 0.69921875, - "learning_rate": 4.6379038114819485e-05, - "loss": 0.1544, - "step": 1790 - }, - { - "epoch": 0.18287107589149648, - "grad_norm": 3.40625, - "learning_rate": 4.6337141122260444e-05, - "loss": 0.2029, - "step": 1800 - }, - { - "epoch": 0.18388702631311593, - "grad_norm": 2.359375, - "learning_rate": 4.629502228442112e-05, - "loss": 0.1489, - "step": 1810 - }, - { - "epoch": 0.18490297673473535, - "grad_norm": 1.4453125, - "learning_rate": 4.6252682039217045e-05, - "loss": 0.2101, - "step": 1820 - }, - { - "epoch": 0.18591892715635477, - "grad_norm": 2.71875, - "learning_rate": 4.621012082686573e-05, - "loss": 0.2076, - "step": 1830 - }, - { - "epoch": 0.1869348775779742, - "grad_norm": 3.0625, - "learning_rate": 4.616733908988216e-05, - "loss": 0.2719, - "step": 1840 - }, - { - "epoch": 0.18795082799959362, - "grad_norm": 1.953125, - "learning_rate": 4.612433727307409e-05, - "loss": 0.2105, - "step": 1850 - }, - { - "epoch": 0.18896677842121304, - "grad_norm": 3.46875, - "learning_rate": 4.608111582353751e-05, - "loss": 0.1877, - "step": 1860 - }, - { - "epoch": 0.18998272884283246, - "grad_norm": 2.546875, - "learning_rate": 4.603767519065197e-05, - "loss": 0.2238, - "step": 1870 - }, - { - "epoch": 0.19099867926445188, - "grad_norm": 1.5703125, - "learning_rate": 4.599401582607589e-05, - "loss": 0.243, - "step": 1880 - }, - { - "epoch": 0.19201462968607133, - "grad_norm": 2.4375, - "learning_rate": 4.595013818374185e-05, - "loss": 0.1867, - "step": 1890 - }, - { - "epoch": 0.19303058010769075, - "grad_norm": 2.203125, - "learning_rate": 4.5906042719851925e-05, - "loss": 0.1994, - "step": 1900 - }, - { - "epoch": 0.19404653052931017, - "grad_norm": 3.984375, - "learning_rate": 4.586172989287291e-05, - "loss": 0.1899, - "step": 1910 - }, - { - "epoch": 0.1950624809509296, - "grad_norm": 2.6875, - "learning_rate": 4.5817200163531534e-05, - "loss": 0.2528, - "step": 1920 - }, - { - "epoch": 0.19607843137254902, - "grad_norm": 2.71875, - "learning_rate": 4.577245399480972e-05, - "loss": 0.2336, - "step": 1930 - }, - { - "epoch": 0.19709438179416844, - "grad_norm": 2.640625, - "learning_rate": 4.5727491851939715e-05, - "loss": 0.2204, - "step": 1940 - }, - { - "epoch": 0.19811033221578786, - "grad_norm": 1.78125, - "learning_rate": 4.568231420239929e-05, - "loss": 0.1656, - "step": 1950 - }, - { - "epoch": 0.19912628263740728, - "grad_norm": 3.15625, - "learning_rate": 4.563692151590687e-05, - "loss": 0.2105, - "step": 1960 - }, - { - "epoch": 0.20014223305902673, - "grad_norm": 1.3671875, - "learning_rate": 4.5591314264416666e-05, - "loss": 0.1464, - "step": 1970 - }, - { - "epoch": 0.20115818348064615, - "grad_norm": 4.25, - "learning_rate": 4.554549292211371e-05, - "loss": 0.2103, - "step": 1980 - }, - { - "epoch": 0.20217413390226557, - "grad_norm": 2.984375, - "learning_rate": 4.549945796540901e-05, - "loss": 0.144, - "step": 1990 - }, - { - "epoch": 0.203190084323885, - "grad_norm": 1.859375, - "learning_rate": 4.545320987293453e-05, - "loss": 0.1963, - "step": 2000 - }, - { - "epoch": 0.20420603474550442, - "grad_norm": 1.078125, - "learning_rate": 4.540674912553824e-05, - "loss": 0.2115, - "step": 2010 - }, - { - "epoch": 0.20522198516712384, - "grad_norm": 4.25, - "learning_rate": 4.536007620627911e-05, - "loss": 0.1682, - "step": 2020 - }, - { - "epoch": 0.20623793558874326, - "grad_norm": 2.71875, - "learning_rate": 4.531319160042212e-05, - "loss": 0.1992, - "step": 2030 - }, - { - "epoch": 0.20725388601036268, - "grad_norm": 1.2890625, - "learning_rate": 4.5266095795433126e-05, - "loss": 0.1134, - "step": 2040 - }, - { - "epoch": 0.20826983643198213, - "grad_norm": 3.296875, - "learning_rate": 4.5218789280973925e-05, - "loss": 0.1474, - "step": 2050 - }, - { - "epoch": 0.20928578685360155, - "grad_norm": 1.9375, - "learning_rate": 4.5171272548897024e-05, - "loss": 0.1955, - "step": 2060 - }, - { - "epoch": 0.21030173727522097, - "grad_norm": 2.734375, - "learning_rate": 4.512354609324063e-05, - "loss": 0.2042, - "step": 2070 - }, - { - "epoch": 0.2113176876968404, - "grad_norm": 2.921875, - "learning_rate": 4.507561041022347e-05, - "loss": 0.2174, - "step": 2080 - }, - { - "epoch": 0.21233363811845982, - "grad_norm": 2.40625, - "learning_rate": 4.502746599823963e-05, - "loss": 0.2634, - "step": 2090 - }, - { - "epoch": 0.21334958854007924, - "grad_norm": 1.71875, - "learning_rate": 4.497911335785339e-05, - "loss": 0.1884, - "step": 2100 - }, - { - "epoch": 0.21436553896169866, - "grad_norm": 0.79296875, - "learning_rate": 4.4930552991794e-05, - "loss": 0.1872, - "step": 2110 - }, - { - "epoch": 0.21538148938331808, - "grad_norm": 3.171875, - "learning_rate": 4.4881785404950474e-05, - "loss": 0.2233, - "step": 2120 - }, - { - "epoch": 0.21639743980493753, - "grad_norm": 2.59375, - "learning_rate": 4.483281110436631e-05, - "loss": 0.2374, - "step": 2130 - }, - { - "epoch": 0.21741339022655695, - "grad_norm": 3.328125, - "learning_rate": 4.478363059923426e-05, - "loss": 0.2545, - "step": 2140 - }, - { - "epoch": 0.21842934064817637, - "grad_norm": 2.3125, - "learning_rate": 4.4734244400891014e-05, - "loss": 0.2063, - "step": 2150 - }, - { - "epoch": 0.2194452910697958, - "grad_norm": 3.40625, - "learning_rate": 4.4684653022811865e-05, - "loss": 0.1219, - "step": 2160 - }, - { - "epoch": 0.22046124149141522, - "grad_norm": 4.1875, - "learning_rate": 4.463485698060541e-05, - "loss": 0.2805, - "step": 2170 - }, - { - "epoch": 0.22147719191303464, - "grad_norm": 2.3125, - "learning_rate": 4.458485679200814e-05, - "loss": 0.1998, - "step": 2180 - }, - { - "epoch": 0.22249314233465406, - "grad_norm": 3.578125, - "learning_rate": 4.453465297687912e-05, - "loss": 0.2489, - "step": 2190 - }, - { - "epoch": 0.22350909275627348, - "grad_norm": 2.59375, - "learning_rate": 4.448424605719452e-05, - "loss": 0.2731, - "step": 2200 - }, - { - "epoch": 0.22452504317789293, - "grad_norm": 3.28125, - "learning_rate": 4.443363655704224e-05, - "loss": 0.2425, - "step": 2210 - }, - { - "epoch": 0.22554099359951235, - "grad_norm": 2.78125, - "learning_rate": 4.438282500261641e-05, - "loss": 0.2938, - "step": 2220 - }, - { - "epoch": 0.22655694402113177, - "grad_norm": 1.1953125, - "learning_rate": 4.433181192221197e-05, - "loss": 0.1728, - "step": 2230 - }, - { - "epoch": 0.2275728944427512, - "grad_norm": 1.34375, - "learning_rate": 4.4280597846219155e-05, - "loss": 0.216, - "step": 2240 - }, - { - "epoch": 0.22858884486437062, - "grad_norm": 1.8515625, - "learning_rate": 4.422918330711796e-05, - "loss": 0.1612, - "step": 2250 - }, - { - "epoch": 0.22960479528599004, - "grad_norm": 1.90625, - "learning_rate": 4.417756883947263e-05, - "loss": 0.107, - "step": 2260 - }, - { - "epoch": 0.23062074570760946, - "grad_norm": 3.375, - "learning_rate": 4.412575497992611e-05, - "loss": 0.1756, - "step": 2270 - }, - { - "epoch": 0.23163669612922888, - "grad_norm": 4.375, - "learning_rate": 4.407374226719445e-05, - "loss": 0.234, - "step": 2280 - }, - { - "epoch": 0.23265264655084833, - "grad_norm": 3.25, - "learning_rate": 4.402153124206119e-05, - "loss": 0.2144, - "step": 2290 - }, - { - "epoch": 0.23366859697246775, - "grad_norm": 1.703125, - "learning_rate": 4.396912244737173e-05, - "loss": 0.1696, - "step": 2300 - }, - { - "epoch": 0.23468454739408717, - "grad_norm": 2.84375, - "learning_rate": 4.391651642802778e-05, - "loss": 0.2506, - "step": 2310 - }, - { - "epoch": 0.2357004978157066, - "grad_norm": 4.5, - "learning_rate": 4.386371373098155e-05, - "loss": 0.1686, - "step": 2320 - }, - { - "epoch": 0.23671644823732602, - "grad_norm": 2.515625, - "learning_rate": 4.381071490523018e-05, - "loss": 0.2403, - "step": 2330 - }, - { - "epoch": 0.23773239865894544, - "grad_norm": 4.4375, - "learning_rate": 4.3757520501809955e-05, - "loss": 0.1611, - "step": 2340 - }, - { - "epoch": 0.23874834908056486, - "grad_norm": 1.609375, - "learning_rate": 4.370413107379065e-05, - "loss": 0.1698, - "step": 2350 - }, - { - "epoch": 0.23976429950218428, - "grad_norm": 4.96875, - "learning_rate": 4.36505471762697e-05, - "loss": 0.1928, - "step": 2360 - }, - { - "epoch": 0.24078024992380373, - "grad_norm": 0.8984375, - "learning_rate": 4.3596769366366474e-05, - "loss": 0.2035, - "step": 2370 - }, - { - "epoch": 0.24179620034542315, - "grad_norm": 5.75, - "learning_rate": 4.354279820321649e-05, - "loss": 0.16, - "step": 2380 - }, - { - "epoch": 0.24281215076704257, - "grad_norm": 1.9453125, - "learning_rate": 4.34886342479656e-05, - "loss": 0.1851, - "step": 2390 - }, - { - "epoch": 0.243828101188662, - "grad_norm": 1.015625, - "learning_rate": 4.34342780637641e-05, - "loss": 0.1726, - "step": 2400 - }, - { - "epoch": 0.24484405161028142, - "grad_norm": 4.59375, - "learning_rate": 4.337973021576095e-05, - "loss": 0.2847, - "step": 2410 - }, - { - "epoch": 0.24586000203190084, - "grad_norm": 1.03125, - "learning_rate": 4.3324991271097846e-05, - "loss": 0.2528, - "step": 2420 - }, - { - "epoch": 0.24687595245352026, - "grad_norm": 2.1875, - "learning_rate": 4.3270061798903374e-05, - "loss": 0.1573, - "step": 2430 - }, - { - "epoch": 0.24789190287513968, - "grad_norm": 0.98046875, - "learning_rate": 4.321494237028701e-05, - "loss": 0.1703, - "step": 2440 - }, - { - "epoch": 0.24890785329675913, - "grad_norm": 3.8125, - "learning_rate": 4.31596335583333e-05, - "loss": 0.2613, - "step": 2450 - }, - { - "epoch": 0.24992380371837855, - "grad_norm": 4.0625, - "learning_rate": 4.310413593809579e-05, - "loss": 0.22, - "step": 2460 - }, - { - "epoch": 0.250939754139998, - "grad_norm": 3.15625, - "learning_rate": 4.304845008659108e-05, - "loss": 0.1263, - "step": 2470 - }, - { - "epoch": 0.25195570456161737, - "grad_norm": 3.046875, - "learning_rate": 4.2992576582792895e-05, - "loss": 0.1639, - "step": 2480 - }, - { - "epoch": 0.2529716549832368, - "grad_norm": 9.8125, - "learning_rate": 4.293651600762595e-05, - "loss": 0.2681, - "step": 2490 - }, - { - "epoch": 0.25398760540485626, - "grad_norm": 3.734375, - "learning_rate": 4.288026894395999e-05, - "loss": 0.2292, - "step": 2500 - }, - { - "epoch": 0.25500355582647566, - "grad_norm": 0.455078125, - "learning_rate": 4.2823835976603723e-05, - "loss": 0.2324, - "step": 2510 - }, - { - "epoch": 0.2560195062480951, - "grad_norm": 5.625, - "learning_rate": 4.276721769229869e-05, - "loss": 0.1834, - "step": 2520 - }, - { - "epoch": 0.2570354566697145, - "grad_norm": 1.3671875, - "learning_rate": 4.271041467971323e-05, - "loss": 0.1826, - "step": 2530 - }, - { - "epoch": 0.25805140709133395, - "grad_norm": 5.0625, - "learning_rate": 4.265342752943632e-05, - "loss": 0.2463, - "step": 2540 - }, - { - "epoch": 0.25906735751295334, - "grad_norm": 2.859375, - "learning_rate": 4.2596256833971425e-05, - "loss": 0.2598, - "step": 2550 - }, - { - "epoch": 0.2600833079345728, - "grad_norm": 1.8515625, - "learning_rate": 4.2538903187730374e-05, - "loss": 0.1148, - "step": 2560 - }, - { - "epoch": 0.26109925835619224, - "grad_norm": 2.71875, - "learning_rate": 4.248136718702716e-05, - "loss": 0.2123, - "step": 2570 - }, - { - "epoch": 0.26211520877781164, - "grad_norm": 4.5625, - "learning_rate": 4.242364943007172e-05, - "loss": 0.2369, - "step": 2580 - }, - { - "epoch": 0.2631311591994311, - "grad_norm": 2.296875, - "learning_rate": 4.236575051696377e-05, - "loss": 0.261, - "step": 2590 - }, - { - "epoch": 0.2641471096210505, - "grad_norm": 2.75, - "learning_rate": 4.2307671049686514e-05, - "loss": 0.1564, - "step": 2600 - }, - { - "epoch": 0.26516306004266993, - "grad_norm": 3.5, - "learning_rate": 4.2249411632100396e-05, - "loss": 0.1563, - "step": 2610 - }, - { - "epoch": 0.2661790104642893, - "grad_norm": 2.84375, - "learning_rate": 4.219097286993684e-05, - "loss": 0.1697, - "step": 2620 - }, - { - "epoch": 0.26719496088590877, - "grad_norm": 2.125, - "learning_rate": 4.2132355370791946e-05, - "loss": 0.1844, - "step": 2630 - }, - { - "epoch": 0.26821091130752817, - "grad_norm": 4.03125, - "learning_rate": 4.2073559744120156e-05, - "loss": 0.2144, - "step": 2640 - }, - { - "epoch": 0.2692268617291476, - "grad_norm": 2.375, - "learning_rate": 4.201458660122793e-05, - "loss": 0.2013, - "step": 2650 - }, - { - "epoch": 0.27024281215076706, - "grad_norm": 3.625, - "learning_rate": 4.1955436555267393e-05, - "loss": 0.2166, - "step": 2660 - }, - { - "epoch": 0.27125876257238646, - "grad_norm": 0.328125, - "learning_rate": 4.189611022122997e-05, - "loss": 0.1934, - "step": 2670 - }, - { - "epoch": 0.2722747129940059, - "grad_norm": 2.75, - "learning_rate": 4.1836608215939944e-05, - "loss": 0.2157, - "step": 2680 - }, - { - "epoch": 0.2732906634156253, - "grad_norm": 3.5, - "learning_rate": 4.17769311580481e-05, - "loss": 0.18, - "step": 2690 - }, - { - "epoch": 0.27430661383724475, - "grad_norm": 2.109375, - "learning_rate": 4.171707966802528e-05, - "loss": 0.2178, - "step": 2700 - }, - { - "epoch": 0.27532256425886414, - "grad_norm": 4.65625, - "learning_rate": 4.16570543681559e-05, - "loss": 0.1896, - "step": 2710 - }, - { - "epoch": 0.2763385146804836, - "grad_norm": 4.8125, - "learning_rate": 4.159685588253151e-05, - "loss": 0.1322, - "step": 2720 - }, - { - "epoch": 0.27735446510210304, - "grad_norm": 3.9375, - "learning_rate": 4.153648483704429e-05, - "loss": 0.184, - "step": 2730 - }, - { - "epoch": 0.27837041552372244, - "grad_norm": 4.53125, - "learning_rate": 4.147594185938057e-05, - "loss": 0.2451, - "step": 2740 - }, - { - "epoch": 0.2793863659453419, - "grad_norm": 1.0390625, - "learning_rate": 4.141522757901426e-05, - "loss": 0.2367, - "step": 2750 - }, - { - "epoch": 0.2804023163669613, - "grad_norm": 3.375, - "learning_rate": 4.1354342627200345e-05, - "loss": 0.179, - "step": 2760 - }, - { - "epoch": 0.28141826678858073, - "grad_norm": 2.953125, - "learning_rate": 4.1293287636968286e-05, - "loss": 0.1396, - "step": 2770 - }, - { - "epoch": 0.2824342172102001, - "grad_norm": 2.546875, - "learning_rate": 4.1232063243115485e-05, - "loss": 0.1963, - "step": 2780 - }, - { - "epoch": 0.28345016763181957, - "grad_norm": 5.09375, - "learning_rate": 4.117067008220063e-05, - "loss": 0.2457, - "step": 2790 - }, - { - "epoch": 0.28446611805343897, - "grad_norm": 2.046875, - "learning_rate": 4.110910879253712e-05, - "loss": 0.2262, - "step": 2800 - }, - { - "epoch": 0.2854820684750584, - "grad_norm": 2.1875, - "learning_rate": 4.104738001418641e-05, - "loss": 0.2499, - "step": 2810 - }, - { - "epoch": 0.28649801889667786, - "grad_norm": 2.59375, - "learning_rate": 4.098548438895135e-05, - "loss": 0.1667, - "step": 2820 - }, - { - "epoch": 0.28751396931829726, - "grad_norm": 2.875, - "learning_rate": 4.092342256036954e-05, - "loss": 0.2288, - "step": 2830 - }, - { - "epoch": 0.2885299197399167, - "grad_norm": 3.015625, - "learning_rate": 4.086119517370659e-05, - "loss": 0.2038, - "step": 2840 - }, - { - "epoch": 0.2895458701615361, - "grad_norm": 3.53125, - "learning_rate": 4.0798802875949485e-05, - "loss": 0.181, - "step": 2850 - }, - { - "epoch": 0.29056182058315555, - "grad_norm": 2.296875, - "learning_rate": 4.073624631579975e-05, - "loss": 0.1886, - "step": 2860 - }, - { - "epoch": 0.29157777100477494, - "grad_norm": 3.609375, - "learning_rate": 4.067352614366685e-05, - "loss": 0.2053, - "step": 2870 - }, - { - "epoch": 0.2925937214263944, - "grad_norm": 2.328125, - "learning_rate": 4.061064301166128e-05, - "loss": 0.1409, - "step": 2880 - }, - { - "epoch": 0.29360967184801384, - "grad_norm": 4.9375, - "learning_rate": 4.054759757358787e-05, - "loss": 0.184, - "step": 2890 - }, - { - "epoch": 0.29462562226963324, - "grad_norm": 4.6875, - "learning_rate": 4.048439048493898e-05, - "loss": 0.2306, - "step": 2900 - }, - { - "epoch": 0.2956415726912527, - "grad_norm": 4.09375, - "learning_rate": 4.0421022402887676e-05, - "loss": 0.1914, - "step": 2910 - }, - { - "epoch": 0.2966575231128721, - "grad_norm": 2.3125, - "learning_rate": 4.035749398628088e-05, - "loss": 0.1653, - "step": 2920 - }, - { - "epoch": 0.29767347353449153, - "grad_norm": 2.515625, - "learning_rate": 4.029380589563256e-05, - "loss": 0.1941, - "step": 2930 - }, - { - "epoch": 0.2986894239561109, - "grad_norm": 1.78125, - "learning_rate": 4.02299587931168e-05, - "loss": 0.1117, - "step": 2940 - }, - { - "epoch": 0.29970537437773037, - "grad_norm": 0.8359375, - "learning_rate": 4.0165953342560974e-05, - "loss": 0.1605, - "step": 2950 - }, - { - "epoch": 0.30072132479934977, - "grad_norm": 3.046875, - "learning_rate": 4.010179020943884e-05, - "loss": 0.1726, - "step": 2960 - }, - { - "epoch": 0.3017372752209692, - "grad_norm": 3.453125, - "learning_rate": 4.003747006086357e-05, - "loss": 0.2208, - "step": 2970 - }, - { - "epoch": 0.30275322564258866, - "grad_norm": 2.515625, - "learning_rate": 3.9972993565580866e-05, - "loss": 0.1325, - "step": 2980 - }, - { - "epoch": 0.30376917606420806, - "grad_norm": 3.046875, - "learning_rate": 3.9908361393962e-05, - "loss": 0.2014, - "step": 2990 - }, - { - "epoch": 0.3047851264858275, - "grad_norm": 2.28125, - "learning_rate": 3.984357421799681e-05, - "loss": 0.165, - "step": 3000 - }, - { - "epoch": 0.3058010769074469, - "grad_norm": 5.09375, - "learning_rate": 3.9778632711286756e-05, - "loss": 0.212, - "step": 3010 - }, - { - "epoch": 0.30681702732906635, - "grad_norm": 4.25, - "learning_rate": 3.971353754903788e-05, - "loss": 0.2388, - "step": 3020 - }, - { - "epoch": 0.30783297775068574, - "grad_norm": 2.34375, - "learning_rate": 3.964828940805381e-05, - "loss": 0.2175, - "step": 3030 - }, - { - "epoch": 0.3088489281723052, - "grad_norm": 4.09375, - "learning_rate": 3.95828889667287e-05, - "loss": 0.2088, - "step": 3040 - }, - { - "epoch": 0.30986487859392464, - "grad_norm": 2.359375, - "learning_rate": 3.9517336905040244e-05, - "loss": 0.1913, - "step": 3050 - }, - { - "epoch": 0.31088082901554404, - "grad_norm": 1.1640625, - "learning_rate": 3.9451633904542483e-05, - "loss": 0.2185, - "step": 3060 - }, - { - "epoch": 0.3118967794371635, - "grad_norm": 2.59375, - "learning_rate": 3.9385780648358846e-05, - "loss": 0.2072, - "step": 3070 - }, - { - "epoch": 0.3129127298587829, - "grad_norm": 3.015625, - "learning_rate": 3.9319777821174955e-05, - "loss": 0.1902, - "step": 3080 - }, - { - "epoch": 0.31392868028040233, - "grad_norm": 2.375, - "learning_rate": 3.925362610923158e-05, - "loss": 0.259, - "step": 3090 - }, - { - "epoch": 0.3149446307020217, - "grad_norm": 4.65625, - "learning_rate": 3.918732620031742e-05, - "loss": 0.2026, - "step": 3100 - }, - { - "epoch": 0.31596058112364117, - "grad_norm": 2.1875, - "learning_rate": 3.912087878376205e-05, - "loss": 0.1478, - "step": 3110 - }, - { - "epoch": 0.31697653154526056, - "grad_norm": 2.34375, - "learning_rate": 3.905428455042865e-05, - "loss": 0.167, - "step": 3120 - }, - { - "epoch": 0.31799248196688, - "grad_norm": 2.390625, - "learning_rate": 3.898754419270693e-05, - "loss": 0.1629, - "step": 3130 - }, - { - "epoch": 0.31900843238849946, - "grad_norm": 1.546875, - "learning_rate": 3.892065840450583e-05, - "loss": 0.1308, - "step": 3140 - }, - { - "epoch": 0.32002438281011886, - "grad_norm": 4.625, - "learning_rate": 3.885362788124637e-05, - "loss": 0.2008, - "step": 3150 - }, - { - "epoch": 0.3210403332317383, - "grad_norm": 3.8125, - "learning_rate": 3.8786453319854396e-05, - "loss": 0.2225, - "step": 3160 - }, - { - "epoch": 0.3220562836533577, - "grad_norm": 3.015625, - "learning_rate": 3.8719135418753366e-05, - "loss": 0.2243, - "step": 3170 - }, - { - "epoch": 0.32307223407497715, - "grad_norm": 5.6875, - "learning_rate": 3.865167487785702e-05, - "loss": 0.1981, - "step": 3180 - }, - { - "epoch": 0.32408818449659654, - "grad_norm": 4.84375, - "learning_rate": 3.8584072398562164e-05, - "loss": 0.2031, - "step": 3190 - }, - { - "epoch": 0.325104134918216, - "grad_norm": 4.0625, - "learning_rate": 3.851632868374136e-05, - "loss": 0.1621, - "step": 3200 - }, - { - "epoch": 0.32612008533983544, - "grad_norm": 3.421875, - "learning_rate": 3.844844443773562e-05, - "loss": 0.1674, - "step": 3210 - }, - { - "epoch": 0.32713603576145484, - "grad_norm": 1.3671875, - "learning_rate": 3.8380420366347046e-05, - "loss": 0.1502, - "step": 3220 - }, - { - "epoch": 0.3281519861830743, - "grad_norm": 3.734375, - "learning_rate": 3.831225717683157e-05, - "loss": 0.1868, - "step": 3230 - }, - { - "epoch": 0.3291679366046937, - "grad_norm": 2.703125, - "learning_rate": 3.8243955577891534e-05, - "loss": 0.1818, - "step": 3240 - }, - { - "epoch": 0.3301838870263131, - "grad_norm": 3.796875, - "learning_rate": 3.8175516279668335e-05, - "loss": 0.2215, - "step": 3250 - }, - { - "epoch": 0.3311998374479325, - "grad_norm": 3.203125, - "learning_rate": 3.810693999373505e-05, - "loss": 0.2544, - "step": 3260 - }, - { - "epoch": 0.33221578786955197, - "grad_norm": 4.0, - "learning_rate": 3.8038227433089056e-05, - "loss": 0.1175, - "step": 3270 - }, - { - "epoch": 0.33323173829117136, - "grad_norm": 3.625, - "learning_rate": 3.796937931214458e-05, - "loss": 0.2213, - "step": 3280 - }, - { - "epoch": 0.3342476887127908, - "grad_norm": 1.7265625, - "learning_rate": 3.7900396346725296e-05, - "loss": 0.1711, - "step": 3290 - }, - { - "epoch": 0.33526363913441026, - "grad_norm": 3.140625, - "learning_rate": 3.783127925405686e-05, - "loss": 0.2628, - "step": 3300 - }, - { - "epoch": 0.33627958955602966, - "grad_norm": 2.1875, - "learning_rate": 3.77620287527595e-05, - "loss": 0.1671, - "step": 3310 - }, - { - "epoch": 0.3372955399776491, - "grad_norm": 5.28125, - "learning_rate": 3.769264556284048e-05, - "loss": 0.2109, - "step": 3320 - }, - { - "epoch": 0.3383114903992685, - "grad_norm": 2.875, - "learning_rate": 3.762313040568665e-05, - "loss": 0.1978, - "step": 3330 - }, - { - "epoch": 0.33932744082088795, - "grad_norm": 2.234375, - "learning_rate": 3.755348400405697e-05, - "loss": 0.1275, - "step": 3340 - }, - { - "epoch": 0.34034339124250734, - "grad_norm": 1.9453125, - "learning_rate": 3.7483707082074945e-05, - "loss": 0.1482, - "step": 3350 - }, - { - "epoch": 0.3413593416641268, - "grad_norm": 5.40625, - "learning_rate": 3.741380036522111e-05, - "loss": 0.1933, - "step": 3360 - }, - { - "epoch": 0.34237529208574624, - "grad_norm": 4.53125, - "learning_rate": 3.734376458032551e-05, - "loss": 0.1925, - "step": 3370 - }, - { - "epoch": 0.34339124250736563, - "grad_norm": 4.0625, - "learning_rate": 3.727360045556014e-05, - "loss": 0.2297, - "step": 3380 - }, - { - "epoch": 0.3444071929289851, - "grad_norm": 2.53125, - "learning_rate": 3.7203308720431336e-05, - "loss": 0.1704, - "step": 3390 - }, - { - "epoch": 0.3454231433506045, - "grad_norm": 1.859375, - "learning_rate": 3.7132890105772234e-05, - "loss": 0.258, - "step": 3400 - }, - { - "epoch": 0.3464390937722239, - "grad_norm": 3.90625, - "learning_rate": 3.706234534373515e-05, - "loss": 0.2376, - "step": 3410 - }, - { - "epoch": 0.3474550441938433, - "grad_norm": 1.1015625, - "learning_rate": 3.6991675167783985e-05, - "loss": 0.2403, - "step": 3420 - }, - { - "epoch": 0.34847099461546277, - "grad_norm": 1.1640625, - "learning_rate": 3.6920880312686556e-05, - "loss": 0.1642, - "step": 3430 - }, - { - "epoch": 0.34948694503708216, - "grad_norm": 2.875, - "learning_rate": 3.684996151450702e-05, - "loss": 0.1455, - "step": 3440 - }, - { - "epoch": 0.3505028954587016, - "grad_norm": 0.59765625, - "learning_rate": 3.6778919510598155e-05, - "loss": 0.2175, - "step": 3450 - }, - { - "epoch": 0.35151884588032106, - "grad_norm": 0.93359375, - "learning_rate": 3.670775503959376e-05, - "loss": 0.1858, - "step": 3460 - }, - { - "epoch": 0.35253479630194046, - "grad_norm": 4.1875, - "learning_rate": 3.6636468841400917e-05, - "loss": 0.1911, - "step": 3470 - }, - { - "epoch": 0.3535507467235599, - "grad_norm": 3.734375, - "learning_rate": 3.656506165719233e-05, - "loss": 0.2114, - "step": 3480 - }, - { - "epoch": 0.3545666971451793, - "grad_norm": 1.171875, - "learning_rate": 3.649353422939863e-05, - "loss": 0.1841, - "step": 3490 - }, - { - "epoch": 0.35558264756679875, - "grad_norm": 2.53125, - "learning_rate": 3.6421887301700615e-05, - "loss": 0.1505, - "step": 3500 - }, - { - "epoch": 0.35659859798841814, - "grad_norm": 4.9375, - "learning_rate": 3.6350121619021524e-05, - "loss": 0.2625, - "step": 3510 - }, - { - "epoch": 0.3576145484100376, - "grad_norm": 5.25, - "learning_rate": 3.627823792751936e-05, - "loss": 0.1676, - "step": 3520 - }, - { - "epoch": 0.35863049883165704, - "grad_norm": 1.09375, - "learning_rate": 3.620623697457905e-05, - "loss": 0.1963, - "step": 3530 - }, - { - "epoch": 0.35964644925327643, - "grad_norm": 4.03125, - "learning_rate": 3.613411950880468e-05, - "loss": 0.2048, - "step": 3540 - }, - { - "epoch": 0.3606623996748959, - "grad_norm": 4.40625, - "learning_rate": 3.606188628001178e-05, - "loss": 0.226, - "step": 3550 - }, - { - "epoch": 0.3616783500965153, - "grad_norm": 2.375, - "learning_rate": 3.598953803921947e-05, - "loss": 0.1884, - "step": 3560 - }, - { - "epoch": 0.3626943005181347, - "grad_norm": 3.21875, - "learning_rate": 3.591707553864266e-05, - "loss": 0.224, - "step": 3570 - }, - { - "epoch": 0.3637102509397541, - "grad_norm": 3.5625, - "learning_rate": 3.584449953168423e-05, - "loss": 0.1866, - "step": 3580 - }, - { - "epoch": 0.36472620136137357, - "grad_norm": 2.359375, - "learning_rate": 3.577181077292722e-05, - "loss": 0.1663, - "step": 3590 - }, - { - "epoch": 0.36574215178299296, - "grad_norm": 5.0, - "learning_rate": 3.569901001812696e-05, - "loss": 0.2032, - "step": 3600 - }, - { - "epoch": 0.3667581022046124, - "grad_norm": 1.953125, - "learning_rate": 3.562609802420321e-05, - "loss": 0.2395, - "step": 3610 - }, - { - "epoch": 0.36777405262623186, - "grad_norm": 3.796875, - "learning_rate": 3.555307554923229e-05, - "loss": 0.1799, - "step": 3620 - }, - { - "epoch": 0.36879000304785126, - "grad_norm": 4.4375, - "learning_rate": 3.547994335243925e-05, - "loss": 0.1771, - "step": 3630 - }, - { - "epoch": 0.3698059534694707, - "grad_norm": 1.890625, - "learning_rate": 3.540670219418989e-05, - "loss": 0.2123, - "step": 3640 - }, - { - "epoch": 0.3708219038910901, - "grad_norm": 4.03125, - "learning_rate": 3.53333528359829e-05, - "loss": 0.2159, - "step": 3650 - }, - { - "epoch": 0.37183785431270955, - "grad_norm": 3.265625, - "learning_rate": 3.525989604044198e-05, - "loss": 0.2749, - "step": 3660 - }, - { - "epoch": 0.37285380473432894, - "grad_norm": 1.4375, - "learning_rate": 3.5186332571307826e-05, - "loss": 0.1613, - "step": 3670 - }, - { - "epoch": 0.3738697551559484, - "grad_norm": 3.984375, - "learning_rate": 3.511266319343025e-05, - "loss": 0.1877, - "step": 3680 - }, - { - "epoch": 0.37488570557756784, - "grad_norm": 2.203125, - "learning_rate": 3.503888867276022e-05, - "loss": 0.2185, - "step": 3690 - }, - { - "epoch": 0.37590165599918723, - "grad_norm": 1.5078125, - "learning_rate": 3.4965009776341894e-05, - "loss": 0.2195, - "step": 3700 - }, - { - "epoch": 0.3769176064208067, - "grad_norm": 4.375, - "learning_rate": 3.489102727230461e-05, - "loss": 0.2344, - "step": 3710 - }, - { - "epoch": 0.3779335568424261, - "grad_norm": 2.984375, - "learning_rate": 3.481694192985496e-05, - "loss": 0.1863, - "step": 3720 - }, - { - "epoch": 0.3789495072640455, - "grad_norm": 1.1328125, - "learning_rate": 3.474275451926875e-05, - "loss": 0.1894, - "step": 3730 - }, - { - "epoch": 0.3799654576856649, - "grad_norm": 2.265625, - "learning_rate": 3.4668465811883e-05, - "loss": 0.2127, - "step": 3740 - }, - { - "epoch": 0.38098140810728437, - "grad_norm": 2.921875, - "learning_rate": 3.4594076580087914e-05, - "loss": 0.2125, - "step": 3750 - }, - { - "epoch": 0.38199735852890376, - "grad_norm": 2.390625, - "learning_rate": 3.451958759731889e-05, - "loss": 0.1801, - "step": 3760 - }, - { - "epoch": 0.3830133089505232, - "grad_norm": 3.046875, - "learning_rate": 3.4444999638048456e-05, - "loss": 0.1949, - "step": 3770 - }, - { - "epoch": 0.38402925937214266, - "grad_norm": 2.890625, - "learning_rate": 3.437031347777817e-05, - "loss": 0.2719, - "step": 3780 - }, - { - "epoch": 0.38504520979376206, - "grad_norm": 3.9375, - "learning_rate": 3.4295529893030634e-05, - "loss": 0.1697, - "step": 3790 - }, - { - "epoch": 0.3860611602153815, - "grad_norm": 2.0625, - "learning_rate": 3.422064966134138e-05, - "loss": 0.1557, - "step": 3800 - }, - { - "epoch": 0.3870771106370009, - "grad_norm": 2.234375, - "learning_rate": 3.4145673561250794e-05, - "loss": 0.2129, - "step": 3810 - }, - { - "epoch": 0.38809306105862035, - "grad_norm": 4.96875, - "learning_rate": 3.4070602372296e-05, - "loss": 0.2068, - "step": 3820 - }, - { - "epoch": 0.38910901148023974, - "grad_norm": 2.234375, - "learning_rate": 3.39954368750028e-05, - "loss": 0.1634, - "step": 3830 - }, - { - "epoch": 0.3901249619018592, - "grad_norm": 1.75, - "learning_rate": 3.392017785087752e-05, - "loss": 0.2299, - "step": 3840 - }, - { - "epoch": 0.39114091232347864, - "grad_norm": 3.90625, - "learning_rate": 3.38448260823989e-05, - "loss": 0.1585, - "step": 3850 - }, - { - "epoch": 0.39215686274509803, - "grad_norm": 2.8125, - "learning_rate": 3.376938235300996e-05, - "loss": 0.2382, - "step": 3860 - }, - { - "epoch": 0.3931728131667175, - "grad_norm": 5.375, - "learning_rate": 3.369384744710984e-05, - "loss": 0.1987, - "step": 3870 - }, - { - "epoch": 0.3941887635883369, - "grad_norm": 2.578125, - "learning_rate": 3.361822215004566e-05, - "loss": 0.2316, - "step": 3880 - }, - { - "epoch": 0.3952047140099563, - "grad_norm": 2.0, - "learning_rate": 3.354250724810436e-05, - "loss": 0.2019, - "step": 3890 - }, - { - "epoch": 0.3962206644315757, - "grad_norm": 2.3125, - "learning_rate": 3.34667035285045e-05, - "loss": 0.187, - "step": 3900 - }, - { - "epoch": 0.39723661485319517, - "grad_norm": 3.53125, - "learning_rate": 3.339081177938811e-05, - "loss": 0.2353, - "step": 3910 - }, - { - "epoch": 0.39825256527481456, - "grad_norm": 1.9609375, - "learning_rate": 3.331483278981244e-05, - "loss": 0.2078, - "step": 3920 - }, - { - "epoch": 0.399268515696434, - "grad_norm": 1.2109375, - "learning_rate": 3.323876734974183e-05, - "loss": 0.1761, - "step": 3930 - }, - { - "epoch": 0.40028446611805346, - "grad_norm": 4.0625, - "learning_rate": 3.316261625003943e-05, - "loss": 0.2081, - "step": 3940 - }, - { - "epoch": 0.40130041653967286, - "grad_norm": 1.953125, - "learning_rate": 3.308638028245902e-05, - "loss": 0.2087, - "step": 3950 - }, - { - "epoch": 0.4023163669612923, - "grad_norm": 2.390625, - "learning_rate": 3.301006023963676e-05, - "loss": 0.1579, - "step": 3960 - }, - { - "epoch": 0.4033323173829117, - "grad_norm": 3.53125, - "learning_rate": 3.293365691508295e-05, - "loss": 0.1904, - "step": 3970 - }, - { - "epoch": 0.40434826780453115, - "grad_norm": 3.0, - "learning_rate": 3.285717110317379e-05, - "loss": 0.1991, - "step": 3980 - }, - { - "epoch": 0.40536421822615054, - "grad_norm": 7.21875, - "learning_rate": 3.27806035991431e-05, - "loss": 0.1445, - "step": 3990 - }, - { - "epoch": 0.40638016864777, - "grad_norm": 1.0859375, - "learning_rate": 3.2703955199074075e-05, - "loss": 0.2393, - "step": 4000 - }, - { - "epoch": 0.40739611906938944, - "grad_norm": 4.5625, - "learning_rate": 3.262722669989098e-05, - "loss": 0.1789, - "step": 4010 - }, - { - "epoch": 0.40841206949100883, - "grad_norm": 3.09375, - "learning_rate": 3.255041889935092e-05, - "loss": 0.1511, - "step": 4020 - }, - { - "epoch": 0.4094280199126283, - "grad_norm": 1.90625, - "learning_rate": 3.247353259603547e-05, - "loss": 0.2066, - "step": 4030 - }, - { - "epoch": 0.4104439703342477, - "grad_norm": 2.28125, - "learning_rate": 3.239656858934242e-05, - "loss": 0.1564, - "step": 4040 - }, - { - "epoch": 0.4114599207558671, - "grad_norm": 2.609375, - "learning_rate": 3.231952767947746e-05, - "loss": 0.1503, - "step": 4050 - }, - { - "epoch": 0.4124758711774865, - "grad_norm": 1.4453125, - "learning_rate": 3.2242410667445844e-05, - "loss": 0.1633, - "step": 4060 - }, - { - "epoch": 0.41349182159910597, - "grad_norm": 3.015625, - "learning_rate": 3.2165218355044076e-05, - "loss": 0.1492, - "step": 4070 - }, - { - "epoch": 0.41450777202072536, - "grad_norm": 3.234375, - "learning_rate": 3.2087951544851566e-05, - "loss": 0.3051, - "step": 4080 - }, - { - "epoch": 0.4155237224423448, - "grad_norm": 2.9375, - "learning_rate": 3.20106110402223e-05, - "loss": 0.2229, - "step": 4090 - }, - { - "epoch": 0.41653967286396426, - "grad_norm": 3.171875, - "learning_rate": 3.1933197645276455e-05, - "loss": 0.2224, - "step": 4100 - }, - { - "epoch": 0.41755562328558365, - "grad_norm": 2.09375, - "learning_rate": 3.185571216489209e-05, - "loss": 0.1297, - "step": 4110 - }, - { - "epoch": 0.4185715737072031, - "grad_norm": 3.625, - "learning_rate": 3.177815540469669e-05, - "loss": 0.2074, - "step": 4120 - }, - { - "epoch": 0.4195875241288225, - "grad_norm": 2.296875, - "learning_rate": 3.1700528171058916e-05, - "loss": 0.1949, - "step": 4130 - }, - { - "epoch": 0.42060347455044195, - "grad_norm": 3.8125, - "learning_rate": 3.162283127108011e-05, - "loss": 0.1661, - "step": 4140 - }, - { - "epoch": 0.42161942497206134, - "grad_norm": 2.5, - "learning_rate": 3.154506551258594e-05, - "loss": 0.2275, - "step": 4150 - }, - { - "epoch": 0.4226353753936808, - "grad_norm": 2.96875, - "learning_rate": 3.146723170411804e-05, - "loss": 0.2242, - "step": 4160 - }, - { - "epoch": 0.42365132581530024, - "grad_norm": 6.625, - "learning_rate": 3.138933065492552e-05, - "loss": 0.1897, - "step": 4170 - }, - { - "epoch": 0.42466727623691963, - "grad_norm": 0.8515625, - "learning_rate": 3.131136317495665e-05, - "loss": 0.1629, - "step": 4180 - }, - { - "epoch": 0.4256832266585391, - "grad_norm": 0.94140625, - "learning_rate": 3.1233330074850364e-05, - "loss": 0.1535, - "step": 4190 - }, - { - "epoch": 0.4266991770801585, - "grad_norm": 2.6875, - "learning_rate": 3.115523216592786e-05, - "loss": 0.2494, - "step": 4200 - }, - { - "epoch": 0.4277151275017779, - "grad_norm": 2.578125, - "learning_rate": 3.107707026018417e-05, - "loss": 0.1705, - "step": 4210 - }, - { - "epoch": 0.4287310779233973, - "grad_norm": 3.0625, - "learning_rate": 3.09988451702797e-05, - "loss": 0.1507, - "step": 4220 - }, - { - "epoch": 0.42974702834501677, - "grad_norm": 2.421875, - "learning_rate": 3.0920557709531804e-05, - "loss": 0.3071, - "step": 4230 - }, - { - "epoch": 0.43076297876663616, - "grad_norm": 3.640625, - "learning_rate": 3.0842208691906306e-05, - "loss": 0.199, - "step": 4240 - }, - { - "epoch": 0.4317789291882556, - "grad_norm": 3.5625, - "learning_rate": 3.076379893200904e-05, - "loss": 0.1987, - "step": 4250 - }, - { - "epoch": 0.43279487960987506, - "grad_norm": 3.65625, - "learning_rate": 3.068532924507739e-05, - "loss": 0.1945, - "step": 4260 - }, - { - "epoch": 0.43381083003149445, - "grad_norm": 5.875, - "learning_rate": 3.060680044697183e-05, - "loss": 0.1937, - "step": 4270 - }, - { - "epoch": 0.4348267804531139, - "grad_norm": 2.859375, - "learning_rate": 3.052821335416739e-05, - "loss": 0.1643, - "step": 4280 - }, - { - "epoch": 0.4358427308747333, - "grad_norm": 3.296875, - "learning_rate": 3.0449568783745203e-05, - "loss": 0.1455, - "step": 4290 - }, - { - "epoch": 0.43685868129635275, - "grad_norm": 0.427734375, - "learning_rate": 3.0370867553384023e-05, - "loss": 0.1891, - "step": 4300 - }, - { - "epoch": 0.43787463171797214, - "grad_norm": 0.361328125, - "learning_rate": 3.029211048135171e-05, - "loss": 0.1377, - "step": 4310 - }, - { - "epoch": 0.4388905821395916, - "grad_norm": 1.8203125, - "learning_rate": 3.021329838649668e-05, - "loss": 0.2194, - "step": 4320 - }, - { - "epoch": 0.43990653256121104, - "grad_norm": 1.8828125, - "learning_rate": 3.0134432088239462e-05, - "loss": 0.1915, - "step": 4330 - }, - { - "epoch": 0.44092248298283043, - "grad_norm": 2.015625, - "learning_rate": 3.0055512406564146e-05, - "loss": 0.1794, - "step": 4340 - }, - { - "epoch": 0.4419384334044499, - "grad_norm": 2.546875, - "learning_rate": 2.9976540162009836e-05, - "loss": 0.2154, - "step": 4350 - }, - { - "epoch": 0.4429543838260693, - "grad_norm": 4.09375, - "learning_rate": 2.9897516175662155e-05, - "loss": 0.1861, - "step": 4360 - }, - { - "epoch": 0.4439703342476887, - "grad_norm": 3.953125, - "learning_rate": 2.9818441269144693e-05, - "loss": 0.1857, - "step": 4370 - }, - { - "epoch": 0.4449862846693081, - "grad_norm": 2.234375, - "learning_rate": 2.9739316264610452e-05, - "loss": 0.1493, - "step": 4380 - }, - { - "epoch": 0.44600223509092757, - "grad_norm": 1.109375, - "learning_rate": 2.966014198473332e-05, - "loss": 0.186, - "step": 4390 - }, - { - "epoch": 0.44701818551254696, - "grad_norm": 4.5625, - "learning_rate": 2.9580919252699502e-05, - "loss": 0.1963, - "step": 4400 - }, - { - "epoch": 0.4480341359341664, - "grad_norm": 7.3125, - "learning_rate": 2.9501648892198984e-05, - "loss": 0.2882, - "step": 4410 - }, - { - "epoch": 0.44905008635578586, - "grad_norm": 3.03125, - "learning_rate": 2.942233172741693e-05, - "loss": 0.2154, - "step": 4420 - }, - { - "epoch": 0.45006603677740525, - "grad_norm": 2.421875, - "learning_rate": 2.934296858302515e-05, - "loss": 0.2228, - "step": 4430 - }, - { - "epoch": 0.4510819871990247, - "grad_norm": 1.6015625, - "learning_rate": 2.9263560284173485e-05, - "loss": 0.1637, - "step": 4440 - }, - { - "epoch": 0.4520979376206441, - "grad_norm": 4.5, - "learning_rate": 2.91841076564813e-05, - "loss": 0.1396, - "step": 4450 - }, - { - "epoch": 0.45311388804226355, - "grad_norm": 1.9609375, - "learning_rate": 2.9104611526028808e-05, - "loss": 0.186, - "step": 4460 - }, - { - "epoch": 0.45412983846388294, - "grad_norm": 2.046875, - "learning_rate": 2.902507271934855e-05, - "loss": 0.1706, - "step": 4470 - }, - { - "epoch": 0.4551457888855024, - "grad_norm": 2.390625, - "learning_rate": 2.8945492063416768e-05, - "loss": 0.2191, - "step": 4480 - }, - { - "epoch": 0.45616173930712184, - "grad_norm": 2.734375, - "learning_rate": 2.8865870385644823e-05, - "loss": 0.1651, - "step": 4490 - }, - { - "epoch": 0.45717768972874123, - "grad_norm": 4.4375, - "learning_rate": 2.8786208513870583e-05, - "loss": 0.1907, - "step": 4500 - }, - { - "epoch": 0.4581936401503607, - "grad_norm": 1.9609375, - "learning_rate": 2.8706507276349815e-05, - "loss": 0.2256, - "step": 4510 - }, - { - "epoch": 0.4592095905719801, - "grad_norm": 3.375, - "learning_rate": 2.8626767501747588e-05, - "loss": 0.215, - "step": 4520 - }, - { - "epoch": 0.4602255409935995, - "grad_norm": 2.296875, - "learning_rate": 2.854699001912964e-05, - "loss": 0.2241, - "step": 4530 - }, - { - "epoch": 0.4612414914152189, - "grad_norm": 2.078125, - "learning_rate": 2.846717565795376e-05, - "loss": 0.1541, - "step": 4540 - }, - { - "epoch": 0.46225744183683837, - "grad_norm": 0.81640625, - "learning_rate": 2.8387325248061164e-05, - "loss": 0.1718, - "step": 4550 - }, - { - "epoch": 0.46327339225845776, - "grad_norm": 5.6875, - "learning_rate": 2.8307439619667897e-05, - "loss": 0.259, - "step": 4560 - }, - { - "epoch": 0.4642893426800772, - "grad_norm": 1.78125, - "learning_rate": 2.8227519603356157e-05, - "loss": 0.2205, - "step": 4570 - }, - { - "epoch": 0.46530529310169666, - "grad_norm": 4.78125, - "learning_rate": 2.8147566030065677e-05, - "loss": 0.2256, - "step": 4580 - }, - { - "epoch": 0.46632124352331605, - "grad_norm": 3.296875, - "learning_rate": 2.8067579731085085e-05, - "loss": 0.1671, - "step": 4590 - }, - { - "epoch": 0.4673371939449355, - "grad_norm": 3.265625, - "learning_rate": 2.7987561538043273e-05, - "loss": 0.2471, - "step": 4600 - }, - { - "epoch": 0.4683531443665549, - "grad_norm": 3.390625, - "learning_rate": 2.7907512282900727e-05, - "loss": 0.1749, - "step": 4610 - }, - { - "epoch": 0.46936909478817435, - "grad_norm": 3.140625, - "learning_rate": 2.782743279794091e-05, - "loss": 0.2276, - "step": 4620 - }, - { - "epoch": 0.47038504520979374, - "grad_norm": 2.921875, - "learning_rate": 2.7747323915761574e-05, - "loss": 0.1971, - "step": 4630 - }, - { - "epoch": 0.4714009956314132, - "grad_norm": 4.15625, - "learning_rate": 2.7667186469266122e-05, - "loss": 0.1951, - "step": 4640 - }, - { - "epoch": 0.47241694605303264, - "grad_norm": 2.953125, - "learning_rate": 2.7587021291654924e-05, - "loss": 0.2045, - "step": 4650 - }, - { - "epoch": 0.47343289647465203, - "grad_norm": 1.6640625, - "learning_rate": 2.750682921641672e-05, - "loss": 0.155, - "step": 4660 - }, - { - "epoch": 0.4744488468962715, - "grad_norm": 4.375, - "learning_rate": 2.7426611077319864e-05, - "loss": 0.2038, - "step": 4670 - }, - { - "epoch": 0.4754647973178909, - "grad_norm": 5.5, - "learning_rate": 2.734636770840372e-05, - "loss": 0.159, - "step": 4680 - }, - { - "epoch": 0.4764807477395103, - "grad_norm": 1.703125, - "learning_rate": 2.7266099943969976e-05, - "loss": 0.1566, - "step": 4690 - }, - { - "epoch": 0.4774966981611297, - "grad_norm": 0.81640625, - "learning_rate": 2.7185808618573943e-05, - "loss": 0.1927, - "step": 4700 - }, - { - "epoch": 0.47851264858274917, - "grad_norm": 0.81640625, - "learning_rate": 2.710549456701592e-05, - "loss": 0.1873, - "step": 4710 - }, - { - "epoch": 0.47952859900436856, - "grad_norm": 3.828125, - "learning_rate": 2.702515862433247e-05, - "loss": 0.2474, - "step": 4720 - }, - { - "epoch": 0.480544549425988, - "grad_norm": 1.1640625, - "learning_rate": 2.6944801625787795e-05, - "loss": 0.204, - "step": 4730 - }, - { - "epoch": 0.48156049984760746, - "grad_norm": 2.953125, - "learning_rate": 2.6864424406864984e-05, - "loss": 0.1758, - "step": 4740 - }, - { - "epoch": 0.48257645026922685, - "grad_norm": 3.265625, - "learning_rate": 2.6784027803257377e-05, - "loss": 0.161, - "step": 4750 - }, - { - "epoch": 0.4835924006908463, - "grad_norm": 2.046875, - "learning_rate": 2.6703612650859848e-05, - "loss": 0.1469, - "step": 4760 - }, - { - "epoch": 0.4846083511124657, - "grad_norm": 4.03125, - "learning_rate": 2.6623179785760148e-05, - "loss": 0.1858, - "step": 4770 - }, - { - "epoch": 0.48562430153408515, - "grad_norm": 2.65625, - "learning_rate": 2.6542730044230175e-05, - "loss": 0.176, - "step": 4780 - }, - { - "epoch": 0.48664025195570454, - "grad_norm": 2.59375, - "learning_rate": 2.6462264262717278e-05, - "loss": 0.1657, - "step": 4790 - }, - { - "epoch": 0.487656202377324, - "grad_norm": 4.78125, - "learning_rate": 2.6381783277835605e-05, - "loss": 0.2705, - "step": 4800 - }, - { - "epoch": 0.48867215279894344, - "grad_norm": 3.65625, - "learning_rate": 2.6301287926357355e-05, - "loss": 0.2252, - "step": 4810 - }, - { - "epoch": 0.48968810322056283, - "grad_norm": 0.734375, - "learning_rate": 2.622077904520411e-05, - "loss": 0.2141, - "step": 4820 - }, - { - "epoch": 0.4907040536421823, - "grad_norm": 5.15625, - "learning_rate": 2.6140257471438108e-05, - "loss": 0.1935, - "step": 4830 - }, - { - "epoch": 0.4917200040638017, - "grad_norm": 3.625, - "learning_rate": 2.6059724042253574e-05, - "loss": 0.2121, - "step": 4840 - }, - { - "epoch": 0.4927359544854211, - "grad_norm": 1.2890625, - "learning_rate": 2.5979179594967983e-05, - "loss": 0.1221, - "step": 4850 - }, - { - "epoch": 0.4937519049070405, - "grad_norm": 3.4375, - "learning_rate": 2.5898624967013367e-05, - "loss": 0.2208, - "step": 4860 - }, - { - "epoch": 0.49476785532865997, - "grad_norm": 2.40625, - "learning_rate": 2.5818060995927607e-05, - "loss": 0.1904, - "step": 4870 - }, - { - "epoch": 0.49578380575027936, - "grad_norm": 2.921875, - "learning_rate": 2.573748851934574e-05, - "loss": 0.1658, - "step": 4880 - }, - { - "epoch": 0.4967997561718988, - "grad_norm": 1.6640625, - "learning_rate": 2.5656908374991213e-05, - "loss": 0.1626, - "step": 4890 - }, - { - "epoch": 0.49781570659351826, - "grad_norm": 1.8046875, - "learning_rate": 2.557632140066721e-05, - "loss": 0.1905, - "step": 4900 - }, - { - "epoch": 0.49883165701513765, - "grad_norm": 4.875, - "learning_rate": 2.5495728434247917e-05, - "loss": 0.2591, - "step": 4910 - }, - { - "epoch": 0.4998476074367571, - "grad_norm": 1.4453125, - "learning_rate": 2.5415130313669845e-05, - "loss": 0.1359, - "step": 4920 - }, - { - "epoch": 0.5008635578583766, - "grad_norm": 2.109375, - "learning_rate": 2.5334527876923063e-05, - "loss": 0.2353, - "step": 4930 - }, - { - "epoch": 0.501879508279996, - "grad_norm": 3.546875, - "learning_rate": 2.5253921962042525e-05, - "loss": 0.2173, - "step": 4940 - }, - { - "epoch": 0.5028954587016153, - "grad_norm": 1.8125, - "learning_rate": 2.5173313407099373e-05, - "loss": 0.1631, - "step": 4950 - }, - { - "epoch": 0.5039114091232347, - "grad_norm": 2.671875, - "learning_rate": 2.5092703050192163e-05, - "loss": 0.1884, - "step": 4960 - }, - { - "epoch": 0.5049273595448542, - "grad_norm": 2.5625, - "learning_rate": 2.501209172943819e-05, - "loss": 0.217, - "step": 4970 - }, - { - "epoch": 0.5059433099664736, - "grad_norm": 4.375, - "learning_rate": 2.49314802829648e-05, - "loss": 0.1854, - "step": 4980 - }, - { - "epoch": 0.506959260388093, - "grad_norm": 2.3125, - "learning_rate": 2.4850869548900628e-05, - "loss": 0.2049, - "step": 4990 - }, - { - "epoch": 0.5079752108097125, - "grad_norm": 3.859375, - "learning_rate": 2.477026036536688e-05, - "loss": 0.2093, - "step": 5000 - }, - { - "epoch": 0.5089911612313319, - "grad_norm": 1.09375, - "learning_rate": 2.4689653570468677e-05, - "loss": 0.164, - "step": 5010 - }, - { - "epoch": 0.5100071116529513, - "grad_norm": 3.40625, - "learning_rate": 2.460905000228628e-05, - "loss": 0.1649, - "step": 5020 - }, - { - "epoch": 0.5110230620745707, - "grad_norm": 3.546875, - "learning_rate": 2.4528450498866428e-05, - "loss": 0.1777, - "step": 5030 - }, - { - "epoch": 0.5120390124961902, - "grad_norm": 3.0, - "learning_rate": 2.444785589821356e-05, - "loss": 0.1505, - "step": 5040 - }, - { - "epoch": 0.5130549629178096, - "grad_norm": 1.6484375, - "learning_rate": 2.436726703828118e-05, - "loss": 0.2672, - "step": 5050 - }, - { - "epoch": 0.514070913339429, - "grad_norm": 4.34375, - "learning_rate": 2.428668475696308e-05, - "loss": 0.1756, - "step": 5060 - }, - { - "epoch": 0.5150868637610485, - "grad_norm": 2.78125, - "learning_rate": 2.420610989208465e-05, - "loss": 0.1655, - "step": 5070 - }, - { - "epoch": 0.5161028141826679, - "grad_norm": 1.4609375, - "learning_rate": 2.412554328139419e-05, - "loss": 0.1579, - "step": 5080 - }, - { - "epoch": 0.5171187646042873, - "grad_norm": 2.28125, - "learning_rate": 2.404498576255416e-05, - "loss": 0.1599, - "step": 5090 - }, - { - "epoch": 0.5181347150259067, - "grad_norm": 0.6484375, - "learning_rate": 2.3964438173132522e-05, - "loss": 0.1508, - "step": 5100 - }, - { - "epoch": 0.5191506654475262, - "grad_norm": 3.390625, - "learning_rate": 2.388390135059395e-05, - "loss": 0.1578, - "step": 5110 - }, - { - "epoch": 0.5201666158691456, - "grad_norm": 1.21875, - "learning_rate": 2.3803376132291226e-05, - "loss": 0.1374, - "step": 5120 - }, - { - "epoch": 0.521182566290765, - "grad_norm": 4.0625, - "learning_rate": 2.3722863355456436e-05, - "loss": 0.1854, - "step": 5130 - }, - { - "epoch": 0.5221985167123845, - "grad_norm": 4.71875, - "learning_rate": 2.364236385719236e-05, - "loss": 0.1391, - "step": 5140 - }, - { - "epoch": 0.5232144671340039, - "grad_norm": 3.296875, - "learning_rate": 2.356187847446366e-05, - "loss": 0.2106, - "step": 5150 - }, - { - "epoch": 0.5242304175556233, - "grad_norm": 3.296875, - "learning_rate": 2.348140804408829e-05, - "loss": 0.2383, - "step": 5160 - }, - { - "epoch": 0.5252463679772427, - "grad_norm": 3.359375, - "learning_rate": 2.3400953402728713e-05, - "loss": 0.1537, - "step": 5170 - }, - { - "epoch": 0.5262623183988622, - "grad_norm": 1.4921875, - "learning_rate": 2.332051538688322e-05, - "loss": 0.1841, - "step": 5180 - }, - { - "epoch": 0.5272782688204816, - "grad_norm": 3.25, - "learning_rate": 2.3240094832877287e-05, - "loss": 0.1855, - "step": 5190 - }, - { - "epoch": 0.528294219242101, - "grad_norm": 3.34375, - "learning_rate": 2.3159692576854793e-05, - "loss": 0.2625, - "step": 5200 - }, - { - "epoch": 0.5293101696637205, - "grad_norm": 3.6875, - "learning_rate": 2.3079309454769413e-05, - "loss": 0.1292, - "step": 5210 - }, - { - "epoch": 0.5303261200853399, - "grad_norm": 1.1171875, - "learning_rate": 2.2998946302375827e-05, - "loss": 0.1263, - "step": 5220 - }, - { - "epoch": 0.5313420705069593, - "grad_norm": 2.71875, - "learning_rate": 2.2918603955221148e-05, - "loss": 0.2296, - "step": 5230 - }, - { - "epoch": 0.5323580209285786, - "grad_norm": 2.015625, - "learning_rate": 2.283828324863613e-05, - "loss": 0.1231, - "step": 5240 - }, - { - "epoch": 0.5333739713501982, - "grad_norm": 3.671875, - "learning_rate": 2.2757985017726557e-05, - "loss": 0.1939, - "step": 5250 - }, - { - "epoch": 0.5343899217718175, - "grad_norm": 1.9765625, - "learning_rate": 2.2677710097364495e-05, - "loss": 0.168, - "step": 5260 - }, - { - "epoch": 0.5354058721934369, - "grad_norm": 2.609375, - "learning_rate": 2.259745932217969e-05, - "loss": 0.1883, - "step": 5270 - }, - { - "epoch": 0.5364218226150563, - "grad_norm": 2.8125, - "learning_rate": 2.2517233526550817e-05, - "loss": 0.1898, - "step": 5280 - }, - { - "epoch": 0.5374377730366758, - "grad_norm": 3.125, - "learning_rate": 2.2437033544596837e-05, - "loss": 0.1838, - "step": 5290 - }, - { - "epoch": 0.5384537234582952, - "grad_norm": 4.90625, - "learning_rate": 2.2356860210168336e-05, - "loss": 0.1553, - "step": 5300 - }, - { - "epoch": 0.5394696738799146, - "grad_norm": 3.171875, - "learning_rate": 2.2276714356838824e-05, - "loss": 0.2248, - "step": 5310 - }, - { - "epoch": 0.5404856243015341, - "grad_norm": 1.34375, - "learning_rate": 2.2196596817896118e-05, - "loss": 0.1421, - "step": 5320 - }, - { - "epoch": 0.5415015747231535, - "grad_norm": 3.28125, - "learning_rate": 2.2116508426333596e-05, - "loss": 0.1947, - "step": 5330 - }, - { - "epoch": 0.5425175251447729, - "grad_norm": 1.9296875, - "learning_rate": 2.2036450014841652e-05, - "loss": 0.2207, - "step": 5340 - }, - { - "epoch": 0.5435334755663923, - "grad_norm": 0.5703125, - "learning_rate": 2.19564224157989e-05, - "loss": 0.2208, - "step": 5350 - }, - { - "epoch": 0.5445494259880118, - "grad_norm": 7.5625, - "learning_rate": 2.1876426461263654e-05, - "loss": 0.1739, - "step": 5360 - }, - { - "epoch": 0.5455653764096312, - "grad_norm": 2.15625, - "learning_rate": 2.179646298296519e-05, - "loss": 0.1938, - "step": 5370 - }, - { - "epoch": 0.5465813268312506, - "grad_norm": 4.1875, - "learning_rate": 2.171653281229511e-05, - "loss": 0.1736, - "step": 5380 - }, - { - "epoch": 0.5475972772528701, - "grad_norm": 4.65625, - "learning_rate": 2.1636636780298732e-05, - "loss": 0.2167, - "step": 5390 - }, - { - "epoch": 0.5486132276744895, - "grad_norm": 1.84375, - "learning_rate": 2.1556775717666427e-05, - "loss": 0.1711, - "step": 5400 - }, - { - "epoch": 0.5496291780961089, - "grad_norm": 5.125, - "learning_rate": 2.147695045472499e-05, - "loss": 0.1789, - "step": 5410 - }, - { - "epoch": 0.5506451285177283, - "grad_norm": 3.859375, - "learning_rate": 2.1397161821428973e-05, - "loss": 0.2187, - "step": 5420 - }, - { - "epoch": 0.5516610789393478, - "grad_norm": 2.25, - "learning_rate": 2.131741064735212e-05, - "loss": 0.1367, - "step": 5430 - }, - { - "epoch": 0.5526770293609672, - "grad_norm": 4.65625, - "learning_rate": 2.1237697761678684e-05, - "loss": 0.1574, - "step": 5440 - }, - { - "epoch": 0.5536929797825866, - "grad_norm": 1.2265625, - "learning_rate": 2.1158023993194848e-05, - "loss": 0.1301, - "step": 5450 - }, - { - "epoch": 0.5547089302042061, - "grad_norm": 4.21875, - "learning_rate": 2.107839017028005e-05, - "loss": 0.2782, - "step": 5460 - }, - { - "epoch": 0.5557248806258255, - "grad_norm": 0.52734375, - "learning_rate": 2.0998797120898457e-05, - "loss": 0.2024, - "step": 5470 - }, - { - "epoch": 0.5567408310474449, - "grad_norm": 1.46875, - "learning_rate": 2.0919245672590277e-05, - "loss": 0.1755, - "step": 5480 - }, - { - "epoch": 0.5577567814690643, - "grad_norm": 2.140625, - "learning_rate": 2.083973665246318e-05, - "loss": 0.2058, - "step": 5490 - }, - { - "epoch": 0.5587727318906838, - "grad_norm": 1.5390625, - "learning_rate": 2.076027088718373e-05, - "loss": 0.2159, - "step": 5500 - }, - { - "epoch": 0.5597886823123032, - "grad_norm": 1.9921875, - "learning_rate": 2.0680849202968743e-05, - "loss": 0.2139, - "step": 5510 - }, - { - "epoch": 0.5608046327339226, - "grad_norm": 2.4375, - "learning_rate": 2.060147242557674e-05, - "loss": 0.183, - "step": 5520 - }, - { - "epoch": 0.5618205831555421, - "grad_norm": 5.5, - "learning_rate": 2.0522141380299308e-05, - "loss": 0.1673, - "step": 5530 - }, - { - "epoch": 0.5628365335771615, - "grad_norm": 4.25, - "learning_rate": 2.044285689195258e-05, - "loss": 0.1674, - "step": 5540 - }, - { - "epoch": 0.5638524839987809, - "grad_norm": 2.109375, - "learning_rate": 2.0363619784868604e-05, - "loss": 0.1531, - "step": 5550 - }, - { - "epoch": 0.5648684344204002, - "grad_norm": 2.59375, - "learning_rate": 2.0284430882886836e-05, - "loss": 0.1665, - "step": 5560 - }, - { - "epoch": 0.5658843848420197, - "grad_norm": 3.984375, - "learning_rate": 2.020529100934549e-05, - "loss": 0.1717, - "step": 5570 - }, - { - "epoch": 0.5669003352636391, - "grad_norm": 1.6015625, - "learning_rate": 2.012620098707306e-05, - "loss": 0.1167, - "step": 5580 - }, - { - "epoch": 0.5679162856852585, - "grad_norm": 6.0625, - "learning_rate": 2.004716163837972e-05, - "loss": 0.2084, - "step": 5590 - }, - { - "epoch": 0.5689322361068779, - "grad_norm": 2.5625, - "learning_rate": 1.996817378504876e-05, - "loss": 0.1939, - "step": 5600 - }, - { - "epoch": 0.5699481865284974, - "grad_norm": 3.109375, - "learning_rate": 1.9889238248328108e-05, - "loss": 0.1241, - "step": 5610 - }, - { - "epoch": 0.5709641369501168, - "grad_norm": 4.875, - "learning_rate": 1.981035584892171e-05, - "loss": 0.1865, - "step": 5620 - }, - { - "epoch": 0.5719800873717362, - "grad_norm": 2.984375, - "learning_rate": 1.9731527406981072e-05, - "loss": 0.1639, - "step": 5630 - }, - { - "epoch": 0.5729960377933557, - "grad_norm": 4.4375, - "learning_rate": 1.9652753742096655e-05, - "loss": 0.2019, - "step": 5640 - }, - { - "epoch": 0.5740119882149751, - "grad_norm": 4.3125, - "learning_rate": 1.9574035673289432e-05, - "loss": 0.1829, - "step": 5650 - }, - { - "epoch": 0.5750279386365945, - "grad_norm": 3.203125, - "learning_rate": 1.9495374019002312e-05, - "loss": 0.2267, - "step": 5660 - }, - { - "epoch": 0.5760438890582139, - "grad_norm": 1.765625, - "learning_rate": 1.9416769597091673e-05, - "loss": 0.1411, - "step": 5670 - }, - { - "epoch": 0.5770598394798334, - "grad_norm": 2.640625, - "learning_rate": 1.9338223224818818e-05, - "loss": 0.1476, - "step": 5680 - }, - { - "epoch": 0.5780757899014528, - "grad_norm": 4.84375, - "learning_rate": 1.9259735718841524e-05, - "loss": 0.1417, - "step": 5690 - }, - { - "epoch": 0.5790917403230722, - "grad_norm": 2.421875, - "learning_rate": 1.918130789520551e-05, - "loss": 0.1592, - "step": 5700 - }, - { - "epoch": 0.5801076907446917, - "grad_norm": 2.984375, - "learning_rate": 1.9102940569335963e-05, - "loss": 0.161, - "step": 5710 - }, - { - "epoch": 0.5811236411663111, - "grad_norm": 1.0234375, - "learning_rate": 1.9024634556029093e-05, - "loss": 0.1614, - "step": 5720 - }, - { - "epoch": 0.5821395915879305, - "grad_norm": 2.90625, - "learning_rate": 1.89463906694436e-05, - "loss": 0.1505, - "step": 5730 - }, - { - "epoch": 0.5831555420095499, - "grad_norm": 2.875, - "learning_rate": 1.8868209723092286e-05, - "loss": 0.1674, - "step": 5740 - }, - { - "epoch": 0.5841714924311694, - "grad_norm": 0.408203125, - "learning_rate": 1.8790092529833508e-05, - "loss": 0.1468, - "step": 5750 - }, - { - "epoch": 0.5851874428527888, - "grad_norm": 5.1875, - "learning_rate": 1.871203990186281e-05, - "loss": 0.1903, - "step": 5760 - }, - { - "epoch": 0.5862033932744082, - "grad_norm": 0.5546875, - "learning_rate": 1.8634052650704415e-05, - "loss": 0.2644, - "step": 5770 - }, - { - "epoch": 0.5872193436960277, - "grad_norm": 3.203125, - "learning_rate": 1.8556131587202848e-05, - "loss": 0.1968, - "step": 5780 - }, - { - "epoch": 0.5882352941176471, - "grad_norm": 2.484375, - "learning_rate": 1.8478277521514424e-05, - "loss": 0.2249, - "step": 5790 - }, - { - "epoch": 0.5892512445392665, - "grad_norm": 4.0, - "learning_rate": 1.8400491263098906e-05, - "loss": 0.1881, - "step": 5800 - }, - { - "epoch": 0.5902671949608859, - "grad_norm": 1.90625, - "learning_rate": 1.832277362071106e-05, - "loss": 0.1352, - "step": 5810 - }, - { - "epoch": 0.5912831453825054, - "grad_norm": 2.765625, - "learning_rate": 1.824512540239221e-05, - "loss": 0.2737, - "step": 5820 - }, - { - "epoch": 0.5922990958041248, - "grad_norm": 2.609375, - "learning_rate": 1.81675474154619e-05, - "loss": 0.1566, - "step": 5830 - }, - { - "epoch": 0.5933150462257442, - "grad_norm": 2.6875, - "learning_rate": 1.8090040466509444e-05, - "loss": 0.1999, - "step": 5840 - }, - { - "epoch": 0.5943309966473637, - "grad_norm": 2.609375, - "learning_rate": 1.8012605361385592e-05, - "loss": 0.2372, - "step": 5850 - }, - { - "epoch": 0.5953469470689831, - "grad_norm": 8.125, - "learning_rate": 1.7935242905194087e-05, - "loss": 0.2411, - "step": 5860 - }, - { - "epoch": 0.5963628974906025, - "grad_norm": 3.46875, - "learning_rate": 1.785795390228336e-05, - "loss": 0.138, - "step": 5870 - }, - { - "epoch": 0.5973788479122218, - "grad_norm": 2.3125, - "learning_rate": 1.7780739156238125e-05, - "loss": 0.1867, - "step": 5880 - }, - { - "epoch": 0.5983947983338413, - "grad_norm": 4.0625, - "learning_rate": 1.770359946987105e-05, - "loss": 0.2091, - "step": 5890 - }, - { - "epoch": 0.5994107487554607, - "grad_norm": 5.21875, - "learning_rate": 1.7626535645214378e-05, - "loss": 0.2091, - "step": 5900 - }, - { - "epoch": 0.6004266991770801, - "grad_norm": 3.15625, - "learning_rate": 1.7549548483511614e-05, - "loss": 0.1927, - "step": 5910 - }, - { - "epoch": 0.6014426495986995, - "grad_norm": 4.71875, - "learning_rate": 1.7472638785209198e-05, - "loss": 0.1893, - "step": 5920 - }, - { - "epoch": 0.602458600020319, - "grad_norm": 3.015625, - "learning_rate": 1.7395807349948145e-05, - "loss": 0.1557, - "step": 5930 - }, - { - "epoch": 0.6034745504419384, - "grad_norm": 2.9375, - "learning_rate": 1.73190549765558e-05, - "loss": 0.1717, - "step": 5940 - }, - { - "epoch": 0.6044905008635578, - "grad_norm": 3.109375, - "learning_rate": 1.724238246303745e-05, - "loss": 0.1879, - "step": 5950 - }, - { - "epoch": 0.6055064512851773, - "grad_norm": 3.875, - "learning_rate": 1.71657906065681e-05, - "loss": 0.1908, - "step": 5960 - }, - { - "epoch": 0.6065224017067967, - "grad_norm": 5.09375, - "learning_rate": 1.7089280203484115e-05, - "loss": 0.1712, - "step": 5970 - }, - { - "epoch": 0.6075383521284161, - "grad_norm": 3.015625, - "learning_rate": 1.701285204927502e-05, - "loss": 0.1454, - "step": 5980 - }, - { - "epoch": 0.6085543025500355, - "grad_norm": 3.265625, - "learning_rate": 1.693650693857515e-05, - "loss": 0.2283, - "step": 5990 - }, - { - "epoch": 0.609570252971655, - "grad_norm": 3.40625, - "learning_rate": 1.6860245665155466e-05, - "loss": 0.2188, - "step": 6000 - }, - { - "epoch": 0.6105862033932744, - "grad_norm": 2.5625, - "learning_rate": 1.678406902191521e-05, - "loss": 0.1605, - "step": 6010 - }, - { - "epoch": 0.6116021538148938, - "grad_norm": 0.6796875, - "learning_rate": 1.670797780087374e-05, - "loss": 0.1472, - "step": 6020 - }, - { - "epoch": 0.6126181042365133, - "grad_norm": 2.234375, - "learning_rate": 1.6631972793162288e-05, - "loss": 0.1676, - "step": 6030 - }, - { - "epoch": 0.6136340546581327, - "grad_norm": 1.25, - "learning_rate": 1.6556054789015662e-05, - "loss": 0.1508, - "step": 6040 - }, - { - "epoch": 0.6146500050797521, - "grad_norm": 4.78125, - "learning_rate": 1.6480224577764132e-05, - "loss": 0.1981, - "step": 6050 - }, - { - "epoch": 0.6156659555013715, - "grad_norm": 3.46875, - "learning_rate": 1.6404482947825137e-05, - "loss": 0.2514, - "step": 6060 - }, - { - "epoch": 0.616681905922991, - "grad_norm": 1.265625, - "learning_rate": 1.6328830686695154e-05, - "loss": 0.2397, - "step": 6070 - }, - { - "epoch": 0.6176978563446104, - "grad_norm": 1.953125, - "learning_rate": 1.625326858094144e-05, - "loss": 0.1523, - "step": 6080 - }, - { - "epoch": 0.6187138067662298, - "grad_norm": 3.484375, - "learning_rate": 1.6177797416193953e-05, - "loss": 0.218, - "step": 6090 - }, - { - "epoch": 0.6197297571878493, - "grad_norm": 3.484375, - "learning_rate": 1.6102417977137052e-05, - "loss": 0.1476, - "step": 6100 - }, - { - "epoch": 0.6207457076094687, - "grad_norm": 4.90625, - "learning_rate": 1.602713104750147e-05, - "loss": 0.1818, - "step": 6110 - }, - { - "epoch": 0.6217616580310881, - "grad_norm": 4.375, - "learning_rate": 1.5951937410056087e-05, - "loss": 0.2061, - "step": 6120 - }, - { - "epoch": 0.6227776084527075, - "grad_norm": 6.3125, - "learning_rate": 1.587683784659979e-05, - "loss": 0.1566, - "step": 6130 - }, - { - "epoch": 0.623793558874327, - "grad_norm": 2.828125, - "learning_rate": 1.58018331379534e-05, - "loss": 0.1376, - "step": 6140 - }, - { - "epoch": 0.6248095092959464, - "grad_norm": 2.40625, - "learning_rate": 1.572692406395149e-05, - "loss": 0.1655, - "step": 6150 - }, - { - "epoch": 0.6258254597175658, - "grad_norm": 4.34375, - "learning_rate": 1.5652111403434338e-05, - "loss": 0.2363, - "step": 6160 - }, - { - "epoch": 0.6268414101391853, - "grad_norm": 2.453125, - "learning_rate": 1.5577395934239757e-05, - "loss": 0.2464, - "step": 6170 - }, - { - "epoch": 0.6278573605608047, - "grad_norm": 2.53125, - "learning_rate": 1.5502778433195085e-05, - "loss": 0.1898, - "step": 6180 - }, - { - "epoch": 0.628873310982424, - "grad_norm": 2.28125, - "learning_rate": 1.5428259676109048e-05, - "loss": 0.1804, - "step": 6190 - }, - { - "epoch": 0.6298892614040434, - "grad_norm": 4.3125, - "learning_rate": 1.5353840437763732e-05, - "loss": 0.1409, - "step": 6200 - }, - { - "epoch": 0.630905211825663, - "grad_norm": 2.5625, - "learning_rate": 1.5279521491906496e-05, - "loss": 0.2449, - "step": 6210 - }, - { - "epoch": 0.6319211622472823, - "grad_norm": 3.0625, - "learning_rate": 1.520530361124195e-05, - "loss": 0.2103, - "step": 6220 - }, - { - "epoch": 0.6329371126689017, - "grad_norm": 2.609375, - "learning_rate": 1.5131187567423937e-05, - "loss": 0.2156, - "step": 6230 - }, - { - "epoch": 0.6339530630905211, - "grad_norm": 2.703125, - "learning_rate": 1.5057174131047446e-05, - "loss": 0.161, - "step": 6240 - }, - { - "epoch": 0.6349690135121406, - "grad_norm": 3.265625, - "learning_rate": 1.4983264071640679e-05, - "loss": 0.1757, - "step": 6250 - }, - { - "epoch": 0.63598496393376, - "grad_norm": 3.15625, - "learning_rate": 1.490945815765699e-05, - "loss": 0.2011, - "step": 6260 - }, - { - "epoch": 0.6370009143553794, - "grad_norm": 5.375, - "learning_rate": 1.4835757156466945e-05, - "loss": 0.1658, - "step": 6270 - }, - { - "epoch": 0.6380168647769989, - "grad_norm": 2.984375, - "learning_rate": 1.4762161834350271e-05, - "loss": 0.1754, - "step": 6280 - }, - { - "epoch": 0.6390328151986183, - "grad_norm": 2.015625, - "learning_rate": 1.4688672956487987e-05, - "loss": 0.1427, - "step": 6290 - }, - { - "epoch": 0.6400487656202377, - "grad_norm": 3.78125, - "learning_rate": 1.4615291286954352e-05, - "loss": 0.1517, - "step": 6300 - }, - { - "epoch": 0.6410647160418571, - "grad_norm": 2.859375, - "learning_rate": 1.4542017588709005e-05, - "loss": 0.2348, - "step": 6310 - }, - { - "epoch": 0.6420806664634766, - "grad_norm": 2.421875, - "learning_rate": 1.4468852623588961e-05, - "loss": 0.2089, - "step": 6320 - }, - { - "epoch": 0.643096616885096, - "grad_norm": 2.15625, - "learning_rate": 1.4395797152300719e-05, - "loss": 0.1702, - "step": 6330 - }, - { - "epoch": 0.6441125673067154, - "grad_norm": 1.53125, - "learning_rate": 1.4322851934412382e-05, - "loss": 0.1017, - "step": 6340 - }, - { - "epoch": 0.6451285177283349, - "grad_norm": 1.90625, - "learning_rate": 1.4250017728345716e-05, - "loss": 0.1813, - "step": 6350 - }, - { - "epoch": 0.6461444681499543, - "grad_norm": 2.015625, - "learning_rate": 1.4177295291368292e-05, - "loss": 0.1095, - "step": 6360 - }, - { - "epoch": 0.6471604185715737, - "grad_norm": 2.625, - "learning_rate": 1.410468537958558e-05, - "loss": 0.2259, - "step": 6370 - }, - { - "epoch": 0.6481763689931931, - "grad_norm": 3.5, - "learning_rate": 1.4032188747933136e-05, - "loss": 0.1595, - "step": 6380 - }, - { - "epoch": 0.6491923194148126, - "grad_norm": 5.21875, - "learning_rate": 1.39598061501687e-05, - "loss": 0.2226, - "step": 6390 - }, - { - "epoch": 0.650208269836432, - "grad_norm": 5.34375, - "learning_rate": 1.388753833886442e-05, - "loss": 0.2132, - "step": 6400 - }, - { - "epoch": 0.6512242202580514, - "grad_norm": 3.640625, - "learning_rate": 1.3815386065398945e-05, - "loss": 0.1227, - "step": 6410 - }, - { - "epoch": 0.6522401706796709, - "grad_norm": 1.0, - "learning_rate": 1.3743350079949705e-05, - "loss": 0.1755, - "step": 6420 - }, - { - "epoch": 0.6532561211012903, - "grad_norm": 2.359375, - "learning_rate": 1.3671431131485057e-05, - "loss": 0.1552, - "step": 6430 - }, - { - "epoch": 0.6542720715229097, - "grad_norm": 5.3125, - "learning_rate": 1.3599629967756483e-05, - "loss": 0.1917, - "step": 6440 - }, - { - "epoch": 0.6552880219445291, - "grad_norm": 4.625, - "learning_rate": 1.3527947335290877e-05, - "loss": 0.1812, - "step": 6450 - }, - { - "epoch": 0.6563039723661486, - "grad_norm": 1.234375, - "learning_rate": 1.3456383979382708e-05, - "loss": 0.1896, - "step": 6460 - }, - { - "epoch": 0.657319922787768, - "grad_norm": 3.984375, - "learning_rate": 1.3384940644086352e-05, - "loss": 0.1484, - "step": 6470 - }, - { - "epoch": 0.6583358732093874, - "grad_norm": 2.40625, - "learning_rate": 1.3313618072208268e-05, - "loss": 0.1334, - "step": 6480 - }, - { - "epoch": 0.6593518236310069, - "grad_norm": 4.375, - "learning_rate": 1.3242417005299357e-05, - "loss": 0.1351, - "step": 6490 - }, - { - "epoch": 0.6603677740526263, - "grad_norm": 2.640625, - "learning_rate": 1.31713381836472e-05, - "loss": 0.1717, - "step": 6500 - }, - { - "epoch": 0.6613837244742456, - "grad_norm": 2.640625, - "learning_rate": 1.3100382346268392e-05, - "loss": 0.1867, - "step": 6510 - }, - { - "epoch": 0.662399674895865, - "grad_norm": 1.734375, - "learning_rate": 1.3029550230900812e-05, - "loss": 0.1997, - "step": 6520 - }, - { - "epoch": 0.6634156253174845, - "grad_norm": 3.609375, - "learning_rate": 1.2958842573996016e-05, - "loss": 0.1969, - "step": 6530 - }, - { - "epoch": 0.6644315757391039, - "grad_norm": 3.578125, - "learning_rate": 1.2888260110711525e-05, - "loss": 0.1469, - "step": 6540 - }, - { - "epoch": 0.6654475261607233, - "grad_norm": 1.3515625, - "learning_rate": 1.2817803574903212e-05, - "loss": 0.1524, - "step": 6550 - }, - { - "epoch": 0.6664634765823427, - "grad_norm": 2.109375, - "learning_rate": 1.2747473699117668e-05, - "loss": 0.159, - "step": 6560 - }, - { - "epoch": 0.6674794270039622, - "grad_norm": 1.53125, - "learning_rate": 1.267727121458458e-05, - "loss": 0.1999, - "step": 6570 - }, - { - "epoch": 0.6684953774255816, - "grad_norm": 1.7265625, - "learning_rate": 1.2607196851209137e-05, - "loss": 0.2216, - "step": 6580 - }, - { - "epoch": 0.669511327847201, - "grad_norm": 3.125, - "learning_rate": 1.2537251337564412e-05, - "loss": 0.1607, - "step": 6590 - }, - { - "epoch": 0.6705272782688205, - "grad_norm": 2.421875, - "learning_rate": 1.2467435400883839e-05, - "loss": 0.2187, - "step": 6600 - }, - { - "epoch": 0.6715432286904399, - "grad_norm": 1.5078125, - "learning_rate": 1.239774976705359e-05, - "loss": 0.1753, - "step": 6610 - }, - { - "epoch": 0.6725591791120593, - "grad_norm": 1.140625, - "learning_rate": 1.2328195160605092e-05, - "loss": 0.194, - "step": 6620 - }, - { - "epoch": 0.6735751295336787, - "grad_norm": 4.9375, - "learning_rate": 1.225877230470743e-05, - "loss": 0.1485, - "step": 6630 - }, - { - "epoch": 0.6745910799552982, - "grad_norm": 3.65625, - "learning_rate": 1.218948192115988e-05, - "loss": 0.1847, - "step": 6640 - }, - { - "epoch": 0.6756070303769176, - "grad_norm": 3.875, - "learning_rate": 1.21203247303844e-05, - "loss": 0.1874, - "step": 6650 - }, - { - "epoch": 0.676622980798537, - "grad_norm": 2.65625, - "learning_rate": 1.2051301451418073e-05, - "loss": 0.2377, - "step": 6660 - }, - { - "epoch": 0.6776389312201565, - "grad_norm": 2.09375, - "learning_rate": 1.198241280190574e-05, - "loss": 0.1508, - "step": 6670 - }, - { - "epoch": 0.6786548816417759, - "grad_norm": 2.203125, - "learning_rate": 1.1913659498092431e-05, - "loss": 0.1537, - "step": 6680 - }, - { - "epoch": 0.6796708320633953, - "grad_norm": 2.484375, - "learning_rate": 1.184504225481601e-05, - "loss": 0.2339, - "step": 6690 - }, - { - "epoch": 0.6806867824850147, - "grad_norm": 5.625, - "learning_rate": 1.177656178549966e-05, - "loss": 0.2102, - "step": 6700 - }, - { - "epoch": 0.6817027329066342, - "grad_norm": 2.5, - "learning_rate": 1.1708218802144536e-05, - "loss": 0.1435, - "step": 6710 - }, - { - "epoch": 0.6827186833282536, - "grad_norm": 3.84375, - "learning_rate": 1.1640014015322323e-05, - "loss": 0.1823, - "step": 6720 - }, - { - "epoch": 0.683734633749873, - "grad_norm": 2.359375, - "learning_rate": 1.1571948134167862e-05, - "loss": 0.1154, - "step": 6730 - }, - { - "epoch": 0.6847505841714925, - "grad_norm": 2.90625, - "learning_rate": 1.1504021866371761e-05, - "loss": 0.2105, - "step": 6740 - }, - { - "epoch": 0.6857665345931119, - "grad_norm": 5.46875, - "learning_rate": 1.143623591817304e-05, - "loss": 0.1317, - "step": 6750 - }, - { - "epoch": 0.6867824850147313, - "grad_norm": 3.34375, - "learning_rate": 1.1368590994351835e-05, - "loss": 0.1406, - "step": 6760 - }, - { - "epoch": 0.6877984354363507, - "grad_norm": 3.78125, - "learning_rate": 1.130108779822198e-05, - "loss": 0.1425, - "step": 6770 - }, - { - "epoch": 0.6888143858579702, - "grad_norm": 0.77734375, - "learning_rate": 1.1233727031623783e-05, - "loss": 0.1623, - "step": 6780 - }, - { - "epoch": 0.6898303362795896, - "grad_norm": 4.625, - "learning_rate": 1.1166509394916682e-05, - "loss": 0.1591, - "step": 6790 - }, - { - "epoch": 0.690846286701209, - "grad_norm": 3.84375, - "learning_rate": 1.1099435586971982e-05, - "loss": 0.1758, - "step": 6800 - }, - { - "epoch": 0.6918622371228285, - "grad_norm": 2.4375, - "learning_rate": 1.1032506305165555e-05, - "loss": 0.1018, - "step": 6810 - }, - { - "epoch": 0.6928781875444479, - "grad_norm": 3.203125, - "learning_rate": 1.0965722245370641e-05, - "loss": 0.1485, - "step": 6820 - }, - { - "epoch": 0.6938941379660672, - "grad_norm": 0.7109375, - "learning_rate": 1.0899084101950561e-05, - "loss": 0.1762, - "step": 6830 - }, - { - "epoch": 0.6949100883876866, - "grad_norm": 1.9765625, - "learning_rate": 1.0832592567751555e-05, - "loss": 0.1402, - "step": 6840 - }, - { - "epoch": 0.6959260388093061, - "grad_norm": 1.4609375, - "learning_rate": 1.0766248334095505e-05, - "loss": 0.2278, - "step": 6850 - }, - { - "epoch": 0.6969419892309255, - "grad_norm": 3.953125, - "learning_rate": 1.0700052090772828e-05, - "loss": 0.1969, - "step": 6860 - }, - { - "epoch": 0.6979579396525449, - "grad_norm": 2.453125, - "learning_rate": 1.0634004526035249e-05, - "loss": 0.2073, - "step": 6870 - }, - { - "epoch": 0.6989738900741643, - "grad_norm": 1.6171875, - "learning_rate": 1.0568106326588645e-05, - "loss": 0.1902, - "step": 6880 - }, - { - "epoch": 0.6999898404957838, - "grad_norm": 1.2734375, - "learning_rate": 1.0502358177585953e-05, - "loss": 0.2165, - "step": 6890 - }, - { - "epoch": 0.7010057909174032, - "grad_norm": 1.671875, - "learning_rate": 1.0436760762619977e-05, - "loss": 0.1952, - "step": 6900 - }, - { - "epoch": 0.7020217413390226, - "grad_norm": 2.8125, - "learning_rate": 1.0371314763716347e-05, - "loss": 0.1422, - "step": 6910 - }, - { - "epoch": 0.7030376917606421, - "grad_norm": 2.53125, - "learning_rate": 1.0306020861326388e-05, - "loss": 0.0961, - "step": 6920 - }, - { - "epoch": 0.7040536421822615, - "grad_norm": 3.046875, - "learning_rate": 1.0240879734320068e-05, - "loss": 0.1542, - "step": 6930 - }, - { - "epoch": 0.7050695926038809, - "grad_norm": 2.859375, - "learning_rate": 1.0175892059978901e-05, - "loss": 0.1748, - "step": 6940 - }, - { - "epoch": 0.7060855430255003, - "grad_norm": 2.671875, - "learning_rate": 1.0111058513988958e-05, - "loss": 0.0819, - "step": 6950 - }, - { - "epoch": 0.7071014934471198, - "grad_norm": 3.5625, - "learning_rate": 1.0046379770433803e-05, - "loss": 0.1933, - "step": 6960 - }, - { - "epoch": 0.7081174438687392, - "grad_norm": 2.859375, - "learning_rate": 9.98185650178749e-06, - "loss": 0.1891, - "step": 6970 - }, - { - "epoch": 0.7091333942903586, - "grad_norm": 3.15625, - "learning_rate": 9.917489378907591e-06, - "loss": 0.2102, - "step": 6980 - }, - { - "epoch": 0.7101493447119781, - "grad_norm": 6.40625, - "learning_rate": 9.853279071028212e-06, - "loss": 0.1714, - "step": 6990 - }, - { - "epoch": 0.7111652951335975, - "grad_norm": 2.375, - "learning_rate": 9.78922624575303e-06, - "loss": 0.1299, - "step": 7000 - }, - { - "epoch": 0.7121812455552169, - "grad_norm": 2.078125, - "learning_rate": 9.72533156904833e-06, - "loss": 0.1914, - "step": 7010 - }, - { - "epoch": 0.7131971959768363, - "grad_norm": 3.859375, - "learning_rate": 9.661595705236137e-06, - "loss": 0.2377, - "step": 7020 - }, - { - "epoch": 0.7142131463984558, - "grad_norm": 1.171875, - "learning_rate": 9.598019316987244e-06, - "loss": 0.1851, - "step": 7030 - }, - { - "epoch": 0.7152290968200752, - "grad_norm": 1.078125, - "learning_rate": 9.53460306531439e-06, - "loss": 0.2661, - "step": 7040 - }, - { - "epoch": 0.7162450472416946, - "grad_norm": 1.6484375, - "learning_rate": 9.471347609565311e-06, - "loss": 0.1669, - "step": 7050 - }, - { - "epoch": 0.7172609976633141, - "grad_norm": 4.59375, - "learning_rate": 9.408253607415957e-06, - "loss": 0.2487, - "step": 7060 - }, - { - "epoch": 0.7182769480849335, - "grad_norm": 3.09375, - "learning_rate": 9.345321714863614e-06, - "loss": 0.186, - "step": 7070 - }, - { - "epoch": 0.7192928985065529, - "grad_norm": 6.0625, - "learning_rate": 9.282552586220075e-06, - "loss": 0.2249, - "step": 7080 - }, - { - "epoch": 0.7203088489281723, - "grad_norm": 1.5703125, - "learning_rate": 9.219946874104885e-06, - "loss": 0.1255, - "step": 7090 - }, - { - "epoch": 0.7213247993497918, - "grad_norm": 1.9453125, - "learning_rate": 9.157505229438481e-06, - "loss": 0.1999, - "step": 7100 - }, - { - "epoch": 0.7223407497714112, - "grad_norm": 5.1875, - "learning_rate": 9.095228301435518e-06, - "loss": 0.199, - "step": 7110 - }, - { - "epoch": 0.7233567001930306, - "grad_norm": 2.078125, - "learning_rate": 9.03311673759802e-06, - "loss": 0.2182, - "step": 7120 - }, - { - "epoch": 0.7243726506146501, - "grad_norm": 6.46875, - "learning_rate": 8.971171183708733e-06, - "loss": 0.1573, - "step": 7130 - }, - { - "epoch": 0.7253886010362695, - "grad_norm": 3.015625, - "learning_rate": 8.909392283824353e-06, - "loss": 0.2044, - "step": 7140 - }, - { - "epoch": 0.7264045514578888, - "grad_norm": 2.921875, - "learning_rate": 8.847780680268872e-06, - "loss": 0.11, - "step": 7150 - }, - { - "epoch": 0.7274205018795082, - "grad_norm": 2.96875, - "learning_rate": 8.786337013626853e-06, - "loss": 0.1897, - "step": 7160 - }, - { - "epoch": 0.7284364523011277, - "grad_norm": 1.7578125, - "learning_rate": 8.725061922736799e-06, - "loss": 0.153, - "step": 7170 - }, - { - "epoch": 0.7294524027227471, - "grad_norm": 1.609375, - "learning_rate": 8.663956044684532e-06, - "loss": 0.1746, - "step": 7180 - }, - { - "epoch": 0.7304683531443665, - "grad_norm": 1.9375, - "learning_rate": 8.603020014796507e-06, - "loss": 0.2284, - "step": 7190 - }, - { - "epoch": 0.7314843035659859, - "grad_norm": 1.515625, - "learning_rate": 8.542254466633273e-06, - "loss": 0.1186, - "step": 7200 - }, - { - "epoch": 0.7325002539876054, - "grad_norm": 1.671875, - "learning_rate": 8.481660031982844e-06, - "loss": 0.1971, - "step": 7210 - }, - { - "epoch": 0.7335162044092248, - "grad_norm": 1.453125, - "learning_rate": 8.421237340854157e-06, - "loss": 0.196, - "step": 7220 - }, - { - "epoch": 0.7345321548308442, - "grad_norm": 0.65234375, - "learning_rate": 8.360987021470479e-06, - "loss": 0.1724, - "step": 7230 - }, - { - "epoch": 0.7355481052524637, - "grad_norm": 2.84375, - "learning_rate": 8.300909700262929e-06, - "loss": 0.175, - "step": 7240 - }, - { - "epoch": 0.7365640556740831, - "grad_norm": 3.109375, - "learning_rate": 8.241006001863924e-06, - "loss": 0.2276, - "step": 7250 - }, - { - "epoch": 0.7375800060957025, - "grad_norm": 4.8125, - "learning_rate": 8.181276549100714e-06, - "loss": 0.2029, - "step": 7260 - }, - { - "epoch": 0.7385959565173219, - "grad_norm": 4.03125, - "learning_rate": 8.12172196298887e-06, - "loss": 0.175, - "step": 7270 - }, - { - "epoch": 0.7396119069389414, - "grad_norm": 3.046875, - "learning_rate": 8.062342862725878e-06, - "loss": 0.1662, - "step": 7280 - }, - { - "epoch": 0.7406278573605608, - "grad_norm": 3.375, - "learning_rate": 8.003139865684662e-06, - "loss": 0.1616, - "step": 7290 - }, - { - "epoch": 0.7416438077821802, - "grad_norm": 2.5625, - "learning_rate": 7.944113587407157e-06, - "loss": 0.2448, - "step": 7300 - }, - { - "epoch": 0.7426597582037997, - "grad_norm": 4.125, - "learning_rate": 7.885264641597961e-06, - "loss": 0.1618, - "step": 7310 - }, - { - "epoch": 0.7436757086254191, - "grad_norm": 3.5, - "learning_rate": 7.826593640117889e-06, - "loss": 0.1134, - "step": 7320 - }, - { - "epoch": 0.7446916590470385, - "grad_norm": 2.6875, - "learning_rate": 7.76810119297767e-06, - "loss": 0.1795, - "step": 7330 - }, - { - "epoch": 0.7457076094686579, - "grad_norm": 4.34375, - "learning_rate": 7.709787908331556e-06, - "loss": 0.2736, - "step": 7340 - }, - { - "epoch": 0.7467235598902774, - "grad_norm": 1.21875, - "learning_rate": 7.651654392471038e-06, - "loss": 0.139, - "step": 7350 - }, - { - "epoch": 0.7477395103118968, - "grad_norm": 3.578125, - "learning_rate": 7.593701249818521e-06, - "loss": 0.2023, - "step": 7360 - }, - { - "epoch": 0.7487554607335162, - "grad_norm": 2.15625, - "learning_rate": 7.535929082921048e-06, - "loss": 0.1702, - "step": 7370 - }, - { - "epoch": 0.7497714111551357, - "grad_norm": 1.96875, - "learning_rate": 7.47833849244402e-06, - "loss": 0.1835, - "step": 7380 - }, - { - "epoch": 0.7507873615767551, - "grad_norm": 2.796875, - "learning_rate": 7.420930077164959e-06, - "loss": 0.1713, - "step": 7390 - }, - { - "epoch": 0.7518033119983745, - "grad_norm": 4.46875, - "learning_rate": 7.363704433967311e-06, - "loss": 0.1906, - "step": 7400 - }, - { - "epoch": 0.7528192624199939, - "grad_norm": 1.75, - "learning_rate": 7.306662157834185e-06, - "loss": 0.1421, - "step": 7410 - }, - { - "epoch": 0.7538352128416134, - "grad_norm": 1.140625, - "learning_rate": 7.2498038418422145e-06, - "loss": 0.1793, - "step": 7420 - }, - { - "epoch": 0.7548511632632328, - "grad_norm": 2.578125, - "learning_rate": 7.193130077155374e-06, - "loss": 0.1603, - "step": 7430 - }, - { - "epoch": 0.7558671136848522, - "grad_norm": 4.3125, - "learning_rate": 7.13664145301883e-06, - "loss": 0.2169, - "step": 7440 - }, - { - "epoch": 0.7568830641064717, - "grad_norm": 3.078125, - "learning_rate": 7.0803385567528025e-06, - "loss": 0.1685, - "step": 7450 - }, - { - "epoch": 0.757899014528091, - "grad_norm": 3.5625, - "learning_rate": 7.024221973746495e-06, - "loss": 0.2282, - "step": 7460 - }, - { - "epoch": 0.7589149649497104, - "grad_norm": 2.265625, - "learning_rate": 6.968292287451961e-06, - "loss": 0.1786, - "step": 7470 - }, - { - "epoch": 0.7599309153713298, - "grad_norm": 4.71875, - "learning_rate": 6.912550079378091e-06, - "loss": 0.1811, - "step": 7480 - }, - { - "epoch": 0.7609468657929493, - "grad_norm": 2.328125, - "learning_rate": 6.856995929084506e-06, - "loss": 0.1747, - "step": 7490 - }, - { - "epoch": 0.7619628162145687, - "grad_norm": 5.21875, - "learning_rate": 6.801630414175589e-06, - "loss": 0.2028, - "step": 7500 - }, - { - "epoch": 0.7629787666361881, - "grad_norm": 3.78125, - "learning_rate": 6.746454110294451e-06, - "loss": 0.2255, - "step": 7510 - }, - { - "epoch": 0.7639947170578075, - "grad_norm": 1.625, - "learning_rate": 6.691467591116931e-06, - "loss": 0.1604, - "step": 7520 - }, - { - "epoch": 0.765010667479427, - "grad_norm": 1.7734375, - "learning_rate": 6.6366714283456755e-06, - "loss": 0.2559, - "step": 7530 - }, - { - "epoch": 0.7660266179010464, - "grad_norm": 4.59375, - "learning_rate": 6.582066191704142e-06, - "loss": 0.2034, - "step": 7540 - }, - { - "epoch": 0.7670425683226658, - "grad_norm": 1.578125, - "learning_rate": 6.527652448930724e-06, - "loss": 0.148, - "step": 7550 - }, - { - "epoch": 0.7680585187442853, - "grad_norm": 1.7109375, - "learning_rate": 6.4734307657728e-06, - "loss": 0.1811, - "step": 7560 - }, - { - "epoch": 0.7690744691659047, - "grad_norm": 1.2734375, - "learning_rate": 6.419401705980924e-06, - "loss": 0.1407, - "step": 7570 - }, - { - "epoch": 0.7700904195875241, - "grad_norm": 2.25, - "learning_rate": 6.365565831302869e-06, - "loss": 0.1893, - "step": 7580 - }, - { - "epoch": 0.7711063700091435, - "grad_norm": 1.625, - "learning_rate": 6.311923701477854e-06, - "loss": 0.1835, - "step": 7590 - }, - { - "epoch": 0.772122320430763, - "grad_norm": 2.375, - "learning_rate": 6.258475874230713e-06, - "loss": 0.1579, - "step": 7600 - }, - { - "epoch": 0.7731382708523824, - "grad_norm": 4.5, - "learning_rate": 6.205222905266067e-06, - "loss": 0.1794, - "step": 7610 - }, - { - "epoch": 0.7741542212740018, - "grad_norm": 4.25, - "learning_rate": 6.152165348262598e-06, - "loss": 0.1477, - "step": 7620 - }, - { - "epoch": 0.7751701716956213, - "grad_norm": 1.9765625, - "learning_rate": 6.0993037548672246e-06, - "loss": 0.2396, - "step": 7630 - }, - { - "epoch": 0.7761861221172407, - "grad_norm": 2.671875, - "learning_rate": 6.046638674689454e-06, - "loss": 0.1717, - "step": 7640 - }, - { - "epoch": 0.7772020725388601, - "grad_norm": 3.671875, - "learning_rate": 5.994170655295567e-06, - "loss": 0.2646, - "step": 7650 - }, - { - "epoch": 0.7782180229604795, - "grad_norm": 1.3046875, - "learning_rate": 5.9419002422030106e-06, - "loss": 0.1553, - "step": 7660 - }, - { - "epoch": 0.779233973382099, - "grad_norm": 3.734375, - "learning_rate": 5.889827978874665e-06, - "loss": 0.1854, - "step": 7670 - }, - { - "epoch": 0.7802499238037184, - "grad_norm": 2.140625, - "learning_rate": 5.837954406713245e-06, - "loss": 0.1857, - "step": 7680 - }, - { - "epoch": 0.7812658742253378, - "grad_norm": 3.34375, - "learning_rate": 5.786280065055619e-06, - "loss": 0.1797, - "step": 7690 - }, - { - "epoch": 0.7822818246469573, - "grad_norm": 0.97265625, - "learning_rate": 5.734805491167244e-06, - "loss": 0.1488, - "step": 7700 - }, - { - "epoch": 0.7832977750685767, - "grad_norm": 2.078125, - "learning_rate": 5.683531220236576e-06, - "loss": 0.1688, - "step": 7710 - }, - { - "epoch": 0.7843137254901961, - "grad_norm": 3.046875, - "learning_rate": 5.632457785369455e-06, - "loss": 0.1503, - "step": 7720 - }, - { - "epoch": 0.7853296759118155, - "grad_norm": 1.6875, - "learning_rate": 5.581585717583637e-06, - "loss": 0.1658, - "step": 7730 - }, - { - "epoch": 0.786345626333435, - "grad_norm": 3.421875, - "learning_rate": 5.530915545803209e-06, - "loss": 0.2112, - "step": 7740 - }, - { - "epoch": 0.7873615767550544, - "grad_norm": 4.1875, - "learning_rate": 5.480447796853141e-06, - "loss": 0.165, - "step": 7750 - }, - { - "epoch": 0.7883775271766738, - "grad_norm": 5.3125, - "learning_rate": 5.430182995453756e-06, - "loss": 0.1499, - "step": 7760 - }, - { - "epoch": 0.7893934775982933, - "grad_norm": 2.1875, - "learning_rate": 5.380121664215329e-06, - "loss": 0.1559, - "step": 7770 - }, - { - "epoch": 0.7904094280199127, - "grad_norm": 1.46875, - "learning_rate": 5.330264323632611e-06, - "loss": 0.2098, - "step": 7780 - }, - { - "epoch": 0.791425378441532, - "grad_norm": 4.65625, - "learning_rate": 5.280611492079449e-06, - "loss": 0.1776, - "step": 7790 - }, - { - "epoch": 0.7924413288631514, - "grad_norm": 1.3359375, - "learning_rate": 5.231163685803361e-06, - "loss": 0.1497, - "step": 7800 - }, - { - "epoch": 0.7934572792847709, - "grad_norm": 2.640625, - "learning_rate": 5.181921418920191e-06, - "loss": 0.12, - "step": 7810 - }, - { - "epoch": 0.7944732297063903, - "grad_norm": 2.328125, - "learning_rate": 5.13288520340878e-06, - "loss": 0.1981, - "step": 7820 - }, - { - "epoch": 0.7954891801280097, - "grad_norm": 3.0625, - "learning_rate": 5.084055549105596e-06, - "loss": 0.1389, - "step": 7830 - }, - { - "epoch": 0.7965051305496291, - "grad_norm": 2.796875, - "learning_rate": 5.035432963699479e-06, - "loss": 0.2293, - "step": 7840 - }, - { - "epoch": 0.7975210809712486, - "grad_norm": 5.0625, - "learning_rate": 4.98701795272635e-06, - "loss": 0.1618, - "step": 7850 - }, - { - "epoch": 0.798537031392868, - "grad_norm": 5.09375, - "learning_rate": 4.938811019563938e-06, - "loss": 0.1755, - "step": 7860 - }, - { - "epoch": 0.7995529818144874, - "grad_norm": 2.140625, - "learning_rate": 4.8908126654265475e-06, - "loss": 0.1565, - "step": 7870 - }, - { - "epoch": 0.8005689322361069, - "grad_norm": 0.76171875, - "learning_rate": 4.843023389359885e-06, - "loss": 0.2176, - "step": 7880 - }, - { - "epoch": 0.8015848826577263, - "grad_norm": 2.625, - "learning_rate": 4.79544368823581e-06, - "loss": 0.2013, - "step": 7890 - }, - { - "epoch": 0.8026008330793457, - "grad_norm": 2.078125, - "learning_rate": 4.748074056747234e-06, - "loss": 0.1246, - "step": 7900 - }, - { - "epoch": 0.8036167835009651, - "grad_norm": 3.5, - "learning_rate": 4.700914987402919e-06, - "loss": 0.1638, - "step": 7910 - }, - { - "epoch": 0.8046327339225846, - "grad_norm": 3.4375, - "learning_rate": 4.6539669705223916e-06, - "loss": 0.2213, - "step": 7920 - }, - { - "epoch": 0.805648684344204, - "grad_norm": 2.96875, - "learning_rate": 4.607230494230849e-06, - "loss": 0.1822, - "step": 7930 - }, - { - "epoch": 0.8066646347658234, - "grad_norm": 2.359375, - "learning_rate": 4.560706044454047e-06, - "loss": 0.1763, - "step": 7940 - }, - { - "epoch": 0.8076805851874429, - "grad_norm": 4.59375, - "learning_rate": 4.514394104913291e-06, - "loss": 0.234, - "step": 7950 - }, - { - "epoch": 0.8086965356090623, - "grad_norm": 1.96875, - "learning_rate": 4.468295157120372e-06, - "loss": 0.1939, - "step": 7960 - }, - { - "epoch": 0.8097124860306817, - "grad_norm": 2.578125, - "learning_rate": 4.422409680372594e-06, - "loss": 0.174, - "step": 7970 - }, - { - "epoch": 0.8107284364523011, - "grad_norm": 4.5625, - "learning_rate": 4.3767381517477505e-06, - "loss": 0.2375, - "step": 7980 - }, - { - "epoch": 0.8117443868739206, - "grad_norm": 0.9609375, - "learning_rate": 4.331281046099203e-06, - "loss": 0.2076, - "step": 7990 - }, - { - "epoch": 0.81276033729554, - "grad_norm": 6.0625, - "learning_rate": 4.286038836050929e-06, - "loss": 0.2504, - "step": 8000 - } - ], - "logging_steps": 10, - "max_steps": 9843, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "total_flos": 0.0, - "train_batch_size": 7, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoints/checkpoint-8000/training_args.bin b/checkpoints/checkpoint-8000/training_args.bin deleted file mode 100644 index 6bb0a4ec10ab29ca1942aecdfa4212d352e373f1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-8000/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 -size 4984 diff --git a/checkpoints/checkpoint-8500/adapter_config.json b/checkpoints/checkpoint-8500/adapter_config.json deleted file mode 100644 index 2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-8500/adapter_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "alpha_pattern": {}, - "auto_mapping": null, - "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": "gaussian", - "layer_replication": null, - "layers_pattern": null, - "layers_to_transform": null, - "loftq_config": {}, - "lora_alpha": 32, - "lora_dropout": 0.1, - "megatron_config": null, - "megatron_core": "megatron.core", - "modules_to_save": null, - "peft_type": "LORA", - "r": 8, - "rank_pattern": {}, - "revision": null, - "target_modules": [ - "v_proj", - "q_proj" - ], - "task_type": "CAUSAL_LM", - "use_dora": false, - "use_rslora": false -} \ No newline at end of file diff --git a/checkpoints/checkpoint-8500/adapter_model.safetensors b/checkpoints/checkpoint-8500/adapter_model.safetensors deleted file mode 100644 index c4142c2f632b19891dd52f1c81cfe510afb044ab..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-8500/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:28ac81ab3c97c69dfd1334375e0193bcbeec6d6567bc44b70349e272cd340ee0 -size 5919456 diff --git a/checkpoints/checkpoint-8500/optimizer.pt b/checkpoints/checkpoint-8500/optimizer.pt deleted file mode 100644 index 0eb48e2be9f73c79b44660a595421fd4cb1d762c..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-8500/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a97970da7a377ddc03d41ac023f8d6e5a218717457c493a71d03647daee1d293 -size 11930938 diff --git a/checkpoints/checkpoint-8500/rng_state_0.pth b/checkpoints/checkpoint-8500/rng_state_0.pth deleted file mode 100644 index de8f82b7c4feb8b730311d766bf44d581384eae7..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-8500/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a353686e04717107f6f8ef46d31bd168a04a71491dff1b49a95dd841cae20dee -size 15024 diff --git a/checkpoints/checkpoint-8500/rng_state_1.pth b/checkpoints/checkpoint-8500/rng_state_1.pth deleted file mode 100644 index 58bca56d46a97b8f2ee454510a5f93c7f3bb7410..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-8500/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:59a024dac60d84768b968adca05416d784c0295ae139bead6f2816285ba6b01b -size 15024 diff --git a/checkpoints/checkpoint-8500/rng_state_2.pth b/checkpoints/checkpoint-8500/rng_state_2.pth deleted file mode 100644 index 9748eeb88596cf9f27c2821bec97e56aa5f2ccfc..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-8500/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1f28ab7ae50115d00765b2d4d2188a1d88b5bcb9be07f45fe7e78cb4752256bd -size 15024 diff --git a/checkpoints/checkpoint-8500/rng_state_3.pth b/checkpoints/checkpoint-8500/rng_state_3.pth deleted file mode 100644 index 354e6dd8952c6a6e554d8c56f7f0daa728e13f22..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-8500/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ce60533c28c44e2a4955c454dfc9eac41778818fa945283e28161cac010c0143 -size 15024 diff --git a/checkpoints/checkpoint-8500/scheduler.pt b/checkpoints/checkpoint-8500/scheduler.pt deleted file mode 100644 index f22ac3e321eef132a6a05d855a039dde00c7589a..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-8500/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d013f7dc21c771c94b398b7ee47f1c3b60572d9f3b478e50ee4f2b58d385070d -size 1064 diff --git a/checkpoints/checkpoint-8500/trainer_state.json b/checkpoints/checkpoint-8500/trainer_state.json deleted file mode 100644 index 3ef48260e073af534f11c2fb65f8473ce1a4a401..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-8500/trainer_state.json +++ /dev/null @@ -1,5971 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.8635578583765112, - "eval_steps": 500, - "global_step": 8500, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001015950421619425, - "grad_norm": 17.625, - "learning_rate": 5e-06, - "loss": 3.4264, - "step": 10 - }, - { - "epoch": 0.00203190084323885, - "grad_norm": 12.5625, - "learning_rate": 1e-05, - "loss": 3.432, - "step": 20 - }, - { - "epoch": 0.003047851264858275, - "grad_norm": 14.0625, - "learning_rate": 1.5e-05, - "loss": 3.23, - "step": 30 - }, - { - "epoch": 0.0040638016864777, - "grad_norm": 12.4375, - "learning_rate": 2e-05, - "loss": 2.9762, - "step": 40 - }, - { - "epoch": 0.005079752108097125, - "grad_norm": 10.0625, - "learning_rate": 2.5e-05, - "loss": 2.6173, - "step": 50 - }, - { - "epoch": 0.00609570252971655, - "grad_norm": 10.1875, - "learning_rate": 3e-05, - "loss": 2.2004, - "step": 60 - }, - { - "epoch": 0.007111652951335975, - "grad_norm": 7.03125, - "learning_rate": 3.5e-05, - "loss": 1.4176, - "step": 70 - }, - { - "epoch": 0.0081276033729554, - "grad_norm": 4.375, - "learning_rate": 4e-05, - "loss": 1.0122, - "step": 80 - }, - { - "epoch": 0.009143553794574825, - "grad_norm": 6.5625, - "learning_rate": 4.5e-05, - "loss": 0.9116, - "step": 90 - }, - { - "epoch": 0.01015950421619425, - "grad_norm": 5.28125, - "learning_rate": 5e-05, - "loss": 0.6832, - "step": 100 - }, - { - "epoch": 0.011175454637813675, - "grad_norm": 5.5, - "learning_rate": 4.9999870035728426e-05, - "loss": 0.7355, - "step": 110 - }, - { - "epoch": 0.0121914050594331, - "grad_norm": 5.1875, - "learning_rate": 4.9999480144264944e-05, - "loss": 0.6673, - "step": 120 - }, - { - "epoch": 0.013207355481052525, - "grad_norm": 4.5, - "learning_rate": 4.9998830329663314e-05, - "loss": 0.6792, - "step": 130 - }, - { - "epoch": 0.01422330590267195, - "grad_norm": 3.9375, - "learning_rate": 4.9997920598679756e-05, - "loss": 0.6207, - "step": 140 - }, - { - "epoch": 0.015239256324291375, - "grad_norm": 3.15625, - "learning_rate": 4.999675096077286e-05, - "loss": 0.483, - "step": 150 - }, - { - "epoch": 0.0162552067459108, - "grad_norm": 5.28125, - "learning_rate": 4.999532142810354e-05, - "loss": 0.5319, - "step": 160 - }, - { - "epoch": 0.017271157167530225, - "grad_norm": 4.59375, - "learning_rate": 4.999363201553483e-05, - "loss": 0.6052, - "step": 170 - }, - { - "epoch": 0.01828710758914965, - "grad_norm": 5.03125, - "learning_rate": 4.9991682740631794e-05, - "loss": 0.4258, - "step": 180 - }, - { - "epoch": 0.019303058010769075, - "grad_norm": 3.859375, - "learning_rate": 4.998947362366133e-05, - "loss": 0.4309, - "step": 190 - }, - { - "epoch": 0.0203190084323885, - "grad_norm": 3.328125, - "learning_rate": 4.998700468759193e-05, - "loss": 0.3957, - "step": 200 - }, - { - "epoch": 0.021334958854007924, - "grad_norm": 4.9375, - "learning_rate": 4.9984275958093475e-05, - "loss": 0.4777, - "step": 210 - }, - { - "epoch": 0.02235090927562735, - "grad_norm": 4.78125, - "learning_rate": 4.998128746353695e-05, - "loss": 0.3549, - "step": 220 - }, - { - "epoch": 0.023366859697246774, - "grad_norm": 4.0625, - "learning_rate": 4.997803923499417e-05, - "loss": 0.4447, - "step": 230 - }, - { - "epoch": 0.0243828101188662, - "grad_norm": 6.375, - "learning_rate": 4.99745313062374e-05, - "loss": 0.3808, - "step": 240 - }, - { - "epoch": 0.025398760540485624, - "grad_norm": 3.59375, - "learning_rate": 4.99707637137391e-05, - "loss": 0.3827, - "step": 250 - }, - { - "epoch": 0.02641471096210505, - "grad_norm": 3.015625, - "learning_rate": 4.996673649667145e-05, - "loss": 0.3694, - "step": 260 - }, - { - "epoch": 0.027430661383724474, - "grad_norm": 2.296875, - "learning_rate": 4.9962449696906e-05, - "loss": 0.3586, - "step": 270 - }, - { - "epoch": 0.0284466118053439, - "grad_norm": 4.125, - "learning_rate": 4.9957903359013214e-05, - "loss": 0.3832, - "step": 280 - }, - { - "epoch": 0.029462562226963324, - "grad_norm": 3.296875, - "learning_rate": 4.995309753026201e-05, - "loss": 0.328, - "step": 290 - }, - { - "epoch": 0.03047851264858275, - "grad_norm": 4.5, - "learning_rate": 4.994803226061927e-05, - "loss": 0.3667, - "step": 300 - }, - { - "epoch": 0.03149446307020217, - "grad_norm": 4.3125, - "learning_rate": 4.994270760274933e-05, - "loss": 0.3811, - "step": 310 - }, - { - "epoch": 0.0325104134918216, - "grad_norm": 3.421875, - "learning_rate": 4.99371236120134e-05, - "loss": 0.3065, - "step": 320 - }, - { - "epoch": 0.03352636391344102, - "grad_norm": 4.6875, - "learning_rate": 4.993128034646902e-05, - "loss": 0.4177, - "step": 330 - }, - { - "epoch": 0.03454231433506045, - "grad_norm": 3.046875, - "learning_rate": 4.992517786686947e-05, - "loss": 0.33, - "step": 340 - }, - { - "epoch": 0.03555826475667987, - "grad_norm": 1.8828125, - "learning_rate": 4.9918816236663077e-05, - "loss": 0.3287, - "step": 350 - }, - { - "epoch": 0.0365742151782993, - "grad_norm": 3.8125, - "learning_rate": 4.991219552199262e-05, - "loss": 0.2934, - "step": 360 - }, - { - "epoch": 0.03759016559991872, - "grad_norm": 4.28125, - "learning_rate": 4.99053157916946e-05, - "loss": 0.3176, - "step": 370 - }, - { - "epoch": 0.03860611602153815, - "grad_norm": 2.609375, - "learning_rate": 4.989817711729856e-05, - "loss": 0.3318, - "step": 380 - }, - { - "epoch": 0.03962206644315757, - "grad_norm": 2.375, - "learning_rate": 4.98907795730263e-05, - "loss": 0.3234, - "step": 390 - }, - { - "epoch": 0.040638016864777, - "grad_norm": 4.46875, - "learning_rate": 4.988312323579114e-05, - "loss": 0.267, - "step": 400 - }, - { - "epoch": 0.04165396728639642, - "grad_norm": 3.75, - "learning_rate": 4.98752081851971e-05, - "loss": 0.3081, - "step": 410 - }, - { - "epoch": 0.04266991770801585, - "grad_norm": 2.203125, - "learning_rate": 4.986703450353809e-05, - "loss": 0.2917, - "step": 420 - }, - { - "epoch": 0.04368586812963527, - "grad_norm": 1.6015625, - "learning_rate": 4.985860227579703e-05, - "loss": 0.2805, - "step": 430 - }, - { - "epoch": 0.0447018185512547, - "grad_norm": 3.140625, - "learning_rate": 4.984991158964499e-05, - "loss": 0.3534, - "step": 440 - }, - { - "epoch": 0.04571776897287412, - "grad_norm": 3.296875, - "learning_rate": 4.9840962535440265e-05, - "loss": 0.335, - "step": 450 - }, - { - "epoch": 0.04673371939449355, - "grad_norm": 3.25, - "learning_rate": 4.983175520622744e-05, - "loss": 0.2544, - "step": 460 - }, - { - "epoch": 0.04774966981611297, - "grad_norm": 2.25, - "learning_rate": 4.982228969773642e-05, - "loss": 0.3449, - "step": 470 - }, - { - "epoch": 0.0487656202377324, - "grad_norm": 4.9375, - "learning_rate": 4.9812566108381435e-05, - "loss": 0.2964, - "step": 480 - }, - { - "epoch": 0.04978157065935182, - "grad_norm": 1.5703125, - "learning_rate": 4.9802584539260035e-05, - "loss": 0.2799, - "step": 490 - }, - { - "epoch": 0.05079752108097125, - "grad_norm": 2.828125, - "learning_rate": 4.979234509415199e-05, - "loss": 0.3231, - "step": 500 - }, - { - "epoch": 0.05181347150259067, - "grad_norm": 2.9375, - "learning_rate": 4.978184787951828e-05, - "loss": 0.2943, - "step": 510 - }, - { - "epoch": 0.0528294219242101, - "grad_norm": 2.34375, - "learning_rate": 4.977109300449992e-05, - "loss": 0.2705, - "step": 520 - }, - { - "epoch": 0.05384537234582952, - "grad_norm": 3.140625, - "learning_rate": 4.9760080580916876e-05, - "loss": 0.2998, - "step": 530 - }, - { - "epoch": 0.05486132276744895, - "grad_norm": 3.5625, - "learning_rate": 4.974881072326688e-05, - "loss": 0.2595, - "step": 540 - }, - { - "epoch": 0.05587727318906837, - "grad_norm": 4.25, - "learning_rate": 4.9737283548724236e-05, - "loss": 0.2803, - "step": 550 - }, - { - "epoch": 0.0568932236106878, - "grad_norm": 4.0625, - "learning_rate": 4.97254991771386e-05, - "loss": 0.3511, - "step": 560 - }, - { - "epoch": 0.05790917403230722, - "grad_norm": 2.515625, - "learning_rate": 4.971345773103377e-05, - "loss": 0.312, - "step": 570 - }, - { - "epoch": 0.05892512445392665, - "grad_norm": 3.21875, - "learning_rate": 4.9701159335606365e-05, - "loss": 0.2482, - "step": 580 - }, - { - "epoch": 0.05994107487554607, - "grad_norm": 5.5, - "learning_rate": 4.968860411872454e-05, - "loss": 0.2537, - "step": 590 - }, - { - "epoch": 0.0609570252971655, - "grad_norm": 3.546875, - "learning_rate": 4.967579221092666e-05, - "loss": 0.3125, - "step": 600 - }, - { - "epoch": 0.06197297571878492, - "grad_norm": 2.984375, - "learning_rate": 4.966272374541996e-05, - "loss": 0.2354, - "step": 610 - }, - { - "epoch": 0.06298892614040434, - "grad_norm": 3.6875, - "learning_rate": 4.964939885807912e-05, - "loss": 0.3213, - "step": 620 - }, - { - "epoch": 0.06400487656202378, - "grad_norm": 2.140625, - "learning_rate": 4.9635817687444876e-05, - "loss": 0.3003, - "step": 630 - }, - { - "epoch": 0.0650208269836432, - "grad_norm": 3.484375, - "learning_rate": 4.962198037472259e-05, - "loss": 0.2996, - "step": 640 - }, - { - "epoch": 0.06603677740526262, - "grad_norm": 3.21875, - "learning_rate": 4.9607887063780776e-05, - "loss": 0.2257, - "step": 650 - }, - { - "epoch": 0.06705272782688204, - "grad_norm": 5.375, - "learning_rate": 4.9593537901149564e-05, - "loss": 0.223, - "step": 660 - }, - { - "epoch": 0.06806867824850148, - "grad_norm": 4.1875, - "learning_rate": 4.957893303601924e-05, - "loss": 0.3407, - "step": 670 - }, - { - "epoch": 0.0690846286701209, - "grad_norm": 3.328125, - "learning_rate": 4.956407262023866e-05, - "loss": 0.2589, - "step": 680 - }, - { - "epoch": 0.07010057909174032, - "grad_norm": 2.953125, - "learning_rate": 4.954895680831367e-05, - "loss": 0.2949, - "step": 690 - }, - { - "epoch": 0.07111652951335974, - "grad_norm": 4.0625, - "learning_rate": 4.9533585757405506e-05, - "loss": 0.2995, - "step": 700 - }, - { - "epoch": 0.07213247993497918, - "grad_norm": 4.625, - "learning_rate": 4.951795962732917e-05, - "loss": 0.2894, - "step": 710 - }, - { - "epoch": 0.0731484303565986, - "grad_norm": 3.0, - "learning_rate": 4.9502078580551755e-05, - "loss": 0.3082, - "step": 720 - }, - { - "epoch": 0.07416438077821802, - "grad_norm": 3.65625, - "learning_rate": 4.9485942782190734e-05, - "loss": 0.2308, - "step": 730 - }, - { - "epoch": 0.07518033119983744, - "grad_norm": 4.78125, - "learning_rate": 4.9469552400012306e-05, - "loss": 0.2272, - "step": 740 - }, - { - "epoch": 0.07619628162145688, - "grad_norm": 4.25, - "learning_rate": 4.94529076044296e-05, - "loss": 0.2701, - "step": 750 - }, - { - "epoch": 0.0772122320430763, - "grad_norm": 3.140625, - "learning_rate": 4.94360085685009e-05, - "loss": 0.2686, - "step": 760 - }, - { - "epoch": 0.07822818246469572, - "grad_norm": 0.765625, - "learning_rate": 4.9418855467927894e-05, - "loss": 0.2051, - "step": 770 - }, - { - "epoch": 0.07924413288631514, - "grad_norm": 1.796875, - "learning_rate": 4.940144848105379e-05, - "loss": 0.2267, - "step": 780 - }, - { - "epoch": 0.08026008330793458, - "grad_norm": 4.5625, - "learning_rate": 4.93837877888615e-05, - "loss": 0.2597, - "step": 790 - }, - { - "epoch": 0.081276033729554, - "grad_norm": 3.03125, - "learning_rate": 4.9365873574971745e-05, - "loss": 0.3701, - "step": 800 - }, - { - "epoch": 0.08229198415117342, - "grad_norm": 4.5625, - "learning_rate": 4.9347706025641136e-05, - "loss": 0.2559, - "step": 810 - }, - { - "epoch": 0.08330793457279284, - "grad_norm": 3.90625, - "learning_rate": 4.9329285329760275e-05, - "loss": 0.2799, - "step": 820 - }, - { - "epoch": 0.08432388499441228, - "grad_norm": 3.140625, - "learning_rate": 4.9310611678851735e-05, - "loss": 0.2866, - "step": 830 - }, - { - "epoch": 0.0853398354160317, - "grad_norm": 2.46875, - "learning_rate": 4.929168526706811e-05, - "loss": 0.3105, - "step": 840 - }, - { - "epoch": 0.08635578583765112, - "grad_norm": 13.625, - "learning_rate": 4.927250629119e-05, - "loss": 0.2454, - "step": 850 - }, - { - "epoch": 0.08737173625927054, - "grad_norm": 3.921875, - "learning_rate": 4.9253074950623925e-05, - "loss": 0.2424, - "step": 860 - }, - { - "epoch": 0.08838768668088998, - "grad_norm": 2.90625, - "learning_rate": 4.9233391447400286e-05, - "loss": 0.2481, - "step": 870 - }, - { - "epoch": 0.0894036371025094, - "grad_norm": 2.96875, - "learning_rate": 4.921345598617125e-05, - "loss": 0.2231, - "step": 880 - }, - { - "epoch": 0.09041958752412882, - "grad_norm": 5.375, - "learning_rate": 4.9193268774208654e-05, - "loss": 0.3447, - "step": 890 - }, - { - "epoch": 0.09143553794574824, - "grad_norm": 2.0, - "learning_rate": 4.9172830021401785e-05, - "loss": 0.229, - "step": 900 - }, - { - "epoch": 0.09245148836736768, - "grad_norm": 3.1875, - "learning_rate": 4.9152139940255245e-05, - "loss": 0.2122, - "step": 910 - }, - { - "epoch": 0.0934674387889871, - "grad_norm": 3.40625, - "learning_rate": 4.913119874588677e-05, - "loss": 0.2386, - "step": 920 - }, - { - "epoch": 0.09448338921060652, - "grad_norm": 1.4609375, - "learning_rate": 4.911000665602489e-05, - "loss": 0.1944, - "step": 930 - }, - { - "epoch": 0.09549933963222594, - "grad_norm": 5.0625, - "learning_rate": 4.9088563891006786e-05, - "loss": 0.2038, - "step": 940 - }, - { - "epoch": 0.09651529005384538, - "grad_norm": 4.53125, - "learning_rate": 4.906687067377592e-05, - "loss": 0.3122, - "step": 950 - }, - { - "epoch": 0.0975312404754648, - "grad_norm": 2.84375, - "learning_rate": 4.904492722987976e-05, - "loss": 0.3157, - "step": 960 - }, - { - "epoch": 0.09854719089708422, - "grad_norm": 2.171875, - "learning_rate": 4.902273378746738e-05, - "loss": 0.3077, - "step": 970 - }, - { - "epoch": 0.09956314131870364, - "grad_norm": 2.84375, - "learning_rate": 4.9000290577287165e-05, - "loss": 0.2756, - "step": 980 - }, - { - "epoch": 0.10057909174032308, - "grad_norm": 0.99609375, - "learning_rate": 4.897759783268434e-05, - "loss": 0.2915, - "step": 990 - }, - { - "epoch": 0.1015950421619425, - "grad_norm": 3.53125, - "learning_rate": 4.895465578959859e-05, - "loss": 0.2052, - "step": 1000 - }, - { - "epoch": 0.10261099258356192, - "grad_norm": 4.0, - "learning_rate": 4.893146468656159e-05, - "loss": 0.2499, - "step": 1010 - }, - { - "epoch": 0.10362694300518134, - "grad_norm": 1.65625, - "learning_rate": 4.890802476469452e-05, - "loss": 0.278, - "step": 1020 - }, - { - "epoch": 0.10464289342680078, - "grad_norm": 3.625, - "learning_rate": 4.888433626770558e-05, - "loss": 0.2143, - "step": 1030 - }, - { - "epoch": 0.1056588438484202, - "grad_norm": 5.0625, - "learning_rate": 4.886039944188741e-05, - "loss": 0.2878, - "step": 1040 - }, - { - "epoch": 0.10667479427003962, - "grad_norm": 4.5, - "learning_rate": 4.883621453611461e-05, - "loss": 0.2744, - "step": 1050 - }, - { - "epoch": 0.10769074469165904, - "grad_norm": 4.5625, - "learning_rate": 4.881178180184106e-05, - "loss": 0.2734, - "step": 1060 - }, - { - "epoch": 0.10870669511327848, - "grad_norm": 3.125, - "learning_rate": 4.878710149309735e-05, - "loss": 0.3574, - "step": 1070 - }, - { - "epoch": 0.1097226455348979, - "grad_norm": 3.0625, - "learning_rate": 4.876217386648816e-05, - "loss": 0.2625, - "step": 1080 - }, - { - "epoch": 0.11073859595651732, - "grad_norm": 4.0625, - "learning_rate": 4.873699918118955e-05, - "loss": 0.2437, - "step": 1090 - }, - { - "epoch": 0.11175454637813674, - "grad_norm": 1.59375, - "learning_rate": 4.87115776989463e-05, - "loss": 0.2051, - "step": 1100 - }, - { - "epoch": 0.11277049679975618, - "grad_norm": 4.375, - "learning_rate": 4.8685909684069153e-05, - "loss": 0.1727, - "step": 1110 - }, - { - "epoch": 0.1137864472213756, - "grad_norm": 2.28125, - "learning_rate": 4.865999540343211e-05, - "loss": 0.2256, - "step": 1120 - }, - { - "epoch": 0.11480239764299502, - "grad_norm": 2.265625, - "learning_rate": 4.86338351264696e-05, - "loss": 0.3529, - "step": 1130 - }, - { - "epoch": 0.11581834806461444, - "grad_norm": 2.34375, - "learning_rate": 4.8607429125173754e-05, - "loss": 0.2113, - "step": 1140 - }, - { - "epoch": 0.11683429848623388, - "grad_norm": 0.7578125, - "learning_rate": 4.858077767409149e-05, - "loss": 0.2759, - "step": 1150 - }, - { - "epoch": 0.1178502489078533, - "grad_norm": 3.640625, - "learning_rate": 4.855388105032174e-05, - "loss": 0.2482, - "step": 1160 - }, - { - "epoch": 0.11886619932947272, - "grad_norm": 3.5, - "learning_rate": 4.852673953351249e-05, - "loss": 0.1865, - "step": 1170 - }, - { - "epoch": 0.11988214975109214, - "grad_norm": 3.75, - "learning_rate": 4.849935340585796e-05, - "loss": 0.2659, - "step": 1180 - }, - { - "epoch": 0.12089810017271158, - "grad_norm": 3.375, - "learning_rate": 4.8471722952095586e-05, - "loss": 0.1506, - "step": 1190 - }, - { - "epoch": 0.121914050594331, - "grad_norm": 3.34375, - "learning_rate": 4.844384845950312e-05, - "loss": 0.307, - "step": 1200 - }, - { - "epoch": 0.12293000101595042, - "grad_norm": 1.578125, - "learning_rate": 4.841573021789561e-05, - "loss": 0.1952, - "step": 1210 - }, - { - "epoch": 0.12394595143756984, - "grad_norm": 1.2890625, - "learning_rate": 4.838736851962239e-05, - "loss": 0.1779, - "step": 1220 - }, - { - "epoch": 0.12496190185918928, - "grad_norm": 1.265625, - "learning_rate": 4.835876365956408e-05, - "loss": 0.1235, - "step": 1230 - }, - { - "epoch": 0.12597785228080868, - "grad_norm": 1.9609375, - "learning_rate": 4.8329915935129436e-05, - "loss": 0.1876, - "step": 1240 - }, - { - "epoch": 0.12699380270242813, - "grad_norm": 1.6328125, - "learning_rate": 4.830082564625235e-05, - "loss": 0.2188, - "step": 1250 - }, - { - "epoch": 0.12800975312404755, - "grad_norm": 3.96875, - "learning_rate": 4.8271493095388684e-05, - "loss": 0.2622, - "step": 1260 - }, - { - "epoch": 0.12902570354566698, - "grad_norm": 3.765625, - "learning_rate": 4.824191858751312e-05, - "loss": 0.2724, - "step": 1270 - }, - { - "epoch": 0.1300416539672864, - "grad_norm": 5.59375, - "learning_rate": 4.821210243011601e-05, - "loss": 0.2413, - "step": 1280 - }, - { - "epoch": 0.13105760438890582, - "grad_norm": 3.34375, - "learning_rate": 4.818204493320016e-05, - "loss": 0.2618, - "step": 1290 - }, - { - "epoch": 0.13207355481052524, - "grad_norm": 2.78125, - "learning_rate": 4.8151746409277634e-05, - "loss": 0.2295, - "step": 1300 - }, - { - "epoch": 0.13308950523214466, - "grad_norm": 3.1875, - "learning_rate": 4.8121207173366484e-05, - "loss": 0.2733, - "step": 1310 - }, - { - "epoch": 0.13410545565376408, - "grad_norm": 2.28125, - "learning_rate": 4.809042754298746e-05, - "loss": 0.2311, - "step": 1320 - }, - { - "epoch": 0.13512140607538353, - "grad_norm": 2.171875, - "learning_rate": 4.805940783816075e-05, - "loss": 0.2059, - "step": 1330 - }, - { - "epoch": 0.13613735649700295, - "grad_norm": 2.796875, - "learning_rate": 4.8028148381402625e-05, - "loss": 0.2102, - "step": 1340 - }, - { - "epoch": 0.13715330691862238, - "grad_norm": 2.96875, - "learning_rate": 4.7996649497722084e-05, - "loss": 0.2708, - "step": 1350 - }, - { - "epoch": 0.1381692573402418, - "grad_norm": 2.4375, - "learning_rate": 4.7964911514617485e-05, - "loss": 0.2429, - "step": 1360 - }, - { - "epoch": 0.13918520776186122, - "grad_norm": 5.8125, - "learning_rate": 4.793293476207312e-05, - "loss": 0.2725, - "step": 1370 - }, - { - "epoch": 0.14020115818348064, - "grad_norm": 2.40625, - "learning_rate": 4.790071957255585e-05, - "loss": 0.2098, - "step": 1380 - }, - { - "epoch": 0.14121710860510006, - "grad_norm": 4.25, - "learning_rate": 4.786826628101154e-05, - "loss": 0.2101, - "step": 1390 - }, - { - "epoch": 0.14223305902671948, - "grad_norm": 2.578125, - "learning_rate": 4.783557522486167e-05, - "loss": 0.2624, - "step": 1400 - }, - { - "epoch": 0.14324900944833893, - "grad_norm": 3.125, - "learning_rate": 4.780264674399978e-05, - "loss": 0.2518, - "step": 1410 - }, - { - "epoch": 0.14426495986995835, - "grad_norm": 3.671875, - "learning_rate": 4.7769481180787966e-05, - "loss": 0.3112, - "step": 1420 - }, - { - "epoch": 0.14528091029157778, - "grad_norm": 3.984375, - "learning_rate": 4.773607888005327e-05, - "loss": 0.2747, - "step": 1430 - }, - { - "epoch": 0.1462968607131972, - "grad_norm": 3.234375, - "learning_rate": 4.770244018908416e-05, - "loss": 0.1572, - "step": 1440 - }, - { - "epoch": 0.14731281113481662, - "grad_norm": 4.09375, - "learning_rate": 4.766856545762687e-05, - "loss": 0.2148, - "step": 1450 - }, - { - "epoch": 0.14832876155643604, - "grad_norm": 1.6875, - "learning_rate": 4.763445503788178e-05, - "loss": 0.2531, - "step": 1460 - }, - { - "epoch": 0.14934471197805546, - "grad_norm": 2.375, - "learning_rate": 4.760010928449976e-05, - "loss": 0.199, - "step": 1470 - }, - { - "epoch": 0.15036066239967488, - "grad_norm": 4.6875, - "learning_rate": 4.7565528554578485e-05, - "loss": 0.2366, - "step": 1480 - }, - { - "epoch": 0.15137661282129433, - "grad_norm": 5.4375, - "learning_rate": 4.75307132076587e-05, - "loss": 0.1862, - "step": 1490 - }, - { - "epoch": 0.15239256324291375, - "grad_norm": 2.484375, - "learning_rate": 4.749566360572049e-05, - "loss": 0.2143, - "step": 1500 - }, - { - "epoch": 0.15340851366453317, - "grad_norm": 2.1875, - "learning_rate": 4.746038011317955e-05, - "loss": 0.1877, - "step": 1510 - }, - { - "epoch": 0.1544244640861526, - "grad_norm": 2.84375, - "learning_rate": 4.742486309688333e-05, - "loss": 0.2831, - "step": 1520 - }, - { - "epoch": 0.15544041450777202, - "grad_norm": 2.015625, - "learning_rate": 4.738911292610732e-05, - "loss": 0.1708, - "step": 1530 - }, - { - "epoch": 0.15645636492939144, - "grad_norm": 3.953125, - "learning_rate": 4.735312997255107e-05, - "loss": 0.192, - "step": 1540 - }, - { - "epoch": 0.15747231535101086, - "grad_norm": 2.09375, - "learning_rate": 4.7316914610334475e-05, - "loss": 0.2586, - "step": 1550 - }, - { - "epoch": 0.15848826577263028, - "grad_norm": 3.6875, - "learning_rate": 4.728046721599378e-05, - "loss": 0.2141, - "step": 1560 - }, - { - "epoch": 0.15950421619424973, - "grad_norm": 2.9375, - "learning_rate": 4.724378816847771e-05, - "loss": 0.193, - "step": 1570 - }, - { - "epoch": 0.16052016661586915, - "grad_norm": 1.5625, - "learning_rate": 4.720687784914352e-05, - "loss": 0.191, - "step": 1580 - }, - { - "epoch": 0.16153611703748857, - "grad_norm": 3.75, - "learning_rate": 4.716973664175304e-05, - "loss": 0.2172, - "step": 1590 - }, - { - "epoch": 0.162552067459108, - "grad_norm": 3.125, - "learning_rate": 4.7132364932468645e-05, - "loss": 0.2134, - "step": 1600 - }, - { - "epoch": 0.16356801788072742, - "grad_norm": 4.09375, - "learning_rate": 4.709476310984932e-05, - "loss": 0.2055, - "step": 1610 - }, - { - "epoch": 0.16458396830234684, - "grad_norm": 3.875, - "learning_rate": 4.705693156484652e-05, - "loss": 0.2136, - "step": 1620 - }, - { - "epoch": 0.16559991872396626, - "grad_norm": 1.1796875, - "learning_rate": 4.7018870690800196e-05, - "loss": 0.1471, - "step": 1630 - }, - { - "epoch": 0.16661586914558568, - "grad_norm": 2.5, - "learning_rate": 4.698058088343465e-05, - "loss": 0.2308, - "step": 1640 - }, - { - "epoch": 0.16763181956720513, - "grad_norm": 1.390625, - "learning_rate": 4.6942062540854425e-05, - "loss": 0.2456, - "step": 1650 - }, - { - "epoch": 0.16864776998882455, - "grad_norm": 3.125, - "learning_rate": 4.69033160635402e-05, - "loss": 0.2654, - "step": 1660 - }, - { - "epoch": 0.16966372041044397, - "grad_norm": 3.984375, - "learning_rate": 4.6864341854344587e-05, - "loss": 0.2226, - "step": 1670 - }, - { - "epoch": 0.1706796708320634, - "grad_norm": 2.328125, - "learning_rate": 4.682514031848795e-05, - "loss": 0.2438, - "step": 1680 - }, - { - "epoch": 0.17169562125368282, - "grad_norm": 3.078125, - "learning_rate": 4.678571186355423e-05, - "loss": 0.1889, - "step": 1690 - }, - { - "epoch": 0.17271157167530224, - "grad_norm": 3.328125, - "learning_rate": 4.6746056899486644e-05, - "loss": 0.2117, - "step": 1700 - }, - { - "epoch": 0.17372752209692166, - "grad_norm": 2.78125, - "learning_rate": 4.67061758385835e-05, - "loss": 0.1953, - "step": 1710 - }, - { - "epoch": 0.17474347251854108, - "grad_norm": 3.09375, - "learning_rate": 4.6666069095493816e-05, - "loss": 0.1844, - "step": 1720 - }, - { - "epoch": 0.17575942294016053, - "grad_norm": 3.234375, - "learning_rate": 4.662573708721309e-05, - "loss": 0.2774, - "step": 1730 - }, - { - "epoch": 0.17677537336177995, - "grad_norm": 4.03125, - "learning_rate": 4.658518023307894e-05, - "loss": 0.2527, - "step": 1740 - }, - { - "epoch": 0.17779132378339937, - "grad_norm": 3.21875, - "learning_rate": 4.654439895476671e-05, - "loss": 0.2164, - "step": 1750 - }, - { - "epoch": 0.1788072742050188, - "grad_norm": 2.390625, - "learning_rate": 4.6503393676285146e-05, - "loss": 0.2424, - "step": 1760 - }, - { - "epoch": 0.17982322462663822, - "grad_norm": 1.8359375, - "learning_rate": 4.646216482397192e-05, - "loss": 0.2428, - "step": 1770 - }, - { - "epoch": 0.18083917504825764, - "grad_norm": 2.796875, - "learning_rate": 4.6420712826489275e-05, - "loss": 0.2155, - "step": 1780 - }, - { - "epoch": 0.18185512546987706, - "grad_norm": 0.69921875, - "learning_rate": 4.6379038114819485e-05, - "loss": 0.1544, - "step": 1790 - }, - { - "epoch": 0.18287107589149648, - "grad_norm": 3.40625, - "learning_rate": 4.6337141122260444e-05, - "loss": 0.2029, - "step": 1800 - }, - { - "epoch": 0.18388702631311593, - "grad_norm": 2.359375, - "learning_rate": 4.629502228442112e-05, - "loss": 0.1489, - "step": 1810 - }, - { - "epoch": 0.18490297673473535, - "grad_norm": 1.4453125, - "learning_rate": 4.6252682039217045e-05, - "loss": 0.2101, - "step": 1820 - }, - { - "epoch": 0.18591892715635477, - "grad_norm": 2.71875, - "learning_rate": 4.621012082686573e-05, - "loss": 0.2076, - "step": 1830 - }, - { - "epoch": 0.1869348775779742, - "grad_norm": 3.0625, - "learning_rate": 4.616733908988216e-05, - "loss": 0.2719, - "step": 1840 - }, - { - "epoch": 0.18795082799959362, - "grad_norm": 1.953125, - "learning_rate": 4.612433727307409e-05, - "loss": 0.2105, - "step": 1850 - }, - { - "epoch": 0.18896677842121304, - "grad_norm": 3.46875, - "learning_rate": 4.608111582353751e-05, - "loss": 0.1877, - "step": 1860 - }, - { - "epoch": 0.18998272884283246, - "grad_norm": 2.546875, - "learning_rate": 4.603767519065197e-05, - "loss": 0.2238, - "step": 1870 - }, - { - "epoch": 0.19099867926445188, - "grad_norm": 1.5703125, - "learning_rate": 4.599401582607589e-05, - "loss": 0.243, - "step": 1880 - }, - { - "epoch": 0.19201462968607133, - "grad_norm": 2.4375, - "learning_rate": 4.595013818374185e-05, - "loss": 0.1867, - "step": 1890 - }, - { - "epoch": 0.19303058010769075, - "grad_norm": 2.203125, - "learning_rate": 4.5906042719851925e-05, - "loss": 0.1994, - "step": 1900 - }, - { - "epoch": 0.19404653052931017, - "grad_norm": 3.984375, - "learning_rate": 4.586172989287291e-05, - "loss": 0.1899, - "step": 1910 - }, - { - "epoch": 0.1950624809509296, - "grad_norm": 2.6875, - "learning_rate": 4.5817200163531534e-05, - "loss": 0.2528, - "step": 1920 - }, - { - "epoch": 0.19607843137254902, - "grad_norm": 2.71875, - "learning_rate": 4.577245399480972e-05, - "loss": 0.2336, - "step": 1930 - }, - { - "epoch": 0.19709438179416844, - "grad_norm": 2.640625, - "learning_rate": 4.5727491851939715e-05, - "loss": 0.2204, - "step": 1940 - }, - { - "epoch": 0.19811033221578786, - "grad_norm": 1.78125, - "learning_rate": 4.568231420239929e-05, - "loss": 0.1656, - "step": 1950 - }, - { - "epoch": 0.19912628263740728, - "grad_norm": 3.15625, - "learning_rate": 4.563692151590687e-05, - "loss": 0.2105, - "step": 1960 - }, - { - "epoch": 0.20014223305902673, - "grad_norm": 1.3671875, - "learning_rate": 4.5591314264416666e-05, - "loss": 0.1464, - "step": 1970 - }, - { - "epoch": 0.20115818348064615, - "grad_norm": 4.25, - "learning_rate": 4.554549292211371e-05, - "loss": 0.2103, - "step": 1980 - }, - { - "epoch": 0.20217413390226557, - "grad_norm": 2.984375, - "learning_rate": 4.549945796540901e-05, - "loss": 0.144, - "step": 1990 - }, - { - "epoch": 0.203190084323885, - "grad_norm": 1.859375, - "learning_rate": 4.545320987293453e-05, - "loss": 0.1963, - "step": 2000 - }, - { - "epoch": 0.20420603474550442, - "grad_norm": 1.078125, - "learning_rate": 4.540674912553824e-05, - "loss": 0.2115, - "step": 2010 - }, - { - "epoch": 0.20522198516712384, - "grad_norm": 4.25, - "learning_rate": 4.536007620627911e-05, - "loss": 0.1682, - "step": 2020 - }, - { - "epoch": 0.20623793558874326, - "grad_norm": 2.71875, - "learning_rate": 4.531319160042212e-05, - "loss": 0.1992, - "step": 2030 - }, - { - "epoch": 0.20725388601036268, - "grad_norm": 1.2890625, - "learning_rate": 4.5266095795433126e-05, - "loss": 0.1134, - "step": 2040 - }, - { - "epoch": 0.20826983643198213, - "grad_norm": 3.296875, - "learning_rate": 4.5218789280973925e-05, - "loss": 0.1474, - "step": 2050 - }, - { - "epoch": 0.20928578685360155, - "grad_norm": 1.9375, - "learning_rate": 4.5171272548897024e-05, - "loss": 0.1955, - "step": 2060 - }, - { - "epoch": 0.21030173727522097, - "grad_norm": 2.734375, - "learning_rate": 4.512354609324063e-05, - "loss": 0.2042, - "step": 2070 - }, - { - "epoch": 0.2113176876968404, - "grad_norm": 2.921875, - "learning_rate": 4.507561041022347e-05, - "loss": 0.2174, - "step": 2080 - }, - { - "epoch": 0.21233363811845982, - "grad_norm": 2.40625, - "learning_rate": 4.502746599823963e-05, - "loss": 0.2634, - "step": 2090 - }, - { - "epoch": 0.21334958854007924, - "grad_norm": 1.71875, - "learning_rate": 4.497911335785339e-05, - "loss": 0.1884, - "step": 2100 - }, - { - "epoch": 0.21436553896169866, - "grad_norm": 0.79296875, - "learning_rate": 4.4930552991794e-05, - "loss": 0.1872, - "step": 2110 - }, - { - "epoch": 0.21538148938331808, - "grad_norm": 3.171875, - "learning_rate": 4.4881785404950474e-05, - "loss": 0.2233, - "step": 2120 - }, - { - "epoch": 0.21639743980493753, - "grad_norm": 2.59375, - "learning_rate": 4.483281110436631e-05, - "loss": 0.2374, - "step": 2130 - }, - { - "epoch": 0.21741339022655695, - "grad_norm": 3.328125, - "learning_rate": 4.478363059923426e-05, - "loss": 0.2545, - "step": 2140 - }, - { - "epoch": 0.21842934064817637, - "grad_norm": 2.3125, - "learning_rate": 4.4734244400891014e-05, - "loss": 0.2063, - "step": 2150 - }, - { - "epoch": 0.2194452910697958, - "grad_norm": 3.40625, - "learning_rate": 4.4684653022811865e-05, - "loss": 0.1219, - "step": 2160 - }, - { - "epoch": 0.22046124149141522, - "grad_norm": 4.1875, - "learning_rate": 4.463485698060541e-05, - "loss": 0.2805, - "step": 2170 - }, - { - "epoch": 0.22147719191303464, - "grad_norm": 2.3125, - "learning_rate": 4.458485679200814e-05, - "loss": 0.1998, - "step": 2180 - }, - { - "epoch": 0.22249314233465406, - "grad_norm": 3.578125, - "learning_rate": 4.453465297687912e-05, - "loss": 0.2489, - "step": 2190 - }, - { - "epoch": 0.22350909275627348, - "grad_norm": 2.59375, - "learning_rate": 4.448424605719452e-05, - "loss": 0.2731, - "step": 2200 - }, - { - "epoch": 0.22452504317789293, - "grad_norm": 3.28125, - "learning_rate": 4.443363655704224e-05, - "loss": 0.2425, - "step": 2210 - }, - { - "epoch": 0.22554099359951235, - "grad_norm": 2.78125, - "learning_rate": 4.438282500261641e-05, - "loss": 0.2938, - "step": 2220 - }, - { - "epoch": 0.22655694402113177, - "grad_norm": 1.1953125, - "learning_rate": 4.433181192221197e-05, - "loss": 0.1728, - "step": 2230 - }, - { - "epoch": 0.2275728944427512, - "grad_norm": 1.34375, - "learning_rate": 4.4280597846219155e-05, - "loss": 0.216, - "step": 2240 - }, - { - "epoch": 0.22858884486437062, - "grad_norm": 1.8515625, - "learning_rate": 4.422918330711796e-05, - "loss": 0.1612, - "step": 2250 - }, - { - "epoch": 0.22960479528599004, - "grad_norm": 1.90625, - "learning_rate": 4.417756883947263e-05, - "loss": 0.107, - "step": 2260 - }, - { - "epoch": 0.23062074570760946, - "grad_norm": 3.375, - "learning_rate": 4.412575497992611e-05, - "loss": 0.1756, - "step": 2270 - }, - { - "epoch": 0.23163669612922888, - "grad_norm": 4.375, - "learning_rate": 4.407374226719445e-05, - "loss": 0.234, - "step": 2280 - }, - { - "epoch": 0.23265264655084833, - "grad_norm": 3.25, - "learning_rate": 4.402153124206119e-05, - "loss": 0.2144, - "step": 2290 - }, - { - "epoch": 0.23366859697246775, - "grad_norm": 1.703125, - "learning_rate": 4.396912244737173e-05, - "loss": 0.1696, - "step": 2300 - }, - { - "epoch": 0.23468454739408717, - "grad_norm": 2.84375, - "learning_rate": 4.391651642802778e-05, - "loss": 0.2506, - "step": 2310 - }, - { - "epoch": 0.2357004978157066, - "grad_norm": 4.5, - "learning_rate": 4.386371373098155e-05, - "loss": 0.1686, - "step": 2320 - }, - { - "epoch": 0.23671644823732602, - "grad_norm": 2.515625, - "learning_rate": 4.381071490523018e-05, - "loss": 0.2403, - "step": 2330 - }, - { - "epoch": 0.23773239865894544, - "grad_norm": 4.4375, - "learning_rate": 4.3757520501809955e-05, - "loss": 0.1611, - "step": 2340 - }, - { - "epoch": 0.23874834908056486, - "grad_norm": 1.609375, - "learning_rate": 4.370413107379065e-05, - "loss": 0.1698, - "step": 2350 - }, - { - "epoch": 0.23976429950218428, - "grad_norm": 4.96875, - "learning_rate": 4.36505471762697e-05, - "loss": 0.1928, - "step": 2360 - }, - { - "epoch": 0.24078024992380373, - "grad_norm": 0.8984375, - "learning_rate": 4.3596769366366474e-05, - "loss": 0.2035, - "step": 2370 - }, - { - "epoch": 0.24179620034542315, - "grad_norm": 5.75, - "learning_rate": 4.354279820321649e-05, - "loss": 0.16, - "step": 2380 - }, - { - "epoch": 0.24281215076704257, - "grad_norm": 1.9453125, - "learning_rate": 4.34886342479656e-05, - "loss": 0.1851, - "step": 2390 - }, - { - "epoch": 0.243828101188662, - "grad_norm": 1.015625, - "learning_rate": 4.34342780637641e-05, - "loss": 0.1726, - "step": 2400 - }, - { - "epoch": 0.24484405161028142, - "grad_norm": 4.59375, - "learning_rate": 4.337973021576095e-05, - "loss": 0.2847, - "step": 2410 - }, - { - "epoch": 0.24586000203190084, - "grad_norm": 1.03125, - "learning_rate": 4.3324991271097846e-05, - "loss": 0.2528, - "step": 2420 - }, - { - "epoch": 0.24687595245352026, - "grad_norm": 2.1875, - "learning_rate": 4.3270061798903374e-05, - "loss": 0.1573, - "step": 2430 - }, - { - "epoch": 0.24789190287513968, - "grad_norm": 0.98046875, - "learning_rate": 4.321494237028701e-05, - "loss": 0.1703, - "step": 2440 - }, - { - "epoch": 0.24890785329675913, - "grad_norm": 3.8125, - "learning_rate": 4.31596335583333e-05, - "loss": 0.2613, - "step": 2450 - }, - { - "epoch": 0.24992380371837855, - "grad_norm": 4.0625, - "learning_rate": 4.310413593809579e-05, - "loss": 0.22, - "step": 2460 - }, - { - "epoch": 0.250939754139998, - "grad_norm": 3.15625, - "learning_rate": 4.304845008659108e-05, - "loss": 0.1263, - "step": 2470 - }, - { - "epoch": 0.25195570456161737, - "grad_norm": 3.046875, - "learning_rate": 4.2992576582792895e-05, - "loss": 0.1639, - "step": 2480 - }, - { - "epoch": 0.2529716549832368, - "grad_norm": 9.8125, - "learning_rate": 4.293651600762595e-05, - "loss": 0.2681, - "step": 2490 - }, - { - "epoch": 0.25398760540485626, - "grad_norm": 3.734375, - "learning_rate": 4.288026894395999e-05, - "loss": 0.2292, - "step": 2500 - }, - { - "epoch": 0.25500355582647566, - "grad_norm": 0.455078125, - "learning_rate": 4.2823835976603723e-05, - "loss": 0.2324, - "step": 2510 - }, - { - "epoch": 0.2560195062480951, - "grad_norm": 5.625, - "learning_rate": 4.276721769229869e-05, - "loss": 0.1834, - "step": 2520 - }, - { - "epoch": 0.2570354566697145, - "grad_norm": 1.3671875, - "learning_rate": 4.271041467971323e-05, - "loss": 0.1826, - "step": 2530 - }, - { - "epoch": 0.25805140709133395, - "grad_norm": 5.0625, - "learning_rate": 4.265342752943632e-05, - "loss": 0.2463, - "step": 2540 - }, - { - "epoch": 0.25906735751295334, - "grad_norm": 2.859375, - "learning_rate": 4.2596256833971425e-05, - "loss": 0.2598, - "step": 2550 - }, - { - "epoch": 0.2600833079345728, - "grad_norm": 1.8515625, - "learning_rate": 4.2538903187730374e-05, - "loss": 0.1148, - "step": 2560 - }, - { - "epoch": 0.26109925835619224, - "grad_norm": 2.71875, - "learning_rate": 4.248136718702716e-05, - "loss": 0.2123, - "step": 2570 - }, - { - "epoch": 0.26211520877781164, - "grad_norm": 4.5625, - "learning_rate": 4.242364943007172e-05, - "loss": 0.2369, - "step": 2580 - }, - { - "epoch": 0.2631311591994311, - "grad_norm": 2.296875, - "learning_rate": 4.236575051696377e-05, - "loss": 0.261, - "step": 2590 - }, - { - "epoch": 0.2641471096210505, - "grad_norm": 2.75, - "learning_rate": 4.2307671049686514e-05, - "loss": 0.1564, - "step": 2600 - }, - { - "epoch": 0.26516306004266993, - "grad_norm": 3.5, - "learning_rate": 4.2249411632100396e-05, - "loss": 0.1563, - "step": 2610 - }, - { - "epoch": 0.2661790104642893, - "grad_norm": 2.84375, - "learning_rate": 4.219097286993684e-05, - "loss": 0.1697, - "step": 2620 - }, - { - "epoch": 0.26719496088590877, - "grad_norm": 2.125, - "learning_rate": 4.2132355370791946e-05, - "loss": 0.1844, - "step": 2630 - }, - { - "epoch": 0.26821091130752817, - "grad_norm": 4.03125, - "learning_rate": 4.2073559744120156e-05, - "loss": 0.2144, - "step": 2640 - }, - { - "epoch": 0.2692268617291476, - "grad_norm": 2.375, - "learning_rate": 4.201458660122793e-05, - "loss": 0.2013, - "step": 2650 - }, - { - "epoch": 0.27024281215076706, - "grad_norm": 3.625, - "learning_rate": 4.1955436555267393e-05, - "loss": 0.2166, - "step": 2660 - }, - { - "epoch": 0.27125876257238646, - "grad_norm": 0.328125, - "learning_rate": 4.189611022122997e-05, - "loss": 0.1934, - "step": 2670 - }, - { - "epoch": 0.2722747129940059, - "grad_norm": 2.75, - "learning_rate": 4.1836608215939944e-05, - "loss": 0.2157, - "step": 2680 - }, - { - "epoch": 0.2732906634156253, - "grad_norm": 3.5, - "learning_rate": 4.17769311580481e-05, - "loss": 0.18, - "step": 2690 - }, - { - "epoch": 0.27430661383724475, - "grad_norm": 2.109375, - "learning_rate": 4.171707966802528e-05, - "loss": 0.2178, - "step": 2700 - }, - { - "epoch": 0.27532256425886414, - "grad_norm": 4.65625, - "learning_rate": 4.16570543681559e-05, - "loss": 0.1896, - "step": 2710 - }, - { - "epoch": 0.2763385146804836, - "grad_norm": 4.8125, - "learning_rate": 4.159685588253151e-05, - "loss": 0.1322, - "step": 2720 - }, - { - "epoch": 0.27735446510210304, - "grad_norm": 3.9375, - "learning_rate": 4.153648483704429e-05, - "loss": 0.184, - "step": 2730 - }, - { - "epoch": 0.27837041552372244, - "grad_norm": 4.53125, - "learning_rate": 4.147594185938057e-05, - "loss": 0.2451, - "step": 2740 - }, - { - "epoch": 0.2793863659453419, - "grad_norm": 1.0390625, - "learning_rate": 4.141522757901426e-05, - "loss": 0.2367, - "step": 2750 - }, - { - "epoch": 0.2804023163669613, - "grad_norm": 3.375, - "learning_rate": 4.1354342627200345e-05, - "loss": 0.179, - "step": 2760 - }, - { - "epoch": 0.28141826678858073, - "grad_norm": 2.953125, - "learning_rate": 4.1293287636968286e-05, - "loss": 0.1396, - "step": 2770 - }, - { - "epoch": 0.2824342172102001, - "grad_norm": 2.546875, - "learning_rate": 4.1232063243115485e-05, - "loss": 0.1963, - "step": 2780 - }, - { - "epoch": 0.28345016763181957, - "grad_norm": 5.09375, - "learning_rate": 4.117067008220063e-05, - "loss": 0.2457, - "step": 2790 - }, - { - "epoch": 0.28446611805343897, - "grad_norm": 2.046875, - "learning_rate": 4.110910879253712e-05, - "loss": 0.2262, - "step": 2800 - }, - { - "epoch": 0.2854820684750584, - "grad_norm": 2.1875, - "learning_rate": 4.104738001418641e-05, - "loss": 0.2499, - "step": 2810 - }, - { - "epoch": 0.28649801889667786, - "grad_norm": 2.59375, - "learning_rate": 4.098548438895135e-05, - "loss": 0.1667, - "step": 2820 - }, - { - "epoch": 0.28751396931829726, - "grad_norm": 2.875, - "learning_rate": 4.092342256036954e-05, - "loss": 0.2288, - "step": 2830 - }, - { - "epoch": 0.2885299197399167, - "grad_norm": 3.015625, - "learning_rate": 4.086119517370659e-05, - "loss": 0.2038, - "step": 2840 - }, - { - "epoch": 0.2895458701615361, - "grad_norm": 3.53125, - "learning_rate": 4.0798802875949485e-05, - "loss": 0.181, - "step": 2850 - }, - { - "epoch": 0.29056182058315555, - "grad_norm": 2.296875, - "learning_rate": 4.073624631579975e-05, - "loss": 0.1886, - "step": 2860 - }, - { - "epoch": 0.29157777100477494, - "grad_norm": 3.609375, - "learning_rate": 4.067352614366685e-05, - "loss": 0.2053, - "step": 2870 - }, - { - "epoch": 0.2925937214263944, - "grad_norm": 2.328125, - "learning_rate": 4.061064301166128e-05, - "loss": 0.1409, - "step": 2880 - }, - { - "epoch": 0.29360967184801384, - "grad_norm": 4.9375, - "learning_rate": 4.054759757358787e-05, - "loss": 0.184, - "step": 2890 - }, - { - "epoch": 0.29462562226963324, - "grad_norm": 4.6875, - "learning_rate": 4.048439048493898e-05, - "loss": 0.2306, - "step": 2900 - }, - { - "epoch": 0.2956415726912527, - "grad_norm": 4.09375, - "learning_rate": 4.0421022402887676e-05, - "loss": 0.1914, - "step": 2910 - }, - { - "epoch": 0.2966575231128721, - "grad_norm": 2.3125, - "learning_rate": 4.035749398628088e-05, - "loss": 0.1653, - "step": 2920 - }, - { - "epoch": 0.29767347353449153, - "grad_norm": 2.515625, - "learning_rate": 4.029380589563256e-05, - "loss": 0.1941, - "step": 2930 - }, - { - "epoch": 0.2986894239561109, - "grad_norm": 1.78125, - "learning_rate": 4.02299587931168e-05, - "loss": 0.1117, - "step": 2940 - }, - { - "epoch": 0.29970537437773037, - "grad_norm": 0.8359375, - "learning_rate": 4.0165953342560974e-05, - "loss": 0.1605, - "step": 2950 - }, - { - "epoch": 0.30072132479934977, - "grad_norm": 3.046875, - "learning_rate": 4.010179020943884e-05, - "loss": 0.1726, - "step": 2960 - }, - { - "epoch": 0.3017372752209692, - "grad_norm": 3.453125, - "learning_rate": 4.003747006086357e-05, - "loss": 0.2208, - "step": 2970 - }, - { - "epoch": 0.30275322564258866, - "grad_norm": 2.515625, - "learning_rate": 3.9972993565580866e-05, - "loss": 0.1325, - "step": 2980 - }, - { - "epoch": 0.30376917606420806, - "grad_norm": 3.046875, - "learning_rate": 3.9908361393962e-05, - "loss": 0.2014, - "step": 2990 - }, - { - "epoch": 0.3047851264858275, - "grad_norm": 2.28125, - "learning_rate": 3.984357421799681e-05, - "loss": 0.165, - "step": 3000 - }, - { - "epoch": 0.3058010769074469, - "grad_norm": 5.09375, - "learning_rate": 3.9778632711286756e-05, - "loss": 0.212, - "step": 3010 - }, - { - "epoch": 0.30681702732906635, - "grad_norm": 4.25, - "learning_rate": 3.971353754903788e-05, - "loss": 0.2388, - "step": 3020 - }, - { - "epoch": 0.30783297775068574, - "grad_norm": 2.34375, - "learning_rate": 3.964828940805381e-05, - "loss": 0.2175, - "step": 3030 - }, - { - "epoch": 0.3088489281723052, - "grad_norm": 4.09375, - "learning_rate": 3.95828889667287e-05, - "loss": 0.2088, - "step": 3040 - }, - { - "epoch": 0.30986487859392464, - "grad_norm": 2.359375, - "learning_rate": 3.9517336905040244e-05, - "loss": 0.1913, - "step": 3050 - }, - { - "epoch": 0.31088082901554404, - "grad_norm": 1.1640625, - "learning_rate": 3.9451633904542483e-05, - "loss": 0.2185, - "step": 3060 - }, - { - "epoch": 0.3118967794371635, - "grad_norm": 2.59375, - "learning_rate": 3.9385780648358846e-05, - "loss": 0.2072, - "step": 3070 - }, - { - "epoch": 0.3129127298587829, - "grad_norm": 3.015625, - "learning_rate": 3.9319777821174955e-05, - "loss": 0.1902, - "step": 3080 - }, - { - "epoch": 0.31392868028040233, - "grad_norm": 2.375, - "learning_rate": 3.925362610923158e-05, - "loss": 0.259, - "step": 3090 - }, - { - "epoch": 0.3149446307020217, - "grad_norm": 4.65625, - "learning_rate": 3.918732620031742e-05, - "loss": 0.2026, - "step": 3100 - }, - { - "epoch": 0.31596058112364117, - "grad_norm": 2.1875, - "learning_rate": 3.912087878376205e-05, - "loss": 0.1478, - "step": 3110 - }, - { - "epoch": 0.31697653154526056, - "grad_norm": 2.34375, - "learning_rate": 3.905428455042865e-05, - "loss": 0.167, - "step": 3120 - }, - { - "epoch": 0.31799248196688, - "grad_norm": 2.390625, - "learning_rate": 3.898754419270693e-05, - "loss": 0.1629, - "step": 3130 - }, - { - "epoch": 0.31900843238849946, - "grad_norm": 1.546875, - "learning_rate": 3.892065840450583e-05, - "loss": 0.1308, - "step": 3140 - }, - { - "epoch": 0.32002438281011886, - "grad_norm": 4.625, - "learning_rate": 3.885362788124637e-05, - "loss": 0.2008, - "step": 3150 - }, - { - "epoch": 0.3210403332317383, - "grad_norm": 3.8125, - "learning_rate": 3.8786453319854396e-05, - "loss": 0.2225, - "step": 3160 - }, - { - "epoch": 0.3220562836533577, - "grad_norm": 3.015625, - "learning_rate": 3.8719135418753366e-05, - "loss": 0.2243, - "step": 3170 - }, - { - "epoch": 0.32307223407497715, - "grad_norm": 5.6875, - "learning_rate": 3.865167487785702e-05, - "loss": 0.1981, - "step": 3180 - }, - { - "epoch": 0.32408818449659654, - "grad_norm": 4.84375, - "learning_rate": 3.8584072398562164e-05, - "loss": 0.2031, - "step": 3190 - }, - { - "epoch": 0.325104134918216, - "grad_norm": 4.0625, - "learning_rate": 3.851632868374136e-05, - "loss": 0.1621, - "step": 3200 - }, - { - "epoch": 0.32612008533983544, - "grad_norm": 3.421875, - "learning_rate": 3.844844443773562e-05, - "loss": 0.1674, - "step": 3210 - }, - { - "epoch": 0.32713603576145484, - "grad_norm": 1.3671875, - "learning_rate": 3.8380420366347046e-05, - "loss": 0.1502, - "step": 3220 - }, - { - "epoch": 0.3281519861830743, - "grad_norm": 3.734375, - "learning_rate": 3.831225717683157e-05, - "loss": 0.1868, - "step": 3230 - }, - { - "epoch": 0.3291679366046937, - "grad_norm": 2.703125, - "learning_rate": 3.8243955577891534e-05, - "loss": 0.1818, - "step": 3240 - }, - { - "epoch": 0.3301838870263131, - "grad_norm": 3.796875, - "learning_rate": 3.8175516279668335e-05, - "loss": 0.2215, - "step": 3250 - }, - { - "epoch": 0.3311998374479325, - "grad_norm": 3.203125, - "learning_rate": 3.810693999373505e-05, - "loss": 0.2544, - "step": 3260 - }, - { - "epoch": 0.33221578786955197, - "grad_norm": 4.0, - "learning_rate": 3.8038227433089056e-05, - "loss": 0.1175, - "step": 3270 - }, - { - "epoch": 0.33323173829117136, - "grad_norm": 3.625, - "learning_rate": 3.796937931214458e-05, - "loss": 0.2213, - "step": 3280 - }, - { - "epoch": 0.3342476887127908, - "grad_norm": 1.7265625, - "learning_rate": 3.7900396346725296e-05, - "loss": 0.1711, - "step": 3290 - }, - { - "epoch": 0.33526363913441026, - "grad_norm": 3.140625, - "learning_rate": 3.783127925405686e-05, - "loss": 0.2628, - "step": 3300 - }, - { - "epoch": 0.33627958955602966, - "grad_norm": 2.1875, - "learning_rate": 3.77620287527595e-05, - "loss": 0.1671, - "step": 3310 - }, - { - "epoch": 0.3372955399776491, - "grad_norm": 5.28125, - "learning_rate": 3.769264556284048e-05, - "loss": 0.2109, - "step": 3320 - }, - { - "epoch": 0.3383114903992685, - "grad_norm": 2.875, - "learning_rate": 3.762313040568665e-05, - "loss": 0.1978, - "step": 3330 - }, - { - "epoch": 0.33932744082088795, - "grad_norm": 2.234375, - "learning_rate": 3.755348400405697e-05, - "loss": 0.1275, - "step": 3340 - }, - { - "epoch": 0.34034339124250734, - "grad_norm": 1.9453125, - "learning_rate": 3.7483707082074945e-05, - "loss": 0.1482, - "step": 3350 - }, - { - "epoch": 0.3413593416641268, - "grad_norm": 5.40625, - "learning_rate": 3.741380036522111e-05, - "loss": 0.1933, - "step": 3360 - }, - { - "epoch": 0.34237529208574624, - "grad_norm": 4.53125, - "learning_rate": 3.734376458032551e-05, - "loss": 0.1925, - "step": 3370 - }, - { - "epoch": 0.34339124250736563, - "grad_norm": 4.0625, - "learning_rate": 3.727360045556014e-05, - "loss": 0.2297, - "step": 3380 - }, - { - "epoch": 0.3444071929289851, - "grad_norm": 2.53125, - "learning_rate": 3.7203308720431336e-05, - "loss": 0.1704, - "step": 3390 - }, - { - "epoch": 0.3454231433506045, - "grad_norm": 1.859375, - "learning_rate": 3.7132890105772234e-05, - "loss": 0.258, - "step": 3400 - }, - { - "epoch": 0.3464390937722239, - "grad_norm": 3.90625, - "learning_rate": 3.706234534373515e-05, - "loss": 0.2376, - "step": 3410 - }, - { - "epoch": 0.3474550441938433, - "grad_norm": 1.1015625, - "learning_rate": 3.6991675167783985e-05, - "loss": 0.2403, - "step": 3420 - }, - { - "epoch": 0.34847099461546277, - "grad_norm": 1.1640625, - "learning_rate": 3.6920880312686556e-05, - "loss": 0.1642, - "step": 3430 - }, - { - "epoch": 0.34948694503708216, - "grad_norm": 2.875, - "learning_rate": 3.684996151450702e-05, - "loss": 0.1455, - "step": 3440 - }, - { - "epoch": 0.3505028954587016, - "grad_norm": 0.59765625, - "learning_rate": 3.6778919510598155e-05, - "loss": 0.2175, - "step": 3450 - }, - { - "epoch": 0.35151884588032106, - "grad_norm": 0.93359375, - "learning_rate": 3.670775503959376e-05, - "loss": 0.1858, - "step": 3460 - }, - { - "epoch": 0.35253479630194046, - "grad_norm": 4.1875, - "learning_rate": 3.6636468841400917e-05, - "loss": 0.1911, - "step": 3470 - }, - { - "epoch": 0.3535507467235599, - "grad_norm": 3.734375, - "learning_rate": 3.656506165719233e-05, - "loss": 0.2114, - "step": 3480 - }, - { - "epoch": 0.3545666971451793, - "grad_norm": 1.171875, - "learning_rate": 3.649353422939863e-05, - "loss": 0.1841, - "step": 3490 - }, - { - "epoch": 0.35558264756679875, - "grad_norm": 2.53125, - "learning_rate": 3.6421887301700615e-05, - "loss": 0.1505, - "step": 3500 - }, - { - "epoch": 0.35659859798841814, - "grad_norm": 4.9375, - "learning_rate": 3.6350121619021524e-05, - "loss": 0.2625, - "step": 3510 - }, - { - "epoch": 0.3576145484100376, - "grad_norm": 5.25, - "learning_rate": 3.627823792751936e-05, - "loss": 0.1676, - "step": 3520 - }, - { - "epoch": 0.35863049883165704, - "grad_norm": 1.09375, - "learning_rate": 3.620623697457905e-05, - "loss": 0.1963, - "step": 3530 - }, - { - "epoch": 0.35964644925327643, - "grad_norm": 4.03125, - "learning_rate": 3.613411950880468e-05, - "loss": 0.2048, - "step": 3540 - }, - { - "epoch": 0.3606623996748959, - "grad_norm": 4.40625, - "learning_rate": 3.606188628001178e-05, - "loss": 0.226, - "step": 3550 - }, - { - "epoch": 0.3616783500965153, - "grad_norm": 2.375, - "learning_rate": 3.598953803921947e-05, - "loss": 0.1884, - "step": 3560 - }, - { - "epoch": 0.3626943005181347, - "grad_norm": 3.21875, - "learning_rate": 3.591707553864266e-05, - "loss": 0.224, - "step": 3570 - }, - { - "epoch": 0.3637102509397541, - "grad_norm": 3.5625, - "learning_rate": 3.584449953168423e-05, - "loss": 0.1866, - "step": 3580 - }, - { - "epoch": 0.36472620136137357, - "grad_norm": 2.359375, - "learning_rate": 3.577181077292722e-05, - "loss": 0.1663, - "step": 3590 - }, - { - "epoch": 0.36574215178299296, - "grad_norm": 5.0, - "learning_rate": 3.569901001812696e-05, - "loss": 0.2032, - "step": 3600 - }, - { - "epoch": 0.3667581022046124, - "grad_norm": 1.953125, - "learning_rate": 3.562609802420321e-05, - "loss": 0.2395, - "step": 3610 - }, - { - "epoch": 0.36777405262623186, - "grad_norm": 3.796875, - "learning_rate": 3.555307554923229e-05, - "loss": 0.1799, - "step": 3620 - }, - { - "epoch": 0.36879000304785126, - "grad_norm": 4.4375, - "learning_rate": 3.547994335243925e-05, - "loss": 0.1771, - "step": 3630 - }, - { - "epoch": 0.3698059534694707, - "grad_norm": 1.890625, - "learning_rate": 3.540670219418989e-05, - "loss": 0.2123, - "step": 3640 - }, - { - "epoch": 0.3708219038910901, - "grad_norm": 4.03125, - "learning_rate": 3.53333528359829e-05, - "loss": 0.2159, - "step": 3650 - }, - { - "epoch": 0.37183785431270955, - "grad_norm": 3.265625, - "learning_rate": 3.525989604044198e-05, - "loss": 0.2749, - "step": 3660 - }, - { - "epoch": 0.37285380473432894, - "grad_norm": 1.4375, - "learning_rate": 3.5186332571307826e-05, - "loss": 0.1613, - "step": 3670 - }, - { - "epoch": 0.3738697551559484, - "grad_norm": 3.984375, - "learning_rate": 3.511266319343025e-05, - "loss": 0.1877, - "step": 3680 - }, - { - "epoch": 0.37488570557756784, - "grad_norm": 2.203125, - "learning_rate": 3.503888867276022e-05, - "loss": 0.2185, - "step": 3690 - }, - { - "epoch": 0.37590165599918723, - "grad_norm": 1.5078125, - "learning_rate": 3.4965009776341894e-05, - "loss": 0.2195, - "step": 3700 - }, - { - "epoch": 0.3769176064208067, - "grad_norm": 4.375, - "learning_rate": 3.489102727230461e-05, - "loss": 0.2344, - "step": 3710 - }, - { - "epoch": 0.3779335568424261, - "grad_norm": 2.984375, - "learning_rate": 3.481694192985496e-05, - "loss": 0.1863, - "step": 3720 - }, - { - "epoch": 0.3789495072640455, - "grad_norm": 1.1328125, - "learning_rate": 3.474275451926875e-05, - "loss": 0.1894, - "step": 3730 - }, - { - "epoch": 0.3799654576856649, - "grad_norm": 2.265625, - "learning_rate": 3.4668465811883e-05, - "loss": 0.2127, - "step": 3740 - }, - { - "epoch": 0.38098140810728437, - "grad_norm": 2.921875, - "learning_rate": 3.4594076580087914e-05, - "loss": 0.2125, - "step": 3750 - }, - { - "epoch": 0.38199735852890376, - "grad_norm": 2.390625, - "learning_rate": 3.451958759731889e-05, - "loss": 0.1801, - "step": 3760 - }, - { - "epoch": 0.3830133089505232, - "grad_norm": 3.046875, - "learning_rate": 3.4444999638048456e-05, - "loss": 0.1949, - "step": 3770 - }, - { - "epoch": 0.38402925937214266, - "grad_norm": 2.890625, - "learning_rate": 3.437031347777817e-05, - "loss": 0.2719, - "step": 3780 - }, - { - "epoch": 0.38504520979376206, - "grad_norm": 3.9375, - "learning_rate": 3.4295529893030634e-05, - "loss": 0.1697, - "step": 3790 - }, - { - "epoch": 0.3860611602153815, - "grad_norm": 2.0625, - "learning_rate": 3.422064966134138e-05, - "loss": 0.1557, - "step": 3800 - }, - { - "epoch": 0.3870771106370009, - "grad_norm": 2.234375, - "learning_rate": 3.4145673561250794e-05, - "loss": 0.2129, - "step": 3810 - }, - { - "epoch": 0.38809306105862035, - "grad_norm": 4.96875, - "learning_rate": 3.4070602372296e-05, - "loss": 0.2068, - "step": 3820 - }, - { - "epoch": 0.38910901148023974, - "grad_norm": 2.234375, - "learning_rate": 3.39954368750028e-05, - "loss": 0.1634, - "step": 3830 - }, - { - "epoch": 0.3901249619018592, - "grad_norm": 1.75, - "learning_rate": 3.392017785087752e-05, - "loss": 0.2299, - "step": 3840 - }, - { - "epoch": 0.39114091232347864, - "grad_norm": 3.90625, - "learning_rate": 3.38448260823989e-05, - "loss": 0.1585, - "step": 3850 - }, - { - "epoch": 0.39215686274509803, - "grad_norm": 2.8125, - "learning_rate": 3.376938235300996e-05, - "loss": 0.2382, - "step": 3860 - }, - { - "epoch": 0.3931728131667175, - "grad_norm": 5.375, - "learning_rate": 3.369384744710984e-05, - "loss": 0.1987, - "step": 3870 - }, - { - "epoch": 0.3941887635883369, - "grad_norm": 2.578125, - "learning_rate": 3.361822215004566e-05, - "loss": 0.2316, - "step": 3880 - }, - { - "epoch": 0.3952047140099563, - "grad_norm": 2.0, - "learning_rate": 3.354250724810436e-05, - "loss": 0.2019, - "step": 3890 - }, - { - "epoch": 0.3962206644315757, - "grad_norm": 2.3125, - "learning_rate": 3.34667035285045e-05, - "loss": 0.187, - "step": 3900 - }, - { - "epoch": 0.39723661485319517, - "grad_norm": 3.53125, - "learning_rate": 3.339081177938811e-05, - "loss": 0.2353, - "step": 3910 - }, - { - "epoch": 0.39825256527481456, - "grad_norm": 1.9609375, - "learning_rate": 3.331483278981244e-05, - "loss": 0.2078, - "step": 3920 - }, - { - "epoch": 0.399268515696434, - "grad_norm": 1.2109375, - "learning_rate": 3.323876734974183e-05, - "loss": 0.1761, - "step": 3930 - }, - { - "epoch": 0.40028446611805346, - "grad_norm": 4.0625, - "learning_rate": 3.316261625003943e-05, - "loss": 0.2081, - "step": 3940 - }, - { - "epoch": 0.40130041653967286, - "grad_norm": 1.953125, - "learning_rate": 3.308638028245902e-05, - "loss": 0.2087, - "step": 3950 - }, - { - "epoch": 0.4023163669612923, - "grad_norm": 2.390625, - "learning_rate": 3.301006023963676e-05, - "loss": 0.1579, - "step": 3960 - }, - { - "epoch": 0.4033323173829117, - "grad_norm": 3.53125, - "learning_rate": 3.293365691508295e-05, - "loss": 0.1904, - "step": 3970 - }, - { - "epoch": 0.40434826780453115, - "grad_norm": 3.0, - "learning_rate": 3.285717110317379e-05, - "loss": 0.1991, - "step": 3980 - }, - { - "epoch": 0.40536421822615054, - "grad_norm": 7.21875, - "learning_rate": 3.27806035991431e-05, - "loss": 0.1445, - "step": 3990 - }, - { - "epoch": 0.40638016864777, - "grad_norm": 1.0859375, - "learning_rate": 3.2703955199074075e-05, - "loss": 0.2393, - "step": 4000 - }, - { - "epoch": 0.40739611906938944, - "grad_norm": 4.5625, - "learning_rate": 3.262722669989098e-05, - "loss": 0.1789, - "step": 4010 - }, - { - "epoch": 0.40841206949100883, - "grad_norm": 3.09375, - "learning_rate": 3.255041889935092e-05, - "loss": 0.1511, - "step": 4020 - }, - { - "epoch": 0.4094280199126283, - "grad_norm": 1.90625, - "learning_rate": 3.247353259603547e-05, - "loss": 0.2066, - "step": 4030 - }, - { - "epoch": 0.4104439703342477, - "grad_norm": 2.28125, - "learning_rate": 3.239656858934242e-05, - "loss": 0.1564, - "step": 4040 - }, - { - "epoch": 0.4114599207558671, - "grad_norm": 2.609375, - "learning_rate": 3.231952767947746e-05, - "loss": 0.1503, - "step": 4050 - }, - { - "epoch": 0.4124758711774865, - "grad_norm": 1.4453125, - "learning_rate": 3.2242410667445844e-05, - "loss": 0.1633, - "step": 4060 - }, - { - "epoch": 0.41349182159910597, - "grad_norm": 3.015625, - "learning_rate": 3.2165218355044076e-05, - "loss": 0.1492, - "step": 4070 - }, - { - "epoch": 0.41450777202072536, - "grad_norm": 3.234375, - "learning_rate": 3.2087951544851566e-05, - "loss": 0.3051, - "step": 4080 - }, - { - "epoch": 0.4155237224423448, - "grad_norm": 2.9375, - "learning_rate": 3.20106110402223e-05, - "loss": 0.2229, - "step": 4090 - }, - { - "epoch": 0.41653967286396426, - "grad_norm": 3.171875, - "learning_rate": 3.1933197645276455e-05, - "loss": 0.2224, - "step": 4100 - }, - { - "epoch": 0.41755562328558365, - "grad_norm": 2.09375, - "learning_rate": 3.185571216489209e-05, - "loss": 0.1297, - "step": 4110 - }, - { - "epoch": 0.4185715737072031, - "grad_norm": 3.625, - "learning_rate": 3.177815540469669e-05, - "loss": 0.2074, - "step": 4120 - }, - { - "epoch": 0.4195875241288225, - "grad_norm": 2.296875, - "learning_rate": 3.1700528171058916e-05, - "loss": 0.1949, - "step": 4130 - }, - { - "epoch": 0.42060347455044195, - "grad_norm": 3.8125, - "learning_rate": 3.162283127108011e-05, - "loss": 0.1661, - "step": 4140 - }, - { - "epoch": 0.42161942497206134, - "grad_norm": 2.5, - "learning_rate": 3.154506551258594e-05, - "loss": 0.2275, - "step": 4150 - }, - { - "epoch": 0.4226353753936808, - "grad_norm": 2.96875, - "learning_rate": 3.146723170411804e-05, - "loss": 0.2242, - "step": 4160 - }, - { - "epoch": 0.42365132581530024, - "grad_norm": 6.625, - "learning_rate": 3.138933065492552e-05, - "loss": 0.1897, - "step": 4170 - }, - { - "epoch": 0.42466727623691963, - "grad_norm": 0.8515625, - "learning_rate": 3.131136317495665e-05, - "loss": 0.1629, - "step": 4180 - }, - { - "epoch": 0.4256832266585391, - "grad_norm": 0.94140625, - "learning_rate": 3.1233330074850364e-05, - "loss": 0.1535, - "step": 4190 - }, - { - "epoch": 0.4266991770801585, - "grad_norm": 2.6875, - "learning_rate": 3.115523216592786e-05, - "loss": 0.2494, - "step": 4200 - }, - { - "epoch": 0.4277151275017779, - "grad_norm": 2.578125, - "learning_rate": 3.107707026018417e-05, - "loss": 0.1705, - "step": 4210 - }, - { - "epoch": 0.4287310779233973, - "grad_norm": 3.0625, - "learning_rate": 3.09988451702797e-05, - "loss": 0.1507, - "step": 4220 - }, - { - "epoch": 0.42974702834501677, - "grad_norm": 2.421875, - "learning_rate": 3.0920557709531804e-05, - "loss": 0.3071, - "step": 4230 - }, - { - "epoch": 0.43076297876663616, - "grad_norm": 3.640625, - "learning_rate": 3.0842208691906306e-05, - "loss": 0.199, - "step": 4240 - }, - { - "epoch": 0.4317789291882556, - "grad_norm": 3.5625, - "learning_rate": 3.076379893200904e-05, - "loss": 0.1987, - "step": 4250 - }, - { - "epoch": 0.43279487960987506, - "grad_norm": 3.65625, - "learning_rate": 3.068532924507739e-05, - "loss": 0.1945, - "step": 4260 - }, - { - "epoch": 0.43381083003149445, - "grad_norm": 5.875, - "learning_rate": 3.060680044697183e-05, - "loss": 0.1937, - "step": 4270 - }, - { - "epoch": 0.4348267804531139, - "grad_norm": 2.859375, - "learning_rate": 3.052821335416739e-05, - "loss": 0.1643, - "step": 4280 - }, - { - "epoch": 0.4358427308747333, - "grad_norm": 3.296875, - "learning_rate": 3.0449568783745203e-05, - "loss": 0.1455, - "step": 4290 - }, - { - "epoch": 0.43685868129635275, - "grad_norm": 0.427734375, - "learning_rate": 3.0370867553384023e-05, - "loss": 0.1891, - "step": 4300 - }, - { - "epoch": 0.43787463171797214, - "grad_norm": 0.361328125, - "learning_rate": 3.029211048135171e-05, - "loss": 0.1377, - "step": 4310 - }, - { - "epoch": 0.4388905821395916, - "grad_norm": 1.8203125, - "learning_rate": 3.021329838649668e-05, - "loss": 0.2194, - "step": 4320 - }, - { - "epoch": 0.43990653256121104, - "grad_norm": 1.8828125, - "learning_rate": 3.0134432088239462e-05, - "loss": 0.1915, - "step": 4330 - }, - { - "epoch": 0.44092248298283043, - "grad_norm": 2.015625, - "learning_rate": 3.0055512406564146e-05, - "loss": 0.1794, - "step": 4340 - }, - { - "epoch": 0.4419384334044499, - "grad_norm": 2.546875, - "learning_rate": 2.9976540162009836e-05, - "loss": 0.2154, - "step": 4350 - }, - { - "epoch": 0.4429543838260693, - "grad_norm": 4.09375, - "learning_rate": 2.9897516175662155e-05, - "loss": 0.1861, - "step": 4360 - }, - { - "epoch": 0.4439703342476887, - "grad_norm": 3.953125, - "learning_rate": 2.9818441269144693e-05, - "loss": 0.1857, - "step": 4370 - }, - { - "epoch": 0.4449862846693081, - "grad_norm": 2.234375, - "learning_rate": 2.9739316264610452e-05, - "loss": 0.1493, - "step": 4380 - }, - { - "epoch": 0.44600223509092757, - "grad_norm": 1.109375, - "learning_rate": 2.966014198473332e-05, - "loss": 0.186, - "step": 4390 - }, - { - "epoch": 0.44701818551254696, - "grad_norm": 4.5625, - "learning_rate": 2.9580919252699502e-05, - "loss": 0.1963, - "step": 4400 - }, - { - "epoch": 0.4480341359341664, - "grad_norm": 7.3125, - "learning_rate": 2.9501648892198984e-05, - "loss": 0.2882, - "step": 4410 - }, - { - "epoch": 0.44905008635578586, - "grad_norm": 3.03125, - "learning_rate": 2.942233172741693e-05, - "loss": 0.2154, - "step": 4420 - }, - { - "epoch": 0.45006603677740525, - "grad_norm": 2.421875, - "learning_rate": 2.934296858302515e-05, - "loss": 0.2228, - "step": 4430 - }, - { - "epoch": 0.4510819871990247, - "grad_norm": 1.6015625, - "learning_rate": 2.9263560284173485e-05, - "loss": 0.1637, - "step": 4440 - }, - { - "epoch": 0.4520979376206441, - "grad_norm": 4.5, - "learning_rate": 2.91841076564813e-05, - "loss": 0.1396, - "step": 4450 - }, - { - "epoch": 0.45311388804226355, - "grad_norm": 1.9609375, - "learning_rate": 2.9104611526028808e-05, - "loss": 0.186, - "step": 4460 - }, - { - "epoch": 0.45412983846388294, - "grad_norm": 2.046875, - "learning_rate": 2.902507271934855e-05, - "loss": 0.1706, - "step": 4470 - }, - { - "epoch": 0.4551457888855024, - "grad_norm": 2.390625, - "learning_rate": 2.8945492063416768e-05, - "loss": 0.2191, - "step": 4480 - }, - { - "epoch": 0.45616173930712184, - "grad_norm": 2.734375, - "learning_rate": 2.8865870385644823e-05, - "loss": 0.1651, - "step": 4490 - }, - { - "epoch": 0.45717768972874123, - "grad_norm": 4.4375, - "learning_rate": 2.8786208513870583e-05, - "loss": 0.1907, - "step": 4500 - }, - { - "epoch": 0.4581936401503607, - "grad_norm": 1.9609375, - "learning_rate": 2.8706507276349815e-05, - "loss": 0.2256, - "step": 4510 - }, - { - "epoch": 0.4592095905719801, - "grad_norm": 3.375, - "learning_rate": 2.8626767501747588e-05, - "loss": 0.215, - "step": 4520 - }, - { - "epoch": 0.4602255409935995, - "grad_norm": 2.296875, - "learning_rate": 2.854699001912964e-05, - "loss": 0.2241, - "step": 4530 - }, - { - "epoch": 0.4612414914152189, - "grad_norm": 2.078125, - "learning_rate": 2.846717565795376e-05, - "loss": 0.1541, - "step": 4540 - }, - { - "epoch": 0.46225744183683837, - "grad_norm": 0.81640625, - "learning_rate": 2.8387325248061164e-05, - "loss": 0.1718, - "step": 4550 - }, - { - "epoch": 0.46327339225845776, - "grad_norm": 5.6875, - "learning_rate": 2.8307439619667897e-05, - "loss": 0.259, - "step": 4560 - }, - { - "epoch": 0.4642893426800772, - "grad_norm": 1.78125, - "learning_rate": 2.8227519603356157e-05, - "loss": 0.2205, - "step": 4570 - }, - { - "epoch": 0.46530529310169666, - "grad_norm": 4.78125, - "learning_rate": 2.8147566030065677e-05, - "loss": 0.2256, - "step": 4580 - }, - { - "epoch": 0.46632124352331605, - "grad_norm": 3.296875, - "learning_rate": 2.8067579731085085e-05, - "loss": 0.1671, - "step": 4590 - }, - { - "epoch": 0.4673371939449355, - "grad_norm": 3.265625, - "learning_rate": 2.7987561538043273e-05, - "loss": 0.2471, - "step": 4600 - }, - { - "epoch": 0.4683531443665549, - "grad_norm": 3.390625, - "learning_rate": 2.7907512282900727e-05, - "loss": 0.1749, - "step": 4610 - }, - { - "epoch": 0.46936909478817435, - "grad_norm": 3.140625, - "learning_rate": 2.782743279794091e-05, - "loss": 0.2276, - "step": 4620 - }, - { - "epoch": 0.47038504520979374, - "grad_norm": 2.921875, - "learning_rate": 2.7747323915761574e-05, - "loss": 0.1971, - "step": 4630 - }, - { - "epoch": 0.4714009956314132, - "grad_norm": 4.15625, - "learning_rate": 2.7667186469266122e-05, - "loss": 0.1951, - "step": 4640 - }, - { - "epoch": 0.47241694605303264, - "grad_norm": 2.953125, - "learning_rate": 2.7587021291654924e-05, - "loss": 0.2045, - "step": 4650 - }, - { - "epoch": 0.47343289647465203, - "grad_norm": 1.6640625, - "learning_rate": 2.750682921641672e-05, - "loss": 0.155, - "step": 4660 - }, - { - "epoch": 0.4744488468962715, - "grad_norm": 4.375, - "learning_rate": 2.7426611077319864e-05, - "loss": 0.2038, - "step": 4670 - }, - { - "epoch": 0.4754647973178909, - "grad_norm": 5.5, - "learning_rate": 2.734636770840372e-05, - "loss": 0.159, - "step": 4680 - }, - { - "epoch": 0.4764807477395103, - "grad_norm": 1.703125, - "learning_rate": 2.7266099943969976e-05, - "loss": 0.1566, - "step": 4690 - }, - { - "epoch": 0.4774966981611297, - "grad_norm": 0.81640625, - "learning_rate": 2.7185808618573943e-05, - "loss": 0.1927, - "step": 4700 - }, - { - "epoch": 0.47851264858274917, - "grad_norm": 0.81640625, - "learning_rate": 2.710549456701592e-05, - "loss": 0.1873, - "step": 4710 - }, - { - "epoch": 0.47952859900436856, - "grad_norm": 3.828125, - "learning_rate": 2.702515862433247e-05, - "loss": 0.2474, - "step": 4720 - }, - { - "epoch": 0.480544549425988, - "grad_norm": 1.1640625, - "learning_rate": 2.6944801625787795e-05, - "loss": 0.204, - "step": 4730 - }, - { - "epoch": 0.48156049984760746, - "grad_norm": 2.953125, - "learning_rate": 2.6864424406864984e-05, - "loss": 0.1758, - "step": 4740 - }, - { - "epoch": 0.48257645026922685, - "grad_norm": 3.265625, - "learning_rate": 2.6784027803257377e-05, - "loss": 0.161, - "step": 4750 - }, - { - "epoch": 0.4835924006908463, - "grad_norm": 2.046875, - "learning_rate": 2.6703612650859848e-05, - "loss": 0.1469, - "step": 4760 - }, - { - "epoch": 0.4846083511124657, - "grad_norm": 4.03125, - "learning_rate": 2.6623179785760148e-05, - "loss": 0.1858, - "step": 4770 - }, - { - "epoch": 0.48562430153408515, - "grad_norm": 2.65625, - "learning_rate": 2.6542730044230175e-05, - "loss": 0.176, - "step": 4780 - }, - { - "epoch": 0.48664025195570454, - "grad_norm": 2.59375, - "learning_rate": 2.6462264262717278e-05, - "loss": 0.1657, - "step": 4790 - }, - { - "epoch": 0.487656202377324, - "grad_norm": 4.78125, - "learning_rate": 2.6381783277835605e-05, - "loss": 0.2705, - "step": 4800 - }, - { - "epoch": 0.48867215279894344, - "grad_norm": 3.65625, - "learning_rate": 2.6301287926357355e-05, - "loss": 0.2252, - "step": 4810 - }, - { - "epoch": 0.48968810322056283, - "grad_norm": 0.734375, - "learning_rate": 2.622077904520411e-05, - "loss": 0.2141, - "step": 4820 - }, - { - "epoch": 0.4907040536421823, - "grad_norm": 5.15625, - "learning_rate": 2.6140257471438108e-05, - "loss": 0.1935, - "step": 4830 - }, - { - "epoch": 0.4917200040638017, - "grad_norm": 3.625, - "learning_rate": 2.6059724042253574e-05, - "loss": 0.2121, - "step": 4840 - }, - { - "epoch": 0.4927359544854211, - "grad_norm": 1.2890625, - "learning_rate": 2.5979179594967983e-05, - "loss": 0.1221, - "step": 4850 - }, - { - "epoch": 0.4937519049070405, - "grad_norm": 3.4375, - "learning_rate": 2.5898624967013367e-05, - "loss": 0.2208, - "step": 4860 - }, - { - "epoch": 0.49476785532865997, - "grad_norm": 2.40625, - "learning_rate": 2.5818060995927607e-05, - "loss": 0.1904, - "step": 4870 - }, - { - "epoch": 0.49578380575027936, - "grad_norm": 2.921875, - "learning_rate": 2.573748851934574e-05, - "loss": 0.1658, - "step": 4880 - }, - { - "epoch": 0.4967997561718988, - "grad_norm": 1.6640625, - "learning_rate": 2.5656908374991213e-05, - "loss": 0.1626, - "step": 4890 - }, - { - "epoch": 0.49781570659351826, - "grad_norm": 1.8046875, - "learning_rate": 2.557632140066721e-05, - "loss": 0.1905, - "step": 4900 - }, - { - "epoch": 0.49883165701513765, - "grad_norm": 4.875, - "learning_rate": 2.5495728434247917e-05, - "loss": 0.2591, - "step": 4910 - }, - { - "epoch": 0.4998476074367571, - "grad_norm": 1.4453125, - "learning_rate": 2.5415130313669845e-05, - "loss": 0.1359, - "step": 4920 - }, - { - "epoch": 0.5008635578583766, - "grad_norm": 2.109375, - "learning_rate": 2.5334527876923063e-05, - "loss": 0.2353, - "step": 4930 - }, - { - "epoch": 0.501879508279996, - "grad_norm": 3.546875, - "learning_rate": 2.5253921962042525e-05, - "loss": 0.2173, - "step": 4940 - }, - { - "epoch": 0.5028954587016153, - "grad_norm": 1.8125, - "learning_rate": 2.5173313407099373e-05, - "loss": 0.1631, - "step": 4950 - }, - { - "epoch": 0.5039114091232347, - "grad_norm": 2.671875, - "learning_rate": 2.5092703050192163e-05, - "loss": 0.1884, - "step": 4960 - }, - { - "epoch": 0.5049273595448542, - "grad_norm": 2.5625, - "learning_rate": 2.501209172943819e-05, - "loss": 0.217, - "step": 4970 - }, - { - "epoch": 0.5059433099664736, - "grad_norm": 4.375, - "learning_rate": 2.49314802829648e-05, - "loss": 0.1854, - "step": 4980 - }, - { - "epoch": 0.506959260388093, - "grad_norm": 2.3125, - "learning_rate": 2.4850869548900628e-05, - "loss": 0.2049, - "step": 4990 - }, - { - "epoch": 0.5079752108097125, - "grad_norm": 3.859375, - "learning_rate": 2.477026036536688e-05, - "loss": 0.2093, - "step": 5000 - }, - { - "epoch": 0.5089911612313319, - "grad_norm": 1.09375, - "learning_rate": 2.4689653570468677e-05, - "loss": 0.164, - "step": 5010 - }, - { - "epoch": 0.5100071116529513, - "grad_norm": 3.40625, - "learning_rate": 2.460905000228628e-05, - "loss": 0.1649, - "step": 5020 - }, - { - "epoch": 0.5110230620745707, - "grad_norm": 3.546875, - "learning_rate": 2.4528450498866428e-05, - "loss": 0.1777, - "step": 5030 - }, - { - "epoch": 0.5120390124961902, - "grad_norm": 3.0, - "learning_rate": 2.444785589821356e-05, - "loss": 0.1505, - "step": 5040 - }, - { - "epoch": 0.5130549629178096, - "grad_norm": 1.6484375, - "learning_rate": 2.436726703828118e-05, - "loss": 0.2672, - "step": 5050 - }, - { - "epoch": 0.514070913339429, - "grad_norm": 4.34375, - "learning_rate": 2.428668475696308e-05, - "loss": 0.1756, - "step": 5060 - }, - { - "epoch": 0.5150868637610485, - "grad_norm": 2.78125, - "learning_rate": 2.420610989208465e-05, - "loss": 0.1655, - "step": 5070 - }, - { - "epoch": 0.5161028141826679, - "grad_norm": 1.4609375, - "learning_rate": 2.412554328139419e-05, - "loss": 0.1579, - "step": 5080 - }, - { - "epoch": 0.5171187646042873, - "grad_norm": 2.28125, - "learning_rate": 2.404498576255416e-05, - "loss": 0.1599, - "step": 5090 - }, - { - "epoch": 0.5181347150259067, - "grad_norm": 0.6484375, - "learning_rate": 2.3964438173132522e-05, - "loss": 0.1508, - "step": 5100 - }, - { - "epoch": 0.5191506654475262, - "grad_norm": 3.390625, - "learning_rate": 2.388390135059395e-05, - "loss": 0.1578, - "step": 5110 - }, - { - "epoch": 0.5201666158691456, - "grad_norm": 1.21875, - "learning_rate": 2.3803376132291226e-05, - "loss": 0.1374, - "step": 5120 - }, - { - "epoch": 0.521182566290765, - "grad_norm": 4.0625, - "learning_rate": 2.3722863355456436e-05, - "loss": 0.1854, - "step": 5130 - }, - { - "epoch": 0.5221985167123845, - "grad_norm": 4.71875, - "learning_rate": 2.364236385719236e-05, - "loss": 0.1391, - "step": 5140 - }, - { - "epoch": 0.5232144671340039, - "grad_norm": 3.296875, - "learning_rate": 2.356187847446366e-05, - "loss": 0.2106, - "step": 5150 - }, - { - "epoch": 0.5242304175556233, - "grad_norm": 3.296875, - "learning_rate": 2.348140804408829e-05, - "loss": 0.2383, - "step": 5160 - }, - { - "epoch": 0.5252463679772427, - "grad_norm": 3.359375, - "learning_rate": 2.3400953402728713e-05, - "loss": 0.1537, - "step": 5170 - }, - { - "epoch": 0.5262623183988622, - "grad_norm": 1.4921875, - "learning_rate": 2.332051538688322e-05, - "loss": 0.1841, - "step": 5180 - }, - { - "epoch": 0.5272782688204816, - "grad_norm": 3.25, - "learning_rate": 2.3240094832877287e-05, - "loss": 0.1855, - "step": 5190 - }, - { - "epoch": 0.528294219242101, - "grad_norm": 3.34375, - "learning_rate": 2.3159692576854793e-05, - "loss": 0.2625, - "step": 5200 - }, - { - "epoch": 0.5293101696637205, - "grad_norm": 3.6875, - "learning_rate": 2.3079309454769413e-05, - "loss": 0.1292, - "step": 5210 - }, - { - "epoch": 0.5303261200853399, - "grad_norm": 1.1171875, - "learning_rate": 2.2998946302375827e-05, - "loss": 0.1263, - "step": 5220 - }, - { - "epoch": 0.5313420705069593, - "grad_norm": 2.71875, - "learning_rate": 2.2918603955221148e-05, - "loss": 0.2296, - "step": 5230 - }, - { - "epoch": 0.5323580209285786, - "grad_norm": 2.015625, - "learning_rate": 2.283828324863613e-05, - "loss": 0.1231, - "step": 5240 - }, - { - "epoch": 0.5333739713501982, - "grad_norm": 3.671875, - "learning_rate": 2.2757985017726557e-05, - "loss": 0.1939, - "step": 5250 - }, - { - "epoch": 0.5343899217718175, - "grad_norm": 1.9765625, - "learning_rate": 2.2677710097364495e-05, - "loss": 0.168, - "step": 5260 - }, - { - "epoch": 0.5354058721934369, - "grad_norm": 2.609375, - "learning_rate": 2.259745932217969e-05, - "loss": 0.1883, - "step": 5270 - }, - { - "epoch": 0.5364218226150563, - "grad_norm": 2.8125, - "learning_rate": 2.2517233526550817e-05, - "loss": 0.1898, - "step": 5280 - }, - { - "epoch": 0.5374377730366758, - "grad_norm": 3.125, - "learning_rate": 2.2437033544596837e-05, - "loss": 0.1838, - "step": 5290 - }, - { - "epoch": 0.5384537234582952, - "grad_norm": 4.90625, - "learning_rate": 2.2356860210168336e-05, - "loss": 0.1553, - "step": 5300 - }, - { - "epoch": 0.5394696738799146, - "grad_norm": 3.171875, - "learning_rate": 2.2276714356838824e-05, - "loss": 0.2248, - "step": 5310 - }, - { - "epoch": 0.5404856243015341, - "grad_norm": 1.34375, - "learning_rate": 2.2196596817896118e-05, - "loss": 0.1421, - "step": 5320 - }, - { - "epoch": 0.5415015747231535, - "grad_norm": 3.28125, - "learning_rate": 2.2116508426333596e-05, - "loss": 0.1947, - "step": 5330 - }, - { - "epoch": 0.5425175251447729, - "grad_norm": 1.9296875, - "learning_rate": 2.2036450014841652e-05, - "loss": 0.2207, - "step": 5340 - }, - { - "epoch": 0.5435334755663923, - "grad_norm": 0.5703125, - "learning_rate": 2.19564224157989e-05, - "loss": 0.2208, - "step": 5350 - }, - { - "epoch": 0.5445494259880118, - "grad_norm": 7.5625, - "learning_rate": 2.1876426461263654e-05, - "loss": 0.1739, - "step": 5360 - }, - { - "epoch": 0.5455653764096312, - "grad_norm": 2.15625, - "learning_rate": 2.179646298296519e-05, - "loss": 0.1938, - "step": 5370 - }, - { - "epoch": 0.5465813268312506, - "grad_norm": 4.1875, - "learning_rate": 2.171653281229511e-05, - "loss": 0.1736, - "step": 5380 - }, - { - "epoch": 0.5475972772528701, - "grad_norm": 4.65625, - "learning_rate": 2.1636636780298732e-05, - "loss": 0.2167, - "step": 5390 - }, - { - "epoch": 0.5486132276744895, - "grad_norm": 1.84375, - "learning_rate": 2.1556775717666427e-05, - "loss": 0.1711, - "step": 5400 - }, - { - "epoch": 0.5496291780961089, - "grad_norm": 5.125, - "learning_rate": 2.147695045472499e-05, - "loss": 0.1789, - "step": 5410 - }, - { - "epoch": 0.5506451285177283, - "grad_norm": 3.859375, - "learning_rate": 2.1397161821428973e-05, - "loss": 0.2187, - "step": 5420 - }, - { - "epoch": 0.5516610789393478, - "grad_norm": 2.25, - "learning_rate": 2.131741064735212e-05, - "loss": 0.1367, - "step": 5430 - }, - { - "epoch": 0.5526770293609672, - "grad_norm": 4.65625, - "learning_rate": 2.1237697761678684e-05, - "loss": 0.1574, - "step": 5440 - }, - { - "epoch": 0.5536929797825866, - "grad_norm": 1.2265625, - "learning_rate": 2.1158023993194848e-05, - "loss": 0.1301, - "step": 5450 - }, - { - "epoch": 0.5547089302042061, - "grad_norm": 4.21875, - "learning_rate": 2.107839017028005e-05, - "loss": 0.2782, - "step": 5460 - }, - { - "epoch": 0.5557248806258255, - "grad_norm": 0.52734375, - "learning_rate": 2.0998797120898457e-05, - "loss": 0.2024, - "step": 5470 - }, - { - "epoch": 0.5567408310474449, - "grad_norm": 1.46875, - "learning_rate": 2.0919245672590277e-05, - "loss": 0.1755, - "step": 5480 - }, - { - "epoch": 0.5577567814690643, - "grad_norm": 2.140625, - "learning_rate": 2.083973665246318e-05, - "loss": 0.2058, - "step": 5490 - }, - { - "epoch": 0.5587727318906838, - "grad_norm": 1.5390625, - "learning_rate": 2.076027088718373e-05, - "loss": 0.2159, - "step": 5500 - }, - { - "epoch": 0.5597886823123032, - "grad_norm": 1.9921875, - "learning_rate": 2.0680849202968743e-05, - "loss": 0.2139, - "step": 5510 - }, - { - "epoch": 0.5608046327339226, - "grad_norm": 2.4375, - "learning_rate": 2.060147242557674e-05, - "loss": 0.183, - "step": 5520 - }, - { - "epoch": 0.5618205831555421, - "grad_norm": 5.5, - "learning_rate": 2.0522141380299308e-05, - "loss": 0.1673, - "step": 5530 - }, - { - "epoch": 0.5628365335771615, - "grad_norm": 4.25, - "learning_rate": 2.044285689195258e-05, - "loss": 0.1674, - "step": 5540 - }, - { - "epoch": 0.5638524839987809, - "grad_norm": 2.109375, - "learning_rate": 2.0363619784868604e-05, - "loss": 0.1531, - "step": 5550 - }, - { - "epoch": 0.5648684344204002, - "grad_norm": 2.59375, - "learning_rate": 2.0284430882886836e-05, - "loss": 0.1665, - "step": 5560 - }, - { - "epoch": 0.5658843848420197, - "grad_norm": 3.984375, - "learning_rate": 2.020529100934549e-05, - "loss": 0.1717, - "step": 5570 - }, - { - "epoch": 0.5669003352636391, - "grad_norm": 1.6015625, - "learning_rate": 2.012620098707306e-05, - "loss": 0.1167, - "step": 5580 - }, - { - "epoch": 0.5679162856852585, - "grad_norm": 6.0625, - "learning_rate": 2.004716163837972e-05, - "loss": 0.2084, - "step": 5590 - }, - { - "epoch": 0.5689322361068779, - "grad_norm": 2.5625, - "learning_rate": 1.996817378504876e-05, - "loss": 0.1939, - "step": 5600 - }, - { - "epoch": 0.5699481865284974, - "grad_norm": 3.109375, - "learning_rate": 1.9889238248328108e-05, - "loss": 0.1241, - "step": 5610 - }, - { - "epoch": 0.5709641369501168, - "grad_norm": 4.875, - "learning_rate": 1.981035584892171e-05, - "loss": 0.1865, - "step": 5620 - }, - { - "epoch": 0.5719800873717362, - "grad_norm": 2.984375, - "learning_rate": 1.9731527406981072e-05, - "loss": 0.1639, - "step": 5630 - }, - { - "epoch": 0.5729960377933557, - "grad_norm": 4.4375, - "learning_rate": 1.9652753742096655e-05, - "loss": 0.2019, - "step": 5640 - }, - { - "epoch": 0.5740119882149751, - "grad_norm": 4.3125, - "learning_rate": 1.9574035673289432e-05, - "loss": 0.1829, - "step": 5650 - }, - { - "epoch": 0.5750279386365945, - "grad_norm": 3.203125, - "learning_rate": 1.9495374019002312e-05, - "loss": 0.2267, - "step": 5660 - }, - { - "epoch": 0.5760438890582139, - "grad_norm": 1.765625, - "learning_rate": 1.9416769597091673e-05, - "loss": 0.1411, - "step": 5670 - }, - { - "epoch": 0.5770598394798334, - "grad_norm": 2.640625, - "learning_rate": 1.9338223224818818e-05, - "loss": 0.1476, - "step": 5680 - }, - { - "epoch": 0.5780757899014528, - "grad_norm": 4.84375, - "learning_rate": 1.9259735718841524e-05, - "loss": 0.1417, - "step": 5690 - }, - { - "epoch": 0.5790917403230722, - "grad_norm": 2.421875, - "learning_rate": 1.918130789520551e-05, - "loss": 0.1592, - "step": 5700 - }, - { - "epoch": 0.5801076907446917, - "grad_norm": 2.984375, - "learning_rate": 1.9102940569335963e-05, - "loss": 0.161, - "step": 5710 - }, - { - "epoch": 0.5811236411663111, - "grad_norm": 1.0234375, - "learning_rate": 1.9024634556029093e-05, - "loss": 0.1614, - "step": 5720 - }, - { - "epoch": 0.5821395915879305, - "grad_norm": 2.90625, - "learning_rate": 1.89463906694436e-05, - "loss": 0.1505, - "step": 5730 - }, - { - "epoch": 0.5831555420095499, - "grad_norm": 2.875, - "learning_rate": 1.8868209723092286e-05, - "loss": 0.1674, - "step": 5740 - }, - { - "epoch": 0.5841714924311694, - "grad_norm": 0.408203125, - "learning_rate": 1.8790092529833508e-05, - "loss": 0.1468, - "step": 5750 - }, - { - "epoch": 0.5851874428527888, - "grad_norm": 5.1875, - "learning_rate": 1.871203990186281e-05, - "loss": 0.1903, - "step": 5760 - }, - { - "epoch": 0.5862033932744082, - "grad_norm": 0.5546875, - "learning_rate": 1.8634052650704415e-05, - "loss": 0.2644, - "step": 5770 - }, - { - "epoch": 0.5872193436960277, - "grad_norm": 3.203125, - "learning_rate": 1.8556131587202848e-05, - "loss": 0.1968, - "step": 5780 - }, - { - "epoch": 0.5882352941176471, - "grad_norm": 2.484375, - "learning_rate": 1.8478277521514424e-05, - "loss": 0.2249, - "step": 5790 - }, - { - "epoch": 0.5892512445392665, - "grad_norm": 4.0, - "learning_rate": 1.8400491263098906e-05, - "loss": 0.1881, - "step": 5800 - }, - { - "epoch": 0.5902671949608859, - "grad_norm": 1.90625, - "learning_rate": 1.832277362071106e-05, - "loss": 0.1352, - "step": 5810 - }, - { - "epoch": 0.5912831453825054, - "grad_norm": 2.765625, - "learning_rate": 1.824512540239221e-05, - "loss": 0.2737, - "step": 5820 - }, - { - "epoch": 0.5922990958041248, - "grad_norm": 2.609375, - "learning_rate": 1.81675474154619e-05, - "loss": 0.1566, - "step": 5830 - }, - { - "epoch": 0.5933150462257442, - "grad_norm": 2.6875, - "learning_rate": 1.8090040466509444e-05, - "loss": 0.1999, - "step": 5840 - }, - { - "epoch": 0.5943309966473637, - "grad_norm": 2.609375, - "learning_rate": 1.8012605361385592e-05, - "loss": 0.2372, - "step": 5850 - }, - { - "epoch": 0.5953469470689831, - "grad_norm": 8.125, - "learning_rate": 1.7935242905194087e-05, - "loss": 0.2411, - "step": 5860 - }, - { - "epoch": 0.5963628974906025, - "grad_norm": 3.46875, - "learning_rate": 1.785795390228336e-05, - "loss": 0.138, - "step": 5870 - }, - { - "epoch": 0.5973788479122218, - "grad_norm": 2.3125, - "learning_rate": 1.7780739156238125e-05, - "loss": 0.1867, - "step": 5880 - }, - { - "epoch": 0.5983947983338413, - "grad_norm": 4.0625, - "learning_rate": 1.770359946987105e-05, - "loss": 0.2091, - "step": 5890 - }, - { - "epoch": 0.5994107487554607, - "grad_norm": 5.21875, - "learning_rate": 1.7626535645214378e-05, - "loss": 0.2091, - "step": 5900 - }, - { - "epoch": 0.6004266991770801, - "grad_norm": 3.15625, - "learning_rate": 1.7549548483511614e-05, - "loss": 0.1927, - "step": 5910 - }, - { - "epoch": 0.6014426495986995, - "grad_norm": 4.71875, - "learning_rate": 1.7472638785209198e-05, - "loss": 0.1893, - "step": 5920 - }, - { - "epoch": 0.602458600020319, - "grad_norm": 3.015625, - "learning_rate": 1.7395807349948145e-05, - "loss": 0.1557, - "step": 5930 - }, - { - "epoch": 0.6034745504419384, - "grad_norm": 2.9375, - "learning_rate": 1.73190549765558e-05, - "loss": 0.1717, - "step": 5940 - }, - { - "epoch": 0.6044905008635578, - "grad_norm": 3.109375, - "learning_rate": 1.724238246303745e-05, - "loss": 0.1879, - "step": 5950 - }, - { - "epoch": 0.6055064512851773, - "grad_norm": 3.875, - "learning_rate": 1.71657906065681e-05, - "loss": 0.1908, - "step": 5960 - }, - { - "epoch": 0.6065224017067967, - "grad_norm": 5.09375, - "learning_rate": 1.7089280203484115e-05, - "loss": 0.1712, - "step": 5970 - }, - { - "epoch": 0.6075383521284161, - "grad_norm": 3.015625, - "learning_rate": 1.701285204927502e-05, - "loss": 0.1454, - "step": 5980 - }, - { - "epoch": 0.6085543025500355, - "grad_norm": 3.265625, - "learning_rate": 1.693650693857515e-05, - "loss": 0.2283, - "step": 5990 - }, - { - "epoch": 0.609570252971655, - "grad_norm": 3.40625, - "learning_rate": 1.6860245665155466e-05, - "loss": 0.2188, - "step": 6000 - }, - { - "epoch": 0.6105862033932744, - "grad_norm": 2.5625, - "learning_rate": 1.678406902191521e-05, - "loss": 0.1605, - "step": 6010 - }, - { - "epoch": 0.6116021538148938, - "grad_norm": 0.6796875, - "learning_rate": 1.670797780087374e-05, - "loss": 0.1472, - "step": 6020 - }, - { - "epoch": 0.6126181042365133, - "grad_norm": 2.234375, - "learning_rate": 1.6631972793162288e-05, - "loss": 0.1676, - "step": 6030 - }, - { - "epoch": 0.6136340546581327, - "grad_norm": 1.25, - "learning_rate": 1.6556054789015662e-05, - "loss": 0.1508, - "step": 6040 - }, - { - "epoch": 0.6146500050797521, - "grad_norm": 4.78125, - "learning_rate": 1.6480224577764132e-05, - "loss": 0.1981, - "step": 6050 - }, - { - "epoch": 0.6156659555013715, - "grad_norm": 3.46875, - "learning_rate": 1.6404482947825137e-05, - "loss": 0.2514, - "step": 6060 - }, - { - "epoch": 0.616681905922991, - "grad_norm": 1.265625, - "learning_rate": 1.6328830686695154e-05, - "loss": 0.2397, - "step": 6070 - }, - { - "epoch": 0.6176978563446104, - "grad_norm": 1.953125, - "learning_rate": 1.625326858094144e-05, - "loss": 0.1523, - "step": 6080 - }, - { - "epoch": 0.6187138067662298, - "grad_norm": 3.484375, - "learning_rate": 1.6177797416193953e-05, - "loss": 0.218, - "step": 6090 - }, - { - "epoch": 0.6197297571878493, - "grad_norm": 3.484375, - "learning_rate": 1.6102417977137052e-05, - "loss": 0.1476, - "step": 6100 - }, - { - "epoch": 0.6207457076094687, - "grad_norm": 4.90625, - "learning_rate": 1.602713104750147e-05, - "loss": 0.1818, - "step": 6110 - }, - { - "epoch": 0.6217616580310881, - "grad_norm": 4.375, - "learning_rate": 1.5951937410056087e-05, - "loss": 0.2061, - "step": 6120 - }, - { - "epoch": 0.6227776084527075, - "grad_norm": 6.3125, - "learning_rate": 1.587683784659979e-05, - "loss": 0.1566, - "step": 6130 - }, - { - "epoch": 0.623793558874327, - "grad_norm": 2.828125, - "learning_rate": 1.58018331379534e-05, - "loss": 0.1376, - "step": 6140 - }, - { - "epoch": 0.6248095092959464, - "grad_norm": 2.40625, - "learning_rate": 1.572692406395149e-05, - "loss": 0.1655, - "step": 6150 - }, - { - "epoch": 0.6258254597175658, - "grad_norm": 4.34375, - "learning_rate": 1.5652111403434338e-05, - "loss": 0.2363, - "step": 6160 - }, - { - "epoch": 0.6268414101391853, - "grad_norm": 2.453125, - "learning_rate": 1.5577395934239757e-05, - "loss": 0.2464, - "step": 6170 - }, - { - "epoch": 0.6278573605608047, - "grad_norm": 2.53125, - "learning_rate": 1.5502778433195085e-05, - "loss": 0.1898, - "step": 6180 - }, - { - "epoch": 0.628873310982424, - "grad_norm": 2.28125, - "learning_rate": 1.5428259676109048e-05, - "loss": 0.1804, - "step": 6190 - }, - { - "epoch": 0.6298892614040434, - "grad_norm": 4.3125, - "learning_rate": 1.5353840437763732e-05, - "loss": 0.1409, - "step": 6200 - }, - { - "epoch": 0.630905211825663, - "grad_norm": 2.5625, - "learning_rate": 1.5279521491906496e-05, - "loss": 0.2449, - "step": 6210 - }, - { - "epoch": 0.6319211622472823, - "grad_norm": 3.0625, - "learning_rate": 1.520530361124195e-05, - "loss": 0.2103, - "step": 6220 - }, - { - "epoch": 0.6329371126689017, - "grad_norm": 2.609375, - "learning_rate": 1.5131187567423937e-05, - "loss": 0.2156, - "step": 6230 - }, - { - "epoch": 0.6339530630905211, - "grad_norm": 2.703125, - "learning_rate": 1.5057174131047446e-05, - "loss": 0.161, - "step": 6240 - }, - { - "epoch": 0.6349690135121406, - "grad_norm": 3.265625, - "learning_rate": 1.4983264071640679e-05, - "loss": 0.1757, - "step": 6250 - }, - { - "epoch": 0.63598496393376, - "grad_norm": 3.15625, - "learning_rate": 1.490945815765699e-05, - "loss": 0.2011, - "step": 6260 - }, - { - "epoch": 0.6370009143553794, - "grad_norm": 5.375, - "learning_rate": 1.4835757156466945e-05, - "loss": 0.1658, - "step": 6270 - }, - { - "epoch": 0.6380168647769989, - "grad_norm": 2.984375, - "learning_rate": 1.4762161834350271e-05, - "loss": 0.1754, - "step": 6280 - }, - { - "epoch": 0.6390328151986183, - "grad_norm": 2.015625, - "learning_rate": 1.4688672956487987e-05, - "loss": 0.1427, - "step": 6290 - }, - { - "epoch": 0.6400487656202377, - "grad_norm": 3.78125, - "learning_rate": 1.4615291286954352e-05, - "loss": 0.1517, - "step": 6300 - }, - { - "epoch": 0.6410647160418571, - "grad_norm": 2.859375, - "learning_rate": 1.4542017588709005e-05, - "loss": 0.2348, - "step": 6310 - }, - { - "epoch": 0.6420806664634766, - "grad_norm": 2.421875, - "learning_rate": 1.4468852623588961e-05, - "loss": 0.2089, - "step": 6320 - }, - { - "epoch": 0.643096616885096, - "grad_norm": 2.15625, - "learning_rate": 1.4395797152300719e-05, - "loss": 0.1702, - "step": 6330 - }, - { - "epoch": 0.6441125673067154, - "grad_norm": 1.53125, - "learning_rate": 1.4322851934412382e-05, - "loss": 0.1017, - "step": 6340 - }, - { - "epoch": 0.6451285177283349, - "grad_norm": 1.90625, - "learning_rate": 1.4250017728345716e-05, - "loss": 0.1813, - "step": 6350 - }, - { - "epoch": 0.6461444681499543, - "grad_norm": 2.015625, - "learning_rate": 1.4177295291368292e-05, - "loss": 0.1095, - "step": 6360 - }, - { - "epoch": 0.6471604185715737, - "grad_norm": 2.625, - "learning_rate": 1.410468537958558e-05, - "loss": 0.2259, - "step": 6370 - }, - { - "epoch": 0.6481763689931931, - "grad_norm": 3.5, - "learning_rate": 1.4032188747933136e-05, - "loss": 0.1595, - "step": 6380 - }, - { - "epoch": 0.6491923194148126, - "grad_norm": 5.21875, - "learning_rate": 1.39598061501687e-05, - "loss": 0.2226, - "step": 6390 - }, - { - "epoch": 0.650208269836432, - "grad_norm": 5.34375, - "learning_rate": 1.388753833886442e-05, - "loss": 0.2132, - "step": 6400 - }, - { - "epoch": 0.6512242202580514, - "grad_norm": 3.640625, - "learning_rate": 1.3815386065398945e-05, - "loss": 0.1227, - "step": 6410 - }, - { - "epoch": 0.6522401706796709, - "grad_norm": 1.0, - "learning_rate": 1.3743350079949705e-05, - "loss": 0.1755, - "step": 6420 - }, - { - "epoch": 0.6532561211012903, - "grad_norm": 2.359375, - "learning_rate": 1.3671431131485057e-05, - "loss": 0.1552, - "step": 6430 - }, - { - "epoch": 0.6542720715229097, - "grad_norm": 5.3125, - "learning_rate": 1.3599629967756483e-05, - "loss": 0.1917, - "step": 6440 - }, - { - "epoch": 0.6552880219445291, - "grad_norm": 4.625, - "learning_rate": 1.3527947335290877e-05, - "loss": 0.1812, - "step": 6450 - }, - { - "epoch": 0.6563039723661486, - "grad_norm": 1.234375, - "learning_rate": 1.3456383979382708e-05, - "loss": 0.1896, - "step": 6460 - }, - { - "epoch": 0.657319922787768, - "grad_norm": 3.984375, - "learning_rate": 1.3384940644086352e-05, - "loss": 0.1484, - "step": 6470 - }, - { - "epoch": 0.6583358732093874, - "grad_norm": 2.40625, - "learning_rate": 1.3313618072208268e-05, - "loss": 0.1334, - "step": 6480 - }, - { - "epoch": 0.6593518236310069, - "grad_norm": 4.375, - "learning_rate": 1.3242417005299357e-05, - "loss": 0.1351, - "step": 6490 - }, - { - "epoch": 0.6603677740526263, - "grad_norm": 2.640625, - "learning_rate": 1.31713381836472e-05, - "loss": 0.1717, - "step": 6500 - }, - { - "epoch": 0.6613837244742456, - "grad_norm": 2.640625, - "learning_rate": 1.3100382346268392e-05, - "loss": 0.1867, - "step": 6510 - }, - { - "epoch": 0.662399674895865, - "grad_norm": 1.734375, - "learning_rate": 1.3029550230900812e-05, - "loss": 0.1997, - "step": 6520 - }, - { - "epoch": 0.6634156253174845, - "grad_norm": 3.609375, - "learning_rate": 1.2958842573996016e-05, - "loss": 0.1969, - "step": 6530 - }, - { - "epoch": 0.6644315757391039, - "grad_norm": 3.578125, - "learning_rate": 1.2888260110711525e-05, - "loss": 0.1469, - "step": 6540 - }, - { - "epoch": 0.6654475261607233, - "grad_norm": 1.3515625, - "learning_rate": 1.2817803574903212e-05, - "loss": 0.1524, - "step": 6550 - }, - { - "epoch": 0.6664634765823427, - "grad_norm": 2.109375, - "learning_rate": 1.2747473699117668e-05, - "loss": 0.159, - "step": 6560 - }, - { - "epoch": 0.6674794270039622, - "grad_norm": 1.53125, - "learning_rate": 1.267727121458458e-05, - "loss": 0.1999, - "step": 6570 - }, - { - "epoch": 0.6684953774255816, - "grad_norm": 1.7265625, - "learning_rate": 1.2607196851209137e-05, - "loss": 0.2216, - "step": 6580 - }, - { - "epoch": 0.669511327847201, - "grad_norm": 3.125, - "learning_rate": 1.2537251337564412e-05, - "loss": 0.1607, - "step": 6590 - }, - { - "epoch": 0.6705272782688205, - "grad_norm": 2.421875, - "learning_rate": 1.2467435400883839e-05, - "loss": 0.2187, - "step": 6600 - }, - { - "epoch": 0.6715432286904399, - "grad_norm": 1.5078125, - "learning_rate": 1.239774976705359e-05, - "loss": 0.1753, - "step": 6610 - }, - { - "epoch": 0.6725591791120593, - "grad_norm": 1.140625, - "learning_rate": 1.2328195160605092e-05, - "loss": 0.194, - "step": 6620 - }, - { - "epoch": 0.6735751295336787, - "grad_norm": 4.9375, - "learning_rate": 1.225877230470743e-05, - "loss": 0.1485, - "step": 6630 - }, - { - "epoch": 0.6745910799552982, - "grad_norm": 3.65625, - "learning_rate": 1.218948192115988e-05, - "loss": 0.1847, - "step": 6640 - }, - { - "epoch": 0.6756070303769176, - "grad_norm": 3.875, - "learning_rate": 1.21203247303844e-05, - "loss": 0.1874, - "step": 6650 - }, - { - "epoch": 0.676622980798537, - "grad_norm": 2.65625, - "learning_rate": 1.2051301451418073e-05, - "loss": 0.2377, - "step": 6660 - }, - { - "epoch": 0.6776389312201565, - "grad_norm": 2.09375, - "learning_rate": 1.198241280190574e-05, - "loss": 0.1508, - "step": 6670 - }, - { - "epoch": 0.6786548816417759, - "grad_norm": 2.203125, - "learning_rate": 1.1913659498092431e-05, - "loss": 0.1537, - "step": 6680 - }, - { - "epoch": 0.6796708320633953, - "grad_norm": 2.484375, - "learning_rate": 1.184504225481601e-05, - "loss": 0.2339, - "step": 6690 - }, - { - "epoch": 0.6806867824850147, - "grad_norm": 5.625, - "learning_rate": 1.177656178549966e-05, - "loss": 0.2102, - "step": 6700 - }, - { - "epoch": 0.6817027329066342, - "grad_norm": 2.5, - "learning_rate": 1.1708218802144536e-05, - "loss": 0.1435, - "step": 6710 - }, - { - "epoch": 0.6827186833282536, - "grad_norm": 3.84375, - "learning_rate": 1.1640014015322323e-05, - "loss": 0.1823, - "step": 6720 - }, - { - "epoch": 0.683734633749873, - "grad_norm": 2.359375, - "learning_rate": 1.1571948134167862e-05, - "loss": 0.1154, - "step": 6730 - }, - { - "epoch": 0.6847505841714925, - "grad_norm": 2.90625, - "learning_rate": 1.1504021866371761e-05, - "loss": 0.2105, - "step": 6740 - }, - { - "epoch": 0.6857665345931119, - "grad_norm": 5.46875, - "learning_rate": 1.143623591817304e-05, - "loss": 0.1317, - "step": 6750 - }, - { - "epoch": 0.6867824850147313, - "grad_norm": 3.34375, - "learning_rate": 1.1368590994351835e-05, - "loss": 0.1406, - "step": 6760 - }, - { - "epoch": 0.6877984354363507, - "grad_norm": 3.78125, - "learning_rate": 1.130108779822198e-05, - "loss": 0.1425, - "step": 6770 - }, - { - "epoch": 0.6888143858579702, - "grad_norm": 0.77734375, - "learning_rate": 1.1233727031623783e-05, - "loss": 0.1623, - "step": 6780 - }, - { - "epoch": 0.6898303362795896, - "grad_norm": 4.625, - "learning_rate": 1.1166509394916682e-05, - "loss": 0.1591, - "step": 6790 - }, - { - "epoch": 0.690846286701209, - "grad_norm": 3.84375, - "learning_rate": 1.1099435586971982e-05, - "loss": 0.1758, - "step": 6800 - }, - { - "epoch": 0.6918622371228285, - "grad_norm": 2.4375, - "learning_rate": 1.1032506305165555e-05, - "loss": 0.1018, - "step": 6810 - }, - { - "epoch": 0.6928781875444479, - "grad_norm": 3.203125, - "learning_rate": 1.0965722245370641e-05, - "loss": 0.1485, - "step": 6820 - }, - { - "epoch": 0.6938941379660672, - "grad_norm": 0.7109375, - "learning_rate": 1.0899084101950561e-05, - "loss": 0.1762, - "step": 6830 - }, - { - "epoch": 0.6949100883876866, - "grad_norm": 1.9765625, - "learning_rate": 1.0832592567751555e-05, - "loss": 0.1402, - "step": 6840 - }, - { - "epoch": 0.6959260388093061, - "grad_norm": 1.4609375, - "learning_rate": 1.0766248334095505e-05, - "loss": 0.2278, - "step": 6850 - }, - { - "epoch": 0.6969419892309255, - "grad_norm": 3.953125, - "learning_rate": 1.0700052090772828e-05, - "loss": 0.1969, - "step": 6860 - }, - { - "epoch": 0.6979579396525449, - "grad_norm": 2.453125, - "learning_rate": 1.0634004526035249e-05, - "loss": 0.2073, - "step": 6870 - }, - { - "epoch": 0.6989738900741643, - "grad_norm": 1.6171875, - "learning_rate": 1.0568106326588645e-05, - "loss": 0.1902, - "step": 6880 - }, - { - "epoch": 0.6999898404957838, - "grad_norm": 1.2734375, - "learning_rate": 1.0502358177585953e-05, - "loss": 0.2165, - "step": 6890 - }, - { - "epoch": 0.7010057909174032, - "grad_norm": 1.671875, - "learning_rate": 1.0436760762619977e-05, - "loss": 0.1952, - "step": 6900 - }, - { - "epoch": 0.7020217413390226, - "grad_norm": 2.8125, - "learning_rate": 1.0371314763716347e-05, - "loss": 0.1422, - "step": 6910 - }, - { - "epoch": 0.7030376917606421, - "grad_norm": 2.53125, - "learning_rate": 1.0306020861326388e-05, - "loss": 0.0961, - "step": 6920 - }, - { - "epoch": 0.7040536421822615, - "grad_norm": 3.046875, - "learning_rate": 1.0240879734320068e-05, - "loss": 0.1542, - "step": 6930 - }, - { - "epoch": 0.7050695926038809, - "grad_norm": 2.859375, - "learning_rate": 1.0175892059978901e-05, - "loss": 0.1748, - "step": 6940 - }, - { - "epoch": 0.7060855430255003, - "grad_norm": 2.671875, - "learning_rate": 1.0111058513988958e-05, - "loss": 0.0819, - "step": 6950 - }, - { - "epoch": 0.7071014934471198, - "grad_norm": 3.5625, - "learning_rate": 1.0046379770433803e-05, - "loss": 0.1933, - "step": 6960 - }, - { - "epoch": 0.7081174438687392, - "grad_norm": 2.859375, - "learning_rate": 9.98185650178749e-06, - "loss": 0.1891, - "step": 6970 - }, - { - "epoch": 0.7091333942903586, - "grad_norm": 3.15625, - "learning_rate": 9.917489378907591e-06, - "loss": 0.2102, - "step": 6980 - }, - { - "epoch": 0.7101493447119781, - "grad_norm": 6.40625, - "learning_rate": 9.853279071028212e-06, - "loss": 0.1714, - "step": 6990 - }, - { - "epoch": 0.7111652951335975, - "grad_norm": 2.375, - "learning_rate": 9.78922624575303e-06, - "loss": 0.1299, - "step": 7000 - }, - { - "epoch": 0.7121812455552169, - "grad_norm": 2.078125, - "learning_rate": 9.72533156904833e-06, - "loss": 0.1914, - "step": 7010 - }, - { - "epoch": 0.7131971959768363, - "grad_norm": 3.859375, - "learning_rate": 9.661595705236137e-06, - "loss": 0.2377, - "step": 7020 - }, - { - "epoch": 0.7142131463984558, - "grad_norm": 1.171875, - "learning_rate": 9.598019316987244e-06, - "loss": 0.1851, - "step": 7030 - }, - { - "epoch": 0.7152290968200752, - "grad_norm": 1.078125, - "learning_rate": 9.53460306531439e-06, - "loss": 0.2661, - "step": 7040 - }, - { - "epoch": 0.7162450472416946, - "grad_norm": 1.6484375, - "learning_rate": 9.471347609565311e-06, - "loss": 0.1669, - "step": 7050 - }, - { - "epoch": 0.7172609976633141, - "grad_norm": 4.59375, - "learning_rate": 9.408253607415957e-06, - "loss": 0.2487, - "step": 7060 - }, - { - "epoch": 0.7182769480849335, - "grad_norm": 3.09375, - "learning_rate": 9.345321714863614e-06, - "loss": 0.186, - "step": 7070 - }, - { - "epoch": 0.7192928985065529, - "grad_norm": 6.0625, - "learning_rate": 9.282552586220075e-06, - "loss": 0.2249, - "step": 7080 - }, - { - "epoch": 0.7203088489281723, - "grad_norm": 1.5703125, - "learning_rate": 9.219946874104885e-06, - "loss": 0.1255, - "step": 7090 - }, - { - "epoch": 0.7213247993497918, - "grad_norm": 1.9453125, - "learning_rate": 9.157505229438481e-06, - "loss": 0.1999, - "step": 7100 - }, - { - "epoch": 0.7223407497714112, - "grad_norm": 5.1875, - "learning_rate": 9.095228301435518e-06, - "loss": 0.199, - "step": 7110 - }, - { - "epoch": 0.7233567001930306, - "grad_norm": 2.078125, - "learning_rate": 9.03311673759802e-06, - "loss": 0.2182, - "step": 7120 - }, - { - "epoch": 0.7243726506146501, - "grad_norm": 6.46875, - "learning_rate": 8.971171183708733e-06, - "loss": 0.1573, - "step": 7130 - }, - { - "epoch": 0.7253886010362695, - "grad_norm": 3.015625, - "learning_rate": 8.909392283824353e-06, - "loss": 0.2044, - "step": 7140 - }, - { - "epoch": 0.7264045514578888, - "grad_norm": 2.921875, - "learning_rate": 8.847780680268872e-06, - "loss": 0.11, - "step": 7150 - }, - { - "epoch": 0.7274205018795082, - "grad_norm": 2.96875, - "learning_rate": 8.786337013626853e-06, - "loss": 0.1897, - "step": 7160 - }, - { - "epoch": 0.7284364523011277, - "grad_norm": 1.7578125, - "learning_rate": 8.725061922736799e-06, - "loss": 0.153, - "step": 7170 - }, - { - "epoch": 0.7294524027227471, - "grad_norm": 1.609375, - "learning_rate": 8.663956044684532e-06, - "loss": 0.1746, - "step": 7180 - }, - { - "epoch": 0.7304683531443665, - "grad_norm": 1.9375, - "learning_rate": 8.603020014796507e-06, - "loss": 0.2284, - "step": 7190 - }, - { - "epoch": 0.7314843035659859, - "grad_norm": 1.515625, - "learning_rate": 8.542254466633273e-06, - "loss": 0.1186, - "step": 7200 - }, - { - "epoch": 0.7325002539876054, - "grad_norm": 1.671875, - "learning_rate": 8.481660031982844e-06, - "loss": 0.1971, - "step": 7210 - }, - { - "epoch": 0.7335162044092248, - "grad_norm": 1.453125, - "learning_rate": 8.421237340854157e-06, - "loss": 0.196, - "step": 7220 - }, - { - "epoch": 0.7345321548308442, - "grad_norm": 0.65234375, - "learning_rate": 8.360987021470479e-06, - "loss": 0.1724, - "step": 7230 - }, - { - "epoch": 0.7355481052524637, - "grad_norm": 2.84375, - "learning_rate": 8.300909700262929e-06, - "loss": 0.175, - "step": 7240 - }, - { - "epoch": 0.7365640556740831, - "grad_norm": 3.109375, - "learning_rate": 8.241006001863924e-06, - "loss": 0.2276, - "step": 7250 - }, - { - "epoch": 0.7375800060957025, - "grad_norm": 4.8125, - "learning_rate": 8.181276549100714e-06, - "loss": 0.2029, - "step": 7260 - }, - { - "epoch": 0.7385959565173219, - "grad_norm": 4.03125, - "learning_rate": 8.12172196298887e-06, - "loss": 0.175, - "step": 7270 - }, - { - "epoch": 0.7396119069389414, - "grad_norm": 3.046875, - "learning_rate": 8.062342862725878e-06, - "loss": 0.1662, - "step": 7280 - }, - { - "epoch": 0.7406278573605608, - "grad_norm": 3.375, - "learning_rate": 8.003139865684662e-06, - "loss": 0.1616, - "step": 7290 - }, - { - "epoch": 0.7416438077821802, - "grad_norm": 2.5625, - "learning_rate": 7.944113587407157e-06, - "loss": 0.2448, - "step": 7300 - }, - { - "epoch": 0.7426597582037997, - "grad_norm": 4.125, - "learning_rate": 7.885264641597961e-06, - "loss": 0.1618, - "step": 7310 - }, - { - "epoch": 0.7436757086254191, - "grad_norm": 3.5, - "learning_rate": 7.826593640117889e-06, - "loss": 0.1134, - "step": 7320 - }, - { - "epoch": 0.7446916590470385, - "grad_norm": 2.6875, - "learning_rate": 7.76810119297767e-06, - "loss": 0.1795, - "step": 7330 - }, - { - "epoch": 0.7457076094686579, - "grad_norm": 4.34375, - "learning_rate": 7.709787908331556e-06, - "loss": 0.2736, - "step": 7340 - }, - { - "epoch": 0.7467235598902774, - "grad_norm": 1.21875, - "learning_rate": 7.651654392471038e-06, - "loss": 0.139, - "step": 7350 - }, - { - "epoch": 0.7477395103118968, - "grad_norm": 3.578125, - "learning_rate": 7.593701249818521e-06, - "loss": 0.2023, - "step": 7360 - }, - { - "epoch": 0.7487554607335162, - "grad_norm": 2.15625, - "learning_rate": 7.535929082921048e-06, - "loss": 0.1702, - "step": 7370 - }, - { - "epoch": 0.7497714111551357, - "grad_norm": 1.96875, - "learning_rate": 7.47833849244402e-06, - "loss": 0.1835, - "step": 7380 - }, - { - "epoch": 0.7507873615767551, - "grad_norm": 2.796875, - "learning_rate": 7.420930077164959e-06, - "loss": 0.1713, - "step": 7390 - }, - { - "epoch": 0.7518033119983745, - "grad_norm": 4.46875, - "learning_rate": 7.363704433967311e-06, - "loss": 0.1906, - "step": 7400 - }, - { - "epoch": 0.7528192624199939, - "grad_norm": 1.75, - "learning_rate": 7.306662157834185e-06, - "loss": 0.1421, - "step": 7410 - }, - { - "epoch": 0.7538352128416134, - "grad_norm": 1.140625, - "learning_rate": 7.2498038418422145e-06, - "loss": 0.1793, - "step": 7420 - }, - { - "epoch": 0.7548511632632328, - "grad_norm": 2.578125, - "learning_rate": 7.193130077155374e-06, - "loss": 0.1603, - "step": 7430 - }, - { - "epoch": 0.7558671136848522, - "grad_norm": 4.3125, - "learning_rate": 7.13664145301883e-06, - "loss": 0.2169, - "step": 7440 - }, - { - "epoch": 0.7568830641064717, - "grad_norm": 3.078125, - "learning_rate": 7.0803385567528025e-06, - "loss": 0.1685, - "step": 7450 - }, - { - "epoch": 0.757899014528091, - "grad_norm": 3.5625, - "learning_rate": 7.024221973746495e-06, - "loss": 0.2282, - "step": 7460 - }, - { - "epoch": 0.7589149649497104, - "grad_norm": 2.265625, - "learning_rate": 6.968292287451961e-06, - "loss": 0.1786, - "step": 7470 - }, - { - "epoch": 0.7599309153713298, - "grad_norm": 4.71875, - "learning_rate": 6.912550079378091e-06, - "loss": 0.1811, - "step": 7480 - }, - { - "epoch": 0.7609468657929493, - "grad_norm": 2.328125, - "learning_rate": 6.856995929084506e-06, - "loss": 0.1747, - "step": 7490 - }, - { - "epoch": 0.7619628162145687, - "grad_norm": 5.21875, - "learning_rate": 6.801630414175589e-06, - "loss": 0.2028, - "step": 7500 - }, - { - "epoch": 0.7629787666361881, - "grad_norm": 3.78125, - "learning_rate": 6.746454110294451e-06, - "loss": 0.2255, - "step": 7510 - }, - { - "epoch": 0.7639947170578075, - "grad_norm": 1.625, - "learning_rate": 6.691467591116931e-06, - "loss": 0.1604, - "step": 7520 - }, - { - "epoch": 0.765010667479427, - "grad_norm": 1.7734375, - "learning_rate": 6.6366714283456755e-06, - "loss": 0.2559, - "step": 7530 - }, - { - "epoch": 0.7660266179010464, - "grad_norm": 4.59375, - "learning_rate": 6.582066191704142e-06, - "loss": 0.2034, - "step": 7540 - }, - { - "epoch": 0.7670425683226658, - "grad_norm": 1.578125, - "learning_rate": 6.527652448930724e-06, - "loss": 0.148, - "step": 7550 - }, - { - "epoch": 0.7680585187442853, - "grad_norm": 1.7109375, - "learning_rate": 6.4734307657728e-06, - "loss": 0.1811, - "step": 7560 - }, - { - "epoch": 0.7690744691659047, - "grad_norm": 1.2734375, - "learning_rate": 6.419401705980924e-06, - "loss": 0.1407, - "step": 7570 - }, - { - "epoch": 0.7700904195875241, - "grad_norm": 2.25, - "learning_rate": 6.365565831302869e-06, - "loss": 0.1893, - "step": 7580 - }, - { - "epoch": 0.7711063700091435, - "grad_norm": 1.625, - "learning_rate": 6.311923701477854e-06, - "loss": 0.1835, - "step": 7590 - }, - { - "epoch": 0.772122320430763, - "grad_norm": 2.375, - "learning_rate": 6.258475874230713e-06, - "loss": 0.1579, - "step": 7600 - }, - { - "epoch": 0.7731382708523824, - "grad_norm": 4.5, - "learning_rate": 6.205222905266067e-06, - "loss": 0.1794, - "step": 7610 - }, - { - "epoch": 0.7741542212740018, - "grad_norm": 4.25, - "learning_rate": 6.152165348262598e-06, - "loss": 0.1477, - "step": 7620 - }, - { - "epoch": 0.7751701716956213, - "grad_norm": 1.9765625, - "learning_rate": 6.0993037548672246e-06, - "loss": 0.2396, - "step": 7630 - }, - { - "epoch": 0.7761861221172407, - "grad_norm": 2.671875, - "learning_rate": 6.046638674689454e-06, - "loss": 0.1717, - "step": 7640 - }, - { - "epoch": 0.7772020725388601, - "grad_norm": 3.671875, - "learning_rate": 5.994170655295567e-06, - "loss": 0.2646, - "step": 7650 - }, - { - "epoch": 0.7782180229604795, - "grad_norm": 1.3046875, - "learning_rate": 5.9419002422030106e-06, - "loss": 0.1553, - "step": 7660 - }, - { - "epoch": 0.779233973382099, - "grad_norm": 3.734375, - "learning_rate": 5.889827978874665e-06, - "loss": 0.1854, - "step": 7670 - }, - { - "epoch": 0.7802499238037184, - "grad_norm": 2.140625, - "learning_rate": 5.837954406713245e-06, - "loss": 0.1857, - "step": 7680 - }, - { - "epoch": 0.7812658742253378, - "grad_norm": 3.34375, - "learning_rate": 5.786280065055619e-06, - "loss": 0.1797, - "step": 7690 - }, - { - "epoch": 0.7822818246469573, - "grad_norm": 0.97265625, - "learning_rate": 5.734805491167244e-06, - "loss": 0.1488, - "step": 7700 - }, - { - "epoch": 0.7832977750685767, - "grad_norm": 2.078125, - "learning_rate": 5.683531220236576e-06, - "loss": 0.1688, - "step": 7710 - }, - { - "epoch": 0.7843137254901961, - "grad_norm": 3.046875, - "learning_rate": 5.632457785369455e-06, - "loss": 0.1503, - "step": 7720 - }, - { - "epoch": 0.7853296759118155, - "grad_norm": 1.6875, - "learning_rate": 5.581585717583637e-06, - "loss": 0.1658, - "step": 7730 - }, - { - "epoch": 0.786345626333435, - "grad_norm": 3.421875, - "learning_rate": 5.530915545803209e-06, - "loss": 0.2112, - "step": 7740 - }, - { - "epoch": 0.7873615767550544, - "grad_norm": 4.1875, - "learning_rate": 5.480447796853141e-06, - "loss": 0.165, - "step": 7750 - }, - { - "epoch": 0.7883775271766738, - "grad_norm": 5.3125, - "learning_rate": 5.430182995453756e-06, - "loss": 0.1499, - "step": 7760 - }, - { - "epoch": 0.7893934775982933, - "grad_norm": 2.1875, - "learning_rate": 5.380121664215329e-06, - "loss": 0.1559, - "step": 7770 - }, - { - "epoch": 0.7904094280199127, - "grad_norm": 1.46875, - "learning_rate": 5.330264323632611e-06, - "loss": 0.2098, - "step": 7780 - }, - { - "epoch": 0.791425378441532, - "grad_norm": 4.65625, - "learning_rate": 5.280611492079449e-06, - "loss": 0.1776, - "step": 7790 - }, - { - "epoch": 0.7924413288631514, - "grad_norm": 1.3359375, - "learning_rate": 5.231163685803361e-06, - "loss": 0.1497, - "step": 7800 - }, - { - "epoch": 0.7934572792847709, - "grad_norm": 2.640625, - "learning_rate": 5.181921418920191e-06, - "loss": 0.12, - "step": 7810 - }, - { - "epoch": 0.7944732297063903, - "grad_norm": 2.328125, - "learning_rate": 5.13288520340878e-06, - "loss": 0.1981, - "step": 7820 - }, - { - "epoch": 0.7954891801280097, - "grad_norm": 3.0625, - "learning_rate": 5.084055549105596e-06, - "loss": 0.1389, - "step": 7830 - }, - { - "epoch": 0.7965051305496291, - "grad_norm": 2.796875, - "learning_rate": 5.035432963699479e-06, - "loss": 0.2293, - "step": 7840 - }, - { - "epoch": 0.7975210809712486, - "grad_norm": 5.0625, - "learning_rate": 4.98701795272635e-06, - "loss": 0.1618, - "step": 7850 - }, - { - "epoch": 0.798537031392868, - "grad_norm": 5.09375, - "learning_rate": 4.938811019563938e-06, - "loss": 0.1755, - "step": 7860 - }, - { - "epoch": 0.7995529818144874, - "grad_norm": 2.140625, - "learning_rate": 4.8908126654265475e-06, - "loss": 0.1565, - "step": 7870 - }, - { - "epoch": 0.8005689322361069, - "grad_norm": 0.76171875, - "learning_rate": 4.843023389359885e-06, - "loss": 0.2176, - "step": 7880 - }, - { - "epoch": 0.8015848826577263, - "grad_norm": 2.625, - "learning_rate": 4.79544368823581e-06, - "loss": 0.2013, - "step": 7890 - }, - { - "epoch": 0.8026008330793457, - "grad_norm": 2.078125, - "learning_rate": 4.748074056747234e-06, - "loss": 0.1246, - "step": 7900 - }, - { - "epoch": 0.8036167835009651, - "grad_norm": 3.5, - "learning_rate": 4.700914987402919e-06, - "loss": 0.1638, - "step": 7910 - }, - { - "epoch": 0.8046327339225846, - "grad_norm": 3.4375, - "learning_rate": 4.6539669705223916e-06, - "loss": 0.2213, - "step": 7920 - }, - { - "epoch": 0.805648684344204, - "grad_norm": 2.96875, - "learning_rate": 4.607230494230849e-06, - "loss": 0.1822, - "step": 7930 - }, - { - "epoch": 0.8066646347658234, - "grad_norm": 2.359375, - "learning_rate": 4.560706044454047e-06, - "loss": 0.1763, - "step": 7940 - }, - { - "epoch": 0.8076805851874429, - "grad_norm": 4.59375, - "learning_rate": 4.514394104913291e-06, - "loss": 0.234, - "step": 7950 - }, - { - "epoch": 0.8086965356090623, - "grad_norm": 1.96875, - "learning_rate": 4.468295157120372e-06, - "loss": 0.1939, - "step": 7960 - }, - { - "epoch": 0.8097124860306817, - "grad_norm": 2.578125, - "learning_rate": 4.422409680372594e-06, - "loss": 0.174, - "step": 7970 - }, - { - "epoch": 0.8107284364523011, - "grad_norm": 4.5625, - "learning_rate": 4.3767381517477505e-06, - "loss": 0.2375, - "step": 7980 - }, - { - "epoch": 0.8117443868739206, - "grad_norm": 0.9609375, - "learning_rate": 4.331281046099203e-06, - "loss": 0.2076, - "step": 7990 - }, - { - "epoch": 0.81276033729554, - "grad_norm": 6.0625, - "learning_rate": 4.286038836050929e-06, - "loss": 0.2504, - "step": 8000 - }, - { - "epoch": 0.8137762877171594, - "grad_norm": 3.484375, - "learning_rate": 4.241011991992586e-06, - "loss": 0.2102, - "step": 8010 - }, - { - "epoch": 0.8147922381387789, - "grad_norm": 1.9765625, - "learning_rate": 4.1962009820746635e-06, - "loss": 0.1846, - "step": 8020 - }, - { - "epoch": 0.8158081885603983, - "grad_norm": 1.875, - "learning_rate": 4.15160627220357e-06, - "loss": 0.1741, - "step": 8030 - }, - { - "epoch": 0.8168241389820177, - "grad_norm": 5.5625, - "learning_rate": 4.107228326036838e-06, - "loss": 0.2078, - "step": 8040 - }, - { - "epoch": 0.8178400894036371, - "grad_norm": 1.7578125, - "learning_rate": 4.063067604978252e-06, - "loss": 0.212, - "step": 8050 - }, - { - "epoch": 0.8188560398252566, - "grad_norm": 4.09375, - "learning_rate": 4.019124568173094e-06, - "loss": 0.1831, - "step": 8060 - }, - { - "epoch": 0.819871990246876, - "grad_norm": 6.625, - "learning_rate": 3.975399672503341e-06, - "loss": 0.2196, - "step": 8070 - }, - { - "epoch": 0.8208879406684954, - "grad_norm": 2.78125, - "learning_rate": 3.931893372582943e-06, - "loss": 0.2002, - "step": 8080 - }, - { - "epoch": 0.8219038910901149, - "grad_norm": 6.90625, - "learning_rate": 3.888606120753047e-06, - "loss": 0.2138, - "step": 8090 - }, - { - "epoch": 0.8229198415117343, - "grad_norm": 4.09375, - "learning_rate": 3.845538367077362e-06, - "loss": 0.2593, - "step": 8100 - }, - { - "epoch": 0.8239357919333536, - "grad_norm": 1.859375, - "learning_rate": 3.8026905593374213e-06, - "loss": 0.2062, - "step": 8110 - }, - { - "epoch": 0.824951742354973, - "grad_norm": 4.3125, - "learning_rate": 3.760063143027945e-06, - "loss": 0.1343, - "step": 8120 - }, - { - "epoch": 0.8259676927765925, - "grad_norm": 1.984375, - "learning_rate": 3.7176565613522313e-06, - "loss": 0.2494, - "step": 8130 - }, - { - "epoch": 0.8269836431982119, - "grad_norm": 3.71875, - "learning_rate": 3.675471255217516e-06, - "loss": 0.1502, - "step": 8140 - }, - { - "epoch": 0.8279995936198313, - "grad_norm": 2.359375, - "learning_rate": 3.6335076632304175e-06, - "loss": 0.1256, - "step": 8150 - }, - { - "epoch": 0.8290155440414507, - "grad_norm": 1.46875, - "learning_rate": 3.5917662216923332e-06, - "loss": 0.1709, - "step": 8160 - }, - { - "epoch": 0.8300314944630702, - "grad_norm": 2.78125, - "learning_rate": 3.550247364594958e-06, - "loss": 0.1881, - "step": 8170 - }, - { - "epoch": 0.8310474448846896, - "grad_norm": 1.0703125, - "learning_rate": 3.508951523615725e-06, - "loss": 0.1998, - "step": 8180 - }, - { - "epoch": 0.832063395306309, - "grad_norm": 2.40625, - "learning_rate": 3.467879128113352e-06, - "loss": 0.2429, - "step": 8190 - }, - { - "epoch": 0.8330793457279285, - "grad_norm": 2.609375, - "learning_rate": 3.427030605123352e-06, - "loss": 0.1942, - "step": 8200 - }, - { - "epoch": 0.8340952961495479, - "grad_norm": 1.6015625, - "learning_rate": 3.3864063793536043e-06, - "loss": 0.1898, - "step": 8210 - }, - { - "epoch": 0.8351112465711673, - "grad_norm": 5.375, - "learning_rate": 3.3460068731799577e-06, - "loss": 0.1919, - "step": 8220 - }, - { - "epoch": 0.8361271969927867, - "grad_norm": 3.3125, - "learning_rate": 3.3058325066417818e-06, - "loss": 0.1516, - "step": 8230 - }, - { - "epoch": 0.8371431474144062, - "grad_norm": 0.76171875, - "learning_rate": 3.26588369743768e-06, - "loss": 0.1068, - "step": 8240 - }, - { - "epoch": 0.8381590978360256, - "grad_norm": 3.171875, - "learning_rate": 3.2261608609210653e-06, - "loss": 0.1203, - "step": 8250 - }, - { - "epoch": 0.839175048257645, - "grad_norm": 2.359375, - "learning_rate": 3.186664410095913e-06, - "loss": 0.2172, - "step": 8260 - }, - { - "epoch": 0.8401909986792645, - "grad_norm": 3.328125, - "learning_rate": 3.1473947556124093e-06, - "loss": 0.1249, - "step": 8270 - }, - { - "epoch": 0.8412069491008839, - "grad_norm": 2.484375, - "learning_rate": 3.1083523057627213e-06, - "loss": 0.1744, - "step": 8280 - }, - { - "epoch": 0.8422228995225033, - "grad_norm": 4.46875, - "learning_rate": 3.0695374664767353e-06, - "loss": 0.1772, - "step": 8290 - }, - { - "epoch": 0.8432388499441227, - "grad_norm": 0.59375, - "learning_rate": 3.0309506413178397e-06, - "loss": 0.2302, - "step": 8300 - }, - { - "epoch": 0.8442548003657422, - "grad_norm": 2.390625, - "learning_rate": 2.9925922314787136e-06, - "loss": 0.1635, - "step": 8310 - }, - { - "epoch": 0.8452707507873616, - "grad_norm": 2.34375, - "learning_rate": 2.954462635777194e-06, - "loss": 0.1573, - "step": 8320 - }, - { - "epoch": 0.846286701208981, - "grad_norm": 2.015625, - "learning_rate": 2.916562250652083e-06, - "loss": 0.1608, - "step": 8330 - }, - { - "epoch": 0.8473026516306005, - "grad_norm": 4.125, - "learning_rate": 2.878891470159048e-06, - "loss": 0.184, - "step": 8340 - }, - { - "epoch": 0.8483186020522199, - "grad_norm": 2.515625, - "learning_rate": 2.8414506859665514e-06, - "loss": 0.2141, - "step": 8350 - }, - { - "epoch": 0.8493345524738393, - "grad_norm": 3.375, - "learning_rate": 2.8042402873517197e-06, - "loss": 0.1729, - "step": 8360 - }, - { - "epoch": 0.8503505028954587, - "grad_norm": 3.078125, - "learning_rate": 2.76726066119635e-06, - "loss": 0.2252, - "step": 8370 - }, - { - "epoch": 0.8513664533170782, - "grad_norm": 1.5390625, - "learning_rate": 2.730512191982845e-06, - "loss": 0.1644, - "step": 8380 - }, - { - "epoch": 0.8523824037386976, - "grad_norm": 1.9296875, - "learning_rate": 2.693995261790261e-06, - "loss": 0.1822, - "step": 8390 - }, - { - "epoch": 0.853398354160317, - "grad_norm": 3.3125, - "learning_rate": 2.657710250290285e-06, - "loss": 0.2068, - "step": 8400 - }, - { - "epoch": 0.8544143045819365, - "grad_norm": 0.640625, - "learning_rate": 2.621657534743327e-06, - "loss": 0.1224, - "step": 8410 - }, - { - "epoch": 0.8554302550035559, - "grad_norm": 3.421875, - "learning_rate": 2.5858374899945804e-06, - "loss": 0.179, - "step": 8420 - }, - { - "epoch": 0.8564462054251752, - "grad_norm": 3.484375, - "learning_rate": 2.550250488470135e-06, - "loss": 0.1873, - "step": 8430 - }, - { - "epoch": 0.8574621558467946, - "grad_norm": 3.984375, - "learning_rate": 2.5148969001730806e-06, - "loss": 0.1799, - "step": 8440 - }, - { - "epoch": 0.8584781062684141, - "grad_norm": 1.375, - "learning_rate": 2.4797770926796858e-06, - "loss": 0.176, - "step": 8450 - }, - { - "epoch": 0.8594940566900335, - "grad_norm": 1.8984375, - "learning_rate": 2.444891431135571e-06, - "loss": 0.1664, - "step": 8460 - }, - { - "epoch": 0.8605100071116529, - "grad_norm": 4.15625, - "learning_rate": 2.4102402782518936e-06, - "loss": 0.1512, - "step": 8470 - }, - { - "epoch": 0.8615259575332723, - "grad_norm": 1.34375, - "learning_rate": 2.3758239943016096e-06, - "loss": 0.1629, - "step": 8480 - }, - { - "epoch": 0.8625419079548918, - "grad_norm": 5.3125, - "learning_rate": 2.3416429371157013e-06, - "loss": 0.2099, - "step": 8490 - }, - { - "epoch": 0.8635578583765112, - "grad_norm": 5.9375, - "learning_rate": 2.307697462079464e-06, - "loss": 0.2221, - "step": 8500 - } - ], - "logging_steps": 10, - "max_steps": 9843, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "total_flos": 0.0, - "train_batch_size": 7, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoints/checkpoint-8500/training_args.bin b/checkpoints/checkpoint-8500/training_args.bin deleted file mode 100644 index 6bb0a4ec10ab29ca1942aecdfa4212d352e373f1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-8500/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 -size 4984 diff --git a/checkpoints/checkpoint-9000/adapter_config.json b/checkpoints/checkpoint-9000/adapter_config.json deleted file mode 100644 index 2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-9000/adapter_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "alpha_pattern": {}, - "auto_mapping": null, - "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": "gaussian", - "layer_replication": null, - "layers_pattern": null, - "layers_to_transform": null, - "loftq_config": {}, - "lora_alpha": 32, - "lora_dropout": 0.1, - "megatron_config": null, - "megatron_core": "megatron.core", - "modules_to_save": null, - "peft_type": "LORA", - "r": 8, - "rank_pattern": {}, - "revision": null, - "target_modules": [ - "v_proj", - "q_proj" - ], - "task_type": "CAUSAL_LM", - "use_dora": false, - "use_rslora": false -} \ No newline at end of file diff --git a/checkpoints/checkpoint-9000/adapter_model.safetensors b/checkpoints/checkpoint-9000/adapter_model.safetensors deleted file mode 100644 index 8527f2c40f383b1cf651d36a2984f5ff825b8ae2..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-9000/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:33bdc4e327ac26650f40cb5930deff06b7f48707f893ee77c1175a4a65f6909b -size 5919456 diff --git a/checkpoints/checkpoint-9000/optimizer.pt b/checkpoints/checkpoint-9000/optimizer.pt deleted file mode 100644 index d6bc1be7888a16d8c4b7f04fab92dd9b007b67cc..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-9000/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0638566e76474065be273b543ece321c7e8e004ee51a20c1a1d52b2838e78bb6 -size 11930938 diff --git a/checkpoints/checkpoint-9000/rng_state_0.pth b/checkpoints/checkpoint-9000/rng_state_0.pth deleted file mode 100644 index 2f6ccedf895c2d6ac8b9d081077455c1e01eb40b..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-9000/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a9ca1c49d5b46a7f77afead045cfb6ddf181a9c78168c26c7eef763523361557 -size 15024 diff --git a/checkpoints/checkpoint-9000/rng_state_1.pth b/checkpoints/checkpoint-9000/rng_state_1.pth deleted file mode 100644 index a9526156d7a2b5afe37477f57c9c8c5878b57db6..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-9000/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:321d28c2b0ca1e12bac7c5601220acbf51dba57cb5c81444c67560204eeb23e8 -size 15024 diff --git a/checkpoints/checkpoint-9000/rng_state_2.pth b/checkpoints/checkpoint-9000/rng_state_2.pth deleted file mode 100644 index cf5c72ead4046664eac6220c25e849e4fd4ae199..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-9000/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0dbe2075823c34d4e2ddbb8628dd2dfb1329f66b7c4225949042b1ffb085b4c4 -size 15024 diff --git a/checkpoints/checkpoint-9000/rng_state_3.pth b/checkpoints/checkpoint-9000/rng_state_3.pth deleted file mode 100644 index 83bd913f93637b92c9974abbc175838c2fc91877..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-9000/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:94cf730a64bd09032ea105de0e172f948a4a2bd002a9ce85d137729cfbbd0578 -size 15024 diff --git a/checkpoints/checkpoint-9000/scheduler.pt b/checkpoints/checkpoint-9000/scheduler.pt deleted file mode 100644 index 581c38180d342706ff7b6c9ed5c479819b60323e..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-9000/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:777ecd9b7cc3e4361056b976c635577f61b77b3bbef2bb60189849a484abfe13 -size 1064 diff --git a/checkpoints/checkpoint-9000/trainer_state.json b/checkpoints/checkpoint-9000/trainer_state.json deleted file mode 100644 index ef0193a85dc4d73484dbcd0610b17b8f9fd78a14..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-9000/trainer_state.json +++ /dev/null @@ -1,6321 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.9143553794574825, - "eval_steps": 500, - "global_step": 9000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001015950421619425, - "grad_norm": 17.625, - "learning_rate": 5e-06, - "loss": 3.4264, - "step": 10 - }, - { - "epoch": 0.00203190084323885, - "grad_norm": 12.5625, - "learning_rate": 1e-05, - "loss": 3.432, - "step": 20 - }, - { - "epoch": 0.003047851264858275, - "grad_norm": 14.0625, - "learning_rate": 1.5e-05, - "loss": 3.23, - "step": 30 - }, - { - "epoch": 0.0040638016864777, - "grad_norm": 12.4375, - "learning_rate": 2e-05, - "loss": 2.9762, - "step": 40 - }, - { - "epoch": 0.005079752108097125, - "grad_norm": 10.0625, - "learning_rate": 2.5e-05, - "loss": 2.6173, - "step": 50 - }, - { - "epoch": 0.00609570252971655, - "grad_norm": 10.1875, - "learning_rate": 3e-05, - "loss": 2.2004, - "step": 60 - }, - { - "epoch": 0.007111652951335975, - "grad_norm": 7.03125, - "learning_rate": 3.5e-05, - "loss": 1.4176, - "step": 70 - }, - { - "epoch": 0.0081276033729554, - "grad_norm": 4.375, - "learning_rate": 4e-05, - "loss": 1.0122, - "step": 80 - }, - { - "epoch": 0.009143553794574825, - "grad_norm": 6.5625, - "learning_rate": 4.5e-05, - "loss": 0.9116, - "step": 90 - }, - { - "epoch": 0.01015950421619425, - "grad_norm": 5.28125, - "learning_rate": 5e-05, - "loss": 0.6832, - "step": 100 - }, - { - "epoch": 0.011175454637813675, - "grad_norm": 5.5, - "learning_rate": 4.9999870035728426e-05, - "loss": 0.7355, - "step": 110 - }, - { - "epoch": 0.0121914050594331, - "grad_norm": 5.1875, - "learning_rate": 4.9999480144264944e-05, - "loss": 0.6673, - "step": 120 - }, - { - "epoch": 0.013207355481052525, - "grad_norm": 4.5, - "learning_rate": 4.9998830329663314e-05, - "loss": 0.6792, - "step": 130 - }, - { - "epoch": 0.01422330590267195, - "grad_norm": 3.9375, - "learning_rate": 4.9997920598679756e-05, - "loss": 0.6207, - "step": 140 - }, - { - "epoch": 0.015239256324291375, - "grad_norm": 3.15625, - "learning_rate": 4.999675096077286e-05, - "loss": 0.483, - "step": 150 - }, - { - "epoch": 0.0162552067459108, - "grad_norm": 5.28125, - "learning_rate": 4.999532142810354e-05, - "loss": 0.5319, - "step": 160 - }, - { - "epoch": 0.017271157167530225, - "grad_norm": 4.59375, - "learning_rate": 4.999363201553483e-05, - "loss": 0.6052, - "step": 170 - }, - { - "epoch": 0.01828710758914965, - "grad_norm": 5.03125, - "learning_rate": 4.9991682740631794e-05, - "loss": 0.4258, - "step": 180 - }, - { - "epoch": 0.019303058010769075, - "grad_norm": 3.859375, - "learning_rate": 4.998947362366133e-05, - "loss": 0.4309, - "step": 190 - }, - { - "epoch": 0.0203190084323885, - "grad_norm": 3.328125, - "learning_rate": 4.998700468759193e-05, - "loss": 0.3957, - "step": 200 - }, - { - "epoch": 0.021334958854007924, - "grad_norm": 4.9375, - "learning_rate": 4.9984275958093475e-05, - "loss": 0.4777, - "step": 210 - }, - { - "epoch": 0.02235090927562735, - "grad_norm": 4.78125, - "learning_rate": 4.998128746353695e-05, - "loss": 0.3549, - "step": 220 - }, - { - "epoch": 0.023366859697246774, - "grad_norm": 4.0625, - "learning_rate": 4.997803923499417e-05, - "loss": 0.4447, - "step": 230 - }, - { - "epoch": 0.0243828101188662, - "grad_norm": 6.375, - "learning_rate": 4.99745313062374e-05, - "loss": 0.3808, - "step": 240 - }, - { - "epoch": 0.025398760540485624, - "grad_norm": 3.59375, - "learning_rate": 4.99707637137391e-05, - "loss": 0.3827, - "step": 250 - }, - { - "epoch": 0.02641471096210505, - "grad_norm": 3.015625, - "learning_rate": 4.996673649667145e-05, - "loss": 0.3694, - "step": 260 - }, - { - "epoch": 0.027430661383724474, - "grad_norm": 2.296875, - "learning_rate": 4.9962449696906e-05, - "loss": 0.3586, - "step": 270 - }, - { - "epoch": 0.0284466118053439, - "grad_norm": 4.125, - "learning_rate": 4.9957903359013214e-05, - "loss": 0.3832, - "step": 280 - }, - { - "epoch": 0.029462562226963324, - "grad_norm": 3.296875, - "learning_rate": 4.995309753026201e-05, - "loss": 0.328, - "step": 290 - }, - { - "epoch": 0.03047851264858275, - "grad_norm": 4.5, - "learning_rate": 4.994803226061927e-05, - "loss": 0.3667, - "step": 300 - }, - { - "epoch": 0.03149446307020217, - "grad_norm": 4.3125, - "learning_rate": 4.994270760274933e-05, - "loss": 0.3811, - "step": 310 - }, - { - "epoch": 0.0325104134918216, - "grad_norm": 3.421875, - "learning_rate": 4.99371236120134e-05, - "loss": 0.3065, - "step": 320 - }, - { - "epoch": 0.03352636391344102, - "grad_norm": 4.6875, - "learning_rate": 4.993128034646902e-05, - "loss": 0.4177, - "step": 330 - }, - { - "epoch": 0.03454231433506045, - "grad_norm": 3.046875, - "learning_rate": 4.992517786686947e-05, - "loss": 0.33, - "step": 340 - }, - { - "epoch": 0.03555826475667987, - "grad_norm": 1.8828125, - "learning_rate": 4.9918816236663077e-05, - "loss": 0.3287, - "step": 350 - }, - { - "epoch": 0.0365742151782993, - "grad_norm": 3.8125, - "learning_rate": 4.991219552199262e-05, - "loss": 0.2934, - "step": 360 - }, - { - "epoch": 0.03759016559991872, - "grad_norm": 4.28125, - "learning_rate": 4.99053157916946e-05, - "loss": 0.3176, - "step": 370 - }, - { - "epoch": 0.03860611602153815, - "grad_norm": 2.609375, - "learning_rate": 4.989817711729856e-05, - "loss": 0.3318, - "step": 380 - }, - { - "epoch": 0.03962206644315757, - "grad_norm": 2.375, - "learning_rate": 4.98907795730263e-05, - "loss": 0.3234, - "step": 390 - }, - { - "epoch": 0.040638016864777, - "grad_norm": 4.46875, - "learning_rate": 4.988312323579114e-05, - "loss": 0.267, - "step": 400 - }, - { - "epoch": 0.04165396728639642, - "grad_norm": 3.75, - "learning_rate": 4.98752081851971e-05, - "loss": 0.3081, - "step": 410 - }, - { - "epoch": 0.04266991770801585, - "grad_norm": 2.203125, - "learning_rate": 4.986703450353809e-05, - "loss": 0.2917, - "step": 420 - }, - { - "epoch": 0.04368586812963527, - "grad_norm": 1.6015625, - "learning_rate": 4.985860227579703e-05, - "loss": 0.2805, - "step": 430 - }, - { - "epoch": 0.0447018185512547, - "grad_norm": 3.140625, - "learning_rate": 4.984991158964499e-05, - "loss": 0.3534, - "step": 440 - }, - { - "epoch": 0.04571776897287412, - "grad_norm": 3.296875, - "learning_rate": 4.9840962535440265e-05, - "loss": 0.335, - "step": 450 - }, - { - "epoch": 0.04673371939449355, - "grad_norm": 3.25, - "learning_rate": 4.983175520622744e-05, - "loss": 0.2544, - "step": 460 - }, - { - "epoch": 0.04774966981611297, - "grad_norm": 2.25, - "learning_rate": 4.982228969773642e-05, - "loss": 0.3449, - "step": 470 - }, - { - "epoch": 0.0487656202377324, - "grad_norm": 4.9375, - "learning_rate": 4.9812566108381435e-05, - "loss": 0.2964, - "step": 480 - }, - { - "epoch": 0.04978157065935182, - "grad_norm": 1.5703125, - "learning_rate": 4.9802584539260035e-05, - "loss": 0.2799, - "step": 490 - }, - { - "epoch": 0.05079752108097125, - "grad_norm": 2.828125, - "learning_rate": 4.979234509415199e-05, - "loss": 0.3231, - "step": 500 - }, - { - "epoch": 0.05181347150259067, - "grad_norm": 2.9375, - "learning_rate": 4.978184787951828e-05, - "loss": 0.2943, - "step": 510 - }, - { - "epoch": 0.0528294219242101, - "grad_norm": 2.34375, - "learning_rate": 4.977109300449992e-05, - "loss": 0.2705, - "step": 520 - }, - { - "epoch": 0.05384537234582952, - "grad_norm": 3.140625, - "learning_rate": 4.9760080580916876e-05, - "loss": 0.2998, - "step": 530 - }, - { - "epoch": 0.05486132276744895, - "grad_norm": 3.5625, - "learning_rate": 4.974881072326688e-05, - "loss": 0.2595, - "step": 540 - }, - { - "epoch": 0.05587727318906837, - "grad_norm": 4.25, - "learning_rate": 4.9737283548724236e-05, - "loss": 0.2803, - "step": 550 - }, - { - "epoch": 0.0568932236106878, - "grad_norm": 4.0625, - "learning_rate": 4.97254991771386e-05, - "loss": 0.3511, - "step": 560 - }, - { - "epoch": 0.05790917403230722, - "grad_norm": 2.515625, - "learning_rate": 4.971345773103377e-05, - "loss": 0.312, - "step": 570 - }, - { - "epoch": 0.05892512445392665, - "grad_norm": 3.21875, - "learning_rate": 4.9701159335606365e-05, - "loss": 0.2482, - "step": 580 - }, - { - "epoch": 0.05994107487554607, - "grad_norm": 5.5, - "learning_rate": 4.968860411872454e-05, - "loss": 0.2537, - "step": 590 - }, - { - "epoch": 0.0609570252971655, - "grad_norm": 3.546875, - "learning_rate": 4.967579221092666e-05, - "loss": 0.3125, - "step": 600 - }, - { - "epoch": 0.06197297571878492, - "grad_norm": 2.984375, - "learning_rate": 4.966272374541996e-05, - "loss": 0.2354, - "step": 610 - }, - { - "epoch": 0.06298892614040434, - "grad_norm": 3.6875, - "learning_rate": 4.964939885807912e-05, - "loss": 0.3213, - "step": 620 - }, - { - "epoch": 0.06400487656202378, - "grad_norm": 2.140625, - "learning_rate": 4.9635817687444876e-05, - "loss": 0.3003, - "step": 630 - }, - { - "epoch": 0.0650208269836432, - "grad_norm": 3.484375, - "learning_rate": 4.962198037472259e-05, - "loss": 0.2996, - "step": 640 - }, - { - "epoch": 0.06603677740526262, - "grad_norm": 3.21875, - "learning_rate": 4.9607887063780776e-05, - "loss": 0.2257, - "step": 650 - }, - { - "epoch": 0.06705272782688204, - "grad_norm": 5.375, - "learning_rate": 4.9593537901149564e-05, - "loss": 0.223, - "step": 660 - }, - { - "epoch": 0.06806867824850148, - "grad_norm": 4.1875, - "learning_rate": 4.957893303601924e-05, - "loss": 0.3407, - "step": 670 - }, - { - "epoch": 0.0690846286701209, - "grad_norm": 3.328125, - "learning_rate": 4.956407262023866e-05, - "loss": 0.2589, - "step": 680 - }, - { - "epoch": 0.07010057909174032, - "grad_norm": 2.953125, - "learning_rate": 4.954895680831367e-05, - "loss": 0.2949, - "step": 690 - }, - { - "epoch": 0.07111652951335974, - "grad_norm": 4.0625, - "learning_rate": 4.9533585757405506e-05, - "loss": 0.2995, - "step": 700 - }, - { - "epoch": 0.07213247993497918, - "grad_norm": 4.625, - "learning_rate": 4.951795962732917e-05, - "loss": 0.2894, - "step": 710 - }, - { - "epoch": 0.0731484303565986, - "grad_norm": 3.0, - "learning_rate": 4.9502078580551755e-05, - "loss": 0.3082, - "step": 720 - }, - { - "epoch": 0.07416438077821802, - "grad_norm": 3.65625, - "learning_rate": 4.9485942782190734e-05, - "loss": 0.2308, - "step": 730 - }, - { - "epoch": 0.07518033119983744, - "grad_norm": 4.78125, - "learning_rate": 4.9469552400012306e-05, - "loss": 0.2272, - "step": 740 - }, - { - "epoch": 0.07619628162145688, - "grad_norm": 4.25, - "learning_rate": 4.94529076044296e-05, - "loss": 0.2701, - "step": 750 - }, - { - "epoch": 0.0772122320430763, - "grad_norm": 3.140625, - "learning_rate": 4.94360085685009e-05, - "loss": 0.2686, - "step": 760 - }, - { - "epoch": 0.07822818246469572, - "grad_norm": 0.765625, - "learning_rate": 4.9418855467927894e-05, - "loss": 0.2051, - "step": 770 - }, - { - "epoch": 0.07924413288631514, - "grad_norm": 1.796875, - "learning_rate": 4.940144848105379e-05, - "loss": 0.2267, - "step": 780 - }, - { - "epoch": 0.08026008330793458, - "grad_norm": 4.5625, - "learning_rate": 4.93837877888615e-05, - "loss": 0.2597, - "step": 790 - }, - { - "epoch": 0.081276033729554, - "grad_norm": 3.03125, - "learning_rate": 4.9365873574971745e-05, - "loss": 0.3701, - "step": 800 - }, - { - "epoch": 0.08229198415117342, - "grad_norm": 4.5625, - "learning_rate": 4.9347706025641136e-05, - "loss": 0.2559, - "step": 810 - }, - { - "epoch": 0.08330793457279284, - "grad_norm": 3.90625, - "learning_rate": 4.9329285329760275e-05, - "loss": 0.2799, - "step": 820 - }, - { - "epoch": 0.08432388499441228, - "grad_norm": 3.140625, - "learning_rate": 4.9310611678851735e-05, - "loss": 0.2866, - "step": 830 - }, - { - "epoch": 0.0853398354160317, - "grad_norm": 2.46875, - "learning_rate": 4.929168526706811e-05, - "loss": 0.3105, - "step": 840 - }, - { - "epoch": 0.08635578583765112, - "grad_norm": 13.625, - "learning_rate": 4.927250629119e-05, - "loss": 0.2454, - "step": 850 - }, - { - "epoch": 0.08737173625927054, - "grad_norm": 3.921875, - "learning_rate": 4.9253074950623925e-05, - "loss": 0.2424, - "step": 860 - }, - { - "epoch": 0.08838768668088998, - "grad_norm": 2.90625, - "learning_rate": 4.9233391447400286e-05, - "loss": 0.2481, - "step": 870 - }, - { - "epoch": 0.0894036371025094, - "grad_norm": 2.96875, - "learning_rate": 4.921345598617125e-05, - "loss": 0.2231, - "step": 880 - }, - { - "epoch": 0.09041958752412882, - "grad_norm": 5.375, - "learning_rate": 4.9193268774208654e-05, - "loss": 0.3447, - "step": 890 - }, - { - "epoch": 0.09143553794574824, - "grad_norm": 2.0, - "learning_rate": 4.9172830021401785e-05, - "loss": 0.229, - "step": 900 - }, - { - "epoch": 0.09245148836736768, - "grad_norm": 3.1875, - "learning_rate": 4.9152139940255245e-05, - "loss": 0.2122, - "step": 910 - }, - { - "epoch": 0.0934674387889871, - "grad_norm": 3.40625, - "learning_rate": 4.913119874588677e-05, - "loss": 0.2386, - "step": 920 - }, - { - "epoch": 0.09448338921060652, - "grad_norm": 1.4609375, - "learning_rate": 4.911000665602489e-05, - "loss": 0.1944, - "step": 930 - }, - { - "epoch": 0.09549933963222594, - "grad_norm": 5.0625, - "learning_rate": 4.9088563891006786e-05, - "loss": 0.2038, - "step": 940 - }, - { - "epoch": 0.09651529005384538, - "grad_norm": 4.53125, - "learning_rate": 4.906687067377592e-05, - "loss": 0.3122, - "step": 950 - }, - { - "epoch": 0.0975312404754648, - "grad_norm": 2.84375, - "learning_rate": 4.904492722987976e-05, - "loss": 0.3157, - "step": 960 - }, - { - "epoch": 0.09854719089708422, - "grad_norm": 2.171875, - "learning_rate": 4.902273378746738e-05, - "loss": 0.3077, - "step": 970 - }, - { - "epoch": 0.09956314131870364, - "grad_norm": 2.84375, - "learning_rate": 4.9000290577287165e-05, - "loss": 0.2756, - "step": 980 - }, - { - "epoch": 0.10057909174032308, - "grad_norm": 0.99609375, - "learning_rate": 4.897759783268434e-05, - "loss": 0.2915, - "step": 990 - }, - { - "epoch": 0.1015950421619425, - "grad_norm": 3.53125, - "learning_rate": 4.895465578959859e-05, - "loss": 0.2052, - "step": 1000 - }, - { - "epoch": 0.10261099258356192, - "grad_norm": 4.0, - "learning_rate": 4.893146468656159e-05, - "loss": 0.2499, - "step": 1010 - }, - { - "epoch": 0.10362694300518134, - "grad_norm": 1.65625, - "learning_rate": 4.890802476469452e-05, - "loss": 0.278, - "step": 1020 - }, - { - "epoch": 0.10464289342680078, - "grad_norm": 3.625, - "learning_rate": 4.888433626770558e-05, - "loss": 0.2143, - "step": 1030 - }, - { - "epoch": 0.1056588438484202, - "grad_norm": 5.0625, - "learning_rate": 4.886039944188741e-05, - "loss": 0.2878, - "step": 1040 - }, - { - "epoch": 0.10667479427003962, - "grad_norm": 4.5, - "learning_rate": 4.883621453611461e-05, - "loss": 0.2744, - "step": 1050 - }, - { - "epoch": 0.10769074469165904, - "grad_norm": 4.5625, - "learning_rate": 4.881178180184106e-05, - "loss": 0.2734, - "step": 1060 - }, - { - "epoch": 0.10870669511327848, - "grad_norm": 3.125, - "learning_rate": 4.878710149309735e-05, - "loss": 0.3574, - "step": 1070 - }, - { - "epoch": 0.1097226455348979, - "grad_norm": 3.0625, - "learning_rate": 4.876217386648816e-05, - "loss": 0.2625, - "step": 1080 - }, - { - "epoch": 0.11073859595651732, - "grad_norm": 4.0625, - "learning_rate": 4.873699918118955e-05, - "loss": 0.2437, - "step": 1090 - }, - { - "epoch": 0.11175454637813674, - "grad_norm": 1.59375, - "learning_rate": 4.87115776989463e-05, - "loss": 0.2051, - "step": 1100 - }, - { - "epoch": 0.11277049679975618, - "grad_norm": 4.375, - "learning_rate": 4.8685909684069153e-05, - "loss": 0.1727, - "step": 1110 - }, - { - "epoch": 0.1137864472213756, - "grad_norm": 2.28125, - "learning_rate": 4.865999540343211e-05, - "loss": 0.2256, - "step": 1120 - }, - { - "epoch": 0.11480239764299502, - "grad_norm": 2.265625, - "learning_rate": 4.86338351264696e-05, - "loss": 0.3529, - "step": 1130 - }, - { - "epoch": 0.11581834806461444, - "grad_norm": 2.34375, - "learning_rate": 4.8607429125173754e-05, - "loss": 0.2113, - "step": 1140 - }, - { - "epoch": 0.11683429848623388, - "grad_norm": 0.7578125, - "learning_rate": 4.858077767409149e-05, - "loss": 0.2759, - "step": 1150 - }, - { - "epoch": 0.1178502489078533, - "grad_norm": 3.640625, - "learning_rate": 4.855388105032174e-05, - "loss": 0.2482, - "step": 1160 - }, - { - "epoch": 0.11886619932947272, - "grad_norm": 3.5, - "learning_rate": 4.852673953351249e-05, - "loss": 0.1865, - "step": 1170 - }, - { - "epoch": 0.11988214975109214, - "grad_norm": 3.75, - "learning_rate": 4.849935340585796e-05, - "loss": 0.2659, - "step": 1180 - }, - { - "epoch": 0.12089810017271158, - "grad_norm": 3.375, - "learning_rate": 4.8471722952095586e-05, - "loss": 0.1506, - "step": 1190 - }, - { - "epoch": 0.121914050594331, - "grad_norm": 3.34375, - "learning_rate": 4.844384845950312e-05, - "loss": 0.307, - "step": 1200 - }, - { - "epoch": 0.12293000101595042, - "grad_norm": 1.578125, - "learning_rate": 4.841573021789561e-05, - "loss": 0.1952, - "step": 1210 - }, - { - "epoch": 0.12394595143756984, - "grad_norm": 1.2890625, - "learning_rate": 4.838736851962239e-05, - "loss": 0.1779, - "step": 1220 - }, - { - "epoch": 0.12496190185918928, - "grad_norm": 1.265625, - "learning_rate": 4.835876365956408e-05, - "loss": 0.1235, - "step": 1230 - }, - { - "epoch": 0.12597785228080868, - "grad_norm": 1.9609375, - "learning_rate": 4.8329915935129436e-05, - "loss": 0.1876, - "step": 1240 - }, - { - "epoch": 0.12699380270242813, - "grad_norm": 1.6328125, - "learning_rate": 4.830082564625235e-05, - "loss": 0.2188, - "step": 1250 - }, - { - "epoch": 0.12800975312404755, - "grad_norm": 3.96875, - "learning_rate": 4.8271493095388684e-05, - "loss": 0.2622, - "step": 1260 - }, - { - "epoch": 0.12902570354566698, - "grad_norm": 3.765625, - "learning_rate": 4.824191858751312e-05, - "loss": 0.2724, - "step": 1270 - }, - { - "epoch": 0.1300416539672864, - "grad_norm": 5.59375, - "learning_rate": 4.821210243011601e-05, - "loss": 0.2413, - "step": 1280 - }, - { - "epoch": 0.13105760438890582, - "grad_norm": 3.34375, - "learning_rate": 4.818204493320016e-05, - "loss": 0.2618, - "step": 1290 - }, - { - "epoch": 0.13207355481052524, - "grad_norm": 2.78125, - "learning_rate": 4.8151746409277634e-05, - "loss": 0.2295, - "step": 1300 - }, - { - "epoch": 0.13308950523214466, - "grad_norm": 3.1875, - "learning_rate": 4.8121207173366484e-05, - "loss": 0.2733, - "step": 1310 - }, - { - "epoch": 0.13410545565376408, - "grad_norm": 2.28125, - "learning_rate": 4.809042754298746e-05, - "loss": 0.2311, - "step": 1320 - }, - { - "epoch": 0.13512140607538353, - "grad_norm": 2.171875, - "learning_rate": 4.805940783816075e-05, - "loss": 0.2059, - "step": 1330 - }, - { - "epoch": 0.13613735649700295, - "grad_norm": 2.796875, - "learning_rate": 4.8028148381402625e-05, - "loss": 0.2102, - "step": 1340 - }, - { - "epoch": 0.13715330691862238, - "grad_norm": 2.96875, - "learning_rate": 4.7996649497722084e-05, - "loss": 0.2708, - "step": 1350 - }, - { - "epoch": 0.1381692573402418, - "grad_norm": 2.4375, - "learning_rate": 4.7964911514617485e-05, - "loss": 0.2429, - "step": 1360 - }, - { - "epoch": 0.13918520776186122, - "grad_norm": 5.8125, - "learning_rate": 4.793293476207312e-05, - "loss": 0.2725, - "step": 1370 - }, - { - "epoch": 0.14020115818348064, - "grad_norm": 2.40625, - "learning_rate": 4.790071957255585e-05, - "loss": 0.2098, - "step": 1380 - }, - { - "epoch": 0.14121710860510006, - "grad_norm": 4.25, - "learning_rate": 4.786826628101154e-05, - "loss": 0.2101, - "step": 1390 - }, - { - "epoch": 0.14223305902671948, - "grad_norm": 2.578125, - "learning_rate": 4.783557522486167e-05, - "loss": 0.2624, - "step": 1400 - }, - { - "epoch": 0.14324900944833893, - "grad_norm": 3.125, - "learning_rate": 4.780264674399978e-05, - "loss": 0.2518, - "step": 1410 - }, - { - "epoch": 0.14426495986995835, - "grad_norm": 3.671875, - "learning_rate": 4.7769481180787966e-05, - "loss": 0.3112, - "step": 1420 - }, - { - "epoch": 0.14528091029157778, - "grad_norm": 3.984375, - "learning_rate": 4.773607888005327e-05, - "loss": 0.2747, - "step": 1430 - }, - { - "epoch": 0.1462968607131972, - "grad_norm": 3.234375, - "learning_rate": 4.770244018908416e-05, - "loss": 0.1572, - "step": 1440 - }, - { - "epoch": 0.14731281113481662, - "grad_norm": 4.09375, - "learning_rate": 4.766856545762687e-05, - "loss": 0.2148, - "step": 1450 - }, - { - "epoch": 0.14832876155643604, - "grad_norm": 1.6875, - "learning_rate": 4.763445503788178e-05, - "loss": 0.2531, - "step": 1460 - }, - { - "epoch": 0.14934471197805546, - "grad_norm": 2.375, - "learning_rate": 4.760010928449976e-05, - "loss": 0.199, - "step": 1470 - }, - { - "epoch": 0.15036066239967488, - "grad_norm": 4.6875, - "learning_rate": 4.7565528554578485e-05, - "loss": 0.2366, - "step": 1480 - }, - { - "epoch": 0.15137661282129433, - "grad_norm": 5.4375, - "learning_rate": 4.75307132076587e-05, - "loss": 0.1862, - "step": 1490 - }, - { - "epoch": 0.15239256324291375, - "grad_norm": 2.484375, - "learning_rate": 4.749566360572049e-05, - "loss": 0.2143, - "step": 1500 - }, - { - "epoch": 0.15340851366453317, - "grad_norm": 2.1875, - "learning_rate": 4.746038011317955e-05, - "loss": 0.1877, - "step": 1510 - }, - { - "epoch": 0.1544244640861526, - "grad_norm": 2.84375, - "learning_rate": 4.742486309688333e-05, - "loss": 0.2831, - "step": 1520 - }, - { - "epoch": 0.15544041450777202, - "grad_norm": 2.015625, - "learning_rate": 4.738911292610732e-05, - "loss": 0.1708, - "step": 1530 - }, - { - "epoch": 0.15645636492939144, - "grad_norm": 3.953125, - "learning_rate": 4.735312997255107e-05, - "loss": 0.192, - "step": 1540 - }, - { - "epoch": 0.15747231535101086, - "grad_norm": 2.09375, - "learning_rate": 4.7316914610334475e-05, - "loss": 0.2586, - "step": 1550 - }, - { - "epoch": 0.15848826577263028, - "grad_norm": 3.6875, - "learning_rate": 4.728046721599378e-05, - "loss": 0.2141, - "step": 1560 - }, - { - "epoch": 0.15950421619424973, - "grad_norm": 2.9375, - "learning_rate": 4.724378816847771e-05, - "loss": 0.193, - "step": 1570 - }, - { - "epoch": 0.16052016661586915, - "grad_norm": 1.5625, - "learning_rate": 4.720687784914352e-05, - "loss": 0.191, - "step": 1580 - }, - { - "epoch": 0.16153611703748857, - "grad_norm": 3.75, - "learning_rate": 4.716973664175304e-05, - "loss": 0.2172, - "step": 1590 - }, - { - "epoch": 0.162552067459108, - "grad_norm": 3.125, - "learning_rate": 4.7132364932468645e-05, - "loss": 0.2134, - "step": 1600 - }, - { - "epoch": 0.16356801788072742, - "grad_norm": 4.09375, - "learning_rate": 4.709476310984932e-05, - "loss": 0.2055, - "step": 1610 - }, - { - "epoch": 0.16458396830234684, - "grad_norm": 3.875, - "learning_rate": 4.705693156484652e-05, - "loss": 0.2136, - "step": 1620 - }, - { - "epoch": 0.16559991872396626, - "grad_norm": 1.1796875, - "learning_rate": 4.7018870690800196e-05, - "loss": 0.1471, - "step": 1630 - }, - { - "epoch": 0.16661586914558568, - "grad_norm": 2.5, - "learning_rate": 4.698058088343465e-05, - "loss": 0.2308, - "step": 1640 - }, - { - "epoch": 0.16763181956720513, - "grad_norm": 1.390625, - "learning_rate": 4.6942062540854425e-05, - "loss": 0.2456, - "step": 1650 - }, - { - "epoch": 0.16864776998882455, - "grad_norm": 3.125, - "learning_rate": 4.69033160635402e-05, - "loss": 0.2654, - "step": 1660 - }, - { - "epoch": 0.16966372041044397, - "grad_norm": 3.984375, - "learning_rate": 4.6864341854344587e-05, - "loss": 0.2226, - "step": 1670 - }, - { - "epoch": 0.1706796708320634, - "grad_norm": 2.328125, - "learning_rate": 4.682514031848795e-05, - "loss": 0.2438, - "step": 1680 - }, - { - "epoch": 0.17169562125368282, - "grad_norm": 3.078125, - "learning_rate": 4.678571186355423e-05, - "loss": 0.1889, - "step": 1690 - }, - { - "epoch": 0.17271157167530224, - "grad_norm": 3.328125, - "learning_rate": 4.6746056899486644e-05, - "loss": 0.2117, - "step": 1700 - }, - { - "epoch": 0.17372752209692166, - "grad_norm": 2.78125, - "learning_rate": 4.67061758385835e-05, - "loss": 0.1953, - "step": 1710 - }, - { - "epoch": 0.17474347251854108, - "grad_norm": 3.09375, - "learning_rate": 4.6666069095493816e-05, - "loss": 0.1844, - "step": 1720 - }, - { - "epoch": 0.17575942294016053, - "grad_norm": 3.234375, - "learning_rate": 4.662573708721309e-05, - "loss": 0.2774, - "step": 1730 - }, - { - "epoch": 0.17677537336177995, - "grad_norm": 4.03125, - "learning_rate": 4.658518023307894e-05, - "loss": 0.2527, - "step": 1740 - }, - { - "epoch": 0.17779132378339937, - "grad_norm": 3.21875, - "learning_rate": 4.654439895476671e-05, - "loss": 0.2164, - "step": 1750 - }, - { - "epoch": 0.1788072742050188, - "grad_norm": 2.390625, - "learning_rate": 4.6503393676285146e-05, - "loss": 0.2424, - "step": 1760 - }, - { - "epoch": 0.17982322462663822, - "grad_norm": 1.8359375, - "learning_rate": 4.646216482397192e-05, - "loss": 0.2428, - "step": 1770 - }, - { - "epoch": 0.18083917504825764, - "grad_norm": 2.796875, - "learning_rate": 4.6420712826489275e-05, - "loss": 0.2155, - "step": 1780 - }, - { - "epoch": 0.18185512546987706, - "grad_norm": 0.69921875, - "learning_rate": 4.6379038114819485e-05, - "loss": 0.1544, - "step": 1790 - }, - { - "epoch": 0.18287107589149648, - "grad_norm": 3.40625, - "learning_rate": 4.6337141122260444e-05, - "loss": 0.2029, - "step": 1800 - }, - { - "epoch": 0.18388702631311593, - "grad_norm": 2.359375, - "learning_rate": 4.629502228442112e-05, - "loss": 0.1489, - "step": 1810 - }, - { - "epoch": 0.18490297673473535, - "grad_norm": 1.4453125, - "learning_rate": 4.6252682039217045e-05, - "loss": 0.2101, - "step": 1820 - }, - { - "epoch": 0.18591892715635477, - "grad_norm": 2.71875, - "learning_rate": 4.621012082686573e-05, - "loss": 0.2076, - "step": 1830 - }, - { - "epoch": 0.1869348775779742, - "grad_norm": 3.0625, - "learning_rate": 4.616733908988216e-05, - "loss": 0.2719, - "step": 1840 - }, - { - "epoch": 0.18795082799959362, - "grad_norm": 1.953125, - "learning_rate": 4.612433727307409e-05, - "loss": 0.2105, - "step": 1850 - }, - { - "epoch": 0.18896677842121304, - "grad_norm": 3.46875, - "learning_rate": 4.608111582353751e-05, - "loss": 0.1877, - "step": 1860 - }, - { - "epoch": 0.18998272884283246, - "grad_norm": 2.546875, - "learning_rate": 4.603767519065197e-05, - "loss": 0.2238, - "step": 1870 - }, - { - "epoch": 0.19099867926445188, - "grad_norm": 1.5703125, - "learning_rate": 4.599401582607589e-05, - "loss": 0.243, - "step": 1880 - }, - { - "epoch": 0.19201462968607133, - "grad_norm": 2.4375, - "learning_rate": 4.595013818374185e-05, - "loss": 0.1867, - "step": 1890 - }, - { - "epoch": 0.19303058010769075, - "grad_norm": 2.203125, - "learning_rate": 4.5906042719851925e-05, - "loss": 0.1994, - "step": 1900 - }, - { - "epoch": 0.19404653052931017, - "grad_norm": 3.984375, - "learning_rate": 4.586172989287291e-05, - "loss": 0.1899, - "step": 1910 - }, - { - "epoch": 0.1950624809509296, - "grad_norm": 2.6875, - "learning_rate": 4.5817200163531534e-05, - "loss": 0.2528, - "step": 1920 - }, - { - "epoch": 0.19607843137254902, - "grad_norm": 2.71875, - "learning_rate": 4.577245399480972e-05, - "loss": 0.2336, - "step": 1930 - }, - { - "epoch": 0.19709438179416844, - "grad_norm": 2.640625, - "learning_rate": 4.5727491851939715e-05, - "loss": 0.2204, - "step": 1940 - }, - { - "epoch": 0.19811033221578786, - "grad_norm": 1.78125, - "learning_rate": 4.568231420239929e-05, - "loss": 0.1656, - "step": 1950 - }, - { - "epoch": 0.19912628263740728, - "grad_norm": 3.15625, - "learning_rate": 4.563692151590687e-05, - "loss": 0.2105, - "step": 1960 - }, - { - "epoch": 0.20014223305902673, - "grad_norm": 1.3671875, - "learning_rate": 4.5591314264416666e-05, - "loss": 0.1464, - "step": 1970 - }, - { - "epoch": 0.20115818348064615, - "grad_norm": 4.25, - "learning_rate": 4.554549292211371e-05, - "loss": 0.2103, - "step": 1980 - }, - { - "epoch": 0.20217413390226557, - "grad_norm": 2.984375, - "learning_rate": 4.549945796540901e-05, - "loss": 0.144, - "step": 1990 - }, - { - "epoch": 0.203190084323885, - "grad_norm": 1.859375, - "learning_rate": 4.545320987293453e-05, - "loss": 0.1963, - "step": 2000 - }, - { - "epoch": 0.20420603474550442, - "grad_norm": 1.078125, - "learning_rate": 4.540674912553824e-05, - "loss": 0.2115, - "step": 2010 - }, - { - "epoch": 0.20522198516712384, - "grad_norm": 4.25, - "learning_rate": 4.536007620627911e-05, - "loss": 0.1682, - "step": 2020 - }, - { - "epoch": 0.20623793558874326, - "grad_norm": 2.71875, - "learning_rate": 4.531319160042212e-05, - "loss": 0.1992, - "step": 2030 - }, - { - "epoch": 0.20725388601036268, - "grad_norm": 1.2890625, - "learning_rate": 4.5266095795433126e-05, - "loss": 0.1134, - "step": 2040 - }, - { - "epoch": 0.20826983643198213, - "grad_norm": 3.296875, - "learning_rate": 4.5218789280973925e-05, - "loss": 0.1474, - "step": 2050 - }, - { - "epoch": 0.20928578685360155, - "grad_norm": 1.9375, - "learning_rate": 4.5171272548897024e-05, - "loss": 0.1955, - "step": 2060 - }, - { - "epoch": 0.21030173727522097, - "grad_norm": 2.734375, - "learning_rate": 4.512354609324063e-05, - "loss": 0.2042, - "step": 2070 - }, - { - "epoch": 0.2113176876968404, - "grad_norm": 2.921875, - "learning_rate": 4.507561041022347e-05, - "loss": 0.2174, - "step": 2080 - }, - { - "epoch": 0.21233363811845982, - "grad_norm": 2.40625, - "learning_rate": 4.502746599823963e-05, - "loss": 0.2634, - "step": 2090 - }, - { - "epoch": 0.21334958854007924, - "grad_norm": 1.71875, - "learning_rate": 4.497911335785339e-05, - "loss": 0.1884, - "step": 2100 - }, - { - "epoch": 0.21436553896169866, - "grad_norm": 0.79296875, - "learning_rate": 4.4930552991794e-05, - "loss": 0.1872, - "step": 2110 - }, - { - "epoch": 0.21538148938331808, - "grad_norm": 3.171875, - "learning_rate": 4.4881785404950474e-05, - "loss": 0.2233, - "step": 2120 - }, - { - "epoch": 0.21639743980493753, - "grad_norm": 2.59375, - "learning_rate": 4.483281110436631e-05, - "loss": 0.2374, - "step": 2130 - }, - { - "epoch": 0.21741339022655695, - "grad_norm": 3.328125, - "learning_rate": 4.478363059923426e-05, - "loss": 0.2545, - "step": 2140 - }, - { - "epoch": 0.21842934064817637, - "grad_norm": 2.3125, - "learning_rate": 4.4734244400891014e-05, - "loss": 0.2063, - "step": 2150 - }, - { - "epoch": 0.2194452910697958, - "grad_norm": 3.40625, - "learning_rate": 4.4684653022811865e-05, - "loss": 0.1219, - "step": 2160 - }, - { - "epoch": 0.22046124149141522, - "grad_norm": 4.1875, - "learning_rate": 4.463485698060541e-05, - "loss": 0.2805, - "step": 2170 - }, - { - "epoch": 0.22147719191303464, - "grad_norm": 2.3125, - "learning_rate": 4.458485679200814e-05, - "loss": 0.1998, - "step": 2180 - }, - { - "epoch": 0.22249314233465406, - "grad_norm": 3.578125, - "learning_rate": 4.453465297687912e-05, - "loss": 0.2489, - "step": 2190 - }, - { - "epoch": 0.22350909275627348, - "grad_norm": 2.59375, - "learning_rate": 4.448424605719452e-05, - "loss": 0.2731, - "step": 2200 - }, - { - "epoch": 0.22452504317789293, - "grad_norm": 3.28125, - "learning_rate": 4.443363655704224e-05, - "loss": 0.2425, - "step": 2210 - }, - { - "epoch": 0.22554099359951235, - "grad_norm": 2.78125, - "learning_rate": 4.438282500261641e-05, - "loss": 0.2938, - "step": 2220 - }, - { - "epoch": 0.22655694402113177, - "grad_norm": 1.1953125, - "learning_rate": 4.433181192221197e-05, - "loss": 0.1728, - "step": 2230 - }, - { - "epoch": 0.2275728944427512, - "grad_norm": 1.34375, - "learning_rate": 4.4280597846219155e-05, - "loss": 0.216, - "step": 2240 - }, - { - "epoch": 0.22858884486437062, - "grad_norm": 1.8515625, - "learning_rate": 4.422918330711796e-05, - "loss": 0.1612, - "step": 2250 - }, - { - "epoch": 0.22960479528599004, - "grad_norm": 1.90625, - "learning_rate": 4.417756883947263e-05, - "loss": 0.107, - "step": 2260 - }, - { - "epoch": 0.23062074570760946, - "grad_norm": 3.375, - "learning_rate": 4.412575497992611e-05, - "loss": 0.1756, - "step": 2270 - }, - { - "epoch": 0.23163669612922888, - "grad_norm": 4.375, - "learning_rate": 4.407374226719445e-05, - "loss": 0.234, - "step": 2280 - }, - { - "epoch": 0.23265264655084833, - "grad_norm": 3.25, - "learning_rate": 4.402153124206119e-05, - "loss": 0.2144, - "step": 2290 - }, - { - "epoch": 0.23366859697246775, - "grad_norm": 1.703125, - "learning_rate": 4.396912244737173e-05, - "loss": 0.1696, - "step": 2300 - }, - { - "epoch": 0.23468454739408717, - "grad_norm": 2.84375, - "learning_rate": 4.391651642802778e-05, - "loss": 0.2506, - "step": 2310 - }, - { - "epoch": 0.2357004978157066, - "grad_norm": 4.5, - "learning_rate": 4.386371373098155e-05, - "loss": 0.1686, - "step": 2320 - }, - { - "epoch": 0.23671644823732602, - "grad_norm": 2.515625, - "learning_rate": 4.381071490523018e-05, - "loss": 0.2403, - "step": 2330 - }, - { - "epoch": 0.23773239865894544, - "grad_norm": 4.4375, - "learning_rate": 4.3757520501809955e-05, - "loss": 0.1611, - "step": 2340 - }, - { - "epoch": 0.23874834908056486, - "grad_norm": 1.609375, - "learning_rate": 4.370413107379065e-05, - "loss": 0.1698, - "step": 2350 - }, - { - "epoch": 0.23976429950218428, - "grad_norm": 4.96875, - "learning_rate": 4.36505471762697e-05, - "loss": 0.1928, - "step": 2360 - }, - { - "epoch": 0.24078024992380373, - "grad_norm": 0.8984375, - "learning_rate": 4.3596769366366474e-05, - "loss": 0.2035, - "step": 2370 - }, - { - "epoch": 0.24179620034542315, - "grad_norm": 5.75, - "learning_rate": 4.354279820321649e-05, - "loss": 0.16, - "step": 2380 - }, - { - "epoch": 0.24281215076704257, - "grad_norm": 1.9453125, - "learning_rate": 4.34886342479656e-05, - "loss": 0.1851, - "step": 2390 - }, - { - "epoch": 0.243828101188662, - "grad_norm": 1.015625, - "learning_rate": 4.34342780637641e-05, - "loss": 0.1726, - "step": 2400 - }, - { - "epoch": 0.24484405161028142, - "grad_norm": 4.59375, - "learning_rate": 4.337973021576095e-05, - "loss": 0.2847, - "step": 2410 - }, - { - "epoch": 0.24586000203190084, - "grad_norm": 1.03125, - "learning_rate": 4.3324991271097846e-05, - "loss": 0.2528, - "step": 2420 - }, - { - "epoch": 0.24687595245352026, - "grad_norm": 2.1875, - "learning_rate": 4.3270061798903374e-05, - "loss": 0.1573, - "step": 2430 - }, - { - "epoch": 0.24789190287513968, - "grad_norm": 0.98046875, - "learning_rate": 4.321494237028701e-05, - "loss": 0.1703, - "step": 2440 - }, - { - "epoch": 0.24890785329675913, - "grad_norm": 3.8125, - "learning_rate": 4.31596335583333e-05, - "loss": 0.2613, - "step": 2450 - }, - { - "epoch": 0.24992380371837855, - "grad_norm": 4.0625, - "learning_rate": 4.310413593809579e-05, - "loss": 0.22, - "step": 2460 - }, - { - "epoch": 0.250939754139998, - "grad_norm": 3.15625, - "learning_rate": 4.304845008659108e-05, - "loss": 0.1263, - "step": 2470 - }, - { - "epoch": 0.25195570456161737, - "grad_norm": 3.046875, - "learning_rate": 4.2992576582792895e-05, - "loss": 0.1639, - "step": 2480 - }, - { - "epoch": 0.2529716549832368, - "grad_norm": 9.8125, - "learning_rate": 4.293651600762595e-05, - "loss": 0.2681, - "step": 2490 - }, - { - "epoch": 0.25398760540485626, - "grad_norm": 3.734375, - "learning_rate": 4.288026894395999e-05, - "loss": 0.2292, - "step": 2500 - }, - { - "epoch": 0.25500355582647566, - "grad_norm": 0.455078125, - "learning_rate": 4.2823835976603723e-05, - "loss": 0.2324, - "step": 2510 - }, - { - "epoch": 0.2560195062480951, - "grad_norm": 5.625, - "learning_rate": 4.276721769229869e-05, - "loss": 0.1834, - "step": 2520 - }, - { - "epoch": 0.2570354566697145, - "grad_norm": 1.3671875, - "learning_rate": 4.271041467971323e-05, - "loss": 0.1826, - "step": 2530 - }, - { - "epoch": 0.25805140709133395, - "grad_norm": 5.0625, - "learning_rate": 4.265342752943632e-05, - "loss": 0.2463, - "step": 2540 - }, - { - "epoch": 0.25906735751295334, - "grad_norm": 2.859375, - "learning_rate": 4.2596256833971425e-05, - "loss": 0.2598, - "step": 2550 - }, - { - "epoch": 0.2600833079345728, - "grad_norm": 1.8515625, - "learning_rate": 4.2538903187730374e-05, - "loss": 0.1148, - "step": 2560 - }, - { - "epoch": 0.26109925835619224, - "grad_norm": 2.71875, - "learning_rate": 4.248136718702716e-05, - "loss": 0.2123, - "step": 2570 - }, - { - "epoch": 0.26211520877781164, - "grad_norm": 4.5625, - "learning_rate": 4.242364943007172e-05, - "loss": 0.2369, - "step": 2580 - }, - { - "epoch": 0.2631311591994311, - "grad_norm": 2.296875, - "learning_rate": 4.236575051696377e-05, - "loss": 0.261, - "step": 2590 - }, - { - "epoch": 0.2641471096210505, - "grad_norm": 2.75, - "learning_rate": 4.2307671049686514e-05, - "loss": 0.1564, - "step": 2600 - }, - { - "epoch": 0.26516306004266993, - "grad_norm": 3.5, - "learning_rate": 4.2249411632100396e-05, - "loss": 0.1563, - "step": 2610 - }, - { - "epoch": 0.2661790104642893, - "grad_norm": 2.84375, - "learning_rate": 4.219097286993684e-05, - "loss": 0.1697, - "step": 2620 - }, - { - "epoch": 0.26719496088590877, - "grad_norm": 2.125, - "learning_rate": 4.2132355370791946e-05, - "loss": 0.1844, - "step": 2630 - }, - { - "epoch": 0.26821091130752817, - "grad_norm": 4.03125, - "learning_rate": 4.2073559744120156e-05, - "loss": 0.2144, - "step": 2640 - }, - { - "epoch": 0.2692268617291476, - "grad_norm": 2.375, - "learning_rate": 4.201458660122793e-05, - "loss": 0.2013, - "step": 2650 - }, - { - "epoch": 0.27024281215076706, - "grad_norm": 3.625, - "learning_rate": 4.1955436555267393e-05, - "loss": 0.2166, - "step": 2660 - }, - { - "epoch": 0.27125876257238646, - "grad_norm": 0.328125, - "learning_rate": 4.189611022122997e-05, - "loss": 0.1934, - "step": 2670 - }, - { - "epoch": 0.2722747129940059, - "grad_norm": 2.75, - "learning_rate": 4.1836608215939944e-05, - "loss": 0.2157, - "step": 2680 - }, - { - "epoch": 0.2732906634156253, - "grad_norm": 3.5, - "learning_rate": 4.17769311580481e-05, - "loss": 0.18, - "step": 2690 - }, - { - "epoch": 0.27430661383724475, - "grad_norm": 2.109375, - "learning_rate": 4.171707966802528e-05, - "loss": 0.2178, - "step": 2700 - }, - { - "epoch": 0.27532256425886414, - "grad_norm": 4.65625, - "learning_rate": 4.16570543681559e-05, - "loss": 0.1896, - "step": 2710 - }, - { - "epoch": 0.2763385146804836, - "grad_norm": 4.8125, - "learning_rate": 4.159685588253151e-05, - "loss": 0.1322, - "step": 2720 - }, - { - "epoch": 0.27735446510210304, - "grad_norm": 3.9375, - "learning_rate": 4.153648483704429e-05, - "loss": 0.184, - "step": 2730 - }, - { - "epoch": 0.27837041552372244, - "grad_norm": 4.53125, - "learning_rate": 4.147594185938057e-05, - "loss": 0.2451, - "step": 2740 - }, - { - "epoch": 0.2793863659453419, - "grad_norm": 1.0390625, - "learning_rate": 4.141522757901426e-05, - "loss": 0.2367, - "step": 2750 - }, - { - "epoch": 0.2804023163669613, - "grad_norm": 3.375, - "learning_rate": 4.1354342627200345e-05, - "loss": 0.179, - "step": 2760 - }, - { - "epoch": 0.28141826678858073, - "grad_norm": 2.953125, - "learning_rate": 4.1293287636968286e-05, - "loss": 0.1396, - "step": 2770 - }, - { - "epoch": 0.2824342172102001, - "grad_norm": 2.546875, - "learning_rate": 4.1232063243115485e-05, - "loss": 0.1963, - "step": 2780 - }, - { - "epoch": 0.28345016763181957, - "grad_norm": 5.09375, - "learning_rate": 4.117067008220063e-05, - "loss": 0.2457, - "step": 2790 - }, - { - "epoch": 0.28446611805343897, - "grad_norm": 2.046875, - "learning_rate": 4.110910879253712e-05, - "loss": 0.2262, - "step": 2800 - }, - { - "epoch": 0.2854820684750584, - "grad_norm": 2.1875, - "learning_rate": 4.104738001418641e-05, - "loss": 0.2499, - "step": 2810 - }, - { - "epoch": 0.28649801889667786, - "grad_norm": 2.59375, - "learning_rate": 4.098548438895135e-05, - "loss": 0.1667, - "step": 2820 - }, - { - "epoch": 0.28751396931829726, - "grad_norm": 2.875, - "learning_rate": 4.092342256036954e-05, - "loss": 0.2288, - "step": 2830 - }, - { - "epoch": 0.2885299197399167, - "grad_norm": 3.015625, - "learning_rate": 4.086119517370659e-05, - "loss": 0.2038, - "step": 2840 - }, - { - "epoch": 0.2895458701615361, - "grad_norm": 3.53125, - "learning_rate": 4.0798802875949485e-05, - "loss": 0.181, - "step": 2850 - }, - { - "epoch": 0.29056182058315555, - "grad_norm": 2.296875, - "learning_rate": 4.073624631579975e-05, - "loss": 0.1886, - "step": 2860 - }, - { - "epoch": 0.29157777100477494, - "grad_norm": 3.609375, - "learning_rate": 4.067352614366685e-05, - "loss": 0.2053, - "step": 2870 - }, - { - "epoch": 0.2925937214263944, - "grad_norm": 2.328125, - "learning_rate": 4.061064301166128e-05, - "loss": 0.1409, - "step": 2880 - }, - { - "epoch": 0.29360967184801384, - "grad_norm": 4.9375, - "learning_rate": 4.054759757358787e-05, - "loss": 0.184, - "step": 2890 - }, - { - "epoch": 0.29462562226963324, - "grad_norm": 4.6875, - "learning_rate": 4.048439048493898e-05, - "loss": 0.2306, - "step": 2900 - }, - { - "epoch": 0.2956415726912527, - "grad_norm": 4.09375, - "learning_rate": 4.0421022402887676e-05, - "loss": 0.1914, - "step": 2910 - }, - { - "epoch": 0.2966575231128721, - "grad_norm": 2.3125, - "learning_rate": 4.035749398628088e-05, - "loss": 0.1653, - "step": 2920 - }, - { - "epoch": 0.29767347353449153, - "grad_norm": 2.515625, - "learning_rate": 4.029380589563256e-05, - "loss": 0.1941, - "step": 2930 - }, - { - "epoch": 0.2986894239561109, - "grad_norm": 1.78125, - "learning_rate": 4.02299587931168e-05, - "loss": 0.1117, - "step": 2940 - }, - { - "epoch": 0.29970537437773037, - "grad_norm": 0.8359375, - "learning_rate": 4.0165953342560974e-05, - "loss": 0.1605, - "step": 2950 - }, - { - "epoch": 0.30072132479934977, - "grad_norm": 3.046875, - "learning_rate": 4.010179020943884e-05, - "loss": 0.1726, - "step": 2960 - }, - { - "epoch": 0.3017372752209692, - "grad_norm": 3.453125, - "learning_rate": 4.003747006086357e-05, - "loss": 0.2208, - "step": 2970 - }, - { - "epoch": 0.30275322564258866, - "grad_norm": 2.515625, - "learning_rate": 3.9972993565580866e-05, - "loss": 0.1325, - "step": 2980 - }, - { - "epoch": 0.30376917606420806, - "grad_norm": 3.046875, - "learning_rate": 3.9908361393962e-05, - "loss": 0.2014, - "step": 2990 - }, - { - "epoch": 0.3047851264858275, - "grad_norm": 2.28125, - "learning_rate": 3.984357421799681e-05, - "loss": 0.165, - "step": 3000 - }, - { - "epoch": 0.3058010769074469, - "grad_norm": 5.09375, - "learning_rate": 3.9778632711286756e-05, - "loss": 0.212, - "step": 3010 - }, - { - "epoch": 0.30681702732906635, - "grad_norm": 4.25, - "learning_rate": 3.971353754903788e-05, - "loss": 0.2388, - "step": 3020 - }, - { - "epoch": 0.30783297775068574, - "grad_norm": 2.34375, - "learning_rate": 3.964828940805381e-05, - "loss": 0.2175, - "step": 3030 - }, - { - "epoch": 0.3088489281723052, - "grad_norm": 4.09375, - "learning_rate": 3.95828889667287e-05, - "loss": 0.2088, - "step": 3040 - }, - { - "epoch": 0.30986487859392464, - "grad_norm": 2.359375, - "learning_rate": 3.9517336905040244e-05, - "loss": 0.1913, - "step": 3050 - }, - { - "epoch": 0.31088082901554404, - "grad_norm": 1.1640625, - "learning_rate": 3.9451633904542483e-05, - "loss": 0.2185, - "step": 3060 - }, - { - "epoch": 0.3118967794371635, - "grad_norm": 2.59375, - "learning_rate": 3.9385780648358846e-05, - "loss": 0.2072, - "step": 3070 - }, - { - "epoch": 0.3129127298587829, - "grad_norm": 3.015625, - "learning_rate": 3.9319777821174955e-05, - "loss": 0.1902, - "step": 3080 - }, - { - "epoch": 0.31392868028040233, - "grad_norm": 2.375, - "learning_rate": 3.925362610923158e-05, - "loss": 0.259, - "step": 3090 - }, - { - "epoch": 0.3149446307020217, - "grad_norm": 4.65625, - "learning_rate": 3.918732620031742e-05, - "loss": 0.2026, - "step": 3100 - }, - { - "epoch": 0.31596058112364117, - "grad_norm": 2.1875, - "learning_rate": 3.912087878376205e-05, - "loss": 0.1478, - "step": 3110 - }, - { - "epoch": 0.31697653154526056, - "grad_norm": 2.34375, - "learning_rate": 3.905428455042865e-05, - "loss": 0.167, - "step": 3120 - }, - { - "epoch": 0.31799248196688, - "grad_norm": 2.390625, - "learning_rate": 3.898754419270693e-05, - "loss": 0.1629, - "step": 3130 - }, - { - "epoch": 0.31900843238849946, - "grad_norm": 1.546875, - "learning_rate": 3.892065840450583e-05, - "loss": 0.1308, - "step": 3140 - }, - { - "epoch": 0.32002438281011886, - "grad_norm": 4.625, - "learning_rate": 3.885362788124637e-05, - "loss": 0.2008, - "step": 3150 - }, - { - "epoch": 0.3210403332317383, - "grad_norm": 3.8125, - "learning_rate": 3.8786453319854396e-05, - "loss": 0.2225, - "step": 3160 - }, - { - "epoch": 0.3220562836533577, - "grad_norm": 3.015625, - "learning_rate": 3.8719135418753366e-05, - "loss": 0.2243, - "step": 3170 - }, - { - "epoch": 0.32307223407497715, - "grad_norm": 5.6875, - "learning_rate": 3.865167487785702e-05, - "loss": 0.1981, - "step": 3180 - }, - { - "epoch": 0.32408818449659654, - "grad_norm": 4.84375, - "learning_rate": 3.8584072398562164e-05, - "loss": 0.2031, - "step": 3190 - }, - { - "epoch": 0.325104134918216, - "grad_norm": 4.0625, - "learning_rate": 3.851632868374136e-05, - "loss": 0.1621, - "step": 3200 - }, - { - "epoch": 0.32612008533983544, - "grad_norm": 3.421875, - "learning_rate": 3.844844443773562e-05, - "loss": 0.1674, - "step": 3210 - }, - { - "epoch": 0.32713603576145484, - "grad_norm": 1.3671875, - "learning_rate": 3.8380420366347046e-05, - "loss": 0.1502, - "step": 3220 - }, - { - "epoch": 0.3281519861830743, - "grad_norm": 3.734375, - "learning_rate": 3.831225717683157e-05, - "loss": 0.1868, - "step": 3230 - }, - { - "epoch": 0.3291679366046937, - "grad_norm": 2.703125, - "learning_rate": 3.8243955577891534e-05, - "loss": 0.1818, - "step": 3240 - }, - { - "epoch": 0.3301838870263131, - "grad_norm": 3.796875, - "learning_rate": 3.8175516279668335e-05, - "loss": 0.2215, - "step": 3250 - }, - { - "epoch": 0.3311998374479325, - "grad_norm": 3.203125, - "learning_rate": 3.810693999373505e-05, - "loss": 0.2544, - "step": 3260 - }, - { - "epoch": 0.33221578786955197, - "grad_norm": 4.0, - "learning_rate": 3.8038227433089056e-05, - "loss": 0.1175, - "step": 3270 - }, - { - "epoch": 0.33323173829117136, - "grad_norm": 3.625, - "learning_rate": 3.796937931214458e-05, - "loss": 0.2213, - "step": 3280 - }, - { - "epoch": 0.3342476887127908, - "grad_norm": 1.7265625, - "learning_rate": 3.7900396346725296e-05, - "loss": 0.1711, - "step": 3290 - }, - { - "epoch": 0.33526363913441026, - "grad_norm": 3.140625, - "learning_rate": 3.783127925405686e-05, - "loss": 0.2628, - "step": 3300 - }, - { - "epoch": 0.33627958955602966, - "grad_norm": 2.1875, - "learning_rate": 3.77620287527595e-05, - "loss": 0.1671, - "step": 3310 - }, - { - "epoch": 0.3372955399776491, - "grad_norm": 5.28125, - "learning_rate": 3.769264556284048e-05, - "loss": 0.2109, - "step": 3320 - }, - { - "epoch": 0.3383114903992685, - "grad_norm": 2.875, - "learning_rate": 3.762313040568665e-05, - "loss": 0.1978, - "step": 3330 - }, - { - "epoch": 0.33932744082088795, - "grad_norm": 2.234375, - "learning_rate": 3.755348400405697e-05, - "loss": 0.1275, - "step": 3340 - }, - { - "epoch": 0.34034339124250734, - "grad_norm": 1.9453125, - "learning_rate": 3.7483707082074945e-05, - "loss": 0.1482, - "step": 3350 - }, - { - "epoch": 0.3413593416641268, - "grad_norm": 5.40625, - "learning_rate": 3.741380036522111e-05, - "loss": 0.1933, - "step": 3360 - }, - { - "epoch": 0.34237529208574624, - "grad_norm": 4.53125, - "learning_rate": 3.734376458032551e-05, - "loss": 0.1925, - "step": 3370 - }, - { - "epoch": 0.34339124250736563, - "grad_norm": 4.0625, - "learning_rate": 3.727360045556014e-05, - "loss": 0.2297, - "step": 3380 - }, - { - "epoch": 0.3444071929289851, - "grad_norm": 2.53125, - "learning_rate": 3.7203308720431336e-05, - "loss": 0.1704, - "step": 3390 - }, - { - "epoch": 0.3454231433506045, - "grad_norm": 1.859375, - "learning_rate": 3.7132890105772234e-05, - "loss": 0.258, - "step": 3400 - }, - { - "epoch": 0.3464390937722239, - "grad_norm": 3.90625, - "learning_rate": 3.706234534373515e-05, - "loss": 0.2376, - "step": 3410 - }, - { - "epoch": 0.3474550441938433, - "grad_norm": 1.1015625, - "learning_rate": 3.6991675167783985e-05, - "loss": 0.2403, - "step": 3420 - }, - { - "epoch": 0.34847099461546277, - "grad_norm": 1.1640625, - "learning_rate": 3.6920880312686556e-05, - "loss": 0.1642, - "step": 3430 - }, - { - "epoch": 0.34948694503708216, - "grad_norm": 2.875, - "learning_rate": 3.684996151450702e-05, - "loss": 0.1455, - "step": 3440 - }, - { - "epoch": 0.3505028954587016, - "grad_norm": 0.59765625, - "learning_rate": 3.6778919510598155e-05, - "loss": 0.2175, - "step": 3450 - }, - { - "epoch": 0.35151884588032106, - "grad_norm": 0.93359375, - "learning_rate": 3.670775503959376e-05, - "loss": 0.1858, - "step": 3460 - }, - { - "epoch": 0.35253479630194046, - "grad_norm": 4.1875, - "learning_rate": 3.6636468841400917e-05, - "loss": 0.1911, - "step": 3470 - }, - { - "epoch": 0.3535507467235599, - "grad_norm": 3.734375, - "learning_rate": 3.656506165719233e-05, - "loss": 0.2114, - "step": 3480 - }, - { - "epoch": 0.3545666971451793, - "grad_norm": 1.171875, - "learning_rate": 3.649353422939863e-05, - "loss": 0.1841, - "step": 3490 - }, - { - "epoch": 0.35558264756679875, - "grad_norm": 2.53125, - "learning_rate": 3.6421887301700615e-05, - "loss": 0.1505, - "step": 3500 - }, - { - "epoch": 0.35659859798841814, - "grad_norm": 4.9375, - "learning_rate": 3.6350121619021524e-05, - "loss": 0.2625, - "step": 3510 - }, - { - "epoch": 0.3576145484100376, - "grad_norm": 5.25, - "learning_rate": 3.627823792751936e-05, - "loss": 0.1676, - "step": 3520 - }, - { - "epoch": 0.35863049883165704, - "grad_norm": 1.09375, - "learning_rate": 3.620623697457905e-05, - "loss": 0.1963, - "step": 3530 - }, - { - "epoch": 0.35964644925327643, - "grad_norm": 4.03125, - "learning_rate": 3.613411950880468e-05, - "loss": 0.2048, - "step": 3540 - }, - { - "epoch": 0.3606623996748959, - "grad_norm": 4.40625, - "learning_rate": 3.606188628001178e-05, - "loss": 0.226, - "step": 3550 - }, - { - "epoch": 0.3616783500965153, - "grad_norm": 2.375, - "learning_rate": 3.598953803921947e-05, - "loss": 0.1884, - "step": 3560 - }, - { - "epoch": 0.3626943005181347, - "grad_norm": 3.21875, - "learning_rate": 3.591707553864266e-05, - "loss": 0.224, - "step": 3570 - }, - { - "epoch": 0.3637102509397541, - "grad_norm": 3.5625, - "learning_rate": 3.584449953168423e-05, - "loss": 0.1866, - "step": 3580 - }, - { - "epoch": 0.36472620136137357, - "grad_norm": 2.359375, - "learning_rate": 3.577181077292722e-05, - "loss": 0.1663, - "step": 3590 - }, - { - "epoch": 0.36574215178299296, - "grad_norm": 5.0, - "learning_rate": 3.569901001812696e-05, - "loss": 0.2032, - "step": 3600 - }, - { - "epoch": 0.3667581022046124, - "grad_norm": 1.953125, - "learning_rate": 3.562609802420321e-05, - "loss": 0.2395, - "step": 3610 - }, - { - "epoch": 0.36777405262623186, - "grad_norm": 3.796875, - "learning_rate": 3.555307554923229e-05, - "loss": 0.1799, - "step": 3620 - }, - { - "epoch": 0.36879000304785126, - "grad_norm": 4.4375, - "learning_rate": 3.547994335243925e-05, - "loss": 0.1771, - "step": 3630 - }, - { - "epoch": 0.3698059534694707, - "grad_norm": 1.890625, - "learning_rate": 3.540670219418989e-05, - "loss": 0.2123, - "step": 3640 - }, - { - "epoch": 0.3708219038910901, - "grad_norm": 4.03125, - "learning_rate": 3.53333528359829e-05, - "loss": 0.2159, - "step": 3650 - }, - { - "epoch": 0.37183785431270955, - "grad_norm": 3.265625, - "learning_rate": 3.525989604044198e-05, - "loss": 0.2749, - "step": 3660 - }, - { - "epoch": 0.37285380473432894, - "grad_norm": 1.4375, - "learning_rate": 3.5186332571307826e-05, - "loss": 0.1613, - "step": 3670 - }, - { - "epoch": 0.3738697551559484, - "grad_norm": 3.984375, - "learning_rate": 3.511266319343025e-05, - "loss": 0.1877, - "step": 3680 - }, - { - "epoch": 0.37488570557756784, - "grad_norm": 2.203125, - "learning_rate": 3.503888867276022e-05, - "loss": 0.2185, - "step": 3690 - }, - { - "epoch": 0.37590165599918723, - "grad_norm": 1.5078125, - "learning_rate": 3.4965009776341894e-05, - "loss": 0.2195, - "step": 3700 - }, - { - "epoch": 0.3769176064208067, - "grad_norm": 4.375, - "learning_rate": 3.489102727230461e-05, - "loss": 0.2344, - "step": 3710 - }, - { - "epoch": 0.3779335568424261, - "grad_norm": 2.984375, - "learning_rate": 3.481694192985496e-05, - "loss": 0.1863, - "step": 3720 - }, - { - "epoch": 0.3789495072640455, - "grad_norm": 1.1328125, - "learning_rate": 3.474275451926875e-05, - "loss": 0.1894, - "step": 3730 - }, - { - "epoch": 0.3799654576856649, - "grad_norm": 2.265625, - "learning_rate": 3.4668465811883e-05, - "loss": 0.2127, - "step": 3740 - }, - { - "epoch": 0.38098140810728437, - "grad_norm": 2.921875, - "learning_rate": 3.4594076580087914e-05, - "loss": 0.2125, - "step": 3750 - }, - { - "epoch": 0.38199735852890376, - "grad_norm": 2.390625, - "learning_rate": 3.451958759731889e-05, - "loss": 0.1801, - "step": 3760 - }, - { - "epoch": 0.3830133089505232, - "grad_norm": 3.046875, - "learning_rate": 3.4444999638048456e-05, - "loss": 0.1949, - "step": 3770 - }, - { - "epoch": 0.38402925937214266, - "grad_norm": 2.890625, - "learning_rate": 3.437031347777817e-05, - "loss": 0.2719, - "step": 3780 - }, - { - "epoch": 0.38504520979376206, - "grad_norm": 3.9375, - "learning_rate": 3.4295529893030634e-05, - "loss": 0.1697, - "step": 3790 - }, - { - "epoch": 0.3860611602153815, - "grad_norm": 2.0625, - "learning_rate": 3.422064966134138e-05, - "loss": 0.1557, - "step": 3800 - }, - { - "epoch": 0.3870771106370009, - "grad_norm": 2.234375, - "learning_rate": 3.4145673561250794e-05, - "loss": 0.2129, - "step": 3810 - }, - { - "epoch": 0.38809306105862035, - "grad_norm": 4.96875, - "learning_rate": 3.4070602372296e-05, - "loss": 0.2068, - "step": 3820 - }, - { - "epoch": 0.38910901148023974, - "grad_norm": 2.234375, - "learning_rate": 3.39954368750028e-05, - "loss": 0.1634, - "step": 3830 - }, - { - "epoch": 0.3901249619018592, - "grad_norm": 1.75, - "learning_rate": 3.392017785087752e-05, - "loss": 0.2299, - "step": 3840 - }, - { - "epoch": 0.39114091232347864, - "grad_norm": 3.90625, - "learning_rate": 3.38448260823989e-05, - "loss": 0.1585, - "step": 3850 - }, - { - "epoch": 0.39215686274509803, - "grad_norm": 2.8125, - "learning_rate": 3.376938235300996e-05, - "loss": 0.2382, - "step": 3860 - }, - { - "epoch": 0.3931728131667175, - "grad_norm": 5.375, - "learning_rate": 3.369384744710984e-05, - "loss": 0.1987, - "step": 3870 - }, - { - "epoch": 0.3941887635883369, - "grad_norm": 2.578125, - "learning_rate": 3.361822215004566e-05, - "loss": 0.2316, - "step": 3880 - }, - { - "epoch": 0.3952047140099563, - "grad_norm": 2.0, - "learning_rate": 3.354250724810436e-05, - "loss": 0.2019, - "step": 3890 - }, - { - "epoch": 0.3962206644315757, - "grad_norm": 2.3125, - "learning_rate": 3.34667035285045e-05, - "loss": 0.187, - "step": 3900 - }, - { - "epoch": 0.39723661485319517, - "grad_norm": 3.53125, - "learning_rate": 3.339081177938811e-05, - "loss": 0.2353, - "step": 3910 - }, - { - "epoch": 0.39825256527481456, - "grad_norm": 1.9609375, - "learning_rate": 3.331483278981244e-05, - "loss": 0.2078, - "step": 3920 - }, - { - "epoch": 0.399268515696434, - "grad_norm": 1.2109375, - "learning_rate": 3.323876734974183e-05, - "loss": 0.1761, - "step": 3930 - }, - { - "epoch": 0.40028446611805346, - "grad_norm": 4.0625, - "learning_rate": 3.316261625003943e-05, - "loss": 0.2081, - "step": 3940 - }, - { - "epoch": 0.40130041653967286, - "grad_norm": 1.953125, - "learning_rate": 3.308638028245902e-05, - "loss": 0.2087, - "step": 3950 - }, - { - "epoch": 0.4023163669612923, - "grad_norm": 2.390625, - "learning_rate": 3.301006023963676e-05, - "loss": 0.1579, - "step": 3960 - }, - { - "epoch": 0.4033323173829117, - "grad_norm": 3.53125, - "learning_rate": 3.293365691508295e-05, - "loss": 0.1904, - "step": 3970 - }, - { - "epoch": 0.40434826780453115, - "grad_norm": 3.0, - "learning_rate": 3.285717110317379e-05, - "loss": 0.1991, - "step": 3980 - }, - { - "epoch": 0.40536421822615054, - "grad_norm": 7.21875, - "learning_rate": 3.27806035991431e-05, - "loss": 0.1445, - "step": 3990 - }, - { - "epoch": 0.40638016864777, - "grad_norm": 1.0859375, - "learning_rate": 3.2703955199074075e-05, - "loss": 0.2393, - "step": 4000 - }, - { - "epoch": 0.40739611906938944, - "grad_norm": 4.5625, - "learning_rate": 3.262722669989098e-05, - "loss": 0.1789, - "step": 4010 - }, - { - "epoch": 0.40841206949100883, - "grad_norm": 3.09375, - "learning_rate": 3.255041889935092e-05, - "loss": 0.1511, - "step": 4020 - }, - { - "epoch": 0.4094280199126283, - "grad_norm": 1.90625, - "learning_rate": 3.247353259603547e-05, - "loss": 0.2066, - "step": 4030 - }, - { - "epoch": 0.4104439703342477, - "grad_norm": 2.28125, - "learning_rate": 3.239656858934242e-05, - "loss": 0.1564, - "step": 4040 - }, - { - "epoch": 0.4114599207558671, - "grad_norm": 2.609375, - "learning_rate": 3.231952767947746e-05, - "loss": 0.1503, - "step": 4050 - }, - { - "epoch": 0.4124758711774865, - "grad_norm": 1.4453125, - "learning_rate": 3.2242410667445844e-05, - "loss": 0.1633, - "step": 4060 - }, - { - "epoch": 0.41349182159910597, - "grad_norm": 3.015625, - "learning_rate": 3.2165218355044076e-05, - "loss": 0.1492, - "step": 4070 - }, - { - "epoch": 0.41450777202072536, - "grad_norm": 3.234375, - "learning_rate": 3.2087951544851566e-05, - "loss": 0.3051, - "step": 4080 - }, - { - "epoch": 0.4155237224423448, - "grad_norm": 2.9375, - "learning_rate": 3.20106110402223e-05, - "loss": 0.2229, - "step": 4090 - }, - { - "epoch": 0.41653967286396426, - "grad_norm": 3.171875, - "learning_rate": 3.1933197645276455e-05, - "loss": 0.2224, - "step": 4100 - }, - { - "epoch": 0.41755562328558365, - "grad_norm": 2.09375, - "learning_rate": 3.185571216489209e-05, - "loss": 0.1297, - "step": 4110 - }, - { - "epoch": 0.4185715737072031, - "grad_norm": 3.625, - "learning_rate": 3.177815540469669e-05, - "loss": 0.2074, - "step": 4120 - }, - { - "epoch": 0.4195875241288225, - "grad_norm": 2.296875, - "learning_rate": 3.1700528171058916e-05, - "loss": 0.1949, - "step": 4130 - }, - { - "epoch": 0.42060347455044195, - "grad_norm": 3.8125, - "learning_rate": 3.162283127108011e-05, - "loss": 0.1661, - "step": 4140 - }, - { - "epoch": 0.42161942497206134, - "grad_norm": 2.5, - "learning_rate": 3.154506551258594e-05, - "loss": 0.2275, - "step": 4150 - }, - { - "epoch": 0.4226353753936808, - "grad_norm": 2.96875, - "learning_rate": 3.146723170411804e-05, - "loss": 0.2242, - "step": 4160 - }, - { - "epoch": 0.42365132581530024, - "grad_norm": 6.625, - "learning_rate": 3.138933065492552e-05, - "loss": 0.1897, - "step": 4170 - }, - { - "epoch": 0.42466727623691963, - "grad_norm": 0.8515625, - "learning_rate": 3.131136317495665e-05, - "loss": 0.1629, - "step": 4180 - }, - { - "epoch": 0.4256832266585391, - "grad_norm": 0.94140625, - "learning_rate": 3.1233330074850364e-05, - "loss": 0.1535, - "step": 4190 - }, - { - "epoch": 0.4266991770801585, - "grad_norm": 2.6875, - "learning_rate": 3.115523216592786e-05, - "loss": 0.2494, - "step": 4200 - }, - { - "epoch": 0.4277151275017779, - "grad_norm": 2.578125, - "learning_rate": 3.107707026018417e-05, - "loss": 0.1705, - "step": 4210 - }, - { - "epoch": 0.4287310779233973, - "grad_norm": 3.0625, - "learning_rate": 3.09988451702797e-05, - "loss": 0.1507, - "step": 4220 - }, - { - "epoch": 0.42974702834501677, - "grad_norm": 2.421875, - "learning_rate": 3.0920557709531804e-05, - "loss": 0.3071, - "step": 4230 - }, - { - "epoch": 0.43076297876663616, - "grad_norm": 3.640625, - "learning_rate": 3.0842208691906306e-05, - "loss": 0.199, - "step": 4240 - }, - { - "epoch": 0.4317789291882556, - "grad_norm": 3.5625, - "learning_rate": 3.076379893200904e-05, - "loss": 0.1987, - "step": 4250 - }, - { - "epoch": 0.43279487960987506, - "grad_norm": 3.65625, - "learning_rate": 3.068532924507739e-05, - "loss": 0.1945, - "step": 4260 - }, - { - "epoch": 0.43381083003149445, - "grad_norm": 5.875, - "learning_rate": 3.060680044697183e-05, - "loss": 0.1937, - "step": 4270 - }, - { - "epoch": 0.4348267804531139, - "grad_norm": 2.859375, - "learning_rate": 3.052821335416739e-05, - "loss": 0.1643, - "step": 4280 - }, - { - "epoch": 0.4358427308747333, - "grad_norm": 3.296875, - "learning_rate": 3.0449568783745203e-05, - "loss": 0.1455, - "step": 4290 - }, - { - "epoch": 0.43685868129635275, - "grad_norm": 0.427734375, - "learning_rate": 3.0370867553384023e-05, - "loss": 0.1891, - "step": 4300 - }, - { - "epoch": 0.43787463171797214, - "grad_norm": 0.361328125, - "learning_rate": 3.029211048135171e-05, - "loss": 0.1377, - "step": 4310 - }, - { - "epoch": 0.4388905821395916, - "grad_norm": 1.8203125, - "learning_rate": 3.021329838649668e-05, - "loss": 0.2194, - "step": 4320 - }, - { - "epoch": 0.43990653256121104, - "grad_norm": 1.8828125, - "learning_rate": 3.0134432088239462e-05, - "loss": 0.1915, - "step": 4330 - }, - { - "epoch": 0.44092248298283043, - "grad_norm": 2.015625, - "learning_rate": 3.0055512406564146e-05, - "loss": 0.1794, - "step": 4340 - }, - { - "epoch": 0.4419384334044499, - "grad_norm": 2.546875, - "learning_rate": 2.9976540162009836e-05, - "loss": 0.2154, - "step": 4350 - }, - { - "epoch": 0.4429543838260693, - "grad_norm": 4.09375, - "learning_rate": 2.9897516175662155e-05, - "loss": 0.1861, - "step": 4360 - }, - { - "epoch": 0.4439703342476887, - "grad_norm": 3.953125, - "learning_rate": 2.9818441269144693e-05, - "loss": 0.1857, - "step": 4370 - }, - { - "epoch": 0.4449862846693081, - "grad_norm": 2.234375, - "learning_rate": 2.9739316264610452e-05, - "loss": 0.1493, - "step": 4380 - }, - { - "epoch": 0.44600223509092757, - "grad_norm": 1.109375, - "learning_rate": 2.966014198473332e-05, - "loss": 0.186, - "step": 4390 - }, - { - "epoch": 0.44701818551254696, - "grad_norm": 4.5625, - "learning_rate": 2.9580919252699502e-05, - "loss": 0.1963, - "step": 4400 - }, - { - "epoch": 0.4480341359341664, - "grad_norm": 7.3125, - "learning_rate": 2.9501648892198984e-05, - "loss": 0.2882, - "step": 4410 - }, - { - "epoch": 0.44905008635578586, - "grad_norm": 3.03125, - "learning_rate": 2.942233172741693e-05, - "loss": 0.2154, - "step": 4420 - }, - { - "epoch": 0.45006603677740525, - "grad_norm": 2.421875, - "learning_rate": 2.934296858302515e-05, - "loss": 0.2228, - "step": 4430 - }, - { - "epoch": 0.4510819871990247, - "grad_norm": 1.6015625, - "learning_rate": 2.9263560284173485e-05, - "loss": 0.1637, - "step": 4440 - }, - { - "epoch": 0.4520979376206441, - "grad_norm": 4.5, - "learning_rate": 2.91841076564813e-05, - "loss": 0.1396, - "step": 4450 - }, - { - "epoch": 0.45311388804226355, - "grad_norm": 1.9609375, - "learning_rate": 2.9104611526028808e-05, - "loss": 0.186, - "step": 4460 - }, - { - "epoch": 0.45412983846388294, - "grad_norm": 2.046875, - "learning_rate": 2.902507271934855e-05, - "loss": 0.1706, - "step": 4470 - }, - { - "epoch": 0.4551457888855024, - "grad_norm": 2.390625, - "learning_rate": 2.8945492063416768e-05, - "loss": 0.2191, - "step": 4480 - }, - { - "epoch": 0.45616173930712184, - "grad_norm": 2.734375, - "learning_rate": 2.8865870385644823e-05, - "loss": 0.1651, - "step": 4490 - }, - { - "epoch": 0.45717768972874123, - "grad_norm": 4.4375, - "learning_rate": 2.8786208513870583e-05, - "loss": 0.1907, - "step": 4500 - }, - { - "epoch": 0.4581936401503607, - "grad_norm": 1.9609375, - "learning_rate": 2.8706507276349815e-05, - "loss": 0.2256, - "step": 4510 - }, - { - "epoch": 0.4592095905719801, - "grad_norm": 3.375, - "learning_rate": 2.8626767501747588e-05, - "loss": 0.215, - "step": 4520 - }, - { - "epoch": 0.4602255409935995, - "grad_norm": 2.296875, - "learning_rate": 2.854699001912964e-05, - "loss": 0.2241, - "step": 4530 - }, - { - "epoch": 0.4612414914152189, - "grad_norm": 2.078125, - "learning_rate": 2.846717565795376e-05, - "loss": 0.1541, - "step": 4540 - }, - { - "epoch": 0.46225744183683837, - "grad_norm": 0.81640625, - "learning_rate": 2.8387325248061164e-05, - "loss": 0.1718, - "step": 4550 - }, - { - "epoch": 0.46327339225845776, - "grad_norm": 5.6875, - "learning_rate": 2.8307439619667897e-05, - "loss": 0.259, - "step": 4560 - }, - { - "epoch": 0.4642893426800772, - "grad_norm": 1.78125, - "learning_rate": 2.8227519603356157e-05, - "loss": 0.2205, - "step": 4570 - }, - { - "epoch": 0.46530529310169666, - "grad_norm": 4.78125, - "learning_rate": 2.8147566030065677e-05, - "loss": 0.2256, - "step": 4580 - }, - { - "epoch": 0.46632124352331605, - "grad_norm": 3.296875, - "learning_rate": 2.8067579731085085e-05, - "loss": 0.1671, - "step": 4590 - }, - { - "epoch": 0.4673371939449355, - "grad_norm": 3.265625, - "learning_rate": 2.7987561538043273e-05, - "loss": 0.2471, - "step": 4600 - }, - { - "epoch": 0.4683531443665549, - "grad_norm": 3.390625, - "learning_rate": 2.7907512282900727e-05, - "loss": 0.1749, - "step": 4610 - }, - { - "epoch": 0.46936909478817435, - "grad_norm": 3.140625, - "learning_rate": 2.782743279794091e-05, - "loss": 0.2276, - "step": 4620 - }, - { - "epoch": 0.47038504520979374, - "grad_norm": 2.921875, - "learning_rate": 2.7747323915761574e-05, - "loss": 0.1971, - "step": 4630 - }, - { - "epoch": 0.4714009956314132, - "grad_norm": 4.15625, - "learning_rate": 2.7667186469266122e-05, - "loss": 0.1951, - "step": 4640 - }, - { - "epoch": 0.47241694605303264, - "grad_norm": 2.953125, - "learning_rate": 2.7587021291654924e-05, - "loss": 0.2045, - "step": 4650 - }, - { - "epoch": 0.47343289647465203, - "grad_norm": 1.6640625, - "learning_rate": 2.750682921641672e-05, - "loss": 0.155, - "step": 4660 - }, - { - "epoch": 0.4744488468962715, - "grad_norm": 4.375, - "learning_rate": 2.7426611077319864e-05, - "loss": 0.2038, - "step": 4670 - }, - { - "epoch": 0.4754647973178909, - "grad_norm": 5.5, - "learning_rate": 2.734636770840372e-05, - "loss": 0.159, - "step": 4680 - }, - { - "epoch": 0.4764807477395103, - "grad_norm": 1.703125, - "learning_rate": 2.7266099943969976e-05, - "loss": 0.1566, - "step": 4690 - }, - { - "epoch": 0.4774966981611297, - "grad_norm": 0.81640625, - "learning_rate": 2.7185808618573943e-05, - "loss": 0.1927, - "step": 4700 - }, - { - "epoch": 0.47851264858274917, - "grad_norm": 0.81640625, - "learning_rate": 2.710549456701592e-05, - "loss": 0.1873, - "step": 4710 - }, - { - "epoch": 0.47952859900436856, - "grad_norm": 3.828125, - "learning_rate": 2.702515862433247e-05, - "loss": 0.2474, - "step": 4720 - }, - { - "epoch": 0.480544549425988, - "grad_norm": 1.1640625, - "learning_rate": 2.6944801625787795e-05, - "loss": 0.204, - "step": 4730 - }, - { - "epoch": 0.48156049984760746, - "grad_norm": 2.953125, - "learning_rate": 2.6864424406864984e-05, - "loss": 0.1758, - "step": 4740 - }, - { - "epoch": 0.48257645026922685, - "grad_norm": 3.265625, - "learning_rate": 2.6784027803257377e-05, - "loss": 0.161, - "step": 4750 - }, - { - "epoch": 0.4835924006908463, - "grad_norm": 2.046875, - "learning_rate": 2.6703612650859848e-05, - "loss": 0.1469, - "step": 4760 - }, - { - "epoch": 0.4846083511124657, - "grad_norm": 4.03125, - "learning_rate": 2.6623179785760148e-05, - "loss": 0.1858, - "step": 4770 - }, - { - "epoch": 0.48562430153408515, - "grad_norm": 2.65625, - "learning_rate": 2.6542730044230175e-05, - "loss": 0.176, - "step": 4780 - }, - { - "epoch": 0.48664025195570454, - "grad_norm": 2.59375, - "learning_rate": 2.6462264262717278e-05, - "loss": 0.1657, - "step": 4790 - }, - { - "epoch": 0.487656202377324, - "grad_norm": 4.78125, - "learning_rate": 2.6381783277835605e-05, - "loss": 0.2705, - "step": 4800 - }, - { - "epoch": 0.48867215279894344, - "grad_norm": 3.65625, - "learning_rate": 2.6301287926357355e-05, - "loss": 0.2252, - "step": 4810 - }, - { - "epoch": 0.48968810322056283, - "grad_norm": 0.734375, - "learning_rate": 2.622077904520411e-05, - "loss": 0.2141, - "step": 4820 - }, - { - "epoch": 0.4907040536421823, - "grad_norm": 5.15625, - "learning_rate": 2.6140257471438108e-05, - "loss": 0.1935, - "step": 4830 - }, - { - "epoch": 0.4917200040638017, - "grad_norm": 3.625, - "learning_rate": 2.6059724042253574e-05, - "loss": 0.2121, - "step": 4840 - }, - { - "epoch": 0.4927359544854211, - "grad_norm": 1.2890625, - "learning_rate": 2.5979179594967983e-05, - "loss": 0.1221, - "step": 4850 - }, - { - "epoch": 0.4937519049070405, - "grad_norm": 3.4375, - "learning_rate": 2.5898624967013367e-05, - "loss": 0.2208, - "step": 4860 - }, - { - "epoch": 0.49476785532865997, - "grad_norm": 2.40625, - "learning_rate": 2.5818060995927607e-05, - "loss": 0.1904, - "step": 4870 - }, - { - "epoch": 0.49578380575027936, - "grad_norm": 2.921875, - "learning_rate": 2.573748851934574e-05, - "loss": 0.1658, - "step": 4880 - }, - { - "epoch": 0.4967997561718988, - "grad_norm": 1.6640625, - "learning_rate": 2.5656908374991213e-05, - "loss": 0.1626, - "step": 4890 - }, - { - "epoch": 0.49781570659351826, - "grad_norm": 1.8046875, - "learning_rate": 2.557632140066721e-05, - "loss": 0.1905, - "step": 4900 - }, - { - "epoch": 0.49883165701513765, - "grad_norm": 4.875, - "learning_rate": 2.5495728434247917e-05, - "loss": 0.2591, - "step": 4910 - }, - { - "epoch": 0.4998476074367571, - "grad_norm": 1.4453125, - "learning_rate": 2.5415130313669845e-05, - "loss": 0.1359, - "step": 4920 - }, - { - "epoch": 0.5008635578583766, - "grad_norm": 2.109375, - "learning_rate": 2.5334527876923063e-05, - "loss": 0.2353, - "step": 4930 - }, - { - "epoch": 0.501879508279996, - "grad_norm": 3.546875, - "learning_rate": 2.5253921962042525e-05, - "loss": 0.2173, - "step": 4940 - }, - { - "epoch": 0.5028954587016153, - "grad_norm": 1.8125, - "learning_rate": 2.5173313407099373e-05, - "loss": 0.1631, - "step": 4950 - }, - { - "epoch": 0.5039114091232347, - "grad_norm": 2.671875, - "learning_rate": 2.5092703050192163e-05, - "loss": 0.1884, - "step": 4960 - }, - { - "epoch": 0.5049273595448542, - "grad_norm": 2.5625, - "learning_rate": 2.501209172943819e-05, - "loss": 0.217, - "step": 4970 - }, - { - "epoch": 0.5059433099664736, - "grad_norm": 4.375, - "learning_rate": 2.49314802829648e-05, - "loss": 0.1854, - "step": 4980 - }, - { - "epoch": 0.506959260388093, - "grad_norm": 2.3125, - "learning_rate": 2.4850869548900628e-05, - "loss": 0.2049, - "step": 4990 - }, - { - "epoch": 0.5079752108097125, - "grad_norm": 3.859375, - "learning_rate": 2.477026036536688e-05, - "loss": 0.2093, - "step": 5000 - }, - { - "epoch": 0.5089911612313319, - "grad_norm": 1.09375, - "learning_rate": 2.4689653570468677e-05, - "loss": 0.164, - "step": 5010 - }, - { - "epoch": 0.5100071116529513, - "grad_norm": 3.40625, - "learning_rate": 2.460905000228628e-05, - "loss": 0.1649, - "step": 5020 - }, - { - "epoch": 0.5110230620745707, - "grad_norm": 3.546875, - "learning_rate": 2.4528450498866428e-05, - "loss": 0.1777, - "step": 5030 - }, - { - "epoch": 0.5120390124961902, - "grad_norm": 3.0, - "learning_rate": 2.444785589821356e-05, - "loss": 0.1505, - "step": 5040 - }, - { - "epoch": 0.5130549629178096, - "grad_norm": 1.6484375, - "learning_rate": 2.436726703828118e-05, - "loss": 0.2672, - "step": 5050 - }, - { - "epoch": 0.514070913339429, - "grad_norm": 4.34375, - "learning_rate": 2.428668475696308e-05, - "loss": 0.1756, - "step": 5060 - }, - { - "epoch": 0.5150868637610485, - "grad_norm": 2.78125, - "learning_rate": 2.420610989208465e-05, - "loss": 0.1655, - "step": 5070 - }, - { - "epoch": 0.5161028141826679, - "grad_norm": 1.4609375, - "learning_rate": 2.412554328139419e-05, - "loss": 0.1579, - "step": 5080 - }, - { - "epoch": 0.5171187646042873, - "grad_norm": 2.28125, - "learning_rate": 2.404498576255416e-05, - "loss": 0.1599, - "step": 5090 - }, - { - "epoch": 0.5181347150259067, - "grad_norm": 0.6484375, - "learning_rate": 2.3964438173132522e-05, - "loss": 0.1508, - "step": 5100 - }, - { - "epoch": 0.5191506654475262, - "grad_norm": 3.390625, - "learning_rate": 2.388390135059395e-05, - "loss": 0.1578, - "step": 5110 - }, - { - "epoch": 0.5201666158691456, - "grad_norm": 1.21875, - "learning_rate": 2.3803376132291226e-05, - "loss": 0.1374, - "step": 5120 - }, - { - "epoch": 0.521182566290765, - "grad_norm": 4.0625, - "learning_rate": 2.3722863355456436e-05, - "loss": 0.1854, - "step": 5130 - }, - { - "epoch": 0.5221985167123845, - "grad_norm": 4.71875, - "learning_rate": 2.364236385719236e-05, - "loss": 0.1391, - "step": 5140 - }, - { - "epoch": 0.5232144671340039, - "grad_norm": 3.296875, - "learning_rate": 2.356187847446366e-05, - "loss": 0.2106, - "step": 5150 - }, - { - "epoch": 0.5242304175556233, - "grad_norm": 3.296875, - "learning_rate": 2.348140804408829e-05, - "loss": 0.2383, - "step": 5160 - }, - { - "epoch": 0.5252463679772427, - "grad_norm": 3.359375, - "learning_rate": 2.3400953402728713e-05, - "loss": 0.1537, - "step": 5170 - }, - { - "epoch": 0.5262623183988622, - "grad_norm": 1.4921875, - "learning_rate": 2.332051538688322e-05, - "loss": 0.1841, - "step": 5180 - }, - { - "epoch": 0.5272782688204816, - "grad_norm": 3.25, - "learning_rate": 2.3240094832877287e-05, - "loss": 0.1855, - "step": 5190 - }, - { - "epoch": 0.528294219242101, - "grad_norm": 3.34375, - "learning_rate": 2.3159692576854793e-05, - "loss": 0.2625, - "step": 5200 - }, - { - "epoch": 0.5293101696637205, - "grad_norm": 3.6875, - "learning_rate": 2.3079309454769413e-05, - "loss": 0.1292, - "step": 5210 - }, - { - "epoch": 0.5303261200853399, - "grad_norm": 1.1171875, - "learning_rate": 2.2998946302375827e-05, - "loss": 0.1263, - "step": 5220 - }, - { - "epoch": 0.5313420705069593, - "grad_norm": 2.71875, - "learning_rate": 2.2918603955221148e-05, - "loss": 0.2296, - "step": 5230 - }, - { - "epoch": 0.5323580209285786, - "grad_norm": 2.015625, - "learning_rate": 2.283828324863613e-05, - "loss": 0.1231, - "step": 5240 - }, - { - "epoch": 0.5333739713501982, - "grad_norm": 3.671875, - "learning_rate": 2.2757985017726557e-05, - "loss": 0.1939, - "step": 5250 - }, - { - "epoch": 0.5343899217718175, - "grad_norm": 1.9765625, - "learning_rate": 2.2677710097364495e-05, - "loss": 0.168, - "step": 5260 - }, - { - "epoch": 0.5354058721934369, - "grad_norm": 2.609375, - "learning_rate": 2.259745932217969e-05, - "loss": 0.1883, - "step": 5270 - }, - { - "epoch": 0.5364218226150563, - "grad_norm": 2.8125, - "learning_rate": 2.2517233526550817e-05, - "loss": 0.1898, - "step": 5280 - }, - { - "epoch": 0.5374377730366758, - "grad_norm": 3.125, - "learning_rate": 2.2437033544596837e-05, - "loss": 0.1838, - "step": 5290 - }, - { - "epoch": 0.5384537234582952, - "grad_norm": 4.90625, - "learning_rate": 2.2356860210168336e-05, - "loss": 0.1553, - "step": 5300 - }, - { - "epoch": 0.5394696738799146, - "grad_norm": 3.171875, - "learning_rate": 2.2276714356838824e-05, - "loss": 0.2248, - "step": 5310 - }, - { - "epoch": 0.5404856243015341, - "grad_norm": 1.34375, - "learning_rate": 2.2196596817896118e-05, - "loss": 0.1421, - "step": 5320 - }, - { - "epoch": 0.5415015747231535, - "grad_norm": 3.28125, - "learning_rate": 2.2116508426333596e-05, - "loss": 0.1947, - "step": 5330 - }, - { - "epoch": 0.5425175251447729, - "grad_norm": 1.9296875, - "learning_rate": 2.2036450014841652e-05, - "loss": 0.2207, - "step": 5340 - }, - { - "epoch": 0.5435334755663923, - "grad_norm": 0.5703125, - "learning_rate": 2.19564224157989e-05, - "loss": 0.2208, - "step": 5350 - }, - { - "epoch": 0.5445494259880118, - "grad_norm": 7.5625, - "learning_rate": 2.1876426461263654e-05, - "loss": 0.1739, - "step": 5360 - }, - { - "epoch": 0.5455653764096312, - "grad_norm": 2.15625, - "learning_rate": 2.179646298296519e-05, - "loss": 0.1938, - "step": 5370 - }, - { - "epoch": 0.5465813268312506, - "grad_norm": 4.1875, - "learning_rate": 2.171653281229511e-05, - "loss": 0.1736, - "step": 5380 - }, - { - "epoch": 0.5475972772528701, - "grad_norm": 4.65625, - "learning_rate": 2.1636636780298732e-05, - "loss": 0.2167, - "step": 5390 - }, - { - "epoch": 0.5486132276744895, - "grad_norm": 1.84375, - "learning_rate": 2.1556775717666427e-05, - "loss": 0.1711, - "step": 5400 - }, - { - "epoch": 0.5496291780961089, - "grad_norm": 5.125, - "learning_rate": 2.147695045472499e-05, - "loss": 0.1789, - "step": 5410 - }, - { - "epoch": 0.5506451285177283, - "grad_norm": 3.859375, - "learning_rate": 2.1397161821428973e-05, - "loss": 0.2187, - "step": 5420 - }, - { - "epoch": 0.5516610789393478, - "grad_norm": 2.25, - "learning_rate": 2.131741064735212e-05, - "loss": 0.1367, - "step": 5430 - }, - { - "epoch": 0.5526770293609672, - "grad_norm": 4.65625, - "learning_rate": 2.1237697761678684e-05, - "loss": 0.1574, - "step": 5440 - }, - { - "epoch": 0.5536929797825866, - "grad_norm": 1.2265625, - "learning_rate": 2.1158023993194848e-05, - "loss": 0.1301, - "step": 5450 - }, - { - "epoch": 0.5547089302042061, - "grad_norm": 4.21875, - "learning_rate": 2.107839017028005e-05, - "loss": 0.2782, - "step": 5460 - }, - { - "epoch": 0.5557248806258255, - "grad_norm": 0.52734375, - "learning_rate": 2.0998797120898457e-05, - "loss": 0.2024, - "step": 5470 - }, - { - "epoch": 0.5567408310474449, - "grad_norm": 1.46875, - "learning_rate": 2.0919245672590277e-05, - "loss": 0.1755, - "step": 5480 - }, - { - "epoch": 0.5577567814690643, - "grad_norm": 2.140625, - "learning_rate": 2.083973665246318e-05, - "loss": 0.2058, - "step": 5490 - }, - { - "epoch": 0.5587727318906838, - "grad_norm": 1.5390625, - "learning_rate": 2.076027088718373e-05, - "loss": 0.2159, - "step": 5500 - }, - { - "epoch": 0.5597886823123032, - "grad_norm": 1.9921875, - "learning_rate": 2.0680849202968743e-05, - "loss": 0.2139, - "step": 5510 - }, - { - "epoch": 0.5608046327339226, - "grad_norm": 2.4375, - "learning_rate": 2.060147242557674e-05, - "loss": 0.183, - "step": 5520 - }, - { - "epoch": 0.5618205831555421, - "grad_norm": 5.5, - "learning_rate": 2.0522141380299308e-05, - "loss": 0.1673, - "step": 5530 - }, - { - "epoch": 0.5628365335771615, - "grad_norm": 4.25, - "learning_rate": 2.044285689195258e-05, - "loss": 0.1674, - "step": 5540 - }, - { - "epoch": 0.5638524839987809, - "grad_norm": 2.109375, - "learning_rate": 2.0363619784868604e-05, - "loss": 0.1531, - "step": 5550 - }, - { - "epoch": 0.5648684344204002, - "grad_norm": 2.59375, - "learning_rate": 2.0284430882886836e-05, - "loss": 0.1665, - "step": 5560 - }, - { - "epoch": 0.5658843848420197, - "grad_norm": 3.984375, - "learning_rate": 2.020529100934549e-05, - "loss": 0.1717, - "step": 5570 - }, - { - "epoch": 0.5669003352636391, - "grad_norm": 1.6015625, - "learning_rate": 2.012620098707306e-05, - "loss": 0.1167, - "step": 5580 - }, - { - "epoch": 0.5679162856852585, - "grad_norm": 6.0625, - "learning_rate": 2.004716163837972e-05, - "loss": 0.2084, - "step": 5590 - }, - { - "epoch": 0.5689322361068779, - "grad_norm": 2.5625, - "learning_rate": 1.996817378504876e-05, - "loss": 0.1939, - "step": 5600 - }, - { - "epoch": 0.5699481865284974, - "grad_norm": 3.109375, - "learning_rate": 1.9889238248328108e-05, - "loss": 0.1241, - "step": 5610 - }, - { - "epoch": 0.5709641369501168, - "grad_norm": 4.875, - "learning_rate": 1.981035584892171e-05, - "loss": 0.1865, - "step": 5620 - }, - { - "epoch": 0.5719800873717362, - "grad_norm": 2.984375, - "learning_rate": 1.9731527406981072e-05, - "loss": 0.1639, - "step": 5630 - }, - { - "epoch": 0.5729960377933557, - "grad_norm": 4.4375, - "learning_rate": 1.9652753742096655e-05, - "loss": 0.2019, - "step": 5640 - }, - { - "epoch": 0.5740119882149751, - "grad_norm": 4.3125, - "learning_rate": 1.9574035673289432e-05, - "loss": 0.1829, - "step": 5650 - }, - { - "epoch": 0.5750279386365945, - "grad_norm": 3.203125, - "learning_rate": 1.9495374019002312e-05, - "loss": 0.2267, - "step": 5660 - }, - { - "epoch": 0.5760438890582139, - "grad_norm": 1.765625, - "learning_rate": 1.9416769597091673e-05, - "loss": 0.1411, - "step": 5670 - }, - { - "epoch": 0.5770598394798334, - "grad_norm": 2.640625, - "learning_rate": 1.9338223224818818e-05, - "loss": 0.1476, - "step": 5680 - }, - { - "epoch": 0.5780757899014528, - "grad_norm": 4.84375, - "learning_rate": 1.9259735718841524e-05, - "loss": 0.1417, - "step": 5690 - }, - { - "epoch": 0.5790917403230722, - "grad_norm": 2.421875, - "learning_rate": 1.918130789520551e-05, - "loss": 0.1592, - "step": 5700 - }, - { - "epoch": 0.5801076907446917, - "grad_norm": 2.984375, - "learning_rate": 1.9102940569335963e-05, - "loss": 0.161, - "step": 5710 - }, - { - "epoch": 0.5811236411663111, - "grad_norm": 1.0234375, - "learning_rate": 1.9024634556029093e-05, - "loss": 0.1614, - "step": 5720 - }, - { - "epoch": 0.5821395915879305, - "grad_norm": 2.90625, - "learning_rate": 1.89463906694436e-05, - "loss": 0.1505, - "step": 5730 - }, - { - "epoch": 0.5831555420095499, - "grad_norm": 2.875, - "learning_rate": 1.8868209723092286e-05, - "loss": 0.1674, - "step": 5740 - }, - { - "epoch": 0.5841714924311694, - "grad_norm": 0.408203125, - "learning_rate": 1.8790092529833508e-05, - "loss": 0.1468, - "step": 5750 - }, - { - "epoch": 0.5851874428527888, - "grad_norm": 5.1875, - "learning_rate": 1.871203990186281e-05, - "loss": 0.1903, - "step": 5760 - }, - { - "epoch": 0.5862033932744082, - "grad_norm": 0.5546875, - "learning_rate": 1.8634052650704415e-05, - "loss": 0.2644, - "step": 5770 - }, - { - "epoch": 0.5872193436960277, - "grad_norm": 3.203125, - "learning_rate": 1.8556131587202848e-05, - "loss": 0.1968, - "step": 5780 - }, - { - "epoch": 0.5882352941176471, - "grad_norm": 2.484375, - "learning_rate": 1.8478277521514424e-05, - "loss": 0.2249, - "step": 5790 - }, - { - "epoch": 0.5892512445392665, - "grad_norm": 4.0, - "learning_rate": 1.8400491263098906e-05, - "loss": 0.1881, - "step": 5800 - }, - { - "epoch": 0.5902671949608859, - "grad_norm": 1.90625, - "learning_rate": 1.832277362071106e-05, - "loss": 0.1352, - "step": 5810 - }, - { - "epoch": 0.5912831453825054, - "grad_norm": 2.765625, - "learning_rate": 1.824512540239221e-05, - "loss": 0.2737, - "step": 5820 - }, - { - "epoch": 0.5922990958041248, - "grad_norm": 2.609375, - "learning_rate": 1.81675474154619e-05, - "loss": 0.1566, - "step": 5830 - }, - { - "epoch": 0.5933150462257442, - "grad_norm": 2.6875, - "learning_rate": 1.8090040466509444e-05, - "loss": 0.1999, - "step": 5840 - }, - { - "epoch": 0.5943309966473637, - "grad_norm": 2.609375, - "learning_rate": 1.8012605361385592e-05, - "loss": 0.2372, - "step": 5850 - }, - { - "epoch": 0.5953469470689831, - "grad_norm": 8.125, - "learning_rate": 1.7935242905194087e-05, - "loss": 0.2411, - "step": 5860 - }, - { - "epoch": 0.5963628974906025, - "grad_norm": 3.46875, - "learning_rate": 1.785795390228336e-05, - "loss": 0.138, - "step": 5870 - }, - { - "epoch": 0.5973788479122218, - "grad_norm": 2.3125, - "learning_rate": 1.7780739156238125e-05, - "loss": 0.1867, - "step": 5880 - }, - { - "epoch": 0.5983947983338413, - "grad_norm": 4.0625, - "learning_rate": 1.770359946987105e-05, - "loss": 0.2091, - "step": 5890 - }, - { - "epoch": 0.5994107487554607, - "grad_norm": 5.21875, - "learning_rate": 1.7626535645214378e-05, - "loss": 0.2091, - "step": 5900 - }, - { - "epoch": 0.6004266991770801, - "grad_norm": 3.15625, - "learning_rate": 1.7549548483511614e-05, - "loss": 0.1927, - "step": 5910 - }, - { - "epoch": 0.6014426495986995, - "grad_norm": 4.71875, - "learning_rate": 1.7472638785209198e-05, - "loss": 0.1893, - "step": 5920 - }, - { - "epoch": 0.602458600020319, - "grad_norm": 3.015625, - "learning_rate": 1.7395807349948145e-05, - "loss": 0.1557, - "step": 5930 - }, - { - "epoch": 0.6034745504419384, - "grad_norm": 2.9375, - "learning_rate": 1.73190549765558e-05, - "loss": 0.1717, - "step": 5940 - }, - { - "epoch": 0.6044905008635578, - "grad_norm": 3.109375, - "learning_rate": 1.724238246303745e-05, - "loss": 0.1879, - "step": 5950 - }, - { - "epoch": 0.6055064512851773, - "grad_norm": 3.875, - "learning_rate": 1.71657906065681e-05, - "loss": 0.1908, - "step": 5960 - }, - { - "epoch": 0.6065224017067967, - "grad_norm": 5.09375, - "learning_rate": 1.7089280203484115e-05, - "loss": 0.1712, - "step": 5970 - }, - { - "epoch": 0.6075383521284161, - "grad_norm": 3.015625, - "learning_rate": 1.701285204927502e-05, - "loss": 0.1454, - "step": 5980 - }, - { - "epoch": 0.6085543025500355, - "grad_norm": 3.265625, - "learning_rate": 1.693650693857515e-05, - "loss": 0.2283, - "step": 5990 - }, - { - "epoch": 0.609570252971655, - "grad_norm": 3.40625, - "learning_rate": 1.6860245665155466e-05, - "loss": 0.2188, - "step": 6000 - }, - { - "epoch": 0.6105862033932744, - "grad_norm": 2.5625, - "learning_rate": 1.678406902191521e-05, - "loss": 0.1605, - "step": 6010 - }, - { - "epoch": 0.6116021538148938, - "grad_norm": 0.6796875, - "learning_rate": 1.670797780087374e-05, - "loss": 0.1472, - "step": 6020 - }, - { - "epoch": 0.6126181042365133, - "grad_norm": 2.234375, - "learning_rate": 1.6631972793162288e-05, - "loss": 0.1676, - "step": 6030 - }, - { - "epoch": 0.6136340546581327, - "grad_norm": 1.25, - "learning_rate": 1.6556054789015662e-05, - "loss": 0.1508, - "step": 6040 - }, - { - "epoch": 0.6146500050797521, - "grad_norm": 4.78125, - "learning_rate": 1.6480224577764132e-05, - "loss": 0.1981, - "step": 6050 - }, - { - "epoch": 0.6156659555013715, - "grad_norm": 3.46875, - "learning_rate": 1.6404482947825137e-05, - "loss": 0.2514, - "step": 6060 - }, - { - "epoch": 0.616681905922991, - "grad_norm": 1.265625, - "learning_rate": 1.6328830686695154e-05, - "loss": 0.2397, - "step": 6070 - }, - { - "epoch": 0.6176978563446104, - "grad_norm": 1.953125, - "learning_rate": 1.625326858094144e-05, - "loss": 0.1523, - "step": 6080 - }, - { - "epoch": 0.6187138067662298, - "grad_norm": 3.484375, - "learning_rate": 1.6177797416193953e-05, - "loss": 0.218, - "step": 6090 - }, - { - "epoch": 0.6197297571878493, - "grad_norm": 3.484375, - "learning_rate": 1.6102417977137052e-05, - "loss": 0.1476, - "step": 6100 - }, - { - "epoch": 0.6207457076094687, - "grad_norm": 4.90625, - "learning_rate": 1.602713104750147e-05, - "loss": 0.1818, - "step": 6110 - }, - { - "epoch": 0.6217616580310881, - "grad_norm": 4.375, - "learning_rate": 1.5951937410056087e-05, - "loss": 0.2061, - "step": 6120 - }, - { - "epoch": 0.6227776084527075, - "grad_norm": 6.3125, - "learning_rate": 1.587683784659979e-05, - "loss": 0.1566, - "step": 6130 - }, - { - "epoch": 0.623793558874327, - "grad_norm": 2.828125, - "learning_rate": 1.58018331379534e-05, - "loss": 0.1376, - "step": 6140 - }, - { - "epoch": 0.6248095092959464, - "grad_norm": 2.40625, - "learning_rate": 1.572692406395149e-05, - "loss": 0.1655, - "step": 6150 - }, - { - "epoch": 0.6258254597175658, - "grad_norm": 4.34375, - "learning_rate": 1.5652111403434338e-05, - "loss": 0.2363, - "step": 6160 - }, - { - "epoch": 0.6268414101391853, - "grad_norm": 2.453125, - "learning_rate": 1.5577395934239757e-05, - "loss": 0.2464, - "step": 6170 - }, - { - "epoch": 0.6278573605608047, - "grad_norm": 2.53125, - "learning_rate": 1.5502778433195085e-05, - "loss": 0.1898, - "step": 6180 - }, - { - "epoch": 0.628873310982424, - "grad_norm": 2.28125, - "learning_rate": 1.5428259676109048e-05, - "loss": 0.1804, - "step": 6190 - }, - { - "epoch": 0.6298892614040434, - "grad_norm": 4.3125, - "learning_rate": 1.5353840437763732e-05, - "loss": 0.1409, - "step": 6200 - }, - { - "epoch": 0.630905211825663, - "grad_norm": 2.5625, - "learning_rate": 1.5279521491906496e-05, - "loss": 0.2449, - "step": 6210 - }, - { - "epoch": 0.6319211622472823, - "grad_norm": 3.0625, - "learning_rate": 1.520530361124195e-05, - "loss": 0.2103, - "step": 6220 - }, - { - "epoch": 0.6329371126689017, - "grad_norm": 2.609375, - "learning_rate": 1.5131187567423937e-05, - "loss": 0.2156, - "step": 6230 - }, - { - "epoch": 0.6339530630905211, - "grad_norm": 2.703125, - "learning_rate": 1.5057174131047446e-05, - "loss": 0.161, - "step": 6240 - }, - { - "epoch": 0.6349690135121406, - "grad_norm": 3.265625, - "learning_rate": 1.4983264071640679e-05, - "loss": 0.1757, - "step": 6250 - }, - { - "epoch": 0.63598496393376, - "grad_norm": 3.15625, - "learning_rate": 1.490945815765699e-05, - "loss": 0.2011, - "step": 6260 - }, - { - "epoch": 0.6370009143553794, - "grad_norm": 5.375, - "learning_rate": 1.4835757156466945e-05, - "loss": 0.1658, - "step": 6270 - }, - { - "epoch": 0.6380168647769989, - "grad_norm": 2.984375, - "learning_rate": 1.4762161834350271e-05, - "loss": 0.1754, - "step": 6280 - }, - { - "epoch": 0.6390328151986183, - "grad_norm": 2.015625, - "learning_rate": 1.4688672956487987e-05, - "loss": 0.1427, - "step": 6290 - }, - { - "epoch": 0.6400487656202377, - "grad_norm": 3.78125, - "learning_rate": 1.4615291286954352e-05, - "loss": 0.1517, - "step": 6300 - }, - { - "epoch": 0.6410647160418571, - "grad_norm": 2.859375, - "learning_rate": 1.4542017588709005e-05, - "loss": 0.2348, - "step": 6310 - }, - { - "epoch": 0.6420806664634766, - "grad_norm": 2.421875, - "learning_rate": 1.4468852623588961e-05, - "loss": 0.2089, - "step": 6320 - }, - { - "epoch": 0.643096616885096, - "grad_norm": 2.15625, - "learning_rate": 1.4395797152300719e-05, - "loss": 0.1702, - "step": 6330 - }, - { - "epoch": 0.6441125673067154, - "grad_norm": 1.53125, - "learning_rate": 1.4322851934412382e-05, - "loss": 0.1017, - "step": 6340 - }, - { - "epoch": 0.6451285177283349, - "grad_norm": 1.90625, - "learning_rate": 1.4250017728345716e-05, - "loss": 0.1813, - "step": 6350 - }, - { - "epoch": 0.6461444681499543, - "grad_norm": 2.015625, - "learning_rate": 1.4177295291368292e-05, - "loss": 0.1095, - "step": 6360 - }, - { - "epoch": 0.6471604185715737, - "grad_norm": 2.625, - "learning_rate": 1.410468537958558e-05, - "loss": 0.2259, - "step": 6370 - }, - { - "epoch": 0.6481763689931931, - "grad_norm": 3.5, - "learning_rate": 1.4032188747933136e-05, - "loss": 0.1595, - "step": 6380 - }, - { - "epoch": 0.6491923194148126, - "grad_norm": 5.21875, - "learning_rate": 1.39598061501687e-05, - "loss": 0.2226, - "step": 6390 - }, - { - "epoch": 0.650208269836432, - "grad_norm": 5.34375, - "learning_rate": 1.388753833886442e-05, - "loss": 0.2132, - "step": 6400 - }, - { - "epoch": 0.6512242202580514, - "grad_norm": 3.640625, - "learning_rate": 1.3815386065398945e-05, - "loss": 0.1227, - "step": 6410 - }, - { - "epoch": 0.6522401706796709, - "grad_norm": 1.0, - "learning_rate": 1.3743350079949705e-05, - "loss": 0.1755, - "step": 6420 - }, - { - "epoch": 0.6532561211012903, - "grad_norm": 2.359375, - "learning_rate": 1.3671431131485057e-05, - "loss": 0.1552, - "step": 6430 - }, - { - "epoch": 0.6542720715229097, - "grad_norm": 5.3125, - "learning_rate": 1.3599629967756483e-05, - "loss": 0.1917, - "step": 6440 - }, - { - "epoch": 0.6552880219445291, - "grad_norm": 4.625, - "learning_rate": 1.3527947335290877e-05, - "loss": 0.1812, - "step": 6450 - }, - { - "epoch": 0.6563039723661486, - "grad_norm": 1.234375, - "learning_rate": 1.3456383979382708e-05, - "loss": 0.1896, - "step": 6460 - }, - { - "epoch": 0.657319922787768, - "grad_norm": 3.984375, - "learning_rate": 1.3384940644086352e-05, - "loss": 0.1484, - "step": 6470 - }, - { - "epoch": 0.6583358732093874, - "grad_norm": 2.40625, - "learning_rate": 1.3313618072208268e-05, - "loss": 0.1334, - "step": 6480 - }, - { - "epoch": 0.6593518236310069, - "grad_norm": 4.375, - "learning_rate": 1.3242417005299357e-05, - "loss": 0.1351, - "step": 6490 - }, - { - "epoch": 0.6603677740526263, - "grad_norm": 2.640625, - "learning_rate": 1.31713381836472e-05, - "loss": 0.1717, - "step": 6500 - }, - { - "epoch": 0.6613837244742456, - "grad_norm": 2.640625, - "learning_rate": 1.3100382346268392e-05, - "loss": 0.1867, - "step": 6510 - }, - { - "epoch": 0.662399674895865, - "grad_norm": 1.734375, - "learning_rate": 1.3029550230900812e-05, - "loss": 0.1997, - "step": 6520 - }, - { - "epoch": 0.6634156253174845, - "grad_norm": 3.609375, - "learning_rate": 1.2958842573996016e-05, - "loss": 0.1969, - "step": 6530 - }, - { - "epoch": 0.6644315757391039, - "grad_norm": 3.578125, - "learning_rate": 1.2888260110711525e-05, - "loss": 0.1469, - "step": 6540 - }, - { - "epoch": 0.6654475261607233, - "grad_norm": 1.3515625, - "learning_rate": 1.2817803574903212e-05, - "loss": 0.1524, - "step": 6550 - }, - { - "epoch": 0.6664634765823427, - "grad_norm": 2.109375, - "learning_rate": 1.2747473699117668e-05, - "loss": 0.159, - "step": 6560 - }, - { - "epoch": 0.6674794270039622, - "grad_norm": 1.53125, - "learning_rate": 1.267727121458458e-05, - "loss": 0.1999, - "step": 6570 - }, - { - "epoch": 0.6684953774255816, - "grad_norm": 1.7265625, - "learning_rate": 1.2607196851209137e-05, - "loss": 0.2216, - "step": 6580 - }, - { - "epoch": 0.669511327847201, - "grad_norm": 3.125, - "learning_rate": 1.2537251337564412e-05, - "loss": 0.1607, - "step": 6590 - }, - { - "epoch": 0.6705272782688205, - "grad_norm": 2.421875, - "learning_rate": 1.2467435400883839e-05, - "loss": 0.2187, - "step": 6600 - }, - { - "epoch": 0.6715432286904399, - "grad_norm": 1.5078125, - "learning_rate": 1.239774976705359e-05, - "loss": 0.1753, - "step": 6610 - }, - { - "epoch": 0.6725591791120593, - "grad_norm": 1.140625, - "learning_rate": 1.2328195160605092e-05, - "loss": 0.194, - "step": 6620 - }, - { - "epoch": 0.6735751295336787, - "grad_norm": 4.9375, - "learning_rate": 1.225877230470743e-05, - "loss": 0.1485, - "step": 6630 - }, - { - "epoch": 0.6745910799552982, - "grad_norm": 3.65625, - "learning_rate": 1.218948192115988e-05, - "loss": 0.1847, - "step": 6640 - }, - { - "epoch": 0.6756070303769176, - "grad_norm": 3.875, - "learning_rate": 1.21203247303844e-05, - "loss": 0.1874, - "step": 6650 - }, - { - "epoch": 0.676622980798537, - "grad_norm": 2.65625, - "learning_rate": 1.2051301451418073e-05, - "loss": 0.2377, - "step": 6660 - }, - { - "epoch": 0.6776389312201565, - "grad_norm": 2.09375, - "learning_rate": 1.198241280190574e-05, - "loss": 0.1508, - "step": 6670 - }, - { - "epoch": 0.6786548816417759, - "grad_norm": 2.203125, - "learning_rate": 1.1913659498092431e-05, - "loss": 0.1537, - "step": 6680 - }, - { - "epoch": 0.6796708320633953, - "grad_norm": 2.484375, - "learning_rate": 1.184504225481601e-05, - "loss": 0.2339, - "step": 6690 - }, - { - "epoch": 0.6806867824850147, - "grad_norm": 5.625, - "learning_rate": 1.177656178549966e-05, - "loss": 0.2102, - "step": 6700 - }, - { - "epoch": 0.6817027329066342, - "grad_norm": 2.5, - "learning_rate": 1.1708218802144536e-05, - "loss": 0.1435, - "step": 6710 - }, - { - "epoch": 0.6827186833282536, - "grad_norm": 3.84375, - "learning_rate": 1.1640014015322323e-05, - "loss": 0.1823, - "step": 6720 - }, - { - "epoch": 0.683734633749873, - "grad_norm": 2.359375, - "learning_rate": 1.1571948134167862e-05, - "loss": 0.1154, - "step": 6730 - }, - { - "epoch": 0.6847505841714925, - "grad_norm": 2.90625, - "learning_rate": 1.1504021866371761e-05, - "loss": 0.2105, - "step": 6740 - }, - { - "epoch": 0.6857665345931119, - "grad_norm": 5.46875, - "learning_rate": 1.143623591817304e-05, - "loss": 0.1317, - "step": 6750 - }, - { - "epoch": 0.6867824850147313, - "grad_norm": 3.34375, - "learning_rate": 1.1368590994351835e-05, - "loss": 0.1406, - "step": 6760 - }, - { - "epoch": 0.6877984354363507, - "grad_norm": 3.78125, - "learning_rate": 1.130108779822198e-05, - "loss": 0.1425, - "step": 6770 - }, - { - "epoch": 0.6888143858579702, - "grad_norm": 0.77734375, - "learning_rate": 1.1233727031623783e-05, - "loss": 0.1623, - "step": 6780 - }, - { - "epoch": 0.6898303362795896, - "grad_norm": 4.625, - "learning_rate": 1.1166509394916682e-05, - "loss": 0.1591, - "step": 6790 - }, - { - "epoch": 0.690846286701209, - "grad_norm": 3.84375, - "learning_rate": 1.1099435586971982e-05, - "loss": 0.1758, - "step": 6800 - }, - { - "epoch": 0.6918622371228285, - "grad_norm": 2.4375, - "learning_rate": 1.1032506305165555e-05, - "loss": 0.1018, - "step": 6810 - }, - { - "epoch": 0.6928781875444479, - "grad_norm": 3.203125, - "learning_rate": 1.0965722245370641e-05, - "loss": 0.1485, - "step": 6820 - }, - { - "epoch": 0.6938941379660672, - "grad_norm": 0.7109375, - "learning_rate": 1.0899084101950561e-05, - "loss": 0.1762, - "step": 6830 - }, - { - "epoch": 0.6949100883876866, - "grad_norm": 1.9765625, - "learning_rate": 1.0832592567751555e-05, - "loss": 0.1402, - "step": 6840 - }, - { - "epoch": 0.6959260388093061, - "grad_norm": 1.4609375, - "learning_rate": 1.0766248334095505e-05, - "loss": 0.2278, - "step": 6850 - }, - { - "epoch": 0.6969419892309255, - "grad_norm": 3.953125, - "learning_rate": 1.0700052090772828e-05, - "loss": 0.1969, - "step": 6860 - }, - { - "epoch": 0.6979579396525449, - "grad_norm": 2.453125, - "learning_rate": 1.0634004526035249e-05, - "loss": 0.2073, - "step": 6870 - }, - { - "epoch": 0.6989738900741643, - "grad_norm": 1.6171875, - "learning_rate": 1.0568106326588645e-05, - "loss": 0.1902, - "step": 6880 - }, - { - "epoch": 0.6999898404957838, - "grad_norm": 1.2734375, - "learning_rate": 1.0502358177585953e-05, - "loss": 0.2165, - "step": 6890 - }, - { - "epoch": 0.7010057909174032, - "grad_norm": 1.671875, - "learning_rate": 1.0436760762619977e-05, - "loss": 0.1952, - "step": 6900 - }, - { - "epoch": 0.7020217413390226, - "grad_norm": 2.8125, - "learning_rate": 1.0371314763716347e-05, - "loss": 0.1422, - "step": 6910 - }, - { - "epoch": 0.7030376917606421, - "grad_norm": 2.53125, - "learning_rate": 1.0306020861326388e-05, - "loss": 0.0961, - "step": 6920 - }, - { - "epoch": 0.7040536421822615, - "grad_norm": 3.046875, - "learning_rate": 1.0240879734320068e-05, - "loss": 0.1542, - "step": 6930 - }, - { - "epoch": 0.7050695926038809, - "grad_norm": 2.859375, - "learning_rate": 1.0175892059978901e-05, - "loss": 0.1748, - "step": 6940 - }, - { - "epoch": 0.7060855430255003, - "grad_norm": 2.671875, - "learning_rate": 1.0111058513988958e-05, - "loss": 0.0819, - "step": 6950 - }, - { - "epoch": 0.7071014934471198, - "grad_norm": 3.5625, - "learning_rate": 1.0046379770433803e-05, - "loss": 0.1933, - "step": 6960 - }, - { - "epoch": 0.7081174438687392, - "grad_norm": 2.859375, - "learning_rate": 9.98185650178749e-06, - "loss": 0.1891, - "step": 6970 - }, - { - "epoch": 0.7091333942903586, - "grad_norm": 3.15625, - "learning_rate": 9.917489378907591e-06, - "loss": 0.2102, - "step": 6980 - }, - { - "epoch": 0.7101493447119781, - "grad_norm": 6.40625, - "learning_rate": 9.853279071028212e-06, - "loss": 0.1714, - "step": 6990 - }, - { - "epoch": 0.7111652951335975, - "grad_norm": 2.375, - "learning_rate": 9.78922624575303e-06, - "loss": 0.1299, - "step": 7000 - }, - { - "epoch": 0.7121812455552169, - "grad_norm": 2.078125, - "learning_rate": 9.72533156904833e-06, - "loss": 0.1914, - "step": 7010 - }, - { - "epoch": 0.7131971959768363, - "grad_norm": 3.859375, - "learning_rate": 9.661595705236137e-06, - "loss": 0.2377, - "step": 7020 - }, - { - "epoch": 0.7142131463984558, - "grad_norm": 1.171875, - "learning_rate": 9.598019316987244e-06, - "loss": 0.1851, - "step": 7030 - }, - { - "epoch": 0.7152290968200752, - "grad_norm": 1.078125, - "learning_rate": 9.53460306531439e-06, - "loss": 0.2661, - "step": 7040 - }, - { - "epoch": 0.7162450472416946, - "grad_norm": 1.6484375, - "learning_rate": 9.471347609565311e-06, - "loss": 0.1669, - "step": 7050 - }, - { - "epoch": 0.7172609976633141, - "grad_norm": 4.59375, - "learning_rate": 9.408253607415957e-06, - "loss": 0.2487, - "step": 7060 - }, - { - "epoch": 0.7182769480849335, - "grad_norm": 3.09375, - "learning_rate": 9.345321714863614e-06, - "loss": 0.186, - "step": 7070 - }, - { - "epoch": 0.7192928985065529, - "grad_norm": 6.0625, - "learning_rate": 9.282552586220075e-06, - "loss": 0.2249, - "step": 7080 - }, - { - "epoch": 0.7203088489281723, - "grad_norm": 1.5703125, - "learning_rate": 9.219946874104885e-06, - "loss": 0.1255, - "step": 7090 - }, - { - "epoch": 0.7213247993497918, - "grad_norm": 1.9453125, - "learning_rate": 9.157505229438481e-06, - "loss": 0.1999, - "step": 7100 - }, - { - "epoch": 0.7223407497714112, - "grad_norm": 5.1875, - "learning_rate": 9.095228301435518e-06, - "loss": 0.199, - "step": 7110 - }, - { - "epoch": 0.7233567001930306, - "grad_norm": 2.078125, - "learning_rate": 9.03311673759802e-06, - "loss": 0.2182, - "step": 7120 - }, - { - "epoch": 0.7243726506146501, - "grad_norm": 6.46875, - "learning_rate": 8.971171183708733e-06, - "loss": 0.1573, - "step": 7130 - }, - { - "epoch": 0.7253886010362695, - "grad_norm": 3.015625, - "learning_rate": 8.909392283824353e-06, - "loss": 0.2044, - "step": 7140 - }, - { - "epoch": 0.7264045514578888, - "grad_norm": 2.921875, - "learning_rate": 8.847780680268872e-06, - "loss": 0.11, - "step": 7150 - }, - { - "epoch": 0.7274205018795082, - "grad_norm": 2.96875, - "learning_rate": 8.786337013626853e-06, - "loss": 0.1897, - "step": 7160 - }, - { - "epoch": 0.7284364523011277, - "grad_norm": 1.7578125, - "learning_rate": 8.725061922736799e-06, - "loss": 0.153, - "step": 7170 - }, - { - "epoch": 0.7294524027227471, - "grad_norm": 1.609375, - "learning_rate": 8.663956044684532e-06, - "loss": 0.1746, - "step": 7180 - }, - { - "epoch": 0.7304683531443665, - "grad_norm": 1.9375, - "learning_rate": 8.603020014796507e-06, - "loss": 0.2284, - "step": 7190 - }, - { - "epoch": 0.7314843035659859, - "grad_norm": 1.515625, - "learning_rate": 8.542254466633273e-06, - "loss": 0.1186, - "step": 7200 - }, - { - "epoch": 0.7325002539876054, - "grad_norm": 1.671875, - "learning_rate": 8.481660031982844e-06, - "loss": 0.1971, - "step": 7210 - }, - { - "epoch": 0.7335162044092248, - "grad_norm": 1.453125, - "learning_rate": 8.421237340854157e-06, - "loss": 0.196, - "step": 7220 - }, - { - "epoch": 0.7345321548308442, - "grad_norm": 0.65234375, - "learning_rate": 8.360987021470479e-06, - "loss": 0.1724, - "step": 7230 - }, - { - "epoch": 0.7355481052524637, - "grad_norm": 2.84375, - "learning_rate": 8.300909700262929e-06, - "loss": 0.175, - "step": 7240 - }, - { - "epoch": 0.7365640556740831, - "grad_norm": 3.109375, - "learning_rate": 8.241006001863924e-06, - "loss": 0.2276, - "step": 7250 - }, - { - "epoch": 0.7375800060957025, - "grad_norm": 4.8125, - "learning_rate": 8.181276549100714e-06, - "loss": 0.2029, - "step": 7260 - }, - { - "epoch": 0.7385959565173219, - "grad_norm": 4.03125, - "learning_rate": 8.12172196298887e-06, - "loss": 0.175, - "step": 7270 - }, - { - "epoch": 0.7396119069389414, - "grad_norm": 3.046875, - "learning_rate": 8.062342862725878e-06, - "loss": 0.1662, - "step": 7280 - }, - { - "epoch": 0.7406278573605608, - "grad_norm": 3.375, - "learning_rate": 8.003139865684662e-06, - "loss": 0.1616, - "step": 7290 - }, - { - "epoch": 0.7416438077821802, - "grad_norm": 2.5625, - "learning_rate": 7.944113587407157e-06, - "loss": 0.2448, - "step": 7300 - }, - { - "epoch": 0.7426597582037997, - "grad_norm": 4.125, - "learning_rate": 7.885264641597961e-06, - "loss": 0.1618, - "step": 7310 - }, - { - "epoch": 0.7436757086254191, - "grad_norm": 3.5, - "learning_rate": 7.826593640117889e-06, - "loss": 0.1134, - "step": 7320 - }, - { - "epoch": 0.7446916590470385, - "grad_norm": 2.6875, - "learning_rate": 7.76810119297767e-06, - "loss": 0.1795, - "step": 7330 - }, - { - "epoch": 0.7457076094686579, - "grad_norm": 4.34375, - "learning_rate": 7.709787908331556e-06, - "loss": 0.2736, - "step": 7340 - }, - { - "epoch": 0.7467235598902774, - "grad_norm": 1.21875, - "learning_rate": 7.651654392471038e-06, - "loss": 0.139, - "step": 7350 - }, - { - "epoch": 0.7477395103118968, - "grad_norm": 3.578125, - "learning_rate": 7.593701249818521e-06, - "loss": 0.2023, - "step": 7360 - }, - { - "epoch": 0.7487554607335162, - "grad_norm": 2.15625, - "learning_rate": 7.535929082921048e-06, - "loss": 0.1702, - "step": 7370 - }, - { - "epoch": 0.7497714111551357, - "grad_norm": 1.96875, - "learning_rate": 7.47833849244402e-06, - "loss": 0.1835, - "step": 7380 - }, - { - "epoch": 0.7507873615767551, - "grad_norm": 2.796875, - "learning_rate": 7.420930077164959e-06, - "loss": 0.1713, - "step": 7390 - }, - { - "epoch": 0.7518033119983745, - "grad_norm": 4.46875, - "learning_rate": 7.363704433967311e-06, - "loss": 0.1906, - "step": 7400 - }, - { - "epoch": 0.7528192624199939, - "grad_norm": 1.75, - "learning_rate": 7.306662157834185e-06, - "loss": 0.1421, - "step": 7410 - }, - { - "epoch": 0.7538352128416134, - "grad_norm": 1.140625, - "learning_rate": 7.2498038418422145e-06, - "loss": 0.1793, - "step": 7420 - }, - { - "epoch": 0.7548511632632328, - "grad_norm": 2.578125, - "learning_rate": 7.193130077155374e-06, - "loss": 0.1603, - "step": 7430 - }, - { - "epoch": 0.7558671136848522, - "grad_norm": 4.3125, - "learning_rate": 7.13664145301883e-06, - "loss": 0.2169, - "step": 7440 - }, - { - "epoch": 0.7568830641064717, - "grad_norm": 3.078125, - "learning_rate": 7.0803385567528025e-06, - "loss": 0.1685, - "step": 7450 - }, - { - "epoch": 0.757899014528091, - "grad_norm": 3.5625, - "learning_rate": 7.024221973746495e-06, - "loss": 0.2282, - "step": 7460 - }, - { - "epoch": 0.7589149649497104, - "grad_norm": 2.265625, - "learning_rate": 6.968292287451961e-06, - "loss": 0.1786, - "step": 7470 - }, - { - "epoch": 0.7599309153713298, - "grad_norm": 4.71875, - "learning_rate": 6.912550079378091e-06, - "loss": 0.1811, - "step": 7480 - }, - { - "epoch": 0.7609468657929493, - "grad_norm": 2.328125, - "learning_rate": 6.856995929084506e-06, - "loss": 0.1747, - "step": 7490 - }, - { - "epoch": 0.7619628162145687, - "grad_norm": 5.21875, - "learning_rate": 6.801630414175589e-06, - "loss": 0.2028, - "step": 7500 - }, - { - "epoch": 0.7629787666361881, - "grad_norm": 3.78125, - "learning_rate": 6.746454110294451e-06, - "loss": 0.2255, - "step": 7510 - }, - { - "epoch": 0.7639947170578075, - "grad_norm": 1.625, - "learning_rate": 6.691467591116931e-06, - "loss": 0.1604, - "step": 7520 - }, - { - "epoch": 0.765010667479427, - "grad_norm": 1.7734375, - "learning_rate": 6.6366714283456755e-06, - "loss": 0.2559, - "step": 7530 - }, - { - "epoch": 0.7660266179010464, - "grad_norm": 4.59375, - "learning_rate": 6.582066191704142e-06, - "loss": 0.2034, - "step": 7540 - }, - { - "epoch": 0.7670425683226658, - "grad_norm": 1.578125, - "learning_rate": 6.527652448930724e-06, - "loss": 0.148, - "step": 7550 - }, - { - "epoch": 0.7680585187442853, - "grad_norm": 1.7109375, - "learning_rate": 6.4734307657728e-06, - "loss": 0.1811, - "step": 7560 - }, - { - "epoch": 0.7690744691659047, - "grad_norm": 1.2734375, - "learning_rate": 6.419401705980924e-06, - "loss": 0.1407, - "step": 7570 - }, - { - "epoch": 0.7700904195875241, - "grad_norm": 2.25, - "learning_rate": 6.365565831302869e-06, - "loss": 0.1893, - "step": 7580 - }, - { - "epoch": 0.7711063700091435, - "grad_norm": 1.625, - "learning_rate": 6.311923701477854e-06, - "loss": 0.1835, - "step": 7590 - }, - { - "epoch": 0.772122320430763, - "grad_norm": 2.375, - "learning_rate": 6.258475874230713e-06, - "loss": 0.1579, - "step": 7600 - }, - { - "epoch": 0.7731382708523824, - "grad_norm": 4.5, - "learning_rate": 6.205222905266067e-06, - "loss": 0.1794, - "step": 7610 - }, - { - "epoch": 0.7741542212740018, - "grad_norm": 4.25, - "learning_rate": 6.152165348262598e-06, - "loss": 0.1477, - "step": 7620 - }, - { - "epoch": 0.7751701716956213, - "grad_norm": 1.9765625, - "learning_rate": 6.0993037548672246e-06, - "loss": 0.2396, - "step": 7630 - }, - { - "epoch": 0.7761861221172407, - "grad_norm": 2.671875, - "learning_rate": 6.046638674689454e-06, - "loss": 0.1717, - "step": 7640 - }, - { - "epoch": 0.7772020725388601, - "grad_norm": 3.671875, - "learning_rate": 5.994170655295567e-06, - "loss": 0.2646, - "step": 7650 - }, - { - "epoch": 0.7782180229604795, - "grad_norm": 1.3046875, - "learning_rate": 5.9419002422030106e-06, - "loss": 0.1553, - "step": 7660 - }, - { - "epoch": 0.779233973382099, - "grad_norm": 3.734375, - "learning_rate": 5.889827978874665e-06, - "loss": 0.1854, - "step": 7670 - }, - { - "epoch": 0.7802499238037184, - "grad_norm": 2.140625, - "learning_rate": 5.837954406713245e-06, - "loss": 0.1857, - "step": 7680 - }, - { - "epoch": 0.7812658742253378, - "grad_norm": 3.34375, - "learning_rate": 5.786280065055619e-06, - "loss": 0.1797, - "step": 7690 - }, - { - "epoch": 0.7822818246469573, - "grad_norm": 0.97265625, - "learning_rate": 5.734805491167244e-06, - "loss": 0.1488, - "step": 7700 - }, - { - "epoch": 0.7832977750685767, - "grad_norm": 2.078125, - "learning_rate": 5.683531220236576e-06, - "loss": 0.1688, - "step": 7710 - }, - { - "epoch": 0.7843137254901961, - "grad_norm": 3.046875, - "learning_rate": 5.632457785369455e-06, - "loss": 0.1503, - "step": 7720 - }, - { - "epoch": 0.7853296759118155, - "grad_norm": 1.6875, - "learning_rate": 5.581585717583637e-06, - "loss": 0.1658, - "step": 7730 - }, - { - "epoch": 0.786345626333435, - "grad_norm": 3.421875, - "learning_rate": 5.530915545803209e-06, - "loss": 0.2112, - "step": 7740 - }, - { - "epoch": 0.7873615767550544, - "grad_norm": 4.1875, - "learning_rate": 5.480447796853141e-06, - "loss": 0.165, - "step": 7750 - }, - { - "epoch": 0.7883775271766738, - "grad_norm": 5.3125, - "learning_rate": 5.430182995453756e-06, - "loss": 0.1499, - "step": 7760 - }, - { - "epoch": 0.7893934775982933, - "grad_norm": 2.1875, - "learning_rate": 5.380121664215329e-06, - "loss": 0.1559, - "step": 7770 - }, - { - "epoch": 0.7904094280199127, - "grad_norm": 1.46875, - "learning_rate": 5.330264323632611e-06, - "loss": 0.2098, - "step": 7780 - }, - { - "epoch": 0.791425378441532, - "grad_norm": 4.65625, - "learning_rate": 5.280611492079449e-06, - "loss": 0.1776, - "step": 7790 - }, - { - "epoch": 0.7924413288631514, - "grad_norm": 1.3359375, - "learning_rate": 5.231163685803361e-06, - "loss": 0.1497, - "step": 7800 - }, - { - "epoch": 0.7934572792847709, - "grad_norm": 2.640625, - "learning_rate": 5.181921418920191e-06, - "loss": 0.12, - "step": 7810 - }, - { - "epoch": 0.7944732297063903, - "grad_norm": 2.328125, - "learning_rate": 5.13288520340878e-06, - "loss": 0.1981, - "step": 7820 - }, - { - "epoch": 0.7954891801280097, - "grad_norm": 3.0625, - "learning_rate": 5.084055549105596e-06, - "loss": 0.1389, - "step": 7830 - }, - { - "epoch": 0.7965051305496291, - "grad_norm": 2.796875, - "learning_rate": 5.035432963699479e-06, - "loss": 0.2293, - "step": 7840 - }, - { - "epoch": 0.7975210809712486, - "grad_norm": 5.0625, - "learning_rate": 4.98701795272635e-06, - "loss": 0.1618, - "step": 7850 - }, - { - "epoch": 0.798537031392868, - "grad_norm": 5.09375, - "learning_rate": 4.938811019563938e-06, - "loss": 0.1755, - "step": 7860 - }, - { - "epoch": 0.7995529818144874, - "grad_norm": 2.140625, - "learning_rate": 4.8908126654265475e-06, - "loss": 0.1565, - "step": 7870 - }, - { - "epoch": 0.8005689322361069, - "grad_norm": 0.76171875, - "learning_rate": 4.843023389359885e-06, - "loss": 0.2176, - "step": 7880 - }, - { - "epoch": 0.8015848826577263, - "grad_norm": 2.625, - "learning_rate": 4.79544368823581e-06, - "loss": 0.2013, - "step": 7890 - }, - { - "epoch": 0.8026008330793457, - "grad_norm": 2.078125, - "learning_rate": 4.748074056747234e-06, - "loss": 0.1246, - "step": 7900 - }, - { - "epoch": 0.8036167835009651, - "grad_norm": 3.5, - "learning_rate": 4.700914987402919e-06, - "loss": 0.1638, - "step": 7910 - }, - { - "epoch": 0.8046327339225846, - "grad_norm": 3.4375, - "learning_rate": 4.6539669705223916e-06, - "loss": 0.2213, - "step": 7920 - }, - { - "epoch": 0.805648684344204, - "grad_norm": 2.96875, - "learning_rate": 4.607230494230849e-06, - "loss": 0.1822, - "step": 7930 - }, - { - "epoch": 0.8066646347658234, - "grad_norm": 2.359375, - "learning_rate": 4.560706044454047e-06, - "loss": 0.1763, - "step": 7940 - }, - { - "epoch": 0.8076805851874429, - "grad_norm": 4.59375, - "learning_rate": 4.514394104913291e-06, - "loss": 0.234, - "step": 7950 - }, - { - "epoch": 0.8086965356090623, - "grad_norm": 1.96875, - "learning_rate": 4.468295157120372e-06, - "loss": 0.1939, - "step": 7960 - }, - { - "epoch": 0.8097124860306817, - "grad_norm": 2.578125, - "learning_rate": 4.422409680372594e-06, - "loss": 0.174, - "step": 7970 - }, - { - "epoch": 0.8107284364523011, - "grad_norm": 4.5625, - "learning_rate": 4.3767381517477505e-06, - "loss": 0.2375, - "step": 7980 - }, - { - "epoch": 0.8117443868739206, - "grad_norm": 0.9609375, - "learning_rate": 4.331281046099203e-06, - "loss": 0.2076, - "step": 7990 - }, - { - "epoch": 0.81276033729554, - "grad_norm": 6.0625, - "learning_rate": 4.286038836050929e-06, - "loss": 0.2504, - "step": 8000 - }, - { - "epoch": 0.8137762877171594, - "grad_norm": 3.484375, - "learning_rate": 4.241011991992586e-06, - "loss": 0.2102, - "step": 8010 - }, - { - "epoch": 0.8147922381387789, - "grad_norm": 1.9765625, - "learning_rate": 4.1962009820746635e-06, - "loss": 0.1846, - "step": 8020 - }, - { - "epoch": 0.8158081885603983, - "grad_norm": 1.875, - "learning_rate": 4.15160627220357e-06, - "loss": 0.1741, - "step": 8030 - }, - { - "epoch": 0.8168241389820177, - "grad_norm": 5.5625, - "learning_rate": 4.107228326036838e-06, - "loss": 0.2078, - "step": 8040 - }, - { - "epoch": 0.8178400894036371, - "grad_norm": 1.7578125, - "learning_rate": 4.063067604978252e-06, - "loss": 0.212, - "step": 8050 - }, - { - "epoch": 0.8188560398252566, - "grad_norm": 4.09375, - "learning_rate": 4.019124568173094e-06, - "loss": 0.1831, - "step": 8060 - }, - { - "epoch": 0.819871990246876, - "grad_norm": 6.625, - "learning_rate": 3.975399672503341e-06, - "loss": 0.2196, - "step": 8070 - }, - { - "epoch": 0.8208879406684954, - "grad_norm": 2.78125, - "learning_rate": 3.931893372582943e-06, - "loss": 0.2002, - "step": 8080 - }, - { - "epoch": 0.8219038910901149, - "grad_norm": 6.90625, - "learning_rate": 3.888606120753047e-06, - "loss": 0.2138, - "step": 8090 - }, - { - "epoch": 0.8229198415117343, - "grad_norm": 4.09375, - "learning_rate": 3.845538367077362e-06, - "loss": 0.2593, - "step": 8100 - }, - { - "epoch": 0.8239357919333536, - "grad_norm": 1.859375, - "learning_rate": 3.8026905593374213e-06, - "loss": 0.2062, - "step": 8110 - }, - { - "epoch": 0.824951742354973, - "grad_norm": 4.3125, - "learning_rate": 3.760063143027945e-06, - "loss": 0.1343, - "step": 8120 - }, - { - "epoch": 0.8259676927765925, - "grad_norm": 1.984375, - "learning_rate": 3.7176565613522313e-06, - "loss": 0.2494, - "step": 8130 - }, - { - "epoch": 0.8269836431982119, - "grad_norm": 3.71875, - "learning_rate": 3.675471255217516e-06, - "loss": 0.1502, - "step": 8140 - }, - { - "epoch": 0.8279995936198313, - "grad_norm": 2.359375, - "learning_rate": 3.6335076632304175e-06, - "loss": 0.1256, - "step": 8150 - }, - { - "epoch": 0.8290155440414507, - "grad_norm": 1.46875, - "learning_rate": 3.5917662216923332e-06, - "loss": 0.1709, - "step": 8160 - }, - { - "epoch": 0.8300314944630702, - "grad_norm": 2.78125, - "learning_rate": 3.550247364594958e-06, - "loss": 0.1881, - "step": 8170 - }, - { - "epoch": 0.8310474448846896, - "grad_norm": 1.0703125, - "learning_rate": 3.508951523615725e-06, - "loss": 0.1998, - "step": 8180 - }, - { - "epoch": 0.832063395306309, - "grad_norm": 2.40625, - "learning_rate": 3.467879128113352e-06, - "loss": 0.2429, - "step": 8190 - }, - { - "epoch": 0.8330793457279285, - "grad_norm": 2.609375, - "learning_rate": 3.427030605123352e-06, - "loss": 0.1942, - "step": 8200 - }, - { - "epoch": 0.8340952961495479, - "grad_norm": 1.6015625, - "learning_rate": 3.3864063793536043e-06, - "loss": 0.1898, - "step": 8210 - }, - { - "epoch": 0.8351112465711673, - "grad_norm": 5.375, - "learning_rate": 3.3460068731799577e-06, - "loss": 0.1919, - "step": 8220 - }, - { - "epoch": 0.8361271969927867, - "grad_norm": 3.3125, - "learning_rate": 3.3058325066417818e-06, - "loss": 0.1516, - "step": 8230 - }, - { - "epoch": 0.8371431474144062, - "grad_norm": 0.76171875, - "learning_rate": 3.26588369743768e-06, - "loss": 0.1068, - "step": 8240 - }, - { - "epoch": 0.8381590978360256, - "grad_norm": 3.171875, - "learning_rate": 3.2261608609210653e-06, - "loss": 0.1203, - "step": 8250 - }, - { - "epoch": 0.839175048257645, - "grad_norm": 2.359375, - "learning_rate": 3.186664410095913e-06, - "loss": 0.2172, - "step": 8260 - }, - { - "epoch": 0.8401909986792645, - "grad_norm": 3.328125, - "learning_rate": 3.1473947556124093e-06, - "loss": 0.1249, - "step": 8270 - }, - { - "epoch": 0.8412069491008839, - "grad_norm": 2.484375, - "learning_rate": 3.1083523057627213e-06, - "loss": 0.1744, - "step": 8280 - }, - { - "epoch": 0.8422228995225033, - "grad_norm": 4.46875, - "learning_rate": 3.0695374664767353e-06, - "loss": 0.1772, - "step": 8290 - }, - { - "epoch": 0.8432388499441227, - "grad_norm": 0.59375, - "learning_rate": 3.0309506413178397e-06, - "loss": 0.2302, - "step": 8300 - }, - { - "epoch": 0.8442548003657422, - "grad_norm": 2.390625, - "learning_rate": 2.9925922314787136e-06, - "loss": 0.1635, - "step": 8310 - }, - { - "epoch": 0.8452707507873616, - "grad_norm": 2.34375, - "learning_rate": 2.954462635777194e-06, - "loss": 0.1573, - "step": 8320 - }, - { - "epoch": 0.846286701208981, - "grad_norm": 2.015625, - "learning_rate": 2.916562250652083e-06, - "loss": 0.1608, - "step": 8330 - }, - { - "epoch": 0.8473026516306005, - "grad_norm": 4.125, - "learning_rate": 2.878891470159048e-06, - "loss": 0.184, - "step": 8340 - }, - { - "epoch": 0.8483186020522199, - "grad_norm": 2.515625, - "learning_rate": 2.8414506859665514e-06, - "loss": 0.2141, - "step": 8350 - }, - { - "epoch": 0.8493345524738393, - "grad_norm": 3.375, - "learning_rate": 2.8042402873517197e-06, - "loss": 0.1729, - "step": 8360 - }, - { - "epoch": 0.8503505028954587, - "grad_norm": 3.078125, - "learning_rate": 2.76726066119635e-06, - "loss": 0.2252, - "step": 8370 - }, - { - "epoch": 0.8513664533170782, - "grad_norm": 1.5390625, - "learning_rate": 2.730512191982845e-06, - "loss": 0.1644, - "step": 8380 - }, - { - "epoch": 0.8523824037386976, - "grad_norm": 1.9296875, - "learning_rate": 2.693995261790261e-06, - "loss": 0.1822, - "step": 8390 - }, - { - "epoch": 0.853398354160317, - "grad_norm": 3.3125, - "learning_rate": 2.657710250290285e-06, - "loss": 0.2068, - "step": 8400 - }, - { - "epoch": 0.8544143045819365, - "grad_norm": 0.640625, - "learning_rate": 2.621657534743327e-06, - "loss": 0.1224, - "step": 8410 - }, - { - "epoch": 0.8554302550035559, - "grad_norm": 3.421875, - "learning_rate": 2.5858374899945804e-06, - "loss": 0.179, - "step": 8420 - }, - { - "epoch": 0.8564462054251752, - "grad_norm": 3.484375, - "learning_rate": 2.550250488470135e-06, - "loss": 0.1873, - "step": 8430 - }, - { - "epoch": 0.8574621558467946, - "grad_norm": 3.984375, - "learning_rate": 2.5148969001730806e-06, - "loss": 0.1799, - "step": 8440 - }, - { - "epoch": 0.8584781062684141, - "grad_norm": 1.375, - "learning_rate": 2.4797770926796858e-06, - "loss": 0.176, - "step": 8450 - }, - { - "epoch": 0.8594940566900335, - "grad_norm": 1.8984375, - "learning_rate": 2.444891431135571e-06, - "loss": 0.1664, - "step": 8460 - }, - { - "epoch": 0.8605100071116529, - "grad_norm": 4.15625, - "learning_rate": 2.4102402782518936e-06, - "loss": 0.1512, - "step": 8470 - }, - { - "epoch": 0.8615259575332723, - "grad_norm": 1.34375, - "learning_rate": 2.3758239943016096e-06, - "loss": 0.1629, - "step": 8480 - }, - { - "epoch": 0.8625419079548918, - "grad_norm": 5.3125, - "learning_rate": 2.3416429371157013e-06, - "loss": 0.2099, - "step": 8490 - }, - { - "epoch": 0.8635578583765112, - "grad_norm": 5.9375, - "learning_rate": 2.307697462079464e-06, - "loss": 0.2221, - "step": 8500 - }, - { - "epoch": 0.8645738087981306, - "grad_norm": 5.4375, - "learning_rate": 2.273987922128809e-06, - "loss": 0.2191, - "step": 8510 - }, - { - "epoch": 0.8655897592197501, - "grad_norm": 2.171875, - "learning_rate": 2.240514667746607e-06, - "loss": 0.1843, - "step": 8520 - }, - { - "epoch": 0.8666057096413695, - "grad_norm": 2.5625, - "learning_rate": 2.2072780469590245e-06, - "loss": 0.2494, - "step": 8530 - }, - { - "epoch": 0.8676216600629889, - "grad_norm": 2.25, - "learning_rate": 2.1742784053319116e-06, - "loss": 0.1712, - "step": 8540 - }, - { - "epoch": 0.8686376104846083, - "grad_norm": 4.5625, - "learning_rate": 2.141516085967224e-06, - "loss": 0.1169, - "step": 8550 - }, - { - "epoch": 0.8696535609062278, - "grad_norm": 4.25, - "learning_rate": 2.1089914294994434e-06, - "loss": 0.1374, - "step": 8560 - }, - { - "epoch": 0.8706695113278472, - "grad_norm": 3.265625, - "learning_rate": 2.0767047740920336e-06, - "loss": 0.2162, - "step": 8570 - }, - { - "epoch": 0.8716854617494666, - "grad_norm": 1.8203125, - "learning_rate": 2.0446564554339187e-06, - "loss": 0.1593, - "step": 8580 - }, - { - "epoch": 0.8727014121710861, - "grad_norm": 2.671875, - "learning_rate": 2.0128468067360185e-06, - "loss": 0.1857, - "step": 8590 - }, - { - "epoch": 0.8737173625927055, - "grad_norm": 2.765625, - "learning_rate": 1.981276158727749e-06, - "loss": 0.1989, - "step": 8600 - }, - { - "epoch": 0.8747333130143249, - "grad_norm": 2.65625, - "learning_rate": 1.949944839653625e-06, - "loss": 0.2077, - "step": 8610 - }, - { - "epoch": 0.8757492634359443, - "grad_norm": 2.625, - "learning_rate": 1.918853175269797e-06, - "loss": 0.2003, - "step": 8620 - }, - { - "epoch": 0.8767652138575638, - "grad_norm": 0.71875, - "learning_rate": 1.8880014888407127e-06, - "loss": 0.2486, - "step": 8630 - }, - { - "epoch": 0.8777811642791832, - "grad_norm": 4.71875, - "learning_rate": 1.8573901011357336e-06, - "loss": 0.1896, - "step": 8640 - }, - { - "epoch": 0.8787971147008026, - "grad_norm": 5.0625, - "learning_rate": 1.8270193304257887e-06, - "loss": 0.1727, - "step": 8650 - }, - { - "epoch": 0.8798130651224221, - "grad_norm": 1.75, - "learning_rate": 1.7968894924800916e-06, - "loss": 0.1687, - "step": 8660 - }, - { - "epoch": 0.8808290155440415, - "grad_norm": 2.65625, - "learning_rate": 1.7670009005628291e-06, - "loss": 0.166, - "step": 8670 - }, - { - "epoch": 0.8818449659656609, - "grad_norm": 4.71875, - "learning_rate": 1.737353865429936e-06, - "loss": 0.1471, - "step": 8680 - }, - { - "epoch": 0.8828609163872803, - "grad_norm": 0.546875, - "learning_rate": 1.7079486953258283e-06, - "loss": 0.1075, - "step": 8690 - }, - { - "epoch": 0.8838768668088998, - "grad_norm": 1.640625, - "learning_rate": 1.6787856959802367e-06, - "loss": 0.2113, - "step": 8700 - }, - { - "epoch": 0.8848928172305192, - "grad_norm": 2.953125, - "learning_rate": 1.6498651706049945e-06, - "loss": 0.1412, - "step": 8710 - }, - { - "epoch": 0.8859087676521386, - "grad_norm": 3.796875, - "learning_rate": 1.6211874198909072e-06, - "loss": 0.1701, - "step": 8720 - }, - { - "epoch": 0.8869247180737581, - "grad_norm": 3.734375, - "learning_rate": 1.592752742004605e-06, - "loss": 0.1348, - "step": 8730 - }, - { - "epoch": 0.8879406684953774, - "grad_norm": 2.21875, - "learning_rate": 1.5645614325854735e-06, - "loss": 0.1931, - "step": 8740 - }, - { - "epoch": 0.8889566189169968, - "grad_norm": 3.4375, - "learning_rate": 1.5366137847425466e-06, - "loss": 0.1705, - "step": 8750 - }, - { - "epoch": 0.8899725693386162, - "grad_norm": 3.5625, - "learning_rate": 1.5089100890514769e-06, - "loss": 0.1889, - "step": 8760 - }, - { - "epoch": 0.8909885197602357, - "grad_norm": 2.65625, - "learning_rate": 1.4814506335515176e-06, - "loss": 0.1837, - "step": 8770 - }, - { - "epoch": 0.8920044701818551, - "grad_norm": 1.421875, - "learning_rate": 1.4542357037425207e-06, - "loss": 0.1728, - "step": 8780 - }, - { - "epoch": 0.8930204206034745, - "grad_norm": 1.625, - "learning_rate": 1.4272655825819713e-06, - "loss": 0.1562, - "step": 8790 - }, - { - "epoch": 0.8940363710250939, - "grad_norm": 4.0625, - "learning_rate": 1.4005405504820351e-06, - "loss": 0.1681, - "step": 8800 - }, - { - "epoch": 0.8950523214467134, - "grad_norm": 2.328125, - "learning_rate": 1.3740608853066634e-06, - "loss": 0.1449, - "step": 8810 - }, - { - "epoch": 0.8960682718683328, - "grad_norm": 4.0625, - "learning_rate": 1.347826862368684e-06, - "loss": 0.2418, - "step": 8820 - }, - { - "epoch": 0.8970842222899522, - "grad_norm": 0.55859375, - "learning_rate": 1.3218387544269545e-06, - "loss": 0.2473, - "step": 8830 - }, - { - "epoch": 0.8981001727115717, - "grad_norm": 4.78125, - "learning_rate": 1.2960968316835132e-06, - "loss": 0.194, - "step": 8840 - }, - { - "epoch": 0.8991161231331911, - "grad_norm": 3.921875, - "learning_rate": 1.2706013617807822e-06, - "loss": 0.2109, - "step": 8850 - }, - { - "epoch": 0.9001320735548105, - "grad_norm": 5.03125, - "learning_rate": 1.2453526097987778e-06, - "loss": 0.151, - "step": 8860 - }, - { - "epoch": 0.9011480239764299, - "grad_norm": 5.96875, - "learning_rate": 1.2203508382523431e-06, - "loss": 0.1811, - "step": 8870 - }, - { - "epoch": 0.9021639743980494, - "grad_norm": 3.828125, - "learning_rate": 1.1955963070884534e-06, - "loss": 0.2004, - "step": 8880 - }, - { - "epoch": 0.9031799248196688, - "grad_norm": 1.9765625, - "learning_rate": 1.171089273683465e-06, - "loss": 0.1395, - "step": 8890 - }, - { - "epoch": 0.9041958752412882, - "grad_norm": 2.328125, - "learning_rate": 1.1468299928404868e-06, - "loss": 0.1915, - "step": 8900 - }, - { - "epoch": 0.9052118256629077, - "grad_norm": 1.265625, - "learning_rate": 1.1228187167866943e-06, - "loss": 0.1281, - "step": 8910 - }, - { - "epoch": 0.9062277760845271, - "grad_norm": 1.4375, - "learning_rate": 1.099055695170728e-06, - "loss": 0.1627, - "step": 8920 - }, - { - "epoch": 0.9072437265061465, - "grad_norm": 0.6953125, - "learning_rate": 1.0755411750600962e-06, - "loss": 0.1768, - "step": 8930 - }, - { - "epoch": 0.9082596769277659, - "grad_norm": 1.046875, - "learning_rate": 1.052275400938596e-06, - "loss": 0.1544, - "step": 8940 - }, - { - "epoch": 0.9092756273493854, - "grad_norm": 2.71875, - "learning_rate": 1.0292586147037764e-06, - "loss": 0.2498, - "step": 8950 - }, - { - "epoch": 0.9102915777710048, - "grad_norm": 3.0625, - "learning_rate": 1.0064910556644214e-06, - "loss": 0.1918, - "step": 8960 - }, - { - "epoch": 0.9113075281926242, - "grad_norm": 4.0, - "learning_rate": 9.839729605380766e-07, - "loss": 0.2388, - "step": 8970 - }, - { - "epoch": 0.9123234786142437, - "grad_norm": 3.765625, - "learning_rate": 9.61704563448565e-07, - "loss": 0.1944, - "step": 8980 - }, - { - "epoch": 0.9133394290358631, - "grad_norm": 2.90625, - "learning_rate": 9.396860959235671e-07, - "loss": 0.1667, - "step": 8990 - }, - { - "epoch": 0.9143553794574825, - "grad_norm": 2.4375, - "learning_rate": 9.179177868922085e-07, - "loss": 0.2143, - "step": 9000 - } - ], - "logging_steps": 10, - "max_steps": 9843, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "total_flos": 0.0, - "train_batch_size": 7, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoints/checkpoint-9000/training_args.bin b/checkpoints/checkpoint-9000/training_args.bin deleted file mode 100644 index 6bb0a4ec10ab29ca1942aecdfa4212d352e373f1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-9000/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 -size 4984 diff --git a/checkpoints/checkpoint-9500/adapter_config.json b/checkpoints/checkpoint-9500/adapter_config.json deleted file mode 100644 index 2f24c7f88c4378c16a55c7a5a7e4079b1e95ac70..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-9500/adapter_config.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "alpha_pattern": {}, - "auto_mapping": null, - "base_model_name_or_path": "openbmb/MiniCPM-2B-dpo-bf16", - "bias": "none", - "fan_in_fan_out": false, - "inference_mode": true, - "init_lora_weights": "gaussian", - "layer_replication": null, - "layers_pattern": null, - "layers_to_transform": null, - "loftq_config": {}, - "lora_alpha": 32, - "lora_dropout": 0.1, - "megatron_config": null, - "megatron_core": "megatron.core", - "modules_to_save": null, - "peft_type": "LORA", - "r": 8, - "rank_pattern": {}, - "revision": null, - "target_modules": [ - "v_proj", - "q_proj" - ], - "task_type": "CAUSAL_LM", - "use_dora": false, - "use_rslora": false -} \ No newline at end of file diff --git a/checkpoints/checkpoint-9500/adapter_model.safetensors b/checkpoints/checkpoint-9500/adapter_model.safetensors deleted file mode 100644 index 4873d7ac0dd93d300c5305928706f0491b7f09e2..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-9500/adapter_model.safetensors +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b2b633ee47b5f3a143fc9c71b1dd0cfeb40cc097a762104327a42af91522c1dc -size 5919456 diff --git a/checkpoints/checkpoint-9500/optimizer.pt b/checkpoints/checkpoint-9500/optimizer.pt deleted file mode 100644 index d8453ac00ba64242e454f9493f2b7f0058728a8c..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-9500/optimizer.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ada548d4424bd56d11910556a417182e99459e8c5770ee2fd7a75b4cf595f8b5 -size 11930938 diff --git a/checkpoints/checkpoint-9500/rng_state_0.pth b/checkpoints/checkpoint-9500/rng_state_0.pth deleted file mode 100644 index a76a2679c5ec6f028f79881ea7b86547e50b925c..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-9500/rng_state_0.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cdd3431862bb3caf99bbe632d5500d35b3a4001ba5f79dfb311597988ce7bc07 -size 15024 diff --git a/checkpoints/checkpoint-9500/rng_state_1.pth b/checkpoints/checkpoint-9500/rng_state_1.pth deleted file mode 100644 index d638e87f5f6cf02dbf4bd50290706a308924bd9f..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-9500/rng_state_1.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a49b137689bb9baeb88d62569411ec42b18c43387181a8e482c1188959e80ba8 -size 15024 diff --git a/checkpoints/checkpoint-9500/rng_state_2.pth b/checkpoints/checkpoint-9500/rng_state_2.pth deleted file mode 100644 index 44200b445027eedae447f8c9abec1fa8bfd8922f..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-9500/rng_state_2.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f64f0a9a34ed397df06f2a99347433818ab578da206dd635ee4d819320c29732 -size 15024 diff --git a/checkpoints/checkpoint-9500/rng_state_3.pth b/checkpoints/checkpoint-9500/rng_state_3.pth deleted file mode 100644 index 2dd77ff9ed7fd897ba735ad0f9af28473a4e29c2..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-9500/rng_state_3.pth +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3f2446089864551b4466c0d305e103a1a414455ac1dd2cffffdee7d5b89d904a -size 15024 diff --git a/checkpoints/checkpoint-9500/scheduler.pt b/checkpoints/checkpoint-9500/scheduler.pt deleted file mode 100644 index 294d12a69b8fec0e3bf520b584bc6bb862f59d5d..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-9500/scheduler.pt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c40a1ffb822b62d125ac636a29ddd80b085b0fed5f9760c5a078bde337791b1a -size 1064 diff --git a/checkpoints/checkpoint-9500/trainer_state.json b/checkpoints/checkpoint-9500/trainer_state.json deleted file mode 100644 index 92150ac85879e6f7310f59e251c296183fd37122..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-9500/trainer_state.json +++ /dev/null @@ -1,6671 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.9651529005384537, - "eval_steps": 500, - "global_step": 9500, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.001015950421619425, - "grad_norm": 17.625, - "learning_rate": 5e-06, - "loss": 3.4264, - "step": 10 - }, - { - "epoch": 0.00203190084323885, - "grad_norm": 12.5625, - "learning_rate": 1e-05, - "loss": 3.432, - "step": 20 - }, - { - "epoch": 0.003047851264858275, - "grad_norm": 14.0625, - "learning_rate": 1.5e-05, - "loss": 3.23, - "step": 30 - }, - { - "epoch": 0.0040638016864777, - "grad_norm": 12.4375, - "learning_rate": 2e-05, - "loss": 2.9762, - "step": 40 - }, - { - "epoch": 0.005079752108097125, - "grad_norm": 10.0625, - "learning_rate": 2.5e-05, - "loss": 2.6173, - "step": 50 - }, - { - "epoch": 0.00609570252971655, - "grad_norm": 10.1875, - "learning_rate": 3e-05, - "loss": 2.2004, - "step": 60 - }, - { - "epoch": 0.007111652951335975, - "grad_norm": 7.03125, - "learning_rate": 3.5e-05, - "loss": 1.4176, - "step": 70 - }, - { - "epoch": 0.0081276033729554, - "grad_norm": 4.375, - "learning_rate": 4e-05, - "loss": 1.0122, - "step": 80 - }, - { - "epoch": 0.009143553794574825, - "grad_norm": 6.5625, - "learning_rate": 4.5e-05, - "loss": 0.9116, - "step": 90 - }, - { - "epoch": 0.01015950421619425, - "grad_norm": 5.28125, - "learning_rate": 5e-05, - "loss": 0.6832, - "step": 100 - }, - { - "epoch": 0.011175454637813675, - "grad_norm": 5.5, - "learning_rate": 4.9999870035728426e-05, - "loss": 0.7355, - "step": 110 - }, - { - "epoch": 0.0121914050594331, - "grad_norm": 5.1875, - "learning_rate": 4.9999480144264944e-05, - "loss": 0.6673, - "step": 120 - }, - { - "epoch": 0.013207355481052525, - "grad_norm": 4.5, - "learning_rate": 4.9998830329663314e-05, - "loss": 0.6792, - "step": 130 - }, - { - "epoch": 0.01422330590267195, - "grad_norm": 3.9375, - "learning_rate": 4.9997920598679756e-05, - "loss": 0.6207, - "step": 140 - }, - { - "epoch": 0.015239256324291375, - "grad_norm": 3.15625, - "learning_rate": 4.999675096077286e-05, - "loss": 0.483, - "step": 150 - }, - { - "epoch": 0.0162552067459108, - "grad_norm": 5.28125, - "learning_rate": 4.999532142810354e-05, - "loss": 0.5319, - "step": 160 - }, - { - "epoch": 0.017271157167530225, - "grad_norm": 4.59375, - "learning_rate": 4.999363201553483e-05, - "loss": 0.6052, - "step": 170 - }, - { - "epoch": 0.01828710758914965, - "grad_norm": 5.03125, - "learning_rate": 4.9991682740631794e-05, - "loss": 0.4258, - "step": 180 - }, - { - "epoch": 0.019303058010769075, - "grad_norm": 3.859375, - "learning_rate": 4.998947362366133e-05, - "loss": 0.4309, - "step": 190 - }, - { - "epoch": 0.0203190084323885, - "grad_norm": 3.328125, - "learning_rate": 4.998700468759193e-05, - "loss": 0.3957, - "step": 200 - }, - { - "epoch": 0.021334958854007924, - "grad_norm": 4.9375, - "learning_rate": 4.9984275958093475e-05, - "loss": 0.4777, - "step": 210 - }, - { - "epoch": 0.02235090927562735, - "grad_norm": 4.78125, - "learning_rate": 4.998128746353695e-05, - "loss": 0.3549, - "step": 220 - }, - { - "epoch": 0.023366859697246774, - "grad_norm": 4.0625, - "learning_rate": 4.997803923499417e-05, - "loss": 0.4447, - "step": 230 - }, - { - "epoch": 0.0243828101188662, - "grad_norm": 6.375, - "learning_rate": 4.99745313062374e-05, - "loss": 0.3808, - "step": 240 - }, - { - "epoch": 0.025398760540485624, - "grad_norm": 3.59375, - "learning_rate": 4.99707637137391e-05, - "loss": 0.3827, - "step": 250 - }, - { - "epoch": 0.02641471096210505, - "grad_norm": 3.015625, - "learning_rate": 4.996673649667145e-05, - "loss": 0.3694, - "step": 260 - }, - { - "epoch": 0.027430661383724474, - "grad_norm": 2.296875, - "learning_rate": 4.9962449696906e-05, - "loss": 0.3586, - "step": 270 - }, - { - "epoch": 0.0284466118053439, - "grad_norm": 4.125, - "learning_rate": 4.9957903359013214e-05, - "loss": 0.3832, - "step": 280 - }, - { - "epoch": 0.029462562226963324, - "grad_norm": 3.296875, - "learning_rate": 4.995309753026201e-05, - "loss": 0.328, - "step": 290 - }, - { - "epoch": 0.03047851264858275, - "grad_norm": 4.5, - "learning_rate": 4.994803226061927e-05, - "loss": 0.3667, - "step": 300 - }, - { - "epoch": 0.03149446307020217, - "grad_norm": 4.3125, - "learning_rate": 4.994270760274933e-05, - "loss": 0.3811, - "step": 310 - }, - { - "epoch": 0.0325104134918216, - "grad_norm": 3.421875, - "learning_rate": 4.99371236120134e-05, - "loss": 0.3065, - "step": 320 - }, - { - "epoch": 0.03352636391344102, - "grad_norm": 4.6875, - "learning_rate": 4.993128034646902e-05, - "loss": 0.4177, - "step": 330 - }, - { - "epoch": 0.03454231433506045, - "grad_norm": 3.046875, - "learning_rate": 4.992517786686947e-05, - "loss": 0.33, - "step": 340 - }, - { - "epoch": 0.03555826475667987, - "grad_norm": 1.8828125, - "learning_rate": 4.9918816236663077e-05, - "loss": 0.3287, - "step": 350 - }, - { - "epoch": 0.0365742151782993, - "grad_norm": 3.8125, - "learning_rate": 4.991219552199262e-05, - "loss": 0.2934, - "step": 360 - }, - { - "epoch": 0.03759016559991872, - "grad_norm": 4.28125, - "learning_rate": 4.99053157916946e-05, - "loss": 0.3176, - "step": 370 - }, - { - "epoch": 0.03860611602153815, - "grad_norm": 2.609375, - "learning_rate": 4.989817711729856e-05, - "loss": 0.3318, - "step": 380 - }, - { - "epoch": 0.03962206644315757, - "grad_norm": 2.375, - "learning_rate": 4.98907795730263e-05, - "loss": 0.3234, - "step": 390 - }, - { - "epoch": 0.040638016864777, - "grad_norm": 4.46875, - "learning_rate": 4.988312323579114e-05, - "loss": 0.267, - "step": 400 - }, - { - "epoch": 0.04165396728639642, - "grad_norm": 3.75, - "learning_rate": 4.98752081851971e-05, - "loss": 0.3081, - "step": 410 - }, - { - "epoch": 0.04266991770801585, - "grad_norm": 2.203125, - "learning_rate": 4.986703450353809e-05, - "loss": 0.2917, - "step": 420 - }, - { - "epoch": 0.04368586812963527, - "grad_norm": 1.6015625, - "learning_rate": 4.985860227579703e-05, - "loss": 0.2805, - "step": 430 - }, - { - "epoch": 0.0447018185512547, - "grad_norm": 3.140625, - "learning_rate": 4.984991158964499e-05, - "loss": 0.3534, - "step": 440 - }, - { - "epoch": 0.04571776897287412, - "grad_norm": 3.296875, - "learning_rate": 4.9840962535440265e-05, - "loss": 0.335, - "step": 450 - }, - { - "epoch": 0.04673371939449355, - "grad_norm": 3.25, - "learning_rate": 4.983175520622744e-05, - "loss": 0.2544, - "step": 460 - }, - { - "epoch": 0.04774966981611297, - "grad_norm": 2.25, - "learning_rate": 4.982228969773642e-05, - "loss": 0.3449, - "step": 470 - }, - { - "epoch": 0.0487656202377324, - "grad_norm": 4.9375, - "learning_rate": 4.9812566108381435e-05, - "loss": 0.2964, - "step": 480 - }, - { - "epoch": 0.04978157065935182, - "grad_norm": 1.5703125, - "learning_rate": 4.9802584539260035e-05, - "loss": 0.2799, - "step": 490 - }, - { - "epoch": 0.05079752108097125, - "grad_norm": 2.828125, - "learning_rate": 4.979234509415199e-05, - "loss": 0.3231, - "step": 500 - }, - { - "epoch": 0.05181347150259067, - "grad_norm": 2.9375, - "learning_rate": 4.978184787951828e-05, - "loss": 0.2943, - "step": 510 - }, - { - "epoch": 0.0528294219242101, - "grad_norm": 2.34375, - "learning_rate": 4.977109300449992e-05, - "loss": 0.2705, - "step": 520 - }, - { - "epoch": 0.05384537234582952, - "grad_norm": 3.140625, - "learning_rate": 4.9760080580916876e-05, - "loss": 0.2998, - "step": 530 - }, - { - "epoch": 0.05486132276744895, - "grad_norm": 3.5625, - "learning_rate": 4.974881072326688e-05, - "loss": 0.2595, - "step": 540 - }, - { - "epoch": 0.05587727318906837, - "grad_norm": 4.25, - "learning_rate": 4.9737283548724236e-05, - "loss": 0.2803, - "step": 550 - }, - { - "epoch": 0.0568932236106878, - "grad_norm": 4.0625, - "learning_rate": 4.97254991771386e-05, - "loss": 0.3511, - "step": 560 - }, - { - "epoch": 0.05790917403230722, - "grad_norm": 2.515625, - "learning_rate": 4.971345773103377e-05, - "loss": 0.312, - "step": 570 - }, - { - "epoch": 0.05892512445392665, - "grad_norm": 3.21875, - "learning_rate": 4.9701159335606365e-05, - "loss": 0.2482, - "step": 580 - }, - { - "epoch": 0.05994107487554607, - "grad_norm": 5.5, - "learning_rate": 4.968860411872454e-05, - "loss": 0.2537, - "step": 590 - }, - { - "epoch": 0.0609570252971655, - "grad_norm": 3.546875, - "learning_rate": 4.967579221092666e-05, - "loss": 0.3125, - "step": 600 - }, - { - "epoch": 0.06197297571878492, - "grad_norm": 2.984375, - "learning_rate": 4.966272374541996e-05, - "loss": 0.2354, - "step": 610 - }, - { - "epoch": 0.06298892614040434, - "grad_norm": 3.6875, - "learning_rate": 4.964939885807912e-05, - "loss": 0.3213, - "step": 620 - }, - { - "epoch": 0.06400487656202378, - "grad_norm": 2.140625, - "learning_rate": 4.9635817687444876e-05, - "loss": 0.3003, - "step": 630 - }, - { - "epoch": 0.0650208269836432, - "grad_norm": 3.484375, - "learning_rate": 4.962198037472259e-05, - "loss": 0.2996, - "step": 640 - }, - { - "epoch": 0.06603677740526262, - "grad_norm": 3.21875, - "learning_rate": 4.9607887063780776e-05, - "loss": 0.2257, - "step": 650 - }, - { - "epoch": 0.06705272782688204, - "grad_norm": 5.375, - "learning_rate": 4.9593537901149564e-05, - "loss": 0.223, - "step": 660 - }, - { - "epoch": 0.06806867824850148, - "grad_norm": 4.1875, - "learning_rate": 4.957893303601924e-05, - "loss": 0.3407, - "step": 670 - }, - { - "epoch": 0.0690846286701209, - "grad_norm": 3.328125, - "learning_rate": 4.956407262023866e-05, - "loss": 0.2589, - "step": 680 - }, - { - "epoch": 0.07010057909174032, - "grad_norm": 2.953125, - "learning_rate": 4.954895680831367e-05, - "loss": 0.2949, - "step": 690 - }, - { - "epoch": 0.07111652951335974, - "grad_norm": 4.0625, - "learning_rate": 4.9533585757405506e-05, - "loss": 0.2995, - "step": 700 - }, - { - "epoch": 0.07213247993497918, - "grad_norm": 4.625, - "learning_rate": 4.951795962732917e-05, - "loss": 0.2894, - "step": 710 - }, - { - "epoch": 0.0731484303565986, - "grad_norm": 3.0, - "learning_rate": 4.9502078580551755e-05, - "loss": 0.3082, - "step": 720 - }, - { - "epoch": 0.07416438077821802, - "grad_norm": 3.65625, - "learning_rate": 4.9485942782190734e-05, - "loss": 0.2308, - "step": 730 - }, - { - "epoch": 0.07518033119983744, - "grad_norm": 4.78125, - "learning_rate": 4.9469552400012306e-05, - "loss": 0.2272, - "step": 740 - }, - { - "epoch": 0.07619628162145688, - "grad_norm": 4.25, - "learning_rate": 4.94529076044296e-05, - "loss": 0.2701, - "step": 750 - }, - { - "epoch": 0.0772122320430763, - "grad_norm": 3.140625, - "learning_rate": 4.94360085685009e-05, - "loss": 0.2686, - "step": 760 - }, - { - "epoch": 0.07822818246469572, - "grad_norm": 0.765625, - "learning_rate": 4.9418855467927894e-05, - "loss": 0.2051, - "step": 770 - }, - { - "epoch": 0.07924413288631514, - "grad_norm": 1.796875, - "learning_rate": 4.940144848105379e-05, - "loss": 0.2267, - "step": 780 - }, - { - "epoch": 0.08026008330793458, - "grad_norm": 4.5625, - "learning_rate": 4.93837877888615e-05, - "loss": 0.2597, - "step": 790 - }, - { - "epoch": 0.081276033729554, - "grad_norm": 3.03125, - "learning_rate": 4.9365873574971745e-05, - "loss": 0.3701, - "step": 800 - }, - { - "epoch": 0.08229198415117342, - "grad_norm": 4.5625, - "learning_rate": 4.9347706025641136e-05, - "loss": 0.2559, - "step": 810 - }, - { - "epoch": 0.08330793457279284, - "grad_norm": 3.90625, - "learning_rate": 4.9329285329760275e-05, - "loss": 0.2799, - "step": 820 - }, - { - "epoch": 0.08432388499441228, - "grad_norm": 3.140625, - "learning_rate": 4.9310611678851735e-05, - "loss": 0.2866, - "step": 830 - }, - { - "epoch": 0.0853398354160317, - "grad_norm": 2.46875, - "learning_rate": 4.929168526706811e-05, - "loss": 0.3105, - "step": 840 - }, - { - "epoch": 0.08635578583765112, - "grad_norm": 13.625, - "learning_rate": 4.927250629119e-05, - "loss": 0.2454, - "step": 850 - }, - { - "epoch": 0.08737173625927054, - "grad_norm": 3.921875, - "learning_rate": 4.9253074950623925e-05, - "loss": 0.2424, - "step": 860 - }, - { - "epoch": 0.08838768668088998, - "grad_norm": 2.90625, - "learning_rate": 4.9233391447400286e-05, - "loss": 0.2481, - "step": 870 - }, - { - "epoch": 0.0894036371025094, - "grad_norm": 2.96875, - "learning_rate": 4.921345598617125e-05, - "loss": 0.2231, - "step": 880 - }, - { - "epoch": 0.09041958752412882, - "grad_norm": 5.375, - "learning_rate": 4.9193268774208654e-05, - "loss": 0.3447, - "step": 890 - }, - { - "epoch": 0.09143553794574824, - "grad_norm": 2.0, - "learning_rate": 4.9172830021401785e-05, - "loss": 0.229, - "step": 900 - }, - { - "epoch": 0.09245148836736768, - "grad_norm": 3.1875, - "learning_rate": 4.9152139940255245e-05, - "loss": 0.2122, - "step": 910 - }, - { - "epoch": 0.0934674387889871, - "grad_norm": 3.40625, - "learning_rate": 4.913119874588677e-05, - "loss": 0.2386, - "step": 920 - }, - { - "epoch": 0.09448338921060652, - "grad_norm": 1.4609375, - "learning_rate": 4.911000665602489e-05, - "loss": 0.1944, - "step": 930 - }, - { - "epoch": 0.09549933963222594, - "grad_norm": 5.0625, - "learning_rate": 4.9088563891006786e-05, - "loss": 0.2038, - "step": 940 - }, - { - "epoch": 0.09651529005384538, - "grad_norm": 4.53125, - "learning_rate": 4.906687067377592e-05, - "loss": 0.3122, - "step": 950 - }, - { - "epoch": 0.0975312404754648, - "grad_norm": 2.84375, - "learning_rate": 4.904492722987976e-05, - "loss": 0.3157, - "step": 960 - }, - { - "epoch": 0.09854719089708422, - "grad_norm": 2.171875, - "learning_rate": 4.902273378746738e-05, - "loss": 0.3077, - "step": 970 - }, - { - "epoch": 0.09956314131870364, - "grad_norm": 2.84375, - "learning_rate": 4.9000290577287165e-05, - "loss": 0.2756, - "step": 980 - }, - { - "epoch": 0.10057909174032308, - "grad_norm": 0.99609375, - "learning_rate": 4.897759783268434e-05, - "loss": 0.2915, - "step": 990 - }, - { - "epoch": 0.1015950421619425, - "grad_norm": 3.53125, - "learning_rate": 4.895465578959859e-05, - "loss": 0.2052, - "step": 1000 - }, - { - "epoch": 0.10261099258356192, - "grad_norm": 4.0, - "learning_rate": 4.893146468656159e-05, - "loss": 0.2499, - "step": 1010 - }, - { - "epoch": 0.10362694300518134, - "grad_norm": 1.65625, - "learning_rate": 4.890802476469452e-05, - "loss": 0.278, - "step": 1020 - }, - { - "epoch": 0.10464289342680078, - "grad_norm": 3.625, - "learning_rate": 4.888433626770558e-05, - "loss": 0.2143, - "step": 1030 - }, - { - "epoch": 0.1056588438484202, - "grad_norm": 5.0625, - "learning_rate": 4.886039944188741e-05, - "loss": 0.2878, - "step": 1040 - }, - { - "epoch": 0.10667479427003962, - "grad_norm": 4.5, - "learning_rate": 4.883621453611461e-05, - "loss": 0.2744, - "step": 1050 - }, - { - "epoch": 0.10769074469165904, - "grad_norm": 4.5625, - "learning_rate": 4.881178180184106e-05, - "loss": 0.2734, - "step": 1060 - }, - { - "epoch": 0.10870669511327848, - "grad_norm": 3.125, - "learning_rate": 4.878710149309735e-05, - "loss": 0.3574, - "step": 1070 - }, - { - "epoch": 0.1097226455348979, - "grad_norm": 3.0625, - "learning_rate": 4.876217386648816e-05, - "loss": 0.2625, - "step": 1080 - }, - { - "epoch": 0.11073859595651732, - "grad_norm": 4.0625, - "learning_rate": 4.873699918118955e-05, - "loss": 0.2437, - "step": 1090 - }, - { - "epoch": 0.11175454637813674, - "grad_norm": 1.59375, - "learning_rate": 4.87115776989463e-05, - "loss": 0.2051, - "step": 1100 - }, - { - "epoch": 0.11277049679975618, - "grad_norm": 4.375, - "learning_rate": 4.8685909684069153e-05, - "loss": 0.1727, - "step": 1110 - }, - { - "epoch": 0.1137864472213756, - "grad_norm": 2.28125, - "learning_rate": 4.865999540343211e-05, - "loss": 0.2256, - "step": 1120 - }, - { - "epoch": 0.11480239764299502, - "grad_norm": 2.265625, - "learning_rate": 4.86338351264696e-05, - "loss": 0.3529, - "step": 1130 - }, - { - "epoch": 0.11581834806461444, - "grad_norm": 2.34375, - "learning_rate": 4.8607429125173754e-05, - "loss": 0.2113, - "step": 1140 - }, - { - "epoch": 0.11683429848623388, - "grad_norm": 0.7578125, - "learning_rate": 4.858077767409149e-05, - "loss": 0.2759, - "step": 1150 - }, - { - "epoch": 0.1178502489078533, - "grad_norm": 3.640625, - "learning_rate": 4.855388105032174e-05, - "loss": 0.2482, - "step": 1160 - }, - { - "epoch": 0.11886619932947272, - "grad_norm": 3.5, - "learning_rate": 4.852673953351249e-05, - "loss": 0.1865, - "step": 1170 - }, - { - "epoch": 0.11988214975109214, - "grad_norm": 3.75, - "learning_rate": 4.849935340585796e-05, - "loss": 0.2659, - "step": 1180 - }, - { - "epoch": 0.12089810017271158, - "grad_norm": 3.375, - "learning_rate": 4.8471722952095586e-05, - "loss": 0.1506, - "step": 1190 - }, - { - "epoch": 0.121914050594331, - "grad_norm": 3.34375, - "learning_rate": 4.844384845950312e-05, - "loss": 0.307, - "step": 1200 - }, - { - "epoch": 0.12293000101595042, - "grad_norm": 1.578125, - "learning_rate": 4.841573021789561e-05, - "loss": 0.1952, - "step": 1210 - }, - { - "epoch": 0.12394595143756984, - "grad_norm": 1.2890625, - "learning_rate": 4.838736851962239e-05, - "loss": 0.1779, - "step": 1220 - }, - { - "epoch": 0.12496190185918928, - "grad_norm": 1.265625, - "learning_rate": 4.835876365956408e-05, - "loss": 0.1235, - "step": 1230 - }, - { - "epoch": 0.12597785228080868, - "grad_norm": 1.9609375, - "learning_rate": 4.8329915935129436e-05, - "loss": 0.1876, - "step": 1240 - }, - { - "epoch": 0.12699380270242813, - "grad_norm": 1.6328125, - "learning_rate": 4.830082564625235e-05, - "loss": 0.2188, - "step": 1250 - }, - { - "epoch": 0.12800975312404755, - "grad_norm": 3.96875, - "learning_rate": 4.8271493095388684e-05, - "loss": 0.2622, - "step": 1260 - }, - { - "epoch": 0.12902570354566698, - "grad_norm": 3.765625, - "learning_rate": 4.824191858751312e-05, - "loss": 0.2724, - "step": 1270 - }, - { - "epoch": 0.1300416539672864, - "grad_norm": 5.59375, - "learning_rate": 4.821210243011601e-05, - "loss": 0.2413, - "step": 1280 - }, - { - "epoch": 0.13105760438890582, - "grad_norm": 3.34375, - "learning_rate": 4.818204493320016e-05, - "loss": 0.2618, - "step": 1290 - }, - { - "epoch": 0.13207355481052524, - "grad_norm": 2.78125, - "learning_rate": 4.8151746409277634e-05, - "loss": 0.2295, - "step": 1300 - }, - { - "epoch": 0.13308950523214466, - "grad_norm": 3.1875, - "learning_rate": 4.8121207173366484e-05, - "loss": 0.2733, - "step": 1310 - }, - { - "epoch": 0.13410545565376408, - "grad_norm": 2.28125, - "learning_rate": 4.809042754298746e-05, - "loss": 0.2311, - "step": 1320 - }, - { - "epoch": 0.13512140607538353, - "grad_norm": 2.171875, - "learning_rate": 4.805940783816075e-05, - "loss": 0.2059, - "step": 1330 - }, - { - "epoch": 0.13613735649700295, - "grad_norm": 2.796875, - "learning_rate": 4.8028148381402625e-05, - "loss": 0.2102, - "step": 1340 - }, - { - "epoch": 0.13715330691862238, - "grad_norm": 2.96875, - "learning_rate": 4.7996649497722084e-05, - "loss": 0.2708, - "step": 1350 - }, - { - "epoch": 0.1381692573402418, - "grad_norm": 2.4375, - "learning_rate": 4.7964911514617485e-05, - "loss": 0.2429, - "step": 1360 - }, - { - "epoch": 0.13918520776186122, - "grad_norm": 5.8125, - "learning_rate": 4.793293476207312e-05, - "loss": 0.2725, - "step": 1370 - }, - { - "epoch": 0.14020115818348064, - "grad_norm": 2.40625, - "learning_rate": 4.790071957255585e-05, - "loss": 0.2098, - "step": 1380 - }, - { - "epoch": 0.14121710860510006, - "grad_norm": 4.25, - "learning_rate": 4.786826628101154e-05, - "loss": 0.2101, - "step": 1390 - }, - { - "epoch": 0.14223305902671948, - "grad_norm": 2.578125, - "learning_rate": 4.783557522486167e-05, - "loss": 0.2624, - "step": 1400 - }, - { - "epoch": 0.14324900944833893, - "grad_norm": 3.125, - "learning_rate": 4.780264674399978e-05, - "loss": 0.2518, - "step": 1410 - }, - { - "epoch": 0.14426495986995835, - "grad_norm": 3.671875, - "learning_rate": 4.7769481180787966e-05, - "loss": 0.3112, - "step": 1420 - }, - { - "epoch": 0.14528091029157778, - "grad_norm": 3.984375, - "learning_rate": 4.773607888005327e-05, - "loss": 0.2747, - "step": 1430 - }, - { - "epoch": 0.1462968607131972, - "grad_norm": 3.234375, - "learning_rate": 4.770244018908416e-05, - "loss": 0.1572, - "step": 1440 - }, - { - "epoch": 0.14731281113481662, - "grad_norm": 4.09375, - "learning_rate": 4.766856545762687e-05, - "loss": 0.2148, - "step": 1450 - }, - { - "epoch": 0.14832876155643604, - "grad_norm": 1.6875, - "learning_rate": 4.763445503788178e-05, - "loss": 0.2531, - "step": 1460 - }, - { - "epoch": 0.14934471197805546, - "grad_norm": 2.375, - "learning_rate": 4.760010928449976e-05, - "loss": 0.199, - "step": 1470 - }, - { - "epoch": 0.15036066239967488, - "grad_norm": 4.6875, - "learning_rate": 4.7565528554578485e-05, - "loss": 0.2366, - "step": 1480 - }, - { - "epoch": 0.15137661282129433, - "grad_norm": 5.4375, - "learning_rate": 4.75307132076587e-05, - "loss": 0.1862, - "step": 1490 - }, - { - "epoch": 0.15239256324291375, - "grad_norm": 2.484375, - "learning_rate": 4.749566360572049e-05, - "loss": 0.2143, - "step": 1500 - }, - { - "epoch": 0.15340851366453317, - "grad_norm": 2.1875, - "learning_rate": 4.746038011317955e-05, - "loss": 0.1877, - "step": 1510 - }, - { - "epoch": 0.1544244640861526, - "grad_norm": 2.84375, - "learning_rate": 4.742486309688333e-05, - "loss": 0.2831, - "step": 1520 - }, - { - "epoch": 0.15544041450777202, - "grad_norm": 2.015625, - "learning_rate": 4.738911292610732e-05, - "loss": 0.1708, - "step": 1530 - }, - { - "epoch": 0.15645636492939144, - "grad_norm": 3.953125, - "learning_rate": 4.735312997255107e-05, - "loss": 0.192, - "step": 1540 - }, - { - "epoch": 0.15747231535101086, - "grad_norm": 2.09375, - "learning_rate": 4.7316914610334475e-05, - "loss": 0.2586, - "step": 1550 - }, - { - "epoch": 0.15848826577263028, - "grad_norm": 3.6875, - "learning_rate": 4.728046721599378e-05, - "loss": 0.2141, - "step": 1560 - }, - { - "epoch": 0.15950421619424973, - "grad_norm": 2.9375, - "learning_rate": 4.724378816847771e-05, - "loss": 0.193, - "step": 1570 - }, - { - "epoch": 0.16052016661586915, - "grad_norm": 1.5625, - "learning_rate": 4.720687784914352e-05, - "loss": 0.191, - "step": 1580 - }, - { - "epoch": 0.16153611703748857, - "grad_norm": 3.75, - "learning_rate": 4.716973664175304e-05, - "loss": 0.2172, - "step": 1590 - }, - { - "epoch": 0.162552067459108, - "grad_norm": 3.125, - "learning_rate": 4.7132364932468645e-05, - "loss": 0.2134, - "step": 1600 - }, - { - "epoch": 0.16356801788072742, - "grad_norm": 4.09375, - "learning_rate": 4.709476310984932e-05, - "loss": 0.2055, - "step": 1610 - }, - { - "epoch": 0.16458396830234684, - "grad_norm": 3.875, - "learning_rate": 4.705693156484652e-05, - "loss": 0.2136, - "step": 1620 - }, - { - "epoch": 0.16559991872396626, - "grad_norm": 1.1796875, - "learning_rate": 4.7018870690800196e-05, - "loss": 0.1471, - "step": 1630 - }, - { - "epoch": 0.16661586914558568, - "grad_norm": 2.5, - "learning_rate": 4.698058088343465e-05, - "loss": 0.2308, - "step": 1640 - }, - { - "epoch": 0.16763181956720513, - "grad_norm": 1.390625, - "learning_rate": 4.6942062540854425e-05, - "loss": 0.2456, - "step": 1650 - }, - { - "epoch": 0.16864776998882455, - "grad_norm": 3.125, - "learning_rate": 4.69033160635402e-05, - "loss": 0.2654, - "step": 1660 - }, - { - "epoch": 0.16966372041044397, - "grad_norm": 3.984375, - "learning_rate": 4.6864341854344587e-05, - "loss": 0.2226, - "step": 1670 - }, - { - "epoch": 0.1706796708320634, - "grad_norm": 2.328125, - "learning_rate": 4.682514031848795e-05, - "loss": 0.2438, - "step": 1680 - }, - { - "epoch": 0.17169562125368282, - "grad_norm": 3.078125, - "learning_rate": 4.678571186355423e-05, - "loss": 0.1889, - "step": 1690 - }, - { - "epoch": 0.17271157167530224, - "grad_norm": 3.328125, - "learning_rate": 4.6746056899486644e-05, - "loss": 0.2117, - "step": 1700 - }, - { - "epoch": 0.17372752209692166, - "grad_norm": 2.78125, - "learning_rate": 4.67061758385835e-05, - "loss": 0.1953, - "step": 1710 - }, - { - "epoch": 0.17474347251854108, - "grad_norm": 3.09375, - "learning_rate": 4.6666069095493816e-05, - "loss": 0.1844, - "step": 1720 - }, - { - "epoch": 0.17575942294016053, - "grad_norm": 3.234375, - "learning_rate": 4.662573708721309e-05, - "loss": 0.2774, - "step": 1730 - }, - { - "epoch": 0.17677537336177995, - "grad_norm": 4.03125, - "learning_rate": 4.658518023307894e-05, - "loss": 0.2527, - "step": 1740 - }, - { - "epoch": 0.17779132378339937, - "grad_norm": 3.21875, - "learning_rate": 4.654439895476671e-05, - "loss": 0.2164, - "step": 1750 - }, - { - "epoch": 0.1788072742050188, - "grad_norm": 2.390625, - "learning_rate": 4.6503393676285146e-05, - "loss": 0.2424, - "step": 1760 - }, - { - "epoch": 0.17982322462663822, - "grad_norm": 1.8359375, - "learning_rate": 4.646216482397192e-05, - "loss": 0.2428, - "step": 1770 - }, - { - "epoch": 0.18083917504825764, - "grad_norm": 2.796875, - "learning_rate": 4.6420712826489275e-05, - "loss": 0.2155, - "step": 1780 - }, - { - "epoch": 0.18185512546987706, - "grad_norm": 0.69921875, - "learning_rate": 4.6379038114819485e-05, - "loss": 0.1544, - "step": 1790 - }, - { - "epoch": 0.18287107589149648, - "grad_norm": 3.40625, - "learning_rate": 4.6337141122260444e-05, - "loss": 0.2029, - "step": 1800 - }, - { - "epoch": 0.18388702631311593, - "grad_norm": 2.359375, - "learning_rate": 4.629502228442112e-05, - "loss": 0.1489, - "step": 1810 - }, - { - "epoch": 0.18490297673473535, - "grad_norm": 1.4453125, - "learning_rate": 4.6252682039217045e-05, - "loss": 0.2101, - "step": 1820 - }, - { - "epoch": 0.18591892715635477, - "grad_norm": 2.71875, - "learning_rate": 4.621012082686573e-05, - "loss": 0.2076, - "step": 1830 - }, - { - "epoch": 0.1869348775779742, - "grad_norm": 3.0625, - "learning_rate": 4.616733908988216e-05, - "loss": 0.2719, - "step": 1840 - }, - { - "epoch": 0.18795082799959362, - "grad_norm": 1.953125, - "learning_rate": 4.612433727307409e-05, - "loss": 0.2105, - "step": 1850 - }, - { - "epoch": 0.18896677842121304, - "grad_norm": 3.46875, - "learning_rate": 4.608111582353751e-05, - "loss": 0.1877, - "step": 1860 - }, - { - "epoch": 0.18998272884283246, - "grad_norm": 2.546875, - "learning_rate": 4.603767519065197e-05, - "loss": 0.2238, - "step": 1870 - }, - { - "epoch": 0.19099867926445188, - "grad_norm": 1.5703125, - "learning_rate": 4.599401582607589e-05, - "loss": 0.243, - "step": 1880 - }, - { - "epoch": 0.19201462968607133, - "grad_norm": 2.4375, - "learning_rate": 4.595013818374185e-05, - "loss": 0.1867, - "step": 1890 - }, - { - "epoch": 0.19303058010769075, - "grad_norm": 2.203125, - "learning_rate": 4.5906042719851925e-05, - "loss": 0.1994, - "step": 1900 - }, - { - "epoch": 0.19404653052931017, - "grad_norm": 3.984375, - "learning_rate": 4.586172989287291e-05, - "loss": 0.1899, - "step": 1910 - }, - { - "epoch": 0.1950624809509296, - "grad_norm": 2.6875, - "learning_rate": 4.5817200163531534e-05, - "loss": 0.2528, - "step": 1920 - }, - { - "epoch": 0.19607843137254902, - "grad_norm": 2.71875, - "learning_rate": 4.577245399480972e-05, - "loss": 0.2336, - "step": 1930 - }, - { - "epoch": 0.19709438179416844, - "grad_norm": 2.640625, - "learning_rate": 4.5727491851939715e-05, - "loss": 0.2204, - "step": 1940 - }, - { - "epoch": 0.19811033221578786, - "grad_norm": 1.78125, - "learning_rate": 4.568231420239929e-05, - "loss": 0.1656, - "step": 1950 - }, - { - "epoch": 0.19912628263740728, - "grad_norm": 3.15625, - "learning_rate": 4.563692151590687e-05, - "loss": 0.2105, - "step": 1960 - }, - { - "epoch": 0.20014223305902673, - "grad_norm": 1.3671875, - "learning_rate": 4.5591314264416666e-05, - "loss": 0.1464, - "step": 1970 - }, - { - "epoch": 0.20115818348064615, - "grad_norm": 4.25, - "learning_rate": 4.554549292211371e-05, - "loss": 0.2103, - "step": 1980 - }, - { - "epoch": 0.20217413390226557, - "grad_norm": 2.984375, - "learning_rate": 4.549945796540901e-05, - "loss": 0.144, - "step": 1990 - }, - { - "epoch": 0.203190084323885, - "grad_norm": 1.859375, - "learning_rate": 4.545320987293453e-05, - "loss": 0.1963, - "step": 2000 - }, - { - "epoch": 0.20420603474550442, - "grad_norm": 1.078125, - "learning_rate": 4.540674912553824e-05, - "loss": 0.2115, - "step": 2010 - }, - { - "epoch": 0.20522198516712384, - "grad_norm": 4.25, - "learning_rate": 4.536007620627911e-05, - "loss": 0.1682, - "step": 2020 - }, - { - "epoch": 0.20623793558874326, - "grad_norm": 2.71875, - "learning_rate": 4.531319160042212e-05, - "loss": 0.1992, - "step": 2030 - }, - { - "epoch": 0.20725388601036268, - "grad_norm": 1.2890625, - "learning_rate": 4.5266095795433126e-05, - "loss": 0.1134, - "step": 2040 - }, - { - "epoch": 0.20826983643198213, - "grad_norm": 3.296875, - "learning_rate": 4.5218789280973925e-05, - "loss": 0.1474, - "step": 2050 - }, - { - "epoch": 0.20928578685360155, - "grad_norm": 1.9375, - "learning_rate": 4.5171272548897024e-05, - "loss": 0.1955, - "step": 2060 - }, - { - "epoch": 0.21030173727522097, - "grad_norm": 2.734375, - "learning_rate": 4.512354609324063e-05, - "loss": 0.2042, - "step": 2070 - }, - { - "epoch": 0.2113176876968404, - "grad_norm": 2.921875, - "learning_rate": 4.507561041022347e-05, - "loss": 0.2174, - "step": 2080 - }, - { - "epoch": 0.21233363811845982, - "grad_norm": 2.40625, - "learning_rate": 4.502746599823963e-05, - "loss": 0.2634, - "step": 2090 - }, - { - "epoch": 0.21334958854007924, - "grad_norm": 1.71875, - "learning_rate": 4.497911335785339e-05, - "loss": 0.1884, - "step": 2100 - }, - { - "epoch": 0.21436553896169866, - "grad_norm": 0.79296875, - "learning_rate": 4.4930552991794e-05, - "loss": 0.1872, - "step": 2110 - }, - { - "epoch": 0.21538148938331808, - "grad_norm": 3.171875, - "learning_rate": 4.4881785404950474e-05, - "loss": 0.2233, - "step": 2120 - }, - { - "epoch": 0.21639743980493753, - "grad_norm": 2.59375, - "learning_rate": 4.483281110436631e-05, - "loss": 0.2374, - "step": 2130 - }, - { - "epoch": 0.21741339022655695, - "grad_norm": 3.328125, - "learning_rate": 4.478363059923426e-05, - "loss": 0.2545, - "step": 2140 - }, - { - "epoch": 0.21842934064817637, - "grad_norm": 2.3125, - "learning_rate": 4.4734244400891014e-05, - "loss": 0.2063, - "step": 2150 - }, - { - "epoch": 0.2194452910697958, - "grad_norm": 3.40625, - "learning_rate": 4.4684653022811865e-05, - "loss": 0.1219, - "step": 2160 - }, - { - "epoch": 0.22046124149141522, - "grad_norm": 4.1875, - "learning_rate": 4.463485698060541e-05, - "loss": 0.2805, - "step": 2170 - }, - { - "epoch": 0.22147719191303464, - "grad_norm": 2.3125, - "learning_rate": 4.458485679200814e-05, - "loss": 0.1998, - "step": 2180 - }, - { - "epoch": 0.22249314233465406, - "grad_norm": 3.578125, - "learning_rate": 4.453465297687912e-05, - "loss": 0.2489, - "step": 2190 - }, - { - "epoch": 0.22350909275627348, - "grad_norm": 2.59375, - "learning_rate": 4.448424605719452e-05, - "loss": 0.2731, - "step": 2200 - }, - { - "epoch": 0.22452504317789293, - "grad_norm": 3.28125, - "learning_rate": 4.443363655704224e-05, - "loss": 0.2425, - "step": 2210 - }, - { - "epoch": 0.22554099359951235, - "grad_norm": 2.78125, - "learning_rate": 4.438282500261641e-05, - "loss": 0.2938, - "step": 2220 - }, - { - "epoch": 0.22655694402113177, - "grad_norm": 1.1953125, - "learning_rate": 4.433181192221197e-05, - "loss": 0.1728, - "step": 2230 - }, - { - "epoch": 0.2275728944427512, - "grad_norm": 1.34375, - "learning_rate": 4.4280597846219155e-05, - "loss": 0.216, - "step": 2240 - }, - { - "epoch": 0.22858884486437062, - "grad_norm": 1.8515625, - "learning_rate": 4.422918330711796e-05, - "loss": 0.1612, - "step": 2250 - }, - { - "epoch": 0.22960479528599004, - "grad_norm": 1.90625, - "learning_rate": 4.417756883947263e-05, - "loss": 0.107, - "step": 2260 - }, - { - "epoch": 0.23062074570760946, - "grad_norm": 3.375, - "learning_rate": 4.412575497992611e-05, - "loss": 0.1756, - "step": 2270 - }, - { - "epoch": 0.23163669612922888, - "grad_norm": 4.375, - "learning_rate": 4.407374226719445e-05, - "loss": 0.234, - "step": 2280 - }, - { - "epoch": 0.23265264655084833, - "grad_norm": 3.25, - "learning_rate": 4.402153124206119e-05, - "loss": 0.2144, - "step": 2290 - }, - { - "epoch": 0.23366859697246775, - "grad_norm": 1.703125, - "learning_rate": 4.396912244737173e-05, - "loss": 0.1696, - "step": 2300 - }, - { - "epoch": 0.23468454739408717, - "grad_norm": 2.84375, - "learning_rate": 4.391651642802778e-05, - "loss": 0.2506, - "step": 2310 - }, - { - "epoch": 0.2357004978157066, - "grad_norm": 4.5, - "learning_rate": 4.386371373098155e-05, - "loss": 0.1686, - "step": 2320 - }, - { - "epoch": 0.23671644823732602, - "grad_norm": 2.515625, - "learning_rate": 4.381071490523018e-05, - "loss": 0.2403, - "step": 2330 - }, - { - "epoch": 0.23773239865894544, - "grad_norm": 4.4375, - "learning_rate": 4.3757520501809955e-05, - "loss": 0.1611, - "step": 2340 - }, - { - "epoch": 0.23874834908056486, - "grad_norm": 1.609375, - "learning_rate": 4.370413107379065e-05, - "loss": 0.1698, - "step": 2350 - }, - { - "epoch": 0.23976429950218428, - "grad_norm": 4.96875, - "learning_rate": 4.36505471762697e-05, - "loss": 0.1928, - "step": 2360 - }, - { - "epoch": 0.24078024992380373, - "grad_norm": 0.8984375, - "learning_rate": 4.3596769366366474e-05, - "loss": 0.2035, - "step": 2370 - }, - { - "epoch": 0.24179620034542315, - "grad_norm": 5.75, - "learning_rate": 4.354279820321649e-05, - "loss": 0.16, - "step": 2380 - }, - { - "epoch": 0.24281215076704257, - "grad_norm": 1.9453125, - "learning_rate": 4.34886342479656e-05, - "loss": 0.1851, - "step": 2390 - }, - { - "epoch": 0.243828101188662, - "grad_norm": 1.015625, - "learning_rate": 4.34342780637641e-05, - "loss": 0.1726, - "step": 2400 - }, - { - "epoch": 0.24484405161028142, - "grad_norm": 4.59375, - "learning_rate": 4.337973021576095e-05, - "loss": 0.2847, - "step": 2410 - }, - { - "epoch": 0.24586000203190084, - "grad_norm": 1.03125, - "learning_rate": 4.3324991271097846e-05, - "loss": 0.2528, - "step": 2420 - }, - { - "epoch": 0.24687595245352026, - "grad_norm": 2.1875, - "learning_rate": 4.3270061798903374e-05, - "loss": 0.1573, - "step": 2430 - }, - { - "epoch": 0.24789190287513968, - "grad_norm": 0.98046875, - "learning_rate": 4.321494237028701e-05, - "loss": 0.1703, - "step": 2440 - }, - { - "epoch": 0.24890785329675913, - "grad_norm": 3.8125, - "learning_rate": 4.31596335583333e-05, - "loss": 0.2613, - "step": 2450 - }, - { - "epoch": 0.24992380371837855, - "grad_norm": 4.0625, - "learning_rate": 4.310413593809579e-05, - "loss": 0.22, - "step": 2460 - }, - { - "epoch": 0.250939754139998, - "grad_norm": 3.15625, - "learning_rate": 4.304845008659108e-05, - "loss": 0.1263, - "step": 2470 - }, - { - "epoch": 0.25195570456161737, - "grad_norm": 3.046875, - "learning_rate": 4.2992576582792895e-05, - "loss": 0.1639, - "step": 2480 - }, - { - "epoch": 0.2529716549832368, - "grad_norm": 9.8125, - "learning_rate": 4.293651600762595e-05, - "loss": 0.2681, - "step": 2490 - }, - { - "epoch": 0.25398760540485626, - "grad_norm": 3.734375, - "learning_rate": 4.288026894395999e-05, - "loss": 0.2292, - "step": 2500 - }, - { - "epoch": 0.25500355582647566, - "grad_norm": 0.455078125, - "learning_rate": 4.2823835976603723e-05, - "loss": 0.2324, - "step": 2510 - }, - { - "epoch": 0.2560195062480951, - "grad_norm": 5.625, - "learning_rate": 4.276721769229869e-05, - "loss": 0.1834, - "step": 2520 - }, - { - "epoch": 0.2570354566697145, - "grad_norm": 1.3671875, - "learning_rate": 4.271041467971323e-05, - "loss": 0.1826, - "step": 2530 - }, - { - "epoch": 0.25805140709133395, - "grad_norm": 5.0625, - "learning_rate": 4.265342752943632e-05, - "loss": 0.2463, - "step": 2540 - }, - { - "epoch": 0.25906735751295334, - "grad_norm": 2.859375, - "learning_rate": 4.2596256833971425e-05, - "loss": 0.2598, - "step": 2550 - }, - { - "epoch": 0.2600833079345728, - "grad_norm": 1.8515625, - "learning_rate": 4.2538903187730374e-05, - "loss": 0.1148, - "step": 2560 - }, - { - "epoch": 0.26109925835619224, - "grad_norm": 2.71875, - "learning_rate": 4.248136718702716e-05, - "loss": 0.2123, - "step": 2570 - }, - { - "epoch": 0.26211520877781164, - "grad_norm": 4.5625, - "learning_rate": 4.242364943007172e-05, - "loss": 0.2369, - "step": 2580 - }, - { - "epoch": 0.2631311591994311, - "grad_norm": 2.296875, - "learning_rate": 4.236575051696377e-05, - "loss": 0.261, - "step": 2590 - }, - { - "epoch": 0.2641471096210505, - "grad_norm": 2.75, - "learning_rate": 4.2307671049686514e-05, - "loss": 0.1564, - "step": 2600 - }, - { - "epoch": 0.26516306004266993, - "grad_norm": 3.5, - "learning_rate": 4.2249411632100396e-05, - "loss": 0.1563, - "step": 2610 - }, - { - "epoch": 0.2661790104642893, - "grad_norm": 2.84375, - "learning_rate": 4.219097286993684e-05, - "loss": 0.1697, - "step": 2620 - }, - { - "epoch": 0.26719496088590877, - "grad_norm": 2.125, - "learning_rate": 4.2132355370791946e-05, - "loss": 0.1844, - "step": 2630 - }, - { - "epoch": 0.26821091130752817, - "grad_norm": 4.03125, - "learning_rate": 4.2073559744120156e-05, - "loss": 0.2144, - "step": 2640 - }, - { - "epoch": 0.2692268617291476, - "grad_norm": 2.375, - "learning_rate": 4.201458660122793e-05, - "loss": 0.2013, - "step": 2650 - }, - { - "epoch": 0.27024281215076706, - "grad_norm": 3.625, - "learning_rate": 4.1955436555267393e-05, - "loss": 0.2166, - "step": 2660 - }, - { - "epoch": 0.27125876257238646, - "grad_norm": 0.328125, - "learning_rate": 4.189611022122997e-05, - "loss": 0.1934, - "step": 2670 - }, - { - "epoch": 0.2722747129940059, - "grad_norm": 2.75, - "learning_rate": 4.1836608215939944e-05, - "loss": 0.2157, - "step": 2680 - }, - { - "epoch": 0.2732906634156253, - "grad_norm": 3.5, - "learning_rate": 4.17769311580481e-05, - "loss": 0.18, - "step": 2690 - }, - { - "epoch": 0.27430661383724475, - "grad_norm": 2.109375, - "learning_rate": 4.171707966802528e-05, - "loss": 0.2178, - "step": 2700 - }, - { - "epoch": 0.27532256425886414, - "grad_norm": 4.65625, - "learning_rate": 4.16570543681559e-05, - "loss": 0.1896, - "step": 2710 - }, - { - "epoch": 0.2763385146804836, - "grad_norm": 4.8125, - "learning_rate": 4.159685588253151e-05, - "loss": 0.1322, - "step": 2720 - }, - { - "epoch": 0.27735446510210304, - "grad_norm": 3.9375, - "learning_rate": 4.153648483704429e-05, - "loss": 0.184, - "step": 2730 - }, - { - "epoch": 0.27837041552372244, - "grad_norm": 4.53125, - "learning_rate": 4.147594185938057e-05, - "loss": 0.2451, - "step": 2740 - }, - { - "epoch": 0.2793863659453419, - "grad_norm": 1.0390625, - "learning_rate": 4.141522757901426e-05, - "loss": 0.2367, - "step": 2750 - }, - { - "epoch": 0.2804023163669613, - "grad_norm": 3.375, - "learning_rate": 4.1354342627200345e-05, - "loss": 0.179, - "step": 2760 - }, - { - "epoch": 0.28141826678858073, - "grad_norm": 2.953125, - "learning_rate": 4.1293287636968286e-05, - "loss": 0.1396, - "step": 2770 - }, - { - "epoch": 0.2824342172102001, - "grad_norm": 2.546875, - "learning_rate": 4.1232063243115485e-05, - "loss": 0.1963, - "step": 2780 - }, - { - "epoch": 0.28345016763181957, - "grad_norm": 5.09375, - "learning_rate": 4.117067008220063e-05, - "loss": 0.2457, - "step": 2790 - }, - { - "epoch": 0.28446611805343897, - "grad_norm": 2.046875, - "learning_rate": 4.110910879253712e-05, - "loss": 0.2262, - "step": 2800 - }, - { - "epoch": 0.2854820684750584, - "grad_norm": 2.1875, - "learning_rate": 4.104738001418641e-05, - "loss": 0.2499, - "step": 2810 - }, - { - "epoch": 0.28649801889667786, - "grad_norm": 2.59375, - "learning_rate": 4.098548438895135e-05, - "loss": 0.1667, - "step": 2820 - }, - { - "epoch": 0.28751396931829726, - "grad_norm": 2.875, - "learning_rate": 4.092342256036954e-05, - "loss": 0.2288, - "step": 2830 - }, - { - "epoch": 0.2885299197399167, - "grad_norm": 3.015625, - "learning_rate": 4.086119517370659e-05, - "loss": 0.2038, - "step": 2840 - }, - { - "epoch": 0.2895458701615361, - "grad_norm": 3.53125, - "learning_rate": 4.0798802875949485e-05, - "loss": 0.181, - "step": 2850 - }, - { - "epoch": 0.29056182058315555, - "grad_norm": 2.296875, - "learning_rate": 4.073624631579975e-05, - "loss": 0.1886, - "step": 2860 - }, - { - "epoch": 0.29157777100477494, - "grad_norm": 3.609375, - "learning_rate": 4.067352614366685e-05, - "loss": 0.2053, - "step": 2870 - }, - { - "epoch": 0.2925937214263944, - "grad_norm": 2.328125, - "learning_rate": 4.061064301166128e-05, - "loss": 0.1409, - "step": 2880 - }, - { - "epoch": 0.29360967184801384, - "grad_norm": 4.9375, - "learning_rate": 4.054759757358787e-05, - "loss": 0.184, - "step": 2890 - }, - { - "epoch": 0.29462562226963324, - "grad_norm": 4.6875, - "learning_rate": 4.048439048493898e-05, - "loss": 0.2306, - "step": 2900 - }, - { - "epoch": 0.2956415726912527, - "grad_norm": 4.09375, - "learning_rate": 4.0421022402887676e-05, - "loss": 0.1914, - "step": 2910 - }, - { - "epoch": 0.2966575231128721, - "grad_norm": 2.3125, - "learning_rate": 4.035749398628088e-05, - "loss": 0.1653, - "step": 2920 - }, - { - "epoch": 0.29767347353449153, - "grad_norm": 2.515625, - "learning_rate": 4.029380589563256e-05, - "loss": 0.1941, - "step": 2930 - }, - { - "epoch": 0.2986894239561109, - "grad_norm": 1.78125, - "learning_rate": 4.02299587931168e-05, - "loss": 0.1117, - "step": 2940 - }, - { - "epoch": 0.29970537437773037, - "grad_norm": 0.8359375, - "learning_rate": 4.0165953342560974e-05, - "loss": 0.1605, - "step": 2950 - }, - { - "epoch": 0.30072132479934977, - "grad_norm": 3.046875, - "learning_rate": 4.010179020943884e-05, - "loss": 0.1726, - "step": 2960 - }, - { - "epoch": 0.3017372752209692, - "grad_norm": 3.453125, - "learning_rate": 4.003747006086357e-05, - "loss": 0.2208, - "step": 2970 - }, - { - "epoch": 0.30275322564258866, - "grad_norm": 2.515625, - "learning_rate": 3.9972993565580866e-05, - "loss": 0.1325, - "step": 2980 - }, - { - "epoch": 0.30376917606420806, - "grad_norm": 3.046875, - "learning_rate": 3.9908361393962e-05, - "loss": 0.2014, - "step": 2990 - }, - { - "epoch": 0.3047851264858275, - "grad_norm": 2.28125, - "learning_rate": 3.984357421799681e-05, - "loss": 0.165, - "step": 3000 - }, - { - "epoch": 0.3058010769074469, - "grad_norm": 5.09375, - "learning_rate": 3.9778632711286756e-05, - "loss": 0.212, - "step": 3010 - }, - { - "epoch": 0.30681702732906635, - "grad_norm": 4.25, - "learning_rate": 3.971353754903788e-05, - "loss": 0.2388, - "step": 3020 - }, - { - "epoch": 0.30783297775068574, - "grad_norm": 2.34375, - "learning_rate": 3.964828940805381e-05, - "loss": 0.2175, - "step": 3030 - }, - { - "epoch": 0.3088489281723052, - "grad_norm": 4.09375, - "learning_rate": 3.95828889667287e-05, - "loss": 0.2088, - "step": 3040 - }, - { - "epoch": 0.30986487859392464, - "grad_norm": 2.359375, - "learning_rate": 3.9517336905040244e-05, - "loss": 0.1913, - "step": 3050 - }, - { - "epoch": 0.31088082901554404, - "grad_norm": 1.1640625, - "learning_rate": 3.9451633904542483e-05, - "loss": 0.2185, - "step": 3060 - }, - { - "epoch": 0.3118967794371635, - "grad_norm": 2.59375, - "learning_rate": 3.9385780648358846e-05, - "loss": 0.2072, - "step": 3070 - }, - { - "epoch": 0.3129127298587829, - "grad_norm": 3.015625, - "learning_rate": 3.9319777821174955e-05, - "loss": 0.1902, - "step": 3080 - }, - { - "epoch": 0.31392868028040233, - "grad_norm": 2.375, - "learning_rate": 3.925362610923158e-05, - "loss": 0.259, - "step": 3090 - }, - { - "epoch": 0.3149446307020217, - "grad_norm": 4.65625, - "learning_rate": 3.918732620031742e-05, - "loss": 0.2026, - "step": 3100 - }, - { - "epoch": 0.31596058112364117, - "grad_norm": 2.1875, - "learning_rate": 3.912087878376205e-05, - "loss": 0.1478, - "step": 3110 - }, - { - "epoch": 0.31697653154526056, - "grad_norm": 2.34375, - "learning_rate": 3.905428455042865e-05, - "loss": 0.167, - "step": 3120 - }, - { - "epoch": 0.31799248196688, - "grad_norm": 2.390625, - "learning_rate": 3.898754419270693e-05, - "loss": 0.1629, - "step": 3130 - }, - { - "epoch": 0.31900843238849946, - "grad_norm": 1.546875, - "learning_rate": 3.892065840450583e-05, - "loss": 0.1308, - "step": 3140 - }, - { - "epoch": 0.32002438281011886, - "grad_norm": 4.625, - "learning_rate": 3.885362788124637e-05, - "loss": 0.2008, - "step": 3150 - }, - { - "epoch": 0.3210403332317383, - "grad_norm": 3.8125, - "learning_rate": 3.8786453319854396e-05, - "loss": 0.2225, - "step": 3160 - }, - { - "epoch": 0.3220562836533577, - "grad_norm": 3.015625, - "learning_rate": 3.8719135418753366e-05, - "loss": 0.2243, - "step": 3170 - }, - { - "epoch": 0.32307223407497715, - "grad_norm": 5.6875, - "learning_rate": 3.865167487785702e-05, - "loss": 0.1981, - "step": 3180 - }, - { - "epoch": 0.32408818449659654, - "grad_norm": 4.84375, - "learning_rate": 3.8584072398562164e-05, - "loss": 0.2031, - "step": 3190 - }, - { - "epoch": 0.325104134918216, - "grad_norm": 4.0625, - "learning_rate": 3.851632868374136e-05, - "loss": 0.1621, - "step": 3200 - }, - { - "epoch": 0.32612008533983544, - "grad_norm": 3.421875, - "learning_rate": 3.844844443773562e-05, - "loss": 0.1674, - "step": 3210 - }, - { - "epoch": 0.32713603576145484, - "grad_norm": 1.3671875, - "learning_rate": 3.8380420366347046e-05, - "loss": 0.1502, - "step": 3220 - }, - { - "epoch": 0.3281519861830743, - "grad_norm": 3.734375, - "learning_rate": 3.831225717683157e-05, - "loss": 0.1868, - "step": 3230 - }, - { - "epoch": 0.3291679366046937, - "grad_norm": 2.703125, - "learning_rate": 3.8243955577891534e-05, - "loss": 0.1818, - "step": 3240 - }, - { - "epoch": 0.3301838870263131, - "grad_norm": 3.796875, - "learning_rate": 3.8175516279668335e-05, - "loss": 0.2215, - "step": 3250 - }, - { - "epoch": 0.3311998374479325, - "grad_norm": 3.203125, - "learning_rate": 3.810693999373505e-05, - "loss": 0.2544, - "step": 3260 - }, - { - "epoch": 0.33221578786955197, - "grad_norm": 4.0, - "learning_rate": 3.8038227433089056e-05, - "loss": 0.1175, - "step": 3270 - }, - { - "epoch": 0.33323173829117136, - "grad_norm": 3.625, - "learning_rate": 3.796937931214458e-05, - "loss": 0.2213, - "step": 3280 - }, - { - "epoch": 0.3342476887127908, - "grad_norm": 1.7265625, - "learning_rate": 3.7900396346725296e-05, - "loss": 0.1711, - "step": 3290 - }, - { - "epoch": 0.33526363913441026, - "grad_norm": 3.140625, - "learning_rate": 3.783127925405686e-05, - "loss": 0.2628, - "step": 3300 - }, - { - "epoch": 0.33627958955602966, - "grad_norm": 2.1875, - "learning_rate": 3.77620287527595e-05, - "loss": 0.1671, - "step": 3310 - }, - { - "epoch": 0.3372955399776491, - "grad_norm": 5.28125, - "learning_rate": 3.769264556284048e-05, - "loss": 0.2109, - "step": 3320 - }, - { - "epoch": 0.3383114903992685, - "grad_norm": 2.875, - "learning_rate": 3.762313040568665e-05, - "loss": 0.1978, - "step": 3330 - }, - { - "epoch": 0.33932744082088795, - "grad_norm": 2.234375, - "learning_rate": 3.755348400405697e-05, - "loss": 0.1275, - "step": 3340 - }, - { - "epoch": 0.34034339124250734, - "grad_norm": 1.9453125, - "learning_rate": 3.7483707082074945e-05, - "loss": 0.1482, - "step": 3350 - }, - { - "epoch": 0.3413593416641268, - "grad_norm": 5.40625, - "learning_rate": 3.741380036522111e-05, - "loss": 0.1933, - "step": 3360 - }, - { - "epoch": 0.34237529208574624, - "grad_norm": 4.53125, - "learning_rate": 3.734376458032551e-05, - "loss": 0.1925, - "step": 3370 - }, - { - "epoch": 0.34339124250736563, - "grad_norm": 4.0625, - "learning_rate": 3.727360045556014e-05, - "loss": 0.2297, - "step": 3380 - }, - { - "epoch": 0.3444071929289851, - "grad_norm": 2.53125, - "learning_rate": 3.7203308720431336e-05, - "loss": 0.1704, - "step": 3390 - }, - { - "epoch": 0.3454231433506045, - "grad_norm": 1.859375, - "learning_rate": 3.7132890105772234e-05, - "loss": 0.258, - "step": 3400 - }, - { - "epoch": 0.3464390937722239, - "grad_norm": 3.90625, - "learning_rate": 3.706234534373515e-05, - "loss": 0.2376, - "step": 3410 - }, - { - "epoch": 0.3474550441938433, - "grad_norm": 1.1015625, - "learning_rate": 3.6991675167783985e-05, - "loss": 0.2403, - "step": 3420 - }, - { - "epoch": 0.34847099461546277, - "grad_norm": 1.1640625, - "learning_rate": 3.6920880312686556e-05, - "loss": 0.1642, - "step": 3430 - }, - { - "epoch": 0.34948694503708216, - "grad_norm": 2.875, - "learning_rate": 3.684996151450702e-05, - "loss": 0.1455, - "step": 3440 - }, - { - "epoch": 0.3505028954587016, - "grad_norm": 0.59765625, - "learning_rate": 3.6778919510598155e-05, - "loss": 0.2175, - "step": 3450 - }, - { - "epoch": 0.35151884588032106, - "grad_norm": 0.93359375, - "learning_rate": 3.670775503959376e-05, - "loss": 0.1858, - "step": 3460 - }, - { - "epoch": 0.35253479630194046, - "grad_norm": 4.1875, - "learning_rate": 3.6636468841400917e-05, - "loss": 0.1911, - "step": 3470 - }, - { - "epoch": 0.3535507467235599, - "grad_norm": 3.734375, - "learning_rate": 3.656506165719233e-05, - "loss": 0.2114, - "step": 3480 - }, - { - "epoch": 0.3545666971451793, - "grad_norm": 1.171875, - "learning_rate": 3.649353422939863e-05, - "loss": 0.1841, - "step": 3490 - }, - { - "epoch": 0.35558264756679875, - "grad_norm": 2.53125, - "learning_rate": 3.6421887301700615e-05, - "loss": 0.1505, - "step": 3500 - }, - { - "epoch": 0.35659859798841814, - "grad_norm": 4.9375, - "learning_rate": 3.6350121619021524e-05, - "loss": 0.2625, - "step": 3510 - }, - { - "epoch": 0.3576145484100376, - "grad_norm": 5.25, - "learning_rate": 3.627823792751936e-05, - "loss": 0.1676, - "step": 3520 - }, - { - "epoch": 0.35863049883165704, - "grad_norm": 1.09375, - "learning_rate": 3.620623697457905e-05, - "loss": 0.1963, - "step": 3530 - }, - { - "epoch": 0.35964644925327643, - "grad_norm": 4.03125, - "learning_rate": 3.613411950880468e-05, - "loss": 0.2048, - "step": 3540 - }, - { - "epoch": 0.3606623996748959, - "grad_norm": 4.40625, - "learning_rate": 3.606188628001178e-05, - "loss": 0.226, - "step": 3550 - }, - { - "epoch": 0.3616783500965153, - "grad_norm": 2.375, - "learning_rate": 3.598953803921947e-05, - "loss": 0.1884, - "step": 3560 - }, - { - "epoch": 0.3626943005181347, - "grad_norm": 3.21875, - "learning_rate": 3.591707553864266e-05, - "loss": 0.224, - "step": 3570 - }, - { - "epoch": 0.3637102509397541, - "grad_norm": 3.5625, - "learning_rate": 3.584449953168423e-05, - "loss": 0.1866, - "step": 3580 - }, - { - "epoch": 0.36472620136137357, - "grad_norm": 2.359375, - "learning_rate": 3.577181077292722e-05, - "loss": 0.1663, - "step": 3590 - }, - { - "epoch": 0.36574215178299296, - "grad_norm": 5.0, - "learning_rate": 3.569901001812696e-05, - "loss": 0.2032, - "step": 3600 - }, - { - "epoch": 0.3667581022046124, - "grad_norm": 1.953125, - "learning_rate": 3.562609802420321e-05, - "loss": 0.2395, - "step": 3610 - }, - { - "epoch": 0.36777405262623186, - "grad_norm": 3.796875, - "learning_rate": 3.555307554923229e-05, - "loss": 0.1799, - "step": 3620 - }, - { - "epoch": 0.36879000304785126, - "grad_norm": 4.4375, - "learning_rate": 3.547994335243925e-05, - "loss": 0.1771, - "step": 3630 - }, - { - "epoch": 0.3698059534694707, - "grad_norm": 1.890625, - "learning_rate": 3.540670219418989e-05, - "loss": 0.2123, - "step": 3640 - }, - { - "epoch": 0.3708219038910901, - "grad_norm": 4.03125, - "learning_rate": 3.53333528359829e-05, - "loss": 0.2159, - "step": 3650 - }, - { - "epoch": 0.37183785431270955, - "grad_norm": 3.265625, - "learning_rate": 3.525989604044198e-05, - "loss": 0.2749, - "step": 3660 - }, - { - "epoch": 0.37285380473432894, - "grad_norm": 1.4375, - "learning_rate": 3.5186332571307826e-05, - "loss": 0.1613, - "step": 3670 - }, - { - "epoch": 0.3738697551559484, - "grad_norm": 3.984375, - "learning_rate": 3.511266319343025e-05, - "loss": 0.1877, - "step": 3680 - }, - { - "epoch": 0.37488570557756784, - "grad_norm": 2.203125, - "learning_rate": 3.503888867276022e-05, - "loss": 0.2185, - "step": 3690 - }, - { - "epoch": 0.37590165599918723, - "grad_norm": 1.5078125, - "learning_rate": 3.4965009776341894e-05, - "loss": 0.2195, - "step": 3700 - }, - { - "epoch": 0.3769176064208067, - "grad_norm": 4.375, - "learning_rate": 3.489102727230461e-05, - "loss": 0.2344, - "step": 3710 - }, - { - "epoch": 0.3779335568424261, - "grad_norm": 2.984375, - "learning_rate": 3.481694192985496e-05, - "loss": 0.1863, - "step": 3720 - }, - { - "epoch": 0.3789495072640455, - "grad_norm": 1.1328125, - "learning_rate": 3.474275451926875e-05, - "loss": 0.1894, - "step": 3730 - }, - { - "epoch": 0.3799654576856649, - "grad_norm": 2.265625, - "learning_rate": 3.4668465811883e-05, - "loss": 0.2127, - "step": 3740 - }, - { - "epoch": 0.38098140810728437, - "grad_norm": 2.921875, - "learning_rate": 3.4594076580087914e-05, - "loss": 0.2125, - "step": 3750 - }, - { - "epoch": 0.38199735852890376, - "grad_norm": 2.390625, - "learning_rate": 3.451958759731889e-05, - "loss": 0.1801, - "step": 3760 - }, - { - "epoch": 0.3830133089505232, - "grad_norm": 3.046875, - "learning_rate": 3.4444999638048456e-05, - "loss": 0.1949, - "step": 3770 - }, - { - "epoch": 0.38402925937214266, - "grad_norm": 2.890625, - "learning_rate": 3.437031347777817e-05, - "loss": 0.2719, - "step": 3780 - }, - { - "epoch": 0.38504520979376206, - "grad_norm": 3.9375, - "learning_rate": 3.4295529893030634e-05, - "loss": 0.1697, - "step": 3790 - }, - { - "epoch": 0.3860611602153815, - "grad_norm": 2.0625, - "learning_rate": 3.422064966134138e-05, - "loss": 0.1557, - "step": 3800 - }, - { - "epoch": 0.3870771106370009, - "grad_norm": 2.234375, - "learning_rate": 3.4145673561250794e-05, - "loss": 0.2129, - "step": 3810 - }, - { - "epoch": 0.38809306105862035, - "grad_norm": 4.96875, - "learning_rate": 3.4070602372296e-05, - "loss": 0.2068, - "step": 3820 - }, - { - "epoch": 0.38910901148023974, - "grad_norm": 2.234375, - "learning_rate": 3.39954368750028e-05, - "loss": 0.1634, - "step": 3830 - }, - { - "epoch": 0.3901249619018592, - "grad_norm": 1.75, - "learning_rate": 3.392017785087752e-05, - "loss": 0.2299, - "step": 3840 - }, - { - "epoch": 0.39114091232347864, - "grad_norm": 3.90625, - "learning_rate": 3.38448260823989e-05, - "loss": 0.1585, - "step": 3850 - }, - { - "epoch": 0.39215686274509803, - "grad_norm": 2.8125, - "learning_rate": 3.376938235300996e-05, - "loss": 0.2382, - "step": 3860 - }, - { - "epoch": 0.3931728131667175, - "grad_norm": 5.375, - "learning_rate": 3.369384744710984e-05, - "loss": 0.1987, - "step": 3870 - }, - { - "epoch": 0.3941887635883369, - "grad_norm": 2.578125, - "learning_rate": 3.361822215004566e-05, - "loss": 0.2316, - "step": 3880 - }, - { - "epoch": 0.3952047140099563, - "grad_norm": 2.0, - "learning_rate": 3.354250724810436e-05, - "loss": 0.2019, - "step": 3890 - }, - { - "epoch": 0.3962206644315757, - "grad_norm": 2.3125, - "learning_rate": 3.34667035285045e-05, - "loss": 0.187, - "step": 3900 - }, - { - "epoch": 0.39723661485319517, - "grad_norm": 3.53125, - "learning_rate": 3.339081177938811e-05, - "loss": 0.2353, - "step": 3910 - }, - { - "epoch": 0.39825256527481456, - "grad_norm": 1.9609375, - "learning_rate": 3.331483278981244e-05, - "loss": 0.2078, - "step": 3920 - }, - { - "epoch": 0.399268515696434, - "grad_norm": 1.2109375, - "learning_rate": 3.323876734974183e-05, - "loss": 0.1761, - "step": 3930 - }, - { - "epoch": 0.40028446611805346, - "grad_norm": 4.0625, - "learning_rate": 3.316261625003943e-05, - "loss": 0.2081, - "step": 3940 - }, - { - "epoch": 0.40130041653967286, - "grad_norm": 1.953125, - "learning_rate": 3.308638028245902e-05, - "loss": 0.2087, - "step": 3950 - }, - { - "epoch": 0.4023163669612923, - "grad_norm": 2.390625, - "learning_rate": 3.301006023963676e-05, - "loss": 0.1579, - "step": 3960 - }, - { - "epoch": 0.4033323173829117, - "grad_norm": 3.53125, - "learning_rate": 3.293365691508295e-05, - "loss": 0.1904, - "step": 3970 - }, - { - "epoch": 0.40434826780453115, - "grad_norm": 3.0, - "learning_rate": 3.285717110317379e-05, - "loss": 0.1991, - "step": 3980 - }, - { - "epoch": 0.40536421822615054, - "grad_norm": 7.21875, - "learning_rate": 3.27806035991431e-05, - "loss": 0.1445, - "step": 3990 - }, - { - "epoch": 0.40638016864777, - "grad_norm": 1.0859375, - "learning_rate": 3.2703955199074075e-05, - "loss": 0.2393, - "step": 4000 - }, - { - "epoch": 0.40739611906938944, - "grad_norm": 4.5625, - "learning_rate": 3.262722669989098e-05, - "loss": 0.1789, - "step": 4010 - }, - { - "epoch": 0.40841206949100883, - "grad_norm": 3.09375, - "learning_rate": 3.255041889935092e-05, - "loss": 0.1511, - "step": 4020 - }, - { - "epoch": 0.4094280199126283, - "grad_norm": 1.90625, - "learning_rate": 3.247353259603547e-05, - "loss": 0.2066, - "step": 4030 - }, - { - "epoch": 0.4104439703342477, - "grad_norm": 2.28125, - "learning_rate": 3.239656858934242e-05, - "loss": 0.1564, - "step": 4040 - }, - { - "epoch": 0.4114599207558671, - "grad_norm": 2.609375, - "learning_rate": 3.231952767947746e-05, - "loss": 0.1503, - "step": 4050 - }, - { - "epoch": 0.4124758711774865, - "grad_norm": 1.4453125, - "learning_rate": 3.2242410667445844e-05, - "loss": 0.1633, - "step": 4060 - }, - { - "epoch": 0.41349182159910597, - "grad_norm": 3.015625, - "learning_rate": 3.2165218355044076e-05, - "loss": 0.1492, - "step": 4070 - }, - { - "epoch": 0.41450777202072536, - "grad_norm": 3.234375, - "learning_rate": 3.2087951544851566e-05, - "loss": 0.3051, - "step": 4080 - }, - { - "epoch": 0.4155237224423448, - "grad_norm": 2.9375, - "learning_rate": 3.20106110402223e-05, - "loss": 0.2229, - "step": 4090 - }, - { - "epoch": 0.41653967286396426, - "grad_norm": 3.171875, - "learning_rate": 3.1933197645276455e-05, - "loss": 0.2224, - "step": 4100 - }, - { - "epoch": 0.41755562328558365, - "grad_norm": 2.09375, - "learning_rate": 3.185571216489209e-05, - "loss": 0.1297, - "step": 4110 - }, - { - "epoch": 0.4185715737072031, - "grad_norm": 3.625, - "learning_rate": 3.177815540469669e-05, - "loss": 0.2074, - "step": 4120 - }, - { - "epoch": 0.4195875241288225, - "grad_norm": 2.296875, - "learning_rate": 3.1700528171058916e-05, - "loss": 0.1949, - "step": 4130 - }, - { - "epoch": 0.42060347455044195, - "grad_norm": 3.8125, - "learning_rate": 3.162283127108011e-05, - "loss": 0.1661, - "step": 4140 - }, - { - "epoch": 0.42161942497206134, - "grad_norm": 2.5, - "learning_rate": 3.154506551258594e-05, - "loss": 0.2275, - "step": 4150 - }, - { - "epoch": 0.4226353753936808, - "grad_norm": 2.96875, - "learning_rate": 3.146723170411804e-05, - "loss": 0.2242, - "step": 4160 - }, - { - "epoch": 0.42365132581530024, - "grad_norm": 6.625, - "learning_rate": 3.138933065492552e-05, - "loss": 0.1897, - "step": 4170 - }, - { - "epoch": 0.42466727623691963, - "grad_norm": 0.8515625, - "learning_rate": 3.131136317495665e-05, - "loss": 0.1629, - "step": 4180 - }, - { - "epoch": 0.4256832266585391, - "grad_norm": 0.94140625, - "learning_rate": 3.1233330074850364e-05, - "loss": 0.1535, - "step": 4190 - }, - { - "epoch": 0.4266991770801585, - "grad_norm": 2.6875, - "learning_rate": 3.115523216592786e-05, - "loss": 0.2494, - "step": 4200 - }, - { - "epoch": 0.4277151275017779, - "grad_norm": 2.578125, - "learning_rate": 3.107707026018417e-05, - "loss": 0.1705, - "step": 4210 - }, - { - "epoch": 0.4287310779233973, - "grad_norm": 3.0625, - "learning_rate": 3.09988451702797e-05, - "loss": 0.1507, - "step": 4220 - }, - { - "epoch": 0.42974702834501677, - "grad_norm": 2.421875, - "learning_rate": 3.0920557709531804e-05, - "loss": 0.3071, - "step": 4230 - }, - { - "epoch": 0.43076297876663616, - "grad_norm": 3.640625, - "learning_rate": 3.0842208691906306e-05, - "loss": 0.199, - "step": 4240 - }, - { - "epoch": 0.4317789291882556, - "grad_norm": 3.5625, - "learning_rate": 3.076379893200904e-05, - "loss": 0.1987, - "step": 4250 - }, - { - "epoch": 0.43279487960987506, - "grad_norm": 3.65625, - "learning_rate": 3.068532924507739e-05, - "loss": 0.1945, - "step": 4260 - }, - { - "epoch": 0.43381083003149445, - "grad_norm": 5.875, - "learning_rate": 3.060680044697183e-05, - "loss": 0.1937, - "step": 4270 - }, - { - "epoch": 0.4348267804531139, - "grad_norm": 2.859375, - "learning_rate": 3.052821335416739e-05, - "loss": 0.1643, - "step": 4280 - }, - { - "epoch": 0.4358427308747333, - "grad_norm": 3.296875, - "learning_rate": 3.0449568783745203e-05, - "loss": 0.1455, - "step": 4290 - }, - { - "epoch": 0.43685868129635275, - "grad_norm": 0.427734375, - "learning_rate": 3.0370867553384023e-05, - "loss": 0.1891, - "step": 4300 - }, - { - "epoch": 0.43787463171797214, - "grad_norm": 0.361328125, - "learning_rate": 3.029211048135171e-05, - "loss": 0.1377, - "step": 4310 - }, - { - "epoch": 0.4388905821395916, - "grad_norm": 1.8203125, - "learning_rate": 3.021329838649668e-05, - "loss": 0.2194, - "step": 4320 - }, - { - "epoch": 0.43990653256121104, - "grad_norm": 1.8828125, - "learning_rate": 3.0134432088239462e-05, - "loss": 0.1915, - "step": 4330 - }, - { - "epoch": 0.44092248298283043, - "grad_norm": 2.015625, - "learning_rate": 3.0055512406564146e-05, - "loss": 0.1794, - "step": 4340 - }, - { - "epoch": 0.4419384334044499, - "grad_norm": 2.546875, - "learning_rate": 2.9976540162009836e-05, - "loss": 0.2154, - "step": 4350 - }, - { - "epoch": 0.4429543838260693, - "grad_norm": 4.09375, - "learning_rate": 2.9897516175662155e-05, - "loss": 0.1861, - "step": 4360 - }, - { - "epoch": 0.4439703342476887, - "grad_norm": 3.953125, - "learning_rate": 2.9818441269144693e-05, - "loss": 0.1857, - "step": 4370 - }, - { - "epoch": 0.4449862846693081, - "grad_norm": 2.234375, - "learning_rate": 2.9739316264610452e-05, - "loss": 0.1493, - "step": 4380 - }, - { - "epoch": 0.44600223509092757, - "grad_norm": 1.109375, - "learning_rate": 2.966014198473332e-05, - "loss": 0.186, - "step": 4390 - }, - { - "epoch": 0.44701818551254696, - "grad_norm": 4.5625, - "learning_rate": 2.9580919252699502e-05, - "loss": 0.1963, - "step": 4400 - }, - { - "epoch": 0.4480341359341664, - "grad_norm": 7.3125, - "learning_rate": 2.9501648892198984e-05, - "loss": 0.2882, - "step": 4410 - }, - { - "epoch": 0.44905008635578586, - "grad_norm": 3.03125, - "learning_rate": 2.942233172741693e-05, - "loss": 0.2154, - "step": 4420 - }, - { - "epoch": 0.45006603677740525, - "grad_norm": 2.421875, - "learning_rate": 2.934296858302515e-05, - "loss": 0.2228, - "step": 4430 - }, - { - "epoch": 0.4510819871990247, - "grad_norm": 1.6015625, - "learning_rate": 2.9263560284173485e-05, - "loss": 0.1637, - "step": 4440 - }, - { - "epoch": 0.4520979376206441, - "grad_norm": 4.5, - "learning_rate": 2.91841076564813e-05, - "loss": 0.1396, - "step": 4450 - }, - { - "epoch": 0.45311388804226355, - "grad_norm": 1.9609375, - "learning_rate": 2.9104611526028808e-05, - "loss": 0.186, - "step": 4460 - }, - { - "epoch": 0.45412983846388294, - "grad_norm": 2.046875, - "learning_rate": 2.902507271934855e-05, - "loss": 0.1706, - "step": 4470 - }, - { - "epoch": 0.4551457888855024, - "grad_norm": 2.390625, - "learning_rate": 2.8945492063416768e-05, - "loss": 0.2191, - "step": 4480 - }, - { - "epoch": 0.45616173930712184, - "grad_norm": 2.734375, - "learning_rate": 2.8865870385644823e-05, - "loss": 0.1651, - "step": 4490 - }, - { - "epoch": 0.45717768972874123, - "grad_norm": 4.4375, - "learning_rate": 2.8786208513870583e-05, - "loss": 0.1907, - "step": 4500 - }, - { - "epoch": 0.4581936401503607, - "grad_norm": 1.9609375, - "learning_rate": 2.8706507276349815e-05, - "loss": 0.2256, - "step": 4510 - }, - { - "epoch": 0.4592095905719801, - "grad_norm": 3.375, - "learning_rate": 2.8626767501747588e-05, - "loss": 0.215, - "step": 4520 - }, - { - "epoch": 0.4602255409935995, - "grad_norm": 2.296875, - "learning_rate": 2.854699001912964e-05, - "loss": 0.2241, - "step": 4530 - }, - { - "epoch": 0.4612414914152189, - "grad_norm": 2.078125, - "learning_rate": 2.846717565795376e-05, - "loss": 0.1541, - "step": 4540 - }, - { - "epoch": 0.46225744183683837, - "grad_norm": 0.81640625, - "learning_rate": 2.8387325248061164e-05, - "loss": 0.1718, - "step": 4550 - }, - { - "epoch": 0.46327339225845776, - "grad_norm": 5.6875, - "learning_rate": 2.8307439619667897e-05, - "loss": 0.259, - "step": 4560 - }, - { - "epoch": 0.4642893426800772, - "grad_norm": 1.78125, - "learning_rate": 2.8227519603356157e-05, - "loss": 0.2205, - "step": 4570 - }, - { - "epoch": 0.46530529310169666, - "grad_norm": 4.78125, - "learning_rate": 2.8147566030065677e-05, - "loss": 0.2256, - "step": 4580 - }, - { - "epoch": 0.46632124352331605, - "grad_norm": 3.296875, - "learning_rate": 2.8067579731085085e-05, - "loss": 0.1671, - "step": 4590 - }, - { - "epoch": 0.4673371939449355, - "grad_norm": 3.265625, - "learning_rate": 2.7987561538043273e-05, - "loss": 0.2471, - "step": 4600 - }, - { - "epoch": 0.4683531443665549, - "grad_norm": 3.390625, - "learning_rate": 2.7907512282900727e-05, - "loss": 0.1749, - "step": 4610 - }, - { - "epoch": 0.46936909478817435, - "grad_norm": 3.140625, - "learning_rate": 2.782743279794091e-05, - "loss": 0.2276, - "step": 4620 - }, - { - "epoch": 0.47038504520979374, - "grad_norm": 2.921875, - "learning_rate": 2.7747323915761574e-05, - "loss": 0.1971, - "step": 4630 - }, - { - "epoch": 0.4714009956314132, - "grad_norm": 4.15625, - "learning_rate": 2.7667186469266122e-05, - "loss": 0.1951, - "step": 4640 - }, - { - "epoch": 0.47241694605303264, - "grad_norm": 2.953125, - "learning_rate": 2.7587021291654924e-05, - "loss": 0.2045, - "step": 4650 - }, - { - "epoch": 0.47343289647465203, - "grad_norm": 1.6640625, - "learning_rate": 2.750682921641672e-05, - "loss": 0.155, - "step": 4660 - }, - { - "epoch": 0.4744488468962715, - "grad_norm": 4.375, - "learning_rate": 2.7426611077319864e-05, - "loss": 0.2038, - "step": 4670 - }, - { - "epoch": 0.4754647973178909, - "grad_norm": 5.5, - "learning_rate": 2.734636770840372e-05, - "loss": 0.159, - "step": 4680 - }, - { - "epoch": 0.4764807477395103, - "grad_norm": 1.703125, - "learning_rate": 2.7266099943969976e-05, - "loss": 0.1566, - "step": 4690 - }, - { - "epoch": 0.4774966981611297, - "grad_norm": 0.81640625, - "learning_rate": 2.7185808618573943e-05, - "loss": 0.1927, - "step": 4700 - }, - { - "epoch": 0.47851264858274917, - "grad_norm": 0.81640625, - "learning_rate": 2.710549456701592e-05, - "loss": 0.1873, - "step": 4710 - }, - { - "epoch": 0.47952859900436856, - "grad_norm": 3.828125, - "learning_rate": 2.702515862433247e-05, - "loss": 0.2474, - "step": 4720 - }, - { - "epoch": 0.480544549425988, - "grad_norm": 1.1640625, - "learning_rate": 2.6944801625787795e-05, - "loss": 0.204, - "step": 4730 - }, - { - "epoch": 0.48156049984760746, - "grad_norm": 2.953125, - "learning_rate": 2.6864424406864984e-05, - "loss": 0.1758, - "step": 4740 - }, - { - "epoch": 0.48257645026922685, - "grad_norm": 3.265625, - "learning_rate": 2.6784027803257377e-05, - "loss": 0.161, - "step": 4750 - }, - { - "epoch": 0.4835924006908463, - "grad_norm": 2.046875, - "learning_rate": 2.6703612650859848e-05, - "loss": 0.1469, - "step": 4760 - }, - { - "epoch": 0.4846083511124657, - "grad_norm": 4.03125, - "learning_rate": 2.6623179785760148e-05, - "loss": 0.1858, - "step": 4770 - }, - { - "epoch": 0.48562430153408515, - "grad_norm": 2.65625, - "learning_rate": 2.6542730044230175e-05, - "loss": 0.176, - "step": 4780 - }, - { - "epoch": 0.48664025195570454, - "grad_norm": 2.59375, - "learning_rate": 2.6462264262717278e-05, - "loss": 0.1657, - "step": 4790 - }, - { - "epoch": 0.487656202377324, - "grad_norm": 4.78125, - "learning_rate": 2.6381783277835605e-05, - "loss": 0.2705, - "step": 4800 - }, - { - "epoch": 0.48867215279894344, - "grad_norm": 3.65625, - "learning_rate": 2.6301287926357355e-05, - "loss": 0.2252, - "step": 4810 - }, - { - "epoch": 0.48968810322056283, - "grad_norm": 0.734375, - "learning_rate": 2.622077904520411e-05, - "loss": 0.2141, - "step": 4820 - }, - { - "epoch": 0.4907040536421823, - "grad_norm": 5.15625, - "learning_rate": 2.6140257471438108e-05, - "loss": 0.1935, - "step": 4830 - }, - { - "epoch": 0.4917200040638017, - "grad_norm": 3.625, - "learning_rate": 2.6059724042253574e-05, - "loss": 0.2121, - "step": 4840 - }, - { - "epoch": 0.4927359544854211, - "grad_norm": 1.2890625, - "learning_rate": 2.5979179594967983e-05, - "loss": 0.1221, - "step": 4850 - }, - { - "epoch": 0.4937519049070405, - "grad_norm": 3.4375, - "learning_rate": 2.5898624967013367e-05, - "loss": 0.2208, - "step": 4860 - }, - { - "epoch": 0.49476785532865997, - "grad_norm": 2.40625, - "learning_rate": 2.5818060995927607e-05, - "loss": 0.1904, - "step": 4870 - }, - { - "epoch": 0.49578380575027936, - "grad_norm": 2.921875, - "learning_rate": 2.573748851934574e-05, - "loss": 0.1658, - "step": 4880 - }, - { - "epoch": 0.4967997561718988, - "grad_norm": 1.6640625, - "learning_rate": 2.5656908374991213e-05, - "loss": 0.1626, - "step": 4890 - }, - { - "epoch": 0.49781570659351826, - "grad_norm": 1.8046875, - "learning_rate": 2.557632140066721e-05, - "loss": 0.1905, - "step": 4900 - }, - { - "epoch": 0.49883165701513765, - "grad_norm": 4.875, - "learning_rate": 2.5495728434247917e-05, - "loss": 0.2591, - "step": 4910 - }, - { - "epoch": 0.4998476074367571, - "grad_norm": 1.4453125, - "learning_rate": 2.5415130313669845e-05, - "loss": 0.1359, - "step": 4920 - }, - { - "epoch": 0.5008635578583766, - "grad_norm": 2.109375, - "learning_rate": 2.5334527876923063e-05, - "loss": 0.2353, - "step": 4930 - }, - { - "epoch": 0.501879508279996, - "grad_norm": 3.546875, - "learning_rate": 2.5253921962042525e-05, - "loss": 0.2173, - "step": 4940 - }, - { - "epoch": 0.5028954587016153, - "grad_norm": 1.8125, - "learning_rate": 2.5173313407099373e-05, - "loss": 0.1631, - "step": 4950 - }, - { - "epoch": 0.5039114091232347, - "grad_norm": 2.671875, - "learning_rate": 2.5092703050192163e-05, - "loss": 0.1884, - "step": 4960 - }, - { - "epoch": 0.5049273595448542, - "grad_norm": 2.5625, - "learning_rate": 2.501209172943819e-05, - "loss": 0.217, - "step": 4970 - }, - { - "epoch": 0.5059433099664736, - "grad_norm": 4.375, - "learning_rate": 2.49314802829648e-05, - "loss": 0.1854, - "step": 4980 - }, - { - "epoch": 0.506959260388093, - "grad_norm": 2.3125, - "learning_rate": 2.4850869548900628e-05, - "loss": 0.2049, - "step": 4990 - }, - { - "epoch": 0.5079752108097125, - "grad_norm": 3.859375, - "learning_rate": 2.477026036536688e-05, - "loss": 0.2093, - "step": 5000 - }, - { - "epoch": 0.5089911612313319, - "grad_norm": 1.09375, - "learning_rate": 2.4689653570468677e-05, - "loss": 0.164, - "step": 5010 - }, - { - "epoch": 0.5100071116529513, - "grad_norm": 3.40625, - "learning_rate": 2.460905000228628e-05, - "loss": 0.1649, - "step": 5020 - }, - { - "epoch": 0.5110230620745707, - "grad_norm": 3.546875, - "learning_rate": 2.4528450498866428e-05, - "loss": 0.1777, - "step": 5030 - }, - { - "epoch": 0.5120390124961902, - "grad_norm": 3.0, - "learning_rate": 2.444785589821356e-05, - "loss": 0.1505, - "step": 5040 - }, - { - "epoch": 0.5130549629178096, - "grad_norm": 1.6484375, - "learning_rate": 2.436726703828118e-05, - "loss": 0.2672, - "step": 5050 - }, - { - "epoch": 0.514070913339429, - "grad_norm": 4.34375, - "learning_rate": 2.428668475696308e-05, - "loss": 0.1756, - "step": 5060 - }, - { - "epoch": 0.5150868637610485, - "grad_norm": 2.78125, - "learning_rate": 2.420610989208465e-05, - "loss": 0.1655, - "step": 5070 - }, - { - "epoch": 0.5161028141826679, - "grad_norm": 1.4609375, - "learning_rate": 2.412554328139419e-05, - "loss": 0.1579, - "step": 5080 - }, - { - "epoch": 0.5171187646042873, - "grad_norm": 2.28125, - "learning_rate": 2.404498576255416e-05, - "loss": 0.1599, - "step": 5090 - }, - { - "epoch": 0.5181347150259067, - "grad_norm": 0.6484375, - "learning_rate": 2.3964438173132522e-05, - "loss": 0.1508, - "step": 5100 - }, - { - "epoch": 0.5191506654475262, - "grad_norm": 3.390625, - "learning_rate": 2.388390135059395e-05, - "loss": 0.1578, - "step": 5110 - }, - { - "epoch": 0.5201666158691456, - "grad_norm": 1.21875, - "learning_rate": 2.3803376132291226e-05, - "loss": 0.1374, - "step": 5120 - }, - { - "epoch": 0.521182566290765, - "grad_norm": 4.0625, - "learning_rate": 2.3722863355456436e-05, - "loss": 0.1854, - "step": 5130 - }, - { - "epoch": 0.5221985167123845, - "grad_norm": 4.71875, - "learning_rate": 2.364236385719236e-05, - "loss": 0.1391, - "step": 5140 - }, - { - "epoch": 0.5232144671340039, - "grad_norm": 3.296875, - "learning_rate": 2.356187847446366e-05, - "loss": 0.2106, - "step": 5150 - }, - { - "epoch": 0.5242304175556233, - "grad_norm": 3.296875, - "learning_rate": 2.348140804408829e-05, - "loss": 0.2383, - "step": 5160 - }, - { - "epoch": 0.5252463679772427, - "grad_norm": 3.359375, - "learning_rate": 2.3400953402728713e-05, - "loss": 0.1537, - "step": 5170 - }, - { - "epoch": 0.5262623183988622, - "grad_norm": 1.4921875, - "learning_rate": 2.332051538688322e-05, - "loss": 0.1841, - "step": 5180 - }, - { - "epoch": 0.5272782688204816, - "grad_norm": 3.25, - "learning_rate": 2.3240094832877287e-05, - "loss": 0.1855, - "step": 5190 - }, - { - "epoch": 0.528294219242101, - "grad_norm": 3.34375, - "learning_rate": 2.3159692576854793e-05, - "loss": 0.2625, - "step": 5200 - }, - { - "epoch": 0.5293101696637205, - "grad_norm": 3.6875, - "learning_rate": 2.3079309454769413e-05, - "loss": 0.1292, - "step": 5210 - }, - { - "epoch": 0.5303261200853399, - "grad_norm": 1.1171875, - "learning_rate": 2.2998946302375827e-05, - "loss": 0.1263, - "step": 5220 - }, - { - "epoch": 0.5313420705069593, - "grad_norm": 2.71875, - "learning_rate": 2.2918603955221148e-05, - "loss": 0.2296, - "step": 5230 - }, - { - "epoch": 0.5323580209285786, - "grad_norm": 2.015625, - "learning_rate": 2.283828324863613e-05, - "loss": 0.1231, - "step": 5240 - }, - { - "epoch": 0.5333739713501982, - "grad_norm": 3.671875, - "learning_rate": 2.2757985017726557e-05, - "loss": 0.1939, - "step": 5250 - }, - { - "epoch": 0.5343899217718175, - "grad_norm": 1.9765625, - "learning_rate": 2.2677710097364495e-05, - "loss": 0.168, - "step": 5260 - }, - { - "epoch": 0.5354058721934369, - "grad_norm": 2.609375, - "learning_rate": 2.259745932217969e-05, - "loss": 0.1883, - "step": 5270 - }, - { - "epoch": 0.5364218226150563, - "grad_norm": 2.8125, - "learning_rate": 2.2517233526550817e-05, - "loss": 0.1898, - "step": 5280 - }, - { - "epoch": 0.5374377730366758, - "grad_norm": 3.125, - "learning_rate": 2.2437033544596837e-05, - "loss": 0.1838, - "step": 5290 - }, - { - "epoch": 0.5384537234582952, - "grad_norm": 4.90625, - "learning_rate": 2.2356860210168336e-05, - "loss": 0.1553, - "step": 5300 - }, - { - "epoch": 0.5394696738799146, - "grad_norm": 3.171875, - "learning_rate": 2.2276714356838824e-05, - "loss": 0.2248, - "step": 5310 - }, - { - "epoch": 0.5404856243015341, - "grad_norm": 1.34375, - "learning_rate": 2.2196596817896118e-05, - "loss": 0.1421, - "step": 5320 - }, - { - "epoch": 0.5415015747231535, - "grad_norm": 3.28125, - "learning_rate": 2.2116508426333596e-05, - "loss": 0.1947, - "step": 5330 - }, - { - "epoch": 0.5425175251447729, - "grad_norm": 1.9296875, - "learning_rate": 2.2036450014841652e-05, - "loss": 0.2207, - "step": 5340 - }, - { - "epoch": 0.5435334755663923, - "grad_norm": 0.5703125, - "learning_rate": 2.19564224157989e-05, - "loss": 0.2208, - "step": 5350 - }, - { - "epoch": 0.5445494259880118, - "grad_norm": 7.5625, - "learning_rate": 2.1876426461263654e-05, - "loss": 0.1739, - "step": 5360 - }, - { - "epoch": 0.5455653764096312, - "grad_norm": 2.15625, - "learning_rate": 2.179646298296519e-05, - "loss": 0.1938, - "step": 5370 - }, - { - "epoch": 0.5465813268312506, - "grad_norm": 4.1875, - "learning_rate": 2.171653281229511e-05, - "loss": 0.1736, - "step": 5380 - }, - { - "epoch": 0.5475972772528701, - "grad_norm": 4.65625, - "learning_rate": 2.1636636780298732e-05, - "loss": 0.2167, - "step": 5390 - }, - { - "epoch": 0.5486132276744895, - "grad_norm": 1.84375, - "learning_rate": 2.1556775717666427e-05, - "loss": 0.1711, - "step": 5400 - }, - { - "epoch": 0.5496291780961089, - "grad_norm": 5.125, - "learning_rate": 2.147695045472499e-05, - "loss": 0.1789, - "step": 5410 - }, - { - "epoch": 0.5506451285177283, - "grad_norm": 3.859375, - "learning_rate": 2.1397161821428973e-05, - "loss": 0.2187, - "step": 5420 - }, - { - "epoch": 0.5516610789393478, - "grad_norm": 2.25, - "learning_rate": 2.131741064735212e-05, - "loss": 0.1367, - "step": 5430 - }, - { - "epoch": 0.5526770293609672, - "grad_norm": 4.65625, - "learning_rate": 2.1237697761678684e-05, - "loss": 0.1574, - "step": 5440 - }, - { - "epoch": 0.5536929797825866, - "grad_norm": 1.2265625, - "learning_rate": 2.1158023993194848e-05, - "loss": 0.1301, - "step": 5450 - }, - { - "epoch": 0.5547089302042061, - "grad_norm": 4.21875, - "learning_rate": 2.107839017028005e-05, - "loss": 0.2782, - "step": 5460 - }, - { - "epoch": 0.5557248806258255, - "grad_norm": 0.52734375, - "learning_rate": 2.0998797120898457e-05, - "loss": 0.2024, - "step": 5470 - }, - { - "epoch": 0.5567408310474449, - "grad_norm": 1.46875, - "learning_rate": 2.0919245672590277e-05, - "loss": 0.1755, - "step": 5480 - }, - { - "epoch": 0.5577567814690643, - "grad_norm": 2.140625, - "learning_rate": 2.083973665246318e-05, - "loss": 0.2058, - "step": 5490 - }, - { - "epoch": 0.5587727318906838, - "grad_norm": 1.5390625, - "learning_rate": 2.076027088718373e-05, - "loss": 0.2159, - "step": 5500 - }, - { - "epoch": 0.5597886823123032, - "grad_norm": 1.9921875, - "learning_rate": 2.0680849202968743e-05, - "loss": 0.2139, - "step": 5510 - }, - { - "epoch": 0.5608046327339226, - "grad_norm": 2.4375, - "learning_rate": 2.060147242557674e-05, - "loss": 0.183, - "step": 5520 - }, - { - "epoch": 0.5618205831555421, - "grad_norm": 5.5, - "learning_rate": 2.0522141380299308e-05, - "loss": 0.1673, - "step": 5530 - }, - { - "epoch": 0.5628365335771615, - "grad_norm": 4.25, - "learning_rate": 2.044285689195258e-05, - "loss": 0.1674, - "step": 5540 - }, - { - "epoch": 0.5638524839987809, - "grad_norm": 2.109375, - "learning_rate": 2.0363619784868604e-05, - "loss": 0.1531, - "step": 5550 - }, - { - "epoch": 0.5648684344204002, - "grad_norm": 2.59375, - "learning_rate": 2.0284430882886836e-05, - "loss": 0.1665, - "step": 5560 - }, - { - "epoch": 0.5658843848420197, - "grad_norm": 3.984375, - "learning_rate": 2.020529100934549e-05, - "loss": 0.1717, - "step": 5570 - }, - { - "epoch": 0.5669003352636391, - "grad_norm": 1.6015625, - "learning_rate": 2.012620098707306e-05, - "loss": 0.1167, - "step": 5580 - }, - { - "epoch": 0.5679162856852585, - "grad_norm": 6.0625, - "learning_rate": 2.004716163837972e-05, - "loss": 0.2084, - "step": 5590 - }, - { - "epoch": 0.5689322361068779, - "grad_norm": 2.5625, - "learning_rate": 1.996817378504876e-05, - "loss": 0.1939, - "step": 5600 - }, - { - "epoch": 0.5699481865284974, - "grad_norm": 3.109375, - "learning_rate": 1.9889238248328108e-05, - "loss": 0.1241, - "step": 5610 - }, - { - "epoch": 0.5709641369501168, - "grad_norm": 4.875, - "learning_rate": 1.981035584892171e-05, - "loss": 0.1865, - "step": 5620 - }, - { - "epoch": 0.5719800873717362, - "grad_norm": 2.984375, - "learning_rate": 1.9731527406981072e-05, - "loss": 0.1639, - "step": 5630 - }, - { - "epoch": 0.5729960377933557, - "grad_norm": 4.4375, - "learning_rate": 1.9652753742096655e-05, - "loss": 0.2019, - "step": 5640 - }, - { - "epoch": 0.5740119882149751, - "grad_norm": 4.3125, - "learning_rate": 1.9574035673289432e-05, - "loss": 0.1829, - "step": 5650 - }, - { - "epoch": 0.5750279386365945, - "grad_norm": 3.203125, - "learning_rate": 1.9495374019002312e-05, - "loss": 0.2267, - "step": 5660 - }, - { - "epoch": 0.5760438890582139, - "grad_norm": 1.765625, - "learning_rate": 1.9416769597091673e-05, - "loss": 0.1411, - "step": 5670 - }, - { - "epoch": 0.5770598394798334, - "grad_norm": 2.640625, - "learning_rate": 1.9338223224818818e-05, - "loss": 0.1476, - "step": 5680 - }, - { - "epoch": 0.5780757899014528, - "grad_norm": 4.84375, - "learning_rate": 1.9259735718841524e-05, - "loss": 0.1417, - "step": 5690 - }, - { - "epoch": 0.5790917403230722, - "grad_norm": 2.421875, - "learning_rate": 1.918130789520551e-05, - "loss": 0.1592, - "step": 5700 - }, - { - "epoch": 0.5801076907446917, - "grad_norm": 2.984375, - "learning_rate": 1.9102940569335963e-05, - "loss": 0.161, - "step": 5710 - }, - { - "epoch": 0.5811236411663111, - "grad_norm": 1.0234375, - "learning_rate": 1.9024634556029093e-05, - "loss": 0.1614, - "step": 5720 - }, - { - "epoch": 0.5821395915879305, - "grad_norm": 2.90625, - "learning_rate": 1.89463906694436e-05, - "loss": 0.1505, - "step": 5730 - }, - { - "epoch": 0.5831555420095499, - "grad_norm": 2.875, - "learning_rate": 1.8868209723092286e-05, - "loss": 0.1674, - "step": 5740 - }, - { - "epoch": 0.5841714924311694, - "grad_norm": 0.408203125, - "learning_rate": 1.8790092529833508e-05, - "loss": 0.1468, - "step": 5750 - }, - { - "epoch": 0.5851874428527888, - "grad_norm": 5.1875, - "learning_rate": 1.871203990186281e-05, - "loss": 0.1903, - "step": 5760 - }, - { - "epoch": 0.5862033932744082, - "grad_norm": 0.5546875, - "learning_rate": 1.8634052650704415e-05, - "loss": 0.2644, - "step": 5770 - }, - { - "epoch": 0.5872193436960277, - "grad_norm": 3.203125, - "learning_rate": 1.8556131587202848e-05, - "loss": 0.1968, - "step": 5780 - }, - { - "epoch": 0.5882352941176471, - "grad_norm": 2.484375, - "learning_rate": 1.8478277521514424e-05, - "loss": 0.2249, - "step": 5790 - }, - { - "epoch": 0.5892512445392665, - "grad_norm": 4.0, - "learning_rate": 1.8400491263098906e-05, - "loss": 0.1881, - "step": 5800 - }, - { - "epoch": 0.5902671949608859, - "grad_norm": 1.90625, - "learning_rate": 1.832277362071106e-05, - "loss": 0.1352, - "step": 5810 - }, - { - "epoch": 0.5912831453825054, - "grad_norm": 2.765625, - "learning_rate": 1.824512540239221e-05, - "loss": 0.2737, - "step": 5820 - }, - { - "epoch": 0.5922990958041248, - "grad_norm": 2.609375, - "learning_rate": 1.81675474154619e-05, - "loss": 0.1566, - "step": 5830 - }, - { - "epoch": 0.5933150462257442, - "grad_norm": 2.6875, - "learning_rate": 1.8090040466509444e-05, - "loss": 0.1999, - "step": 5840 - }, - { - "epoch": 0.5943309966473637, - "grad_norm": 2.609375, - "learning_rate": 1.8012605361385592e-05, - "loss": 0.2372, - "step": 5850 - }, - { - "epoch": 0.5953469470689831, - "grad_norm": 8.125, - "learning_rate": 1.7935242905194087e-05, - "loss": 0.2411, - "step": 5860 - }, - { - "epoch": 0.5963628974906025, - "grad_norm": 3.46875, - "learning_rate": 1.785795390228336e-05, - "loss": 0.138, - "step": 5870 - }, - { - "epoch": 0.5973788479122218, - "grad_norm": 2.3125, - "learning_rate": 1.7780739156238125e-05, - "loss": 0.1867, - "step": 5880 - }, - { - "epoch": 0.5983947983338413, - "grad_norm": 4.0625, - "learning_rate": 1.770359946987105e-05, - "loss": 0.2091, - "step": 5890 - }, - { - "epoch": 0.5994107487554607, - "grad_norm": 5.21875, - "learning_rate": 1.7626535645214378e-05, - "loss": 0.2091, - "step": 5900 - }, - { - "epoch": 0.6004266991770801, - "grad_norm": 3.15625, - "learning_rate": 1.7549548483511614e-05, - "loss": 0.1927, - "step": 5910 - }, - { - "epoch": 0.6014426495986995, - "grad_norm": 4.71875, - "learning_rate": 1.7472638785209198e-05, - "loss": 0.1893, - "step": 5920 - }, - { - "epoch": 0.602458600020319, - "grad_norm": 3.015625, - "learning_rate": 1.7395807349948145e-05, - "loss": 0.1557, - "step": 5930 - }, - { - "epoch": 0.6034745504419384, - "grad_norm": 2.9375, - "learning_rate": 1.73190549765558e-05, - "loss": 0.1717, - "step": 5940 - }, - { - "epoch": 0.6044905008635578, - "grad_norm": 3.109375, - "learning_rate": 1.724238246303745e-05, - "loss": 0.1879, - "step": 5950 - }, - { - "epoch": 0.6055064512851773, - "grad_norm": 3.875, - "learning_rate": 1.71657906065681e-05, - "loss": 0.1908, - "step": 5960 - }, - { - "epoch": 0.6065224017067967, - "grad_norm": 5.09375, - "learning_rate": 1.7089280203484115e-05, - "loss": 0.1712, - "step": 5970 - }, - { - "epoch": 0.6075383521284161, - "grad_norm": 3.015625, - "learning_rate": 1.701285204927502e-05, - "loss": 0.1454, - "step": 5980 - }, - { - "epoch": 0.6085543025500355, - "grad_norm": 3.265625, - "learning_rate": 1.693650693857515e-05, - "loss": 0.2283, - "step": 5990 - }, - { - "epoch": 0.609570252971655, - "grad_norm": 3.40625, - "learning_rate": 1.6860245665155466e-05, - "loss": 0.2188, - "step": 6000 - }, - { - "epoch": 0.6105862033932744, - "grad_norm": 2.5625, - "learning_rate": 1.678406902191521e-05, - "loss": 0.1605, - "step": 6010 - }, - { - "epoch": 0.6116021538148938, - "grad_norm": 0.6796875, - "learning_rate": 1.670797780087374e-05, - "loss": 0.1472, - "step": 6020 - }, - { - "epoch": 0.6126181042365133, - "grad_norm": 2.234375, - "learning_rate": 1.6631972793162288e-05, - "loss": 0.1676, - "step": 6030 - }, - { - "epoch": 0.6136340546581327, - "grad_norm": 1.25, - "learning_rate": 1.6556054789015662e-05, - "loss": 0.1508, - "step": 6040 - }, - { - "epoch": 0.6146500050797521, - "grad_norm": 4.78125, - "learning_rate": 1.6480224577764132e-05, - "loss": 0.1981, - "step": 6050 - }, - { - "epoch": 0.6156659555013715, - "grad_norm": 3.46875, - "learning_rate": 1.6404482947825137e-05, - "loss": 0.2514, - "step": 6060 - }, - { - "epoch": 0.616681905922991, - "grad_norm": 1.265625, - "learning_rate": 1.6328830686695154e-05, - "loss": 0.2397, - "step": 6070 - }, - { - "epoch": 0.6176978563446104, - "grad_norm": 1.953125, - "learning_rate": 1.625326858094144e-05, - "loss": 0.1523, - "step": 6080 - }, - { - "epoch": 0.6187138067662298, - "grad_norm": 3.484375, - "learning_rate": 1.6177797416193953e-05, - "loss": 0.218, - "step": 6090 - }, - { - "epoch": 0.6197297571878493, - "grad_norm": 3.484375, - "learning_rate": 1.6102417977137052e-05, - "loss": 0.1476, - "step": 6100 - }, - { - "epoch": 0.6207457076094687, - "grad_norm": 4.90625, - "learning_rate": 1.602713104750147e-05, - "loss": 0.1818, - "step": 6110 - }, - { - "epoch": 0.6217616580310881, - "grad_norm": 4.375, - "learning_rate": 1.5951937410056087e-05, - "loss": 0.2061, - "step": 6120 - }, - { - "epoch": 0.6227776084527075, - "grad_norm": 6.3125, - "learning_rate": 1.587683784659979e-05, - "loss": 0.1566, - "step": 6130 - }, - { - "epoch": 0.623793558874327, - "grad_norm": 2.828125, - "learning_rate": 1.58018331379534e-05, - "loss": 0.1376, - "step": 6140 - }, - { - "epoch": 0.6248095092959464, - "grad_norm": 2.40625, - "learning_rate": 1.572692406395149e-05, - "loss": 0.1655, - "step": 6150 - }, - { - "epoch": 0.6258254597175658, - "grad_norm": 4.34375, - "learning_rate": 1.5652111403434338e-05, - "loss": 0.2363, - "step": 6160 - }, - { - "epoch": 0.6268414101391853, - "grad_norm": 2.453125, - "learning_rate": 1.5577395934239757e-05, - "loss": 0.2464, - "step": 6170 - }, - { - "epoch": 0.6278573605608047, - "grad_norm": 2.53125, - "learning_rate": 1.5502778433195085e-05, - "loss": 0.1898, - "step": 6180 - }, - { - "epoch": 0.628873310982424, - "grad_norm": 2.28125, - "learning_rate": 1.5428259676109048e-05, - "loss": 0.1804, - "step": 6190 - }, - { - "epoch": 0.6298892614040434, - "grad_norm": 4.3125, - "learning_rate": 1.5353840437763732e-05, - "loss": 0.1409, - "step": 6200 - }, - { - "epoch": 0.630905211825663, - "grad_norm": 2.5625, - "learning_rate": 1.5279521491906496e-05, - "loss": 0.2449, - "step": 6210 - }, - { - "epoch": 0.6319211622472823, - "grad_norm": 3.0625, - "learning_rate": 1.520530361124195e-05, - "loss": 0.2103, - "step": 6220 - }, - { - "epoch": 0.6329371126689017, - "grad_norm": 2.609375, - "learning_rate": 1.5131187567423937e-05, - "loss": 0.2156, - "step": 6230 - }, - { - "epoch": 0.6339530630905211, - "grad_norm": 2.703125, - "learning_rate": 1.5057174131047446e-05, - "loss": 0.161, - "step": 6240 - }, - { - "epoch": 0.6349690135121406, - "grad_norm": 3.265625, - "learning_rate": 1.4983264071640679e-05, - "loss": 0.1757, - "step": 6250 - }, - { - "epoch": 0.63598496393376, - "grad_norm": 3.15625, - "learning_rate": 1.490945815765699e-05, - "loss": 0.2011, - "step": 6260 - }, - { - "epoch": 0.6370009143553794, - "grad_norm": 5.375, - "learning_rate": 1.4835757156466945e-05, - "loss": 0.1658, - "step": 6270 - }, - { - "epoch": 0.6380168647769989, - "grad_norm": 2.984375, - "learning_rate": 1.4762161834350271e-05, - "loss": 0.1754, - "step": 6280 - }, - { - "epoch": 0.6390328151986183, - "grad_norm": 2.015625, - "learning_rate": 1.4688672956487987e-05, - "loss": 0.1427, - "step": 6290 - }, - { - "epoch": 0.6400487656202377, - "grad_norm": 3.78125, - "learning_rate": 1.4615291286954352e-05, - "loss": 0.1517, - "step": 6300 - }, - { - "epoch": 0.6410647160418571, - "grad_norm": 2.859375, - "learning_rate": 1.4542017588709005e-05, - "loss": 0.2348, - "step": 6310 - }, - { - "epoch": 0.6420806664634766, - "grad_norm": 2.421875, - "learning_rate": 1.4468852623588961e-05, - "loss": 0.2089, - "step": 6320 - }, - { - "epoch": 0.643096616885096, - "grad_norm": 2.15625, - "learning_rate": 1.4395797152300719e-05, - "loss": 0.1702, - "step": 6330 - }, - { - "epoch": 0.6441125673067154, - "grad_norm": 1.53125, - "learning_rate": 1.4322851934412382e-05, - "loss": 0.1017, - "step": 6340 - }, - { - "epoch": 0.6451285177283349, - "grad_norm": 1.90625, - "learning_rate": 1.4250017728345716e-05, - "loss": 0.1813, - "step": 6350 - }, - { - "epoch": 0.6461444681499543, - "grad_norm": 2.015625, - "learning_rate": 1.4177295291368292e-05, - "loss": 0.1095, - "step": 6360 - }, - { - "epoch": 0.6471604185715737, - "grad_norm": 2.625, - "learning_rate": 1.410468537958558e-05, - "loss": 0.2259, - "step": 6370 - }, - { - "epoch": 0.6481763689931931, - "grad_norm": 3.5, - "learning_rate": 1.4032188747933136e-05, - "loss": 0.1595, - "step": 6380 - }, - { - "epoch": 0.6491923194148126, - "grad_norm": 5.21875, - "learning_rate": 1.39598061501687e-05, - "loss": 0.2226, - "step": 6390 - }, - { - "epoch": 0.650208269836432, - "grad_norm": 5.34375, - "learning_rate": 1.388753833886442e-05, - "loss": 0.2132, - "step": 6400 - }, - { - "epoch": 0.6512242202580514, - "grad_norm": 3.640625, - "learning_rate": 1.3815386065398945e-05, - "loss": 0.1227, - "step": 6410 - }, - { - "epoch": 0.6522401706796709, - "grad_norm": 1.0, - "learning_rate": 1.3743350079949705e-05, - "loss": 0.1755, - "step": 6420 - }, - { - "epoch": 0.6532561211012903, - "grad_norm": 2.359375, - "learning_rate": 1.3671431131485057e-05, - "loss": 0.1552, - "step": 6430 - }, - { - "epoch": 0.6542720715229097, - "grad_norm": 5.3125, - "learning_rate": 1.3599629967756483e-05, - "loss": 0.1917, - "step": 6440 - }, - { - "epoch": 0.6552880219445291, - "grad_norm": 4.625, - "learning_rate": 1.3527947335290877e-05, - "loss": 0.1812, - "step": 6450 - }, - { - "epoch": 0.6563039723661486, - "grad_norm": 1.234375, - "learning_rate": 1.3456383979382708e-05, - "loss": 0.1896, - "step": 6460 - }, - { - "epoch": 0.657319922787768, - "grad_norm": 3.984375, - "learning_rate": 1.3384940644086352e-05, - "loss": 0.1484, - "step": 6470 - }, - { - "epoch": 0.6583358732093874, - "grad_norm": 2.40625, - "learning_rate": 1.3313618072208268e-05, - "loss": 0.1334, - "step": 6480 - }, - { - "epoch": 0.6593518236310069, - "grad_norm": 4.375, - "learning_rate": 1.3242417005299357e-05, - "loss": 0.1351, - "step": 6490 - }, - { - "epoch": 0.6603677740526263, - "grad_norm": 2.640625, - "learning_rate": 1.31713381836472e-05, - "loss": 0.1717, - "step": 6500 - }, - { - "epoch": 0.6613837244742456, - "grad_norm": 2.640625, - "learning_rate": 1.3100382346268392e-05, - "loss": 0.1867, - "step": 6510 - }, - { - "epoch": 0.662399674895865, - "grad_norm": 1.734375, - "learning_rate": 1.3029550230900812e-05, - "loss": 0.1997, - "step": 6520 - }, - { - "epoch": 0.6634156253174845, - "grad_norm": 3.609375, - "learning_rate": 1.2958842573996016e-05, - "loss": 0.1969, - "step": 6530 - }, - { - "epoch": 0.6644315757391039, - "grad_norm": 3.578125, - "learning_rate": 1.2888260110711525e-05, - "loss": 0.1469, - "step": 6540 - }, - { - "epoch": 0.6654475261607233, - "grad_norm": 1.3515625, - "learning_rate": 1.2817803574903212e-05, - "loss": 0.1524, - "step": 6550 - }, - { - "epoch": 0.6664634765823427, - "grad_norm": 2.109375, - "learning_rate": 1.2747473699117668e-05, - "loss": 0.159, - "step": 6560 - }, - { - "epoch": 0.6674794270039622, - "grad_norm": 1.53125, - "learning_rate": 1.267727121458458e-05, - "loss": 0.1999, - "step": 6570 - }, - { - "epoch": 0.6684953774255816, - "grad_norm": 1.7265625, - "learning_rate": 1.2607196851209137e-05, - "loss": 0.2216, - "step": 6580 - }, - { - "epoch": 0.669511327847201, - "grad_norm": 3.125, - "learning_rate": 1.2537251337564412e-05, - "loss": 0.1607, - "step": 6590 - }, - { - "epoch": 0.6705272782688205, - "grad_norm": 2.421875, - "learning_rate": 1.2467435400883839e-05, - "loss": 0.2187, - "step": 6600 - }, - { - "epoch": 0.6715432286904399, - "grad_norm": 1.5078125, - "learning_rate": 1.239774976705359e-05, - "loss": 0.1753, - "step": 6610 - }, - { - "epoch": 0.6725591791120593, - "grad_norm": 1.140625, - "learning_rate": 1.2328195160605092e-05, - "loss": 0.194, - "step": 6620 - }, - { - "epoch": 0.6735751295336787, - "grad_norm": 4.9375, - "learning_rate": 1.225877230470743e-05, - "loss": 0.1485, - "step": 6630 - }, - { - "epoch": 0.6745910799552982, - "grad_norm": 3.65625, - "learning_rate": 1.218948192115988e-05, - "loss": 0.1847, - "step": 6640 - }, - { - "epoch": 0.6756070303769176, - "grad_norm": 3.875, - "learning_rate": 1.21203247303844e-05, - "loss": 0.1874, - "step": 6650 - }, - { - "epoch": 0.676622980798537, - "grad_norm": 2.65625, - "learning_rate": 1.2051301451418073e-05, - "loss": 0.2377, - "step": 6660 - }, - { - "epoch": 0.6776389312201565, - "grad_norm": 2.09375, - "learning_rate": 1.198241280190574e-05, - "loss": 0.1508, - "step": 6670 - }, - { - "epoch": 0.6786548816417759, - "grad_norm": 2.203125, - "learning_rate": 1.1913659498092431e-05, - "loss": 0.1537, - "step": 6680 - }, - { - "epoch": 0.6796708320633953, - "grad_norm": 2.484375, - "learning_rate": 1.184504225481601e-05, - "loss": 0.2339, - "step": 6690 - }, - { - "epoch": 0.6806867824850147, - "grad_norm": 5.625, - "learning_rate": 1.177656178549966e-05, - "loss": 0.2102, - "step": 6700 - }, - { - "epoch": 0.6817027329066342, - "grad_norm": 2.5, - "learning_rate": 1.1708218802144536e-05, - "loss": 0.1435, - "step": 6710 - }, - { - "epoch": 0.6827186833282536, - "grad_norm": 3.84375, - "learning_rate": 1.1640014015322323e-05, - "loss": 0.1823, - "step": 6720 - }, - { - "epoch": 0.683734633749873, - "grad_norm": 2.359375, - "learning_rate": 1.1571948134167862e-05, - "loss": 0.1154, - "step": 6730 - }, - { - "epoch": 0.6847505841714925, - "grad_norm": 2.90625, - "learning_rate": 1.1504021866371761e-05, - "loss": 0.2105, - "step": 6740 - }, - { - "epoch": 0.6857665345931119, - "grad_norm": 5.46875, - "learning_rate": 1.143623591817304e-05, - "loss": 0.1317, - "step": 6750 - }, - { - "epoch": 0.6867824850147313, - "grad_norm": 3.34375, - "learning_rate": 1.1368590994351835e-05, - "loss": 0.1406, - "step": 6760 - }, - { - "epoch": 0.6877984354363507, - "grad_norm": 3.78125, - "learning_rate": 1.130108779822198e-05, - "loss": 0.1425, - "step": 6770 - }, - { - "epoch": 0.6888143858579702, - "grad_norm": 0.77734375, - "learning_rate": 1.1233727031623783e-05, - "loss": 0.1623, - "step": 6780 - }, - { - "epoch": 0.6898303362795896, - "grad_norm": 4.625, - "learning_rate": 1.1166509394916682e-05, - "loss": 0.1591, - "step": 6790 - }, - { - "epoch": 0.690846286701209, - "grad_norm": 3.84375, - "learning_rate": 1.1099435586971982e-05, - "loss": 0.1758, - "step": 6800 - }, - { - "epoch": 0.6918622371228285, - "grad_norm": 2.4375, - "learning_rate": 1.1032506305165555e-05, - "loss": 0.1018, - "step": 6810 - }, - { - "epoch": 0.6928781875444479, - "grad_norm": 3.203125, - "learning_rate": 1.0965722245370641e-05, - "loss": 0.1485, - "step": 6820 - }, - { - "epoch": 0.6938941379660672, - "grad_norm": 0.7109375, - "learning_rate": 1.0899084101950561e-05, - "loss": 0.1762, - "step": 6830 - }, - { - "epoch": 0.6949100883876866, - "grad_norm": 1.9765625, - "learning_rate": 1.0832592567751555e-05, - "loss": 0.1402, - "step": 6840 - }, - { - "epoch": 0.6959260388093061, - "grad_norm": 1.4609375, - "learning_rate": 1.0766248334095505e-05, - "loss": 0.2278, - "step": 6850 - }, - { - "epoch": 0.6969419892309255, - "grad_norm": 3.953125, - "learning_rate": 1.0700052090772828e-05, - "loss": 0.1969, - "step": 6860 - }, - { - "epoch": 0.6979579396525449, - "grad_norm": 2.453125, - "learning_rate": 1.0634004526035249e-05, - "loss": 0.2073, - "step": 6870 - }, - { - "epoch": 0.6989738900741643, - "grad_norm": 1.6171875, - "learning_rate": 1.0568106326588645e-05, - "loss": 0.1902, - "step": 6880 - }, - { - "epoch": 0.6999898404957838, - "grad_norm": 1.2734375, - "learning_rate": 1.0502358177585953e-05, - "loss": 0.2165, - "step": 6890 - }, - { - "epoch": 0.7010057909174032, - "grad_norm": 1.671875, - "learning_rate": 1.0436760762619977e-05, - "loss": 0.1952, - "step": 6900 - }, - { - "epoch": 0.7020217413390226, - "grad_norm": 2.8125, - "learning_rate": 1.0371314763716347e-05, - "loss": 0.1422, - "step": 6910 - }, - { - "epoch": 0.7030376917606421, - "grad_norm": 2.53125, - "learning_rate": 1.0306020861326388e-05, - "loss": 0.0961, - "step": 6920 - }, - { - "epoch": 0.7040536421822615, - "grad_norm": 3.046875, - "learning_rate": 1.0240879734320068e-05, - "loss": 0.1542, - "step": 6930 - }, - { - "epoch": 0.7050695926038809, - "grad_norm": 2.859375, - "learning_rate": 1.0175892059978901e-05, - "loss": 0.1748, - "step": 6940 - }, - { - "epoch": 0.7060855430255003, - "grad_norm": 2.671875, - "learning_rate": 1.0111058513988958e-05, - "loss": 0.0819, - "step": 6950 - }, - { - "epoch": 0.7071014934471198, - "grad_norm": 3.5625, - "learning_rate": 1.0046379770433803e-05, - "loss": 0.1933, - "step": 6960 - }, - { - "epoch": 0.7081174438687392, - "grad_norm": 2.859375, - "learning_rate": 9.98185650178749e-06, - "loss": 0.1891, - "step": 6970 - }, - { - "epoch": 0.7091333942903586, - "grad_norm": 3.15625, - "learning_rate": 9.917489378907591e-06, - "loss": 0.2102, - "step": 6980 - }, - { - "epoch": 0.7101493447119781, - "grad_norm": 6.40625, - "learning_rate": 9.853279071028212e-06, - "loss": 0.1714, - "step": 6990 - }, - { - "epoch": 0.7111652951335975, - "grad_norm": 2.375, - "learning_rate": 9.78922624575303e-06, - "loss": 0.1299, - "step": 7000 - }, - { - "epoch": 0.7121812455552169, - "grad_norm": 2.078125, - "learning_rate": 9.72533156904833e-06, - "loss": 0.1914, - "step": 7010 - }, - { - "epoch": 0.7131971959768363, - "grad_norm": 3.859375, - "learning_rate": 9.661595705236137e-06, - "loss": 0.2377, - "step": 7020 - }, - { - "epoch": 0.7142131463984558, - "grad_norm": 1.171875, - "learning_rate": 9.598019316987244e-06, - "loss": 0.1851, - "step": 7030 - }, - { - "epoch": 0.7152290968200752, - "grad_norm": 1.078125, - "learning_rate": 9.53460306531439e-06, - "loss": 0.2661, - "step": 7040 - }, - { - "epoch": 0.7162450472416946, - "grad_norm": 1.6484375, - "learning_rate": 9.471347609565311e-06, - "loss": 0.1669, - "step": 7050 - }, - { - "epoch": 0.7172609976633141, - "grad_norm": 4.59375, - "learning_rate": 9.408253607415957e-06, - "loss": 0.2487, - "step": 7060 - }, - { - "epoch": 0.7182769480849335, - "grad_norm": 3.09375, - "learning_rate": 9.345321714863614e-06, - "loss": 0.186, - "step": 7070 - }, - { - "epoch": 0.7192928985065529, - "grad_norm": 6.0625, - "learning_rate": 9.282552586220075e-06, - "loss": 0.2249, - "step": 7080 - }, - { - "epoch": 0.7203088489281723, - "grad_norm": 1.5703125, - "learning_rate": 9.219946874104885e-06, - "loss": 0.1255, - "step": 7090 - }, - { - "epoch": 0.7213247993497918, - "grad_norm": 1.9453125, - "learning_rate": 9.157505229438481e-06, - "loss": 0.1999, - "step": 7100 - }, - { - "epoch": 0.7223407497714112, - "grad_norm": 5.1875, - "learning_rate": 9.095228301435518e-06, - "loss": 0.199, - "step": 7110 - }, - { - "epoch": 0.7233567001930306, - "grad_norm": 2.078125, - "learning_rate": 9.03311673759802e-06, - "loss": 0.2182, - "step": 7120 - }, - { - "epoch": 0.7243726506146501, - "grad_norm": 6.46875, - "learning_rate": 8.971171183708733e-06, - "loss": 0.1573, - "step": 7130 - }, - { - "epoch": 0.7253886010362695, - "grad_norm": 3.015625, - "learning_rate": 8.909392283824353e-06, - "loss": 0.2044, - "step": 7140 - }, - { - "epoch": 0.7264045514578888, - "grad_norm": 2.921875, - "learning_rate": 8.847780680268872e-06, - "loss": 0.11, - "step": 7150 - }, - { - "epoch": 0.7274205018795082, - "grad_norm": 2.96875, - "learning_rate": 8.786337013626853e-06, - "loss": 0.1897, - "step": 7160 - }, - { - "epoch": 0.7284364523011277, - "grad_norm": 1.7578125, - "learning_rate": 8.725061922736799e-06, - "loss": 0.153, - "step": 7170 - }, - { - "epoch": 0.7294524027227471, - "grad_norm": 1.609375, - "learning_rate": 8.663956044684532e-06, - "loss": 0.1746, - "step": 7180 - }, - { - "epoch": 0.7304683531443665, - "grad_norm": 1.9375, - "learning_rate": 8.603020014796507e-06, - "loss": 0.2284, - "step": 7190 - }, - { - "epoch": 0.7314843035659859, - "grad_norm": 1.515625, - "learning_rate": 8.542254466633273e-06, - "loss": 0.1186, - "step": 7200 - }, - { - "epoch": 0.7325002539876054, - "grad_norm": 1.671875, - "learning_rate": 8.481660031982844e-06, - "loss": 0.1971, - "step": 7210 - }, - { - "epoch": 0.7335162044092248, - "grad_norm": 1.453125, - "learning_rate": 8.421237340854157e-06, - "loss": 0.196, - "step": 7220 - }, - { - "epoch": 0.7345321548308442, - "grad_norm": 0.65234375, - "learning_rate": 8.360987021470479e-06, - "loss": 0.1724, - "step": 7230 - }, - { - "epoch": 0.7355481052524637, - "grad_norm": 2.84375, - "learning_rate": 8.300909700262929e-06, - "loss": 0.175, - "step": 7240 - }, - { - "epoch": 0.7365640556740831, - "grad_norm": 3.109375, - "learning_rate": 8.241006001863924e-06, - "loss": 0.2276, - "step": 7250 - }, - { - "epoch": 0.7375800060957025, - "grad_norm": 4.8125, - "learning_rate": 8.181276549100714e-06, - "loss": 0.2029, - "step": 7260 - }, - { - "epoch": 0.7385959565173219, - "grad_norm": 4.03125, - "learning_rate": 8.12172196298887e-06, - "loss": 0.175, - "step": 7270 - }, - { - "epoch": 0.7396119069389414, - "grad_norm": 3.046875, - "learning_rate": 8.062342862725878e-06, - "loss": 0.1662, - "step": 7280 - }, - { - "epoch": 0.7406278573605608, - "grad_norm": 3.375, - "learning_rate": 8.003139865684662e-06, - "loss": 0.1616, - "step": 7290 - }, - { - "epoch": 0.7416438077821802, - "grad_norm": 2.5625, - "learning_rate": 7.944113587407157e-06, - "loss": 0.2448, - "step": 7300 - }, - { - "epoch": 0.7426597582037997, - "grad_norm": 4.125, - "learning_rate": 7.885264641597961e-06, - "loss": 0.1618, - "step": 7310 - }, - { - "epoch": 0.7436757086254191, - "grad_norm": 3.5, - "learning_rate": 7.826593640117889e-06, - "loss": 0.1134, - "step": 7320 - }, - { - "epoch": 0.7446916590470385, - "grad_norm": 2.6875, - "learning_rate": 7.76810119297767e-06, - "loss": 0.1795, - "step": 7330 - }, - { - "epoch": 0.7457076094686579, - "grad_norm": 4.34375, - "learning_rate": 7.709787908331556e-06, - "loss": 0.2736, - "step": 7340 - }, - { - "epoch": 0.7467235598902774, - "grad_norm": 1.21875, - "learning_rate": 7.651654392471038e-06, - "loss": 0.139, - "step": 7350 - }, - { - "epoch": 0.7477395103118968, - "grad_norm": 3.578125, - "learning_rate": 7.593701249818521e-06, - "loss": 0.2023, - "step": 7360 - }, - { - "epoch": 0.7487554607335162, - "grad_norm": 2.15625, - "learning_rate": 7.535929082921048e-06, - "loss": 0.1702, - "step": 7370 - }, - { - "epoch": 0.7497714111551357, - "grad_norm": 1.96875, - "learning_rate": 7.47833849244402e-06, - "loss": 0.1835, - "step": 7380 - }, - { - "epoch": 0.7507873615767551, - "grad_norm": 2.796875, - "learning_rate": 7.420930077164959e-06, - "loss": 0.1713, - "step": 7390 - }, - { - "epoch": 0.7518033119983745, - "grad_norm": 4.46875, - "learning_rate": 7.363704433967311e-06, - "loss": 0.1906, - "step": 7400 - }, - { - "epoch": 0.7528192624199939, - "grad_norm": 1.75, - "learning_rate": 7.306662157834185e-06, - "loss": 0.1421, - "step": 7410 - }, - { - "epoch": 0.7538352128416134, - "grad_norm": 1.140625, - "learning_rate": 7.2498038418422145e-06, - "loss": 0.1793, - "step": 7420 - }, - { - "epoch": 0.7548511632632328, - "grad_norm": 2.578125, - "learning_rate": 7.193130077155374e-06, - "loss": 0.1603, - "step": 7430 - }, - { - "epoch": 0.7558671136848522, - "grad_norm": 4.3125, - "learning_rate": 7.13664145301883e-06, - "loss": 0.2169, - "step": 7440 - }, - { - "epoch": 0.7568830641064717, - "grad_norm": 3.078125, - "learning_rate": 7.0803385567528025e-06, - "loss": 0.1685, - "step": 7450 - }, - { - "epoch": 0.757899014528091, - "grad_norm": 3.5625, - "learning_rate": 7.024221973746495e-06, - "loss": 0.2282, - "step": 7460 - }, - { - "epoch": 0.7589149649497104, - "grad_norm": 2.265625, - "learning_rate": 6.968292287451961e-06, - "loss": 0.1786, - "step": 7470 - }, - { - "epoch": 0.7599309153713298, - "grad_norm": 4.71875, - "learning_rate": 6.912550079378091e-06, - "loss": 0.1811, - "step": 7480 - }, - { - "epoch": 0.7609468657929493, - "grad_norm": 2.328125, - "learning_rate": 6.856995929084506e-06, - "loss": 0.1747, - "step": 7490 - }, - { - "epoch": 0.7619628162145687, - "grad_norm": 5.21875, - "learning_rate": 6.801630414175589e-06, - "loss": 0.2028, - "step": 7500 - }, - { - "epoch": 0.7629787666361881, - "grad_norm": 3.78125, - "learning_rate": 6.746454110294451e-06, - "loss": 0.2255, - "step": 7510 - }, - { - "epoch": 0.7639947170578075, - "grad_norm": 1.625, - "learning_rate": 6.691467591116931e-06, - "loss": 0.1604, - "step": 7520 - }, - { - "epoch": 0.765010667479427, - "grad_norm": 1.7734375, - "learning_rate": 6.6366714283456755e-06, - "loss": 0.2559, - "step": 7530 - }, - { - "epoch": 0.7660266179010464, - "grad_norm": 4.59375, - "learning_rate": 6.582066191704142e-06, - "loss": 0.2034, - "step": 7540 - }, - { - "epoch": 0.7670425683226658, - "grad_norm": 1.578125, - "learning_rate": 6.527652448930724e-06, - "loss": 0.148, - "step": 7550 - }, - { - "epoch": 0.7680585187442853, - "grad_norm": 1.7109375, - "learning_rate": 6.4734307657728e-06, - "loss": 0.1811, - "step": 7560 - }, - { - "epoch": 0.7690744691659047, - "grad_norm": 1.2734375, - "learning_rate": 6.419401705980924e-06, - "loss": 0.1407, - "step": 7570 - }, - { - "epoch": 0.7700904195875241, - "grad_norm": 2.25, - "learning_rate": 6.365565831302869e-06, - "loss": 0.1893, - "step": 7580 - }, - { - "epoch": 0.7711063700091435, - "grad_norm": 1.625, - "learning_rate": 6.311923701477854e-06, - "loss": 0.1835, - "step": 7590 - }, - { - "epoch": 0.772122320430763, - "grad_norm": 2.375, - "learning_rate": 6.258475874230713e-06, - "loss": 0.1579, - "step": 7600 - }, - { - "epoch": 0.7731382708523824, - "grad_norm": 4.5, - "learning_rate": 6.205222905266067e-06, - "loss": 0.1794, - "step": 7610 - }, - { - "epoch": 0.7741542212740018, - "grad_norm": 4.25, - "learning_rate": 6.152165348262598e-06, - "loss": 0.1477, - "step": 7620 - }, - { - "epoch": 0.7751701716956213, - "grad_norm": 1.9765625, - "learning_rate": 6.0993037548672246e-06, - "loss": 0.2396, - "step": 7630 - }, - { - "epoch": 0.7761861221172407, - "grad_norm": 2.671875, - "learning_rate": 6.046638674689454e-06, - "loss": 0.1717, - "step": 7640 - }, - { - "epoch": 0.7772020725388601, - "grad_norm": 3.671875, - "learning_rate": 5.994170655295567e-06, - "loss": 0.2646, - "step": 7650 - }, - { - "epoch": 0.7782180229604795, - "grad_norm": 1.3046875, - "learning_rate": 5.9419002422030106e-06, - "loss": 0.1553, - "step": 7660 - }, - { - "epoch": 0.779233973382099, - "grad_norm": 3.734375, - "learning_rate": 5.889827978874665e-06, - "loss": 0.1854, - "step": 7670 - }, - { - "epoch": 0.7802499238037184, - "grad_norm": 2.140625, - "learning_rate": 5.837954406713245e-06, - "loss": 0.1857, - "step": 7680 - }, - { - "epoch": 0.7812658742253378, - "grad_norm": 3.34375, - "learning_rate": 5.786280065055619e-06, - "loss": 0.1797, - "step": 7690 - }, - { - "epoch": 0.7822818246469573, - "grad_norm": 0.97265625, - "learning_rate": 5.734805491167244e-06, - "loss": 0.1488, - "step": 7700 - }, - { - "epoch": 0.7832977750685767, - "grad_norm": 2.078125, - "learning_rate": 5.683531220236576e-06, - "loss": 0.1688, - "step": 7710 - }, - { - "epoch": 0.7843137254901961, - "grad_norm": 3.046875, - "learning_rate": 5.632457785369455e-06, - "loss": 0.1503, - "step": 7720 - }, - { - "epoch": 0.7853296759118155, - "grad_norm": 1.6875, - "learning_rate": 5.581585717583637e-06, - "loss": 0.1658, - "step": 7730 - }, - { - "epoch": 0.786345626333435, - "grad_norm": 3.421875, - "learning_rate": 5.530915545803209e-06, - "loss": 0.2112, - "step": 7740 - }, - { - "epoch": 0.7873615767550544, - "grad_norm": 4.1875, - "learning_rate": 5.480447796853141e-06, - "loss": 0.165, - "step": 7750 - }, - { - "epoch": 0.7883775271766738, - "grad_norm": 5.3125, - "learning_rate": 5.430182995453756e-06, - "loss": 0.1499, - "step": 7760 - }, - { - "epoch": 0.7893934775982933, - "grad_norm": 2.1875, - "learning_rate": 5.380121664215329e-06, - "loss": 0.1559, - "step": 7770 - }, - { - "epoch": 0.7904094280199127, - "grad_norm": 1.46875, - "learning_rate": 5.330264323632611e-06, - "loss": 0.2098, - "step": 7780 - }, - { - "epoch": 0.791425378441532, - "grad_norm": 4.65625, - "learning_rate": 5.280611492079449e-06, - "loss": 0.1776, - "step": 7790 - }, - { - "epoch": 0.7924413288631514, - "grad_norm": 1.3359375, - "learning_rate": 5.231163685803361e-06, - "loss": 0.1497, - "step": 7800 - }, - { - "epoch": 0.7934572792847709, - "grad_norm": 2.640625, - "learning_rate": 5.181921418920191e-06, - "loss": 0.12, - "step": 7810 - }, - { - "epoch": 0.7944732297063903, - "grad_norm": 2.328125, - "learning_rate": 5.13288520340878e-06, - "loss": 0.1981, - "step": 7820 - }, - { - "epoch": 0.7954891801280097, - "grad_norm": 3.0625, - "learning_rate": 5.084055549105596e-06, - "loss": 0.1389, - "step": 7830 - }, - { - "epoch": 0.7965051305496291, - "grad_norm": 2.796875, - "learning_rate": 5.035432963699479e-06, - "loss": 0.2293, - "step": 7840 - }, - { - "epoch": 0.7975210809712486, - "grad_norm": 5.0625, - "learning_rate": 4.98701795272635e-06, - "loss": 0.1618, - "step": 7850 - }, - { - "epoch": 0.798537031392868, - "grad_norm": 5.09375, - "learning_rate": 4.938811019563938e-06, - "loss": 0.1755, - "step": 7860 - }, - { - "epoch": 0.7995529818144874, - "grad_norm": 2.140625, - "learning_rate": 4.8908126654265475e-06, - "loss": 0.1565, - "step": 7870 - }, - { - "epoch": 0.8005689322361069, - "grad_norm": 0.76171875, - "learning_rate": 4.843023389359885e-06, - "loss": 0.2176, - "step": 7880 - }, - { - "epoch": 0.8015848826577263, - "grad_norm": 2.625, - "learning_rate": 4.79544368823581e-06, - "loss": 0.2013, - "step": 7890 - }, - { - "epoch": 0.8026008330793457, - "grad_norm": 2.078125, - "learning_rate": 4.748074056747234e-06, - "loss": 0.1246, - "step": 7900 - }, - { - "epoch": 0.8036167835009651, - "grad_norm": 3.5, - "learning_rate": 4.700914987402919e-06, - "loss": 0.1638, - "step": 7910 - }, - { - "epoch": 0.8046327339225846, - "grad_norm": 3.4375, - "learning_rate": 4.6539669705223916e-06, - "loss": 0.2213, - "step": 7920 - }, - { - "epoch": 0.805648684344204, - "grad_norm": 2.96875, - "learning_rate": 4.607230494230849e-06, - "loss": 0.1822, - "step": 7930 - }, - { - "epoch": 0.8066646347658234, - "grad_norm": 2.359375, - "learning_rate": 4.560706044454047e-06, - "loss": 0.1763, - "step": 7940 - }, - { - "epoch": 0.8076805851874429, - "grad_norm": 4.59375, - "learning_rate": 4.514394104913291e-06, - "loss": 0.234, - "step": 7950 - }, - { - "epoch": 0.8086965356090623, - "grad_norm": 1.96875, - "learning_rate": 4.468295157120372e-06, - "loss": 0.1939, - "step": 7960 - }, - { - "epoch": 0.8097124860306817, - "grad_norm": 2.578125, - "learning_rate": 4.422409680372594e-06, - "loss": 0.174, - "step": 7970 - }, - { - "epoch": 0.8107284364523011, - "grad_norm": 4.5625, - "learning_rate": 4.3767381517477505e-06, - "loss": 0.2375, - "step": 7980 - }, - { - "epoch": 0.8117443868739206, - "grad_norm": 0.9609375, - "learning_rate": 4.331281046099203e-06, - "loss": 0.2076, - "step": 7990 - }, - { - "epoch": 0.81276033729554, - "grad_norm": 6.0625, - "learning_rate": 4.286038836050929e-06, - "loss": 0.2504, - "step": 8000 - }, - { - "epoch": 0.8137762877171594, - "grad_norm": 3.484375, - "learning_rate": 4.241011991992586e-06, - "loss": 0.2102, - "step": 8010 - }, - { - "epoch": 0.8147922381387789, - "grad_norm": 1.9765625, - "learning_rate": 4.1962009820746635e-06, - "loss": 0.1846, - "step": 8020 - }, - { - "epoch": 0.8158081885603983, - "grad_norm": 1.875, - "learning_rate": 4.15160627220357e-06, - "loss": 0.1741, - "step": 8030 - }, - { - "epoch": 0.8168241389820177, - "grad_norm": 5.5625, - "learning_rate": 4.107228326036838e-06, - "loss": 0.2078, - "step": 8040 - }, - { - "epoch": 0.8178400894036371, - "grad_norm": 1.7578125, - "learning_rate": 4.063067604978252e-06, - "loss": 0.212, - "step": 8050 - }, - { - "epoch": 0.8188560398252566, - "grad_norm": 4.09375, - "learning_rate": 4.019124568173094e-06, - "loss": 0.1831, - "step": 8060 - }, - { - "epoch": 0.819871990246876, - "grad_norm": 6.625, - "learning_rate": 3.975399672503341e-06, - "loss": 0.2196, - "step": 8070 - }, - { - "epoch": 0.8208879406684954, - "grad_norm": 2.78125, - "learning_rate": 3.931893372582943e-06, - "loss": 0.2002, - "step": 8080 - }, - { - "epoch": 0.8219038910901149, - "grad_norm": 6.90625, - "learning_rate": 3.888606120753047e-06, - "loss": 0.2138, - "step": 8090 - }, - { - "epoch": 0.8229198415117343, - "grad_norm": 4.09375, - "learning_rate": 3.845538367077362e-06, - "loss": 0.2593, - "step": 8100 - }, - { - "epoch": 0.8239357919333536, - "grad_norm": 1.859375, - "learning_rate": 3.8026905593374213e-06, - "loss": 0.2062, - "step": 8110 - }, - { - "epoch": 0.824951742354973, - "grad_norm": 4.3125, - "learning_rate": 3.760063143027945e-06, - "loss": 0.1343, - "step": 8120 - }, - { - "epoch": 0.8259676927765925, - "grad_norm": 1.984375, - "learning_rate": 3.7176565613522313e-06, - "loss": 0.2494, - "step": 8130 - }, - { - "epoch": 0.8269836431982119, - "grad_norm": 3.71875, - "learning_rate": 3.675471255217516e-06, - "loss": 0.1502, - "step": 8140 - }, - { - "epoch": 0.8279995936198313, - "grad_norm": 2.359375, - "learning_rate": 3.6335076632304175e-06, - "loss": 0.1256, - "step": 8150 - }, - { - "epoch": 0.8290155440414507, - "grad_norm": 1.46875, - "learning_rate": 3.5917662216923332e-06, - "loss": 0.1709, - "step": 8160 - }, - { - "epoch": 0.8300314944630702, - "grad_norm": 2.78125, - "learning_rate": 3.550247364594958e-06, - "loss": 0.1881, - "step": 8170 - }, - { - "epoch": 0.8310474448846896, - "grad_norm": 1.0703125, - "learning_rate": 3.508951523615725e-06, - "loss": 0.1998, - "step": 8180 - }, - { - "epoch": 0.832063395306309, - "grad_norm": 2.40625, - "learning_rate": 3.467879128113352e-06, - "loss": 0.2429, - "step": 8190 - }, - { - "epoch": 0.8330793457279285, - "grad_norm": 2.609375, - "learning_rate": 3.427030605123352e-06, - "loss": 0.1942, - "step": 8200 - }, - { - "epoch": 0.8340952961495479, - "grad_norm": 1.6015625, - "learning_rate": 3.3864063793536043e-06, - "loss": 0.1898, - "step": 8210 - }, - { - "epoch": 0.8351112465711673, - "grad_norm": 5.375, - "learning_rate": 3.3460068731799577e-06, - "loss": 0.1919, - "step": 8220 - }, - { - "epoch": 0.8361271969927867, - "grad_norm": 3.3125, - "learning_rate": 3.3058325066417818e-06, - "loss": 0.1516, - "step": 8230 - }, - { - "epoch": 0.8371431474144062, - "grad_norm": 0.76171875, - "learning_rate": 3.26588369743768e-06, - "loss": 0.1068, - "step": 8240 - }, - { - "epoch": 0.8381590978360256, - "grad_norm": 3.171875, - "learning_rate": 3.2261608609210653e-06, - "loss": 0.1203, - "step": 8250 - }, - { - "epoch": 0.839175048257645, - "grad_norm": 2.359375, - "learning_rate": 3.186664410095913e-06, - "loss": 0.2172, - "step": 8260 - }, - { - "epoch": 0.8401909986792645, - "grad_norm": 3.328125, - "learning_rate": 3.1473947556124093e-06, - "loss": 0.1249, - "step": 8270 - }, - { - "epoch": 0.8412069491008839, - "grad_norm": 2.484375, - "learning_rate": 3.1083523057627213e-06, - "loss": 0.1744, - "step": 8280 - }, - { - "epoch": 0.8422228995225033, - "grad_norm": 4.46875, - "learning_rate": 3.0695374664767353e-06, - "loss": 0.1772, - "step": 8290 - }, - { - "epoch": 0.8432388499441227, - "grad_norm": 0.59375, - "learning_rate": 3.0309506413178397e-06, - "loss": 0.2302, - "step": 8300 - }, - { - "epoch": 0.8442548003657422, - "grad_norm": 2.390625, - "learning_rate": 2.9925922314787136e-06, - "loss": 0.1635, - "step": 8310 - }, - { - "epoch": 0.8452707507873616, - "grad_norm": 2.34375, - "learning_rate": 2.954462635777194e-06, - "loss": 0.1573, - "step": 8320 - }, - { - "epoch": 0.846286701208981, - "grad_norm": 2.015625, - "learning_rate": 2.916562250652083e-06, - "loss": 0.1608, - "step": 8330 - }, - { - "epoch": 0.8473026516306005, - "grad_norm": 4.125, - "learning_rate": 2.878891470159048e-06, - "loss": 0.184, - "step": 8340 - }, - { - "epoch": 0.8483186020522199, - "grad_norm": 2.515625, - "learning_rate": 2.8414506859665514e-06, - "loss": 0.2141, - "step": 8350 - }, - { - "epoch": 0.8493345524738393, - "grad_norm": 3.375, - "learning_rate": 2.8042402873517197e-06, - "loss": 0.1729, - "step": 8360 - }, - { - "epoch": 0.8503505028954587, - "grad_norm": 3.078125, - "learning_rate": 2.76726066119635e-06, - "loss": 0.2252, - "step": 8370 - }, - { - "epoch": 0.8513664533170782, - "grad_norm": 1.5390625, - "learning_rate": 2.730512191982845e-06, - "loss": 0.1644, - "step": 8380 - }, - { - "epoch": 0.8523824037386976, - "grad_norm": 1.9296875, - "learning_rate": 2.693995261790261e-06, - "loss": 0.1822, - "step": 8390 - }, - { - "epoch": 0.853398354160317, - "grad_norm": 3.3125, - "learning_rate": 2.657710250290285e-06, - "loss": 0.2068, - "step": 8400 - }, - { - "epoch": 0.8544143045819365, - "grad_norm": 0.640625, - "learning_rate": 2.621657534743327e-06, - "loss": 0.1224, - "step": 8410 - }, - { - "epoch": 0.8554302550035559, - "grad_norm": 3.421875, - "learning_rate": 2.5858374899945804e-06, - "loss": 0.179, - "step": 8420 - }, - { - "epoch": 0.8564462054251752, - "grad_norm": 3.484375, - "learning_rate": 2.550250488470135e-06, - "loss": 0.1873, - "step": 8430 - }, - { - "epoch": 0.8574621558467946, - "grad_norm": 3.984375, - "learning_rate": 2.5148969001730806e-06, - "loss": 0.1799, - "step": 8440 - }, - { - "epoch": 0.8584781062684141, - "grad_norm": 1.375, - "learning_rate": 2.4797770926796858e-06, - "loss": 0.176, - "step": 8450 - }, - { - "epoch": 0.8594940566900335, - "grad_norm": 1.8984375, - "learning_rate": 2.444891431135571e-06, - "loss": 0.1664, - "step": 8460 - }, - { - "epoch": 0.8605100071116529, - "grad_norm": 4.15625, - "learning_rate": 2.4102402782518936e-06, - "loss": 0.1512, - "step": 8470 - }, - { - "epoch": 0.8615259575332723, - "grad_norm": 1.34375, - "learning_rate": 2.3758239943016096e-06, - "loss": 0.1629, - "step": 8480 - }, - { - "epoch": 0.8625419079548918, - "grad_norm": 5.3125, - "learning_rate": 2.3416429371157013e-06, - "loss": 0.2099, - "step": 8490 - }, - { - "epoch": 0.8635578583765112, - "grad_norm": 5.9375, - "learning_rate": 2.307697462079464e-06, - "loss": 0.2221, - "step": 8500 - }, - { - "epoch": 0.8645738087981306, - "grad_norm": 5.4375, - "learning_rate": 2.273987922128809e-06, - "loss": 0.2191, - "step": 8510 - }, - { - "epoch": 0.8655897592197501, - "grad_norm": 2.171875, - "learning_rate": 2.240514667746607e-06, - "loss": 0.1843, - "step": 8520 - }, - { - "epoch": 0.8666057096413695, - "grad_norm": 2.5625, - "learning_rate": 2.2072780469590245e-06, - "loss": 0.2494, - "step": 8530 - }, - { - "epoch": 0.8676216600629889, - "grad_norm": 2.25, - "learning_rate": 2.1742784053319116e-06, - "loss": 0.1712, - "step": 8540 - }, - { - "epoch": 0.8686376104846083, - "grad_norm": 4.5625, - "learning_rate": 2.141516085967224e-06, - "loss": 0.1169, - "step": 8550 - }, - { - "epoch": 0.8696535609062278, - "grad_norm": 4.25, - "learning_rate": 2.1089914294994434e-06, - "loss": 0.1374, - "step": 8560 - }, - { - "epoch": 0.8706695113278472, - "grad_norm": 3.265625, - "learning_rate": 2.0767047740920336e-06, - "loss": 0.2162, - "step": 8570 - }, - { - "epoch": 0.8716854617494666, - "grad_norm": 1.8203125, - "learning_rate": 2.0446564554339187e-06, - "loss": 0.1593, - "step": 8580 - }, - { - "epoch": 0.8727014121710861, - "grad_norm": 2.671875, - "learning_rate": 2.0128468067360185e-06, - "loss": 0.1857, - "step": 8590 - }, - { - "epoch": 0.8737173625927055, - "grad_norm": 2.765625, - "learning_rate": 1.981276158727749e-06, - "loss": 0.1989, - "step": 8600 - }, - { - "epoch": 0.8747333130143249, - "grad_norm": 2.65625, - "learning_rate": 1.949944839653625e-06, - "loss": 0.2077, - "step": 8610 - }, - { - "epoch": 0.8757492634359443, - "grad_norm": 2.625, - "learning_rate": 1.918853175269797e-06, - "loss": 0.2003, - "step": 8620 - }, - { - "epoch": 0.8767652138575638, - "grad_norm": 0.71875, - "learning_rate": 1.8880014888407127e-06, - "loss": 0.2486, - "step": 8630 - }, - { - "epoch": 0.8777811642791832, - "grad_norm": 4.71875, - "learning_rate": 1.8573901011357336e-06, - "loss": 0.1896, - "step": 8640 - }, - { - "epoch": 0.8787971147008026, - "grad_norm": 5.0625, - "learning_rate": 1.8270193304257887e-06, - "loss": 0.1727, - "step": 8650 - }, - { - "epoch": 0.8798130651224221, - "grad_norm": 1.75, - "learning_rate": 1.7968894924800916e-06, - "loss": 0.1687, - "step": 8660 - }, - { - "epoch": 0.8808290155440415, - "grad_norm": 2.65625, - "learning_rate": 1.7670009005628291e-06, - "loss": 0.166, - "step": 8670 - }, - { - "epoch": 0.8818449659656609, - "grad_norm": 4.71875, - "learning_rate": 1.737353865429936e-06, - "loss": 0.1471, - "step": 8680 - }, - { - "epoch": 0.8828609163872803, - "grad_norm": 0.546875, - "learning_rate": 1.7079486953258283e-06, - "loss": 0.1075, - "step": 8690 - }, - { - "epoch": 0.8838768668088998, - "grad_norm": 1.640625, - "learning_rate": 1.6787856959802367e-06, - "loss": 0.2113, - "step": 8700 - }, - { - "epoch": 0.8848928172305192, - "grad_norm": 2.953125, - "learning_rate": 1.6498651706049945e-06, - "loss": 0.1412, - "step": 8710 - }, - { - "epoch": 0.8859087676521386, - "grad_norm": 3.796875, - "learning_rate": 1.6211874198909072e-06, - "loss": 0.1701, - "step": 8720 - }, - { - "epoch": 0.8869247180737581, - "grad_norm": 3.734375, - "learning_rate": 1.592752742004605e-06, - "loss": 0.1348, - "step": 8730 - }, - { - "epoch": 0.8879406684953774, - "grad_norm": 2.21875, - "learning_rate": 1.5645614325854735e-06, - "loss": 0.1931, - "step": 8740 - }, - { - "epoch": 0.8889566189169968, - "grad_norm": 3.4375, - "learning_rate": 1.5366137847425466e-06, - "loss": 0.1705, - "step": 8750 - }, - { - "epoch": 0.8899725693386162, - "grad_norm": 3.5625, - "learning_rate": 1.5089100890514769e-06, - "loss": 0.1889, - "step": 8760 - }, - { - "epoch": 0.8909885197602357, - "grad_norm": 2.65625, - "learning_rate": 1.4814506335515176e-06, - "loss": 0.1837, - "step": 8770 - }, - { - "epoch": 0.8920044701818551, - "grad_norm": 1.421875, - "learning_rate": 1.4542357037425207e-06, - "loss": 0.1728, - "step": 8780 - }, - { - "epoch": 0.8930204206034745, - "grad_norm": 1.625, - "learning_rate": 1.4272655825819713e-06, - "loss": 0.1562, - "step": 8790 - }, - { - "epoch": 0.8940363710250939, - "grad_norm": 4.0625, - "learning_rate": 1.4005405504820351e-06, - "loss": 0.1681, - "step": 8800 - }, - { - "epoch": 0.8950523214467134, - "grad_norm": 2.328125, - "learning_rate": 1.3740608853066634e-06, - "loss": 0.1449, - "step": 8810 - }, - { - "epoch": 0.8960682718683328, - "grad_norm": 4.0625, - "learning_rate": 1.347826862368684e-06, - "loss": 0.2418, - "step": 8820 - }, - { - "epoch": 0.8970842222899522, - "grad_norm": 0.55859375, - "learning_rate": 1.3218387544269545e-06, - "loss": 0.2473, - "step": 8830 - }, - { - "epoch": 0.8981001727115717, - "grad_norm": 4.78125, - "learning_rate": 1.2960968316835132e-06, - "loss": 0.194, - "step": 8840 - }, - { - "epoch": 0.8991161231331911, - "grad_norm": 3.921875, - "learning_rate": 1.2706013617807822e-06, - "loss": 0.2109, - "step": 8850 - }, - { - "epoch": 0.9001320735548105, - "grad_norm": 5.03125, - "learning_rate": 1.2453526097987778e-06, - "loss": 0.151, - "step": 8860 - }, - { - "epoch": 0.9011480239764299, - "grad_norm": 5.96875, - "learning_rate": 1.2203508382523431e-06, - "loss": 0.1811, - "step": 8870 - }, - { - "epoch": 0.9021639743980494, - "grad_norm": 3.828125, - "learning_rate": 1.1955963070884534e-06, - "loss": 0.2004, - "step": 8880 - }, - { - "epoch": 0.9031799248196688, - "grad_norm": 1.9765625, - "learning_rate": 1.171089273683465e-06, - "loss": 0.1395, - "step": 8890 - }, - { - "epoch": 0.9041958752412882, - "grad_norm": 2.328125, - "learning_rate": 1.1468299928404868e-06, - "loss": 0.1915, - "step": 8900 - }, - { - "epoch": 0.9052118256629077, - "grad_norm": 1.265625, - "learning_rate": 1.1228187167866943e-06, - "loss": 0.1281, - "step": 8910 - }, - { - "epoch": 0.9062277760845271, - "grad_norm": 1.4375, - "learning_rate": 1.099055695170728e-06, - "loss": 0.1627, - "step": 8920 - }, - { - "epoch": 0.9072437265061465, - "grad_norm": 0.6953125, - "learning_rate": 1.0755411750600962e-06, - "loss": 0.1768, - "step": 8930 - }, - { - "epoch": 0.9082596769277659, - "grad_norm": 1.046875, - "learning_rate": 1.052275400938596e-06, - "loss": 0.1544, - "step": 8940 - }, - { - "epoch": 0.9092756273493854, - "grad_norm": 2.71875, - "learning_rate": 1.0292586147037764e-06, - "loss": 0.2498, - "step": 8950 - }, - { - "epoch": 0.9102915777710048, - "grad_norm": 3.0625, - "learning_rate": 1.0064910556644214e-06, - "loss": 0.1918, - "step": 8960 - }, - { - "epoch": 0.9113075281926242, - "grad_norm": 4.0, - "learning_rate": 9.839729605380766e-07, - "loss": 0.2388, - "step": 8970 - }, - { - "epoch": 0.9123234786142437, - "grad_norm": 3.765625, - "learning_rate": 9.61704563448565e-07, - "loss": 0.1944, - "step": 8980 - }, - { - "epoch": 0.9133394290358631, - "grad_norm": 2.90625, - "learning_rate": 9.396860959235671e-07, - "loss": 0.1667, - "step": 8990 - }, - { - "epoch": 0.9143553794574825, - "grad_norm": 2.4375, - "learning_rate": 9.179177868922085e-07, - "loss": 0.2143, - "step": 9000 - }, - { - "epoch": 0.9153713298791019, - "grad_norm": 3.03125, - "learning_rate": 8.963998626826925e-07, - "loss": 0.1994, - "step": 9010 - }, - { - "epoch": 0.9163872803007214, - "grad_norm": 3.859375, - "learning_rate": 8.751325470199134e-07, - "loss": 0.1714, - "step": 9020 - }, - { - "epoch": 0.9174032307223408, - "grad_norm": 3.375, - "learning_rate": 8.541160610231803e-07, - "loss": 0.144, - "step": 9030 - }, - { - "epoch": 0.9184191811439602, - "grad_norm": 1.3046875, - "learning_rate": 8.333506232038629e-07, - "loss": 0.1333, - "step": 9040 - }, - { - "epoch": 0.9194351315655795, - "grad_norm": 1.734375, - "learning_rate": 8.128364494631724e-07, - "loss": 0.1504, - "step": 9050 - }, - { - "epoch": 0.920451081987199, - "grad_norm": 2.90625, - "learning_rate": 7.925737530898702e-07, - "loss": 0.2235, - "step": 9060 - }, - { - "epoch": 0.9214670324088184, - "grad_norm": 3.359375, - "learning_rate": 7.725627447580902e-07, - "loss": 0.1256, - "step": 9070 - }, - { - "epoch": 0.9224829828304378, - "grad_norm": 6.125, - "learning_rate": 7.528036325251231e-07, - "loss": 0.1963, - "step": 9080 - }, - { - "epoch": 0.9234989332520573, - "grad_norm": 2.359375, - "learning_rate": 7.33296621829252e-07, - "loss": 0.2208, - "step": 9090 - }, - { - "epoch": 0.9245148836736767, - "grad_norm": 3.546875, - "learning_rate": 7.140419154876372e-07, - "loss": 0.184, - "step": 9100 - }, - { - "epoch": 0.9255308340952961, - "grad_norm": 2.640625, - "learning_rate": 6.950397136941872e-07, - "loss": 0.2097, - "step": 9110 - }, - { - "epoch": 0.9265467845169155, - "grad_norm": 3.671875, - "learning_rate": 6.762902140174888e-07, - "loss": 0.19, - "step": 9120 - }, - { - "epoch": 0.927562734938535, - "grad_norm": 2.03125, - "learning_rate": 6.577936113987437e-07, - "loss": 0.1427, - "step": 9130 - }, - { - "epoch": 0.9285786853601544, - "grad_norm": 2.1875, - "learning_rate": 6.395500981497577e-07, - "loss": 0.2116, - "step": 9140 - }, - { - "epoch": 0.9295946357817738, - "grad_norm": 5.25, - "learning_rate": 6.215598639509185e-07, - "loss": 0.2384, - "step": 9150 - }, - { - "epoch": 0.9306105862033933, - "grad_norm": 3.953125, - "learning_rate": 6.038230958492403e-07, - "loss": 0.2406, - "step": 9160 - }, - { - "epoch": 0.9316265366250127, - "grad_norm": 4.09375, - "learning_rate": 5.863399782564199e-07, - "loss": 0.1889, - "step": 9170 - }, - { - "epoch": 0.9326424870466321, - "grad_norm": 3.109375, - "learning_rate": 5.691106929469004e-07, - "loss": 0.1916, - "step": 9180 - }, - { - "epoch": 0.9336584374682515, - "grad_norm": 1.65625, - "learning_rate": 5.521354190560102e-07, - "loss": 0.1752, - "step": 9190 - }, - { - "epoch": 0.934674387889871, - "grad_norm": 5.21875, - "learning_rate": 5.354143330780714e-07, - "loss": 0.1779, - "step": 9200 - }, - { - "epoch": 0.9356903383114904, - "grad_norm": 2.359375, - "learning_rate": 5.18947608864595e-07, - "loss": 0.1461, - "step": 9210 - }, - { - "epoch": 0.9367062887331098, - "grad_norm": 1.0859375, - "learning_rate": 5.027354176224353e-07, - "loss": 0.2565, - "step": 9220 - }, - { - "epoch": 0.9377222391547293, - "grad_norm": 4.5, - "learning_rate": 4.867779279120493e-07, - "loss": 0.2301, - "step": 9230 - }, - { - "epoch": 0.9387381895763487, - "grad_norm": 2.984375, - "learning_rate": 4.710753056457157e-07, - "loss": 0.1916, - "step": 9240 - }, - { - "epoch": 0.9397541399979681, - "grad_norm": 4.40625, - "learning_rate": 4.556277140858267e-07, - "loss": 0.1808, - "step": 9250 - }, - { - "epoch": 0.9407700904195875, - "grad_norm": 4.3125, - "learning_rate": 4.404353138431766e-07, - "loss": 0.1552, - "step": 9260 - }, - { - "epoch": 0.941786040841207, - "grad_norm": 2.640625, - "learning_rate": 4.254982628753096e-07, - "loss": 0.1995, - "step": 9270 - }, - { - "epoch": 0.9428019912628264, - "grad_norm": 3.03125, - "learning_rate": 4.108167164848575e-07, - "loss": 0.1495, - "step": 9280 - }, - { - "epoch": 0.9438179416844458, - "grad_norm": 3.203125, - "learning_rate": 3.963908273179384e-07, - "loss": 0.1787, - "step": 9290 - }, - { - "epoch": 0.9448338921060653, - "grad_norm": 0.8671875, - "learning_rate": 3.8222074536257144e-07, - "loss": 0.1742, - "step": 9300 - }, - { - "epoch": 0.9458498425276847, - "grad_norm": 0.734375, - "learning_rate": 3.683066179470979e-07, - "loss": 0.1386, - "step": 9310 - }, - { - "epoch": 0.9468657929493041, - "grad_norm": 2.796875, - "learning_rate": 3.5464858973868476e-07, - "loss": 0.1806, - "step": 9320 - }, - { - "epoch": 0.9478817433709235, - "grad_norm": 2.734375, - "learning_rate": 3.4124680274177646e-07, - "loss": 0.1873, - "step": 9330 - }, - { - "epoch": 0.948897693792543, - "grad_norm": 2.390625, - "learning_rate": 3.2810139629665393e-07, - "loss": 0.1745, - "step": 9340 - }, - { - "epoch": 0.9499136442141624, - "grad_norm": 3.546875, - "learning_rate": 3.152125070779749e-07, - "loss": 0.2116, - "step": 9350 - }, - { - "epoch": 0.9509295946357817, - "grad_norm": 5.5625, - "learning_rate": 3.0258026909334713e-07, - "loss": 0.2088, - "step": 9360 - }, - { - "epoch": 0.9519455450574011, - "grad_norm": 4.5, - "learning_rate": 2.9020481368193795e-07, - "loss": 0.111, - "step": 9370 - }, - { - "epoch": 0.9529614954790206, - "grad_norm": 3.15625, - "learning_rate": 2.7808626951310867e-07, - "loss": 0.2391, - "step": 9380 - }, - { - "epoch": 0.95397744590064, - "grad_norm": 2.6875, - "learning_rate": 2.662247625850822e-07, - "loss": 0.3217, - "step": 9390 - }, - { - "epoch": 0.9549933963222594, - "grad_norm": 2.65625, - "learning_rate": 2.5462041622362767e-07, - "loss": 0.1667, - "step": 9400 - }, - { - "epoch": 0.9560093467438789, - "grad_norm": 0.8984375, - "learning_rate": 2.4327335108077773e-07, - "loss": 0.1709, - "step": 9410 - }, - { - "epoch": 0.9570252971654983, - "grad_norm": 4.65625, - "learning_rate": 2.3218368513357737e-07, - "loss": 0.1912, - "step": 9420 - }, - { - "epoch": 0.9580412475871177, - "grad_norm": 3.390625, - "learning_rate": 2.213515336828592e-07, - "loss": 0.1544, - "step": 9430 - }, - { - "epoch": 0.9590571980087371, - "grad_norm": 2.921875, - "learning_rate": 2.1077700935202836e-07, - "loss": 0.1806, - "step": 9440 - }, - { - "epoch": 0.9600731484303566, - "grad_norm": 3.5625, - "learning_rate": 2.004602220859214e-07, - "loss": 0.152, - "step": 9450 - }, - { - "epoch": 0.961089098851976, - "grad_norm": 2.671875, - "learning_rate": 1.9040127914963514e-07, - "loss": 0.1799, - "step": 9460 - }, - { - "epoch": 0.9621050492735954, - "grad_norm": 2.828125, - "learning_rate": 1.8060028512742188e-07, - "loss": 0.1811, - "step": 9470 - }, - { - "epoch": 0.9631209996952149, - "grad_norm": 5.875, - "learning_rate": 1.7105734192160717e-07, - "loss": 0.2012, - "step": 9480 - }, - { - "epoch": 0.9641369501168343, - "grad_norm": 3.21875, - "learning_rate": 1.6177254875152647e-07, - "loss": 0.2129, - "step": 9490 - }, - { - "epoch": 0.9651529005384537, - "grad_norm": 1.078125, - "learning_rate": 1.5274600215248736e-07, - "loss": 0.1498, - "step": 9500 - } - ], - "logging_steps": 10, - "max_steps": 9843, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "total_flos": 0.0, - "train_batch_size": 7, - "trial_name": null, - "trial_params": null -} diff --git a/checkpoints/checkpoint-9500/training_args.bin b/checkpoints/checkpoint-9500/training_args.bin deleted file mode 100644 index 6bb0a4ec10ab29ca1942aecdfa4212d352e373f1..0000000000000000000000000000000000000000 --- a/checkpoints/checkpoint-9500/training_args.bin +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 -size 4984 diff --git a/training_args.bin b/training_args.bin index 6bb0a4ec10ab29ca1942aecdfa4212d352e373f1..4551afd3015888d322905d1067aeaf632b02e4d9 100644 --- a/training_args.bin +++ b/training_args.bin @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:678fdb0261b74cfbf22c48f7675247cc333daf413df3ae34c37afbda83a36025 +oid sha256:c506e22e77e2cbebfc5cc094ca912b0f34562cb90a59703aabccd84045bda36d size 4984