diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5669 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.944375772558715, + "eval_steps": 5000, + "global_step": 20000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00024721878862793575, + "grad_norm": 1448.0, + "learning_rate": 2.0000000000000002e-07, + "loss": 13.9716, + "step": 1 + }, + { + "epoch": 0.006180469715698393, + "grad_norm": 334.0, + "learning_rate": 5e-06, + "loss": 9.5244, + "step": 25 + }, + { + "epoch": 0.012360939431396786, + "grad_norm": 78.5, + "learning_rate": 1e-05, + "loss": 7.5535, + "step": 50 + }, + { + "epoch": 0.018541409147095178, + "grad_norm": 34.5, + "learning_rate": 1.5e-05, + "loss": 7.0022, + "step": 75 + }, + { + "epoch": 0.024721878862793572, + "grad_norm": 18.75, + "learning_rate": 2e-05, + "loss": 6.4562, + "step": 100 + }, + { + "epoch": 0.030902348578491966, + "grad_norm": 63.75, + "learning_rate": 2.5e-05, + "loss": 5.6999, + "step": 125 + }, + { + "epoch": 0.037082818294190356, + "grad_norm": 29.125, + "learning_rate": 3e-05, + "loss": 5.2536, + "step": 150 + }, + { + "epoch": 0.04326328800988875, + "grad_norm": 29.5, + "learning_rate": 3.5e-05, + "loss": 4.6767, + "step": 175 + }, + { + "epoch": 0.049443757725587144, + "grad_norm": 20.25, + "learning_rate": 4e-05, + "loss": 4.2913, + "step": 200 + }, + { + "epoch": 0.05562422744128554, + "grad_norm": 16.625, + "learning_rate": 4.5e-05, + "loss": 3.7505, + "step": 225 + }, + { + "epoch": 0.06180469715698393, + "grad_norm": 17.375, + "learning_rate": 5e-05, + "loss": 3.3317, + "step": 250 + }, + { + "epoch": 0.06798516687268233, + "grad_norm": 112.0, + "learning_rate": 5.500000000000001e-05, + "loss": 4.1318, + "step": 275 + }, + { + "epoch": 0.07416563658838071, + "grad_norm": 31.75, + "learning_rate": 6e-05, + "loss": 3.558, + "step": 300 + }, + { + "epoch": 0.08034610630407911, + "grad_norm": 13.3125, + "learning_rate": 6.500000000000001e-05, + "loss": 2.9825, + "step": 325 + }, + { + "epoch": 0.0865265760197775, + "grad_norm": 12.875, + "learning_rate": 7e-05, + "loss": 2.6931, + "step": 350 + }, + { + "epoch": 0.09270704573547589, + "grad_norm": 12.8125, + "learning_rate": 7.500000000000001e-05, + "loss": 2.4328, + "step": 375 + }, + { + "epoch": 0.09888751545117429, + "grad_norm": 11.0625, + "learning_rate": 8e-05, + "loss": 2.285, + "step": 400 + }, + { + "epoch": 0.10506798516687268, + "grad_norm": 6.28125, + "learning_rate": 8.5e-05, + "loss": 2.1667, + "step": 425 + }, + { + "epoch": 0.11124845488257108, + "grad_norm": 4.5625, + "learning_rate": 9e-05, + "loss": 2.0534, + "step": 450 + }, + { + "epoch": 0.11742892459826947, + "grad_norm": 3.90625, + "learning_rate": 9.5e-05, + "loss": 1.9744, + "step": 475 + }, + { + "epoch": 0.12360939431396786, + "grad_norm": 4.78125, + "learning_rate": 0.0001, + "loss": 1.9209, + "step": 500 + }, + { + "epoch": 0.12978986402966625, + "grad_norm": 3.09375, + "learning_rate": 9.987179487179488e-05, + "loss": 1.8793, + "step": 525 + }, + { + "epoch": 0.13597033374536466, + "grad_norm": 2.953125, + "learning_rate": 9.974358974358975e-05, + "loss": 1.8301, + "step": 550 + }, + { + "epoch": 0.14215080346106304, + "grad_norm": 2.25, + "learning_rate": 9.961538461538463e-05, + "loss": 1.7935, + "step": 575 + }, + { + "epoch": 0.14833127317676142, + "grad_norm": 2.3125, + "learning_rate": 9.948717948717949e-05, + "loss": 1.7651, + "step": 600 + }, + { + "epoch": 0.15451174289245984, + "grad_norm": 2.21875, + "learning_rate": 9.935897435897437e-05, + "loss": 1.737, + "step": 625 + }, + { + "epoch": 0.16069221260815822, + "grad_norm": 2.28125, + "learning_rate": 9.923076923076923e-05, + "loss": 1.7098, + "step": 650 + }, + { + "epoch": 0.1668726823238566, + "grad_norm": 1.71875, + "learning_rate": 9.910256410256411e-05, + "loss": 1.69, + "step": 675 + }, + { + "epoch": 0.173053152039555, + "grad_norm": 1.671875, + "learning_rate": 9.897435897435898e-05, + "loss": 1.6622, + "step": 700 + }, + { + "epoch": 0.1792336217552534, + "grad_norm": 1.8125, + "learning_rate": 9.884615384615386e-05, + "loss": 1.6502, + "step": 725 + }, + { + "epoch": 0.18541409147095178, + "grad_norm": 1.4765625, + "learning_rate": 9.871794871794872e-05, + "loss": 1.629, + "step": 750 + }, + { + "epoch": 0.1915945611866502, + "grad_norm": 1.3984375, + "learning_rate": 9.85897435897436e-05, + "loss": 1.6102, + "step": 775 + }, + { + "epoch": 0.19777503090234858, + "grad_norm": 1.4296875, + "learning_rate": 9.846153846153848e-05, + "loss": 1.5991, + "step": 800 + }, + { + "epoch": 0.20395550061804696, + "grad_norm": 1.5234375, + "learning_rate": 9.833333333333333e-05, + "loss": 1.5884, + "step": 825 + }, + { + "epoch": 0.21013597033374537, + "grad_norm": 1.5, + "learning_rate": 9.820512820512821e-05, + "loss": 1.5704, + "step": 850 + }, + { + "epoch": 0.21631644004944375, + "grad_norm": 1.453125, + "learning_rate": 9.807692307692307e-05, + "loss": 1.5573, + "step": 875 + }, + { + "epoch": 0.22249690976514216, + "grad_norm": 1.4453125, + "learning_rate": 9.794871794871795e-05, + "loss": 1.5416, + "step": 900 + }, + { + "epoch": 0.22867737948084055, + "grad_norm": 1.25, + "learning_rate": 9.782051282051282e-05, + "loss": 1.5409, + "step": 925 + }, + { + "epoch": 0.23485784919653893, + "grad_norm": 1.2421875, + "learning_rate": 9.76923076923077e-05, + "loss": 1.5344, + "step": 950 + }, + { + "epoch": 0.24103831891223734, + "grad_norm": 1.5, + "learning_rate": 9.756410256410257e-05, + "loss": 1.5195, + "step": 975 + }, + { + "epoch": 0.24721878862793573, + "grad_norm": 1.2421875, + "learning_rate": 9.743589743589744e-05, + "loss": 1.5051, + "step": 1000 + }, + { + "epoch": 0.25339925834363414, + "grad_norm": 1.1953125, + "learning_rate": 9.730769230769232e-05, + "loss": 1.505, + "step": 1025 + }, + { + "epoch": 0.2595797280593325, + "grad_norm": 1.109375, + "learning_rate": 9.717948717948718e-05, + "loss": 1.4932, + "step": 1050 + }, + { + "epoch": 0.2657601977750309, + "grad_norm": 1.046875, + "learning_rate": 9.705128205128206e-05, + "loss": 1.4901, + "step": 1075 + }, + { + "epoch": 0.2719406674907293, + "grad_norm": 1.0546875, + "learning_rate": 9.692307692307692e-05, + "loss": 1.4806, + "step": 1100 + }, + { + "epoch": 0.27812113720642767, + "grad_norm": 1.265625, + "learning_rate": 9.67948717948718e-05, + "loss": 1.4745, + "step": 1125 + }, + { + "epoch": 0.2843016069221261, + "grad_norm": 1.3125, + "learning_rate": 9.666666666666667e-05, + "loss": 1.4604, + "step": 1150 + }, + { + "epoch": 0.2904820766378245, + "grad_norm": 1.1640625, + "learning_rate": 9.653846153846155e-05, + "loss": 1.4584, + "step": 1175 + }, + { + "epoch": 0.29666254635352285, + "grad_norm": 1.5234375, + "learning_rate": 9.641025641025641e-05, + "loss": 1.4489, + "step": 1200 + }, + { + "epoch": 0.30284301606922126, + "grad_norm": 1.0546875, + "learning_rate": 9.628205128205129e-05, + "loss": 1.4439, + "step": 1225 + }, + { + "epoch": 0.30902348578491967, + "grad_norm": 1.0234375, + "learning_rate": 9.615384615384617e-05, + "loss": 1.4367, + "step": 1250 + }, + { + "epoch": 0.315203955500618, + "grad_norm": 1.15625, + "learning_rate": 9.602564102564103e-05, + "loss": 1.4392, + "step": 1275 + }, + { + "epoch": 0.32138442521631644, + "grad_norm": 0.91015625, + "learning_rate": 9.589743589743591e-05, + "loss": 1.4281, + "step": 1300 + }, + { + "epoch": 0.32756489493201485, + "grad_norm": 1.0234375, + "learning_rate": 9.576923076923078e-05, + "loss": 1.4251, + "step": 1325 + }, + { + "epoch": 0.3337453646477132, + "grad_norm": 0.9921875, + "learning_rate": 9.564102564102565e-05, + "loss": 1.4222, + "step": 1350 + }, + { + "epoch": 0.3399258343634116, + "grad_norm": 1.0859375, + "learning_rate": 9.551282051282052e-05, + "loss": 1.4259, + "step": 1375 + }, + { + "epoch": 0.34610630407911, + "grad_norm": 0.91796875, + "learning_rate": 9.53846153846154e-05, + "loss": 1.4154, + "step": 1400 + }, + { + "epoch": 0.3522867737948084, + "grad_norm": 1.0, + "learning_rate": 9.525641025641026e-05, + "loss": 1.4074, + "step": 1425 + }, + { + "epoch": 0.3584672435105068, + "grad_norm": 0.92578125, + "learning_rate": 9.512820512820513e-05, + "loss": 1.4051, + "step": 1450 + }, + { + "epoch": 0.3646477132262052, + "grad_norm": 1.0703125, + "learning_rate": 9.5e-05, + "loss": 1.4007, + "step": 1475 + }, + { + "epoch": 0.37082818294190356, + "grad_norm": 0.90234375, + "learning_rate": 9.487179487179487e-05, + "loss": 1.389, + "step": 1500 + }, + { + "epoch": 0.377008652657602, + "grad_norm": 0.9375, + "learning_rate": 9.474358974358975e-05, + "loss": 1.3924, + "step": 1525 + }, + { + "epoch": 0.3831891223733004, + "grad_norm": 0.890625, + "learning_rate": 9.461538461538461e-05, + "loss": 1.3868, + "step": 1550 + }, + { + "epoch": 0.38936959208899874, + "grad_norm": 0.9609375, + "learning_rate": 9.448717948717949e-05, + "loss": 1.3889, + "step": 1575 + }, + { + "epoch": 0.39555006180469715, + "grad_norm": 0.9296875, + "learning_rate": 9.435897435897436e-05, + "loss": 1.3825, + "step": 1600 + }, + { + "epoch": 0.40173053152039556, + "grad_norm": 1.0390625, + "learning_rate": 9.423076923076924e-05, + "loss": 1.3709, + "step": 1625 + }, + { + "epoch": 0.4079110012360939, + "grad_norm": 0.9140625, + "learning_rate": 9.41025641025641e-05, + "loss": 1.3719, + "step": 1650 + }, + { + "epoch": 0.41409147095179233, + "grad_norm": 0.953125, + "learning_rate": 9.397435897435898e-05, + "loss": 1.3718, + "step": 1675 + }, + { + "epoch": 0.42027194066749074, + "grad_norm": 1.0078125, + "learning_rate": 9.384615384615386e-05, + "loss": 1.3673, + "step": 1700 + }, + { + "epoch": 0.4264524103831891, + "grad_norm": 1.171875, + "learning_rate": 9.371794871794872e-05, + "loss": 1.3629, + "step": 1725 + }, + { + "epoch": 0.4326328800988875, + "grad_norm": 0.8125, + "learning_rate": 9.35897435897436e-05, + "loss": 1.3616, + "step": 1750 + }, + { + "epoch": 0.4388133498145859, + "grad_norm": 0.89453125, + "learning_rate": 9.346153846153846e-05, + "loss": 1.3599, + "step": 1775 + }, + { + "epoch": 0.44499381953028433, + "grad_norm": 0.88671875, + "learning_rate": 9.333333333333334e-05, + "loss": 1.3516, + "step": 1800 + }, + { + "epoch": 0.4511742892459827, + "grad_norm": 0.796875, + "learning_rate": 9.320512820512821e-05, + "loss": 1.3552, + "step": 1825 + }, + { + "epoch": 0.4573547589616811, + "grad_norm": 0.89453125, + "learning_rate": 9.307692307692309e-05, + "loss": 1.3448, + "step": 1850 + }, + { + "epoch": 0.4635352286773795, + "grad_norm": 0.94140625, + "learning_rate": 9.294871794871795e-05, + "loss": 1.3492, + "step": 1875 + }, + { + "epoch": 0.46971569839307786, + "grad_norm": 0.85546875, + "learning_rate": 9.282051282051283e-05, + "loss": 1.34, + "step": 1900 + }, + { + "epoch": 0.4758961681087763, + "grad_norm": 0.7734375, + "learning_rate": 9.26923076923077e-05, + "loss": 1.3406, + "step": 1925 + }, + { + "epoch": 0.4820766378244747, + "grad_norm": 0.83203125, + "learning_rate": 9.256410256410257e-05, + "loss": 1.3369, + "step": 1950 + }, + { + "epoch": 0.48825710754017304, + "grad_norm": 0.87109375, + "learning_rate": 9.243589743589745e-05, + "loss": 1.3297, + "step": 1975 + }, + { + "epoch": 0.49443757725587145, + "grad_norm": 0.84375, + "learning_rate": 9.230769230769232e-05, + "loss": 1.3347, + "step": 2000 + }, + { + "epoch": 0.5006180469715699, + "grad_norm": 0.81640625, + "learning_rate": 9.217948717948718e-05, + "loss": 1.3269, + "step": 2025 + }, + { + "epoch": 0.5067985166872683, + "grad_norm": 0.96484375, + "learning_rate": 9.205128205128205e-05, + "loss": 1.3316, + "step": 2050 + }, + { + "epoch": 0.5129789864029666, + "grad_norm": 1.078125, + "learning_rate": 9.192307692307692e-05, + "loss": 1.3212, + "step": 2075 + }, + { + "epoch": 0.519159456118665, + "grad_norm": 0.8203125, + "learning_rate": 9.179487179487179e-05, + "loss": 1.3177, + "step": 2100 + }, + { + "epoch": 0.5253399258343634, + "grad_norm": 0.8515625, + "learning_rate": 9.166666666666667e-05, + "loss": 1.3155, + "step": 2125 + }, + { + "epoch": 0.5315203955500618, + "grad_norm": 0.90234375, + "learning_rate": 9.153846153846155e-05, + "loss": 1.3206, + "step": 2150 + }, + { + "epoch": 0.5377008652657602, + "grad_norm": 0.796875, + "learning_rate": 9.141025641025641e-05, + "loss": 1.3131, + "step": 2175 + }, + { + "epoch": 0.5438813349814586, + "grad_norm": 0.91796875, + "learning_rate": 9.128205128205129e-05, + "loss": 1.3143, + "step": 2200 + }, + { + "epoch": 0.5500618046971569, + "grad_norm": 0.98046875, + "learning_rate": 9.115384615384615e-05, + "loss": 1.3099, + "step": 2225 + }, + { + "epoch": 0.5562422744128553, + "grad_norm": 0.77734375, + "learning_rate": 9.102564102564103e-05, + "loss": 1.3104, + "step": 2250 + }, + { + "epoch": 0.5624227441285538, + "grad_norm": 0.828125, + "learning_rate": 9.08974358974359e-05, + "loss": 1.3026, + "step": 2275 + }, + { + "epoch": 0.5686032138442522, + "grad_norm": 0.75390625, + "learning_rate": 9.076923076923078e-05, + "loss": 1.2985, + "step": 2300 + }, + { + "epoch": 0.5747836835599506, + "grad_norm": 0.80078125, + "learning_rate": 9.064102564102564e-05, + "loss": 1.3109, + "step": 2325 + }, + { + "epoch": 0.580964153275649, + "grad_norm": 0.859375, + "learning_rate": 9.051282051282052e-05, + "loss": 1.3061, + "step": 2350 + }, + { + "epoch": 0.5871446229913473, + "grad_norm": 0.77734375, + "learning_rate": 9.038461538461538e-05, + "loss": 1.3022, + "step": 2375 + }, + { + "epoch": 0.5933250927070457, + "grad_norm": 0.91015625, + "learning_rate": 9.025641025641026e-05, + "loss": 1.3006, + "step": 2400 + }, + { + "epoch": 0.5995055624227441, + "grad_norm": 0.84375, + "learning_rate": 9.012820512820514e-05, + "loss": 1.2938, + "step": 2425 + }, + { + "epoch": 0.6056860321384425, + "grad_norm": 0.93359375, + "learning_rate": 9e-05, + "loss": 1.2968, + "step": 2450 + }, + { + "epoch": 0.6118665018541409, + "grad_norm": 0.8046875, + "learning_rate": 8.987179487179488e-05, + "loss": 1.2872, + "step": 2475 + }, + { + "epoch": 0.6180469715698393, + "grad_norm": 0.828125, + "learning_rate": 8.974358974358975e-05, + "loss": 1.2902, + "step": 2500 + }, + { + "epoch": 0.6242274412855378, + "grad_norm": 0.88671875, + "learning_rate": 8.961538461538463e-05, + "loss": 1.2848, + "step": 2525 + }, + { + "epoch": 0.630407911001236, + "grad_norm": 0.76171875, + "learning_rate": 8.948717948717949e-05, + "loss": 1.2902, + "step": 2550 + }, + { + "epoch": 0.6365883807169345, + "grad_norm": 0.859375, + "learning_rate": 8.935897435897437e-05, + "loss": 1.2917, + "step": 2575 + }, + { + "epoch": 0.6427688504326329, + "grad_norm": 0.76953125, + "learning_rate": 8.923076923076924e-05, + "loss": 1.2882, + "step": 2600 + }, + { + "epoch": 0.6489493201483313, + "grad_norm": 0.84375, + "learning_rate": 8.910256410256411e-05, + "loss": 1.277, + "step": 2625 + }, + { + "epoch": 0.6551297898640297, + "grad_norm": 0.71875, + "learning_rate": 8.897435897435898e-05, + "loss": 1.2832, + "step": 2650 + }, + { + "epoch": 0.6613102595797281, + "grad_norm": 0.7578125, + "learning_rate": 8.884615384615384e-05, + "loss": 1.2822, + "step": 2675 + }, + { + "epoch": 0.6674907292954264, + "grad_norm": 0.7578125, + "learning_rate": 8.871794871794872e-05, + "loss": 1.2809, + "step": 2700 + }, + { + "epoch": 0.6736711990111248, + "grad_norm": 0.859375, + "learning_rate": 8.858974358974359e-05, + "loss": 1.2696, + "step": 2725 + }, + { + "epoch": 0.6798516687268232, + "grad_norm": 0.79296875, + "learning_rate": 8.846153846153847e-05, + "loss": 1.2768, + "step": 2750 + }, + { + "epoch": 0.6860321384425216, + "grad_norm": 0.8515625, + "learning_rate": 8.833333333333333e-05, + "loss": 1.2735, + "step": 2775 + }, + { + "epoch": 0.69221260815822, + "grad_norm": 0.7734375, + "learning_rate": 8.820512820512821e-05, + "loss": 1.2743, + "step": 2800 + }, + { + "epoch": 0.6983930778739185, + "grad_norm": 0.796875, + "learning_rate": 8.807692307692307e-05, + "loss": 1.2766, + "step": 2825 + }, + { + "epoch": 0.7045735475896168, + "grad_norm": 0.81640625, + "learning_rate": 8.794871794871795e-05, + "loss": 1.2693, + "step": 2850 + }, + { + "epoch": 0.7107540173053152, + "grad_norm": 0.83203125, + "learning_rate": 8.782051282051283e-05, + "loss": 1.263, + "step": 2875 + }, + { + "epoch": 0.7169344870210136, + "grad_norm": 0.80859375, + "learning_rate": 8.76923076923077e-05, + "loss": 1.2618, + "step": 2900 + }, + { + "epoch": 0.723114956736712, + "grad_norm": 0.77734375, + "learning_rate": 8.756410256410257e-05, + "loss": 1.2626, + "step": 2925 + }, + { + "epoch": 0.7292954264524104, + "grad_norm": 0.8203125, + "learning_rate": 8.743589743589744e-05, + "loss": 1.2644, + "step": 2950 + }, + { + "epoch": 0.7354758961681088, + "grad_norm": 0.7734375, + "learning_rate": 8.730769230769232e-05, + "loss": 1.2623, + "step": 2975 + }, + { + "epoch": 0.7416563658838071, + "grad_norm": 0.77734375, + "learning_rate": 8.717948717948718e-05, + "loss": 1.2633, + "step": 3000 + }, + { + "epoch": 0.7478368355995055, + "grad_norm": 0.75, + "learning_rate": 8.705128205128206e-05, + "loss": 1.261, + "step": 3025 + }, + { + "epoch": 0.754017305315204, + "grad_norm": 0.7890625, + "learning_rate": 8.692307692307692e-05, + "loss": 1.259, + "step": 3050 + }, + { + "epoch": 0.7601977750309024, + "grad_norm": 0.78515625, + "learning_rate": 8.67948717948718e-05, + "loss": 1.2546, + "step": 3075 + }, + { + "epoch": 0.7663782447466008, + "grad_norm": 0.71875, + "learning_rate": 8.666666666666667e-05, + "loss": 1.2551, + "step": 3100 + }, + { + "epoch": 0.7725587144622992, + "grad_norm": 0.71484375, + "learning_rate": 8.653846153846155e-05, + "loss": 1.2565, + "step": 3125 + }, + { + "epoch": 0.7787391841779975, + "grad_norm": 0.76953125, + "learning_rate": 8.641025641025642e-05, + "loss": 1.2559, + "step": 3150 + }, + { + "epoch": 0.7849196538936959, + "grad_norm": 0.76953125, + "learning_rate": 8.628205128205129e-05, + "loss": 1.253, + "step": 3175 + }, + { + "epoch": 0.7911001236093943, + "grad_norm": 0.83203125, + "learning_rate": 8.615384615384617e-05, + "loss": 1.255, + "step": 3200 + }, + { + "epoch": 0.7972805933250927, + "grad_norm": 0.72265625, + "learning_rate": 8.602564102564103e-05, + "loss": 1.2486, + "step": 3225 + }, + { + "epoch": 0.8034610630407911, + "grad_norm": 0.74609375, + "learning_rate": 8.58974358974359e-05, + "loss": 1.24, + "step": 3250 + }, + { + "epoch": 0.8096415327564895, + "grad_norm": 0.8125, + "learning_rate": 8.576923076923076e-05, + "loss": 1.2466, + "step": 3275 + }, + { + "epoch": 0.8158220024721878, + "grad_norm": 0.765625, + "learning_rate": 8.564102564102564e-05, + "loss": 1.2455, + "step": 3300 + }, + { + "epoch": 0.8220024721878862, + "grad_norm": 0.76171875, + "learning_rate": 8.551282051282052e-05, + "loss": 1.2409, + "step": 3325 + }, + { + "epoch": 0.8281829419035847, + "grad_norm": 0.875, + "learning_rate": 8.538461538461538e-05, + "loss": 1.237, + "step": 3350 + }, + { + "epoch": 0.8343634116192831, + "grad_norm": 0.82421875, + "learning_rate": 8.525641025641026e-05, + "loss": 1.2437, + "step": 3375 + }, + { + "epoch": 0.8405438813349815, + "grad_norm": 0.73828125, + "learning_rate": 8.512820512820513e-05, + "loss": 1.2388, + "step": 3400 + }, + { + "epoch": 0.8467243510506799, + "grad_norm": 0.8671875, + "learning_rate": 8.5e-05, + "loss": 1.238, + "step": 3425 + }, + { + "epoch": 0.8529048207663782, + "grad_norm": 0.76171875, + "learning_rate": 8.487179487179487e-05, + "loss": 1.231, + "step": 3450 + }, + { + "epoch": 0.8590852904820766, + "grad_norm": 0.7421875, + "learning_rate": 8.474358974358975e-05, + "loss": 1.2381, + "step": 3475 + }, + { + "epoch": 0.865265760197775, + "grad_norm": 0.75390625, + "learning_rate": 8.461538461538461e-05, + "loss": 1.2305, + "step": 3500 + }, + { + "epoch": 0.8714462299134734, + "grad_norm": 0.70703125, + "learning_rate": 8.448717948717949e-05, + "loss": 1.2326, + "step": 3525 + }, + { + "epoch": 0.8776266996291718, + "grad_norm": 0.8046875, + "learning_rate": 8.435897435897436e-05, + "loss": 1.2288, + "step": 3550 + }, + { + "epoch": 0.8838071693448702, + "grad_norm": 0.828125, + "learning_rate": 8.423076923076924e-05, + "loss": 1.2268, + "step": 3575 + }, + { + "epoch": 0.8899876390605687, + "grad_norm": 0.7421875, + "learning_rate": 8.410256410256411e-05, + "loss": 1.2314, + "step": 3600 + }, + { + "epoch": 0.896168108776267, + "grad_norm": 0.78515625, + "learning_rate": 8.397435897435898e-05, + "loss": 1.2292, + "step": 3625 + }, + { + "epoch": 0.9023485784919654, + "grad_norm": 0.7109375, + "learning_rate": 8.384615384615386e-05, + "loss": 1.2304, + "step": 3650 + }, + { + "epoch": 0.9085290482076638, + "grad_norm": 0.765625, + "learning_rate": 8.371794871794872e-05, + "loss": 1.2256, + "step": 3675 + }, + { + "epoch": 0.9147095179233622, + "grad_norm": 0.765625, + "learning_rate": 8.35897435897436e-05, + "loss": 1.2268, + "step": 3700 + }, + { + "epoch": 0.9208899876390606, + "grad_norm": 0.78515625, + "learning_rate": 8.346153846153847e-05, + "loss": 1.2287, + "step": 3725 + }, + { + "epoch": 0.927070457354759, + "grad_norm": 0.72265625, + "learning_rate": 8.333333333333334e-05, + "loss": 1.2252, + "step": 3750 + }, + { + "epoch": 0.9332509270704573, + "grad_norm": 0.78515625, + "learning_rate": 8.320512820512821e-05, + "loss": 1.225, + "step": 3775 + }, + { + "epoch": 0.9394313967861557, + "grad_norm": 0.7734375, + "learning_rate": 8.307692307692309e-05, + "loss": 1.2183, + "step": 3800 + }, + { + "epoch": 0.9456118665018541, + "grad_norm": 0.6796875, + "learning_rate": 8.294871794871795e-05, + "loss": 1.2244, + "step": 3825 + }, + { + "epoch": 0.9517923362175525, + "grad_norm": 0.71484375, + "learning_rate": 8.282051282051283e-05, + "loss": 1.2194, + "step": 3850 + }, + { + "epoch": 0.957972805933251, + "grad_norm": 0.75, + "learning_rate": 8.26923076923077e-05, + "loss": 1.2162, + "step": 3875 + }, + { + "epoch": 0.9641532756489494, + "grad_norm": 0.73828125, + "learning_rate": 8.256410256410256e-05, + "loss": 1.2146, + "step": 3900 + }, + { + "epoch": 0.9703337453646477, + "grad_norm": 0.76953125, + "learning_rate": 8.243589743589744e-05, + "loss": 1.2144, + "step": 3925 + }, + { + "epoch": 0.9765142150803461, + "grad_norm": 0.73828125, + "learning_rate": 8.23076923076923e-05, + "loss": 1.2146, + "step": 3950 + }, + { + "epoch": 0.9826946847960445, + "grad_norm": 0.70703125, + "learning_rate": 8.217948717948718e-05, + "loss": 1.2151, + "step": 3975 + }, + { + "epoch": 0.9888751545117429, + "grad_norm": 0.72265625, + "learning_rate": 8.205128205128205e-05, + "loss": 1.2146, + "step": 4000 + }, + { + "epoch": 0.9950556242274413, + "grad_norm": 0.91796875, + "learning_rate": 8.192307692307693e-05, + "loss": 1.211, + "step": 4025 + }, + { + "epoch": 1.0012360939431397, + "grad_norm": 0.88671875, + "learning_rate": 8.179487179487179e-05, + "loss": 1.198, + "step": 4050 + }, + { + "epoch": 1.0074165636588381, + "grad_norm": 0.8046875, + "learning_rate": 8.166666666666667e-05, + "loss": 1.1373, + "step": 4075 + }, + { + "epoch": 1.0135970333745365, + "grad_norm": 0.84375, + "learning_rate": 8.153846153846155e-05, + "loss": 1.1436, + "step": 4100 + }, + { + "epoch": 1.019777503090235, + "grad_norm": 0.7265625, + "learning_rate": 8.141025641025641e-05, + "loss": 1.138, + "step": 4125 + }, + { + "epoch": 1.0259579728059331, + "grad_norm": 0.80078125, + "learning_rate": 8.128205128205129e-05, + "loss": 1.1334, + "step": 4150 + }, + { + "epoch": 1.0321384425216316, + "grad_norm": 0.828125, + "learning_rate": 8.115384615384616e-05, + "loss": 1.1423, + "step": 4175 + }, + { + "epoch": 1.03831891223733, + "grad_norm": 0.75390625, + "learning_rate": 8.102564102564103e-05, + "loss": 1.1398, + "step": 4200 + }, + { + "epoch": 1.0444993819530284, + "grad_norm": 0.7890625, + "learning_rate": 8.08974358974359e-05, + "loss": 1.1378, + "step": 4225 + }, + { + "epoch": 1.0506798516687268, + "grad_norm": 0.76953125, + "learning_rate": 8.076923076923078e-05, + "loss": 1.1376, + "step": 4250 + }, + { + "epoch": 1.0568603213844252, + "grad_norm": 0.80078125, + "learning_rate": 8.064102564102564e-05, + "loss": 1.1443, + "step": 4275 + }, + { + "epoch": 1.0630407911001236, + "grad_norm": 0.828125, + "learning_rate": 8.051282051282052e-05, + "loss": 1.1441, + "step": 4300 + }, + { + "epoch": 1.069221260815822, + "grad_norm": 0.77734375, + "learning_rate": 8.038461538461538e-05, + "loss": 1.1435, + "step": 4325 + }, + { + "epoch": 1.0754017305315204, + "grad_norm": 0.73828125, + "learning_rate": 8.025641025641026e-05, + "loss": 1.1449, + "step": 4350 + }, + { + "epoch": 1.0815822002472189, + "grad_norm": 0.80078125, + "learning_rate": 8.012820512820514e-05, + "loss": 1.1468, + "step": 4375 + }, + { + "epoch": 1.0877626699629173, + "grad_norm": 0.796875, + "learning_rate": 8e-05, + "loss": 1.1402, + "step": 4400 + }, + { + "epoch": 1.0939431396786157, + "grad_norm": 0.82421875, + "learning_rate": 7.987179487179488e-05, + "loss": 1.1405, + "step": 4425 + }, + { + "epoch": 1.100123609394314, + "grad_norm": 0.7421875, + "learning_rate": 7.974358974358975e-05, + "loss": 1.1441, + "step": 4450 + }, + { + "epoch": 1.1063040791100123, + "grad_norm": 0.74609375, + "learning_rate": 7.961538461538461e-05, + "loss": 1.1486, + "step": 4475 + }, + { + "epoch": 1.1124845488257107, + "grad_norm": 0.7890625, + "learning_rate": 7.948717948717948e-05, + "loss": 1.1439, + "step": 4500 + }, + { + "epoch": 1.118665018541409, + "grad_norm": 0.73828125, + "learning_rate": 7.935897435897436e-05, + "loss": 1.1408, + "step": 4525 + }, + { + "epoch": 1.1248454882571075, + "grad_norm": 0.74609375, + "learning_rate": 7.923076923076924e-05, + "loss": 1.1381, + "step": 4550 + }, + { + "epoch": 1.131025957972806, + "grad_norm": 0.77734375, + "learning_rate": 7.91025641025641e-05, + "loss": 1.1441, + "step": 4575 + }, + { + "epoch": 1.1372064276885043, + "grad_norm": 0.80078125, + "learning_rate": 7.897435897435898e-05, + "loss": 1.1377, + "step": 4600 + }, + { + "epoch": 1.1433868974042027, + "grad_norm": 0.7734375, + "learning_rate": 7.884615384615384e-05, + "loss": 1.1405, + "step": 4625 + }, + { + "epoch": 1.1495673671199012, + "grad_norm": 0.734375, + "learning_rate": 7.871794871794872e-05, + "loss": 1.146, + "step": 4650 + }, + { + "epoch": 1.1557478368355996, + "grad_norm": 0.8125, + "learning_rate": 7.858974358974359e-05, + "loss": 1.1416, + "step": 4675 + }, + { + "epoch": 1.161928306551298, + "grad_norm": 0.84375, + "learning_rate": 7.846153846153847e-05, + "loss": 1.1352, + "step": 4700 + }, + { + "epoch": 1.1681087762669964, + "grad_norm": 0.83984375, + "learning_rate": 7.833333333333333e-05, + "loss": 1.1398, + "step": 4725 + }, + { + "epoch": 1.1742892459826946, + "grad_norm": 0.75390625, + "learning_rate": 7.820512820512821e-05, + "loss": 1.1422, + "step": 4750 + }, + { + "epoch": 1.180469715698393, + "grad_norm": 0.72265625, + "learning_rate": 7.807692307692307e-05, + "loss": 1.1416, + "step": 4775 + }, + { + "epoch": 1.1866501854140914, + "grad_norm": 0.73828125, + "learning_rate": 7.794871794871795e-05, + "loss": 1.1416, + "step": 4800 + }, + { + "epoch": 1.1928306551297898, + "grad_norm": 0.80078125, + "learning_rate": 7.782051282051283e-05, + "loss": 1.1387, + "step": 4825 + }, + { + "epoch": 1.1990111248454882, + "grad_norm": 0.7890625, + "learning_rate": 7.76923076923077e-05, + "loss": 1.1372, + "step": 4850 + }, + { + "epoch": 1.2051915945611866, + "grad_norm": 0.75, + "learning_rate": 7.756410256410257e-05, + "loss": 1.1381, + "step": 4875 + }, + { + "epoch": 1.211372064276885, + "grad_norm": 0.72265625, + "learning_rate": 7.743589743589744e-05, + "loss": 1.1342, + "step": 4900 + }, + { + "epoch": 1.2175525339925835, + "grad_norm": 0.71875, + "learning_rate": 7.730769230769232e-05, + "loss": 1.1355, + "step": 4925 + }, + { + "epoch": 1.2237330037082819, + "grad_norm": 0.75390625, + "learning_rate": 7.717948717948718e-05, + "loss": 1.1406, + "step": 4950 + }, + { + "epoch": 1.2299134734239803, + "grad_norm": 0.78125, + "learning_rate": 7.705128205128206e-05, + "loss": 1.1411, + "step": 4975 + }, + { + "epoch": 1.2360939431396787, + "grad_norm": 0.7578125, + "learning_rate": 7.692307692307693e-05, + "loss": 1.135, + "step": 5000 + }, + { + "epoch": 1.2360939431396787, + "eval_loss": 1.048352837562561, + "eval_runtime": 1.5386, + "eval_samples_per_second": 415.323, + "eval_steps_per_second": 1.95, + "step": 5000 + }, + { + "epoch": 1.242274412855377, + "grad_norm": 0.8203125, + "learning_rate": 7.67948717948718e-05, + "loss": 1.1348, + "step": 5025 + }, + { + "epoch": 1.2484548825710755, + "grad_norm": 0.7578125, + "learning_rate": 7.666666666666667e-05, + "loss": 1.1338, + "step": 5050 + }, + { + "epoch": 1.254635352286774, + "grad_norm": 0.8359375, + "learning_rate": 7.653846153846153e-05, + "loss": 1.1291, + "step": 5075 + }, + { + "epoch": 1.260815822002472, + "grad_norm": 0.7734375, + "learning_rate": 7.641025641025641e-05, + "loss": 1.1325, + "step": 5100 + }, + { + "epoch": 1.2669962917181705, + "grad_norm": 0.7265625, + "learning_rate": 7.628205128205128e-05, + "loss": 1.1359, + "step": 5125 + }, + { + "epoch": 1.273176761433869, + "grad_norm": 0.78125, + "learning_rate": 7.615384615384616e-05, + "loss": 1.1337, + "step": 5150 + }, + { + "epoch": 1.2793572311495673, + "grad_norm": 0.7890625, + "learning_rate": 7.602564102564102e-05, + "loss": 1.132, + "step": 5175 + }, + { + "epoch": 1.2855377008652658, + "grad_norm": 0.734375, + "learning_rate": 7.58974358974359e-05, + "loss": 1.1308, + "step": 5200 + }, + { + "epoch": 1.2917181705809642, + "grad_norm": 0.75, + "learning_rate": 7.576923076923076e-05, + "loss": 1.1291, + "step": 5225 + }, + { + "epoch": 1.2978986402966626, + "grad_norm": 0.7109375, + "learning_rate": 7.564102564102564e-05, + "loss": 1.1248, + "step": 5250 + }, + { + "epoch": 1.304079110012361, + "grad_norm": 0.78515625, + "learning_rate": 7.551282051282052e-05, + "loss": 1.1262, + "step": 5275 + }, + { + "epoch": 1.3102595797280594, + "grad_norm": 0.78125, + "learning_rate": 7.538461538461539e-05, + "loss": 1.1246, + "step": 5300 + }, + { + "epoch": 1.3164400494437576, + "grad_norm": 0.75390625, + "learning_rate": 7.525641025641026e-05, + "loss": 1.1234, + "step": 5325 + }, + { + "epoch": 1.322620519159456, + "grad_norm": 0.734375, + "learning_rate": 7.512820512820513e-05, + "loss": 1.1252, + "step": 5350 + }, + { + "epoch": 1.3288009888751544, + "grad_norm": 0.78125, + "learning_rate": 7.500000000000001e-05, + "loss": 1.1288, + "step": 5375 + }, + { + "epoch": 1.3349814585908528, + "grad_norm": 0.7265625, + "learning_rate": 7.487179487179487e-05, + "loss": 1.1227, + "step": 5400 + }, + { + "epoch": 1.3411619283065512, + "grad_norm": 0.72265625, + "learning_rate": 7.474358974358975e-05, + "loss": 1.1233, + "step": 5425 + }, + { + "epoch": 1.3473423980222496, + "grad_norm": 0.73828125, + "learning_rate": 7.461538461538462e-05, + "loss": 1.1233, + "step": 5450 + }, + { + "epoch": 1.353522867737948, + "grad_norm": 0.859375, + "learning_rate": 7.44871794871795e-05, + "loss": 1.1248, + "step": 5475 + }, + { + "epoch": 1.3597033374536465, + "grad_norm": 0.734375, + "learning_rate": 7.435897435897436e-05, + "loss": 1.1264, + "step": 5500 + }, + { + "epoch": 1.3658838071693449, + "grad_norm": 0.703125, + "learning_rate": 7.423076923076924e-05, + "loss": 1.1215, + "step": 5525 + }, + { + "epoch": 1.3720642768850433, + "grad_norm": 0.69921875, + "learning_rate": 7.410256410256412e-05, + "loss": 1.1237, + "step": 5550 + }, + { + "epoch": 1.3782447466007417, + "grad_norm": 0.734375, + "learning_rate": 7.397435897435898e-05, + "loss": 1.1212, + "step": 5575 + }, + { + "epoch": 1.38442521631644, + "grad_norm": 0.7578125, + "learning_rate": 7.384615384615386e-05, + "loss": 1.118, + "step": 5600 + }, + { + "epoch": 1.3906056860321385, + "grad_norm": 0.73828125, + "learning_rate": 7.371794871794872e-05, + "loss": 1.1163, + "step": 5625 + }, + { + "epoch": 1.396786155747837, + "grad_norm": 0.73046875, + "learning_rate": 7.35897435897436e-05, + "loss": 1.1168, + "step": 5650 + }, + { + "epoch": 1.4029666254635353, + "grad_norm": 0.66796875, + "learning_rate": 7.346153846153847e-05, + "loss": 1.118, + "step": 5675 + }, + { + "epoch": 1.4091470951792338, + "grad_norm": 0.79296875, + "learning_rate": 7.333333333333333e-05, + "loss": 1.1153, + "step": 5700 + }, + { + "epoch": 1.415327564894932, + "grad_norm": 0.75390625, + "learning_rate": 7.320512820512821e-05, + "loss": 1.1237, + "step": 5725 + }, + { + "epoch": 1.4215080346106304, + "grad_norm": 0.8828125, + "learning_rate": 7.307692307692307e-05, + "loss": 1.1225, + "step": 5750 + }, + { + "epoch": 1.4276885043263288, + "grad_norm": 0.765625, + "learning_rate": 7.294871794871795e-05, + "loss": 1.1224, + "step": 5775 + }, + { + "epoch": 1.4338689740420272, + "grad_norm": 0.76171875, + "learning_rate": 7.282051282051282e-05, + "loss": 1.1187, + "step": 5800 + }, + { + "epoch": 1.4400494437577256, + "grad_norm": 0.703125, + "learning_rate": 7.26923076923077e-05, + "loss": 1.1186, + "step": 5825 + }, + { + "epoch": 1.446229913473424, + "grad_norm": 0.79296875, + "learning_rate": 7.256410256410256e-05, + "loss": 1.1164, + "step": 5850 + }, + { + "epoch": 1.4524103831891224, + "grad_norm": 0.76953125, + "learning_rate": 7.243589743589744e-05, + "loss": 1.1194, + "step": 5875 + }, + { + "epoch": 1.4585908529048208, + "grad_norm": 0.7734375, + "learning_rate": 7.23076923076923e-05, + "loss": 1.1164, + "step": 5900 + }, + { + "epoch": 1.4647713226205192, + "grad_norm": 0.7109375, + "learning_rate": 7.217948717948718e-05, + "loss": 1.1159, + "step": 5925 + }, + { + "epoch": 1.4709517923362174, + "grad_norm": 0.75390625, + "learning_rate": 7.205128205128205e-05, + "loss": 1.1184, + "step": 5950 + }, + { + "epoch": 1.4771322620519158, + "grad_norm": 0.734375, + "learning_rate": 7.192307692307693e-05, + "loss": 1.1136, + "step": 5975 + }, + { + "epoch": 1.4833127317676142, + "grad_norm": 0.77734375, + "learning_rate": 7.17948717948718e-05, + "loss": 1.1086, + "step": 6000 + }, + { + "epoch": 1.4894932014833127, + "grad_norm": 0.78125, + "learning_rate": 7.166666666666667e-05, + "loss": 1.1098, + "step": 6025 + }, + { + "epoch": 1.495673671199011, + "grad_norm": 0.8046875, + "learning_rate": 7.153846153846155e-05, + "loss": 1.1132, + "step": 6050 + }, + { + "epoch": 1.5018541409147095, + "grad_norm": 0.73828125, + "learning_rate": 7.141025641025641e-05, + "loss": 1.1044, + "step": 6075 + }, + { + "epoch": 1.508034610630408, + "grad_norm": 0.74609375, + "learning_rate": 7.128205128205129e-05, + "loss": 1.1081, + "step": 6100 + }, + { + "epoch": 1.5142150803461063, + "grad_norm": 0.74609375, + "learning_rate": 7.115384615384616e-05, + "loss": 1.1109, + "step": 6125 + }, + { + "epoch": 1.5203955500618047, + "grad_norm": 0.78125, + "learning_rate": 7.102564102564103e-05, + "loss": 1.1085, + "step": 6150 + }, + { + "epoch": 1.5265760197775031, + "grad_norm": 0.80859375, + "learning_rate": 7.08974358974359e-05, + "loss": 1.1143, + "step": 6175 + }, + { + "epoch": 1.5327564894932015, + "grad_norm": 0.73046875, + "learning_rate": 7.076923076923078e-05, + "loss": 1.106, + "step": 6200 + }, + { + "epoch": 1.5389369592089, + "grad_norm": 0.7421875, + "learning_rate": 7.064102564102564e-05, + "loss": 1.1094, + "step": 6225 + }, + { + "epoch": 1.5451174289245984, + "grad_norm": 0.8203125, + "learning_rate": 7.051282051282052e-05, + "loss": 1.1063, + "step": 6250 + }, + { + "epoch": 1.5512978986402968, + "grad_norm": 0.7265625, + "learning_rate": 7.03846153846154e-05, + "loss": 1.105, + "step": 6275 + }, + { + "epoch": 1.5574783683559952, + "grad_norm": 0.7421875, + "learning_rate": 7.025641025641025e-05, + "loss": 1.1063, + "step": 6300 + }, + { + "epoch": 1.5636588380716936, + "grad_norm": 0.7421875, + "learning_rate": 7.012820512820513e-05, + "loss": 1.1029, + "step": 6325 + }, + { + "epoch": 1.569839307787392, + "grad_norm": 0.78515625, + "learning_rate": 7e-05, + "loss": 1.105, + "step": 6350 + }, + { + "epoch": 1.5760197775030902, + "grad_norm": 0.75, + "learning_rate": 6.987179487179487e-05, + "loss": 1.1036, + "step": 6375 + }, + { + "epoch": 1.5822002472187886, + "grad_norm": 0.73828125, + "learning_rate": 6.974358974358974e-05, + "loss": 1.1072, + "step": 6400 + }, + { + "epoch": 1.588380716934487, + "grad_norm": 0.74609375, + "learning_rate": 6.961538461538462e-05, + "loss": 1.098, + "step": 6425 + }, + { + "epoch": 1.5945611866501854, + "grad_norm": 0.7421875, + "learning_rate": 6.94871794871795e-05, + "loss": 1.101, + "step": 6450 + }, + { + "epoch": 1.6007416563658838, + "grad_norm": 0.78515625, + "learning_rate": 6.935897435897436e-05, + "loss": 1.1008, + "step": 6475 + }, + { + "epoch": 1.6069221260815822, + "grad_norm": 0.796875, + "learning_rate": 6.923076923076924e-05, + "loss": 1.1, + "step": 6500 + }, + { + "epoch": 1.6131025957972804, + "grad_norm": 0.7265625, + "learning_rate": 6.91025641025641e-05, + "loss": 1.0997, + "step": 6525 + }, + { + "epoch": 1.6192830655129788, + "grad_norm": 0.7890625, + "learning_rate": 6.897435897435898e-05, + "loss": 1.1005, + "step": 6550 + }, + { + "epoch": 1.6254635352286773, + "grad_norm": 0.7421875, + "learning_rate": 6.884615384615385e-05, + "loss": 1.0953, + "step": 6575 + }, + { + "epoch": 1.6316440049443757, + "grad_norm": 0.71875, + "learning_rate": 6.871794871794872e-05, + "loss": 1.0999, + "step": 6600 + }, + { + "epoch": 1.637824474660074, + "grad_norm": 0.7265625, + "learning_rate": 6.858974358974359e-05, + "loss": 1.0965, + "step": 6625 + }, + { + "epoch": 1.6440049443757725, + "grad_norm": 0.73828125, + "learning_rate": 6.846153846153847e-05, + "loss": 1.098, + "step": 6650 + }, + { + "epoch": 1.650185414091471, + "grad_norm": 0.82421875, + "learning_rate": 6.833333333333333e-05, + "loss": 1.09, + "step": 6675 + }, + { + "epoch": 1.6563658838071693, + "grad_norm": 0.828125, + "learning_rate": 6.820512820512821e-05, + "loss": 1.0992, + "step": 6700 + }, + { + "epoch": 1.6625463535228677, + "grad_norm": 0.78515625, + "learning_rate": 6.807692307692309e-05, + "loss": 1.0948, + "step": 6725 + }, + { + "epoch": 1.6687268232385661, + "grad_norm": 0.75, + "learning_rate": 6.794871794871795e-05, + "loss": 1.098, + "step": 6750 + }, + { + "epoch": 1.6749072929542645, + "grad_norm": 0.75, + "learning_rate": 6.782051282051283e-05, + "loss": 1.0915, + "step": 6775 + }, + { + "epoch": 1.681087762669963, + "grad_norm": 0.75, + "learning_rate": 6.76923076923077e-05, + "loss": 1.0943, + "step": 6800 + }, + { + "epoch": 1.6872682323856614, + "grad_norm": 0.6953125, + "learning_rate": 6.756410256410258e-05, + "loss": 1.0913, + "step": 6825 + }, + { + "epoch": 1.6934487021013598, + "grad_norm": 0.7265625, + "learning_rate": 6.743589743589744e-05, + "loss": 1.092, + "step": 6850 + }, + { + "epoch": 1.6996291718170582, + "grad_norm": 0.8125, + "learning_rate": 6.730769230769232e-05, + "loss": 1.0904, + "step": 6875 + }, + { + "epoch": 1.7058096415327566, + "grad_norm": 0.81640625, + "learning_rate": 6.717948717948718e-05, + "loss": 1.091, + "step": 6900 + }, + { + "epoch": 1.711990111248455, + "grad_norm": 0.7421875, + "learning_rate": 6.705128205128205e-05, + "loss": 1.0901, + "step": 6925 + }, + { + "epoch": 1.7181705809641534, + "grad_norm": 0.765625, + "learning_rate": 6.692307692307693e-05, + "loss": 1.0885, + "step": 6950 + }, + { + "epoch": 1.7243510506798516, + "grad_norm": 0.74609375, + "learning_rate": 6.679487179487179e-05, + "loss": 1.0897, + "step": 6975 + }, + { + "epoch": 1.73053152039555, + "grad_norm": 0.83203125, + "learning_rate": 6.666666666666667e-05, + "loss": 1.0888, + "step": 7000 + }, + { + "epoch": 1.7367119901112484, + "grad_norm": 0.7421875, + "learning_rate": 6.653846153846153e-05, + "loss": 1.0887, + "step": 7025 + }, + { + "epoch": 1.7428924598269468, + "grad_norm": 0.7421875, + "learning_rate": 6.641025641025641e-05, + "loss": 1.0857, + "step": 7050 + }, + { + "epoch": 1.7490729295426453, + "grad_norm": 0.8359375, + "learning_rate": 6.628205128205128e-05, + "loss": 1.089, + "step": 7075 + }, + { + "epoch": 1.7552533992583437, + "grad_norm": 0.75390625, + "learning_rate": 6.615384615384616e-05, + "loss": 1.0866, + "step": 7100 + }, + { + "epoch": 1.7614338689740419, + "grad_norm": 0.73046875, + "learning_rate": 6.602564102564102e-05, + "loss": 1.0871, + "step": 7125 + }, + { + "epoch": 1.7676143386897403, + "grad_norm": 0.75, + "learning_rate": 6.58974358974359e-05, + "loss": 1.0779, + "step": 7150 + }, + { + "epoch": 1.7737948084054387, + "grad_norm": 0.89453125, + "learning_rate": 6.576923076923078e-05, + "loss": 1.0852, + "step": 7175 + }, + { + "epoch": 1.779975278121137, + "grad_norm": 0.7890625, + "learning_rate": 6.564102564102564e-05, + "loss": 1.0836, + "step": 7200 + }, + { + "epoch": 1.7861557478368355, + "grad_norm": 0.78515625, + "learning_rate": 6.551282051282052e-05, + "loss": 1.0843, + "step": 7225 + }, + { + "epoch": 1.792336217552534, + "grad_norm": 0.7265625, + "learning_rate": 6.538461538461539e-05, + "loss": 1.0802, + "step": 7250 + }, + { + "epoch": 1.7985166872682323, + "grad_norm": 0.7421875, + "learning_rate": 6.525641025641026e-05, + "loss": 1.083, + "step": 7275 + }, + { + "epoch": 1.8046971569839307, + "grad_norm": 0.79296875, + "learning_rate": 6.512820512820513e-05, + "loss": 1.0798, + "step": 7300 + }, + { + "epoch": 1.8108776266996292, + "grad_norm": 0.72265625, + "learning_rate": 6.500000000000001e-05, + "loss": 1.0823, + "step": 7325 + }, + { + "epoch": 1.8170580964153276, + "grad_norm": 0.734375, + "learning_rate": 6.487179487179487e-05, + "loss": 1.0814, + "step": 7350 + }, + { + "epoch": 1.823238566131026, + "grad_norm": 0.78125, + "learning_rate": 6.474358974358975e-05, + "loss": 1.0827, + "step": 7375 + }, + { + "epoch": 1.8294190358467244, + "grad_norm": 0.71484375, + "learning_rate": 6.461538461538462e-05, + "loss": 1.0827, + "step": 7400 + }, + { + "epoch": 1.8355995055624228, + "grad_norm": 0.78125, + "learning_rate": 6.44871794871795e-05, + "loss": 1.0799, + "step": 7425 + }, + { + "epoch": 1.8417799752781212, + "grad_norm": 0.71484375, + "learning_rate": 6.435897435897437e-05, + "loss": 1.0783, + "step": 7450 + }, + { + "epoch": 1.8479604449938196, + "grad_norm": 0.7890625, + "learning_rate": 6.423076923076924e-05, + "loss": 1.0758, + "step": 7475 + }, + { + "epoch": 1.854140914709518, + "grad_norm": 0.78515625, + "learning_rate": 6.410256410256412e-05, + "loss": 1.078, + "step": 7500 + }, + { + "epoch": 1.8603213844252164, + "grad_norm": 0.7421875, + "learning_rate": 6.397435897435897e-05, + "loss": 1.0715, + "step": 7525 + }, + { + "epoch": 1.8665018541409149, + "grad_norm": 0.734375, + "learning_rate": 6.384615384615385e-05, + "loss": 1.0761, + "step": 7550 + }, + { + "epoch": 1.8726823238566133, + "grad_norm": 0.7734375, + "learning_rate": 6.371794871794871e-05, + "loss": 1.076, + "step": 7575 + }, + { + "epoch": 1.8788627935723115, + "grad_norm": 0.85546875, + "learning_rate": 6.358974358974359e-05, + "loss": 1.0732, + "step": 7600 + }, + { + "epoch": 1.8850432632880099, + "grad_norm": 0.76953125, + "learning_rate": 6.346153846153847e-05, + "loss": 1.073, + "step": 7625 + }, + { + "epoch": 1.8912237330037083, + "grad_norm": 0.75, + "learning_rate": 6.333333333333333e-05, + "loss": 1.07, + "step": 7650 + }, + { + "epoch": 1.8974042027194067, + "grad_norm": 0.74609375, + "learning_rate": 6.320512820512821e-05, + "loss": 1.0739, + "step": 7675 + }, + { + "epoch": 1.903584672435105, + "grad_norm": 0.80078125, + "learning_rate": 6.307692307692308e-05, + "loss": 1.071, + "step": 7700 + }, + { + "epoch": 1.9097651421508035, + "grad_norm": 0.77734375, + "learning_rate": 6.294871794871795e-05, + "loss": 1.0706, + "step": 7725 + }, + { + "epoch": 1.9159456118665017, + "grad_norm": 0.734375, + "learning_rate": 6.282051282051282e-05, + "loss": 1.0725, + "step": 7750 + }, + { + "epoch": 1.9221260815822, + "grad_norm": 0.72265625, + "learning_rate": 6.26923076923077e-05, + "loss": 1.0666, + "step": 7775 + }, + { + "epoch": 1.9283065512978985, + "grad_norm": 0.7890625, + "learning_rate": 6.256410256410256e-05, + "loss": 1.0661, + "step": 7800 + }, + { + "epoch": 1.934487021013597, + "grad_norm": 0.71484375, + "learning_rate": 6.243589743589744e-05, + "loss": 1.0677, + "step": 7825 + }, + { + "epoch": 1.9406674907292953, + "grad_norm": 0.734375, + "learning_rate": 6.23076923076923e-05, + "loss": 1.0683, + "step": 7850 + }, + { + "epoch": 1.9468479604449938, + "grad_norm": 0.80859375, + "learning_rate": 6.217948717948718e-05, + "loss": 1.0676, + "step": 7875 + }, + { + "epoch": 1.9530284301606922, + "grad_norm": 0.78125, + "learning_rate": 6.205128205128206e-05, + "loss": 1.068, + "step": 7900 + }, + { + "epoch": 1.9592088998763906, + "grad_norm": 0.75390625, + "learning_rate": 6.192307692307693e-05, + "loss": 1.0662, + "step": 7925 + }, + { + "epoch": 1.965389369592089, + "grad_norm": 0.84375, + "learning_rate": 6.17948717948718e-05, + "loss": 1.0653, + "step": 7950 + }, + { + "epoch": 1.9715698393077874, + "grad_norm": 0.80859375, + "learning_rate": 6.166666666666667e-05, + "loss": 1.0629, + "step": 7975 + }, + { + "epoch": 1.9777503090234858, + "grad_norm": 0.7265625, + "learning_rate": 6.153846153846155e-05, + "loss": 1.0669, + "step": 8000 + }, + { + "epoch": 1.9839307787391842, + "grad_norm": 0.71875, + "learning_rate": 6.141025641025641e-05, + "loss": 1.0648, + "step": 8025 + }, + { + "epoch": 1.9901112484548826, + "grad_norm": 0.734375, + "learning_rate": 6.128205128205129e-05, + "loss": 1.0646, + "step": 8050 + }, + { + "epoch": 1.996291718170581, + "grad_norm": 0.8125, + "learning_rate": 6.115384615384616e-05, + "loss": 1.0591, + "step": 8075 + }, + { + "epoch": 2.0024721878862795, + "grad_norm": 0.79296875, + "learning_rate": 6.1025641025641035e-05, + "loss": 1.0239, + "step": 8100 + }, + { + "epoch": 2.008652657601978, + "grad_norm": 0.73046875, + "learning_rate": 6.089743589743589e-05, + "loss": 0.9635, + "step": 8125 + }, + { + "epoch": 2.0148331273176763, + "grad_norm": 0.8828125, + "learning_rate": 6.0769230769230765e-05, + "loss": 0.9628, + "step": 8150 + }, + { + "epoch": 2.0210135970333747, + "grad_norm": 0.78125, + "learning_rate": 6.0641025641025637e-05, + "loss": 0.9707, + "step": 8175 + }, + { + "epoch": 2.027194066749073, + "grad_norm": 0.78125, + "learning_rate": 6.0512820512820515e-05, + "loss": 0.9658, + "step": 8200 + }, + { + "epoch": 2.0333745364647715, + "grad_norm": 0.79296875, + "learning_rate": 6.038461538461539e-05, + "loss": 0.9695, + "step": 8225 + }, + { + "epoch": 2.03955500618047, + "grad_norm": 0.8046875, + "learning_rate": 6.025641025641026e-05, + "loss": 0.9714, + "step": 8250 + }, + { + "epoch": 2.0457354758961683, + "grad_norm": 0.78515625, + "learning_rate": 6.012820512820513e-05, + "loss": 0.973, + "step": 8275 + }, + { + "epoch": 2.0519159456118663, + "grad_norm": 0.80078125, + "learning_rate": 6e-05, + "loss": 0.9738, + "step": 8300 + }, + { + "epoch": 2.0580964153275647, + "grad_norm": 0.7734375, + "learning_rate": 5.987179487179487e-05, + "loss": 0.9714, + "step": 8325 + }, + { + "epoch": 2.064276885043263, + "grad_norm": 0.76953125, + "learning_rate": 5.9743589743589745e-05, + "loss": 0.9718, + "step": 8350 + }, + { + "epoch": 2.0704573547589615, + "grad_norm": 0.82421875, + "learning_rate": 5.9615384615384616e-05, + "loss": 0.9768, + "step": 8375 + }, + { + "epoch": 2.07663782447466, + "grad_norm": 0.81640625, + "learning_rate": 5.948717948717949e-05, + "loss": 0.9756, + "step": 8400 + }, + { + "epoch": 2.0828182941903584, + "grad_norm": 0.8125, + "learning_rate": 5.935897435897436e-05, + "loss": 0.9768, + "step": 8425 + }, + { + "epoch": 2.0889987639060568, + "grad_norm": 0.8046875, + "learning_rate": 5.923076923076923e-05, + "loss": 0.9755, + "step": 8450 + }, + { + "epoch": 2.095179233621755, + "grad_norm": 0.77734375, + "learning_rate": 5.910256410256411e-05, + "loss": 0.9758, + "step": 8475 + }, + { + "epoch": 2.1013597033374536, + "grad_norm": 0.8125, + "learning_rate": 5.897435897435898e-05, + "loss": 0.9762, + "step": 8500 + }, + { + "epoch": 2.107540173053152, + "grad_norm": 0.75, + "learning_rate": 5.884615384615385e-05, + "loss": 0.9774, + "step": 8525 + }, + { + "epoch": 2.1137206427688504, + "grad_norm": 0.76953125, + "learning_rate": 5.8717948717948725e-05, + "loss": 0.9795, + "step": 8550 + }, + { + "epoch": 2.119901112484549, + "grad_norm": 0.796875, + "learning_rate": 5.8589743589743596e-05, + "loss": 0.9762, + "step": 8575 + }, + { + "epoch": 2.1260815822002472, + "grad_norm": 0.79296875, + "learning_rate": 5.846153846153847e-05, + "loss": 0.9808, + "step": 8600 + }, + { + "epoch": 2.1322620519159456, + "grad_norm": 0.88671875, + "learning_rate": 5.833333333333334e-05, + "loss": 0.9781, + "step": 8625 + }, + { + "epoch": 2.138442521631644, + "grad_norm": 0.76953125, + "learning_rate": 5.820512820512821e-05, + "loss": 0.9772, + "step": 8650 + }, + { + "epoch": 2.1446229913473425, + "grad_norm": 0.79296875, + "learning_rate": 5.807692307692308e-05, + "loss": 0.9774, + "step": 8675 + }, + { + "epoch": 2.150803461063041, + "grad_norm": 0.8125, + "learning_rate": 5.7948717948717954e-05, + "loss": 0.9764, + "step": 8700 + }, + { + "epoch": 2.1569839307787393, + "grad_norm": 0.8046875, + "learning_rate": 5.7820512820512826e-05, + "loss": 0.9781, + "step": 8725 + }, + { + "epoch": 2.1631644004944377, + "grad_norm": 0.81640625, + "learning_rate": 5.769230769230769e-05, + "loss": 0.9748, + "step": 8750 + }, + { + "epoch": 2.169344870210136, + "grad_norm": 0.77734375, + "learning_rate": 5.756410256410256e-05, + "loss": 0.9762, + "step": 8775 + }, + { + "epoch": 2.1755253399258345, + "grad_norm": 0.79296875, + "learning_rate": 5.7435897435897434e-05, + "loss": 0.9789, + "step": 8800 + }, + { + "epoch": 2.181705809641533, + "grad_norm": 0.82421875, + "learning_rate": 5.7307692307692306e-05, + "loss": 0.9764, + "step": 8825 + }, + { + "epoch": 2.1878862793572313, + "grad_norm": 0.78515625, + "learning_rate": 5.717948717948718e-05, + "loss": 0.9767, + "step": 8850 + }, + { + "epoch": 2.1940667490729293, + "grad_norm": 0.76171875, + "learning_rate": 5.705128205128205e-05, + "loss": 0.9798, + "step": 8875 + }, + { + "epoch": 2.200247218788628, + "grad_norm": 0.7890625, + "learning_rate": 5.692307692307692e-05, + "loss": 0.9766, + "step": 8900 + }, + { + "epoch": 2.206427688504326, + "grad_norm": 0.8515625, + "learning_rate": 5.679487179487179e-05, + "loss": 0.9814, + "step": 8925 + }, + { + "epoch": 2.2126081582200245, + "grad_norm": 0.8203125, + "learning_rate": 5.666666666666667e-05, + "loss": 0.9784, + "step": 8950 + }, + { + "epoch": 2.218788627935723, + "grad_norm": 0.8515625, + "learning_rate": 5.653846153846154e-05, + "loss": 0.9749, + "step": 8975 + }, + { + "epoch": 2.2249690976514214, + "grad_norm": 0.8046875, + "learning_rate": 5.6410256410256414e-05, + "loss": 0.9814, + "step": 9000 + }, + { + "epoch": 2.23114956736712, + "grad_norm": 0.7578125, + "learning_rate": 5.6282051282051286e-05, + "loss": 0.9805, + "step": 9025 + }, + { + "epoch": 2.237330037082818, + "grad_norm": 0.80859375, + "learning_rate": 5.615384615384616e-05, + "loss": 0.9749, + "step": 9050 + }, + { + "epoch": 2.2435105067985166, + "grad_norm": 0.76171875, + "learning_rate": 5.602564102564103e-05, + "loss": 0.9747, + "step": 9075 + }, + { + "epoch": 2.249690976514215, + "grad_norm": 0.78515625, + "learning_rate": 5.58974358974359e-05, + "loss": 0.9755, + "step": 9100 + }, + { + "epoch": 2.2558714462299134, + "grad_norm": 0.78515625, + "learning_rate": 5.576923076923077e-05, + "loss": 0.9767, + "step": 9125 + }, + { + "epoch": 2.262051915945612, + "grad_norm": 0.77734375, + "learning_rate": 5.5641025641025644e-05, + "loss": 0.9799, + "step": 9150 + }, + { + "epoch": 2.2682323856613102, + "grad_norm": 0.75390625, + "learning_rate": 5.5512820512820515e-05, + "loss": 0.9807, + "step": 9175 + }, + { + "epoch": 2.2744128553770087, + "grad_norm": 0.78125, + "learning_rate": 5.538461538461539e-05, + "loss": 0.9765, + "step": 9200 + }, + { + "epoch": 2.280593325092707, + "grad_norm": 0.80078125, + "learning_rate": 5.5256410256410265e-05, + "loss": 0.9767, + "step": 9225 + }, + { + "epoch": 2.2867737948084055, + "grad_norm": 0.79296875, + "learning_rate": 5.512820512820514e-05, + "loss": 0.9751, + "step": 9250 + }, + { + "epoch": 2.292954264524104, + "grad_norm": 0.78515625, + "learning_rate": 5.500000000000001e-05, + "loss": 0.9749, + "step": 9275 + }, + { + "epoch": 2.2991347342398023, + "grad_norm": 0.76953125, + "learning_rate": 5.487179487179488e-05, + "loss": 0.975, + "step": 9300 + }, + { + "epoch": 2.3053152039555007, + "grad_norm": 0.78125, + "learning_rate": 5.474358974358975e-05, + "loss": 0.9754, + "step": 9325 + }, + { + "epoch": 2.311495673671199, + "grad_norm": 0.796875, + "learning_rate": 5.461538461538461e-05, + "loss": 0.9721, + "step": 9350 + }, + { + "epoch": 2.3176761433868975, + "grad_norm": 0.76953125, + "learning_rate": 5.448717948717948e-05, + "loss": 0.9739, + "step": 9375 + }, + { + "epoch": 2.323856613102596, + "grad_norm": 0.74609375, + "learning_rate": 5.435897435897436e-05, + "loss": 0.9759, + "step": 9400 + }, + { + "epoch": 2.3300370828182944, + "grad_norm": 0.75390625, + "learning_rate": 5.423076923076923e-05, + "loss": 0.9716, + "step": 9425 + }, + { + "epoch": 2.3362175525339928, + "grad_norm": 0.7734375, + "learning_rate": 5.41025641025641e-05, + "loss": 0.9757, + "step": 9450 + }, + { + "epoch": 2.342398022249691, + "grad_norm": 0.73046875, + "learning_rate": 5.3974358974358975e-05, + "loss": 0.9723, + "step": 9475 + }, + { + "epoch": 2.348578491965389, + "grad_norm": 0.80078125, + "learning_rate": 5.384615384615385e-05, + "loss": 0.973, + "step": 9500 + }, + { + "epoch": 2.354758961681088, + "grad_norm": 0.77734375, + "learning_rate": 5.371794871794872e-05, + "loss": 0.9721, + "step": 9525 + }, + { + "epoch": 2.360939431396786, + "grad_norm": 0.76171875, + "learning_rate": 5.358974358974359e-05, + "loss": 0.9762, + "step": 9550 + }, + { + "epoch": 2.3671199011124844, + "grad_norm": 0.81640625, + "learning_rate": 5.346153846153846e-05, + "loss": 0.9781, + "step": 9575 + }, + { + "epoch": 2.373300370828183, + "grad_norm": 0.8203125, + "learning_rate": 5.333333333333333e-05, + "loss": 0.9767, + "step": 9600 + }, + { + "epoch": 2.379480840543881, + "grad_norm": 0.7578125, + "learning_rate": 5.3205128205128205e-05, + "loss": 0.974, + "step": 9625 + }, + { + "epoch": 2.3856613102595796, + "grad_norm": 0.8046875, + "learning_rate": 5.3076923076923076e-05, + "loss": 0.9717, + "step": 9650 + }, + { + "epoch": 2.391841779975278, + "grad_norm": 0.79296875, + "learning_rate": 5.2948717948717955e-05, + "loss": 0.9717, + "step": 9675 + }, + { + "epoch": 2.3980222496909764, + "grad_norm": 0.828125, + "learning_rate": 5.2820512820512826e-05, + "loss": 0.9726, + "step": 9700 + }, + { + "epoch": 2.404202719406675, + "grad_norm": 0.81640625, + "learning_rate": 5.26923076923077e-05, + "loss": 0.9746, + "step": 9725 + }, + { + "epoch": 2.4103831891223733, + "grad_norm": 0.8203125, + "learning_rate": 5.256410256410257e-05, + "loss": 0.9739, + "step": 9750 + }, + { + "epoch": 2.4165636588380717, + "grad_norm": 0.796875, + "learning_rate": 5.243589743589744e-05, + "loss": 0.9702, + "step": 9775 + }, + { + "epoch": 2.42274412855377, + "grad_norm": 0.76171875, + "learning_rate": 5.230769230769231e-05, + "loss": 0.97, + "step": 9800 + }, + { + "epoch": 2.4289245982694685, + "grad_norm": 0.796875, + "learning_rate": 5.2179487179487185e-05, + "loss": 0.9698, + "step": 9825 + }, + { + "epoch": 2.435105067985167, + "grad_norm": 0.83984375, + "learning_rate": 5.2051282051282056e-05, + "loss": 0.9687, + "step": 9850 + }, + { + "epoch": 2.4412855377008653, + "grad_norm": 0.79296875, + "learning_rate": 5.192307692307693e-05, + "loss": 0.9673, + "step": 9875 + }, + { + "epoch": 2.4474660074165637, + "grad_norm": 0.82421875, + "learning_rate": 5.17948717948718e-05, + "loss": 0.9671, + "step": 9900 + }, + { + "epoch": 2.453646477132262, + "grad_norm": 0.78515625, + "learning_rate": 5.166666666666667e-05, + "loss": 0.9685, + "step": 9925 + }, + { + "epoch": 2.4598269468479605, + "grad_norm": 0.83203125, + "learning_rate": 5.1538461538461536e-05, + "loss": 0.9722, + "step": 9950 + }, + { + "epoch": 2.466007416563659, + "grad_norm": 0.875, + "learning_rate": 5.141025641025641e-05, + "loss": 0.969, + "step": 9975 + }, + { + "epoch": 2.4721878862793574, + "grad_norm": 0.80078125, + "learning_rate": 5.128205128205128e-05, + "loss": 0.9717, + "step": 10000 + }, + { + "epoch": 2.4721878862793574, + "eval_loss": 1.0057789087295532, + "eval_runtime": 1.5251, + "eval_samples_per_second": 418.982, + "eval_steps_per_second": 1.967, + "step": 10000 + }, + { + "epoch": 2.478368355995056, + "grad_norm": 0.765625, + "learning_rate": 5.115384615384615e-05, + "loss": 0.9694, + "step": 10025 + }, + { + "epoch": 2.484548825710754, + "grad_norm": 0.8046875, + "learning_rate": 5.102564102564102e-05, + "loss": 0.9696, + "step": 10050 + }, + { + "epoch": 2.490729295426452, + "grad_norm": 0.734375, + "learning_rate": 5.0897435897435894e-05, + "loss": 0.9687, + "step": 10075 + }, + { + "epoch": 2.496909765142151, + "grad_norm": 0.7578125, + "learning_rate": 5.0769230769230766e-05, + "loss": 0.964, + "step": 10100 + }, + { + "epoch": 2.503090234857849, + "grad_norm": 0.82421875, + "learning_rate": 5.0641025641025644e-05, + "loss": 0.9691, + "step": 10125 + }, + { + "epoch": 2.509270704573548, + "grad_norm": 0.7734375, + "learning_rate": 5.0512820512820516e-05, + "loss": 0.9649, + "step": 10150 + }, + { + "epoch": 2.515451174289246, + "grad_norm": 0.7890625, + "learning_rate": 5.038461538461539e-05, + "loss": 0.9673, + "step": 10175 + }, + { + "epoch": 2.521631644004944, + "grad_norm": 0.75390625, + "learning_rate": 5.025641025641026e-05, + "loss": 0.9669, + "step": 10200 + }, + { + "epoch": 2.5278121137206426, + "grad_norm": 0.765625, + "learning_rate": 5.012820512820513e-05, + "loss": 0.9653, + "step": 10225 + }, + { + "epoch": 2.533992583436341, + "grad_norm": 0.88671875, + "learning_rate": 5e-05, + "loss": 0.9682, + "step": 10250 + }, + { + "epoch": 2.5401730531520395, + "grad_norm": 0.7734375, + "learning_rate": 4.9871794871794874e-05, + "loss": 0.9615, + "step": 10275 + }, + { + "epoch": 2.546353522867738, + "grad_norm": 0.765625, + "learning_rate": 4.9743589743589746e-05, + "loss": 0.9618, + "step": 10300 + }, + { + "epoch": 2.5525339925834363, + "grad_norm": 0.796875, + "learning_rate": 4.961538461538462e-05, + "loss": 0.9633, + "step": 10325 + }, + { + "epoch": 2.5587144622991347, + "grad_norm": 0.74609375, + "learning_rate": 4.948717948717949e-05, + "loss": 0.9624, + "step": 10350 + }, + { + "epoch": 2.564894932014833, + "grad_norm": 0.75, + "learning_rate": 4.935897435897436e-05, + "loss": 0.9645, + "step": 10375 + }, + { + "epoch": 2.5710754017305315, + "grad_norm": 0.73046875, + "learning_rate": 4.923076923076924e-05, + "loss": 0.9594, + "step": 10400 + }, + { + "epoch": 2.57725587144623, + "grad_norm": 0.70703125, + "learning_rate": 4.9102564102564104e-05, + "loss": 0.9652, + "step": 10425 + }, + { + "epoch": 2.5834363411619283, + "grad_norm": 0.7734375, + "learning_rate": 4.8974358974358975e-05, + "loss": 0.9606, + "step": 10450 + }, + { + "epoch": 2.5896168108776267, + "grad_norm": 0.7890625, + "learning_rate": 4.884615384615385e-05, + "loss": 0.9639, + "step": 10475 + }, + { + "epoch": 2.595797280593325, + "grad_norm": 0.796875, + "learning_rate": 4.871794871794872e-05, + "loss": 0.9615, + "step": 10500 + }, + { + "epoch": 2.6019777503090236, + "grad_norm": 0.80078125, + "learning_rate": 4.858974358974359e-05, + "loss": 0.961, + "step": 10525 + }, + { + "epoch": 2.608158220024722, + "grad_norm": 0.73046875, + "learning_rate": 4.846153846153846e-05, + "loss": 0.9608, + "step": 10550 + }, + { + "epoch": 2.6143386897404204, + "grad_norm": 0.83203125, + "learning_rate": 4.8333333333333334e-05, + "loss": 0.9605, + "step": 10575 + }, + { + "epoch": 2.620519159456119, + "grad_norm": 0.7421875, + "learning_rate": 4.8205128205128205e-05, + "loss": 0.9572, + "step": 10600 + }, + { + "epoch": 2.626699629171817, + "grad_norm": 0.78125, + "learning_rate": 4.8076923076923084e-05, + "loss": 0.961, + "step": 10625 + }, + { + "epoch": 2.632880098887515, + "grad_norm": 0.84765625, + "learning_rate": 4.7948717948717955e-05, + "loss": 0.9538, + "step": 10650 + }, + { + "epoch": 2.639060568603214, + "grad_norm": 0.84765625, + "learning_rate": 4.782051282051283e-05, + "loss": 0.9529, + "step": 10675 + }, + { + "epoch": 2.645241038318912, + "grad_norm": 0.8359375, + "learning_rate": 4.76923076923077e-05, + "loss": 0.9579, + "step": 10700 + }, + { + "epoch": 2.651421508034611, + "grad_norm": 0.73046875, + "learning_rate": 4.7564102564102563e-05, + "loss": 0.956, + "step": 10725 + }, + { + "epoch": 2.657601977750309, + "grad_norm": 0.76953125, + "learning_rate": 4.7435897435897435e-05, + "loss": 0.9592, + "step": 10750 + }, + { + "epoch": 2.6637824474660077, + "grad_norm": 0.76953125, + "learning_rate": 4.730769230769231e-05, + "loss": 0.9578, + "step": 10775 + }, + { + "epoch": 2.6699629171817056, + "grad_norm": 0.84765625, + "learning_rate": 4.717948717948718e-05, + "loss": 0.9579, + "step": 10800 + }, + { + "epoch": 2.676143386897404, + "grad_norm": 0.76953125, + "learning_rate": 4.705128205128205e-05, + "loss": 0.9561, + "step": 10825 + }, + { + "epoch": 2.6823238566131025, + "grad_norm": 0.70703125, + "learning_rate": 4.692307692307693e-05, + "loss": 0.9567, + "step": 10850 + }, + { + "epoch": 2.688504326328801, + "grad_norm": 0.796875, + "learning_rate": 4.67948717948718e-05, + "loss": 0.956, + "step": 10875 + }, + { + "epoch": 2.6946847960444993, + "grad_norm": 0.78515625, + "learning_rate": 4.666666666666667e-05, + "loss": 0.9567, + "step": 10900 + }, + { + "epoch": 2.7008652657601977, + "grad_norm": 0.80078125, + "learning_rate": 4.653846153846154e-05, + "loss": 0.9514, + "step": 10925 + }, + { + "epoch": 2.707045735475896, + "grad_norm": 0.74609375, + "learning_rate": 4.6410256410256415e-05, + "loss": 0.9535, + "step": 10950 + }, + { + "epoch": 2.7132262051915945, + "grad_norm": 0.87890625, + "learning_rate": 4.6282051282051287e-05, + "loss": 0.9571, + "step": 10975 + }, + { + "epoch": 2.719406674907293, + "grad_norm": 0.79296875, + "learning_rate": 4.615384615384616e-05, + "loss": 0.9511, + "step": 11000 + }, + { + "epoch": 2.7255871446229913, + "grad_norm": 0.8046875, + "learning_rate": 4.602564102564102e-05, + "loss": 0.9511, + "step": 11025 + }, + { + "epoch": 2.7317676143386898, + "grad_norm": 0.78515625, + "learning_rate": 4.5897435897435895e-05, + "loss": 0.9554, + "step": 11050 + }, + { + "epoch": 2.737948084054388, + "grad_norm": 0.80859375, + "learning_rate": 4.576923076923077e-05, + "loss": 0.95, + "step": 11075 + }, + { + "epoch": 2.7441285537700866, + "grad_norm": 0.7890625, + "learning_rate": 4.5641025641025645e-05, + "loss": 0.951, + "step": 11100 + }, + { + "epoch": 2.750309023485785, + "grad_norm": 0.71875, + "learning_rate": 4.5512820512820516e-05, + "loss": 0.9475, + "step": 11125 + }, + { + "epoch": 2.7564894932014834, + "grad_norm": 0.796875, + "learning_rate": 4.538461538461539e-05, + "loss": 0.9511, + "step": 11150 + }, + { + "epoch": 2.762669962917182, + "grad_norm": 0.75, + "learning_rate": 4.525641025641026e-05, + "loss": 0.9489, + "step": 11175 + }, + { + "epoch": 2.76885043263288, + "grad_norm": 0.73046875, + "learning_rate": 4.512820512820513e-05, + "loss": 0.9511, + "step": 11200 + }, + { + "epoch": 2.7750309023485786, + "grad_norm": 0.734375, + "learning_rate": 4.5e-05, + "loss": 0.951, + "step": 11225 + }, + { + "epoch": 2.781211372064277, + "grad_norm": 0.765625, + "learning_rate": 4.4871794871794874e-05, + "loss": 0.9532, + "step": 11250 + }, + { + "epoch": 2.787391841779975, + "grad_norm": 0.7890625, + "learning_rate": 4.4743589743589746e-05, + "loss": 0.9517, + "step": 11275 + }, + { + "epoch": 2.793572311495674, + "grad_norm": 0.76171875, + "learning_rate": 4.461538461538462e-05, + "loss": 0.9479, + "step": 11300 + }, + { + "epoch": 2.799752781211372, + "grad_norm": 0.7734375, + "learning_rate": 4.448717948717949e-05, + "loss": 0.9493, + "step": 11325 + }, + { + "epoch": 2.8059332509270707, + "grad_norm": 0.76171875, + "learning_rate": 4.435897435897436e-05, + "loss": 0.9489, + "step": 11350 + }, + { + "epoch": 2.8121137206427687, + "grad_norm": 0.76953125, + "learning_rate": 4.423076923076923e-05, + "loss": 0.945, + "step": 11375 + }, + { + "epoch": 2.8182941903584675, + "grad_norm": 0.7890625, + "learning_rate": 4.4102564102564104e-05, + "loss": 0.9511, + "step": 11400 + }, + { + "epoch": 2.8244746600741655, + "grad_norm": 0.7734375, + "learning_rate": 4.3974358974358976e-05, + "loss": 0.9455, + "step": 11425 + }, + { + "epoch": 2.830655129789864, + "grad_norm": 0.78125, + "learning_rate": 4.384615384615385e-05, + "loss": 0.9468, + "step": 11450 + }, + { + "epoch": 2.8368355995055623, + "grad_norm": 0.796875, + "learning_rate": 4.371794871794872e-05, + "loss": 0.9478, + "step": 11475 + }, + { + "epoch": 2.8430160692212607, + "grad_norm": 0.76953125, + "learning_rate": 4.358974358974359e-05, + "loss": 0.9448, + "step": 11500 + }, + { + "epoch": 2.849196538936959, + "grad_norm": 0.78125, + "learning_rate": 4.346153846153846e-05, + "loss": 0.9439, + "step": 11525 + }, + { + "epoch": 2.8553770086526575, + "grad_norm": 0.703125, + "learning_rate": 4.3333333333333334e-05, + "loss": 0.948, + "step": 11550 + }, + { + "epoch": 2.861557478368356, + "grad_norm": 0.7578125, + "learning_rate": 4.320512820512821e-05, + "loss": 0.9465, + "step": 11575 + }, + { + "epoch": 2.8677379480840544, + "grad_norm": 0.73046875, + "learning_rate": 4.3076923076923084e-05, + "loss": 0.9469, + "step": 11600 + }, + { + "epoch": 2.8739184177997528, + "grad_norm": 0.82421875, + "learning_rate": 4.294871794871795e-05, + "loss": 0.9417, + "step": 11625 + }, + { + "epoch": 2.880098887515451, + "grad_norm": 0.6953125, + "learning_rate": 4.282051282051282e-05, + "loss": 0.9431, + "step": 11650 + }, + { + "epoch": 2.8862793572311496, + "grad_norm": 0.74609375, + "learning_rate": 4.269230769230769e-05, + "loss": 0.9446, + "step": 11675 + }, + { + "epoch": 2.892459826946848, + "grad_norm": 0.734375, + "learning_rate": 4.2564102564102564e-05, + "loss": 0.943, + "step": 11700 + }, + { + "epoch": 2.8986402966625464, + "grad_norm": 0.76171875, + "learning_rate": 4.2435897435897435e-05, + "loss": 0.9467, + "step": 11725 + }, + { + "epoch": 2.904820766378245, + "grad_norm": 0.7265625, + "learning_rate": 4.230769230769231e-05, + "loss": 0.9489, + "step": 11750 + }, + { + "epoch": 2.9110012360939432, + "grad_norm": 0.72265625, + "learning_rate": 4.217948717948718e-05, + "loss": 0.9435, + "step": 11775 + }, + { + "epoch": 2.9171817058096416, + "grad_norm": 0.734375, + "learning_rate": 4.205128205128206e-05, + "loss": 0.9405, + "step": 11800 + }, + { + "epoch": 2.92336217552534, + "grad_norm": 0.74609375, + "learning_rate": 4.192307692307693e-05, + "loss": 0.9422, + "step": 11825 + }, + { + "epoch": 2.9295426452410385, + "grad_norm": 0.73828125, + "learning_rate": 4.17948717948718e-05, + "loss": 0.9421, + "step": 11850 + }, + { + "epoch": 2.935723114956737, + "grad_norm": 0.7578125, + "learning_rate": 4.166666666666667e-05, + "loss": 0.9394, + "step": 11875 + }, + { + "epoch": 2.941903584672435, + "grad_norm": 0.7734375, + "learning_rate": 4.1538461538461544e-05, + "loss": 0.9416, + "step": 11900 + }, + { + "epoch": 2.9480840543881337, + "grad_norm": 0.71484375, + "learning_rate": 4.1410256410256415e-05, + "loss": 0.9412, + "step": 11925 + }, + { + "epoch": 2.9542645241038317, + "grad_norm": 0.76953125, + "learning_rate": 4.128205128205128e-05, + "loss": 0.9399, + "step": 11950 + }, + { + "epoch": 2.9604449938195305, + "grad_norm": 0.70703125, + "learning_rate": 4.115384615384615e-05, + "loss": 0.9396, + "step": 11975 + }, + { + "epoch": 2.9666254635352285, + "grad_norm": 0.765625, + "learning_rate": 4.1025641025641023e-05, + "loss": 0.9366, + "step": 12000 + }, + { + "epoch": 2.9728059332509273, + "grad_norm": 0.73828125, + "learning_rate": 4.0897435897435895e-05, + "loss": 0.9368, + "step": 12025 + }, + { + "epoch": 2.9789864029666253, + "grad_norm": 0.72265625, + "learning_rate": 4.0769230769230773e-05, + "loss": 0.9392, + "step": 12050 + }, + { + "epoch": 2.9851668726823237, + "grad_norm": 0.77734375, + "learning_rate": 4.0641025641025645e-05, + "loss": 0.9397, + "step": 12075 + }, + { + "epoch": 2.991347342398022, + "grad_norm": 0.7421875, + "learning_rate": 4.051282051282052e-05, + "loss": 0.9385, + "step": 12100 + }, + { + "epoch": 2.9975278121137205, + "grad_norm": 0.7734375, + "learning_rate": 4.038461538461539e-05, + "loss": 0.9397, + "step": 12125 + }, + { + "epoch": 3.003708281829419, + "grad_norm": 0.84375, + "learning_rate": 4.025641025641026e-05, + "loss": 0.8884, + "step": 12150 + }, + { + "epoch": 3.0098887515451174, + "grad_norm": 0.75, + "learning_rate": 4.012820512820513e-05, + "loss": 0.8596, + "step": 12175 + }, + { + "epoch": 3.016069221260816, + "grad_norm": 0.765625, + "learning_rate": 4e-05, + "loss": 0.8606, + "step": 12200 + }, + { + "epoch": 3.022249690976514, + "grad_norm": 0.765625, + "learning_rate": 3.9871794871794875e-05, + "loss": 0.8623, + "step": 12225 + }, + { + "epoch": 3.0284301606922126, + "grad_norm": 0.7734375, + "learning_rate": 3.974358974358974e-05, + "loss": 0.8606, + "step": 12250 + }, + { + "epoch": 3.034610630407911, + "grad_norm": 0.78125, + "learning_rate": 3.961538461538462e-05, + "loss": 0.8626, + "step": 12275 + }, + { + "epoch": 3.0407911001236094, + "grad_norm": 0.72265625, + "learning_rate": 3.948717948717949e-05, + "loss": 0.8634, + "step": 12300 + }, + { + "epoch": 3.046971569839308, + "grad_norm": 0.76171875, + "learning_rate": 3.935897435897436e-05, + "loss": 0.8615, + "step": 12325 + }, + { + "epoch": 3.0531520395550062, + "grad_norm": 0.75, + "learning_rate": 3.923076923076923e-05, + "loss": 0.8622, + "step": 12350 + }, + { + "epoch": 3.0593325092707047, + "grad_norm": 0.75390625, + "learning_rate": 3.9102564102564105e-05, + "loss": 0.8627, + "step": 12375 + }, + { + "epoch": 3.065512978986403, + "grad_norm": 0.7578125, + "learning_rate": 3.8974358974358976e-05, + "loss": 0.8603, + "step": 12400 + }, + { + "epoch": 3.0716934487021015, + "grad_norm": 0.76171875, + "learning_rate": 3.884615384615385e-05, + "loss": 0.8676, + "step": 12425 + }, + { + "epoch": 3.0778739184178, + "grad_norm": 0.7421875, + "learning_rate": 3.871794871794872e-05, + "loss": 0.8622, + "step": 12450 + }, + { + "epoch": 3.0840543881334983, + "grad_norm": 0.80078125, + "learning_rate": 3.858974358974359e-05, + "loss": 0.8625, + "step": 12475 + }, + { + "epoch": 3.0902348578491967, + "grad_norm": 0.75390625, + "learning_rate": 3.846153846153846e-05, + "loss": 0.8649, + "step": 12500 + }, + { + "epoch": 3.096415327564895, + "grad_norm": 0.76953125, + "learning_rate": 3.8333333333333334e-05, + "loss": 0.8663, + "step": 12525 + }, + { + "epoch": 3.1025957972805935, + "grad_norm": 0.7734375, + "learning_rate": 3.8205128205128206e-05, + "loss": 0.8633, + "step": 12550 + }, + { + "epoch": 3.1087762669962915, + "grad_norm": 0.7578125, + "learning_rate": 3.807692307692308e-05, + "loss": 0.8652, + "step": 12575 + }, + { + "epoch": 3.11495673671199, + "grad_norm": 0.74609375, + "learning_rate": 3.794871794871795e-05, + "loss": 0.8671, + "step": 12600 + }, + { + "epoch": 3.1211372064276883, + "grad_norm": 0.7734375, + "learning_rate": 3.782051282051282e-05, + "loss": 0.8633, + "step": 12625 + }, + { + "epoch": 3.1273176761433867, + "grad_norm": 0.72265625, + "learning_rate": 3.769230769230769e-05, + "loss": 0.8672, + "step": 12650 + }, + { + "epoch": 3.133498145859085, + "grad_norm": 0.7421875, + "learning_rate": 3.7564102564102564e-05, + "loss": 0.868, + "step": 12675 + }, + { + "epoch": 3.1396786155747836, + "grad_norm": 0.734375, + "learning_rate": 3.7435897435897436e-05, + "loss": 0.8656, + "step": 12700 + }, + { + "epoch": 3.145859085290482, + "grad_norm": 0.73828125, + "learning_rate": 3.730769230769231e-05, + "loss": 0.8663, + "step": 12725 + }, + { + "epoch": 3.1520395550061804, + "grad_norm": 0.765625, + "learning_rate": 3.717948717948718e-05, + "loss": 0.8679, + "step": 12750 + }, + { + "epoch": 3.158220024721879, + "grad_norm": 0.7734375, + "learning_rate": 3.705128205128206e-05, + "loss": 0.867, + "step": 12775 + }, + { + "epoch": 3.164400494437577, + "grad_norm": 0.7265625, + "learning_rate": 3.692307692307693e-05, + "loss": 0.8685, + "step": 12800 + }, + { + "epoch": 3.1705809641532756, + "grad_norm": 0.76953125, + "learning_rate": 3.67948717948718e-05, + "loss": 0.8673, + "step": 12825 + }, + { + "epoch": 3.176761433868974, + "grad_norm": 0.74609375, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.8688, + "step": 12850 + }, + { + "epoch": 3.1829419035846724, + "grad_norm": 0.75390625, + "learning_rate": 3.653846153846154e-05, + "loss": 0.8638, + "step": 12875 + }, + { + "epoch": 3.189122373300371, + "grad_norm": 0.7734375, + "learning_rate": 3.641025641025641e-05, + "loss": 0.87, + "step": 12900 + }, + { + "epoch": 3.1953028430160693, + "grad_norm": 0.72265625, + "learning_rate": 3.628205128205128e-05, + "loss": 0.8717, + "step": 12925 + }, + { + "epoch": 3.2014833127317677, + "grad_norm": 0.734375, + "learning_rate": 3.615384615384615e-05, + "loss": 0.863, + "step": 12950 + }, + { + "epoch": 3.207663782447466, + "grad_norm": 0.75390625, + "learning_rate": 3.6025641025641024e-05, + "loss": 0.8704, + "step": 12975 + }, + { + "epoch": 3.2138442521631645, + "grad_norm": 0.73828125, + "learning_rate": 3.58974358974359e-05, + "loss": 0.8695, + "step": 13000 + }, + { + "epoch": 3.220024721878863, + "grad_norm": 0.73046875, + "learning_rate": 3.5769230769230774e-05, + "loss": 0.8695, + "step": 13025 + }, + { + "epoch": 3.2262051915945613, + "grad_norm": 0.734375, + "learning_rate": 3.5641025641025646e-05, + "loss": 0.8668, + "step": 13050 + }, + { + "epoch": 3.2323856613102597, + "grad_norm": 0.77734375, + "learning_rate": 3.551282051282052e-05, + "loss": 0.8703, + "step": 13075 + }, + { + "epoch": 3.238566131025958, + "grad_norm": 0.72265625, + "learning_rate": 3.538461538461539e-05, + "loss": 0.8671, + "step": 13100 + }, + { + "epoch": 3.2447466007416566, + "grad_norm": 0.74609375, + "learning_rate": 3.525641025641026e-05, + "loss": 0.8677, + "step": 13125 + }, + { + "epoch": 3.2509270704573545, + "grad_norm": 0.7109375, + "learning_rate": 3.5128205128205125e-05, + "loss": 0.8671, + "step": 13150 + }, + { + "epoch": 3.2571075401730534, + "grad_norm": 0.71875, + "learning_rate": 3.5e-05, + "loss": 0.8682, + "step": 13175 + }, + { + "epoch": 3.2632880098887513, + "grad_norm": 0.7734375, + "learning_rate": 3.487179487179487e-05, + "loss": 0.8677, + "step": 13200 + }, + { + "epoch": 3.2694684796044498, + "grad_norm": 0.703125, + "learning_rate": 3.474358974358975e-05, + "loss": 0.871, + "step": 13225 + }, + { + "epoch": 3.275648949320148, + "grad_norm": 0.78125, + "learning_rate": 3.461538461538462e-05, + "loss": 0.8682, + "step": 13250 + }, + { + "epoch": 3.2818294190358466, + "grad_norm": 0.74609375, + "learning_rate": 3.448717948717949e-05, + "loss": 0.8653, + "step": 13275 + }, + { + "epoch": 3.288009888751545, + "grad_norm": 0.7265625, + "learning_rate": 3.435897435897436e-05, + "loss": 0.8697, + "step": 13300 + }, + { + "epoch": 3.2941903584672434, + "grad_norm": 0.796875, + "learning_rate": 3.4230769230769234e-05, + "loss": 0.8697, + "step": 13325 + }, + { + "epoch": 3.300370828182942, + "grad_norm": 0.7578125, + "learning_rate": 3.4102564102564105e-05, + "loss": 0.869, + "step": 13350 + }, + { + "epoch": 3.30655129789864, + "grad_norm": 0.73046875, + "learning_rate": 3.397435897435898e-05, + "loss": 0.8687, + "step": 13375 + }, + { + "epoch": 3.3127317676143386, + "grad_norm": 0.78125, + "learning_rate": 3.384615384615385e-05, + "loss": 0.8692, + "step": 13400 + }, + { + "epoch": 3.318912237330037, + "grad_norm": 0.74609375, + "learning_rate": 3.371794871794872e-05, + "loss": 0.8687, + "step": 13425 + }, + { + "epoch": 3.3250927070457355, + "grad_norm": 0.7734375, + "learning_rate": 3.358974358974359e-05, + "loss": 0.8708, + "step": 13450 + }, + { + "epoch": 3.331273176761434, + "grad_norm": 0.74609375, + "learning_rate": 3.346153846153846e-05, + "loss": 0.8659, + "step": 13475 + }, + { + "epoch": 3.3374536464771323, + "grad_norm": 0.7578125, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.8662, + "step": 13500 + }, + { + "epoch": 3.3436341161928307, + "grad_norm": 0.72265625, + "learning_rate": 3.3205128205128207e-05, + "loss": 0.8692, + "step": 13525 + }, + { + "epoch": 3.349814585908529, + "grad_norm": 0.78515625, + "learning_rate": 3.307692307692308e-05, + "loss": 0.8677, + "step": 13550 + }, + { + "epoch": 3.3559950556242275, + "grad_norm": 0.765625, + "learning_rate": 3.294871794871795e-05, + "loss": 0.8672, + "step": 13575 + }, + { + "epoch": 3.362175525339926, + "grad_norm": 0.765625, + "learning_rate": 3.282051282051282e-05, + "loss": 0.8703, + "step": 13600 + }, + { + "epoch": 3.3683559950556243, + "grad_norm": 0.76171875, + "learning_rate": 3.269230769230769e-05, + "loss": 0.8658, + "step": 13625 + }, + { + "epoch": 3.3745364647713227, + "grad_norm": 0.75390625, + "learning_rate": 3.2564102564102565e-05, + "loss": 0.8679, + "step": 13650 + }, + { + "epoch": 3.380716934487021, + "grad_norm": 0.76953125, + "learning_rate": 3.2435897435897436e-05, + "loss": 0.8654, + "step": 13675 + }, + { + "epoch": 3.3868974042027196, + "grad_norm": 0.73828125, + "learning_rate": 3.230769230769231e-05, + "loss": 0.8662, + "step": 13700 + }, + { + "epoch": 3.393077873918418, + "grad_norm": 0.73046875, + "learning_rate": 3.2179487179487186e-05, + "loss": 0.8654, + "step": 13725 + }, + { + "epoch": 3.3992583436341164, + "grad_norm": 0.734375, + "learning_rate": 3.205128205128206e-05, + "loss": 0.8696, + "step": 13750 + }, + { + "epoch": 3.4054388133498144, + "grad_norm": 0.7421875, + "learning_rate": 3.192307692307692e-05, + "loss": 0.8648, + "step": 13775 + }, + { + "epoch": 3.411619283065513, + "grad_norm": 0.74609375, + "learning_rate": 3.1794871794871795e-05, + "loss": 0.8677, + "step": 13800 + }, + { + "epoch": 3.417799752781211, + "grad_norm": 0.72265625, + "learning_rate": 3.1666666666666666e-05, + "loss": 0.8665, + "step": 13825 + }, + { + "epoch": 3.4239802224969096, + "grad_norm": 0.74609375, + "learning_rate": 3.153846153846154e-05, + "loss": 0.8662, + "step": 13850 + }, + { + "epoch": 3.430160692212608, + "grad_norm": 0.75390625, + "learning_rate": 3.141025641025641e-05, + "loss": 0.8658, + "step": 13875 + }, + { + "epoch": 3.4363411619283064, + "grad_norm": 0.72265625, + "learning_rate": 3.128205128205128e-05, + "loss": 0.8652, + "step": 13900 + }, + { + "epoch": 3.442521631644005, + "grad_norm": 0.75390625, + "learning_rate": 3.115384615384615e-05, + "loss": 0.8667, + "step": 13925 + }, + { + "epoch": 3.4487021013597032, + "grad_norm": 0.73046875, + "learning_rate": 3.102564102564103e-05, + "loss": 0.8654, + "step": 13950 + }, + { + "epoch": 3.4548825710754016, + "grad_norm": 0.7734375, + "learning_rate": 3.08974358974359e-05, + "loss": 0.8666, + "step": 13975 + }, + { + "epoch": 3.4610630407911, + "grad_norm": 0.765625, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.8712, + "step": 14000 + }, + { + "epoch": 3.4672435105067985, + "grad_norm": 0.7265625, + "learning_rate": 3.0641025641025646e-05, + "loss": 0.867, + "step": 14025 + }, + { + "epoch": 3.473423980222497, + "grad_norm": 0.72265625, + "learning_rate": 3.0512820512820518e-05, + "loss": 0.8635, + "step": 14050 + }, + { + "epoch": 3.4796044499381953, + "grad_norm": 0.765625, + "learning_rate": 3.0384615384615382e-05, + "loss": 0.8686, + "step": 14075 + }, + { + "epoch": 3.4857849196538937, + "grad_norm": 0.7265625, + "learning_rate": 3.0256410256410257e-05, + "loss": 0.8654, + "step": 14100 + }, + { + "epoch": 3.491965389369592, + "grad_norm": 0.71875, + "learning_rate": 3.012820512820513e-05, + "loss": 0.869, + "step": 14125 + }, + { + "epoch": 3.4981458590852905, + "grad_norm": 0.7265625, + "learning_rate": 3e-05, + "loss": 0.862, + "step": 14150 + }, + { + "epoch": 3.504326328800989, + "grad_norm": 0.76171875, + "learning_rate": 2.9871794871794872e-05, + "loss": 0.866, + "step": 14175 + }, + { + "epoch": 3.5105067985166873, + "grad_norm": 0.71875, + "learning_rate": 2.9743589743589744e-05, + "loss": 0.8701, + "step": 14200 + }, + { + "epoch": 3.5166872682323858, + "grad_norm": 0.7734375, + "learning_rate": 2.9615384615384616e-05, + "loss": 0.8679, + "step": 14225 + }, + { + "epoch": 3.522867737948084, + "grad_norm": 0.71484375, + "learning_rate": 2.948717948717949e-05, + "loss": 0.8639, + "step": 14250 + }, + { + "epoch": 3.5290482076637826, + "grad_norm": 0.734375, + "learning_rate": 2.9358974358974362e-05, + "loss": 0.8674, + "step": 14275 + }, + { + "epoch": 3.535228677379481, + "grad_norm": 0.7265625, + "learning_rate": 2.9230769230769234e-05, + "loss": 0.8639, + "step": 14300 + }, + { + "epoch": 3.5414091470951794, + "grad_norm": 0.73828125, + "learning_rate": 2.9102564102564106e-05, + "loss": 0.8652, + "step": 14325 + }, + { + "epoch": 3.5475896168108774, + "grad_norm": 0.7109375, + "learning_rate": 2.8974358974358977e-05, + "loss": 0.8687, + "step": 14350 + }, + { + "epoch": 3.553770086526576, + "grad_norm": 0.75390625, + "learning_rate": 2.8846153846153845e-05, + "loss": 0.8664, + "step": 14375 + }, + { + "epoch": 3.559950556242274, + "grad_norm": 0.703125, + "learning_rate": 2.8717948717948717e-05, + "loss": 0.8671, + "step": 14400 + }, + { + "epoch": 3.566131025957973, + "grad_norm": 0.703125, + "learning_rate": 2.858974358974359e-05, + "loss": 0.8658, + "step": 14425 + }, + { + "epoch": 3.572311495673671, + "grad_norm": 0.69921875, + "learning_rate": 2.846153846153846e-05, + "loss": 0.8598, + "step": 14450 + }, + { + "epoch": 3.57849196538937, + "grad_norm": 0.74609375, + "learning_rate": 2.8333333333333335e-05, + "loss": 0.8643, + "step": 14475 + }, + { + "epoch": 3.584672435105068, + "grad_norm": 0.7265625, + "learning_rate": 2.8205128205128207e-05, + "loss": 0.8641, + "step": 14500 + }, + { + "epoch": 3.5908529048207662, + "grad_norm": 0.75, + "learning_rate": 2.807692307692308e-05, + "loss": 0.8665, + "step": 14525 + }, + { + "epoch": 3.5970333745364647, + "grad_norm": 0.7265625, + "learning_rate": 2.794871794871795e-05, + "loss": 0.8649, + "step": 14550 + }, + { + "epoch": 3.603213844252163, + "grad_norm": 0.73046875, + "learning_rate": 2.7820512820512822e-05, + "loss": 0.8671, + "step": 14575 + }, + { + "epoch": 3.6093943139678615, + "grad_norm": 0.71484375, + "learning_rate": 2.7692307692307694e-05, + "loss": 0.8656, + "step": 14600 + }, + { + "epoch": 3.61557478368356, + "grad_norm": 0.7265625, + "learning_rate": 2.756410256410257e-05, + "loss": 0.8642, + "step": 14625 + }, + { + "epoch": 3.6217552533992583, + "grad_norm": 0.71875, + "learning_rate": 2.743589743589744e-05, + "loss": 0.8628, + "step": 14650 + }, + { + "epoch": 3.6279357231149567, + "grad_norm": 0.73828125, + "learning_rate": 2.7307692307692305e-05, + "loss": 0.8648, + "step": 14675 + }, + { + "epoch": 3.634116192830655, + "grad_norm": 0.7109375, + "learning_rate": 2.717948717948718e-05, + "loss": 0.8674, + "step": 14700 + }, + { + "epoch": 3.6402966625463535, + "grad_norm": 0.71484375, + "learning_rate": 2.705128205128205e-05, + "loss": 0.8664, + "step": 14725 + }, + { + "epoch": 3.646477132262052, + "grad_norm": 0.7421875, + "learning_rate": 2.6923076923076923e-05, + "loss": 0.8643, + "step": 14750 + }, + { + "epoch": 3.6526576019777504, + "grad_norm": 0.7421875, + "learning_rate": 2.6794871794871795e-05, + "loss": 0.8679, + "step": 14775 + }, + { + "epoch": 3.6588380716934488, + "grad_norm": 0.76171875, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.8669, + "step": 14800 + }, + { + "epoch": 3.665018541409147, + "grad_norm": 0.74609375, + "learning_rate": 2.6538461538461538e-05, + "loss": 0.8619, + "step": 14825 + }, + { + "epoch": 3.6711990111248456, + "grad_norm": 0.72265625, + "learning_rate": 2.6410256410256413e-05, + "loss": 0.8685, + "step": 14850 + }, + { + "epoch": 3.677379480840544, + "grad_norm": 0.7265625, + "learning_rate": 2.6282051282051285e-05, + "loss": 0.8666, + "step": 14875 + }, + { + "epoch": 3.6835599505562424, + "grad_norm": 0.69921875, + "learning_rate": 2.6153846153846157e-05, + "loss": 0.8645, + "step": 14900 + }, + { + "epoch": 3.689740420271941, + "grad_norm": 0.6875, + "learning_rate": 2.6025641025641028e-05, + "loss": 0.8657, + "step": 14925 + }, + { + "epoch": 3.6959208899876392, + "grad_norm": 0.73828125, + "learning_rate": 2.58974358974359e-05, + "loss": 0.8611, + "step": 14950 + }, + { + "epoch": 3.702101359703337, + "grad_norm": 0.71875, + "learning_rate": 2.5769230769230768e-05, + "loss": 0.8628, + "step": 14975 + }, + { + "epoch": 3.708281829419036, + "grad_norm": 0.7265625, + "learning_rate": 2.564102564102564e-05, + "loss": 0.8643, + "step": 15000 + }, + { + "epoch": 3.708281829419036, + "eval_loss": 0.9966387748718262, + "eval_runtime": 1.5383, + "eval_samples_per_second": 415.389, + "eval_steps_per_second": 1.95, + "step": 15000 + }, + { + "epoch": 3.714462299134734, + "grad_norm": 0.73046875, + "learning_rate": 2.551282051282051e-05, + "loss": 0.864, + "step": 15025 + }, + { + "epoch": 3.720642768850433, + "grad_norm": 0.71484375, + "learning_rate": 2.5384615384615383e-05, + "loss": 0.8643, + "step": 15050 + }, + { + "epoch": 3.726823238566131, + "grad_norm": 0.7578125, + "learning_rate": 2.5256410256410258e-05, + "loss": 0.8614, + "step": 15075 + }, + { + "epoch": 3.7330037082818293, + "grad_norm": 0.78125, + "learning_rate": 2.512820512820513e-05, + "loss": 0.8694, + "step": 15100 + }, + { + "epoch": 3.7391841779975277, + "grad_norm": 0.75, + "learning_rate": 2.5e-05, + "loss": 0.8666, + "step": 15125 + }, + { + "epoch": 3.745364647713226, + "grad_norm": 0.734375, + "learning_rate": 2.4871794871794873e-05, + "loss": 0.8616, + "step": 15150 + }, + { + "epoch": 3.7515451174289245, + "grad_norm": 0.71484375, + "learning_rate": 2.4743589743589744e-05, + "loss": 0.8646, + "step": 15175 + }, + { + "epoch": 3.757725587144623, + "grad_norm": 0.734375, + "learning_rate": 2.461538461538462e-05, + "loss": 0.8623, + "step": 15200 + }, + { + "epoch": 3.7639060568603213, + "grad_norm": 0.73046875, + "learning_rate": 2.4487179487179488e-05, + "loss": 0.8645, + "step": 15225 + }, + { + "epoch": 3.7700865265760197, + "grad_norm": 0.71484375, + "learning_rate": 2.435897435897436e-05, + "loss": 0.8612, + "step": 15250 + }, + { + "epoch": 3.776266996291718, + "grad_norm": 0.71875, + "learning_rate": 2.423076923076923e-05, + "loss": 0.8642, + "step": 15275 + }, + { + "epoch": 3.7824474660074165, + "grad_norm": 0.734375, + "learning_rate": 2.4102564102564103e-05, + "loss": 0.8615, + "step": 15300 + }, + { + "epoch": 3.788627935723115, + "grad_norm": 0.71875, + "learning_rate": 2.3974358974358978e-05, + "loss": 0.8603, + "step": 15325 + }, + { + "epoch": 3.7948084054388134, + "grad_norm": 0.70703125, + "learning_rate": 2.384615384615385e-05, + "loss": 0.8675, + "step": 15350 + }, + { + "epoch": 3.800988875154512, + "grad_norm": 0.68359375, + "learning_rate": 2.3717948717948718e-05, + "loss": 0.8655, + "step": 15375 + }, + { + "epoch": 3.80716934487021, + "grad_norm": 0.71875, + "learning_rate": 2.358974358974359e-05, + "loss": 0.8632, + "step": 15400 + }, + { + "epoch": 3.8133498145859086, + "grad_norm": 0.734375, + "learning_rate": 2.3461538461538464e-05, + "loss": 0.8599, + "step": 15425 + }, + { + "epoch": 3.819530284301607, + "grad_norm": 0.75390625, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.8656, + "step": 15450 + }, + { + "epoch": 3.8257107540173054, + "grad_norm": 0.73046875, + "learning_rate": 2.3205128205128207e-05, + "loss": 0.8638, + "step": 15475 + }, + { + "epoch": 3.831891223733004, + "grad_norm": 0.69140625, + "learning_rate": 2.307692307692308e-05, + "loss": 0.8621, + "step": 15500 + }, + { + "epoch": 3.8380716934487022, + "grad_norm": 0.71484375, + "learning_rate": 2.2948717948717947e-05, + "loss": 0.8651, + "step": 15525 + }, + { + "epoch": 3.8442521631644007, + "grad_norm": 0.7109375, + "learning_rate": 2.2820512820512822e-05, + "loss": 0.8607, + "step": 15550 + }, + { + "epoch": 3.850432632880099, + "grad_norm": 0.6953125, + "learning_rate": 2.2692307692307694e-05, + "loss": 0.8632, + "step": 15575 + }, + { + "epoch": 3.856613102595797, + "grad_norm": 0.69140625, + "learning_rate": 2.2564102564102566e-05, + "loss": 0.8632, + "step": 15600 + }, + { + "epoch": 3.862793572311496, + "grad_norm": 0.69921875, + "learning_rate": 2.2435897435897437e-05, + "loss": 0.8631, + "step": 15625 + }, + { + "epoch": 3.868974042027194, + "grad_norm": 0.67578125, + "learning_rate": 2.230769230769231e-05, + "loss": 0.8594, + "step": 15650 + }, + { + "epoch": 3.8751545117428927, + "grad_norm": 0.69140625, + "learning_rate": 2.217948717948718e-05, + "loss": 0.8617, + "step": 15675 + }, + { + "epoch": 3.8813349814585907, + "grad_norm": 0.72265625, + "learning_rate": 2.2051282051282052e-05, + "loss": 0.8617, + "step": 15700 + }, + { + "epoch": 3.887515451174289, + "grad_norm": 0.75390625, + "learning_rate": 2.1923076923076924e-05, + "loss": 0.8629, + "step": 15725 + }, + { + "epoch": 3.8936959208899875, + "grad_norm": 0.69921875, + "learning_rate": 2.1794871794871795e-05, + "loss": 0.8639, + "step": 15750 + }, + { + "epoch": 3.899876390605686, + "grad_norm": 0.734375, + "learning_rate": 2.1666666666666667e-05, + "loss": 0.8616, + "step": 15775 + }, + { + "epoch": 3.9060568603213843, + "grad_norm": 0.72265625, + "learning_rate": 2.1538461538461542e-05, + "loss": 0.865, + "step": 15800 + }, + { + "epoch": 3.9122373300370827, + "grad_norm": 0.68359375, + "learning_rate": 2.141025641025641e-05, + "loss": 0.8609, + "step": 15825 + }, + { + "epoch": 3.918417799752781, + "grad_norm": 0.6640625, + "learning_rate": 2.1282051282051282e-05, + "loss": 0.861, + "step": 15850 + }, + { + "epoch": 3.9245982694684796, + "grad_norm": 0.7109375, + "learning_rate": 2.1153846153846154e-05, + "loss": 0.8613, + "step": 15875 + }, + { + "epoch": 3.930778739184178, + "grad_norm": 0.69140625, + "learning_rate": 2.102564102564103e-05, + "loss": 0.8624, + "step": 15900 + }, + { + "epoch": 3.9369592088998764, + "grad_norm": 0.6953125, + "learning_rate": 2.08974358974359e-05, + "loss": 0.8633, + "step": 15925 + }, + { + "epoch": 3.943139678615575, + "grad_norm": 0.703125, + "learning_rate": 2.0769230769230772e-05, + "loss": 0.8589, + "step": 15950 + }, + { + "epoch": 3.949320148331273, + "grad_norm": 0.70703125, + "learning_rate": 2.064102564102564e-05, + "loss": 0.8605, + "step": 15975 + }, + { + "epoch": 3.9555006180469716, + "grad_norm": 0.70703125, + "learning_rate": 2.0512820512820512e-05, + "loss": 0.8612, + "step": 16000 + }, + { + "epoch": 3.96168108776267, + "grad_norm": 0.6875, + "learning_rate": 2.0384615384615387e-05, + "loss": 0.8612, + "step": 16025 + }, + { + "epoch": 3.9678615574783684, + "grad_norm": 0.72265625, + "learning_rate": 2.025641025641026e-05, + "loss": 0.861, + "step": 16050 + }, + { + "epoch": 3.974042027194067, + "grad_norm": 0.73046875, + "learning_rate": 2.012820512820513e-05, + "loss": 0.8593, + "step": 16075 + }, + { + "epoch": 3.9802224969097653, + "grad_norm": 0.69140625, + "learning_rate": 2e-05, + "loss": 0.8612, + "step": 16100 + }, + { + "epoch": 3.9864029666254637, + "grad_norm": 0.71484375, + "learning_rate": 1.987179487179487e-05, + "loss": 0.8664, + "step": 16125 + }, + { + "epoch": 3.992583436341162, + "grad_norm": 0.70703125, + "learning_rate": 1.9743589743589745e-05, + "loss": 0.8599, + "step": 16150 + }, + { + "epoch": 3.99876390605686, + "grad_norm": 0.6953125, + "learning_rate": 1.9615384615384617e-05, + "loss": 0.8649, + "step": 16175 + }, + { + "epoch": 4.004944375772559, + "grad_norm": 0.7109375, + "learning_rate": 1.9487179487179488e-05, + "loss": 0.8264, + "step": 16200 + }, + { + "epoch": 4.011124845488257, + "grad_norm": 0.6875, + "learning_rate": 1.935897435897436e-05, + "loss": 0.8184, + "step": 16225 + }, + { + "epoch": 4.017305315203956, + "grad_norm": 0.69140625, + "learning_rate": 1.923076923076923e-05, + "loss": 0.8214, + "step": 16250 + }, + { + "epoch": 4.023485784919654, + "grad_norm": 0.69140625, + "learning_rate": 1.9102564102564103e-05, + "loss": 0.8166, + "step": 16275 + }, + { + "epoch": 4.0296662546353526, + "grad_norm": 0.671875, + "learning_rate": 1.8974358974358975e-05, + "loss": 0.8191, + "step": 16300 + }, + { + "epoch": 4.0358467243510505, + "grad_norm": 0.703125, + "learning_rate": 1.8846153846153846e-05, + "loss": 0.8194, + "step": 16325 + }, + { + "epoch": 4.042027194066749, + "grad_norm": 0.69921875, + "learning_rate": 1.8717948717948718e-05, + "loss": 0.8189, + "step": 16350 + }, + { + "epoch": 4.048207663782447, + "grad_norm": 0.70703125, + "learning_rate": 1.858974358974359e-05, + "loss": 0.822, + "step": 16375 + }, + { + "epoch": 4.054388133498146, + "grad_norm": 0.73046875, + "learning_rate": 1.8461538461538465e-05, + "loss": 0.8146, + "step": 16400 + }, + { + "epoch": 4.060568603213844, + "grad_norm": 0.6953125, + "learning_rate": 1.8333333333333333e-05, + "loss": 0.8168, + "step": 16425 + }, + { + "epoch": 4.066749072929543, + "grad_norm": 0.6953125, + "learning_rate": 1.8205128205128204e-05, + "loss": 0.8221, + "step": 16450 + }, + { + "epoch": 4.072929542645241, + "grad_norm": 0.7109375, + "learning_rate": 1.8076923076923076e-05, + "loss": 0.8179, + "step": 16475 + }, + { + "epoch": 4.07911001236094, + "grad_norm": 0.6875, + "learning_rate": 1.794871794871795e-05, + "loss": 0.8182, + "step": 16500 + }, + { + "epoch": 4.085290482076638, + "grad_norm": 0.73046875, + "learning_rate": 1.7820512820512823e-05, + "loss": 0.8145, + "step": 16525 + }, + { + "epoch": 4.091470951792337, + "grad_norm": 0.703125, + "learning_rate": 1.7692307692307694e-05, + "loss": 0.8239, + "step": 16550 + }, + { + "epoch": 4.097651421508035, + "grad_norm": 0.69140625, + "learning_rate": 1.7564102564102563e-05, + "loss": 0.8151, + "step": 16575 + }, + { + "epoch": 4.103831891223733, + "grad_norm": 0.7109375, + "learning_rate": 1.7435897435897434e-05, + "loss": 0.8193, + "step": 16600 + }, + { + "epoch": 4.1100123609394315, + "grad_norm": 0.6875, + "learning_rate": 1.730769230769231e-05, + "loss": 0.8193, + "step": 16625 + }, + { + "epoch": 4.116192830655129, + "grad_norm": 0.69140625, + "learning_rate": 1.717948717948718e-05, + "loss": 0.8193, + "step": 16650 + }, + { + "epoch": 4.122373300370828, + "grad_norm": 0.703125, + "learning_rate": 1.7051282051282053e-05, + "loss": 0.8214, + "step": 16675 + }, + { + "epoch": 4.128553770086526, + "grad_norm": 0.69140625, + "learning_rate": 1.6923076923076924e-05, + "loss": 0.8211, + "step": 16700 + }, + { + "epoch": 4.134734239802225, + "grad_norm": 0.6953125, + "learning_rate": 1.6794871794871796e-05, + "loss": 0.8213, + "step": 16725 + }, + { + "epoch": 4.140914709517923, + "grad_norm": 0.7109375, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.8209, + "step": 16750 + }, + { + "epoch": 4.147095179233622, + "grad_norm": 0.6953125, + "learning_rate": 1.653846153846154e-05, + "loss": 0.8185, + "step": 16775 + }, + { + "epoch": 4.15327564894932, + "grad_norm": 0.6875, + "learning_rate": 1.641025641025641e-05, + "loss": 0.8188, + "step": 16800 + }, + { + "epoch": 4.159456118665019, + "grad_norm": 0.69140625, + "learning_rate": 1.6282051282051282e-05, + "loss": 0.8209, + "step": 16825 + }, + { + "epoch": 4.165636588380717, + "grad_norm": 0.70703125, + "learning_rate": 1.6153846153846154e-05, + "loss": 0.82, + "step": 16850 + }, + { + "epoch": 4.171817058096416, + "grad_norm": 0.6953125, + "learning_rate": 1.602564102564103e-05, + "loss": 0.8203, + "step": 16875 + }, + { + "epoch": 4.1779975278121135, + "grad_norm": 0.71484375, + "learning_rate": 1.5897435897435897e-05, + "loss": 0.8175, + "step": 16900 + }, + { + "epoch": 4.184177997527812, + "grad_norm": 0.71875, + "learning_rate": 1.576923076923077e-05, + "loss": 0.8207, + "step": 16925 + }, + { + "epoch": 4.19035846724351, + "grad_norm": 0.6953125, + "learning_rate": 1.564102564102564e-05, + "loss": 0.8213, + "step": 16950 + }, + { + "epoch": 4.196538936959209, + "grad_norm": 0.6953125, + "learning_rate": 1.5512820512820516e-05, + "loss": 0.8235, + "step": 16975 + }, + { + "epoch": 4.202719406674907, + "grad_norm": 0.703125, + "learning_rate": 1.5384615384615387e-05, + "loss": 0.8166, + "step": 17000 + }, + { + "epoch": 4.208899876390606, + "grad_norm": 0.6875, + "learning_rate": 1.5256410256410259e-05, + "loss": 0.8178, + "step": 17025 + }, + { + "epoch": 4.215080346106304, + "grad_norm": 0.6875, + "learning_rate": 1.5128205128205129e-05, + "loss": 0.8214, + "step": 17050 + }, + { + "epoch": 4.221260815822003, + "grad_norm": 0.70703125, + "learning_rate": 1.5e-05, + "loss": 0.8228, + "step": 17075 + }, + { + "epoch": 4.227441285537701, + "grad_norm": 0.69140625, + "learning_rate": 1.4871794871794872e-05, + "loss": 0.8216, + "step": 17100 + }, + { + "epoch": 4.2336217552534, + "grad_norm": 0.6796875, + "learning_rate": 1.4743589743589745e-05, + "loss": 0.8202, + "step": 17125 + }, + { + "epoch": 4.239802224969098, + "grad_norm": 0.6875, + "learning_rate": 1.4615384615384617e-05, + "loss": 0.8245, + "step": 17150 + }, + { + "epoch": 4.2459826946847965, + "grad_norm": 0.68359375, + "learning_rate": 1.4487179487179489e-05, + "loss": 0.8228, + "step": 17175 + }, + { + "epoch": 4.2521631644004945, + "grad_norm": 0.703125, + "learning_rate": 1.4358974358974359e-05, + "loss": 0.8222, + "step": 17200 + }, + { + "epoch": 4.258343634116192, + "grad_norm": 0.703125, + "learning_rate": 1.423076923076923e-05, + "loss": 0.8213, + "step": 17225 + }, + { + "epoch": 4.264524103831891, + "grad_norm": 0.6953125, + "learning_rate": 1.4102564102564104e-05, + "loss": 0.8226, + "step": 17250 + }, + { + "epoch": 4.270704573547589, + "grad_norm": 0.703125, + "learning_rate": 1.3974358974358975e-05, + "loss": 0.8171, + "step": 17275 + }, + { + "epoch": 4.276885043263288, + "grad_norm": 0.70703125, + "learning_rate": 1.3846153846153847e-05, + "loss": 0.8161, + "step": 17300 + }, + { + "epoch": 4.283065512978986, + "grad_norm": 0.70703125, + "learning_rate": 1.371794871794872e-05, + "loss": 0.8203, + "step": 17325 + }, + { + "epoch": 4.289245982694685, + "grad_norm": 0.69921875, + "learning_rate": 1.358974358974359e-05, + "loss": 0.8184, + "step": 17350 + }, + { + "epoch": 4.295426452410383, + "grad_norm": 0.7109375, + "learning_rate": 1.3461538461538462e-05, + "loss": 0.8223, + "step": 17375 + }, + { + "epoch": 4.301606922126082, + "grad_norm": 0.68359375, + "learning_rate": 1.3333333333333333e-05, + "loss": 0.8207, + "step": 17400 + }, + { + "epoch": 4.30778739184178, + "grad_norm": 0.68359375, + "learning_rate": 1.3205128205128207e-05, + "loss": 0.8233, + "step": 17425 + }, + { + "epoch": 4.313967861557479, + "grad_norm": 0.69921875, + "learning_rate": 1.3076923076923078e-05, + "loss": 0.8211, + "step": 17450 + }, + { + "epoch": 4.3201483312731765, + "grad_norm": 0.73046875, + "learning_rate": 1.294871794871795e-05, + "loss": 0.8193, + "step": 17475 + }, + { + "epoch": 4.326328800988875, + "grad_norm": 0.6875, + "learning_rate": 1.282051282051282e-05, + "loss": 0.8207, + "step": 17500 + }, + { + "epoch": 4.332509270704573, + "grad_norm": 0.6796875, + "learning_rate": 1.2692307692307691e-05, + "loss": 0.8247, + "step": 17525 + }, + { + "epoch": 4.338689740420272, + "grad_norm": 0.69140625, + "learning_rate": 1.2564102564102565e-05, + "loss": 0.826, + "step": 17550 + }, + { + "epoch": 4.34487021013597, + "grad_norm": 0.6953125, + "learning_rate": 1.2435897435897436e-05, + "loss": 0.82, + "step": 17575 + }, + { + "epoch": 4.351050679851669, + "grad_norm": 0.6953125, + "learning_rate": 1.230769230769231e-05, + "loss": 0.8161, + "step": 17600 + }, + { + "epoch": 4.357231149567367, + "grad_norm": 0.67578125, + "learning_rate": 1.217948717948718e-05, + "loss": 0.826, + "step": 17625 + }, + { + "epoch": 4.363411619283066, + "grad_norm": 0.68359375, + "learning_rate": 1.2051282051282051e-05, + "loss": 0.8199, + "step": 17650 + }, + { + "epoch": 4.369592088998764, + "grad_norm": 0.703125, + "learning_rate": 1.1923076923076925e-05, + "loss": 0.8251, + "step": 17675 + }, + { + "epoch": 4.375772558714463, + "grad_norm": 0.6953125, + "learning_rate": 1.1794871794871795e-05, + "loss": 0.8209, + "step": 17700 + }, + { + "epoch": 4.381953028430161, + "grad_norm": 0.6953125, + "learning_rate": 1.1666666666666668e-05, + "loss": 0.8206, + "step": 17725 + }, + { + "epoch": 4.388133498145859, + "grad_norm": 0.6875, + "learning_rate": 1.153846153846154e-05, + "loss": 0.8209, + "step": 17750 + }, + { + "epoch": 4.3943139678615575, + "grad_norm": 0.6875, + "learning_rate": 1.1410256410256411e-05, + "loss": 0.8218, + "step": 17775 + }, + { + "epoch": 4.400494437577256, + "grad_norm": 0.68359375, + "learning_rate": 1.1282051282051283e-05, + "loss": 0.8216, + "step": 17800 + }, + { + "epoch": 4.406674907292954, + "grad_norm": 0.6953125, + "learning_rate": 1.1153846153846154e-05, + "loss": 0.82, + "step": 17825 + }, + { + "epoch": 4.412855377008652, + "grad_norm": 0.69921875, + "learning_rate": 1.1025641025641026e-05, + "loss": 0.8224, + "step": 17850 + }, + { + "epoch": 4.419035846724351, + "grad_norm": 0.6953125, + "learning_rate": 1.0897435897435898e-05, + "loss": 0.8209, + "step": 17875 + }, + { + "epoch": 4.425216316440049, + "grad_norm": 0.69140625, + "learning_rate": 1.0769230769230771e-05, + "loss": 0.8198, + "step": 17900 + }, + { + "epoch": 4.431396786155748, + "grad_norm": 0.703125, + "learning_rate": 1.0641025641025641e-05, + "loss": 0.8259, + "step": 17925 + }, + { + "epoch": 4.437577255871446, + "grad_norm": 0.6953125, + "learning_rate": 1.0512820512820514e-05, + "loss": 0.8211, + "step": 17950 + }, + { + "epoch": 4.443757725587145, + "grad_norm": 0.66796875, + "learning_rate": 1.0384615384615386e-05, + "loss": 0.8213, + "step": 17975 + }, + { + "epoch": 4.449938195302843, + "grad_norm": 0.703125, + "learning_rate": 1.0256410256410256e-05, + "loss": 0.8266, + "step": 18000 + }, + { + "epoch": 4.456118665018542, + "grad_norm": 0.69921875, + "learning_rate": 1.012820512820513e-05, + "loss": 0.821, + "step": 18025 + }, + { + "epoch": 4.46229913473424, + "grad_norm": 0.67578125, + "learning_rate": 1e-05, + "loss": 0.8223, + "step": 18050 + }, + { + "epoch": 4.468479604449938, + "grad_norm": 0.6953125, + "learning_rate": 9.871794871794872e-06, + "loss": 0.8216, + "step": 18075 + }, + { + "epoch": 4.474660074165636, + "grad_norm": 0.703125, + "learning_rate": 9.743589743589744e-06, + "loss": 0.8183, + "step": 18100 + }, + { + "epoch": 4.480840543881335, + "grad_norm": 0.6953125, + "learning_rate": 9.615384615384616e-06, + "loss": 0.8243, + "step": 18125 + }, + { + "epoch": 4.487021013597033, + "grad_norm": 0.71484375, + "learning_rate": 9.487179487179487e-06, + "loss": 0.8197, + "step": 18150 + }, + { + "epoch": 4.493201483312732, + "grad_norm": 0.71484375, + "learning_rate": 9.358974358974359e-06, + "loss": 0.8243, + "step": 18175 + }, + { + "epoch": 4.49938195302843, + "grad_norm": 0.703125, + "learning_rate": 9.230769230769232e-06, + "loss": 0.8211, + "step": 18200 + }, + { + "epoch": 4.505562422744129, + "grad_norm": 0.6953125, + "learning_rate": 9.102564102564102e-06, + "loss": 0.8222, + "step": 18225 + }, + { + "epoch": 4.511742892459827, + "grad_norm": 0.6953125, + "learning_rate": 8.974358974358976e-06, + "loss": 0.8225, + "step": 18250 + }, + { + "epoch": 4.517923362175526, + "grad_norm": 0.69140625, + "learning_rate": 8.846153846153847e-06, + "loss": 0.8261, + "step": 18275 + }, + { + "epoch": 4.524103831891224, + "grad_norm": 0.6796875, + "learning_rate": 8.717948717948717e-06, + "loss": 0.8234, + "step": 18300 + }, + { + "epoch": 4.5302843016069225, + "grad_norm": 0.6796875, + "learning_rate": 8.58974358974359e-06, + "loss": 0.8213, + "step": 18325 + }, + { + "epoch": 4.5364647713226205, + "grad_norm": 0.70703125, + "learning_rate": 8.461538461538462e-06, + "loss": 0.8272, + "step": 18350 + }, + { + "epoch": 4.5426452410383185, + "grad_norm": 0.6796875, + "learning_rate": 8.333333333333334e-06, + "loss": 0.8223, + "step": 18375 + }, + { + "epoch": 4.548825710754017, + "grad_norm": 0.70703125, + "learning_rate": 8.205128205128205e-06, + "loss": 0.8218, + "step": 18400 + }, + { + "epoch": 4.555006180469716, + "grad_norm": 0.6875, + "learning_rate": 8.076923076923077e-06, + "loss": 0.8195, + "step": 18425 + }, + { + "epoch": 4.561186650185414, + "grad_norm": 0.6640625, + "learning_rate": 7.948717948717949e-06, + "loss": 0.8222, + "step": 18450 + }, + { + "epoch": 4.567367119901112, + "grad_norm": 0.69140625, + "learning_rate": 7.82051282051282e-06, + "loss": 0.8212, + "step": 18475 + }, + { + "epoch": 4.573547589616811, + "grad_norm": 0.703125, + "learning_rate": 7.692307692307694e-06, + "loss": 0.8263, + "step": 18500 + }, + { + "epoch": 4.579728059332509, + "grad_norm": 0.67578125, + "learning_rate": 7.564102564102564e-06, + "loss": 0.824, + "step": 18525 + }, + { + "epoch": 4.585908529048208, + "grad_norm": 0.7109375, + "learning_rate": 7.435897435897436e-06, + "loss": 0.8189, + "step": 18550 + }, + { + "epoch": 4.592088998763906, + "grad_norm": 0.6953125, + "learning_rate": 7.3076923076923085e-06, + "loss": 0.8213, + "step": 18575 + }, + { + "epoch": 4.598269468479605, + "grad_norm": 0.6796875, + "learning_rate": 7.179487179487179e-06, + "loss": 0.8183, + "step": 18600 + }, + { + "epoch": 4.604449938195303, + "grad_norm": 0.6875, + "learning_rate": 7.051282051282052e-06, + "loss": 0.8203, + "step": 18625 + }, + { + "epoch": 4.610630407911001, + "grad_norm": 0.671875, + "learning_rate": 6.923076923076923e-06, + "loss": 0.819, + "step": 18650 + }, + { + "epoch": 4.616810877626699, + "grad_norm": 0.73828125, + "learning_rate": 6.794871794871795e-06, + "loss": 0.8235, + "step": 18675 + }, + { + "epoch": 4.622991347342398, + "grad_norm": 0.69140625, + "learning_rate": 6.666666666666667e-06, + "loss": 0.8229, + "step": 18700 + }, + { + "epoch": 4.629171817058096, + "grad_norm": 0.6796875, + "learning_rate": 6.538461538461539e-06, + "loss": 0.8189, + "step": 18725 + }, + { + "epoch": 4.635352286773795, + "grad_norm": 0.6875, + "learning_rate": 6.41025641025641e-06, + "loss": 0.8224, + "step": 18750 + }, + { + "epoch": 4.641532756489493, + "grad_norm": 0.66796875, + "learning_rate": 6.282051282051282e-06, + "loss": 0.8225, + "step": 18775 + }, + { + "epoch": 4.647713226205192, + "grad_norm": 0.67578125, + "learning_rate": 6.153846153846155e-06, + "loss": 0.8194, + "step": 18800 + }, + { + "epoch": 4.65389369592089, + "grad_norm": 0.68359375, + "learning_rate": 6.025641025641026e-06, + "loss": 0.8225, + "step": 18825 + }, + { + "epoch": 4.660074165636589, + "grad_norm": 0.6875, + "learning_rate": 5.897435897435897e-06, + "loss": 0.8169, + "step": 18850 + }, + { + "epoch": 4.666254635352287, + "grad_norm": 0.6953125, + "learning_rate": 5.76923076923077e-06, + "loss": 0.8153, + "step": 18875 + }, + { + "epoch": 4.6724351050679855, + "grad_norm": 0.671875, + "learning_rate": 5.641025641025641e-06, + "loss": 0.822, + "step": 18900 + }, + { + "epoch": 4.6786155747836835, + "grad_norm": 0.671875, + "learning_rate": 5.512820512820513e-06, + "loss": 0.822, + "step": 18925 + }, + { + "epoch": 4.684796044499382, + "grad_norm": 0.6796875, + "learning_rate": 5.3846153846153855e-06, + "loss": 0.8184, + "step": 18950 + }, + { + "epoch": 4.69097651421508, + "grad_norm": 0.69921875, + "learning_rate": 5.256410256410257e-06, + "loss": 0.8247, + "step": 18975 + }, + { + "epoch": 4.697156983930778, + "grad_norm": 0.67578125, + "learning_rate": 5.128205128205128e-06, + "loss": 0.8212, + "step": 19000 + }, + { + "epoch": 4.703337453646477, + "grad_norm": 0.671875, + "learning_rate": 5e-06, + "loss": 0.8226, + "step": 19025 + }, + { + "epoch": 4.709517923362176, + "grad_norm": 0.66796875, + "learning_rate": 4.871794871794872e-06, + "loss": 0.8206, + "step": 19050 + }, + { + "epoch": 4.715698393077874, + "grad_norm": 0.6796875, + "learning_rate": 4.743589743589744e-06, + "loss": 0.8183, + "step": 19075 + }, + { + "epoch": 4.721878862793572, + "grad_norm": 0.69140625, + "learning_rate": 4.615384615384616e-06, + "loss": 0.8184, + "step": 19100 + }, + { + "epoch": 4.728059332509271, + "grad_norm": 0.70703125, + "learning_rate": 4.487179487179488e-06, + "loss": 0.8212, + "step": 19125 + }, + { + "epoch": 4.734239802224969, + "grad_norm": 0.66796875, + "learning_rate": 4.3589743589743586e-06, + "loss": 0.8198, + "step": 19150 + }, + { + "epoch": 4.740420271940668, + "grad_norm": 0.69921875, + "learning_rate": 4.230769230769231e-06, + "loss": 0.8235, + "step": 19175 + }, + { + "epoch": 4.746600741656366, + "grad_norm": 0.6796875, + "learning_rate": 4.102564102564103e-06, + "loss": 0.8228, + "step": 19200 + }, + { + "epoch": 4.752781211372064, + "grad_norm": 0.67578125, + "learning_rate": 3.974358974358974e-06, + "loss": 0.8278, + "step": 19225 + }, + { + "epoch": 4.758961681087762, + "grad_norm": 0.66015625, + "learning_rate": 3.846153846153847e-06, + "loss": 0.817, + "step": 19250 + }, + { + "epoch": 4.765142150803461, + "grad_norm": 0.6953125, + "learning_rate": 3.717948717948718e-06, + "loss": 0.8226, + "step": 19275 + }, + { + "epoch": 4.771322620519159, + "grad_norm": 0.6796875, + "learning_rate": 3.5897435897435896e-06, + "loss": 0.8218, + "step": 19300 + }, + { + "epoch": 4.777503090234858, + "grad_norm": 0.69921875, + "learning_rate": 3.4615384615384617e-06, + "loss": 0.8209, + "step": 19325 + }, + { + "epoch": 4.783683559950556, + "grad_norm": 0.6875, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.8231, + "step": 19350 + }, + { + "epoch": 4.789864029666255, + "grad_norm": 0.6875, + "learning_rate": 3.205128205128205e-06, + "loss": 0.8199, + "step": 19375 + }, + { + "epoch": 4.796044499381953, + "grad_norm": 0.68359375, + "learning_rate": 3.0769230769230774e-06, + "loss": 0.8214, + "step": 19400 + }, + { + "epoch": 4.802224969097652, + "grad_norm": 0.6875, + "learning_rate": 2.9487179487179486e-06, + "loss": 0.8221, + "step": 19425 + }, + { + "epoch": 4.80840543881335, + "grad_norm": 0.6796875, + "learning_rate": 2.8205128205128207e-06, + "loss": 0.8198, + "step": 19450 + }, + { + "epoch": 4.8145859085290486, + "grad_norm": 0.68359375, + "learning_rate": 2.6923076923076928e-06, + "loss": 0.8195, + "step": 19475 + }, + { + "epoch": 4.8207663782447465, + "grad_norm": 0.69140625, + "learning_rate": 2.564102564102564e-06, + "loss": 0.8217, + "step": 19500 + }, + { + "epoch": 4.826946847960445, + "grad_norm": 0.6875, + "learning_rate": 2.435897435897436e-06, + "loss": 0.8185, + "step": 19525 + }, + { + "epoch": 4.833127317676143, + "grad_norm": 0.68359375, + "learning_rate": 2.307692307692308e-06, + "loss": 0.8196, + "step": 19550 + }, + { + "epoch": 4.839307787391842, + "grad_norm": 0.69140625, + "learning_rate": 2.1794871794871793e-06, + "loss": 0.8204, + "step": 19575 + }, + { + "epoch": 4.84548825710754, + "grad_norm": 0.66796875, + "learning_rate": 2.0512820512820513e-06, + "loss": 0.8201, + "step": 19600 + }, + { + "epoch": 4.851668726823238, + "grad_norm": 0.67578125, + "learning_rate": 1.9230769230769234e-06, + "loss": 0.8229, + "step": 19625 + }, + { + "epoch": 4.857849196538937, + "grad_norm": 0.6640625, + "learning_rate": 1.7948717948717948e-06, + "loss": 0.8222, + "step": 19650 + }, + { + "epoch": 4.864029666254636, + "grad_norm": 0.6796875, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.8232, + "step": 19675 + }, + { + "epoch": 4.870210135970334, + "grad_norm": 0.69140625, + "learning_rate": 1.5384615384615387e-06, + "loss": 0.8224, + "step": 19700 + }, + { + "epoch": 4.876390605686032, + "grad_norm": 0.66796875, + "learning_rate": 1.4102564102564104e-06, + "loss": 0.8235, + "step": 19725 + }, + { + "epoch": 4.882571075401731, + "grad_norm": 0.671875, + "learning_rate": 1.282051282051282e-06, + "loss": 0.8164, + "step": 19750 + }, + { + "epoch": 4.888751545117429, + "grad_norm": 0.69140625, + "learning_rate": 1.153846153846154e-06, + "loss": 0.8217, + "step": 19775 + }, + { + "epoch": 4.8949320148331275, + "grad_norm": 0.6875, + "learning_rate": 1.0256410256410257e-06, + "loss": 0.8185, + "step": 19800 + }, + { + "epoch": 4.901112484548825, + "grad_norm": 0.66796875, + "learning_rate": 8.974358974358974e-07, + "loss": 0.8256, + "step": 19825 + }, + { + "epoch": 4.907292954264524, + "grad_norm": 0.68359375, + "learning_rate": 7.692307692307694e-07, + "loss": 0.8235, + "step": 19850 + }, + { + "epoch": 4.913473423980222, + "grad_norm": 0.67578125, + "learning_rate": 6.41025641025641e-07, + "loss": 0.8234, + "step": 19875 + }, + { + "epoch": 4.919653893695921, + "grad_norm": 0.6953125, + "learning_rate": 5.128205128205128e-07, + "loss": 0.8208, + "step": 19900 + }, + { + "epoch": 4.925834363411619, + "grad_norm": 0.68359375, + "learning_rate": 3.846153846153847e-07, + "loss": 0.8235, + "step": 19925 + }, + { + "epoch": 4.932014833127318, + "grad_norm": 0.67578125, + "learning_rate": 2.564102564102564e-07, + "loss": 0.8218, + "step": 19950 + }, + { + "epoch": 4.938195302843016, + "grad_norm": 0.67578125, + "learning_rate": 1.282051282051282e-07, + "loss": 0.8207, + "step": 19975 + }, + { + "epoch": 4.944375772558715, + "grad_norm": 0.6796875, + "learning_rate": 0.0, + "loss": 0.8191, + "step": 20000 + }, + { + "epoch": 4.944375772558715, + "eval_loss": 1.0042184591293335, + "eval_runtime": 1.5308, + "eval_samples_per_second": 417.441, + "eval_steps_per_second": 1.96, + "step": 20000 + }, + { + "epoch": 4.944375772558715, + "step": 20000, + "total_flos": 9.058112140235663e+19, + "train_loss": 1.0958750234603882, + "train_runtime": 36068.0493, + "train_samples_per_second": 141.954, + "train_steps_per_second": 0.555 + } + ], + "logging_steps": 25, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 5000, + "total_flos": 9.058112140235663e+19, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}