diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,14662 @@ +{ + "best_metric": 0.32263943552970886, + "best_model_checkpoint": "./cocoa_outputs_mobilenet/checkpoint-3724", + "epoch": 100.0, + "eval_steps": 500, + "global_step": 19600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.05102040816326531, + "grad_norm": 17.002086639404297, + "learning_rate": 1.998979591836735e-05, + "loss": 1.561, + "step": 10 + }, + { + "epoch": 0.10204081632653061, + "grad_norm": 15.477813720703125, + "learning_rate": 1.9979591836734697e-05, + "loss": 1.4808, + "step": 20 + }, + { + "epoch": 0.15306122448979592, + "grad_norm": 15.09716796875, + "learning_rate": 1.9969387755102042e-05, + "loss": 1.33, + "step": 30 + }, + { + "epoch": 0.20408163265306123, + "grad_norm": 13.078636169433594, + "learning_rate": 1.9959183673469388e-05, + "loss": 1.183, + "step": 40 + }, + { + "epoch": 0.25510204081632654, + "grad_norm": 11.537144660949707, + "learning_rate": 1.9948979591836737e-05, + "loss": 1.0943, + "step": 50 + }, + { + "epoch": 0.30612244897959184, + "grad_norm": 10.925792694091797, + "learning_rate": 1.9938775510204083e-05, + "loss": 0.8695, + "step": 60 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 10.022773742675781, + "learning_rate": 1.992857142857143e-05, + "loss": 0.8132, + "step": 70 + }, + { + "epoch": 0.40816326530612246, + "grad_norm": 7.336289405822754, + "learning_rate": 1.9918367346938775e-05, + "loss": 0.7434, + "step": 80 + }, + { + "epoch": 0.45918367346938777, + "grad_norm": 11.058115005493164, + "learning_rate": 1.9908163265306124e-05, + "loss": 0.9563, + "step": 90 + }, + { + "epoch": 0.5102040816326531, + "grad_norm": 12.82748031616211, + "learning_rate": 1.9897959183673473e-05, + "loss": 0.6982, + "step": 100 + }, + { + "epoch": 0.5612244897959183, + "grad_norm": 13.475491523742676, + "learning_rate": 1.988775510204082e-05, + "loss": 0.6721, + "step": 110 + }, + { + "epoch": 0.6122448979591837, + "grad_norm": 6.08135461807251, + "learning_rate": 1.9877551020408165e-05, + "loss": 0.6415, + "step": 120 + }, + { + "epoch": 0.6632653061224489, + "grad_norm": 10.286223411560059, + "learning_rate": 1.986734693877551e-05, + "loss": 0.5412, + "step": 130 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 3.4252562522888184, + "learning_rate": 1.985714285714286e-05, + "loss": 0.5227, + "step": 140 + }, + { + "epoch": 0.7653061224489796, + "grad_norm": 12.471209526062012, + "learning_rate": 1.9846938775510205e-05, + "loss": 0.6043, + "step": 150 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 10.609465599060059, + "learning_rate": 1.983673469387755e-05, + "loss": 0.7734, + "step": 160 + }, + { + "epoch": 0.8673469387755102, + "grad_norm": 13.982590675354004, + "learning_rate": 1.9826530612244897e-05, + "loss": 0.5588, + "step": 170 + }, + { + "epoch": 0.9183673469387755, + "grad_norm": 4.562811851501465, + "learning_rate": 1.9816326530612246e-05, + "loss": 0.5498, + "step": 180 + }, + { + "epoch": 0.9693877551020408, + "grad_norm": 10.520743370056152, + "learning_rate": 1.9806122448979595e-05, + "loss": 0.569, + "step": 190 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.8628158844765343, + "eval_loss": 0.5071913599967957, + "eval_runtime": 1.0115, + "eval_samples_per_second": 273.859, + "eval_steps_per_second": 34.603, + "step": 196 + }, + { + "epoch": 1.0204081632653061, + "grad_norm": 22.65924835205078, + "learning_rate": 1.979591836734694e-05, + "loss": 0.7331, + "step": 200 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 15.467998504638672, + "learning_rate": 1.9785714285714287e-05, + "loss": 0.4775, + "step": 210 + }, + { + "epoch": 1.1224489795918366, + "grad_norm": 18.562397003173828, + "learning_rate": 1.9775510204081633e-05, + "loss": 0.5106, + "step": 220 + }, + { + "epoch": 1.1734693877551021, + "grad_norm": 14.723051071166992, + "learning_rate": 1.9765306122448982e-05, + "loss": 0.4772, + "step": 230 + }, + { + "epoch": 1.2244897959183674, + "grad_norm": 9.702923774719238, + "learning_rate": 1.9755102040816328e-05, + "loss": 0.3284, + "step": 240 + }, + { + "epoch": 1.2755102040816326, + "grad_norm": 13.233383178710938, + "learning_rate": 1.9744897959183677e-05, + "loss": 0.8556, + "step": 250 + }, + { + "epoch": 1.3265306122448979, + "grad_norm": 3.862335443496704, + "learning_rate": 1.9734693877551023e-05, + "loss": 0.5364, + "step": 260 + }, + { + "epoch": 1.3775510204081631, + "grad_norm": 3.4525458812713623, + "learning_rate": 1.972448979591837e-05, + "loss": 0.505, + "step": 270 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 2.733076572418213, + "learning_rate": 1.9714285714285718e-05, + "loss": 0.3426, + "step": 280 + }, + { + "epoch": 1.4795918367346939, + "grad_norm": 20.27985382080078, + "learning_rate": 1.9704081632653063e-05, + "loss": 0.8935, + "step": 290 + }, + { + "epoch": 1.5306122448979593, + "grad_norm": 19.398176193237305, + "learning_rate": 1.969387755102041e-05, + "loss": 0.6985, + "step": 300 + }, + { + "epoch": 1.5816326530612246, + "grad_norm": 6.593906402587891, + "learning_rate": 1.9683673469387755e-05, + "loss": 0.5255, + "step": 310 + }, + { + "epoch": 1.6326530612244898, + "grad_norm": 9.199732780456543, + "learning_rate": 1.9673469387755104e-05, + "loss": 0.5397, + "step": 320 + }, + { + "epoch": 1.683673469387755, + "grad_norm": 12.648165702819824, + "learning_rate": 1.966326530612245e-05, + "loss": 0.7595, + "step": 330 + }, + { + "epoch": 1.7346938775510203, + "grad_norm": 18.061668395996094, + "learning_rate": 1.96530612244898e-05, + "loss": 0.6119, + "step": 340 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 13.193414688110352, + "learning_rate": 1.9642857142857145e-05, + "loss": 0.3622, + "step": 350 + }, + { + "epoch": 1.836734693877551, + "grad_norm": 8.120110511779785, + "learning_rate": 1.963265306122449e-05, + "loss": 0.6294, + "step": 360 + }, + { + "epoch": 1.8877551020408163, + "grad_norm": 13.86812973022461, + "learning_rate": 1.962244897959184e-05, + "loss": 0.5736, + "step": 370 + }, + { + "epoch": 1.9387755102040818, + "grad_norm": 6.705575466156006, + "learning_rate": 1.9612244897959186e-05, + "loss": 0.5637, + "step": 380 + }, + { + "epoch": 1.989795918367347, + "grad_norm": 4.23462438583374, + "learning_rate": 1.960204081632653e-05, + "loss": 0.3973, + "step": 390 + }, + { + "epoch": 2.0, + "eval_accuracy": 0.8700361010830325, + "eval_loss": 0.4277961552143097, + "eval_runtime": 1.0511, + "eval_samples_per_second": 263.531, + "eval_steps_per_second": 33.298, + "step": 392 + }, + { + "epoch": 2.0408163265306123, + "grad_norm": 8.447417259216309, + "learning_rate": 1.9591836734693877e-05, + "loss": 0.4196, + "step": 400 + }, + { + "epoch": 2.0918367346938775, + "grad_norm": 8.11129379272461, + "learning_rate": 1.9581632653061227e-05, + "loss": 0.521, + "step": 410 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 14.704895973205566, + "learning_rate": 1.9571428571428572e-05, + "loss": 0.477, + "step": 420 + }, + { + "epoch": 2.193877551020408, + "grad_norm": 13.411709785461426, + "learning_rate": 1.956122448979592e-05, + "loss": 0.47, + "step": 430 + }, + { + "epoch": 2.2448979591836733, + "grad_norm": 6.015109062194824, + "learning_rate": 1.9551020408163267e-05, + "loss": 0.3761, + "step": 440 + }, + { + "epoch": 2.295918367346939, + "grad_norm": 6.301174640655518, + "learning_rate": 1.9540816326530613e-05, + "loss": 0.5934, + "step": 450 + }, + { + "epoch": 2.3469387755102042, + "grad_norm": 6.928921699523926, + "learning_rate": 1.9530612244897962e-05, + "loss": 0.3134, + "step": 460 + }, + { + "epoch": 2.3979591836734695, + "grad_norm": 10.26938247680664, + "learning_rate": 1.9520408163265308e-05, + "loss": 0.5905, + "step": 470 + }, + { + "epoch": 2.4489795918367347, + "grad_norm": 2.8699488639831543, + "learning_rate": 1.9510204081632654e-05, + "loss": 0.556, + "step": 480 + }, + { + "epoch": 2.5, + "grad_norm": 3.894005298614502, + "learning_rate": 1.95e-05, + "loss": 0.3534, + "step": 490 + }, + { + "epoch": 2.5510204081632653, + "grad_norm": 8.60104751586914, + "learning_rate": 1.948979591836735e-05, + "loss": 0.6635, + "step": 500 + }, + { + "epoch": 2.6020408163265305, + "grad_norm": 1.6992284059524536, + "learning_rate": 1.9479591836734695e-05, + "loss": 0.4121, + "step": 510 + }, + { + "epoch": 2.6530612244897958, + "grad_norm": 4.700980186462402, + "learning_rate": 1.9469387755102044e-05, + "loss": 0.4366, + "step": 520 + }, + { + "epoch": 2.704081632653061, + "grad_norm": 23.889366149902344, + "learning_rate": 1.945918367346939e-05, + "loss": 0.6569, + "step": 530 + }, + { + "epoch": 2.7551020408163263, + "grad_norm": 17.792743682861328, + "learning_rate": 1.9448979591836735e-05, + "loss": 0.7444, + "step": 540 + }, + { + "epoch": 2.806122448979592, + "grad_norm": 7.905496120452881, + "learning_rate": 1.9438775510204085e-05, + "loss": 0.3912, + "step": 550 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 11.077522277832031, + "learning_rate": 1.942857142857143e-05, + "loss": 0.4986, + "step": 560 + }, + { + "epoch": 2.9081632653061225, + "grad_norm": 11.153145790100098, + "learning_rate": 1.941836734693878e-05, + "loss": 0.5295, + "step": 570 + }, + { + "epoch": 2.9591836734693877, + "grad_norm": 16.30963134765625, + "learning_rate": 1.9408163265306122e-05, + "loss": 0.5873, + "step": 580 + }, + { + "epoch": 3.0, + "eval_accuracy": 0.8772563176895307, + "eval_loss": 0.41384199261665344, + "eval_runtime": 0.9925, + "eval_samples_per_second": 279.099, + "eval_steps_per_second": 35.265, + "step": 588 + }, + { + "epoch": 3.010204081632653, + "grad_norm": 14.340892791748047, + "learning_rate": 1.939795918367347e-05, + "loss": 0.5563, + "step": 590 + }, + { + "epoch": 3.061224489795918, + "grad_norm": 6.730650901794434, + "learning_rate": 1.9387755102040817e-05, + "loss": 0.5747, + "step": 600 + }, + { + "epoch": 3.1122448979591835, + "grad_norm": 8.476055145263672, + "learning_rate": 1.9377551020408166e-05, + "loss": 0.4642, + "step": 610 + }, + { + "epoch": 3.163265306122449, + "grad_norm": 7.402491569519043, + "learning_rate": 1.9367346938775512e-05, + "loss": 0.4699, + "step": 620 + }, + { + "epoch": 3.2142857142857144, + "grad_norm": 8.689496994018555, + "learning_rate": 1.9357142857142858e-05, + "loss": 0.444, + "step": 630 + }, + { + "epoch": 3.2653061224489797, + "grad_norm": 17.033435821533203, + "learning_rate": 1.9346938775510207e-05, + "loss": 0.4847, + "step": 640 + }, + { + "epoch": 3.316326530612245, + "grad_norm": 9.383002281188965, + "learning_rate": 1.9336734693877553e-05, + "loss": 0.4156, + "step": 650 + }, + { + "epoch": 3.36734693877551, + "grad_norm": 17.23059844970703, + "learning_rate": 1.9326530612244902e-05, + "loss": 0.5766, + "step": 660 + }, + { + "epoch": 3.4183673469387754, + "grad_norm": 16.385866165161133, + "learning_rate": 1.9316326530612248e-05, + "loss": 0.498, + "step": 670 + }, + { + "epoch": 3.4693877551020407, + "grad_norm": 3.461780309677124, + "learning_rate": 1.9306122448979593e-05, + "loss": 0.4648, + "step": 680 + }, + { + "epoch": 3.520408163265306, + "grad_norm": 6.998974800109863, + "learning_rate": 1.929591836734694e-05, + "loss": 0.3759, + "step": 690 + }, + { + "epoch": 3.571428571428571, + "grad_norm": 5.785035610198975, + "learning_rate": 1.928571428571429e-05, + "loss": 0.4471, + "step": 700 + }, + { + "epoch": 3.622448979591837, + "grad_norm": 4.5334577560424805, + "learning_rate": 1.9275510204081634e-05, + "loss": 0.4486, + "step": 710 + }, + { + "epoch": 3.673469387755102, + "grad_norm": 11.439085006713867, + "learning_rate": 1.926530612244898e-05, + "loss": 0.6783, + "step": 720 + }, + { + "epoch": 3.7244897959183674, + "grad_norm": 10.517547607421875, + "learning_rate": 1.925510204081633e-05, + "loss": 0.4912, + "step": 730 + }, + { + "epoch": 3.7755102040816326, + "grad_norm": 16.347517013549805, + "learning_rate": 1.9244897959183675e-05, + "loss": 0.3382, + "step": 740 + }, + { + "epoch": 3.826530612244898, + "grad_norm": 4.203049182891846, + "learning_rate": 1.9234693877551024e-05, + "loss": 0.7066, + "step": 750 + }, + { + "epoch": 3.877551020408163, + "grad_norm": 5.946165084838867, + "learning_rate": 1.922448979591837e-05, + "loss": 0.4015, + "step": 760 + }, + { + "epoch": 3.928571428571429, + "grad_norm": 5.714838981628418, + "learning_rate": 1.9214285714285716e-05, + "loss": 0.5023, + "step": 770 + }, + { + "epoch": 3.979591836734694, + "grad_norm": 2.5401012897491455, + "learning_rate": 1.920408163265306e-05, + "loss": 0.4781, + "step": 780 + }, + { + "epoch": 4.0, + "eval_accuracy": 0.8736462093862816, + "eval_loss": 0.4718337059020996, + "eval_runtime": 0.996, + "eval_samples_per_second": 278.104, + "eval_steps_per_second": 35.14, + "step": 784 + }, + { + "epoch": 4.030612244897959, + "grad_norm": 4.3500213623046875, + "learning_rate": 1.919387755102041e-05, + "loss": 0.4059, + "step": 790 + }, + { + "epoch": 4.081632653061225, + "grad_norm": 2.535583734512329, + "learning_rate": 1.9183673469387756e-05, + "loss": 0.4828, + "step": 800 + }, + { + "epoch": 4.13265306122449, + "grad_norm": 4.568256378173828, + "learning_rate": 1.9173469387755102e-05, + "loss": 0.3389, + "step": 810 + }, + { + "epoch": 4.183673469387755, + "grad_norm": 14.542338371276855, + "learning_rate": 1.916326530612245e-05, + "loss": 0.457, + "step": 820 + }, + { + "epoch": 4.23469387755102, + "grad_norm": 12.735268592834473, + "learning_rate": 1.9153061224489797e-05, + "loss": 0.5138, + "step": 830 + }, + { + "epoch": 4.285714285714286, + "grad_norm": 8.478500366210938, + "learning_rate": 1.9142857142857146e-05, + "loss": 0.3745, + "step": 840 + }, + { + "epoch": 4.336734693877551, + "grad_norm": 10.12189769744873, + "learning_rate": 1.9132653061224492e-05, + "loss": 0.3876, + "step": 850 + }, + { + "epoch": 4.387755102040816, + "grad_norm": 3.727886438369751, + "learning_rate": 1.9122448979591838e-05, + "loss": 0.4871, + "step": 860 + }, + { + "epoch": 4.438775510204081, + "grad_norm": 7.818300247192383, + "learning_rate": 1.9112244897959184e-05, + "loss": 0.3809, + "step": 870 + }, + { + "epoch": 4.489795918367347, + "grad_norm": 7.874812126159668, + "learning_rate": 1.9102040816326533e-05, + "loss": 0.5693, + "step": 880 + }, + { + "epoch": 4.540816326530612, + "grad_norm": 16.233993530273438, + "learning_rate": 1.909183673469388e-05, + "loss": 0.4754, + "step": 890 + }, + { + "epoch": 4.591836734693878, + "grad_norm": 14.693424224853516, + "learning_rate": 1.9081632653061225e-05, + "loss": 0.5392, + "step": 900 + }, + { + "epoch": 4.642857142857143, + "grad_norm": 18.429035186767578, + "learning_rate": 1.9071428571428574e-05, + "loss": 0.6257, + "step": 910 + }, + { + "epoch": 4.6938775510204085, + "grad_norm": 12.089123725891113, + "learning_rate": 1.906122448979592e-05, + "loss": 0.3743, + "step": 920 + }, + { + "epoch": 4.744897959183674, + "grad_norm": 2.348433256149292, + "learning_rate": 1.905102040816327e-05, + "loss": 0.4041, + "step": 930 + }, + { + "epoch": 4.795918367346939, + "grad_norm": 3.423184633255005, + "learning_rate": 1.9040816326530614e-05, + "loss": 0.4046, + "step": 940 + }, + { + "epoch": 4.846938775510204, + "grad_norm": 16.882726669311523, + "learning_rate": 1.903061224489796e-05, + "loss": 0.4895, + "step": 950 + }, + { + "epoch": 4.8979591836734695, + "grad_norm": 3.8072357177734375, + "learning_rate": 1.9020408163265306e-05, + "loss": 0.4572, + "step": 960 + }, + { + "epoch": 4.948979591836735, + "grad_norm": 16.21807098388672, + "learning_rate": 1.9010204081632655e-05, + "loss": 0.7458, + "step": 970 + }, + { + "epoch": 5.0, + "grad_norm": 18.041349411010742, + "learning_rate": 1.9e-05, + "loss": 0.4483, + "step": 980 + }, + { + "epoch": 5.0, + "eval_accuracy": 0.8772563176895307, + "eval_loss": 0.4505952000617981, + "eval_runtime": 1.0691, + "eval_samples_per_second": 259.101, + "eval_steps_per_second": 32.738, + "step": 980 + }, + { + "epoch": 5.051020408163265, + "grad_norm": 12.381630897521973, + "learning_rate": 1.898979591836735e-05, + "loss": 0.3726, + "step": 990 + }, + { + "epoch": 5.1020408163265305, + "grad_norm": 7.038537502288818, + "learning_rate": 1.8979591836734696e-05, + "loss": 0.5766, + "step": 1000 + }, + { + "epoch": 5.153061224489796, + "grad_norm": 4.762441635131836, + "learning_rate": 1.8969387755102042e-05, + "loss": 0.27, + "step": 1010 + }, + { + "epoch": 5.204081632653061, + "grad_norm": 17.126434326171875, + "learning_rate": 1.895918367346939e-05, + "loss": 0.3328, + "step": 1020 + }, + { + "epoch": 5.255102040816326, + "grad_norm": 22.408201217651367, + "learning_rate": 1.8948979591836737e-05, + "loss": 0.6349, + "step": 1030 + }, + { + "epoch": 5.3061224489795915, + "grad_norm": 15.63797664642334, + "learning_rate": 1.8938775510204083e-05, + "loss": 0.4205, + "step": 1040 + }, + { + "epoch": 5.357142857142857, + "grad_norm": 12.86538028717041, + "learning_rate": 1.892857142857143e-05, + "loss": 0.4364, + "step": 1050 + }, + { + "epoch": 5.408163265306122, + "grad_norm": 3.4812543392181396, + "learning_rate": 1.8918367346938778e-05, + "loss": 0.6222, + "step": 1060 + }, + { + "epoch": 5.459183673469388, + "grad_norm": 7.06102180480957, + "learning_rate": 1.8908163265306123e-05, + "loss": 0.3908, + "step": 1070 + }, + { + "epoch": 5.510204081632653, + "grad_norm": 3.900890588760376, + "learning_rate": 1.8897959183673473e-05, + "loss": 0.5371, + "step": 1080 + }, + { + "epoch": 5.561224489795919, + "grad_norm": 4.5728983879089355, + "learning_rate": 1.888775510204082e-05, + "loss": 0.6384, + "step": 1090 + }, + { + "epoch": 5.612244897959184, + "grad_norm": 5.239672660827637, + "learning_rate": 1.8877551020408164e-05, + "loss": 0.7583, + "step": 1100 + }, + { + "epoch": 5.663265306122449, + "grad_norm": 11.084623336791992, + "learning_rate": 1.8867346938775513e-05, + "loss": 0.2961, + "step": 1110 + }, + { + "epoch": 5.714285714285714, + "grad_norm": 5.149864196777344, + "learning_rate": 1.885714285714286e-05, + "loss": 0.4336, + "step": 1120 + }, + { + "epoch": 5.76530612244898, + "grad_norm": 16.929899215698242, + "learning_rate": 1.8846938775510205e-05, + "loss": 0.445, + "step": 1130 + }, + { + "epoch": 5.816326530612245, + "grad_norm": 5.9687981605529785, + "learning_rate": 1.883673469387755e-05, + "loss": 0.3945, + "step": 1140 + }, + { + "epoch": 5.86734693877551, + "grad_norm": 11.565205574035645, + "learning_rate": 1.88265306122449e-05, + "loss": 0.3787, + "step": 1150 + }, + { + "epoch": 5.918367346938775, + "grad_norm": 12.327364921569824, + "learning_rate": 1.8816326530612246e-05, + "loss": 0.3156, + "step": 1160 + }, + { + "epoch": 5.969387755102041, + "grad_norm": 5.013162136077881, + "learning_rate": 1.8806122448979595e-05, + "loss": 0.655, + "step": 1170 + }, + { + "epoch": 6.0, + "eval_accuracy": 0.8953068592057761, + "eval_loss": 0.3685191869735718, + "eval_runtime": 1.0054, + "eval_samples_per_second": 275.525, + "eval_steps_per_second": 34.814, + "step": 1176 + }, + { + "epoch": 6.020408163265306, + "grad_norm": 12.914482116699219, + "learning_rate": 1.879591836734694e-05, + "loss": 0.5408, + "step": 1180 + }, + { + "epoch": 6.071428571428571, + "grad_norm": 10.637080192565918, + "learning_rate": 1.8785714285714286e-05, + "loss": 0.3746, + "step": 1190 + }, + { + "epoch": 6.122448979591836, + "grad_norm": 13.593528747558594, + "learning_rate": 1.8775510204081636e-05, + "loss": 0.4054, + "step": 1200 + }, + { + "epoch": 6.173469387755102, + "grad_norm": 3.464900493621826, + "learning_rate": 1.876530612244898e-05, + "loss": 0.3547, + "step": 1210 + }, + { + "epoch": 6.224489795918367, + "grad_norm": 5.895821571350098, + "learning_rate": 1.8755102040816327e-05, + "loss": 0.3833, + "step": 1220 + }, + { + "epoch": 6.275510204081632, + "grad_norm": 4.028122901916504, + "learning_rate": 1.8744897959183673e-05, + "loss": 0.1833, + "step": 1230 + }, + { + "epoch": 6.326530612244898, + "grad_norm": 10.497953414916992, + "learning_rate": 1.8734693877551022e-05, + "loss": 0.3449, + "step": 1240 + }, + { + "epoch": 6.377551020408164, + "grad_norm": 18.080278396606445, + "learning_rate": 1.8724489795918368e-05, + "loss": 0.8633, + "step": 1250 + }, + { + "epoch": 6.428571428571429, + "grad_norm": 10.073883056640625, + "learning_rate": 1.8714285714285717e-05, + "loss": 0.6491, + "step": 1260 + }, + { + "epoch": 6.479591836734694, + "grad_norm": 5.288699626922607, + "learning_rate": 1.8704081632653063e-05, + "loss": 0.4189, + "step": 1270 + }, + { + "epoch": 6.530612244897959, + "grad_norm": 11.190839767456055, + "learning_rate": 1.869387755102041e-05, + "loss": 0.5136, + "step": 1280 + }, + { + "epoch": 6.581632653061225, + "grad_norm": 5.8105621337890625, + "learning_rate": 1.8683673469387758e-05, + "loss": 0.3161, + "step": 1290 + }, + { + "epoch": 6.63265306122449, + "grad_norm": 14.419046401977539, + "learning_rate": 1.8673469387755104e-05, + "loss": 0.5268, + "step": 1300 + }, + { + "epoch": 6.683673469387755, + "grad_norm": 6.938192844390869, + "learning_rate": 1.866326530612245e-05, + "loss": 0.4193, + "step": 1310 + }, + { + "epoch": 6.73469387755102, + "grad_norm": 8.974387168884277, + "learning_rate": 1.8653061224489795e-05, + "loss": 0.3514, + "step": 1320 + }, + { + "epoch": 6.785714285714286, + "grad_norm": 19.685522079467773, + "learning_rate": 1.8642857142857144e-05, + "loss": 0.6266, + "step": 1330 + }, + { + "epoch": 6.836734693877551, + "grad_norm": 3.6391055583953857, + "learning_rate": 1.863265306122449e-05, + "loss": 0.5591, + "step": 1340 + }, + { + "epoch": 6.887755102040816, + "grad_norm": 10.755435943603516, + "learning_rate": 1.862244897959184e-05, + "loss": 0.3627, + "step": 1350 + }, + { + "epoch": 6.938775510204081, + "grad_norm": 12.739008903503418, + "learning_rate": 1.8612244897959185e-05, + "loss": 0.2863, + "step": 1360 + }, + { + "epoch": 6.989795918367347, + "grad_norm": 5.814560890197754, + "learning_rate": 1.860204081632653e-05, + "loss": 0.3441, + "step": 1370 + }, + { + "epoch": 7.0, + "eval_accuracy": 0.8772563176895307, + "eval_loss": 0.47507232427597046, + "eval_runtime": 1.001, + "eval_samples_per_second": 276.729, + "eval_steps_per_second": 34.966, + "step": 1372 + }, + { + "epoch": 7.040816326530612, + "grad_norm": 16.30525779724121, + "learning_rate": 1.859183673469388e-05, + "loss": 0.5459, + "step": 1380 + }, + { + "epoch": 7.091836734693878, + "grad_norm": 5.696544170379639, + "learning_rate": 1.8581632653061226e-05, + "loss": 0.2429, + "step": 1390 + }, + { + "epoch": 7.142857142857143, + "grad_norm": 9.33452320098877, + "learning_rate": 1.8571428571428575e-05, + "loss": 0.4018, + "step": 1400 + }, + { + "epoch": 7.1938775510204085, + "grad_norm": 17.027631759643555, + "learning_rate": 1.856122448979592e-05, + "loss": 0.6258, + "step": 1410 + }, + { + "epoch": 7.244897959183674, + "grad_norm": 8.29952621459961, + "learning_rate": 1.8551020408163267e-05, + "loss": 0.3525, + "step": 1420 + }, + { + "epoch": 7.295918367346939, + "grad_norm": 16.669879913330078, + "learning_rate": 1.8540816326530613e-05, + "loss": 0.571, + "step": 1430 + }, + { + "epoch": 7.346938775510204, + "grad_norm": 7.514557361602783, + "learning_rate": 1.853061224489796e-05, + "loss": 0.2177, + "step": 1440 + }, + { + "epoch": 7.3979591836734695, + "grad_norm": 17.391042709350586, + "learning_rate": 1.8520408163265307e-05, + "loss": 0.2053, + "step": 1450 + }, + { + "epoch": 7.448979591836735, + "grad_norm": 10.607004165649414, + "learning_rate": 1.8510204081632653e-05, + "loss": 0.6008, + "step": 1460 + }, + { + "epoch": 7.5, + "grad_norm": 10.863404273986816, + "learning_rate": 1.8500000000000002e-05, + "loss": 0.3618, + "step": 1470 + }, + { + "epoch": 7.551020408163265, + "grad_norm": 10.011690139770508, + "learning_rate": 1.8489795918367348e-05, + "loss": 0.4202, + "step": 1480 + }, + { + "epoch": 7.6020408163265305, + "grad_norm": 11.260356903076172, + "learning_rate": 1.8479591836734697e-05, + "loss": 0.3637, + "step": 1490 + }, + { + "epoch": 7.653061224489796, + "grad_norm": 5.600480079650879, + "learning_rate": 1.8469387755102043e-05, + "loss": 0.7131, + "step": 1500 + }, + { + "epoch": 7.704081632653061, + "grad_norm": 5.015355587005615, + "learning_rate": 1.845918367346939e-05, + "loss": 0.3751, + "step": 1510 + }, + { + "epoch": 7.755102040816326, + "grad_norm": 14.110769271850586, + "learning_rate": 1.8448979591836735e-05, + "loss": 0.4953, + "step": 1520 + }, + { + "epoch": 7.8061224489795915, + "grad_norm": 10.710803031921387, + "learning_rate": 1.8438775510204084e-05, + "loss": 0.3872, + "step": 1530 + }, + { + "epoch": 7.857142857142857, + "grad_norm": 9.471148490905762, + "learning_rate": 1.842857142857143e-05, + "loss": 0.5085, + "step": 1540 + }, + { + "epoch": 7.908163265306122, + "grad_norm": 4.084619522094727, + "learning_rate": 1.8418367346938776e-05, + "loss": 0.3712, + "step": 1550 + }, + { + "epoch": 7.959183673469388, + "grad_norm": 18.75293731689453, + "learning_rate": 1.8408163265306125e-05, + "loss": 0.3166, + "step": 1560 + }, + { + "epoch": 8.0, + "eval_accuracy": 0.8808664259927798, + "eval_loss": 0.3796345591545105, + "eval_runtime": 1.0133, + "eval_samples_per_second": 273.377, + "eval_steps_per_second": 34.542, + "step": 1568 + }, + { + "epoch": 8.010204081632653, + "grad_norm": 16.240253448486328, + "learning_rate": 1.839795918367347e-05, + "loss": 0.3991, + "step": 1570 + }, + { + "epoch": 8.061224489795919, + "grad_norm": 16.32931900024414, + "learning_rate": 1.838775510204082e-05, + "loss": 0.4878, + "step": 1580 + }, + { + "epoch": 8.112244897959183, + "grad_norm": 8.901126861572266, + "learning_rate": 1.8377551020408165e-05, + "loss": 0.6001, + "step": 1590 + }, + { + "epoch": 8.16326530612245, + "grad_norm": 1.1584454774856567, + "learning_rate": 1.836734693877551e-05, + "loss": 0.3942, + "step": 1600 + }, + { + "epoch": 8.214285714285714, + "grad_norm": 2.1453287601470947, + "learning_rate": 1.8357142857142857e-05, + "loss": 0.4165, + "step": 1610 + }, + { + "epoch": 8.26530612244898, + "grad_norm": 7.004159450531006, + "learning_rate": 1.8346938775510206e-05, + "loss": 0.4401, + "step": 1620 + }, + { + "epoch": 8.316326530612244, + "grad_norm": 4.452840805053711, + "learning_rate": 1.8336734693877552e-05, + "loss": 0.5952, + "step": 1630 + }, + { + "epoch": 8.36734693877551, + "grad_norm": 3.3416495323181152, + "learning_rate": 1.8326530612244898e-05, + "loss": 0.4834, + "step": 1640 + }, + { + "epoch": 8.418367346938776, + "grad_norm": 9.173787117004395, + "learning_rate": 1.8316326530612247e-05, + "loss": 0.2209, + "step": 1650 + }, + { + "epoch": 8.46938775510204, + "grad_norm": 3.4445853233337402, + "learning_rate": 1.8306122448979593e-05, + "loss": 0.4296, + "step": 1660 + }, + { + "epoch": 8.520408163265307, + "grad_norm": 10.2993803024292, + "learning_rate": 1.8295918367346942e-05, + "loss": 0.386, + "step": 1670 + }, + { + "epoch": 8.571428571428571, + "grad_norm": 9.194501876831055, + "learning_rate": 1.8285714285714288e-05, + "loss": 0.3287, + "step": 1680 + }, + { + "epoch": 8.622448979591837, + "grad_norm": 4.289522647857666, + "learning_rate": 1.8275510204081634e-05, + "loss": 0.2869, + "step": 1690 + }, + { + "epoch": 8.673469387755102, + "grad_norm": 9.634254455566406, + "learning_rate": 1.826530612244898e-05, + "loss": 0.5125, + "step": 1700 + }, + { + "epoch": 8.724489795918368, + "grad_norm": 11.490145683288574, + "learning_rate": 1.825510204081633e-05, + "loss": 0.4965, + "step": 1710 + }, + { + "epoch": 8.775510204081632, + "grad_norm": 2.314307928085327, + "learning_rate": 1.8244897959183674e-05, + "loss": 0.418, + "step": 1720 + }, + { + "epoch": 8.826530612244898, + "grad_norm": 13.09791374206543, + "learning_rate": 1.823469387755102e-05, + "loss": 0.5415, + "step": 1730 + }, + { + "epoch": 8.877551020408163, + "grad_norm": 17.422143936157227, + "learning_rate": 1.822448979591837e-05, + "loss": 0.3746, + "step": 1740 + }, + { + "epoch": 8.928571428571429, + "grad_norm": 1.750675916671753, + "learning_rate": 1.8214285714285715e-05, + "loss": 0.3917, + "step": 1750 + }, + { + "epoch": 8.979591836734693, + "grad_norm": 16.325149536132812, + "learning_rate": 1.8204081632653064e-05, + "loss": 0.5114, + "step": 1760 + }, + { + "epoch": 9.0, + "eval_accuracy": 0.8916967509025271, + "eval_loss": 0.4087030291557312, + "eval_runtime": 1.0068, + "eval_samples_per_second": 275.119, + "eval_steps_per_second": 34.762, + "step": 1764 + }, + { + "epoch": 9.03061224489796, + "grad_norm": 19.02857780456543, + "learning_rate": 1.819387755102041e-05, + "loss": 0.4268, + "step": 1770 + }, + { + "epoch": 9.081632653061224, + "grad_norm": 13.512290954589844, + "learning_rate": 1.8183673469387756e-05, + "loss": 0.3005, + "step": 1780 + }, + { + "epoch": 9.13265306122449, + "grad_norm": 4.093715190887451, + "learning_rate": 1.81734693877551e-05, + "loss": 0.4519, + "step": 1790 + }, + { + "epoch": 9.183673469387756, + "grad_norm": 5.021727561950684, + "learning_rate": 1.816326530612245e-05, + "loss": 0.2271, + "step": 1800 + }, + { + "epoch": 9.23469387755102, + "grad_norm": 15.114554405212402, + "learning_rate": 1.8153061224489797e-05, + "loss": 0.3591, + "step": 1810 + }, + { + "epoch": 9.285714285714286, + "grad_norm": 21.10906219482422, + "learning_rate": 1.8142857142857146e-05, + "loss": 0.4232, + "step": 1820 + }, + { + "epoch": 9.33673469387755, + "grad_norm": 9.971729278564453, + "learning_rate": 1.813265306122449e-05, + "loss": 0.4221, + "step": 1830 + }, + { + "epoch": 9.387755102040817, + "grad_norm": 8.30392837524414, + "learning_rate": 1.8122448979591837e-05, + "loss": 0.6234, + "step": 1840 + }, + { + "epoch": 9.438775510204081, + "grad_norm": 14.4780855178833, + "learning_rate": 1.8112244897959187e-05, + "loss": 0.4563, + "step": 1850 + }, + { + "epoch": 9.489795918367347, + "grad_norm": 5.932323455810547, + "learning_rate": 1.8102040816326532e-05, + "loss": 0.2829, + "step": 1860 + }, + { + "epoch": 9.540816326530612, + "grad_norm": 16.308589935302734, + "learning_rate": 1.8091836734693878e-05, + "loss": 0.6267, + "step": 1870 + }, + { + "epoch": 9.591836734693878, + "grad_norm": 16.539588928222656, + "learning_rate": 1.8081632653061224e-05, + "loss": 0.2834, + "step": 1880 + }, + { + "epoch": 9.642857142857142, + "grad_norm": 7.993058204650879, + "learning_rate": 1.8071428571428573e-05, + "loss": 0.3666, + "step": 1890 + }, + { + "epoch": 9.693877551020408, + "grad_norm": 6.237543106079102, + "learning_rate": 1.806122448979592e-05, + "loss": 0.461, + "step": 1900 + }, + { + "epoch": 9.744897959183673, + "grad_norm": 5.224532127380371, + "learning_rate": 1.8051020408163268e-05, + "loss": 0.3886, + "step": 1910 + }, + { + "epoch": 9.795918367346939, + "grad_norm": 7.863698482513428, + "learning_rate": 1.8040816326530614e-05, + "loss": 0.375, + "step": 1920 + }, + { + "epoch": 9.846938775510203, + "grad_norm": 3.1712992191314697, + "learning_rate": 1.803061224489796e-05, + "loss": 0.4075, + "step": 1930 + }, + { + "epoch": 9.89795918367347, + "grad_norm": 6.658506393432617, + "learning_rate": 1.802040816326531e-05, + "loss": 0.3699, + "step": 1940 + }, + { + "epoch": 9.948979591836736, + "grad_norm": 4.4092912673950195, + "learning_rate": 1.8010204081632655e-05, + "loss": 0.3695, + "step": 1950 + }, + { + "epoch": 10.0, + "grad_norm": 10.290538787841797, + "learning_rate": 1.8e-05, + "loss": 0.6452, + "step": 1960 + }, + { + "epoch": 10.0, + "eval_accuracy": 0.8989169675090253, + "eval_loss": 0.3760218620300293, + "eval_runtime": 1.002, + "eval_samples_per_second": 276.444, + "eval_steps_per_second": 34.93, + "step": 1960 + }, + { + "epoch": 10.051020408163266, + "grad_norm": 4.432197093963623, + "learning_rate": 1.7989795918367346e-05, + "loss": 0.5627, + "step": 1970 + }, + { + "epoch": 10.10204081632653, + "grad_norm": 4.792144298553467, + "learning_rate": 1.7979591836734695e-05, + "loss": 0.33, + "step": 1980 + }, + { + "epoch": 10.153061224489797, + "grad_norm": 8.579452514648438, + "learning_rate": 1.796938775510204e-05, + "loss": 0.3995, + "step": 1990 + }, + { + "epoch": 10.204081632653061, + "grad_norm": 3.39054536819458, + "learning_rate": 1.795918367346939e-05, + "loss": 0.3255, + "step": 2000 + }, + { + "epoch": 10.255102040816327, + "grad_norm": 10.804977416992188, + "learning_rate": 1.7948979591836736e-05, + "loss": 0.343, + "step": 2010 + }, + { + "epoch": 10.306122448979592, + "grad_norm": 6.238157749176025, + "learning_rate": 1.7938775510204082e-05, + "loss": 0.4181, + "step": 2020 + }, + { + "epoch": 10.357142857142858, + "grad_norm": 1.918572187423706, + "learning_rate": 1.792857142857143e-05, + "loss": 0.3354, + "step": 2030 + }, + { + "epoch": 10.408163265306122, + "grad_norm": 11.103297233581543, + "learning_rate": 1.7918367346938777e-05, + "loss": 0.4262, + "step": 2040 + }, + { + "epoch": 10.459183673469388, + "grad_norm": 16.645551681518555, + "learning_rate": 1.7908163265306123e-05, + "loss": 0.5587, + "step": 2050 + }, + { + "epoch": 10.510204081632653, + "grad_norm": 5.765960216522217, + "learning_rate": 1.789795918367347e-05, + "loss": 0.3401, + "step": 2060 + }, + { + "epoch": 10.561224489795919, + "grad_norm": 2.093567371368408, + "learning_rate": 1.7887755102040818e-05, + "loss": 0.3814, + "step": 2070 + }, + { + "epoch": 10.612244897959183, + "grad_norm": 14.202189445495605, + "learning_rate": 1.7877551020408164e-05, + "loss": 0.2941, + "step": 2080 + }, + { + "epoch": 10.66326530612245, + "grad_norm": 7.447692394256592, + "learning_rate": 1.7867346938775513e-05, + "loss": 0.5027, + "step": 2090 + }, + { + "epoch": 10.714285714285714, + "grad_norm": 5.1438069343566895, + "learning_rate": 1.785714285714286e-05, + "loss": 0.2055, + "step": 2100 + }, + { + "epoch": 10.76530612244898, + "grad_norm": 18.347646713256836, + "learning_rate": 1.7846938775510204e-05, + "loss": 0.3839, + "step": 2110 + }, + { + "epoch": 10.816326530612244, + "grad_norm": 3.191275119781494, + "learning_rate": 1.7836734693877553e-05, + "loss": 0.4602, + "step": 2120 + }, + { + "epoch": 10.86734693877551, + "grad_norm": 19.04853057861328, + "learning_rate": 1.78265306122449e-05, + "loss": 0.3279, + "step": 2130 + }, + { + "epoch": 10.918367346938776, + "grad_norm": 10.617090225219727, + "learning_rate": 1.781632653061225e-05, + "loss": 0.513, + "step": 2140 + }, + { + "epoch": 10.96938775510204, + "grad_norm": 26.807905197143555, + "learning_rate": 1.780612244897959e-05, + "loss": 0.4747, + "step": 2150 + }, + { + "epoch": 11.0, + "eval_accuracy": 0.8772563176895307, + "eval_loss": 0.4222852885723114, + "eval_runtime": 1.0821, + "eval_samples_per_second": 255.988, + "eval_steps_per_second": 32.345, + "step": 2156 + }, + { + "epoch": 11.020408163265307, + "grad_norm": 4.5420613288879395, + "learning_rate": 1.779591836734694e-05, + "loss": 0.2675, + "step": 2160 + }, + { + "epoch": 11.071428571428571, + "grad_norm": 2.1453073024749756, + "learning_rate": 1.7785714285714286e-05, + "loss": 0.377, + "step": 2170 + }, + { + "epoch": 11.122448979591837, + "grad_norm": 17.095077514648438, + "learning_rate": 1.7775510204081635e-05, + "loss": 0.3174, + "step": 2180 + }, + { + "epoch": 11.173469387755102, + "grad_norm": 4.393935203552246, + "learning_rate": 1.776530612244898e-05, + "loss": 0.3452, + "step": 2190 + }, + { + "epoch": 11.224489795918368, + "grad_norm": 20.132598876953125, + "learning_rate": 1.7755102040816327e-05, + "loss": 0.4123, + "step": 2200 + }, + { + "epoch": 11.275510204081632, + "grad_norm": 8.399972915649414, + "learning_rate": 1.7744897959183676e-05, + "loss": 0.6257, + "step": 2210 + }, + { + "epoch": 11.326530612244898, + "grad_norm": 4.09067964553833, + "learning_rate": 1.773469387755102e-05, + "loss": 0.3371, + "step": 2220 + }, + { + "epoch": 11.377551020408163, + "grad_norm": 14.214452743530273, + "learning_rate": 1.772448979591837e-05, + "loss": 0.4791, + "step": 2230 + }, + { + "epoch": 11.428571428571429, + "grad_norm": 18.578773498535156, + "learning_rate": 1.7714285714285717e-05, + "loss": 0.4311, + "step": 2240 + }, + { + "epoch": 11.479591836734693, + "grad_norm": 7.424624919891357, + "learning_rate": 1.7704081632653062e-05, + "loss": 0.5029, + "step": 2250 + }, + { + "epoch": 11.53061224489796, + "grad_norm": 7.086554527282715, + "learning_rate": 1.7693877551020408e-05, + "loss": 0.1884, + "step": 2260 + }, + { + "epoch": 11.581632653061224, + "grad_norm": 7.427938938140869, + "learning_rate": 1.7683673469387757e-05, + "loss": 0.2453, + "step": 2270 + }, + { + "epoch": 11.63265306122449, + "grad_norm": 9.359642028808594, + "learning_rate": 1.7673469387755103e-05, + "loss": 0.2816, + "step": 2280 + }, + { + "epoch": 11.683673469387756, + "grad_norm": 7.296905994415283, + "learning_rate": 1.766326530612245e-05, + "loss": 0.3826, + "step": 2290 + }, + { + "epoch": 11.73469387755102, + "grad_norm": 17.727083206176758, + "learning_rate": 1.7653061224489798e-05, + "loss": 0.746, + "step": 2300 + }, + { + "epoch": 11.785714285714286, + "grad_norm": 9.404768943786621, + "learning_rate": 1.7642857142857144e-05, + "loss": 0.392, + "step": 2310 + }, + { + "epoch": 11.83673469387755, + "grad_norm": 7.348867893218994, + "learning_rate": 1.7632653061224493e-05, + "loss": 0.2281, + "step": 2320 + }, + { + "epoch": 11.887755102040817, + "grad_norm": 12.33675479888916, + "learning_rate": 1.762244897959184e-05, + "loss": 0.463, + "step": 2330 + }, + { + "epoch": 11.938775510204081, + "grad_norm": 2.561669111251831, + "learning_rate": 1.7612244897959185e-05, + "loss": 0.2378, + "step": 2340 + }, + { + "epoch": 11.989795918367347, + "grad_norm": 13.693377494812012, + "learning_rate": 1.760204081632653e-05, + "loss": 0.5145, + "step": 2350 + }, + { + "epoch": 12.0, + "eval_accuracy": 0.5956678700361011, + "eval_loss": 1.1703706979751587, + "eval_runtime": 1.0034, + "eval_samples_per_second": 276.072, + "eval_steps_per_second": 34.883, + "step": 2352 + }, + { + "epoch": 12.040816326530612, + "grad_norm": 7.6706223487854, + "learning_rate": 1.759183673469388e-05, + "loss": 0.2481, + "step": 2360 + }, + { + "epoch": 12.091836734693878, + "grad_norm": 20.112092971801758, + "learning_rate": 1.7581632653061225e-05, + "loss": 0.399, + "step": 2370 + }, + { + "epoch": 12.142857142857142, + "grad_norm": 8.7342529296875, + "learning_rate": 1.757142857142857e-05, + "loss": 0.4516, + "step": 2380 + }, + { + "epoch": 12.193877551020408, + "grad_norm": 5.029404163360596, + "learning_rate": 1.756122448979592e-05, + "loss": 0.3708, + "step": 2390 + }, + { + "epoch": 12.244897959183673, + "grad_norm": 1.2988139390945435, + "learning_rate": 1.7551020408163266e-05, + "loss": 0.4179, + "step": 2400 + }, + { + "epoch": 12.295918367346939, + "grad_norm": 11.41686725616455, + "learning_rate": 1.7540816326530615e-05, + "loss": 0.618, + "step": 2410 + }, + { + "epoch": 12.346938775510203, + "grad_norm": 7.0157623291015625, + "learning_rate": 1.753061224489796e-05, + "loss": 0.4687, + "step": 2420 + }, + { + "epoch": 12.39795918367347, + "grad_norm": 13.440451622009277, + "learning_rate": 1.7520408163265307e-05, + "loss": 0.3505, + "step": 2430 + }, + { + "epoch": 12.448979591836734, + "grad_norm": 5.151823043823242, + "learning_rate": 1.7510204081632653e-05, + "loss": 0.3269, + "step": 2440 + }, + { + "epoch": 12.5, + "grad_norm": 5.66237735748291, + "learning_rate": 1.7500000000000002e-05, + "loss": 0.2818, + "step": 2450 + }, + { + "epoch": 12.551020408163264, + "grad_norm": 9.586088180541992, + "learning_rate": 1.748979591836735e-05, + "loss": 0.458, + "step": 2460 + }, + { + "epoch": 12.60204081632653, + "grad_norm": 2.3081839084625244, + "learning_rate": 1.7479591836734693e-05, + "loss": 0.3326, + "step": 2470 + }, + { + "epoch": 12.653061224489797, + "grad_norm": 10.380943298339844, + "learning_rate": 1.7469387755102043e-05, + "loss": 0.3968, + "step": 2480 + }, + { + "epoch": 12.704081632653061, + "grad_norm": 23.742374420166016, + "learning_rate": 1.745918367346939e-05, + "loss": 0.3304, + "step": 2490 + }, + { + "epoch": 12.755102040816327, + "grad_norm": 17.205135345458984, + "learning_rate": 1.7448979591836738e-05, + "loss": 0.5252, + "step": 2500 + }, + { + "epoch": 12.806122448979592, + "grad_norm": 11.215539932250977, + "learning_rate": 1.7438775510204083e-05, + "loss": 0.2923, + "step": 2510 + }, + { + "epoch": 12.857142857142858, + "grad_norm": 4.2553510665893555, + "learning_rate": 1.742857142857143e-05, + "loss": 0.4831, + "step": 2520 + }, + { + "epoch": 12.908163265306122, + "grad_norm": 8.88278865814209, + "learning_rate": 1.7418367346938775e-05, + "loss": 0.4106, + "step": 2530 + }, + { + "epoch": 12.959183673469388, + "grad_norm": 5.078549861907959, + "learning_rate": 1.7408163265306124e-05, + "loss": 0.1991, + "step": 2540 + }, + { + "epoch": 13.0, + "eval_accuracy": 0.9097472924187726, + "eval_loss": 0.3454380929470062, + "eval_runtime": 1.0066, + "eval_samples_per_second": 275.18, + "eval_steps_per_second": 34.77, + "step": 2548 + }, + { + "epoch": 13.010204081632653, + "grad_norm": 15.419900894165039, + "learning_rate": 1.7397959183673473e-05, + "loss": 0.5303, + "step": 2550 + }, + { + "epoch": 13.061224489795919, + "grad_norm": 21.60919189453125, + "learning_rate": 1.738775510204082e-05, + "loss": 0.4273, + "step": 2560 + }, + { + "epoch": 13.112244897959183, + "grad_norm": 14.65484619140625, + "learning_rate": 1.7377551020408165e-05, + "loss": 0.3702, + "step": 2570 + }, + { + "epoch": 13.16326530612245, + "grad_norm": 14.261983871459961, + "learning_rate": 1.736734693877551e-05, + "loss": 0.2532, + "step": 2580 + }, + { + "epoch": 13.214285714285714, + "grad_norm": 3.0944366455078125, + "learning_rate": 1.735714285714286e-05, + "loss": 0.4152, + "step": 2590 + }, + { + "epoch": 13.26530612244898, + "grad_norm": 2.5983598232269287, + "learning_rate": 1.7346938775510206e-05, + "loss": 0.2743, + "step": 2600 + }, + { + "epoch": 13.316326530612244, + "grad_norm": 15.517622947692871, + "learning_rate": 1.733673469387755e-05, + "loss": 0.4579, + "step": 2610 + }, + { + "epoch": 13.36734693877551, + "grad_norm": 1.4525797367095947, + "learning_rate": 1.7326530612244897e-05, + "loss": 0.419, + "step": 2620 + }, + { + "epoch": 13.418367346938776, + "grad_norm": 9.143267631530762, + "learning_rate": 1.7316326530612246e-05, + "loss": 0.369, + "step": 2630 + }, + { + "epoch": 13.46938775510204, + "grad_norm": 10.418425559997559, + "learning_rate": 1.7306122448979596e-05, + "loss": 0.4429, + "step": 2640 + }, + { + "epoch": 13.520408163265307, + "grad_norm": 19.76718521118164, + "learning_rate": 1.729591836734694e-05, + "loss": 0.4732, + "step": 2650 + }, + { + "epoch": 13.571428571428571, + "grad_norm": 7.403574466705322, + "learning_rate": 1.7285714285714287e-05, + "loss": 0.3883, + "step": 2660 + }, + { + "epoch": 13.622448979591837, + "grad_norm": 4.320155143737793, + "learning_rate": 1.7275510204081633e-05, + "loss": 0.27, + "step": 2670 + }, + { + "epoch": 13.673469387755102, + "grad_norm": 11.817669868469238, + "learning_rate": 1.7265306122448982e-05, + "loss": 0.4301, + "step": 2680 + }, + { + "epoch": 13.724489795918368, + "grad_norm": 14.72208023071289, + "learning_rate": 1.7255102040816328e-05, + "loss": 0.4041, + "step": 2690 + }, + { + "epoch": 13.775510204081632, + "grad_norm": 8.361180305480957, + "learning_rate": 1.7244897959183674e-05, + "loss": 0.3567, + "step": 2700 + }, + { + "epoch": 13.826530612244898, + "grad_norm": 16.659774780273438, + "learning_rate": 1.723469387755102e-05, + "loss": 0.5788, + "step": 2710 + }, + { + "epoch": 13.877551020408163, + "grad_norm": 17.327421188354492, + "learning_rate": 1.722448979591837e-05, + "loss": 0.3298, + "step": 2720 + }, + { + "epoch": 13.928571428571429, + "grad_norm": 9.239471435546875, + "learning_rate": 1.7214285714285718e-05, + "loss": 0.2547, + "step": 2730 + }, + { + "epoch": 13.979591836734693, + "grad_norm": 6.702538967132568, + "learning_rate": 1.7204081632653064e-05, + "loss": 0.2396, + "step": 2740 + }, + { + "epoch": 14.0, + "eval_accuracy": 0.8700361010830325, + "eval_loss": 0.39134401082992554, + "eval_runtime": 1.0101, + "eval_samples_per_second": 274.229, + "eval_steps_per_second": 34.65, + "step": 2744 + }, + { + "epoch": 14.03061224489796, + "grad_norm": 28.766311645507812, + "learning_rate": 1.719387755102041e-05, + "loss": 0.5768, + "step": 2750 + }, + { + "epoch": 14.081632653061224, + "grad_norm": 1.9825541973114014, + "learning_rate": 1.7183673469387755e-05, + "loss": 0.2122, + "step": 2760 + }, + { + "epoch": 14.13265306122449, + "grad_norm": 2.2955780029296875, + "learning_rate": 1.7173469387755104e-05, + "loss": 0.3067, + "step": 2770 + }, + { + "epoch": 14.183673469387756, + "grad_norm": 2.9667458534240723, + "learning_rate": 1.716326530612245e-05, + "loss": 0.3044, + "step": 2780 + }, + { + "epoch": 14.23469387755102, + "grad_norm": 1.5401198863983154, + "learning_rate": 1.7153061224489796e-05, + "loss": 0.3081, + "step": 2790 + }, + { + "epoch": 14.285714285714286, + "grad_norm": 7.035645484924316, + "learning_rate": 1.7142857142857142e-05, + "loss": 0.2379, + "step": 2800 + }, + { + "epoch": 14.33673469387755, + "grad_norm": 13.610828399658203, + "learning_rate": 1.713265306122449e-05, + "loss": 0.3902, + "step": 2810 + }, + { + "epoch": 14.387755102040817, + "grad_norm": 14.626175880432129, + "learning_rate": 1.712244897959184e-05, + "loss": 0.3637, + "step": 2820 + }, + { + "epoch": 14.438775510204081, + "grad_norm": 25.559099197387695, + "learning_rate": 1.7112244897959186e-05, + "loss": 0.3926, + "step": 2830 + }, + { + "epoch": 14.489795918367347, + "grad_norm": 7.964416980743408, + "learning_rate": 1.7102040816326532e-05, + "loss": 0.405, + "step": 2840 + }, + { + "epoch": 14.540816326530612, + "grad_norm": 9.71059799194336, + "learning_rate": 1.7091836734693878e-05, + "loss": 0.4545, + "step": 2850 + }, + { + "epoch": 14.591836734693878, + "grad_norm": 16.295955657958984, + "learning_rate": 1.7081632653061227e-05, + "loss": 0.2238, + "step": 2860 + }, + { + "epoch": 14.642857142857142, + "grad_norm": 1.9360520839691162, + "learning_rate": 1.7071428571428573e-05, + "loss": 0.7506, + "step": 2870 + }, + { + "epoch": 14.693877551020408, + "grad_norm": 19.95923614501953, + "learning_rate": 1.7061224489795922e-05, + "loss": 0.5928, + "step": 2880 + }, + { + "epoch": 14.744897959183673, + "grad_norm": 22.854106903076172, + "learning_rate": 1.7051020408163264e-05, + "loss": 0.3943, + "step": 2890 + }, + { + "epoch": 14.795918367346939, + "grad_norm": 5.517703056335449, + "learning_rate": 1.7040816326530613e-05, + "loss": 0.3957, + "step": 2900 + }, + { + "epoch": 14.846938775510203, + "grad_norm": 24.88918113708496, + "learning_rate": 1.7030612244897962e-05, + "loss": 0.4601, + "step": 2910 + }, + { + "epoch": 14.89795918367347, + "grad_norm": 10.2703218460083, + "learning_rate": 1.7020408163265308e-05, + "loss": 0.3916, + "step": 2920 + }, + { + "epoch": 14.948979591836736, + "grad_norm": 12.027691841125488, + "learning_rate": 1.7010204081632654e-05, + "loss": 0.4524, + "step": 2930 + }, + { + "epoch": 15.0, + "grad_norm": 11.881312370300293, + "learning_rate": 1.7e-05, + "loss": 0.3259, + "step": 2940 + }, + { + "epoch": 15.0, + "eval_accuracy": 0.8880866425992779, + "eval_loss": 0.36894237995147705, + "eval_runtime": 0.9976, + "eval_samples_per_second": 277.67, + "eval_steps_per_second": 35.085, + "step": 2940 + }, + { + "epoch": 15.051020408163266, + "grad_norm": 7.301281452178955, + "learning_rate": 1.698979591836735e-05, + "loss": 0.3635, + "step": 2950 + }, + { + "epoch": 15.10204081632653, + "grad_norm": 4.754505157470703, + "learning_rate": 1.6979591836734695e-05, + "loss": 0.2743, + "step": 2960 + }, + { + "epoch": 15.153061224489797, + "grad_norm": 1.9281086921691895, + "learning_rate": 1.6969387755102044e-05, + "loss": 0.3761, + "step": 2970 + }, + { + "epoch": 15.204081632653061, + "grad_norm": 5.705906391143799, + "learning_rate": 1.695918367346939e-05, + "loss": 0.4318, + "step": 2980 + }, + { + "epoch": 15.255102040816327, + "grad_norm": 6.573225975036621, + "learning_rate": 1.6948979591836736e-05, + "loss": 0.272, + "step": 2990 + }, + { + "epoch": 15.306122448979592, + "grad_norm": 1.6972718238830566, + "learning_rate": 1.6938775510204085e-05, + "loss": 0.2734, + "step": 3000 + }, + { + "epoch": 15.357142857142858, + "grad_norm": 8.178071975708008, + "learning_rate": 1.692857142857143e-05, + "loss": 0.5854, + "step": 3010 + }, + { + "epoch": 15.408163265306122, + "grad_norm": 8.675841331481934, + "learning_rate": 1.6918367346938776e-05, + "loss": 0.3045, + "step": 3020 + }, + { + "epoch": 15.459183673469388, + "grad_norm": 15.598442077636719, + "learning_rate": 1.6908163265306122e-05, + "loss": 0.2188, + "step": 3030 + }, + { + "epoch": 15.510204081632653, + "grad_norm": 3.929029941558838, + "learning_rate": 1.689795918367347e-05, + "loss": 0.1691, + "step": 3040 + }, + { + "epoch": 15.561224489795919, + "grad_norm": 22.549604415893555, + "learning_rate": 1.6887755102040817e-05, + "loss": 0.3308, + "step": 3050 + }, + { + "epoch": 15.612244897959183, + "grad_norm": 8.705903053283691, + "learning_rate": 1.6877551020408166e-05, + "loss": 0.4346, + "step": 3060 + }, + { + "epoch": 15.66326530612245, + "grad_norm": 7.189068794250488, + "learning_rate": 1.6867346938775512e-05, + "loss": 0.2974, + "step": 3070 + }, + { + "epoch": 15.714285714285714, + "grad_norm": 16.522537231445312, + "learning_rate": 1.6857142857142858e-05, + "loss": 0.5328, + "step": 3080 + }, + { + "epoch": 15.76530612244898, + "grad_norm": 9.6983060836792, + "learning_rate": 1.6846938775510207e-05, + "loss": 0.3115, + "step": 3090 + }, + { + "epoch": 15.816326530612244, + "grad_norm": 30.435047149658203, + "learning_rate": 1.6836734693877553e-05, + "loss": 0.7022, + "step": 3100 + }, + { + "epoch": 15.86734693877551, + "grad_norm": 19.050609588623047, + "learning_rate": 1.68265306122449e-05, + "loss": 0.6279, + "step": 3110 + }, + { + "epoch": 15.918367346938776, + "grad_norm": 10.127452850341797, + "learning_rate": 1.6816326530612244e-05, + "loss": 0.4232, + "step": 3120 + }, + { + "epoch": 15.96938775510204, + "grad_norm": 2.200711488723755, + "learning_rate": 1.6806122448979594e-05, + "loss": 0.3434, + "step": 3130 + }, + { + "epoch": 16.0, + "eval_accuracy": 0.8736462093862816, + "eval_loss": 0.3742833137512207, + "eval_runtime": 1.0197, + "eval_samples_per_second": 271.656, + "eval_steps_per_second": 34.325, + "step": 3136 + }, + { + "epoch": 16.020408163265305, + "grad_norm": 13.688541412353516, + "learning_rate": 1.679591836734694e-05, + "loss": 0.2501, + "step": 3140 + }, + { + "epoch": 16.071428571428573, + "grad_norm": 9.806476593017578, + "learning_rate": 1.678571428571429e-05, + "loss": 0.4224, + "step": 3150 + }, + { + "epoch": 16.122448979591837, + "grad_norm": 5.79801607131958, + "learning_rate": 1.6775510204081634e-05, + "loss": 0.2292, + "step": 3160 + }, + { + "epoch": 16.1734693877551, + "grad_norm": 10.158044815063477, + "learning_rate": 1.676530612244898e-05, + "loss": 0.2183, + "step": 3170 + }, + { + "epoch": 16.224489795918366, + "grad_norm": 16.27264404296875, + "learning_rate": 1.675510204081633e-05, + "loss": 0.2128, + "step": 3180 + }, + { + "epoch": 16.275510204081634, + "grad_norm": 8.446306228637695, + "learning_rate": 1.6744897959183675e-05, + "loss": 0.3292, + "step": 3190 + }, + { + "epoch": 16.3265306122449, + "grad_norm": 16.144853591918945, + "learning_rate": 1.673469387755102e-05, + "loss": 0.4493, + "step": 3200 + }, + { + "epoch": 16.377551020408163, + "grad_norm": 1.8377426862716675, + "learning_rate": 1.6724489795918367e-05, + "loss": 0.3244, + "step": 3210 + }, + { + "epoch": 16.428571428571427, + "grad_norm": 13.898979187011719, + "learning_rate": 1.6714285714285716e-05, + "loss": 0.4197, + "step": 3220 + }, + { + "epoch": 16.479591836734695, + "grad_norm": 20.12917137145996, + "learning_rate": 1.6704081632653062e-05, + "loss": 0.3658, + "step": 3230 + }, + { + "epoch": 16.53061224489796, + "grad_norm": 14.025603294372559, + "learning_rate": 1.669387755102041e-05, + "loss": 0.3219, + "step": 3240 + }, + { + "epoch": 16.581632653061224, + "grad_norm": 5.732223987579346, + "learning_rate": 1.6683673469387757e-05, + "loss": 0.196, + "step": 3250 + }, + { + "epoch": 16.632653061224488, + "grad_norm": 15.704209327697754, + "learning_rate": 1.6673469387755102e-05, + "loss": 0.3943, + "step": 3260 + }, + { + "epoch": 16.683673469387756, + "grad_norm": 7.129629611968994, + "learning_rate": 1.666326530612245e-05, + "loss": 0.2839, + "step": 3270 + }, + { + "epoch": 16.73469387755102, + "grad_norm": 17.930339813232422, + "learning_rate": 1.6653061224489797e-05, + "loss": 0.44, + "step": 3280 + }, + { + "epoch": 16.785714285714285, + "grad_norm": 2.109773874282837, + "learning_rate": 1.6642857142857147e-05, + "loss": 0.2534, + "step": 3290 + }, + { + "epoch": 16.836734693877553, + "grad_norm": 19.851903915405273, + "learning_rate": 1.6632653061224492e-05, + "loss": 0.5779, + "step": 3300 + }, + { + "epoch": 16.887755102040817, + "grad_norm": 8.62447452545166, + "learning_rate": 1.6622448979591838e-05, + "loss": 0.2314, + "step": 3310 + }, + { + "epoch": 16.93877551020408, + "grad_norm": 11.833367347717285, + "learning_rate": 1.6612244897959184e-05, + "loss": 0.2982, + "step": 3320 + }, + { + "epoch": 16.989795918367346, + "grad_norm": 13.485107421875, + "learning_rate": 1.6602040816326533e-05, + "loss": 0.389, + "step": 3330 + }, + { + "epoch": 17.0, + "eval_accuracy": 0.9025270758122743, + "eval_loss": 0.36571940779685974, + "eval_runtime": 1.0007, + "eval_samples_per_second": 276.805, + "eval_steps_per_second": 34.975, + "step": 3332 + }, + { + "epoch": 17.040816326530614, + "grad_norm": 16.494752883911133, + "learning_rate": 1.659183673469388e-05, + "loss": 0.4057, + "step": 3340 + }, + { + "epoch": 17.091836734693878, + "grad_norm": 13.02173137664795, + "learning_rate": 1.6581632653061225e-05, + "loss": 0.2569, + "step": 3350 + }, + { + "epoch": 17.142857142857142, + "grad_norm": 7.940774440765381, + "learning_rate": 1.6571428571428574e-05, + "loss": 0.2372, + "step": 3360 + }, + { + "epoch": 17.193877551020407, + "grad_norm": 13.316302299499512, + "learning_rate": 1.656122448979592e-05, + "loss": 0.4476, + "step": 3370 + }, + { + "epoch": 17.244897959183675, + "grad_norm": 26.874502182006836, + "learning_rate": 1.655102040816327e-05, + "loss": 0.5035, + "step": 3380 + }, + { + "epoch": 17.29591836734694, + "grad_norm": 16.006755828857422, + "learning_rate": 1.6540816326530615e-05, + "loss": 0.2176, + "step": 3390 + }, + { + "epoch": 17.346938775510203, + "grad_norm": 1.1815485954284668, + "learning_rate": 1.653061224489796e-05, + "loss": 0.3316, + "step": 3400 + }, + { + "epoch": 17.397959183673468, + "grad_norm": 4.353585243225098, + "learning_rate": 1.6520408163265306e-05, + "loss": 0.4128, + "step": 3410 + }, + { + "epoch": 17.448979591836736, + "grad_norm": 3.6670403480529785, + "learning_rate": 1.6510204081632655e-05, + "loss": 0.3416, + "step": 3420 + }, + { + "epoch": 17.5, + "grad_norm": 1.7283302545547485, + "learning_rate": 1.65e-05, + "loss": 0.4711, + "step": 3430 + }, + { + "epoch": 17.551020408163264, + "grad_norm": 19.934011459350586, + "learning_rate": 1.6489795918367347e-05, + "loss": 0.2961, + "step": 3440 + }, + { + "epoch": 17.602040816326532, + "grad_norm": 8.976653099060059, + "learning_rate": 1.6479591836734696e-05, + "loss": 0.2276, + "step": 3450 + }, + { + "epoch": 17.653061224489797, + "grad_norm": 17.890167236328125, + "learning_rate": 1.6469387755102042e-05, + "loss": 0.3823, + "step": 3460 + }, + { + "epoch": 17.70408163265306, + "grad_norm": 9.597513198852539, + "learning_rate": 1.645918367346939e-05, + "loss": 0.3835, + "step": 3470 + }, + { + "epoch": 17.755102040816325, + "grad_norm": 8.567350387573242, + "learning_rate": 1.6448979591836737e-05, + "loss": 0.5736, + "step": 3480 + }, + { + "epoch": 17.806122448979593, + "grad_norm": 1.6578407287597656, + "learning_rate": 1.6438775510204083e-05, + "loss": 0.5211, + "step": 3490 + }, + { + "epoch": 17.857142857142858, + "grad_norm": 10.160964012145996, + "learning_rate": 1.642857142857143e-05, + "loss": 0.2336, + "step": 3500 + }, + { + "epoch": 17.908163265306122, + "grad_norm": 15.302521705627441, + "learning_rate": 1.6418367346938778e-05, + "loss": 0.3572, + "step": 3510 + }, + { + "epoch": 17.959183673469386, + "grad_norm": 12.789298057556152, + "learning_rate": 1.6408163265306124e-05, + "loss": 0.302, + "step": 3520 + }, + { + "epoch": 18.0, + "eval_accuracy": 0.8916967509025271, + "eval_loss": 0.4218384921550751, + "eval_runtime": 1.0042, + "eval_samples_per_second": 275.834, + "eval_steps_per_second": 34.853, + "step": 3528 + }, + { + "epoch": 18.010204081632654, + "grad_norm": 3.580617904663086, + "learning_rate": 1.639795918367347e-05, + "loss": 0.3941, + "step": 3530 + }, + { + "epoch": 18.06122448979592, + "grad_norm": 1.19637131690979, + "learning_rate": 1.638775510204082e-05, + "loss": 0.3476, + "step": 3540 + }, + { + "epoch": 18.112244897959183, + "grad_norm": 21.28291130065918, + "learning_rate": 1.6377551020408164e-05, + "loss": 0.3359, + "step": 3550 + }, + { + "epoch": 18.163265306122447, + "grad_norm": 19.236520767211914, + "learning_rate": 1.6367346938775513e-05, + "loss": 0.3292, + "step": 3560 + }, + { + "epoch": 18.214285714285715, + "grad_norm": 16.585281372070312, + "learning_rate": 1.635714285714286e-05, + "loss": 0.3443, + "step": 3570 + }, + { + "epoch": 18.26530612244898, + "grad_norm": 24.769123077392578, + "learning_rate": 1.6346938775510205e-05, + "loss": 0.4157, + "step": 3580 + }, + { + "epoch": 18.316326530612244, + "grad_norm": 11.153094291687012, + "learning_rate": 1.633673469387755e-05, + "loss": 0.4492, + "step": 3590 + }, + { + "epoch": 18.367346938775512, + "grad_norm": 4.06447172164917, + "learning_rate": 1.63265306122449e-05, + "loss": 0.4643, + "step": 3600 + }, + { + "epoch": 18.418367346938776, + "grad_norm": 4.228175163269043, + "learning_rate": 1.6316326530612246e-05, + "loss": 0.346, + "step": 3610 + }, + { + "epoch": 18.46938775510204, + "grad_norm": 16.569456100463867, + "learning_rate": 1.630612244897959e-05, + "loss": 0.4032, + "step": 3620 + }, + { + "epoch": 18.520408163265305, + "grad_norm": 13.997138023376465, + "learning_rate": 1.629591836734694e-05, + "loss": 0.4245, + "step": 3630 + }, + { + "epoch": 18.571428571428573, + "grad_norm": 5.629631996154785, + "learning_rate": 1.6285714285714287e-05, + "loss": 0.3134, + "step": 3640 + }, + { + "epoch": 18.622448979591837, + "grad_norm": 21.551403045654297, + "learning_rate": 1.6275510204081636e-05, + "loss": 0.4976, + "step": 3650 + }, + { + "epoch": 18.6734693877551, + "grad_norm": 24.541152954101562, + "learning_rate": 1.626530612244898e-05, + "loss": 0.3738, + "step": 3660 + }, + { + "epoch": 18.724489795918366, + "grad_norm": 3.2568202018737793, + "learning_rate": 1.6255102040816327e-05, + "loss": 0.3532, + "step": 3670 + }, + { + "epoch": 18.775510204081634, + "grad_norm": 9.226503372192383, + "learning_rate": 1.6244897959183673e-05, + "loss": 0.3329, + "step": 3680 + }, + { + "epoch": 18.8265306122449, + "grad_norm": 7.931892395019531, + "learning_rate": 1.6234693877551022e-05, + "loss": 0.1971, + "step": 3690 + }, + { + "epoch": 18.877551020408163, + "grad_norm": 10.871854782104492, + "learning_rate": 1.6224489795918368e-05, + "loss": 0.2566, + "step": 3700 + }, + { + "epoch": 18.928571428571427, + "grad_norm": 20.2423038482666, + "learning_rate": 1.6214285714285717e-05, + "loss": 0.529, + "step": 3710 + }, + { + "epoch": 18.979591836734695, + "grad_norm": 1.280144453048706, + "learning_rate": 1.6204081632653063e-05, + "loss": 0.4693, + "step": 3720 + }, + { + "epoch": 19.0, + "eval_accuracy": 0.8953068592057761, + "eval_loss": 0.32263943552970886, + "eval_runtime": 1.0066, + "eval_samples_per_second": 275.174, + "eval_steps_per_second": 34.769, + "step": 3724 + }, + { + "epoch": 19.03061224489796, + "grad_norm": 7.251153469085693, + "learning_rate": 1.619387755102041e-05, + "loss": 0.4467, + "step": 3730 + }, + { + "epoch": 19.081632653061224, + "grad_norm": 8.741971969604492, + "learning_rate": 1.6183673469387758e-05, + "loss": 0.3344, + "step": 3740 + }, + { + "epoch": 19.132653061224488, + "grad_norm": 3.309220314025879, + "learning_rate": 1.6173469387755104e-05, + "loss": 0.4531, + "step": 3750 + }, + { + "epoch": 19.183673469387756, + "grad_norm": 24.69237518310547, + "learning_rate": 1.616326530612245e-05, + "loss": 0.4573, + "step": 3760 + }, + { + "epoch": 19.23469387755102, + "grad_norm": 12.22867488861084, + "learning_rate": 1.6153061224489795e-05, + "loss": 0.2069, + "step": 3770 + }, + { + "epoch": 19.285714285714285, + "grad_norm": 13.975093841552734, + "learning_rate": 1.6142857142857145e-05, + "loss": 0.338, + "step": 3780 + }, + { + "epoch": 19.336734693877553, + "grad_norm": 7.9862751960754395, + "learning_rate": 1.613265306122449e-05, + "loss": 0.3315, + "step": 3790 + }, + { + "epoch": 19.387755102040817, + "grad_norm": 19.306745529174805, + "learning_rate": 1.612244897959184e-05, + "loss": 0.5595, + "step": 3800 + }, + { + "epoch": 19.43877551020408, + "grad_norm": 11.886868476867676, + "learning_rate": 1.6112244897959185e-05, + "loss": 0.2389, + "step": 3810 + }, + { + "epoch": 19.489795918367346, + "grad_norm": 3.050920009613037, + "learning_rate": 1.610204081632653e-05, + "loss": 0.3521, + "step": 3820 + }, + { + "epoch": 19.540816326530614, + "grad_norm": 11.111821174621582, + "learning_rate": 1.609183673469388e-05, + "loss": 0.264, + "step": 3830 + }, + { + "epoch": 19.591836734693878, + "grad_norm": 14.456820487976074, + "learning_rate": 1.6081632653061226e-05, + "loss": 0.375, + "step": 3840 + }, + { + "epoch": 19.642857142857142, + "grad_norm": 10.985493659973145, + "learning_rate": 1.6071428571428572e-05, + "loss": 0.2618, + "step": 3850 + }, + { + "epoch": 19.693877551020407, + "grad_norm": 2.003209114074707, + "learning_rate": 1.6061224489795918e-05, + "loss": 0.3972, + "step": 3860 + }, + { + "epoch": 19.744897959183675, + "grad_norm": 30.124954223632812, + "learning_rate": 1.6051020408163267e-05, + "loss": 0.39, + "step": 3870 + }, + { + "epoch": 19.79591836734694, + "grad_norm": 4.023131370544434, + "learning_rate": 1.6040816326530613e-05, + "loss": 0.3169, + "step": 3880 + }, + { + "epoch": 19.846938775510203, + "grad_norm": 10.172859191894531, + "learning_rate": 1.6030612244897962e-05, + "loss": 0.2813, + "step": 3890 + }, + { + "epoch": 19.897959183673468, + "grad_norm": 2.260370969772339, + "learning_rate": 1.6020408163265308e-05, + "loss": 0.2165, + "step": 3900 + }, + { + "epoch": 19.948979591836736, + "grad_norm": 1.3247283697128296, + "learning_rate": 1.6010204081632653e-05, + "loss": 0.2577, + "step": 3910 + }, + { + "epoch": 20.0, + "grad_norm": 5.097670078277588, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.6346, + "step": 3920 + }, + { + "epoch": 20.0, + "eval_accuracy": 0.8880866425992779, + "eval_loss": 0.32766658067703247, + "eval_runtime": 1.0015, + "eval_samples_per_second": 276.586, + "eval_steps_per_second": 34.948, + "step": 3920 + }, + { + "epoch": 20.051020408163264, + "grad_norm": 17.960142135620117, + "learning_rate": 1.598979591836735e-05, + "loss": 0.4058, + "step": 3930 + }, + { + "epoch": 20.102040816326532, + "grad_norm": 17.930652618408203, + "learning_rate": 1.5979591836734694e-05, + "loss": 0.313, + "step": 3940 + }, + { + "epoch": 20.153061224489797, + "grad_norm": 5.853415489196777, + "learning_rate": 1.596938775510204e-05, + "loss": 0.3648, + "step": 3950 + }, + { + "epoch": 20.20408163265306, + "grad_norm": 23.82584571838379, + "learning_rate": 1.595918367346939e-05, + "loss": 0.4226, + "step": 3960 + }, + { + "epoch": 20.255102040816325, + "grad_norm": 6.414389610290527, + "learning_rate": 1.5948979591836735e-05, + "loss": 0.3111, + "step": 3970 + }, + { + "epoch": 20.306122448979593, + "grad_norm": 9.15027904510498, + "learning_rate": 1.5938775510204084e-05, + "loss": 0.3707, + "step": 3980 + }, + { + "epoch": 20.357142857142858, + "grad_norm": 9.374833106994629, + "learning_rate": 1.592857142857143e-05, + "loss": 0.4962, + "step": 3990 + }, + { + "epoch": 20.408163265306122, + "grad_norm": 9.610342979431152, + "learning_rate": 1.5918367346938776e-05, + "loss": 0.4891, + "step": 4000 + }, + { + "epoch": 20.459183673469386, + "grad_norm": 8.49427318572998, + "learning_rate": 1.5908163265306125e-05, + "loss": 0.2952, + "step": 4010 + }, + { + "epoch": 20.510204081632654, + "grad_norm": 6.4004950523376465, + "learning_rate": 1.589795918367347e-05, + "loss": 0.2062, + "step": 4020 + }, + { + "epoch": 20.56122448979592, + "grad_norm": 17.373149871826172, + "learning_rate": 1.588775510204082e-05, + "loss": 0.2141, + "step": 4030 + }, + { + "epoch": 20.612244897959183, + "grad_norm": 11.331439971923828, + "learning_rate": 1.5877551020408162e-05, + "loss": 0.182, + "step": 4040 + }, + { + "epoch": 20.663265306122447, + "grad_norm": 3.9347431659698486, + "learning_rate": 1.586734693877551e-05, + "loss": 0.4026, + "step": 4050 + }, + { + "epoch": 20.714285714285715, + "grad_norm": 24.14126968383789, + "learning_rate": 1.5857142857142857e-05, + "loss": 0.2464, + "step": 4060 + }, + { + "epoch": 20.76530612244898, + "grad_norm": 6.619481086730957, + "learning_rate": 1.5846938775510206e-05, + "loss": 0.1703, + "step": 4070 + }, + { + "epoch": 20.816326530612244, + "grad_norm": 16.354969024658203, + "learning_rate": 1.5836734693877552e-05, + "loss": 0.2474, + "step": 4080 + }, + { + "epoch": 20.867346938775512, + "grad_norm": 15.212191581726074, + "learning_rate": 1.5826530612244898e-05, + "loss": 0.1955, + "step": 4090 + }, + { + "epoch": 20.918367346938776, + "grad_norm": 9.520818710327148, + "learning_rate": 1.5816326530612247e-05, + "loss": 0.4123, + "step": 4100 + }, + { + "epoch": 20.96938775510204, + "grad_norm": 5.534724712371826, + "learning_rate": 1.5806122448979593e-05, + "loss": 0.481, + "step": 4110 + }, + { + "epoch": 21.0, + "eval_accuracy": 0.8700361010830325, + "eval_loss": 0.3483729362487793, + "eval_runtime": 1.0073, + "eval_samples_per_second": 274.99, + "eval_steps_per_second": 34.746, + "step": 4116 + }, + { + "epoch": 21.020408163265305, + "grad_norm": 24.056684494018555, + "learning_rate": 1.5795918367346942e-05, + "loss": 0.5853, + "step": 4120 + }, + { + "epoch": 21.071428571428573, + "grad_norm": 33.364036560058594, + "learning_rate": 1.5785714285714288e-05, + "loss": 0.5125, + "step": 4130 + }, + { + "epoch": 21.122448979591837, + "grad_norm": 23.837322235107422, + "learning_rate": 1.5775510204081634e-05, + "loss": 0.3501, + "step": 4140 + }, + { + "epoch": 21.1734693877551, + "grad_norm": 6.065820693969727, + "learning_rate": 1.576530612244898e-05, + "loss": 0.2312, + "step": 4150 + }, + { + "epoch": 21.224489795918366, + "grad_norm": 20.695560455322266, + "learning_rate": 1.575510204081633e-05, + "loss": 0.2974, + "step": 4160 + }, + { + "epoch": 21.275510204081634, + "grad_norm": 0.9739782214164734, + "learning_rate": 1.5744897959183675e-05, + "loss": 0.3103, + "step": 4170 + }, + { + "epoch": 21.3265306122449, + "grad_norm": 21.136749267578125, + "learning_rate": 1.573469387755102e-05, + "loss": 0.3986, + "step": 4180 + }, + { + "epoch": 21.377551020408163, + "grad_norm": 4.020694255828857, + "learning_rate": 1.572448979591837e-05, + "loss": 0.2465, + "step": 4190 + }, + { + "epoch": 21.428571428571427, + "grad_norm": 15.7801513671875, + "learning_rate": 1.5714285714285715e-05, + "loss": 0.3448, + "step": 4200 + }, + { + "epoch": 21.479591836734695, + "grad_norm": 5.353526592254639, + "learning_rate": 1.5704081632653065e-05, + "loss": 0.3713, + "step": 4210 + }, + { + "epoch": 21.53061224489796, + "grad_norm": 2.6576883792877197, + "learning_rate": 1.569387755102041e-05, + "loss": 0.2645, + "step": 4220 + }, + { + "epoch": 21.581632653061224, + "grad_norm": 4.789892673492432, + "learning_rate": 1.5683673469387756e-05, + "loss": 0.3579, + "step": 4230 + }, + { + "epoch": 21.632653061224488, + "grad_norm": 16.272117614746094, + "learning_rate": 1.5673469387755102e-05, + "loss": 0.1917, + "step": 4240 + }, + { + "epoch": 21.683673469387756, + "grad_norm": 15.64486026763916, + "learning_rate": 1.566326530612245e-05, + "loss": 0.2434, + "step": 4250 + }, + { + "epoch": 21.73469387755102, + "grad_norm": 19.42757225036621, + "learning_rate": 1.5653061224489797e-05, + "loss": 0.2864, + "step": 4260 + }, + { + "epoch": 21.785714285714285, + "grad_norm": 11.550178527832031, + "learning_rate": 1.5642857142857143e-05, + "loss": 0.4449, + "step": 4270 + }, + { + "epoch": 21.836734693877553, + "grad_norm": 20.477523803710938, + "learning_rate": 1.5632653061224492e-05, + "loss": 0.5275, + "step": 4280 + }, + { + "epoch": 21.887755102040817, + "grad_norm": 7.140037536621094, + "learning_rate": 1.5622448979591838e-05, + "loss": 0.3824, + "step": 4290 + }, + { + "epoch": 21.93877551020408, + "grad_norm": 26.250823974609375, + "learning_rate": 1.5612244897959187e-05, + "loss": 0.3982, + "step": 4300 + }, + { + "epoch": 21.989795918367346, + "grad_norm": 9.258036613464355, + "learning_rate": 1.5602040816326533e-05, + "loss": 0.2628, + "step": 4310 + }, + { + "epoch": 22.0, + "eval_accuracy": 0.9025270758122743, + "eval_loss": 0.3941589593887329, + "eval_runtime": 1.0011, + "eval_samples_per_second": 276.706, + "eval_steps_per_second": 34.963, + "step": 4312 + }, + { + "epoch": 22.040816326530614, + "grad_norm": 16.95090103149414, + "learning_rate": 1.559183673469388e-05, + "loss": 0.3304, + "step": 4320 + }, + { + "epoch": 22.091836734693878, + "grad_norm": 14.246356964111328, + "learning_rate": 1.5581632653061224e-05, + "loss": 0.4039, + "step": 4330 + }, + { + "epoch": 22.142857142857142, + "grad_norm": 15.981473922729492, + "learning_rate": 1.5571428571428573e-05, + "loss": 0.3328, + "step": 4340 + }, + { + "epoch": 22.193877551020407, + "grad_norm": 31.02018928527832, + "learning_rate": 1.556122448979592e-05, + "loss": 0.4779, + "step": 4350 + }, + { + "epoch": 22.244897959183675, + "grad_norm": 19.655574798583984, + "learning_rate": 1.5551020408163265e-05, + "loss": 0.3275, + "step": 4360 + }, + { + "epoch": 22.29591836734694, + "grad_norm": 20.44514274597168, + "learning_rate": 1.5540816326530614e-05, + "loss": 0.4116, + "step": 4370 + }, + { + "epoch": 22.346938775510203, + "grad_norm": 20.546932220458984, + "learning_rate": 1.553061224489796e-05, + "loss": 0.4797, + "step": 4380 + }, + { + "epoch": 22.397959183673468, + "grad_norm": 7.014615535736084, + "learning_rate": 1.552040816326531e-05, + "loss": 0.2462, + "step": 4390 + }, + { + "epoch": 22.448979591836736, + "grad_norm": 13.318962097167969, + "learning_rate": 1.5510204081632655e-05, + "loss": 0.2696, + "step": 4400 + }, + { + "epoch": 22.5, + "grad_norm": 28.895915985107422, + "learning_rate": 1.55e-05, + "loss": 0.5001, + "step": 4410 + }, + { + "epoch": 22.551020408163264, + "grad_norm": 12.874469757080078, + "learning_rate": 1.5489795918367346e-05, + "loss": 0.3738, + "step": 4420 + }, + { + "epoch": 22.602040816326532, + "grad_norm": 7.524547576904297, + "learning_rate": 1.5479591836734696e-05, + "loss": 0.3449, + "step": 4430 + }, + { + "epoch": 22.653061224489797, + "grad_norm": 18.373628616333008, + "learning_rate": 1.546938775510204e-05, + "loss": 0.2432, + "step": 4440 + }, + { + "epoch": 22.70408163265306, + "grad_norm": 11.600316047668457, + "learning_rate": 1.545918367346939e-05, + "loss": 0.2661, + "step": 4450 + }, + { + "epoch": 22.755102040816325, + "grad_norm": 9.11487865447998, + "learning_rate": 1.5448979591836736e-05, + "loss": 0.2246, + "step": 4460 + }, + { + "epoch": 22.806122448979593, + "grad_norm": 10.225346565246582, + "learning_rate": 1.5438775510204082e-05, + "loss": 0.3571, + "step": 4470 + }, + { + "epoch": 22.857142857142858, + "grad_norm": 3.3592870235443115, + "learning_rate": 1.542857142857143e-05, + "loss": 0.2024, + "step": 4480 + }, + { + "epoch": 22.908163265306122, + "grad_norm": 1.110011339187622, + "learning_rate": 1.5418367346938777e-05, + "loss": 0.3532, + "step": 4490 + }, + { + "epoch": 22.959183673469386, + "grad_norm": 22.233642578125, + "learning_rate": 1.5408163265306123e-05, + "loss": 0.3653, + "step": 4500 + }, + { + "epoch": 23.0, + "eval_accuracy": 0.8989169675090253, + "eval_loss": 0.3536584675312042, + "eval_runtime": 0.9995, + "eval_samples_per_second": 277.141, + "eval_steps_per_second": 35.018, + "step": 4508 + }, + { + "epoch": 23.010204081632654, + "grad_norm": 19.268421173095703, + "learning_rate": 1.539795918367347e-05, + "loss": 0.2659, + "step": 4510 + }, + { + "epoch": 23.06122448979592, + "grad_norm": 10.453993797302246, + "learning_rate": 1.5387755102040818e-05, + "loss": 0.3287, + "step": 4520 + }, + { + "epoch": 23.112244897959183, + "grad_norm": 12.176286697387695, + "learning_rate": 1.5377551020408164e-05, + "loss": 0.2958, + "step": 4530 + }, + { + "epoch": 23.163265306122447, + "grad_norm": 27.02992820739746, + "learning_rate": 1.5367346938775513e-05, + "loss": 0.4748, + "step": 4540 + }, + { + "epoch": 23.214285714285715, + "grad_norm": 16.127704620361328, + "learning_rate": 1.535714285714286e-05, + "loss": 0.3475, + "step": 4550 + }, + { + "epoch": 23.26530612244898, + "grad_norm": 8.45223617553711, + "learning_rate": 1.5346938775510204e-05, + "loss": 0.2842, + "step": 4560 + }, + { + "epoch": 23.316326530612244, + "grad_norm": 4.0458173751831055, + "learning_rate": 1.5336734693877554e-05, + "loss": 0.3057, + "step": 4570 + }, + { + "epoch": 23.367346938775512, + "grad_norm": 7.543426513671875, + "learning_rate": 1.53265306122449e-05, + "loss": 0.3122, + "step": 4580 + }, + { + "epoch": 23.418367346938776, + "grad_norm": 16.05493927001953, + "learning_rate": 1.5316326530612245e-05, + "loss": 0.2647, + "step": 4590 + }, + { + "epoch": 23.46938775510204, + "grad_norm": 6.471177577972412, + "learning_rate": 1.530612244897959e-05, + "loss": 0.3331, + "step": 4600 + }, + { + "epoch": 23.520408163265305, + "grad_norm": 12.215070724487305, + "learning_rate": 1.529591836734694e-05, + "loss": 0.3584, + "step": 4610 + }, + { + "epoch": 23.571428571428573, + "grad_norm": 5.147373199462891, + "learning_rate": 1.5285714285714286e-05, + "loss": 0.3692, + "step": 4620 + }, + { + "epoch": 23.622448979591837, + "grad_norm": 17.560205459594727, + "learning_rate": 1.5275510204081635e-05, + "loss": 0.3557, + "step": 4630 + }, + { + "epoch": 23.6734693877551, + "grad_norm": 7.91624116897583, + "learning_rate": 1.526530612244898e-05, + "loss": 0.2358, + "step": 4640 + }, + { + "epoch": 23.724489795918366, + "grad_norm": 1.7177863121032715, + "learning_rate": 1.5255102040816327e-05, + "loss": 0.3093, + "step": 4650 + }, + { + "epoch": 23.775510204081634, + "grad_norm": 22.782413482666016, + "learning_rate": 1.5244897959183676e-05, + "loss": 0.2665, + "step": 4660 + }, + { + "epoch": 23.8265306122449, + "grad_norm": 13.676918983459473, + "learning_rate": 1.5234693877551022e-05, + "loss": 0.4009, + "step": 4670 + }, + { + "epoch": 23.877551020408163, + "grad_norm": 3.520563840866089, + "learning_rate": 1.522448979591837e-05, + "loss": 0.4495, + "step": 4680 + }, + { + "epoch": 23.928571428571427, + "grad_norm": 15.287459373474121, + "learning_rate": 1.5214285714285715e-05, + "loss": 0.2907, + "step": 4690 + }, + { + "epoch": 23.979591836734695, + "grad_norm": 8.540837287902832, + "learning_rate": 1.5204081632653063e-05, + "loss": 0.344, + "step": 4700 + }, + { + "epoch": 24.0, + "eval_accuracy": 0.8808664259927798, + "eval_loss": 0.4757726788520813, + "eval_runtime": 0.9993, + "eval_samples_per_second": 277.188, + "eval_steps_per_second": 35.024, + "step": 4704 + }, + { + "epoch": 24.03061224489796, + "grad_norm": 19.70223045349121, + "learning_rate": 1.5193877551020408e-05, + "loss": 0.3386, + "step": 4710 + }, + { + "epoch": 24.081632653061224, + "grad_norm": 9.351834297180176, + "learning_rate": 1.5183673469387756e-05, + "loss": 0.3604, + "step": 4720 + }, + { + "epoch": 24.132653061224488, + "grad_norm": 9.551629066467285, + "learning_rate": 1.5173469387755105e-05, + "loss": 0.2962, + "step": 4730 + }, + { + "epoch": 24.183673469387756, + "grad_norm": 9.061222076416016, + "learning_rate": 1.516326530612245e-05, + "loss": 0.5333, + "step": 4740 + }, + { + "epoch": 24.23469387755102, + "grad_norm": 4.798241138458252, + "learning_rate": 1.5153061224489798e-05, + "loss": 0.5315, + "step": 4750 + }, + { + "epoch": 24.285714285714285, + "grad_norm": 7.423737049102783, + "learning_rate": 1.5142857142857144e-05, + "loss": 0.2938, + "step": 4760 + }, + { + "epoch": 24.336734693877553, + "grad_norm": 5.41143274307251, + "learning_rate": 1.5132653061224492e-05, + "loss": 0.1947, + "step": 4770 + }, + { + "epoch": 24.387755102040817, + "grad_norm": 2.4057772159576416, + "learning_rate": 1.5122448979591837e-05, + "loss": 0.103, + "step": 4780 + }, + { + "epoch": 24.43877551020408, + "grad_norm": 5.339882850646973, + "learning_rate": 1.5112244897959185e-05, + "loss": 0.2654, + "step": 4790 + }, + { + "epoch": 24.489795918367346, + "grad_norm": 7.864975452423096, + "learning_rate": 1.510204081632653e-05, + "loss": 0.3742, + "step": 4800 + }, + { + "epoch": 24.540816326530614, + "grad_norm": 16.52027130126953, + "learning_rate": 1.5091836734693878e-05, + "loss": 0.4978, + "step": 4810 + }, + { + "epoch": 24.591836734693878, + "grad_norm": 24.7138671875, + "learning_rate": 1.5081632653061227e-05, + "loss": 0.4509, + "step": 4820 + }, + { + "epoch": 24.642857142857142, + "grad_norm": 5.2749176025390625, + "learning_rate": 1.5071428571428573e-05, + "loss": 0.2312, + "step": 4830 + }, + { + "epoch": 24.693877551020407, + "grad_norm": 14.561326026916504, + "learning_rate": 1.506122448979592e-05, + "loss": 0.3655, + "step": 4840 + }, + { + "epoch": 24.744897959183675, + "grad_norm": 14.318037033081055, + "learning_rate": 1.5051020408163266e-05, + "loss": 0.4761, + "step": 4850 + }, + { + "epoch": 24.79591836734694, + "grad_norm": 10.539541244506836, + "learning_rate": 1.5040816326530614e-05, + "loss": 0.2808, + "step": 4860 + }, + { + "epoch": 24.846938775510203, + "grad_norm": 7.134363651275635, + "learning_rate": 1.503061224489796e-05, + "loss": 0.2832, + "step": 4870 + }, + { + "epoch": 24.897959183673468, + "grad_norm": 12.88166332244873, + "learning_rate": 1.5020408163265307e-05, + "loss": 0.309, + "step": 4880 + }, + { + "epoch": 24.948979591836736, + "grad_norm": 2.4710118770599365, + "learning_rate": 1.5010204081632653e-05, + "loss": 0.3716, + "step": 4890 + }, + { + "epoch": 25.0, + "grad_norm": 25.229394912719727, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.2819, + "step": 4900 + }, + { + "epoch": 25.0, + "eval_accuracy": 0.8989169675090253, + "eval_loss": 0.4317737817764282, + "eval_runtime": 0.9999, + "eval_samples_per_second": 277.019, + "eval_steps_per_second": 35.002, + "step": 4900 + }, + { + "epoch": 25.051020408163264, + "grad_norm": 22.905338287353516, + "learning_rate": 1.498979591836735e-05, + "loss": 0.4168, + "step": 4910 + }, + { + "epoch": 25.102040816326532, + "grad_norm": 29.783021926879883, + "learning_rate": 1.4979591836734695e-05, + "loss": 0.3811, + "step": 4920 + }, + { + "epoch": 25.153061224489797, + "grad_norm": 27.190210342407227, + "learning_rate": 1.4969387755102043e-05, + "loss": 0.2427, + "step": 4930 + }, + { + "epoch": 25.20408163265306, + "grad_norm": 0.8235872983932495, + "learning_rate": 1.4959183673469389e-05, + "loss": 0.4389, + "step": 4940 + }, + { + "epoch": 25.255102040816325, + "grad_norm": 19.529327392578125, + "learning_rate": 1.4948979591836736e-05, + "loss": 0.4845, + "step": 4950 + }, + { + "epoch": 25.306122448979593, + "grad_norm": 10.911797523498535, + "learning_rate": 1.4938775510204082e-05, + "loss": 0.2379, + "step": 4960 + }, + { + "epoch": 25.357142857142858, + "grad_norm": 5.454984188079834, + "learning_rate": 1.492857142857143e-05, + "loss": 0.3351, + "step": 4970 + }, + { + "epoch": 25.408163265306122, + "grad_norm": 3.233513355255127, + "learning_rate": 1.4918367346938775e-05, + "loss": 0.3645, + "step": 4980 + }, + { + "epoch": 25.459183673469386, + "grad_norm": 15.120831489562988, + "learning_rate": 1.4908163265306124e-05, + "loss": 0.2244, + "step": 4990 + }, + { + "epoch": 25.510204081632654, + "grad_norm": 10.843507766723633, + "learning_rate": 1.4897959183673472e-05, + "loss": 0.2098, + "step": 5000 + }, + { + "epoch": 25.56122448979592, + "grad_norm": 3.795426845550537, + "learning_rate": 1.4887755102040818e-05, + "loss": 0.2247, + "step": 5010 + }, + { + "epoch": 25.612244897959183, + "grad_norm": 19.793106079101562, + "learning_rate": 1.4877551020408165e-05, + "loss": 0.3439, + "step": 5020 + }, + { + "epoch": 25.663265306122447, + "grad_norm": 2.860851287841797, + "learning_rate": 1.4867346938775511e-05, + "loss": 0.3408, + "step": 5030 + }, + { + "epoch": 25.714285714285715, + "grad_norm": 17.430416107177734, + "learning_rate": 1.4857142857142858e-05, + "loss": 0.2864, + "step": 5040 + }, + { + "epoch": 25.76530612244898, + "grad_norm": 16.919153213500977, + "learning_rate": 1.4846938775510204e-05, + "loss": 0.3402, + "step": 5050 + }, + { + "epoch": 25.816326530612244, + "grad_norm": 17.400094985961914, + "learning_rate": 1.4836734693877552e-05, + "loss": 0.2115, + "step": 5060 + }, + { + "epoch": 25.867346938775512, + "grad_norm": 18.359149932861328, + "learning_rate": 1.4826530612244897e-05, + "loss": 0.3076, + "step": 5070 + }, + { + "epoch": 25.918367346938776, + "grad_norm": 20.640539169311523, + "learning_rate": 1.4816326530612247e-05, + "loss": 0.4399, + "step": 5080 + }, + { + "epoch": 25.96938775510204, + "grad_norm": 17.191911697387695, + "learning_rate": 1.4806122448979594e-05, + "loss": 0.513, + "step": 5090 + }, + { + "epoch": 26.0, + "eval_accuracy": 0.8411552346570397, + "eval_loss": 0.4276503324508667, + "eval_runtime": 1.0085, + "eval_samples_per_second": 274.665, + "eval_steps_per_second": 34.705, + "step": 5096 + }, + { + "epoch": 26.020408163265305, + "grad_norm": 2.7476930618286133, + "learning_rate": 1.479591836734694e-05, + "loss": 0.1332, + "step": 5100 + }, + { + "epoch": 26.071428571428573, + "grad_norm": 13.904044151306152, + "learning_rate": 1.4785714285714287e-05, + "loss": 0.3359, + "step": 5110 + }, + { + "epoch": 26.122448979591837, + "grad_norm": 4.479464054107666, + "learning_rate": 1.4775510204081633e-05, + "loss": 0.2455, + "step": 5120 + }, + { + "epoch": 26.1734693877551, + "grad_norm": 1.6331404447555542, + "learning_rate": 1.476530612244898e-05, + "loss": 0.2045, + "step": 5130 + }, + { + "epoch": 26.224489795918366, + "grad_norm": 16.57945442199707, + "learning_rate": 1.4755102040816326e-05, + "loss": 0.299, + "step": 5140 + }, + { + "epoch": 26.275510204081634, + "grad_norm": 17.72007179260254, + "learning_rate": 1.4744897959183676e-05, + "loss": 0.3895, + "step": 5150 + }, + { + "epoch": 26.3265306122449, + "grad_norm": 3.837421417236328, + "learning_rate": 1.4734693877551021e-05, + "loss": 0.2555, + "step": 5160 + }, + { + "epoch": 26.377551020408163, + "grad_norm": 4.05374002456665, + "learning_rate": 1.4724489795918369e-05, + "loss": 0.3095, + "step": 5170 + }, + { + "epoch": 26.428571428571427, + "grad_norm": 1.146775722503662, + "learning_rate": 1.4714285714285716e-05, + "loss": 0.4116, + "step": 5180 + }, + { + "epoch": 26.479591836734695, + "grad_norm": 10.186199188232422, + "learning_rate": 1.4704081632653062e-05, + "loss": 0.3021, + "step": 5190 + }, + { + "epoch": 26.53061224489796, + "grad_norm": 9.732986450195312, + "learning_rate": 1.469387755102041e-05, + "loss": 0.3106, + "step": 5200 + }, + { + "epoch": 26.581632653061224, + "grad_norm": 16.072511672973633, + "learning_rate": 1.4683673469387756e-05, + "loss": 0.2345, + "step": 5210 + }, + { + "epoch": 26.632653061224488, + "grad_norm": 20.76091194152832, + "learning_rate": 1.4673469387755103e-05, + "loss": 0.3601, + "step": 5220 + }, + { + "epoch": 26.683673469387756, + "grad_norm": 12.08093547821045, + "learning_rate": 1.4663265306122449e-05, + "loss": 0.3567, + "step": 5230 + }, + { + "epoch": 26.73469387755102, + "grad_norm": 14.45045280456543, + "learning_rate": 1.4653061224489798e-05, + "loss": 0.3076, + "step": 5240 + }, + { + "epoch": 26.785714285714285, + "grad_norm": 35.13410949707031, + "learning_rate": 1.4642857142857144e-05, + "loss": 0.4542, + "step": 5250 + }, + { + "epoch": 26.836734693877553, + "grad_norm": 2.0448801517486572, + "learning_rate": 1.4632653061224491e-05, + "loss": 0.3049, + "step": 5260 + }, + { + "epoch": 26.887755102040817, + "grad_norm": 2.505708932876587, + "learning_rate": 1.4622448979591839e-05, + "loss": 0.3942, + "step": 5270 + }, + { + "epoch": 26.93877551020408, + "grad_norm": 4.900257110595703, + "learning_rate": 1.4612244897959185e-05, + "loss": 0.2056, + "step": 5280 + }, + { + "epoch": 26.989795918367346, + "grad_norm": 9.844413757324219, + "learning_rate": 1.4602040816326532e-05, + "loss": 0.201, + "step": 5290 + }, + { + "epoch": 27.0, + "eval_accuracy": 0.8953068592057761, + "eval_loss": 0.39148736000061035, + "eval_runtime": 1.0178, + "eval_samples_per_second": 272.162, + "eval_steps_per_second": 34.389, + "step": 5292 + }, + { + "epoch": 27.040816326530614, + "grad_norm": 1.834780216217041, + "learning_rate": 1.4591836734693878e-05, + "loss": 0.1804, + "step": 5300 + }, + { + "epoch": 27.091836734693878, + "grad_norm": 19.975610733032227, + "learning_rate": 1.4581632653061227e-05, + "loss": 0.6168, + "step": 5310 + }, + { + "epoch": 27.142857142857142, + "grad_norm": 17.089984893798828, + "learning_rate": 1.4571428571428573e-05, + "loss": 0.364, + "step": 5320 + }, + { + "epoch": 27.193877551020407, + "grad_norm": 4.962829113006592, + "learning_rate": 1.456122448979592e-05, + "loss": 0.205, + "step": 5330 + }, + { + "epoch": 27.244897959183675, + "grad_norm": 13.654166221618652, + "learning_rate": 1.4551020408163266e-05, + "loss": 0.4686, + "step": 5340 + }, + { + "epoch": 27.29591836734694, + "grad_norm": 9.818408966064453, + "learning_rate": 1.4540816326530614e-05, + "loss": 0.3038, + "step": 5350 + }, + { + "epoch": 27.346938775510203, + "grad_norm": 24.390851974487305, + "learning_rate": 1.4530612244897961e-05, + "loss": 0.2355, + "step": 5360 + }, + { + "epoch": 27.397959183673468, + "grad_norm": 4.868263244628906, + "learning_rate": 1.4520408163265307e-05, + "loss": 0.6586, + "step": 5370 + }, + { + "epoch": 27.448979591836736, + "grad_norm": 1.5648664236068726, + "learning_rate": 1.4510204081632654e-05, + "loss": 0.2769, + "step": 5380 + }, + { + "epoch": 27.5, + "grad_norm": 20.872623443603516, + "learning_rate": 1.45e-05, + "loss": 0.2246, + "step": 5390 + }, + { + "epoch": 27.551020408163264, + "grad_norm": 23.290847778320312, + "learning_rate": 1.448979591836735e-05, + "loss": 0.3162, + "step": 5400 + }, + { + "epoch": 27.602040816326532, + "grad_norm": 7.259654521942139, + "learning_rate": 1.4479591836734695e-05, + "loss": 0.2266, + "step": 5410 + }, + { + "epoch": 27.653061224489797, + "grad_norm": 23.416406631469727, + "learning_rate": 1.4469387755102043e-05, + "loss": 0.3571, + "step": 5420 + }, + { + "epoch": 27.70408163265306, + "grad_norm": 11.246001243591309, + "learning_rate": 1.4459183673469388e-05, + "loss": 0.4936, + "step": 5430 + }, + { + "epoch": 27.755102040816325, + "grad_norm": 12.674735069274902, + "learning_rate": 1.4448979591836736e-05, + "loss": 0.3005, + "step": 5440 + }, + { + "epoch": 27.806122448979593, + "grad_norm": 4.631349563598633, + "learning_rate": 1.4438775510204083e-05, + "loss": 0.4068, + "step": 5450 + }, + { + "epoch": 27.857142857142858, + "grad_norm": 19.165470123291016, + "learning_rate": 1.4428571428571429e-05, + "loss": 0.548, + "step": 5460 + }, + { + "epoch": 27.908163265306122, + "grad_norm": 32.93234634399414, + "learning_rate": 1.4418367346938778e-05, + "loss": 0.2997, + "step": 5470 + }, + { + "epoch": 27.959183673469386, + "grad_norm": 4.722479820251465, + "learning_rate": 1.4408163265306122e-05, + "loss": 0.2696, + "step": 5480 + }, + { + "epoch": 28.0, + "eval_accuracy": 0.8808664259927798, + "eval_loss": 0.44008469581604004, + "eval_runtime": 1.0128, + "eval_samples_per_second": 273.489, + "eval_steps_per_second": 34.556, + "step": 5488 + }, + { + "epoch": 28.010204081632654, + "grad_norm": 25.07076072692871, + "learning_rate": 1.4397959183673472e-05, + "loss": 0.4141, + "step": 5490 + }, + { + "epoch": 28.06122448979592, + "grad_norm": 4.1565070152282715, + "learning_rate": 1.4387755102040817e-05, + "loss": 0.1632, + "step": 5500 + }, + { + "epoch": 28.112244897959183, + "grad_norm": 28.585725784301758, + "learning_rate": 1.4377551020408165e-05, + "loss": 0.3566, + "step": 5510 + }, + { + "epoch": 28.163265306122447, + "grad_norm": 18.321834564208984, + "learning_rate": 1.436734693877551e-05, + "loss": 0.3612, + "step": 5520 + }, + { + "epoch": 28.214285714285715, + "grad_norm": 12.811814308166504, + "learning_rate": 1.4357142857142858e-05, + "loss": 0.4465, + "step": 5530 + }, + { + "epoch": 28.26530612244898, + "grad_norm": 4.566598415374756, + "learning_rate": 1.4346938775510206e-05, + "loss": 0.2903, + "step": 5540 + }, + { + "epoch": 28.316326530612244, + "grad_norm": 7.645042419433594, + "learning_rate": 1.4336734693877551e-05, + "loss": 0.1417, + "step": 5550 + }, + { + "epoch": 28.367346938775512, + "grad_norm": 20.164094924926758, + "learning_rate": 1.43265306122449e-05, + "loss": 0.2403, + "step": 5560 + }, + { + "epoch": 28.418367346938776, + "grad_norm": 12.705169677734375, + "learning_rate": 1.4316326530612246e-05, + "loss": 0.2522, + "step": 5570 + }, + { + "epoch": 28.46938775510204, + "grad_norm": 13.839006423950195, + "learning_rate": 1.4306122448979594e-05, + "loss": 0.6352, + "step": 5580 + }, + { + "epoch": 28.520408163265305, + "grad_norm": 11.320032119750977, + "learning_rate": 1.429591836734694e-05, + "loss": 0.3408, + "step": 5590 + }, + { + "epoch": 28.571428571428573, + "grad_norm": 12.19134521484375, + "learning_rate": 1.4285714285714287e-05, + "loss": 0.3551, + "step": 5600 + }, + { + "epoch": 28.622448979591837, + "grad_norm": 12.064301490783691, + "learning_rate": 1.4275510204081633e-05, + "loss": 0.1987, + "step": 5610 + }, + { + "epoch": 28.6734693877551, + "grad_norm": 7.575666427612305, + "learning_rate": 1.426530612244898e-05, + "loss": 0.5542, + "step": 5620 + }, + { + "epoch": 28.724489795918366, + "grad_norm": 5.452394008636475, + "learning_rate": 1.425510204081633e-05, + "loss": 0.3024, + "step": 5630 + }, + { + "epoch": 28.775510204081634, + "grad_norm": 18.780208587646484, + "learning_rate": 1.4244897959183674e-05, + "loss": 0.2462, + "step": 5640 + }, + { + "epoch": 28.8265306122449, + "grad_norm": 10.71102523803711, + "learning_rate": 1.4234693877551023e-05, + "loss": 0.3256, + "step": 5650 + }, + { + "epoch": 28.877551020408163, + "grad_norm": 4.647825241088867, + "learning_rate": 1.4224489795918369e-05, + "loss": 0.2888, + "step": 5660 + }, + { + "epoch": 28.928571428571427, + "grad_norm": 4.365809917449951, + "learning_rate": 1.4214285714285716e-05, + "loss": 0.3827, + "step": 5670 + }, + { + "epoch": 28.979591836734695, + "grad_norm": 21.51952362060547, + "learning_rate": 1.4204081632653062e-05, + "loss": 0.4204, + "step": 5680 + }, + { + "epoch": 29.0, + "eval_accuracy": 0.8953068592057761, + "eval_loss": 0.385593056678772, + "eval_runtime": 1.0873, + "eval_samples_per_second": 254.759, + "eval_steps_per_second": 32.19, + "step": 5684 + }, + { + "epoch": 29.03061224489796, + "grad_norm": 21.908313751220703, + "learning_rate": 1.419387755102041e-05, + "loss": 0.3382, + "step": 5690 + }, + { + "epoch": 29.081632653061224, + "grad_norm": 16.30230140686035, + "learning_rate": 1.4183673469387755e-05, + "loss": 0.2409, + "step": 5700 + }, + { + "epoch": 29.132653061224488, + "grad_norm": 29.51148223876953, + "learning_rate": 1.4173469387755103e-05, + "loss": 0.5759, + "step": 5710 + }, + { + "epoch": 29.183673469387756, + "grad_norm": 3.673567533493042, + "learning_rate": 1.4163265306122452e-05, + "loss": 0.3792, + "step": 5720 + }, + { + "epoch": 29.23469387755102, + "grad_norm": 5.580427646636963, + "learning_rate": 1.4153061224489798e-05, + "loss": 0.629, + "step": 5730 + }, + { + "epoch": 29.285714285714285, + "grad_norm": 9.82301139831543, + "learning_rate": 1.4142857142857145e-05, + "loss": 0.3223, + "step": 5740 + }, + { + "epoch": 29.336734693877553, + "grad_norm": 2.7968478202819824, + "learning_rate": 1.4132653061224491e-05, + "loss": 0.3176, + "step": 5750 + }, + { + "epoch": 29.387755102040817, + "grad_norm": 23.745586395263672, + "learning_rate": 1.4122448979591838e-05, + "loss": 0.2588, + "step": 5760 + }, + { + "epoch": 29.43877551020408, + "grad_norm": 12.200143814086914, + "learning_rate": 1.4112244897959184e-05, + "loss": 0.3084, + "step": 5770 + }, + { + "epoch": 29.489795918367346, + "grad_norm": 3.805999755859375, + "learning_rate": 1.4102040816326532e-05, + "loss": 0.1969, + "step": 5780 + }, + { + "epoch": 29.540816326530614, + "grad_norm": 14.452210426330566, + "learning_rate": 1.4091836734693877e-05, + "loss": 0.3836, + "step": 5790 + }, + { + "epoch": 29.591836734693878, + "grad_norm": 13.828787803649902, + "learning_rate": 1.4081632653061225e-05, + "loss": 0.3301, + "step": 5800 + }, + { + "epoch": 29.642857142857142, + "grad_norm": 2.7561838626861572, + "learning_rate": 1.4071428571428574e-05, + "loss": 0.3378, + "step": 5810 + }, + { + "epoch": 29.693877551020407, + "grad_norm": 2.1031112670898438, + "learning_rate": 1.406122448979592e-05, + "loss": 0.3025, + "step": 5820 + }, + { + "epoch": 29.744897959183675, + "grad_norm": 21.27263069152832, + "learning_rate": 1.4051020408163267e-05, + "loss": 0.272, + "step": 5830 + }, + { + "epoch": 29.79591836734694, + "grad_norm": 6.730413436889648, + "learning_rate": 1.4040816326530613e-05, + "loss": 0.2741, + "step": 5840 + }, + { + "epoch": 29.846938775510203, + "grad_norm": 12.93132495880127, + "learning_rate": 1.403061224489796e-05, + "loss": 0.3056, + "step": 5850 + }, + { + "epoch": 29.897959183673468, + "grad_norm": 3.3267087936401367, + "learning_rate": 1.4020408163265307e-05, + "loss": 0.209, + "step": 5860 + }, + { + "epoch": 29.948979591836736, + "grad_norm": 6.220228672027588, + "learning_rate": 1.4010204081632654e-05, + "loss": 0.3034, + "step": 5870 + }, + { + "epoch": 30.0, + "grad_norm": 29.713401794433594, + "learning_rate": 1.4e-05, + "loss": 0.316, + "step": 5880 + }, + { + "epoch": 30.0, + "eval_accuracy": 0.8844765342960289, + "eval_loss": 0.35761767625808716, + "eval_runtime": 1.0042, + "eval_samples_per_second": 275.845, + "eval_steps_per_second": 34.854, + "step": 5880 + }, + { + "epoch": 30.051020408163264, + "grad_norm": 5.262813568115234, + "learning_rate": 1.3989795918367349e-05, + "loss": 0.1933, + "step": 5890 + }, + { + "epoch": 30.102040816326532, + "grad_norm": 11.50768756866455, + "learning_rate": 1.3979591836734696e-05, + "loss": 0.3883, + "step": 5900 + }, + { + "epoch": 30.153061224489797, + "grad_norm": 30.632522583007812, + "learning_rate": 1.3969387755102042e-05, + "loss": 0.5667, + "step": 5910 + }, + { + "epoch": 30.20408163265306, + "grad_norm": 12.896961212158203, + "learning_rate": 1.395918367346939e-05, + "loss": 0.1942, + "step": 5920 + }, + { + "epoch": 30.255102040816325, + "grad_norm": 2.1163856983184814, + "learning_rate": 1.3948979591836736e-05, + "loss": 0.3992, + "step": 5930 + }, + { + "epoch": 30.306122448979593, + "grad_norm": 12.458740234375, + "learning_rate": 1.3938775510204083e-05, + "loss": 0.264, + "step": 5940 + }, + { + "epoch": 30.357142857142858, + "grad_norm": 10.592092514038086, + "learning_rate": 1.3928571428571429e-05, + "loss": 0.3937, + "step": 5950 + }, + { + "epoch": 30.408163265306122, + "grad_norm": 17.164011001586914, + "learning_rate": 1.3918367346938776e-05, + "loss": 0.2325, + "step": 5960 + }, + { + "epoch": 30.459183673469386, + "grad_norm": 22.710243225097656, + "learning_rate": 1.3908163265306122e-05, + "loss": 0.3232, + "step": 5970 + }, + { + "epoch": 30.510204081632654, + "grad_norm": 17.9942626953125, + "learning_rate": 1.3897959183673471e-05, + "loss": 0.3218, + "step": 5980 + }, + { + "epoch": 30.56122448979592, + "grad_norm": 10.71388053894043, + "learning_rate": 1.3887755102040819e-05, + "loss": 0.1924, + "step": 5990 + }, + { + "epoch": 30.612244897959183, + "grad_norm": 4.5374298095703125, + "learning_rate": 1.3877551020408165e-05, + "loss": 0.1991, + "step": 6000 + }, + { + "epoch": 30.663265306122447, + "grad_norm": 20.602048873901367, + "learning_rate": 1.3867346938775512e-05, + "loss": 0.2749, + "step": 6010 + }, + { + "epoch": 30.714285714285715, + "grad_norm": 21.387718200683594, + "learning_rate": 1.3857142857142858e-05, + "loss": 0.2354, + "step": 6020 + }, + { + "epoch": 30.76530612244898, + "grad_norm": 8.55083179473877, + "learning_rate": 1.3846938775510205e-05, + "loss": 0.2413, + "step": 6030 + }, + { + "epoch": 30.816326530612244, + "grad_norm": 0.4691735804080963, + "learning_rate": 1.3836734693877551e-05, + "loss": 0.1936, + "step": 6040 + }, + { + "epoch": 30.867346938775512, + "grad_norm": 17.750972747802734, + "learning_rate": 1.38265306122449e-05, + "loss": 0.3542, + "step": 6050 + }, + { + "epoch": 30.918367346938776, + "grad_norm": 21.47021484375, + "learning_rate": 1.3816326530612244e-05, + "loss": 0.2644, + "step": 6060 + }, + { + "epoch": 30.96938775510204, + "grad_norm": 26.4416446685791, + "learning_rate": 1.3806122448979594e-05, + "loss": 0.3102, + "step": 6070 + }, + { + "epoch": 31.0, + "eval_accuracy": 0.8808664259927798, + "eval_loss": 0.41550442576408386, + "eval_runtime": 0.9958, + "eval_samples_per_second": 278.18, + "eval_steps_per_second": 35.149, + "step": 6076 + }, + { + "epoch": 31.020408163265305, + "grad_norm": 1.689475655555725, + "learning_rate": 1.3795918367346941e-05, + "loss": 0.1551, + "step": 6080 + }, + { + "epoch": 31.071428571428573, + "grad_norm": 2.0812947750091553, + "learning_rate": 1.3785714285714287e-05, + "loss": 0.3782, + "step": 6090 + }, + { + "epoch": 31.122448979591837, + "grad_norm": 1.7405927181243896, + "learning_rate": 1.3775510204081634e-05, + "loss": 0.2421, + "step": 6100 + }, + { + "epoch": 31.1734693877551, + "grad_norm": 15.318486213684082, + "learning_rate": 1.376530612244898e-05, + "loss": 0.2474, + "step": 6110 + }, + { + "epoch": 31.224489795918366, + "grad_norm": 17.843570709228516, + "learning_rate": 1.3755102040816328e-05, + "loss": 0.2283, + "step": 6120 + }, + { + "epoch": 31.275510204081634, + "grad_norm": 15.758756637573242, + "learning_rate": 1.3744897959183673e-05, + "loss": 0.3602, + "step": 6130 + }, + { + "epoch": 31.3265306122449, + "grad_norm": 20.06702423095703, + "learning_rate": 1.3734693877551023e-05, + "loss": 0.2499, + "step": 6140 + }, + { + "epoch": 31.377551020408163, + "grad_norm": 27.892908096313477, + "learning_rate": 1.3724489795918368e-05, + "loss": 0.4033, + "step": 6150 + }, + { + "epoch": 31.428571428571427, + "grad_norm": 14.079084396362305, + "learning_rate": 1.3714285714285716e-05, + "loss": 0.328, + "step": 6160 + }, + { + "epoch": 31.479591836734695, + "grad_norm": 4.357316970825195, + "learning_rate": 1.3704081632653062e-05, + "loss": 0.1255, + "step": 6170 + }, + { + "epoch": 31.53061224489796, + "grad_norm": 13.244762420654297, + "learning_rate": 1.3693877551020409e-05, + "loss": 0.2623, + "step": 6180 + }, + { + "epoch": 31.581632653061224, + "grad_norm": 14.224111557006836, + "learning_rate": 1.3683673469387757e-05, + "loss": 0.4046, + "step": 6190 + }, + { + "epoch": 31.632653061224488, + "grad_norm": 12.079656600952148, + "learning_rate": 1.3673469387755102e-05, + "loss": 0.3786, + "step": 6200 + }, + { + "epoch": 31.683673469387756, + "grad_norm": 7.610096454620361, + "learning_rate": 1.366326530612245e-05, + "loss": 0.448, + "step": 6210 + }, + { + "epoch": 31.73469387755102, + "grad_norm": 10.16573715209961, + "learning_rate": 1.3653061224489796e-05, + "loss": 0.173, + "step": 6220 + }, + { + "epoch": 31.785714285714285, + "grad_norm": 0.7280598282814026, + "learning_rate": 1.3642857142857145e-05, + "loss": 0.1907, + "step": 6230 + }, + { + "epoch": 31.836734693877553, + "grad_norm": 2.926581859588623, + "learning_rate": 1.363265306122449e-05, + "loss": 0.3869, + "step": 6240 + }, + { + "epoch": 31.887755102040817, + "grad_norm": 12.126615524291992, + "learning_rate": 1.3622448979591838e-05, + "loss": 0.2023, + "step": 6250 + }, + { + "epoch": 31.93877551020408, + "grad_norm": 15.220383644104004, + "learning_rate": 1.3612244897959184e-05, + "loss": 0.282, + "step": 6260 + }, + { + "epoch": 31.989795918367346, + "grad_norm": 3.091996431350708, + "learning_rate": 1.3602040816326531e-05, + "loss": 0.1489, + "step": 6270 + }, + { + "epoch": 32.0, + "eval_accuracy": 0.8953068592057761, + "eval_loss": 0.414660781621933, + "eval_runtime": 0.9916, + "eval_samples_per_second": 279.355, + "eval_steps_per_second": 35.298, + "step": 6272 + }, + { + "epoch": 32.04081632653061, + "grad_norm": 2.010187864303589, + "learning_rate": 1.3591836734693879e-05, + "loss": 0.3755, + "step": 6280 + }, + { + "epoch": 32.09183673469388, + "grad_norm": 11.569319725036621, + "learning_rate": 1.3581632653061225e-05, + "loss": 0.4776, + "step": 6290 + }, + { + "epoch": 32.142857142857146, + "grad_norm": 25.95221710205078, + "learning_rate": 1.3571428571428574e-05, + "loss": 0.3517, + "step": 6300 + }, + { + "epoch": 32.19387755102041, + "grad_norm": 12.648597717285156, + "learning_rate": 1.356122448979592e-05, + "loss": 0.2227, + "step": 6310 + }, + { + "epoch": 32.244897959183675, + "grad_norm": 18.473045349121094, + "learning_rate": 1.3551020408163267e-05, + "loss": 0.2718, + "step": 6320 + }, + { + "epoch": 32.295918367346935, + "grad_norm": 11.099507331848145, + "learning_rate": 1.3540816326530613e-05, + "loss": 0.3097, + "step": 6330 + }, + { + "epoch": 32.3469387755102, + "grad_norm": 26.682863235473633, + "learning_rate": 1.353061224489796e-05, + "loss": 0.3769, + "step": 6340 + }, + { + "epoch": 32.39795918367347, + "grad_norm": 28.082658767700195, + "learning_rate": 1.3520408163265306e-05, + "loss": 0.3046, + "step": 6350 + }, + { + "epoch": 32.44897959183673, + "grad_norm": 3.026536464691162, + "learning_rate": 1.3510204081632654e-05, + "loss": 0.224, + "step": 6360 + }, + { + "epoch": 32.5, + "grad_norm": 15.811055183410645, + "learning_rate": 1.3500000000000001e-05, + "loss": 0.3271, + "step": 6370 + }, + { + "epoch": 32.55102040816327, + "grad_norm": 32.113582611083984, + "learning_rate": 1.3489795918367347e-05, + "loss": 0.4922, + "step": 6380 + }, + { + "epoch": 32.60204081632653, + "grad_norm": 28.253660202026367, + "learning_rate": 1.3479591836734696e-05, + "loss": 0.3316, + "step": 6390 + }, + { + "epoch": 32.6530612244898, + "grad_norm": 4.436378479003906, + "learning_rate": 1.3469387755102042e-05, + "loss": 0.2818, + "step": 6400 + }, + { + "epoch": 32.704081632653065, + "grad_norm": 20.662548065185547, + "learning_rate": 1.345918367346939e-05, + "loss": 0.336, + "step": 6410 + }, + { + "epoch": 32.755102040816325, + "grad_norm": 18.65795135498047, + "learning_rate": 1.3448979591836735e-05, + "loss": 0.5264, + "step": 6420 + }, + { + "epoch": 32.80612244897959, + "grad_norm": 12.983723640441895, + "learning_rate": 1.3438775510204083e-05, + "loss": 0.1678, + "step": 6430 + }, + { + "epoch": 32.857142857142854, + "grad_norm": 25.585878372192383, + "learning_rate": 1.3428571428571429e-05, + "loss": 0.2327, + "step": 6440 + }, + { + "epoch": 32.90816326530612, + "grad_norm": 3.003679037094116, + "learning_rate": 1.3418367346938776e-05, + "loss": 0.3554, + "step": 6450 + }, + { + "epoch": 32.95918367346939, + "grad_norm": 19.482391357421875, + "learning_rate": 1.3408163265306125e-05, + "loss": 0.3302, + "step": 6460 + }, + { + "epoch": 33.0, + "eval_accuracy": 0.8953068592057761, + "eval_loss": 0.421672523021698, + "eval_runtime": 1.0893, + "eval_samples_per_second": 254.289, + "eval_steps_per_second": 32.13, + "step": 6468 + }, + { + "epoch": 33.01020408163265, + "grad_norm": 4.783666610717773, + "learning_rate": 1.3397959183673471e-05, + "loss": 0.237, + "step": 6470 + }, + { + "epoch": 33.06122448979592, + "grad_norm": 13.98491382598877, + "learning_rate": 1.3387755102040818e-05, + "loss": 0.5504, + "step": 6480 + }, + { + "epoch": 33.11224489795919, + "grad_norm": 7.5733232498168945, + "learning_rate": 1.3377551020408164e-05, + "loss": 0.423, + "step": 6490 + }, + { + "epoch": 33.16326530612245, + "grad_norm": 11.854218482971191, + "learning_rate": 1.3367346938775512e-05, + "loss": 0.2428, + "step": 6500 + }, + { + "epoch": 33.214285714285715, + "grad_norm": 1.5104568004608154, + "learning_rate": 1.3357142857142858e-05, + "loss": 0.1456, + "step": 6510 + }, + { + "epoch": 33.265306122448976, + "grad_norm": 22.483104705810547, + "learning_rate": 1.3346938775510205e-05, + "loss": 0.2623, + "step": 6520 + }, + { + "epoch": 33.316326530612244, + "grad_norm": 9.039490699768066, + "learning_rate": 1.333673469387755e-05, + "loss": 0.2224, + "step": 6530 + }, + { + "epoch": 33.36734693877551, + "grad_norm": 3.5957486629486084, + "learning_rate": 1.3326530612244898e-05, + "loss": 0.1757, + "step": 6540 + }, + { + "epoch": 33.41836734693877, + "grad_norm": 19.635908126831055, + "learning_rate": 1.3316326530612247e-05, + "loss": 0.2706, + "step": 6550 + }, + { + "epoch": 33.46938775510204, + "grad_norm": 5.550487518310547, + "learning_rate": 1.3306122448979593e-05, + "loss": 0.3051, + "step": 6560 + }, + { + "epoch": 33.52040816326531, + "grad_norm": 3.6563050746917725, + "learning_rate": 1.329591836734694e-05, + "loss": 0.262, + "step": 6570 + }, + { + "epoch": 33.57142857142857, + "grad_norm": 9.856231689453125, + "learning_rate": 1.3285714285714287e-05, + "loss": 0.5495, + "step": 6580 + }, + { + "epoch": 33.62244897959184, + "grad_norm": 25.28274154663086, + "learning_rate": 1.3275510204081634e-05, + "loss": 0.3389, + "step": 6590 + }, + { + "epoch": 33.673469387755105, + "grad_norm": 2.5876970291137695, + "learning_rate": 1.326530612244898e-05, + "loss": 0.4269, + "step": 6600 + }, + { + "epoch": 33.724489795918366, + "grad_norm": 1.3976984024047852, + "learning_rate": 1.3255102040816327e-05, + "loss": 0.3466, + "step": 6610 + }, + { + "epoch": 33.775510204081634, + "grad_norm": 8.693070411682129, + "learning_rate": 1.3244897959183673e-05, + "loss": 0.1979, + "step": 6620 + }, + { + "epoch": 33.826530612244895, + "grad_norm": 14.884737014770508, + "learning_rate": 1.323469387755102e-05, + "loss": 0.2377, + "step": 6630 + }, + { + "epoch": 33.87755102040816, + "grad_norm": 1.1342506408691406, + "learning_rate": 1.322448979591837e-05, + "loss": 0.3088, + "step": 6640 + }, + { + "epoch": 33.92857142857143, + "grad_norm": 19.607181549072266, + "learning_rate": 1.3214285714285716e-05, + "loss": 0.3551, + "step": 6650 + }, + { + "epoch": 33.97959183673469, + "grad_norm": 34.719234466552734, + "learning_rate": 1.3204081632653063e-05, + "loss": 0.3271, + "step": 6660 + }, + { + "epoch": 34.0, + "eval_accuracy": 0.9097472924187726, + "eval_loss": 0.33211877942085266, + "eval_runtime": 1.0891, + "eval_samples_per_second": 254.349, + "eval_steps_per_second": 32.138, + "step": 6664 + }, + { + "epoch": 34.03061224489796, + "grad_norm": 22.916015625, + "learning_rate": 1.3193877551020409e-05, + "loss": 0.3623, + "step": 6670 + }, + { + "epoch": 34.08163265306123, + "grad_norm": 4.890042304992676, + "learning_rate": 1.3183673469387756e-05, + "loss": 0.2825, + "step": 6680 + }, + { + "epoch": 34.13265306122449, + "grad_norm": 25.53321075439453, + "learning_rate": 1.3173469387755102e-05, + "loss": 0.3848, + "step": 6690 + }, + { + "epoch": 34.183673469387756, + "grad_norm": 2.501490592956543, + "learning_rate": 1.316326530612245e-05, + "loss": 0.2237, + "step": 6700 + }, + { + "epoch": 34.234693877551024, + "grad_norm": 3.4242489337921143, + "learning_rate": 1.3153061224489795e-05, + "loss": 0.3331, + "step": 6710 + }, + { + "epoch": 34.285714285714285, + "grad_norm": 14.375162124633789, + "learning_rate": 1.3142857142857145e-05, + "loss": 0.1626, + "step": 6720 + }, + { + "epoch": 34.33673469387755, + "grad_norm": 3.5311148166656494, + "learning_rate": 1.3132653061224492e-05, + "loss": 0.3287, + "step": 6730 + }, + { + "epoch": 34.38775510204081, + "grad_norm": 4.28089714050293, + "learning_rate": 1.3122448979591838e-05, + "loss": 0.2532, + "step": 6740 + }, + { + "epoch": 34.43877551020408, + "grad_norm": 9.1922607421875, + "learning_rate": 1.3112244897959185e-05, + "loss": 0.1887, + "step": 6750 + }, + { + "epoch": 34.48979591836735, + "grad_norm": 12.233587265014648, + "learning_rate": 1.3102040816326531e-05, + "loss": 0.1482, + "step": 6760 + }, + { + "epoch": 34.54081632653061, + "grad_norm": 3.4384610652923584, + "learning_rate": 1.3091836734693879e-05, + "loss": 0.3258, + "step": 6770 + }, + { + "epoch": 34.59183673469388, + "grad_norm": 0.626457929611206, + "learning_rate": 1.3081632653061224e-05, + "loss": 0.2611, + "step": 6780 + }, + { + "epoch": 34.642857142857146, + "grad_norm": 22.938907623291016, + "learning_rate": 1.3071428571428572e-05, + "loss": 0.4706, + "step": 6790 + }, + { + "epoch": 34.69387755102041, + "grad_norm": 29.896095275878906, + "learning_rate": 1.3061224489795918e-05, + "loss": 0.3081, + "step": 6800 + }, + { + "epoch": 34.744897959183675, + "grad_norm": 23.988367080688477, + "learning_rate": 1.3051020408163267e-05, + "loss": 0.5593, + "step": 6810 + }, + { + "epoch": 34.795918367346935, + "grad_norm": 23.784255981445312, + "learning_rate": 1.3040816326530614e-05, + "loss": 0.3245, + "step": 6820 + }, + { + "epoch": 34.8469387755102, + "grad_norm": 4.192731857299805, + "learning_rate": 1.303061224489796e-05, + "loss": 0.2971, + "step": 6830 + }, + { + "epoch": 34.89795918367347, + "grad_norm": 2.2600066661834717, + "learning_rate": 1.3020408163265308e-05, + "loss": 0.3537, + "step": 6840 + }, + { + "epoch": 34.94897959183673, + "grad_norm": 21.234739303588867, + "learning_rate": 1.3010204081632653e-05, + "loss": 0.4262, + "step": 6850 + }, + { + "epoch": 35.0, + "grad_norm": 3.5019752979278564, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.3481, + "step": 6860 + }, + { + "epoch": 35.0, + "eval_accuracy": 0.8808664259927798, + "eval_loss": 0.3828442692756653, + "eval_runtime": 1.0106, + "eval_samples_per_second": 274.082, + "eval_steps_per_second": 34.631, + "step": 6860 + }, + { + "epoch": 35.05102040816327, + "grad_norm": 18.692649841308594, + "learning_rate": 1.2989795918367347e-05, + "loss": 0.1811, + "step": 6870 + }, + { + "epoch": 35.10204081632653, + "grad_norm": 7.501255035400391, + "learning_rate": 1.2979591836734696e-05, + "loss": 0.5427, + "step": 6880 + }, + { + "epoch": 35.1530612244898, + "grad_norm": 5.126681327819824, + "learning_rate": 1.2969387755102042e-05, + "loss": 0.2387, + "step": 6890 + }, + { + "epoch": 35.204081632653065, + "grad_norm": 1.2979735136032104, + "learning_rate": 1.2959183673469389e-05, + "loss": 0.2105, + "step": 6900 + }, + { + "epoch": 35.255102040816325, + "grad_norm": 5.969475746154785, + "learning_rate": 1.2948979591836737e-05, + "loss": 0.1645, + "step": 6910 + }, + { + "epoch": 35.30612244897959, + "grad_norm": 1.7133474349975586, + "learning_rate": 1.2938775510204082e-05, + "loss": 0.4852, + "step": 6920 + }, + { + "epoch": 35.357142857142854, + "grad_norm": 14.442985534667969, + "learning_rate": 1.292857142857143e-05, + "loss": 0.2917, + "step": 6930 + }, + { + "epoch": 35.40816326530612, + "grad_norm": 2.490236520767212, + "learning_rate": 1.2918367346938776e-05, + "loss": 0.3352, + "step": 6940 + }, + { + "epoch": 35.45918367346939, + "grad_norm": 11.736788749694824, + "learning_rate": 1.2908163265306123e-05, + "loss": 0.2837, + "step": 6950 + }, + { + "epoch": 35.51020408163265, + "grad_norm": 11.13012981414795, + "learning_rate": 1.2897959183673469e-05, + "loss": 0.2303, + "step": 6960 + }, + { + "epoch": 35.56122448979592, + "grad_norm": 10.291847229003906, + "learning_rate": 1.2887755102040818e-05, + "loss": 0.5007, + "step": 6970 + }, + { + "epoch": 35.61224489795919, + "grad_norm": 10.566892623901367, + "learning_rate": 1.2877551020408164e-05, + "loss": 0.2897, + "step": 6980 + }, + { + "epoch": 35.66326530612245, + "grad_norm": 19.692846298217773, + "learning_rate": 1.2867346938775511e-05, + "loss": 0.4057, + "step": 6990 + }, + { + "epoch": 35.714285714285715, + "grad_norm": 12.77248764038086, + "learning_rate": 1.2857142857142859e-05, + "loss": 0.1773, + "step": 7000 + }, + { + "epoch": 35.765306122448976, + "grad_norm": 12.873383522033691, + "learning_rate": 1.2846938775510205e-05, + "loss": 0.5471, + "step": 7010 + }, + { + "epoch": 35.816326530612244, + "grad_norm": 7.839117527008057, + "learning_rate": 1.2836734693877552e-05, + "loss": 0.4321, + "step": 7020 + }, + { + "epoch": 35.86734693877551, + "grad_norm": 4.145122051239014, + "learning_rate": 1.2826530612244898e-05, + "loss": 0.2516, + "step": 7030 + }, + { + "epoch": 35.91836734693877, + "grad_norm": 21.064868927001953, + "learning_rate": 1.2816326530612247e-05, + "loss": 0.3976, + "step": 7040 + }, + { + "epoch": 35.96938775510204, + "grad_norm": 22.921911239624023, + "learning_rate": 1.2806122448979591e-05, + "loss": 0.3329, + "step": 7050 + }, + { + "epoch": 36.0, + "eval_accuracy": 0.8700361010830325, + "eval_loss": 0.40445831418037415, + "eval_runtime": 1.002, + "eval_samples_per_second": 276.444, + "eval_steps_per_second": 34.93, + "step": 7056 + }, + { + "epoch": 36.02040816326531, + "grad_norm": 6.55795431137085, + "learning_rate": 1.279591836734694e-05, + "loss": 0.057, + "step": 7060 + }, + { + "epoch": 36.07142857142857, + "grad_norm": 5.174421787261963, + "learning_rate": 1.2785714285714286e-05, + "loss": 0.413, + "step": 7070 + }, + { + "epoch": 36.12244897959184, + "grad_norm": 15.93181037902832, + "learning_rate": 1.2775510204081634e-05, + "loss": 0.417, + "step": 7080 + }, + { + "epoch": 36.173469387755105, + "grad_norm": 18.251239776611328, + "learning_rate": 1.2765306122448981e-05, + "loss": 0.2599, + "step": 7090 + }, + { + "epoch": 36.224489795918366, + "grad_norm": 20.241962432861328, + "learning_rate": 1.2755102040816327e-05, + "loss": 0.2203, + "step": 7100 + }, + { + "epoch": 36.275510204081634, + "grad_norm": 27.565418243408203, + "learning_rate": 1.2744897959183674e-05, + "loss": 0.2819, + "step": 7110 + }, + { + "epoch": 36.326530612244895, + "grad_norm": 3.0462327003479004, + "learning_rate": 1.273469387755102e-05, + "loss": 0.1527, + "step": 7120 + }, + { + "epoch": 36.37755102040816, + "grad_norm": 16.962770462036133, + "learning_rate": 1.272448979591837e-05, + "loss": 0.338, + "step": 7130 + }, + { + "epoch": 36.42857142857143, + "grad_norm": 20.103633880615234, + "learning_rate": 1.2714285714285715e-05, + "loss": 0.2093, + "step": 7140 + }, + { + "epoch": 36.47959183673469, + "grad_norm": 17.761137008666992, + "learning_rate": 1.2704081632653063e-05, + "loss": 0.4383, + "step": 7150 + }, + { + "epoch": 36.53061224489796, + "grad_norm": 20.75181770324707, + "learning_rate": 1.2693877551020409e-05, + "loss": 0.3117, + "step": 7160 + }, + { + "epoch": 36.58163265306123, + "grad_norm": 16.521347045898438, + "learning_rate": 1.2683673469387756e-05, + "loss": 0.1105, + "step": 7170 + }, + { + "epoch": 36.63265306122449, + "grad_norm": 6.464329719543457, + "learning_rate": 1.2673469387755104e-05, + "loss": 0.19, + "step": 7180 + }, + { + "epoch": 36.683673469387756, + "grad_norm": 17.33024024963379, + "learning_rate": 1.266326530612245e-05, + "loss": 0.256, + "step": 7190 + }, + { + "epoch": 36.734693877551024, + "grad_norm": 22.29949951171875, + "learning_rate": 1.2653061224489798e-05, + "loss": 0.689, + "step": 7200 + }, + { + "epoch": 36.785714285714285, + "grad_norm": 1.2629824876785278, + "learning_rate": 1.2642857142857143e-05, + "loss": 0.3282, + "step": 7210 + }, + { + "epoch": 36.83673469387755, + "grad_norm": 29.325130462646484, + "learning_rate": 1.2632653061224492e-05, + "loss": 0.3625, + "step": 7220 + }, + { + "epoch": 36.88775510204081, + "grad_norm": 1.556731104850769, + "learning_rate": 1.2622448979591838e-05, + "loss": 0.168, + "step": 7230 + }, + { + "epoch": 36.93877551020408, + "grad_norm": 22.19536590576172, + "learning_rate": 1.2612244897959185e-05, + "loss": 0.4576, + "step": 7240 + }, + { + "epoch": 36.98979591836735, + "grad_norm": 12.314927101135254, + "learning_rate": 1.260204081632653e-05, + "loss": 0.2471, + "step": 7250 + }, + { + "epoch": 37.0, + "eval_accuracy": 0.8664259927797834, + "eval_loss": 0.5535942912101746, + "eval_runtime": 1.0051, + "eval_samples_per_second": 275.599, + "eval_steps_per_second": 34.823, + "step": 7252 + }, + { + "epoch": 37.04081632653061, + "grad_norm": 1.0027203559875488, + "learning_rate": 1.2591836734693878e-05, + "loss": 0.1253, + "step": 7260 + }, + { + "epoch": 37.09183673469388, + "grad_norm": 2.7528111934661865, + "learning_rate": 1.2581632653061226e-05, + "loss": 0.305, + "step": 7270 + }, + { + "epoch": 37.142857142857146, + "grad_norm": 11.119230270385742, + "learning_rate": 1.2571428571428572e-05, + "loss": 0.3205, + "step": 7280 + }, + { + "epoch": 37.19387755102041, + "grad_norm": 18.911418914794922, + "learning_rate": 1.256122448979592e-05, + "loss": 0.3724, + "step": 7290 + }, + { + "epoch": 37.244897959183675, + "grad_norm": 1.9840826988220215, + "learning_rate": 1.2551020408163267e-05, + "loss": 0.2847, + "step": 7300 + }, + { + "epoch": 37.295918367346935, + "grad_norm": 20.070383071899414, + "learning_rate": 1.2540816326530614e-05, + "loss": 0.3686, + "step": 7310 + }, + { + "epoch": 37.3469387755102, + "grad_norm": 19.21637725830078, + "learning_rate": 1.253061224489796e-05, + "loss": 0.3369, + "step": 7320 + }, + { + "epoch": 37.39795918367347, + "grad_norm": 6.611429214477539, + "learning_rate": 1.2520408163265307e-05, + "loss": 0.224, + "step": 7330 + }, + { + "epoch": 37.44897959183673, + "grad_norm": 4.903932094573975, + "learning_rate": 1.2510204081632653e-05, + "loss": 0.3194, + "step": 7340 + }, + { + "epoch": 37.5, + "grad_norm": 20.827777862548828, + "learning_rate": 1.25e-05, + "loss": 0.4181, + "step": 7350 + }, + { + "epoch": 37.55102040816327, + "grad_norm": 17.60271453857422, + "learning_rate": 1.248979591836735e-05, + "loss": 0.4379, + "step": 7360 + }, + { + "epoch": 37.60204081632653, + "grad_norm": 3.3018884658813477, + "learning_rate": 1.2479591836734694e-05, + "loss": 0.4048, + "step": 7370 + }, + { + "epoch": 37.6530612244898, + "grad_norm": 2.094794511795044, + "learning_rate": 1.2469387755102043e-05, + "loss": 0.2347, + "step": 7380 + }, + { + "epoch": 37.704081632653065, + "grad_norm": 10.990236282348633, + "learning_rate": 1.2459183673469389e-05, + "loss": 0.3287, + "step": 7390 + }, + { + "epoch": 37.755102040816325, + "grad_norm": 16.42524528503418, + "learning_rate": 1.2448979591836736e-05, + "loss": 0.2203, + "step": 7400 + }, + { + "epoch": 37.80612244897959, + "grad_norm": 14.186802864074707, + "learning_rate": 1.2438775510204082e-05, + "loss": 0.1834, + "step": 7410 + }, + { + "epoch": 37.857142857142854, + "grad_norm": 2.535600185394287, + "learning_rate": 1.242857142857143e-05, + "loss": 0.3713, + "step": 7420 + }, + { + "epoch": 37.90816326530612, + "grad_norm": 6.273879528045654, + "learning_rate": 1.2418367346938775e-05, + "loss": 0.192, + "step": 7430 + }, + { + "epoch": 37.95918367346939, + "grad_norm": 13.878371238708496, + "learning_rate": 1.2408163265306123e-05, + "loss": 0.2007, + "step": 7440 + }, + { + "epoch": 38.0, + "eval_accuracy": 0.8880866425992779, + "eval_loss": 0.3503141403198242, + "eval_runtime": 1.0006, + "eval_samples_per_second": 276.83, + "eval_steps_per_second": 34.979, + "step": 7448 + }, + { + "epoch": 38.01020408163265, + "grad_norm": 19.95601463317871, + "learning_rate": 1.2397959183673472e-05, + "loss": 0.3343, + "step": 7450 + }, + { + "epoch": 38.06122448979592, + "grad_norm": 9.128788948059082, + "learning_rate": 1.2387755102040818e-05, + "loss": 0.3976, + "step": 7460 + }, + { + "epoch": 38.11224489795919, + "grad_norm": 17.617860794067383, + "learning_rate": 1.2377551020408165e-05, + "loss": 0.1981, + "step": 7470 + }, + { + "epoch": 38.16326530612245, + "grad_norm": 9.201713562011719, + "learning_rate": 1.2367346938775511e-05, + "loss": 0.3274, + "step": 7480 + }, + { + "epoch": 38.214285714285715, + "grad_norm": 25.265117645263672, + "learning_rate": 1.2357142857142859e-05, + "loss": 0.2634, + "step": 7490 + }, + { + "epoch": 38.265306122448976, + "grad_norm": 30.585693359375, + "learning_rate": 1.2346938775510204e-05, + "loss": 0.3861, + "step": 7500 + }, + { + "epoch": 38.316326530612244, + "grad_norm": 14.335867881774902, + "learning_rate": 1.2336734693877552e-05, + "loss": 0.3166, + "step": 7510 + }, + { + "epoch": 38.36734693877551, + "grad_norm": 28.094099044799805, + "learning_rate": 1.2326530612244898e-05, + "loss": 0.2284, + "step": 7520 + }, + { + "epoch": 38.41836734693877, + "grad_norm": 14.234325408935547, + "learning_rate": 1.2316326530612245e-05, + "loss": 0.4422, + "step": 7530 + }, + { + "epoch": 38.46938775510204, + "grad_norm": 19.829790115356445, + "learning_rate": 1.2306122448979594e-05, + "loss": 0.3144, + "step": 7540 + }, + { + "epoch": 38.52040816326531, + "grad_norm": 11.633485794067383, + "learning_rate": 1.229591836734694e-05, + "loss": 0.2291, + "step": 7550 + }, + { + "epoch": 38.57142857142857, + "grad_norm": 4.391861438751221, + "learning_rate": 1.2285714285714288e-05, + "loss": 0.2305, + "step": 7560 + }, + { + "epoch": 38.62244897959184, + "grad_norm": 14.22461986541748, + "learning_rate": 1.2275510204081633e-05, + "loss": 0.2359, + "step": 7570 + }, + { + "epoch": 38.673469387755105, + "grad_norm": 17.062314987182617, + "learning_rate": 1.2265306122448981e-05, + "loss": 0.2513, + "step": 7580 + }, + { + "epoch": 38.724489795918366, + "grad_norm": 8.868117332458496, + "learning_rate": 1.2255102040816327e-05, + "loss": 0.2274, + "step": 7590 + }, + { + "epoch": 38.775510204081634, + "grad_norm": 13.727202415466309, + "learning_rate": 1.2244897959183674e-05, + "loss": 0.3451, + "step": 7600 + }, + { + "epoch": 38.826530612244895, + "grad_norm": 1.1569433212280273, + "learning_rate": 1.223469387755102e-05, + "loss": 0.1574, + "step": 7610 + }, + { + "epoch": 38.87755102040816, + "grad_norm": 7.617978096008301, + "learning_rate": 1.222448979591837e-05, + "loss": 0.0933, + "step": 7620 + }, + { + "epoch": 38.92857142857143, + "grad_norm": 16.137170791625977, + "learning_rate": 1.2214285714285717e-05, + "loss": 0.3403, + "step": 7630 + }, + { + "epoch": 38.97959183673469, + "grad_norm": 33.752437591552734, + "learning_rate": 1.2204081632653062e-05, + "loss": 0.7535, + "step": 7640 + }, + { + "epoch": 39.0, + "eval_accuracy": 0.8808664259927798, + "eval_loss": 0.48185965418815613, + "eval_runtime": 0.9998, + "eval_samples_per_second": 277.066, + "eval_steps_per_second": 35.008, + "step": 7644 + }, + { + "epoch": 39.03061224489796, + "grad_norm": 2.4557836055755615, + "learning_rate": 1.219387755102041e-05, + "loss": 0.1629, + "step": 7650 + }, + { + "epoch": 39.08163265306123, + "grad_norm": 25.87775993347168, + "learning_rate": 1.2183673469387756e-05, + "loss": 0.1883, + "step": 7660 + }, + { + "epoch": 39.13265306122449, + "grad_norm": 14.176460266113281, + "learning_rate": 1.2173469387755103e-05, + "loss": 0.3217, + "step": 7670 + }, + { + "epoch": 39.183673469387756, + "grad_norm": 2.4303107261657715, + "learning_rate": 1.2163265306122449e-05, + "loss": 0.1258, + "step": 7680 + }, + { + "epoch": 39.234693877551024, + "grad_norm": 1.8486857414245605, + "learning_rate": 1.2153061224489796e-05, + "loss": 0.422, + "step": 7690 + }, + { + "epoch": 39.285714285714285, + "grad_norm": 8.261406898498535, + "learning_rate": 1.2142857142857142e-05, + "loss": 0.1813, + "step": 7700 + }, + { + "epoch": 39.33673469387755, + "grad_norm": 18.07621192932129, + "learning_rate": 1.2132653061224491e-05, + "loss": 0.2892, + "step": 7710 + }, + { + "epoch": 39.38775510204081, + "grad_norm": 15.600359916687012, + "learning_rate": 1.2122448979591839e-05, + "loss": 0.35, + "step": 7720 + }, + { + "epoch": 39.43877551020408, + "grad_norm": 2.330838680267334, + "learning_rate": 1.2112244897959185e-05, + "loss": 0.3566, + "step": 7730 + }, + { + "epoch": 39.48979591836735, + "grad_norm": 5.433841705322266, + "learning_rate": 1.2102040816326532e-05, + "loss": 0.292, + "step": 7740 + }, + { + "epoch": 39.54081632653061, + "grad_norm": 26.33514404296875, + "learning_rate": 1.2091836734693878e-05, + "loss": 0.3071, + "step": 7750 + }, + { + "epoch": 39.59183673469388, + "grad_norm": 8.727425575256348, + "learning_rate": 1.2081632653061225e-05, + "loss": 0.2762, + "step": 7760 + }, + { + "epoch": 39.642857142857146, + "grad_norm": 15.274565696716309, + "learning_rate": 1.2071428571428571e-05, + "loss": 0.2448, + "step": 7770 + }, + { + "epoch": 39.69387755102041, + "grad_norm": 2.2880454063415527, + "learning_rate": 1.206122448979592e-05, + "loss": 0.2161, + "step": 7780 + }, + { + "epoch": 39.744897959183675, + "grad_norm": 12.748868942260742, + "learning_rate": 1.2051020408163265e-05, + "loss": 0.2902, + "step": 7790 + }, + { + "epoch": 39.795918367346935, + "grad_norm": 21.565149307250977, + "learning_rate": 1.2040816326530614e-05, + "loss": 0.3526, + "step": 7800 + }, + { + "epoch": 39.8469387755102, + "grad_norm": 30.639131546020508, + "learning_rate": 1.2030612244897961e-05, + "loss": 0.4223, + "step": 7810 + }, + { + "epoch": 39.89795918367347, + "grad_norm": 8.408995628356934, + "learning_rate": 1.2020408163265307e-05, + "loss": 0.4069, + "step": 7820 + }, + { + "epoch": 39.94897959183673, + "grad_norm": 14.231453895568848, + "learning_rate": 1.2010204081632655e-05, + "loss": 0.1139, + "step": 7830 + }, + { + "epoch": 40.0, + "grad_norm": 11.00492000579834, + "learning_rate": 1.2e-05, + "loss": 0.1851, + "step": 7840 + }, + { + "epoch": 40.0, + "eval_accuracy": 0.8772563176895307, + "eval_loss": 0.37624555826187134, + "eval_runtime": 1.0501, + "eval_samples_per_second": 263.79, + "eval_steps_per_second": 33.331, + "step": 7840 + }, + { + "epoch": 40.05102040816327, + "grad_norm": 23.236454010009766, + "learning_rate": 1.1989795918367348e-05, + "loss": 0.2318, + "step": 7850 + }, + { + "epoch": 40.10204081632653, + "grad_norm": 1.7677274942398071, + "learning_rate": 1.1979591836734694e-05, + "loss": 0.1826, + "step": 7860 + }, + { + "epoch": 40.1530612244898, + "grad_norm": 6.884891510009766, + "learning_rate": 1.1969387755102043e-05, + "loss": 0.2115, + "step": 7870 + }, + { + "epoch": 40.204081632653065, + "grad_norm": 3.632115125656128, + "learning_rate": 1.1959183673469389e-05, + "loss": 0.4134, + "step": 7880 + }, + { + "epoch": 40.255102040816325, + "grad_norm": 0.6816219687461853, + "learning_rate": 1.1948979591836736e-05, + "loss": 0.1937, + "step": 7890 + }, + { + "epoch": 40.30612244897959, + "grad_norm": 17.770343780517578, + "learning_rate": 1.1938775510204084e-05, + "loss": 0.4018, + "step": 7900 + }, + { + "epoch": 40.357142857142854, + "grad_norm": 13.780721664428711, + "learning_rate": 1.192857142857143e-05, + "loss": 0.3771, + "step": 7910 + }, + { + "epoch": 40.40816326530612, + "grad_norm": 7.485072135925293, + "learning_rate": 1.1918367346938777e-05, + "loss": 0.2585, + "step": 7920 + }, + { + "epoch": 40.45918367346939, + "grad_norm": 1.9782928228378296, + "learning_rate": 1.1908163265306123e-05, + "loss": 0.396, + "step": 7930 + }, + { + "epoch": 40.51020408163265, + "grad_norm": 19.686613082885742, + "learning_rate": 1.189795918367347e-05, + "loss": 0.3408, + "step": 7940 + }, + { + "epoch": 40.56122448979592, + "grad_norm": 23.277484893798828, + "learning_rate": 1.1887755102040816e-05, + "loss": 0.4797, + "step": 7950 + }, + { + "epoch": 40.61224489795919, + "grad_norm": 6.282851219177246, + "learning_rate": 1.1877551020408165e-05, + "loss": 0.1736, + "step": 7960 + }, + { + "epoch": 40.66326530612245, + "grad_norm": 21.922382354736328, + "learning_rate": 1.186734693877551e-05, + "loss": 0.2012, + "step": 7970 + }, + { + "epoch": 40.714285714285715, + "grad_norm": 2.7238967418670654, + "learning_rate": 1.1857142857142858e-05, + "loss": 0.3773, + "step": 7980 + }, + { + "epoch": 40.765306122448976, + "grad_norm": 23.622583389282227, + "learning_rate": 1.1846938775510206e-05, + "loss": 0.5247, + "step": 7990 + }, + { + "epoch": 40.816326530612244, + "grad_norm": 9.187670707702637, + "learning_rate": 1.1836734693877552e-05, + "loss": 0.1915, + "step": 8000 + }, + { + "epoch": 40.86734693877551, + "grad_norm": 5.409439563751221, + "learning_rate": 1.1826530612244899e-05, + "loss": 0.1435, + "step": 8010 + }, + { + "epoch": 40.91836734693877, + "grad_norm": 23.586383819580078, + "learning_rate": 1.1816326530612245e-05, + "loss": 0.3216, + "step": 8020 + }, + { + "epoch": 40.96938775510204, + "grad_norm": 19.306135177612305, + "learning_rate": 1.1806122448979594e-05, + "loss": 0.2329, + "step": 8030 + }, + { + "epoch": 41.0, + "eval_accuracy": 0.8844765342960289, + "eval_loss": 0.4465399384498596, + "eval_runtime": 1.0523, + "eval_samples_per_second": 263.224, + "eval_steps_per_second": 33.259, + "step": 8036 + }, + { + "epoch": 41.02040816326531, + "grad_norm": 3.706423044204712, + "learning_rate": 1.179591836734694e-05, + "loss": 0.5632, + "step": 8040 + }, + { + "epoch": 41.07142857142857, + "grad_norm": 2.856626033782959, + "learning_rate": 1.1785714285714287e-05, + "loss": 0.3427, + "step": 8050 + }, + { + "epoch": 41.12244897959184, + "grad_norm": 1.847086787223816, + "learning_rate": 1.1775510204081633e-05, + "loss": 0.1434, + "step": 8060 + }, + { + "epoch": 41.173469387755105, + "grad_norm": 8.23192310333252, + "learning_rate": 1.176530612244898e-05, + "loss": 0.3319, + "step": 8070 + }, + { + "epoch": 41.224489795918366, + "grad_norm": 10.342353820800781, + "learning_rate": 1.1755102040816328e-05, + "loss": 0.2896, + "step": 8080 + }, + { + "epoch": 41.275510204081634, + "grad_norm": 25.426326751708984, + "learning_rate": 1.1744897959183674e-05, + "loss": 0.2579, + "step": 8090 + }, + { + "epoch": 41.326530612244895, + "grad_norm": 21.861122131347656, + "learning_rate": 1.1734693877551021e-05, + "loss": 0.3948, + "step": 8100 + }, + { + "epoch": 41.37755102040816, + "grad_norm": 31.08426284790039, + "learning_rate": 1.1724489795918367e-05, + "loss": 0.2187, + "step": 8110 + }, + { + "epoch": 41.42857142857143, + "grad_norm": 1.3915305137634277, + "learning_rate": 1.1714285714285716e-05, + "loss": 0.3311, + "step": 8120 + }, + { + "epoch": 41.47959183673469, + "grad_norm": 6.71683406829834, + "learning_rate": 1.1704081632653062e-05, + "loss": 0.2503, + "step": 8130 + }, + { + "epoch": 41.53061224489796, + "grad_norm": 4.324688911437988, + "learning_rate": 1.169387755102041e-05, + "loss": 0.152, + "step": 8140 + }, + { + "epoch": 41.58163265306123, + "grad_norm": 19.36434555053711, + "learning_rate": 1.1683673469387755e-05, + "loss": 0.1851, + "step": 8150 + }, + { + "epoch": 41.63265306122449, + "grad_norm": 11.209128379821777, + "learning_rate": 1.1673469387755103e-05, + "loss": 0.4962, + "step": 8160 + }, + { + "epoch": 41.683673469387756, + "grad_norm": 5.311439514160156, + "learning_rate": 1.166326530612245e-05, + "loss": 0.4635, + "step": 8170 + }, + { + "epoch": 41.734693877551024, + "grad_norm": 28.95256996154785, + "learning_rate": 1.1653061224489796e-05, + "loss": 0.2224, + "step": 8180 + }, + { + "epoch": 41.785714285714285, + "grad_norm": 15.372788429260254, + "learning_rate": 1.1642857142857145e-05, + "loss": 0.1605, + "step": 8190 + }, + { + "epoch": 41.83673469387755, + "grad_norm": 16.921295166015625, + "learning_rate": 1.1632653061224491e-05, + "loss": 0.3375, + "step": 8200 + }, + { + "epoch": 41.88775510204081, + "grad_norm": 3.905909299850464, + "learning_rate": 1.1622448979591839e-05, + "loss": 0.2415, + "step": 8210 + }, + { + "epoch": 41.93877551020408, + "grad_norm": 10.678994178771973, + "learning_rate": 1.1612244897959184e-05, + "loss": 0.2091, + "step": 8220 + }, + { + "epoch": 41.98979591836735, + "grad_norm": 7.085869789123535, + "learning_rate": 1.1602040816326532e-05, + "loss": 0.2889, + "step": 8230 + }, + { + "epoch": 42.0, + "eval_accuracy": 0.9061371841155235, + "eval_loss": 0.46959710121154785, + "eval_runtime": 1.001, + "eval_samples_per_second": 276.721, + "eval_steps_per_second": 34.965, + "step": 8232 + }, + { + "epoch": 42.04081632653061, + "grad_norm": 8.582082748413086, + "learning_rate": 1.1591836734693878e-05, + "loss": 0.2984, + "step": 8240 + }, + { + "epoch": 42.09183673469388, + "grad_norm": 15.374835014343262, + "learning_rate": 1.1581632653061225e-05, + "loss": 0.2745, + "step": 8250 + }, + { + "epoch": 42.142857142857146, + "grad_norm": 15.124876976013184, + "learning_rate": 1.1571428571428573e-05, + "loss": 0.2039, + "step": 8260 + }, + { + "epoch": 42.19387755102041, + "grad_norm": 1.2030351161956787, + "learning_rate": 1.1561224489795918e-05, + "loss": 0.3055, + "step": 8270 + }, + { + "epoch": 42.244897959183675, + "grad_norm": 15.793198585510254, + "learning_rate": 1.1551020408163268e-05, + "loss": 0.0996, + "step": 8280 + }, + { + "epoch": 42.295918367346935, + "grad_norm": 0.6784889698028564, + "learning_rate": 1.1540816326530613e-05, + "loss": 0.4263, + "step": 8290 + }, + { + "epoch": 42.3469387755102, + "grad_norm": 3.279309034347534, + "learning_rate": 1.1530612244897961e-05, + "loss": 0.3612, + "step": 8300 + }, + { + "epoch": 42.39795918367347, + "grad_norm": 22.569589614868164, + "learning_rate": 1.1520408163265307e-05, + "loss": 0.2326, + "step": 8310 + }, + { + "epoch": 42.44897959183673, + "grad_norm": 17.62470054626465, + "learning_rate": 1.1510204081632654e-05, + "loss": 0.1926, + "step": 8320 + }, + { + "epoch": 42.5, + "grad_norm": 23.093355178833008, + "learning_rate": 1.15e-05, + "loss": 0.2079, + "step": 8330 + }, + { + "epoch": 42.55102040816327, + "grad_norm": 27.545778274536133, + "learning_rate": 1.1489795918367347e-05, + "loss": 0.3866, + "step": 8340 + }, + { + "epoch": 42.60204081632653, + "grad_norm": 17.604076385498047, + "learning_rate": 1.1479591836734697e-05, + "loss": 0.1971, + "step": 8350 + }, + { + "epoch": 42.6530612244898, + "grad_norm": 16.88875389099121, + "learning_rate": 1.146938775510204e-05, + "loss": 0.2792, + "step": 8360 + }, + { + "epoch": 42.704081632653065, + "grad_norm": 24.219806671142578, + "learning_rate": 1.145918367346939e-05, + "loss": 0.3288, + "step": 8370 + }, + { + "epoch": 42.755102040816325, + "grad_norm": 16.283964157104492, + "learning_rate": 1.1448979591836736e-05, + "loss": 0.4401, + "step": 8380 + }, + { + "epoch": 42.80612244897959, + "grad_norm": 12.87180233001709, + "learning_rate": 1.1438775510204083e-05, + "loss": 0.2874, + "step": 8390 + }, + { + "epoch": 42.857142857142854, + "grad_norm": 12.073690414428711, + "learning_rate": 1.1428571428571429e-05, + "loss": 0.2063, + "step": 8400 + }, + { + "epoch": 42.90816326530612, + "grad_norm": 2.407782793045044, + "learning_rate": 1.1418367346938777e-05, + "loss": 0.3469, + "step": 8410 + }, + { + "epoch": 42.95918367346939, + "grad_norm": 4.84604024887085, + "learning_rate": 1.1408163265306122e-05, + "loss": 0.1409, + "step": 8420 + }, + { + "epoch": 43.0, + "eval_accuracy": 0.8808664259927798, + "eval_loss": 0.4876045882701874, + "eval_runtime": 1.017, + "eval_samples_per_second": 272.366, + "eval_steps_per_second": 34.414, + "step": 8428 + }, + { + "epoch": 43.01020408163265, + "grad_norm": 23.3207950592041, + "learning_rate": 1.139795918367347e-05, + "loss": 0.2329, + "step": 8430 + }, + { + "epoch": 43.06122448979592, + "grad_norm": 12.801007270812988, + "learning_rate": 1.1387755102040819e-05, + "loss": 0.2373, + "step": 8440 + }, + { + "epoch": 43.11224489795919, + "grad_norm": 25.943416595458984, + "learning_rate": 1.1377551020408165e-05, + "loss": 0.2178, + "step": 8450 + }, + { + "epoch": 43.16326530612245, + "grad_norm": 14.695255279541016, + "learning_rate": 1.1367346938775512e-05, + "loss": 0.4705, + "step": 8460 + }, + { + "epoch": 43.214285714285715, + "grad_norm": 0.8865009546279907, + "learning_rate": 1.1357142857142858e-05, + "loss": 0.408, + "step": 8470 + }, + { + "epoch": 43.265306122448976, + "grad_norm": 2.6337521076202393, + "learning_rate": 1.1346938775510206e-05, + "loss": 0.0958, + "step": 8480 + }, + { + "epoch": 43.316326530612244, + "grad_norm": 2.748199701309204, + "learning_rate": 1.1336734693877551e-05, + "loss": 0.3298, + "step": 8490 + }, + { + "epoch": 43.36734693877551, + "grad_norm": 30.08228874206543, + "learning_rate": 1.1326530612244899e-05, + "loss": 0.2831, + "step": 8500 + }, + { + "epoch": 43.41836734693877, + "grad_norm": 11.666099548339844, + "learning_rate": 1.1316326530612245e-05, + "loss": 0.2202, + "step": 8510 + }, + { + "epoch": 43.46938775510204, + "grad_norm": 16.803131103515625, + "learning_rate": 1.1306122448979592e-05, + "loss": 0.2348, + "step": 8520 + }, + { + "epoch": 43.52040816326531, + "grad_norm": 22.457286834716797, + "learning_rate": 1.1295918367346941e-05, + "loss": 0.3235, + "step": 8530 + }, + { + "epoch": 43.57142857142857, + "grad_norm": 11.632996559143066, + "learning_rate": 1.1285714285714287e-05, + "loss": 0.1888, + "step": 8540 + }, + { + "epoch": 43.62244897959184, + "grad_norm": 2.564565420150757, + "learning_rate": 1.1275510204081635e-05, + "loss": 0.3178, + "step": 8550 + }, + { + "epoch": 43.673469387755105, + "grad_norm": 28.80295181274414, + "learning_rate": 1.126530612244898e-05, + "loss": 0.3112, + "step": 8560 + }, + { + "epoch": 43.724489795918366, + "grad_norm": 16.563232421875, + "learning_rate": 1.1255102040816328e-05, + "loss": 0.2168, + "step": 8570 + }, + { + "epoch": 43.775510204081634, + "grad_norm": 6.326534271240234, + "learning_rate": 1.1244897959183674e-05, + "loss": 0.167, + "step": 8580 + }, + { + "epoch": 43.826530612244895, + "grad_norm": 2.861398458480835, + "learning_rate": 1.1234693877551021e-05, + "loss": 0.2612, + "step": 8590 + }, + { + "epoch": 43.87755102040816, + "grad_norm": 2.5920863151550293, + "learning_rate": 1.1224489795918367e-05, + "loss": 0.1916, + "step": 8600 + }, + { + "epoch": 43.92857142857143, + "grad_norm": 23.582937240600586, + "learning_rate": 1.1214285714285716e-05, + "loss": 0.6427, + "step": 8610 + }, + { + "epoch": 43.97959183673469, + "grad_norm": 17.968748092651367, + "learning_rate": 1.1204081632653062e-05, + "loss": 0.2683, + "step": 8620 + }, + { + "epoch": 44.0, + "eval_accuracy": 0.8808664259927798, + "eval_loss": 0.6134239435195923, + "eval_runtime": 0.998, + "eval_samples_per_second": 277.55, + "eval_steps_per_second": 35.069, + "step": 8624 + }, + { + "epoch": 44.03061224489796, + "grad_norm": 5.34234619140625, + "learning_rate": 1.119387755102041e-05, + "loss": 0.2287, + "step": 8630 + }, + { + "epoch": 44.08163265306123, + "grad_norm": 19.025571823120117, + "learning_rate": 1.1183673469387757e-05, + "loss": 0.2447, + "step": 8640 + }, + { + "epoch": 44.13265306122449, + "grad_norm": 13.49502944946289, + "learning_rate": 1.1173469387755103e-05, + "loss": 0.3274, + "step": 8650 + }, + { + "epoch": 44.183673469387756, + "grad_norm": 28.83685302734375, + "learning_rate": 1.116326530612245e-05, + "loss": 0.2192, + "step": 8660 + }, + { + "epoch": 44.234693877551024, + "grad_norm": 0.44494903087615967, + "learning_rate": 1.1153061224489796e-05, + "loss": 0.3782, + "step": 8670 + }, + { + "epoch": 44.285714285714285, + "grad_norm": 24.09149742126465, + "learning_rate": 1.1142857142857143e-05, + "loss": 0.2657, + "step": 8680 + }, + { + "epoch": 44.33673469387755, + "grad_norm": 10.801912307739258, + "learning_rate": 1.113265306122449e-05, + "loss": 0.2235, + "step": 8690 + }, + { + "epoch": 44.38775510204081, + "grad_norm": 2.6411094665527344, + "learning_rate": 1.1122448979591838e-05, + "loss": 0.2212, + "step": 8700 + }, + { + "epoch": 44.43877551020408, + "grad_norm": 7.397251129150391, + "learning_rate": 1.1112244897959184e-05, + "loss": 0.3425, + "step": 8710 + }, + { + "epoch": 44.48979591836735, + "grad_norm": 1.0233888626098633, + "learning_rate": 1.1102040816326532e-05, + "loss": 0.2648, + "step": 8720 + }, + { + "epoch": 44.54081632653061, + "grad_norm": 31.182104110717773, + "learning_rate": 1.1091836734693879e-05, + "loss": 0.165, + "step": 8730 + }, + { + "epoch": 44.59183673469388, + "grad_norm": 18.37017059326172, + "learning_rate": 1.1081632653061225e-05, + "loss": 0.1269, + "step": 8740 + }, + { + "epoch": 44.642857142857146, + "grad_norm": 2.8033552169799805, + "learning_rate": 1.1071428571428572e-05, + "loss": 0.2118, + "step": 8750 + }, + { + "epoch": 44.69387755102041, + "grad_norm": 4.980088710784912, + "learning_rate": 1.1061224489795918e-05, + "loss": 0.2718, + "step": 8760 + }, + { + "epoch": 44.744897959183675, + "grad_norm": 28.23426055908203, + "learning_rate": 1.1051020408163267e-05, + "loss": 0.2805, + "step": 8770 + }, + { + "epoch": 44.795918367346935, + "grad_norm": 10.001432418823242, + "learning_rate": 1.1040816326530611e-05, + "loss": 0.1618, + "step": 8780 + }, + { + "epoch": 44.8469387755102, + "grad_norm": 1.6225627660751343, + "learning_rate": 1.103061224489796e-05, + "loss": 0.4171, + "step": 8790 + }, + { + "epoch": 44.89795918367347, + "grad_norm": 10.88904857635498, + "learning_rate": 1.1020408163265306e-05, + "loss": 0.2034, + "step": 8800 + }, + { + "epoch": 44.94897959183673, + "grad_norm": 14.345255851745605, + "learning_rate": 1.1010204081632654e-05, + "loss": 0.3092, + "step": 8810 + }, + { + "epoch": 45.0, + "grad_norm": 6.213929653167725, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.3535, + "step": 8820 + }, + { + "epoch": 45.0, + "eval_accuracy": 0.8808664259927798, + "eval_loss": 0.4364229440689087, + "eval_runtime": 0.9979, + "eval_samples_per_second": 277.578, + "eval_steps_per_second": 35.073, + "step": 8820 + }, + { + "epoch": 45.05102040816327, + "grad_norm": 14.02003002166748, + "learning_rate": 1.0989795918367347e-05, + "loss": 0.1289, + "step": 8830 + }, + { + "epoch": 45.10204081632653, + "grad_norm": 8.015976905822754, + "learning_rate": 1.0979591836734695e-05, + "loss": 0.2223, + "step": 8840 + }, + { + "epoch": 45.1530612244898, + "grad_norm": 17.441743850708008, + "learning_rate": 1.096938775510204e-05, + "loss": 0.1368, + "step": 8850 + }, + { + "epoch": 45.204081632653065, + "grad_norm": 33.40061950683594, + "learning_rate": 1.095918367346939e-05, + "loss": 0.3637, + "step": 8860 + }, + { + "epoch": 45.255102040816325, + "grad_norm": 14.13931655883789, + "learning_rate": 1.0948979591836735e-05, + "loss": 0.3804, + "step": 8870 + }, + { + "epoch": 45.30612244897959, + "grad_norm": 22.892864227294922, + "learning_rate": 1.0938775510204083e-05, + "loss": 0.2355, + "step": 8880 + }, + { + "epoch": 45.357142857142854, + "grad_norm": 31.64022445678711, + "learning_rate": 1.0928571428571429e-05, + "loss": 0.2492, + "step": 8890 + }, + { + "epoch": 45.40816326530612, + "grad_norm": 12.655797958374023, + "learning_rate": 1.0918367346938776e-05, + "loss": 0.3086, + "step": 8900 + }, + { + "epoch": 45.45918367346939, + "grad_norm": 2.748716115951538, + "learning_rate": 1.0908163265306124e-05, + "loss": 0.1747, + "step": 8910 + }, + { + "epoch": 45.51020408163265, + "grad_norm": 6.854188919067383, + "learning_rate": 1.089795918367347e-05, + "loss": 0.4709, + "step": 8920 + }, + { + "epoch": 45.56122448979592, + "grad_norm": 3.16744065284729, + "learning_rate": 1.0887755102040819e-05, + "loss": 0.315, + "step": 8930 + }, + { + "epoch": 45.61224489795919, + "grad_norm": 9.317691802978516, + "learning_rate": 1.0877551020408163e-05, + "loss": 0.4798, + "step": 8940 + }, + { + "epoch": 45.66326530612245, + "grad_norm": 5.054515838623047, + "learning_rate": 1.0867346938775512e-05, + "loss": 0.2815, + "step": 8950 + }, + { + "epoch": 45.714285714285715, + "grad_norm": 14.822017669677734, + "learning_rate": 1.0857142857142858e-05, + "loss": 0.2572, + "step": 8960 + }, + { + "epoch": 45.765306122448976, + "grad_norm": 26.83745765686035, + "learning_rate": 1.0846938775510205e-05, + "loss": 0.2549, + "step": 8970 + }, + { + "epoch": 45.816326530612244, + "grad_norm": 3.1329498291015625, + "learning_rate": 1.0836734693877551e-05, + "loss": 0.1893, + "step": 8980 + }, + { + "epoch": 45.86734693877551, + "grad_norm": 8.890630722045898, + "learning_rate": 1.0826530612244899e-05, + "loss": 0.1402, + "step": 8990 + }, + { + "epoch": 45.91836734693877, + "grad_norm": 13.858083724975586, + "learning_rate": 1.0816326530612246e-05, + "loss": 0.22, + "step": 9000 + }, + { + "epoch": 45.96938775510204, + "grad_norm": 28.541608810424805, + "learning_rate": 1.0806122448979592e-05, + "loss": 0.1683, + "step": 9010 + }, + { + "epoch": 46.0, + "eval_accuracy": 0.8880866425992779, + "eval_loss": 0.4059281349182129, + "eval_runtime": 1.0091, + "eval_samples_per_second": 274.5, + "eval_steps_per_second": 34.684, + "step": 9016 + }, + { + "epoch": 46.02040816326531, + "grad_norm": 39.618553161621094, + "learning_rate": 1.0795918367346941e-05, + "loss": 0.3841, + "step": 9020 + }, + { + "epoch": 46.07142857142857, + "grad_norm": 1.9254268407821655, + "learning_rate": 1.0785714285714287e-05, + "loss": 0.3476, + "step": 9030 + }, + { + "epoch": 46.12244897959184, + "grad_norm": 10.08415699005127, + "learning_rate": 1.0775510204081634e-05, + "loss": 0.3154, + "step": 9040 + }, + { + "epoch": 46.173469387755105, + "grad_norm": 24.28005599975586, + "learning_rate": 1.076530612244898e-05, + "loss": 0.2961, + "step": 9050 + }, + { + "epoch": 46.224489795918366, + "grad_norm": 8.578109741210938, + "learning_rate": 1.0755102040816328e-05, + "loss": 0.4395, + "step": 9060 + }, + { + "epoch": 46.275510204081634, + "grad_norm": 15.351518630981445, + "learning_rate": 1.0744897959183673e-05, + "loss": 0.1941, + "step": 9070 + }, + { + "epoch": 46.326530612244895, + "grad_norm": 4.9734978675842285, + "learning_rate": 1.073469387755102e-05, + "loss": 0.2479, + "step": 9080 + }, + { + "epoch": 46.37755102040816, + "grad_norm": 23.234966278076172, + "learning_rate": 1.072448979591837e-05, + "loss": 0.189, + "step": 9090 + }, + { + "epoch": 46.42857142857143, + "grad_norm": 14.160266876220703, + "learning_rate": 1.0714285714285714e-05, + "loss": 0.2577, + "step": 9100 + }, + { + "epoch": 46.47959183673469, + "grad_norm": 13.912514686584473, + "learning_rate": 1.0704081632653063e-05, + "loss": 0.2707, + "step": 9110 + }, + { + "epoch": 46.53061224489796, + "grad_norm": 22.197444915771484, + "learning_rate": 1.0693877551020409e-05, + "loss": 0.1601, + "step": 9120 + }, + { + "epoch": 46.58163265306123, + "grad_norm": 33.202762603759766, + "learning_rate": 1.0683673469387757e-05, + "loss": 0.6423, + "step": 9130 + }, + { + "epoch": 46.63265306122449, + "grad_norm": 3.903841495513916, + "learning_rate": 1.0673469387755102e-05, + "loss": 0.362, + "step": 9140 + }, + { + "epoch": 46.683673469387756, + "grad_norm": 9.70276927947998, + "learning_rate": 1.066326530612245e-05, + "loss": 0.1973, + "step": 9150 + }, + { + "epoch": 46.734693877551024, + "grad_norm": 11.807016372680664, + "learning_rate": 1.0653061224489796e-05, + "loss": 0.2036, + "step": 9160 + }, + { + "epoch": 46.785714285714285, + "grad_norm": 27.489591598510742, + "learning_rate": 1.0642857142857143e-05, + "loss": 0.2482, + "step": 9170 + }, + { + "epoch": 46.83673469387755, + "grad_norm": 11.883766174316406, + "learning_rate": 1.0632653061224492e-05, + "loss": 0.1474, + "step": 9180 + }, + { + "epoch": 46.88775510204081, + "grad_norm": 8.916913986206055, + "learning_rate": 1.0622448979591838e-05, + "loss": 0.1983, + "step": 9190 + }, + { + "epoch": 46.93877551020408, + "grad_norm": 28.345640182495117, + "learning_rate": 1.0612244897959186e-05, + "loss": 0.3937, + "step": 9200 + }, + { + "epoch": 46.98979591836735, + "grad_norm": 9.335458755493164, + "learning_rate": 1.0602040816326531e-05, + "loss": 0.43, + "step": 9210 + }, + { + "epoch": 47.0, + "eval_accuracy": 0.8880866425992779, + "eval_loss": 0.39545148611068726, + "eval_runtime": 0.9972, + "eval_samples_per_second": 277.769, + "eval_steps_per_second": 35.097, + "step": 9212 + }, + { + "epoch": 47.04081632653061, + "grad_norm": 13.055625915527344, + "learning_rate": 1.0591836734693879e-05, + "loss": 0.3295, + "step": 9220 + }, + { + "epoch": 47.09183673469388, + "grad_norm": 22.485971450805664, + "learning_rate": 1.0581632653061225e-05, + "loss": 0.2135, + "step": 9230 + }, + { + "epoch": 47.142857142857146, + "grad_norm": 26.52958869934082, + "learning_rate": 1.0571428571428572e-05, + "loss": 0.3395, + "step": 9240 + }, + { + "epoch": 47.19387755102041, + "grad_norm": 18.65353012084961, + "learning_rate": 1.0561224489795918e-05, + "loss": 0.129, + "step": 9250 + }, + { + "epoch": 47.244897959183675, + "grad_norm": 2.742490530014038, + "learning_rate": 1.0551020408163265e-05, + "loss": 0.4399, + "step": 9260 + }, + { + "epoch": 47.295918367346935, + "grad_norm": 26.377254486083984, + "learning_rate": 1.0540816326530615e-05, + "loss": 0.3054, + "step": 9270 + }, + { + "epoch": 47.3469387755102, + "grad_norm": 17.06761360168457, + "learning_rate": 1.053061224489796e-05, + "loss": 0.3281, + "step": 9280 + }, + { + "epoch": 47.39795918367347, + "grad_norm": 22.568580627441406, + "learning_rate": 1.0520408163265308e-05, + "loss": 0.2918, + "step": 9290 + }, + { + "epoch": 47.44897959183673, + "grad_norm": 6.5478034019470215, + "learning_rate": 1.0510204081632654e-05, + "loss": 0.2797, + "step": 9300 + }, + { + "epoch": 47.5, + "grad_norm": 3.246676206588745, + "learning_rate": 1.0500000000000001e-05, + "loss": 0.7281, + "step": 9310 + }, + { + "epoch": 47.55102040816327, + "grad_norm": 23.381351470947266, + "learning_rate": 1.0489795918367347e-05, + "loss": 0.2733, + "step": 9320 + }, + { + "epoch": 47.60204081632653, + "grad_norm": 22.62360954284668, + "learning_rate": 1.0479591836734694e-05, + "loss": 0.3415, + "step": 9330 + }, + { + "epoch": 47.6530612244898, + "grad_norm": 11.388172149658203, + "learning_rate": 1.046938775510204e-05, + "loss": 0.1324, + "step": 9340 + }, + { + "epoch": 47.704081632653065, + "grad_norm": 1.7270504236221313, + "learning_rate": 1.045918367346939e-05, + "loss": 0.2084, + "step": 9350 + }, + { + "epoch": 47.755102040816325, + "grad_norm": 2.4763684272766113, + "learning_rate": 1.0448979591836737e-05, + "loss": 0.3023, + "step": 9360 + }, + { + "epoch": 47.80612244897959, + "grad_norm": 1.4133647680282593, + "learning_rate": 1.0438775510204083e-05, + "loss": 0.1963, + "step": 9370 + }, + { + "epoch": 47.857142857142854, + "grad_norm": 15.471637725830078, + "learning_rate": 1.042857142857143e-05, + "loss": 0.258, + "step": 9380 + }, + { + "epoch": 47.90816326530612, + "grad_norm": 8.931612968444824, + "learning_rate": 1.0418367346938776e-05, + "loss": 0.0994, + "step": 9390 + }, + { + "epoch": 47.95918367346939, + "grad_norm": 22.438880920410156, + "learning_rate": 1.0408163265306123e-05, + "loss": 0.5702, + "step": 9400 + }, + { + "epoch": 48.0, + "eval_accuracy": 0.8808664259927798, + "eval_loss": 0.3898092210292816, + "eval_runtime": 1.0509, + "eval_samples_per_second": 263.582, + "eval_steps_per_second": 33.305, + "step": 9408 + }, + { + "epoch": 48.01020408163265, + "grad_norm": 0.4513070285320282, + "learning_rate": 1.039795918367347e-05, + "loss": 0.2696, + "step": 9410 + }, + { + "epoch": 48.06122448979592, + "grad_norm": 6.933629035949707, + "learning_rate": 1.0387755102040817e-05, + "loss": 0.2953, + "step": 9420 + }, + { + "epoch": 48.11224489795919, + "grad_norm": 33.02798080444336, + "learning_rate": 1.0377551020408162e-05, + "loss": 0.4555, + "step": 9430 + }, + { + "epoch": 48.16326530612245, + "grad_norm": 29.733671188354492, + "learning_rate": 1.0367346938775512e-05, + "loss": 0.2474, + "step": 9440 + }, + { + "epoch": 48.214285714285715, + "grad_norm": 11.83218002319336, + "learning_rate": 1.0357142857142859e-05, + "loss": 0.1938, + "step": 9450 + }, + { + "epoch": 48.265306122448976, + "grad_norm": 10.749662399291992, + "learning_rate": 1.0346938775510205e-05, + "loss": 0.2704, + "step": 9460 + }, + { + "epoch": 48.316326530612244, + "grad_norm": 12.345331192016602, + "learning_rate": 1.0336734693877552e-05, + "loss": 0.2977, + "step": 9470 + }, + { + "epoch": 48.36734693877551, + "grad_norm": 6.708714008331299, + "learning_rate": 1.0326530612244898e-05, + "loss": 0.2578, + "step": 9480 + }, + { + "epoch": 48.41836734693877, + "grad_norm": 22.155378341674805, + "learning_rate": 1.0316326530612246e-05, + "loss": 0.2791, + "step": 9490 + }, + { + "epoch": 48.46938775510204, + "grad_norm": 16.161104202270508, + "learning_rate": 1.0306122448979591e-05, + "loss": 0.1732, + "step": 9500 + }, + { + "epoch": 48.52040816326531, + "grad_norm": 2.8587968349456787, + "learning_rate": 1.029591836734694e-05, + "loss": 0.1839, + "step": 9510 + }, + { + "epoch": 48.57142857142857, + "grad_norm": 21.057994842529297, + "learning_rate": 1.0285714285714285e-05, + "loss": 0.1179, + "step": 9520 + }, + { + "epoch": 48.62244897959184, + "grad_norm": 3.4839565753936768, + "learning_rate": 1.0275510204081634e-05, + "loss": 0.1486, + "step": 9530 + }, + { + "epoch": 48.673469387755105, + "grad_norm": 6.173724174499512, + "learning_rate": 1.0265306122448981e-05, + "loss": 0.267, + "step": 9540 + }, + { + "epoch": 48.724489795918366, + "grad_norm": 3.51310133934021, + "learning_rate": 1.0255102040816327e-05, + "loss": 0.396, + "step": 9550 + }, + { + "epoch": 48.775510204081634, + "grad_norm": 16.72086524963379, + "learning_rate": 1.0244897959183675e-05, + "loss": 0.3982, + "step": 9560 + }, + { + "epoch": 48.826530612244895, + "grad_norm": 17.559070587158203, + "learning_rate": 1.023469387755102e-05, + "loss": 0.3259, + "step": 9570 + }, + { + "epoch": 48.87755102040816, + "grad_norm": 16.63841438293457, + "learning_rate": 1.0224489795918368e-05, + "loss": 0.2982, + "step": 9580 + }, + { + "epoch": 48.92857142857143, + "grad_norm": 0.7539604306221008, + "learning_rate": 1.0214285714285714e-05, + "loss": 0.1272, + "step": 9590 + }, + { + "epoch": 48.97959183673469, + "grad_norm": 31.266700744628906, + "learning_rate": 1.0204081632653063e-05, + "loss": 0.8043, + "step": 9600 + }, + { + "epoch": 49.0, + "eval_accuracy": 0.8953068592057761, + "eval_loss": 0.596345067024231, + "eval_runtime": 0.9981, + "eval_samples_per_second": 277.519, + "eval_steps_per_second": 35.066, + "step": 9604 + }, + { + "epoch": 49.03061224489796, + "grad_norm": 37.56051254272461, + "learning_rate": 1.0193877551020409e-05, + "loss": 0.3007, + "step": 9610 + }, + { + "epoch": 49.08163265306123, + "grad_norm": 24.77801513671875, + "learning_rate": 1.0183673469387756e-05, + "loss": 0.2598, + "step": 9620 + }, + { + "epoch": 49.13265306122449, + "grad_norm": 15.495170593261719, + "learning_rate": 1.0173469387755104e-05, + "loss": 0.3709, + "step": 9630 + }, + { + "epoch": 49.183673469387756, + "grad_norm": 32.0685920715332, + "learning_rate": 1.016326530612245e-05, + "loss": 0.4094, + "step": 9640 + }, + { + "epoch": 49.234693877551024, + "grad_norm": 14.19920539855957, + "learning_rate": 1.0153061224489797e-05, + "loss": 0.4511, + "step": 9650 + }, + { + "epoch": 49.285714285714285, + "grad_norm": 24.553438186645508, + "learning_rate": 1.0142857142857143e-05, + "loss": 0.2992, + "step": 9660 + }, + { + "epoch": 49.33673469387755, + "grad_norm": 5.018165111541748, + "learning_rate": 1.013265306122449e-05, + "loss": 0.2298, + "step": 9670 + }, + { + "epoch": 49.38775510204081, + "grad_norm": 0.694602906703949, + "learning_rate": 1.0122448979591836e-05, + "loss": 0.2764, + "step": 9680 + }, + { + "epoch": 49.43877551020408, + "grad_norm": 14.046659469604492, + "learning_rate": 1.0112244897959185e-05, + "loss": 0.204, + "step": 9690 + }, + { + "epoch": 49.48979591836735, + "grad_norm": 17.197769165039062, + "learning_rate": 1.0102040816326531e-05, + "loss": 0.392, + "step": 9700 + }, + { + "epoch": 49.54081632653061, + "grad_norm": 1.4911197423934937, + "learning_rate": 1.0091836734693879e-05, + "loss": 0.1574, + "step": 9710 + }, + { + "epoch": 49.59183673469388, + "grad_norm": 17.199138641357422, + "learning_rate": 1.0081632653061226e-05, + "loss": 0.3963, + "step": 9720 + }, + { + "epoch": 49.642857142857146, + "grad_norm": 20.017789840698242, + "learning_rate": 1.0071428571428572e-05, + "loss": 0.236, + "step": 9730 + }, + { + "epoch": 49.69387755102041, + "grad_norm": 14.910563468933105, + "learning_rate": 1.006122448979592e-05, + "loss": 0.2261, + "step": 9740 + }, + { + "epoch": 49.744897959183675, + "grad_norm": 5.493719577789307, + "learning_rate": 1.0051020408163265e-05, + "loss": 0.2894, + "step": 9750 + }, + { + "epoch": 49.795918367346935, + "grad_norm": 4.413763523101807, + "learning_rate": 1.0040816326530614e-05, + "loss": 0.193, + "step": 9760 + }, + { + "epoch": 49.8469387755102, + "grad_norm": 3.9958415031433105, + "learning_rate": 1.003061224489796e-05, + "loss": 0.3082, + "step": 9770 + }, + { + "epoch": 49.89795918367347, + "grad_norm": 1.497524619102478, + "learning_rate": 1.0020408163265308e-05, + "loss": 0.088, + "step": 9780 + }, + { + "epoch": 49.94897959183673, + "grad_norm": 17.267000198364258, + "learning_rate": 1.0010204081632653e-05, + "loss": 0.5218, + "step": 9790 + }, + { + "epoch": 50.0, + "grad_norm": 9.129292488098145, + "learning_rate": 1e-05, + "loss": 0.3742, + "step": 9800 + }, + { + "epoch": 50.0, + "eval_accuracy": 0.8989169675090253, + "eval_loss": 0.52725750207901, + "eval_runtime": 1.0103, + "eval_samples_per_second": 274.185, + "eval_steps_per_second": 34.644, + "step": 9800 + }, + { + "epoch": 50.05102040816327, + "grad_norm": 4.937376976013184, + "learning_rate": 9.989795918367348e-06, + "loss": 0.3733, + "step": 9810 + }, + { + "epoch": 50.10204081632653, + "grad_norm": 18.862049102783203, + "learning_rate": 9.979591836734694e-06, + "loss": 0.2366, + "step": 9820 + }, + { + "epoch": 50.1530612244898, + "grad_norm": 7.983892440795898, + "learning_rate": 9.969387755102042e-06, + "loss": 0.2177, + "step": 9830 + }, + { + "epoch": 50.204081632653065, + "grad_norm": 2.9340569972991943, + "learning_rate": 9.959183673469387e-06, + "loss": 0.2438, + "step": 9840 + }, + { + "epoch": 50.255102040816325, + "grad_norm": 22.085994720458984, + "learning_rate": 9.948979591836737e-06, + "loss": 0.2964, + "step": 9850 + }, + { + "epoch": 50.30612244897959, + "grad_norm": 0.7414323687553406, + "learning_rate": 9.938775510204082e-06, + "loss": 0.2736, + "step": 9860 + }, + { + "epoch": 50.357142857142854, + "grad_norm": 6.767985820770264, + "learning_rate": 9.92857142857143e-06, + "loss": 0.1981, + "step": 9870 + }, + { + "epoch": 50.40816326530612, + "grad_norm": 1.7000843286514282, + "learning_rate": 9.918367346938776e-06, + "loss": 0.126, + "step": 9880 + }, + { + "epoch": 50.45918367346939, + "grad_norm": 5.059332370758057, + "learning_rate": 9.908163265306123e-06, + "loss": 0.3383, + "step": 9890 + }, + { + "epoch": 50.51020408163265, + "grad_norm": 14.707350730895996, + "learning_rate": 9.89795918367347e-06, + "loss": 0.2567, + "step": 9900 + }, + { + "epoch": 50.56122448979592, + "grad_norm": 46.81512451171875, + "learning_rate": 9.887755102040816e-06, + "loss": 0.5197, + "step": 9910 + }, + { + "epoch": 50.61224489795919, + "grad_norm": 20.73604393005371, + "learning_rate": 9.877551020408164e-06, + "loss": 0.2843, + "step": 9920 + }, + { + "epoch": 50.66326530612245, + "grad_norm": 19.806276321411133, + "learning_rate": 9.867346938775511e-06, + "loss": 0.2171, + "step": 9930 + }, + { + "epoch": 50.714285714285715, + "grad_norm": 3.416189193725586, + "learning_rate": 9.857142857142859e-06, + "loss": 0.2057, + "step": 9940 + }, + { + "epoch": 50.765306122448976, + "grad_norm": 24.192386627197266, + "learning_rate": 9.846938775510205e-06, + "loss": 0.3665, + "step": 9950 + }, + { + "epoch": 50.816326530612244, + "grad_norm": 16.198684692382812, + "learning_rate": 9.836734693877552e-06, + "loss": 0.3972, + "step": 9960 + }, + { + "epoch": 50.86734693877551, + "grad_norm": 29.34848403930664, + "learning_rate": 9.8265306122449e-06, + "loss": 0.2339, + "step": 9970 + }, + { + "epoch": 50.91836734693877, + "grad_norm": 2.502192735671997, + "learning_rate": 9.816326530612245e-06, + "loss": 0.4402, + "step": 9980 + }, + { + "epoch": 50.96938775510204, + "grad_norm": 17.41951560974121, + "learning_rate": 9.806122448979593e-06, + "loss": 0.1026, + "step": 9990 + }, + { + "epoch": 51.0, + "eval_accuracy": 0.8989169675090253, + "eval_loss": 0.39987990260124207, + "eval_runtime": 1.0064, + "eval_samples_per_second": 275.235, + "eval_steps_per_second": 34.777, + "step": 9996 + }, + { + "epoch": 51.02040816326531, + "grad_norm": 24.76848602294922, + "learning_rate": 9.795918367346939e-06, + "loss": 0.2763, + "step": 10000 + }, + { + "epoch": 51.07142857142857, + "grad_norm": 23.847902297973633, + "learning_rate": 9.785714285714286e-06, + "loss": 0.201, + "step": 10010 + }, + { + "epoch": 51.12244897959184, + "grad_norm": 42.847190856933594, + "learning_rate": 9.775510204081634e-06, + "loss": 0.4703, + "step": 10020 + }, + { + "epoch": 51.173469387755105, + "grad_norm": 30.936927795410156, + "learning_rate": 9.765306122448981e-06, + "loss": 0.1511, + "step": 10030 + }, + { + "epoch": 51.224489795918366, + "grad_norm": 4.2489142417907715, + "learning_rate": 9.755102040816327e-06, + "loss": 0.3169, + "step": 10040 + }, + { + "epoch": 51.275510204081634, + "grad_norm": 2.965418577194214, + "learning_rate": 9.744897959183674e-06, + "loss": 0.0974, + "step": 10050 + }, + { + "epoch": 51.326530612244895, + "grad_norm": 32.12446975708008, + "learning_rate": 9.734693877551022e-06, + "loss": 0.5556, + "step": 10060 + }, + { + "epoch": 51.37755102040816, + "grad_norm": 10.805654525756836, + "learning_rate": 9.724489795918368e-06, + "loss": 0.3138, + "step": 10070 + }, + { + "epoch": 51.42857142857143, + "grad_norm": 1.1343532800674438, + "learning_rate": 9.714285714285715e-06, + "loss": 0.1618, + "step": 10080 + }, + { + "epoch": 51.47959183673469, + "grad_norm": 5.84451961517334, + "learning_rate": 9.704081632653061e-06, + "loss": 0.1764, + "step": 10090 + }, + { + "epoch": 51.53061224489796, + "grad_norm": 41.4073600769043, + "learning_rate": 9.693877551020408e-06, + "loss": 0.5317, + "step": 10100 + }, + { + "epoch": 51.58163265306123, + "grad_norm": 3.5908291339874268, + "learning_rate": 9.683673469387756e-06, + "loss": 0.1604, + "step": 10110 + }, + { + "epoch": 51.63265306122449, + "grad_norm": 37.648773193359375, + "learning_rate": 9.673469387755103e-06, + "loss": 0.3668, + "step": 10120 + }, + { + "epoch": 51.683673469387756, + "grad_norm": 0.970947802066803, + "learning_rate": 9.663265306122451e-06, + "loss": 0.0427, + "step": 10130 + }, + { + "epoch": 51.734693877551024, + "grad_norm": 25.296140670776367, + "learning_rate": 9.653061224489797e-06, + "loss": 0.1435, + "step": 10140 + }, + { + "epoch": 51.785714285714285, + "grad_norm": 1.5384756326675415, + "learning_rate": 9.642857142857144e-06, + "loss": 0.1863, + "step": 10150 + }, + { + "epoch": 51.83673469387755, + "grad_norm": 20.70154571533203, + "learning_rate": 9.63265306122449e-06, + "loss": 0.2675, + "step": 10160 + }, + { + "epoch": 51.88775510204081, + "grad_norm": 1.2115554809570312, + "learning_rate": 9.622448979591837e-06, + "loss": 0.2392, + "step": 10170 + }, + { + "epoch": 51.93877551020408, + "grad_norm": 17.742774963378906, + "learning_rate": 9.612244897959185e-06, + "loss": 0.564, + "step": 10180 + }, + { + "epoch": 51.98979591836735, + "grad_norm": 17.226428985595703, + "learning_rate": 9.60204081632653e-06, + "loss": 0.2357, + "step": 10190 + }, + { + "epoch": 52.0, + "eval_accuracy": 0.8592057761732852, + "eval_loss": 0.47236067056655884, + "eval_runtime": 1.0035, + "eval_samples_per_second": 276.023, + "eval_steps_per_second": 34.876, + "step": 10192 + }, + { + "epoch": 52.04081632653061, + "grad_norm": 5.835048198699951, + "learning_rate": 9.591836734693878e-06, + "loss": 0.1891, + "step": 10200 + }, + { + "epoch": 52.09183673469388, + "grad_norm": 1.156166434288025, + "learning_rate": 9.581632653061226e-06, + "loss": 0.4142, + "step": 10210 + }, + { + "epoch": 52.142857142857146, + "grad_norm": 15.46737003326416, + "learning_rate": 9.571428571428573e-06, + "loss": 0.322, + "step": 10220 + }, + { + "epoch": 52.19387755102041, + "grad_norm": 24.277389526367188, + "learning_rate": 9.561224489795919e-06, + "loss": 0.3908, + "step": 10230 + }, + { + "epoch": 52.244897959183675, + "grad_norm": 11.838648796081543, + "learning_rate": 9.551020408163266e-06, + "loss": 0.1888, + "step": 10240 + }, + { + "epoch": 52.295918367346935, + "grad_norm": 21.2302303314209, + "learning_rate": 9.540816326530612e-06, + "loss": 0.1687, + "step": 10250 + }, + { + "epoch": 52.3469387755102, + "grad_norm": 5.784067630767822, + "learning_rate": 9.53061224489796e-06, + "loss": 0.1677, + "step": 10260 + }, + { + "epoch": 52.39795918367347, + "grad_norm": 4.069281101226807, + "learning_rate": 9.520408163265307e-06, + "loss": 0.1899, + "step": 10270 + }, + { + "epoch": 52.44897959183673, + "grad_norm": 39.54216766357422, + "learning_rate": 9.510204081632653e-06, + "loss": 0.4441, + "step": 10280 + }, + { + "epoch": 52.5, + "grad_norm": 7.6494140625, + "learning_rate": 9.5e-06, + "loss": 0.2656, + "step": 10290 + }, + { + "epoch": 52.55102040816327, + "grad_norm": 7.464126110076904, + "learning_rate": 9.489795918367348e-06, + "loss": 0.1958, + "step": 10300 + }, + { + "epoch": 52.60204081632653, + "grad_norm": 35.30615234375, + "learning_rate": 9.479591836734695e-06, + "loss": 0.2901, + "step": 10310 + }, + { + "epoch": 52.6530612244898, + "grad_norm": 14.667298316955566, + "learning_rate": 9.469387755102041e-06, + "loss": 0.3089, + "step": 10320 + }, + { + "epoch": 52.704081632653065, + "grad_norm": 5.512417793273926, + "learning_rate": 9.459183673469389e-06, + "loss": 0.23, + "step": 10330 + }, + { + "epoch": 52.755102040816325, + "grad_norm": 26.751949310302734, + "learning_rate": 9.448979591836736e-06, + "loss": 0.2886, + "step": 10340 + }, + { + "epoch": 52.80612244897959, + "grad_norm": 3.309727430343628, + "learning_rate": 9.438775510204082e-06, + "loss": 0.2335, + "step": 10350 + }, + { + "epoch": 52.857142857142854, + "grad_norm": 28.716583251953125, + "learning_rate": 9.42857142857143e-06, + "loss": 0.345, + "step": 10360 + }, + { + "epoch": 52.90816326530612, + "grad_norm": 8.494994163513184, + "learning_rate": 9.418367346938775e-06, + "loss": 0.289, + "step": 10370 + }, + { + "epoch": 52.95918367346939, + "grad_norm": 1.7165334224700928, + "learning_rate": 9.408163265306123e-06, + "loss": 0.2612, + "step": 10380 + }, + { + "epoch": 53.0, + "eval_accuracy": 0.8844765342960289, + "eval_loss": 0.41692036390304565, + "eval_runtime": 1.0022, + "eval_samples_per_second": 276.384, + "eval_steps_per_second": 34.922, + "step": 10388 + }, + { + "epoch": 53.01020408163265, + "grad_norm": 3.0499188899993896, + "learning_rate": 9.39795918367347e-06, + "loss": 0.5454, + "step": 10390 + }, + { + "epoch": 53.06122448979592, + "grad_norm": 21.596120834350586, + "learning_rate": 9.387755102040818e-06, + "loss": 0.3184, + "step": 10400 + }, + { + "epoch": 53.11224489795919, + "grad_norm": 6.21745491027832, + "learning_rate": 9.377551020408164e-06, + "loss": 0.3075, + "step": 10410 + }, + { + "epoch": 53.16326530612245, + "grad_norm": 21.998977661132812, + "learning_rate": 9.367346938775511e-06, + "loss": 0.2669, + "step": 10420 + }, + { + "epoch": 53.214285714285715, + "grad_norm": 16.327312469482422, + "learning_rate": 9.357142857142859e-06, + "loss": 0.1578, + "step": 10430 + }, + { + "epoch": 53.265306122448976, + "grad_norm": 28.928312301635742, + "learning_rate": 9.346938775510204e-06, + "loss": 0.3822, + "step": 10440 + }, + { + "epoch": 53.316326530612244, + "grad_norm": 16.499805450439453, + "learning_rate": 9.336734693877552e-06, + "loss": 0.2767, + "step": 10450 + }, + { + "epoch": 53.36734693877551, + "grad_norm": 2.9976630210876465, + "learning_rate": 9.326530612244898e-06, + "loss": 0.2205, + "step": 10460 + }, + { + "epoch": 53.41836734693877, + "grad_norm": 13.97140884399414, + "learning_rate": 9.316326530612245e-06, + "loss": 0.1652, + "step": 10470 + }, + { + "epoch": 53.46938775510204, + "grad_norm": 14.757030487060547, + "learning_rate": 9.306122448979593e-06, + "loss": 0.3727, + "step": 10480 + }, + { + "epoch": 53.52040816326531, + "grad_norm": 10.051803588867188, + "learning_rate": 9.29591836734694e-06, + "loss": 0.2692, + "step": 10490 + }, + { + "epoch": 53.57142857142857, + "grad_norm": 2.555940628051758, + "learning_rate": 9.285714285714288e-06, + "loss": 0.0995, + "step": 10500 + }, + { + "epoch": 53.62244897959184, + "grad_norm": 14.824005126953125, + "learning_rate": 9.275510204081633e-06, + "loss": 0.3368, + "step": 10510 + }, + { + "epoch": 53.673469387755105, + "grad_norm": 8.637600898742676, + "learning_rate": 9.26530612244898e-06, + "loss": 0.1681, + "step": 10520 + }, + { + "epoch": 53.724489795918366, + "grad_norm": 28.2470645904541, + "learning_rate": 9.255102040816327e-06, + "loss": 0.2876, + "step": 10530 + }, + { + "epoch": 53.775510204081634, + "grad_norm": 12.620148658752441, + "learning_rate": 9.244897959183674e-06, + "loss": 0.2447, + "step": 10540 + }, + { + "epoch": 53.826530612244895, + "grad_norm": 4.907227993011475, + "learning_rate": 9.234693877551022e-06, + "loss": 0.1519, + "step": 10550 + }, + { + "epoch": 53.87755102040816, + "grad_norm": 1.079929232597351, + "learning_rate": 9.224489795918367e-06, + "loss": 0.2137, + "step": 10560 + }, + { + "epoch": 53.92857142857143, + "grad_norm": 12.417989730834961, + "learning_rate": 9.214285714285715e-06, + "loss": 0.0916, + "step": 10570 + }, + { + "epoch": 53.97959183673469, + "grad_norm": 28.29491424560547, + "learning_rate": 9.204081632653062e-06, + "loss": 0.4747, + "step": 10580 + }, + { + "epoch": 54.0, + "eval_accuracy": 0.8916967509025271, + "eval_loss": 0.39725634455680847, + "eval_runtime": 0.9985, + "eval_samples_per_second": 277.405, + "eval_steps_per_second": 35.051, + "step": 10584 + }, + { + "epoch": 54.03061224489796, + "grad_norm": 24.29307746887207, + "learning_rate": 9.19387755102041e-06, + "loss": 0.263, + "step": 10590 + }, + { + "epoch": 54.08163265306123, + "grad_norm": 38.277374267578125, + "learning_rate": 9.183673469387756e-06, + "loss": 0.1799, + "step": 10600 + }, + { + "epoch": 54.13265306122449, + "grad_norm": 13.471637725830078, + "learning_rate": 9.173469387755103e-06, + "loss": 0.2766, + "step": 10610 + }, + { + "epoch": 54.183673469387756, + "grad_norm": 15.754558563232422, + "learning_rate": 9.163265306122449e-06, + "loss": 0.1801, + "step": 10620 + }, + { + "epoch": 54.234693877551024, + "grad_norm": 1.2889409065246582, + "learning_rate": 9.153061224489796e-06, + "loss": 0.1874, + "step": 10630 + }, + { + "epoch": 54.285714285714285, + "grad_norm": 1.7550626993179321, + "learning_rate": 9.142857142857144e-06, + "loss": 0.1494, + "step": 10640 + }, + { + "epoch": 54.33673469387755, + "grad_norm": 5.265414237976074, + "learning_rate": 9.13265306122449e-06, + "loss": 0.4031, + "step": 10650 + }, + { + "epoch": 54.38775510204081, + "grad_norm": 30.67131233215332, + "learning_rate": 9.122448979591837e-06, + "loss": 0.2256, + "step": 10660 + }, + { + "epoch": 54.43877551020408, + "grad_norm": 17.57633399963379, + "learning_rate": 9.112244897959185e-06, + "loss": 0.201, + "step": 10670 + }, + { + "epoch": 54.48979591836735, + "grad_norm": 1.6129862070083618, + "learning_rate": 9.102040816326532e-06, + "loss": 0.1325, + "step": 10680 + }, + { + "epoch": 54.54081632653061, + "grad_norm": 26.21156120300293, + "learning_rate": 9.091836734693878e-06, + "loss": 0.2999, + "step": 10690 + }, + { + "epoch": 54.59183673469388, + "grad_norm": 40.24671173095703, + "learning_rate": 9.081632653061225e-06, + "loss": 0.2998, + "step": 10700 + }, + { + "epoch": 54.642857142857146, + "grad_norm": 16.38653564453125, + "learning_rate": 9.071428571428573e-06, + "loss": 0.2825, + "step": 10710 + }, + { + "epoch": 54.69387755102041, + "grad_norm": 10.715635299682617, + "learning_rate": 9.061224489795919e-06, + "loss": 0.3416, + "step": 10720 + }, + { + "epoch": 54.744897959183675, + "grad_norm": 17.47795867919922, + "learning_rate": 9.051020408163266e-06, + "loss": 0.1243, + "step": 10730 + }, + { + "epoch": 54.795918367346935, + "grad_norm": 22.199556350708008, + "learning_rate": 9.040816326530612e-06, + "loss": 0.2263, + "step": 10740 + }, + { + "epoch": 54.8469387755102, + "grad_norm": 9.025761604309082, + "learning_rate": 9.03061224489796e-06, + "loss": 0.2307, + "step": 10750 + }, + { + "epoch": 54.89795918367347, + "grad_norm": 13.868419647216797, + "learning_rate": 9.020408163265307e-06, + "loss": 0.1659, + "step": 10760 + }, + { + "epoch": 54.94897959183673, + "grad_norm": 15.956478118896484, + "learning_rate": 9.010204081632654e-06, + "loss": 0.2213, + "step": 10770 + }, + { + "epoch": 55.0, + "grad_norm": 45.23737335205078, + "learning_rate": 9e-06, + "loss": 0.4943, + "step": 10780 + }, + { + "epoch": 55.0, + "eval_accuracy": 0.9061371841155235, + "eval_loss": 0.5155691504478455, + "eval_runtime": 0.9983, + "eval_samples_per_second": 277.469, + "eval_steps_per_second": 35.059, + "step": 10780 + }, + { + "epoch": 55.05102040816327, + "grad_norm": 15.772557258605957, + "learning_rate": 8.989795918367348e-06, + "loss": 0.2231, + "step": 10790 + }, + { + "epoch": 55.10204081632653, + "grad_norm": 19.207889556884766, + "learning_rate": 8.979591836734695e-06, + "loss": 0.2928, + "step": 10800 + }, + { + "epoch": 55.1530612244898, + "grad_norm": 35.7724723815918, + "learning_rate": 8.969387755102041e-06, + "loss": 0.4151, + "step": 10810 + }, + { + "epoch": 55.204081632653065, + "grad_norm": 33.47428894042969, + "learning_rate": 8.959183673469388e-06, + "loss": 0.5266, + "step": 10820 + }, + { + "epoch": 55.255102040816325, + "grad_norm": 2.5040481090545654, + "learning_rate": 8.948979591836734e-06, + "loss": 0.1873, + "step": 10830 + }, + { + "epoch": 55.30612244897959, + "grad_norm": 25.5793514251709, + "learning_rate": 8.938775510204082e-06, + "loss": 0.1669, + "step": 10840 + }, + { + "epoch": 55.357142857142854, + "grad_norm": 4.252367973327637, + "learning_rate": 8.92857142857143e-06, + "loss": 0.1926, + "step": 10850 + }, + { + "epoch": 55.40816326530612, + "grad_norm": 0.292059987783432, + "learning_rate": 8.918367346938777e-06, + "loss": 0.3432, + "step": 10860 + }, + { + "epoch": 55.45918367346939, + "grad_norm": 24.86861801147461, + "learning_rate": 8.908163265306124e-06, + "loss": 0.2959, + "step": 10870 + }, + { + "epoch": 55.51020408163265, + "grad_norm": 32.263946533203125, + "learning_rate": 8.89795918367347e-06, + "loss": 0.2763, + "step": 10880 + }, + { + "epoch": 55.56122448979592, + "grad_norm": 24.175312042236328, + "learning_rate": 8.887755102040817e-06, + "loss": 0.4425, + "step": 10890 + }, + { + "epoch": 55.61224489795919, + "grad_norm": 7.99574613571167, + "learning_rate": 8.877551020408163e-06, + "loss": 0.106, + "step": 10900 + }, + { + "epoch": 55.66326530612245, + "grad_norm": 2.605454921722412, + "learning_rate": 8.86734693877551e-06, + "loss": 0.1939, + "step": 10910 + }, + { + "epoch": 55.714285714285715, + "grad_norm": 32.04717254638672, + "learning_rate": 8.857142857142858e-06, + "loss": 0.395, + "step": 10920 + }, + { + "epoch": 55.765306122448976, + "grad_norm": 2.6572344303131104, + "learning_rate": 8.846938775510204e-06, + "loss": 0.1562, + "step": 10930 + }, + { + "epoch": 55.816326530612244, + "grad_norm": 9.090982437133789, + "learning_rate": 8.836734693877552e-06, + "loss": 0.1864, + "step": 10940 + }, + { + "epoch": 55.86734693877551, + "grad_norm": 2.101627826690674, + "learning_rate": 8.826530612244899e-06, + "loss": 0.3608, + "step": 10950 + }, + { + "epoch": 55.91836734693877, + "grad_norm": 18.739809036254883, + "learning_rate": 8.816326530612247e-06, + "loss": 0.1969, + "step": 10960 + }, + { + "epoch": 55.96938775510204, + "grad_norm": 21.086181640625, + "learning_rate": 8.806122448979592e-06, + "loss": 0.2296, + "step": 10970 + }, + { + "epoch": 56.0, + "eval_accuracy": 0.8916967509025271, + "eval_loss": 0.6397488117218018, + "eval_runtime": 1.0387, + "eval_samples_per_second": 266.671, + "eval_steps_per_second": 33.695, + "step": 10976 + }, + { + "epoch": 56.02040816326531, + "grad_norm": 24.158361434936523, + "learning_rate": 8.79591836734694e-06, + "loss": 0.3055, + "step": 10980 + }, + { + "epoch": 56.07142857142857, + "grad_norm": 9.925491333007812, + "learning_rate": 8.785714285714286e-06, + "loss": 0.1268, + "step": 10990 + }, + { + "epoch": 56.12244897959184, + "grad_norm": 13.758755683898926, + "learning_rate": 8.775510204081633e-06, + "loss": 0.6231, + "step": 11000 + }, + { + "epoch": 56.173469387755105, + "grad_norm": 40.79158020019531, + "learning_rate": 8.76530612244898e-06, + "loss": 0.2624, + "step": 11010 + }, + { + "epoch": 56.224489795918366, + "grad_norm": 7.537052631378174, + "learning_rate": 8.755102040816326e-06, + "loss": 0.1936, + "step": 11020 + }, + { + "epoch": 56.275510204081634, + "grad_norm": 4.473420143127441, + "learning_rate": 8.744897959183676e-06, + "loss": 0.224, + "step": 11030 + }, + { + "epoch": 56.326530612244895, + "grad_norm": 11.621440887451172, + "learning_rate": 8.734693877551021e-06, + "loss": 0.2848, + "step": 11040 + }, + { + "epoch": 56.37755102040816, + "grad_norm": 46.6587028503418, + "learning_rate": 8.724489795918369e-06, + "loss": 0.5454, + "step": 11050 + }, + { + "epoch": 56.42857142857143, + "grad_norm": 8.200979232788086, + "learning_rate": 8.714285714285715e-06, + "loss": 0.1789, + "step": 11060 + }, + { + "epoch": 56.47959183673469, + "grad_norm": 28.611772537231445, + "learning_rate": 8.704081632653062e-06, + "loss": 0.4482, + "step": 11070 + }, + { + "epoch": 56.53061224489796, + "grad_norm": 7.166450500488281, + "learning_rate": 8.69387755102041e-06, + "loss": 0.4748, + "step": 11080 + }, + { + "epoch": 56.58163265306123, + "grad_norm": 4.096757411956787, + "learning_rate": 8.683673469387755e-06, + "loss": 0.1543, + "step": 11090 + }, + { + "epoch": 56.63265306122449, + "grad_norm": 41.63648223876953, + "learning_rate": 8.673469387755103e-06, + "loss": 0.2418, + "step": 11100 + }, + { + "epoch": 56.683673469387756, + "grad_norm": 12.663362503051758, + "learning_rate": 8.663265306122449e-06, + "loss": 0.4144, + "step": 11110 + }, + { + "epoch": 56.734693877551024, + "grad_norm": 7.94280481338501, + "learning_rate": 8.653061224489798e-06, + "loss": 0.1369, + "step": 11120 + }, + { + "epoch": 56.785714285714285, + "grad_norm": 10.940966606140137, + "learning_rate": 8.642857142857144e-06, + "loss": 0.4186, + "step": 11130 + }, + { + "epoch": 56.83673469387755, + "grad_norm": 0.8230370283126831, + "learning_rate": 8.632653061224491e-06, + "loss": 0.1589, + "step": 11140 + }, + { + "epoch": 56.88775510204081, + "grad_norm": 24.352340698242188, + "learning_rate": 8.622448979591837e-06, + "loss": 0.2616, + "step": 11150 + }, + { + "epoch": 56.93877551020408, + "grad_norm": 36.927154541015625, + "learning_rate": 8.612244897959184e-06, + "loss": 0.4105, + "step": 11160 + }, + { + "epoch": 56.98979591836735, + "grad_norm": 2.7754557132720947, + "learning_rate": 8.602040816326532e-06, + "loss": 0.1789, + "step": 11170 + }, + { + "epoch": 57.0, + "eval_accuracy": 0.8267148014440433, + "eval_loss": 0.509782612323761, + "eval_runtime": 1.0649, + "eval_samples_per_second": 260.109, + "eval_steps_per_second": 32.866, + "step": 11172 + }, + { + "epoch": 57.04081632653061, + "grad_norm": 25.533119201660156, + "learning_rate": 8.591836734693878e-06, + "loss": 0.2659, + "step": 11180 + }, + { + "epoch": 57.09183673469388, + "grad_norm": 12.635940551757812, + "learning_rate": 8.581632653061225e-06, + "loss": 0.1523, + "step": 11190 + }, + { + "epoch": 57.142857142857146, + "grad_norm": 4.289495468139648, + "learning_rate": 8.571428571428571e-06, + "loss": 0.2356, + "step": 11200 + }, + { + "epoch": 57.19387755102041, + "grad_norm": 24.788164138793945, + "learning_rate": 8.56122448979592e-06, + "loss": 0.3661, + "step": 11210 + }, + { + "epoch": 57.244897959183675, + "grad_norm": 31.052841186523438, + "learning_rate": 8.551020408163266e-06, + "loss": 0.3714, + "step": 11220 + }, + { + "epoch": 57.295918367346935, + "grad_norm": 39.57874298095703, + "learning_rate": 8.540816326530613e-06, + "loss": 0.5319, + "step": 11230 + }, + { + "epoch": 57.3469387755102, + "grad_norm": 1.093644380569458, + "learning_rate": 8.530612244897961e-06, + "loss": 0.0904, + "step": 11240 + }, + { + "epoch": 57.39795918367347, + "grad_norm": 1.086511254310608, + "learning_rate": 8.520408163265307e-06, + "loss": 0.1965, + "step": 11250 + }, + { + "epoch": 57.44897959183673, + "grad_norm": 7.9456987380981445, + "learning_rate": 8.510204081632654e-06, + "loss": 0.1848, + "step": 11260 + }, + { + "epoch": 57.5, + "grad_norm": 20.959230422973633, + "learning_rate": 8.5e-06, + "loss": 0.1453, + "step": 11270 + }, + { + "epoch": 57.55102040816327, + "grad_norm": 23.841773986816406, + "learning_rate": 8.489795918367347e-06, + "loss": 0.2055, + "step": 11280 + }, + { + "epoch": 57.60204081632653, + "grad_norm": 35.155029296875, + "learning_rate": 8.479591836734695e-06, + "loss": 0.1952, + "step": 11290 + }, + { + "epoch": 57.6530612244898, + "grad_norm": 9.69937515258789, + "learning_rate": 8.469387755102042e-06, + "loss": 0.164, + "step": 11300 + }, + { + "epoch": 57.704081632653065, + "grad_norm": 34.393470764160156, + "learning_rate": 8.459183673469388e-06, + "loss": 0.3778, + "step": 11310 + }, + { + "epoch": 57.755102040816325, + "grad_norm": 1.3232866525650024, + "learning_rate": 8.448979591836736e-06, + "loss": 0.0839, + "step": 11320 + }, + { + "epoch": 57.80612244897959, + "grad_norm": 17.029067993164062, + "learning_rate": 8.438775510204083e-06, + "loss": 0.2575, + "step": 11330 + }, + { + "epoch": 57.857142857142854, + "grad_norm": 6.47840690612793, + "learning_rate": 8.428571428571429e-06, + "loss": 0.2839, + "step": 11340 + }, + { + "epoch": 57.90816326530612, + "grad_norm": 37.308738708496094, + "learning_rate": 8.418367346938776e-06, + "loss": 0.3327, + "step": 11350 + }, + { + "epoch": 57.95918367346939, + "grad_norm": 0.8903458714485168, + "learning_rate": 8.408163265306122e-06, + "loss": 0.4355, + "step": 11360 + }, + { + "epoch": 58.0, + "eval_accuracy": 0.8916967509025271, + "eval_loss": 0.5032309889793396, + "eval_runtime": 0.9938, + "eval_samples_per_second": 278.74, + "eval_steps_per_second": 35.22, + "step": 11368 + }, + { + "epoch": 58.01020408163265, + "grad_norm": 22.674251556396484, + "learning_rate": 8.39795918367347e-06, + "loss": 0.406, + "step": 11370 + }, + { + "epoch": 58.06122448979592, + "grad_norm": 24.471176147460938, + "learning_rate": 8.387755102040817e-06, + "loss": 0.2035, + "step": 11380 + }, + { + "epoch": 58.11224489795919, + "grad_norm": 14.768692016601562, + "learning_rate": 8.377551020408165e-06, + "loss": 0.2689, + "step": 11390 + }, + { + "epoch": 58.16326530612245, + "grad_norm": 29.382850646972656, + "learning_rate": 8.36734693877551e-06, + "loss": 0.2638, + "step": 11400 + }, + { + "epoch": 58.214285714285715, + "grad_norm": 31.610193252563477, + "learning_rate": 8.357142857142858e-06, + "loss": 0.2101, + "step": 11410 + }, + { + "epoch": 58.265306122448976, + "grad_norm": 24.501243591308594, + "learning_rate": 8.346938775510205e-06, + "loss": 0.3884, + "step": 11420 + }, + { + "epoch": 58.316326530612244, + "grad_norm": 18.3912410736084, + "learning_rate": 8.336734693877551e-06, + "loss": 0.3061, + "step": 11430 + }, + { + "epoch": 58.36734693877551, + "grad_norm": 2.261777639389038, + "learning_rate": 8.326530612244899e-06, + "loss": 0.1306, + "step": 11440 + }, + { + "epoch": 58.41836734693877, + "grad_norm": 7.096632957458496, + "learning_rate": 8.316326530612246e-06, + "loss": 0.4471, + "step": 11450 + }, + { + "epoch": 58.46938775510204, + "grad_norm": 30.137601852416992, + "learning_rate": 8.306122448979592e-06, + "loss": 0.2175, + "step": 11460 + }, + { + "epoch": 58.52040816326531, + "grad_norm": 34.18726348876953, + "learning_rate": 8.29591836734694e-06, + "loss": 0.4566, + "step": 11470 + }, + { + "epoch": 58.57142857142857, + "grad_norm": 39.821434020996094, + "learning_rate": 8.285714285714287e-06, + "loss": 0.2599, + "step": 11480 + }, + { + "epoch": 58.62244897959184, + "grad_norm": 19.520374298095703, + "learning_rate": 8.275510204081634e-06, + "loss": 0.3169, + "step": 11490 + }, + { + "epoch": 58.673469387755105, + "grad_norm": 3.8211231231689453, + "learning_rate": 8.26530612244898e-06, + "loss": 0.0948, + "step": 11500 + }, + { + "epoch": 58.724489795918366, + "grad_norm": 3.1806092262268066, + "learning_rate": 8.255102040816328e-06, + "loss": 0.1415, + "step": 11510 + }, + { + "epoch": 58.775510204081634, + "grad_norm": 7.679635047912598, + "learning_rate": 8.244897959183674e-06, + "loss": 0.1717, + "step": 11520 + }, + { + "epoch": 58.826530612244895, + "grad_norm": 24.31257438659668, + "learning_rate": 8.234693877551021e-06, + "loss": 0.5436, + "step": 11530 + }, + { + "epoch": 58.87755102040816, + "grad_norm": 2.4208831787109375, + "learning_rate": 8.224489795918369e-06, + "loss": 0.2644, + "step": 11540 + }, + { + "epoch": 58.92857142857143, + "grad_norm": 5.644859790802002, + "learning_rate": 8.214285714285714e-06, + "loss": 0.258, + "step": 11550 + }, + { + "epoch": 58.97959183673469, + "grad_norm": 25.775012969970703, + "learning_rate": 8.204081632653062e-06, + "loss": 0.3957, + "step": 11560 + }, + { + "epoch": 59.0, + "eval_accuracy": 0.9025270758122743, + "eval_loss": 0.42054516077041626, + "eval_runtime": 0.9995, + "eval_samples_per_second": 277.148, + "eval_steps_per_second": 35.019, + "step": 11564 + }, + { + "epoch": 59.03061224489796, + "grad_norm": 0.5135757923126221, + "learning_rate": 8.19387755102041e-06, + "loss": 0.1095, + "step": 11570 + }, + { + "epoch": 59.08163265306123, + "grad_norm": 5.598399639129639, + "learning_rate": 8.183673469387757e-06, + "loss": 0.1958, + "step": 11580 + }, + { + "epoch": 59.13265306122449, + "grad_norm": 0.44218021631240845, + "learning_rate": 8.173469387755103e-06, + "loss": 0.1147, + "step": 11590 + }, + { + "epoch": 59.183673469387756, + "grad_norm": 1.5531724691390991, + "learning_rate": 8.16326530612245e-06, + "loss": 0.1471, + "step": 11600 + }, + { + "epoch": 59.234693877551024, + "grad_norm": 37.34202575683594, + "learning_rate": 8.153061224489796e-06, + "loss": 0.1829, + "step": 11610 + }, + { + "epoch": 59.285714285714285, + "grad_norm": 15.249494552612305, + "learning_rate": 8.142857142857143e-06, + "loss": 0.5386, + "step": 11620 + }, + { + "epoch": 59.33673469387755, + "grad_norm": 5.966726303100586, + "learning_rate": 8.13265306122449e-06, + "loss": 0.2848, + "step": 11630 + }, + { + "epoch": 59.38775510204081, + "grad_norm": 1.9381986856460571, + "learning_rate": 8.122448979591837e-06, + "loss": 0.203, + "step": 11640 + }, + { + "epoch": 59.43877551020408, + "grad_norm": 7.639269828796387, + "learning_rate": 8.112244897959184e-06, + "loss": 0.2015, + "step": 11650 + }, + { + "epoch": 59.48979591836735, + "grad_norm": 3.7811472415924072, + "learning_rate": 8.102040816326532e-06, + "loss": 0.2437, + "step": 11660 + }, + { + "epoch": 59.54081632653061, + "grad_norm": 11.729641914367676, + "learning_rate": 8.091836734693879e-06, + "loss": 0.2114, + "step": 11670 + }, + { + "epoch": 59.59183673469388, + "grad_norm": 1.8369371891021729, + "learning_rate": 8.081632653061225e-06, + "loss": 0.2237, + "step": 11680 + }, + { + "epoch": 59.642857142857146, + "grad_norm": 32.86522674560547, + "learning_rate": 8.071428571428572e-06, + "loss": 0.3557, + "step": 11690 + }, + { + "epoch": 59.69387755102041, + "grad_norm": 1.281840205192566, + "learning_rate": 8.06122448979592e-06, + "loss": 0.3151, + "step": 11700 + }, + { + "epoch": 59.744897959183675, + "grad_norm": 3.163912773132324, + "learning_rate": 8.051020408163266e-06, + "loss": 0.2964, + "step": 11710 + }, + { + "epoch": 59.795918367346935, + "grad_norm": 38.169742584228516, + "learning_rate": 8.040816326530613e-06, + "loss": 0.2797, + "step": 11720 + }, + { + "epoch": 59.8469387755102, + "grad_norm": 0.4326093792915344, + "learning_rate": 8.030612244897959e-06, + "loss": 0.5198, + "step": 11730 + }, + { + "epoch": 59.89795918367347, + "grad_norm": 3.2172343730926514, + "learning_rate": 8.020408163265306e-06, + "loss": 0.2254, + "step": 11740 + }, + { + "epoch": 59.94897959183673, + "grad_norm": 1.0696576833724976, + "learning_rate": 8.010204081632654e-06, + "loss": 0.2583, + "step": 11750 + }, + { + "epoch": 60.0, + "grad_norm": 33.77302169799805, + "learning_rate": 8.000000000000001e-06, + "loss": 0.4806, + "step": 11760 + }, + { + "epoch": 60.0, + "eval_accuracy": 0.8916967509025271, + "eval_loss": 0.7010745406150818, + "eval_runtime": 0.9983, + "eval_samples_per_second": 277.47, + "eval_steps_per_second": 35.059, + "step": 11760 + }, + { + "epoch": 60.05102040816327, + "grad_norm": 21.017393112182617, + "learning_rate": 7.989795918367347e-06, + "loss": 0.102, + "step": 11770 + }, + { + "epoch": 60.10204081632653, + "grad_norm": 22.196216583251953, + "learning_rate": 7.979591836734695e-06, + "loss": 0.278, + "step": 11780 + }, + { + "epoch": 60.1530612244898, + "grad_norm": 16.994935989379883, + "learning_rate": 7.969387755102042e-06, + "loss": 0.3013, + "step": 11790 + }, + { + "epoch": 60.204081632653065, + "grad_norm": 16.680845260620117, + "learning_rate": 7.959183673469388e-06, + "loss": 0.1818, + "step": 11800 + }, + { + "epoch": 60.255102040816325, + "grad_norm": 45.07088088989258, + "learning_rate": 7.948979591836735e-06, + "loss": 0.3685, + "step": 11810 + }, + { + "epoch": 60.30612244897959, + "grad_norm": 27.242115020751953, + "learning_rate": 7.938775510204081e-06, + "loss": 0.3369, + "step": 11820 + }, + { + "epoch": 60.357142857142854, + "grad_norm": 8.068239212036133, + "learning_rate": 7.928571428571429e-06, + "loss": 0.5068, + "step": 11830 + }, + { + "epoch": 60.40816326530612, + "grad_norm": 4.541833877563477, + "learning_rate": 7.918367346938776e-06, + "loss": 0.299, + "step": 11840 + }, + { + "epoch": 60.45918367346939, + "grad_norm": 3.0083467960357666, + "learning_rate": 7.908163265306124e-06, + "loss": 0.3356, + "step": 11850 + }, + { + "epoch": 60.51020408163265, + "grad_norm": 9.174148559570312, + "learning_rate": 7.897959183673471e-06, + "loss": 0.2454, + "step": 11860 + }, + { + "epoch": 60.56122448979592, + "grad_norm": 27.664485931396484, + "learning_rate": 7.887755102040817e-06, + "loss": 0.324, + "step": 11870 + }, + { + "epoch": 60.61224489795919, + "grad_norm": 9.969484329223633, + "learning_rate": 7.877551020408164e-06, + "loss": 0.2079, + "step": 11880 + }, + { + "epoch": 60.66326530612245, + "grad_norm": 23.410911560058594, + "learning_rate": 7.86734693877551e-06, + "loss": 0.1182, + "step": 11890 + }, + { + "epoch": 60.714285714285715, + "grad_norm": 28.38062286376953, + "learning_rate": 7.857142857142858e-06, + "loss": 0.1469, + "step": 11900 + }, + { + "epoch": 60.765306122448976, + "grad_norm": 20.978757858276367, + "learning_rate": 7.846938775510205e-06, + "loss": 0.2617, + "step": 11910 + }, + { + "epoch": 60.816326530612244, + "grad_norm": 18.4687442779541, + "learning_rate": 7.836734693877551e-06, + "loss": 0.2335, + "step": 11920 + }, + { + "epoch": 60.86734693877551, + "grad_norm": 2.435167074203491, + "learning_rate": 7.826530612244898e-06, + "loss": 0.3492, + "step": 11930 + }, + { + "epoch": 60.91836734693877, + "grad_norm": 21.541751861572266, + "learning_rate": 7.816326530612246e-06, + "loss": 0.2464, + "step": 11940 + }, + { + "epoch": 60.96938775510204, + "grad_norm": 29.505895614624023, + "learning_rate": 7.806122448979593e-06, + "loss": 0.2356, + "step": 11950 + }, + { + "epoch": 61.0, + "eval_accuracy": 0.8880866425992779, + "eval_loss": 0.7832220792770386, + "eval_runtime": 1.0098, + "eval_samples_per_second": 274.317, + "eval_steps_per_second": 34.661, + "step": 11956 + }, + { + "epoch": 61.02040816326531, + "grad_norm": 22.8256893157959, + "learning_rate": 7.79591836734694e-06, + "loss": 0.1427, + "step": 11960 + }, + { + "epoch": 61.07142857142857, + "grad_norm": 8.644538879394531, + "learning_rate": 7.785714285714287e-06, + "loss": 0.1863, + "step": 11970 + }, + { + "epoch": 61.12244897959184, + "grad_norm": 2.185852289199829, + "learning_rate": 7.775510204081632e-06, + "loss": 0.1894, + "step": 11980 + }, + { + "epoch": 61.173469387755105, + "grad_norm": 43.12406539916992, + "learning_rate": 7.76530612244898e-06, + "loss": 0.3048, + "step": 11990 + }, + { + "epoch": 61.224489795918366, + "grad_norm": 2.153162717819214, + "learning_rate": 7.755102040816327e-06, + "loss": 0.3395, + "step": 12000 + }, + { + "epoch": 61.275510204081634, + "grad_norm": 10.847805976867676, + "learning_rate": 7.744897959183673e-06, + "loss": 0.3195, + "step": 12010 + }, + { + "epoch": 61.326530612244895, + "grad_norm": 4.3105621337890625, + "learning_rate": 7.73469387755102e-06, + "loss": 0.1406, + "step": 12020 + }, + { + "epoch": 61.37755102040816, + "grad_norm": 2.6797704696655273, + "learning_rate": 7.724489795918368e-06, + "loss": 0.1415, + "step": 12030 + }, + { + "epoch": 61.42857142857143, + "grad_norm": 14.327332496643066, + "learning_rate": 7.714285714285716e-06, + "loss": 0.3265, + "step": 12040 + }, + { + "epoch": 61.47959183673469, + "grad_norm": 1.3813791275024414, + "learning_rate": 7.704081632653061e-06, + "loss": 0.1746, + "step": 12050 + }, + { + "epoch": 61.53061224489796, + "grad_norm": 1.5404123067855835, + "learning_rate": 7.693877551020409e-06, + "loss": 0.156, + "step": 12060 + }, + { + "epoch": 61.58163265306123, + "grad_norm": 19.875141143798828, + "learning_rate": 7.683673469387756e-06, + "loss": 0.2375, + "step": 12070 + }, + { + "epoch": 61.63265306122449, + "grad_norm": 22.32607650756836, + "learning_rate": 7.673469387755102e-06, + "loss": 0.2622, + "step": 12080 + }, + { + "epoch": 61.683673469387756, + "grad_norm": 49.059852600097656, + "learning_rate": 7.66326530612245e-06, + "loss": 0.4982, + "step": 12090 + }, + { + "epoch": 61.734693877551024, + "grad_norm": 0.6080814003944397, + "learning_rate": 7.653061224489796e-06, + "loss": 0.0983, + "step": 12100 + }, + { + "epoch": 61.785714285714285, + "grad_norm": 1.038907766342163, + "learning_rate": 7.642857142857143e-06, + "loss": 0.292, + "step": 12110 + }, + { + "epoch": 61.83673469387755, + "grad_norm": 9.938033103942871, + "learning_rate": 7.63265306122449e-06, + "loss": 0.1049, + "step": 12120 + }, + { + "epoch": 61.88775510204081, + "grad_norm": 3.6407618522644043, + "learning_rate": 7.622448979591838e-06, + "loss": 0.2859, + "step": 12130 + }, + { + "epoch": 61.93877551020408, + "grad_norm": 16.786304473876953, + "learning_rate": 7.612244897959185e-06, + "loss": 0.2256, + "step": 12140 + }, + { + "epoch": 61.98979591836735, + "grad_norm": 36.143856048583984, + "learning_rate": 7.602040816326531e-06, + "loss": 0.3865, + "step": 12150 + }, + { + "epoch": 62.0, + "eval_accuracy": 0.8916967509025271, + "eval_loss": 0.46221768856048584, + "eval_runtime": 0.9995, + "eval_samples_per_second": 277.136, + "eval_steps_per_second": 35.017, + "step": 12152 + }, + { + "epoch": 62.04081632653061, + "grad_norm": 12.808893203735352, + "learning_rate": 7.591836734693878e-06, + "loss": 0.1058, + "step": 12160 + }, + { + "epoch": 62.09183673469388, + "grad_norm": 0.4051969647407532, + "learning_rate": 7.581632653061225e-06, + "loss": 0.0794, + "step": 12170 + }, + { + "epoch": 62.142857142857146, + "grad_norm": 32.404815673828125, + "learning_rate": 7.571428571428572e-06, + "loss": 0.2429, + "step": 12180 + }, + { + "epoch": 62.19387755102041, + "grad_norm": 8.866398811340332, + "learning_rate": 7.561224489795919e-06, + "loss": 0.3704, + "step": 12190 + }, + { + "epoch": 62.244897959183675, + "grad_norm": 24.656524658203125, + "learning_rate": 7.551020408163265e-06, + "loss": 0.1116, + "step": 12200 + }, + { + "epoch": 62.295918367346935, + "grad_norm": 5.963210105895996, + "learning_rate": 7.540816326530614e-06, + "loss": 0.2053, + "step": 12210 + }, + { + "epoch": 62.3469387755102, + "grad_norm": 20.637388229370117, + "learning_rate": 7.53061224489796e-06, + "loss": 0.3725, + "step": 12220 + }, + { + "epoch": 62.39795918367347, + "grad_norm": 6.2543044090271, + "learning_rate": 7.520408163265307e-06, + "loss": 0.2837, + "step": 12230 + }, + { + "epoch": 62.44897959183673, + "grad_norm": 17.733964920043945, + "learning_rate": 7.5102040816326536e-06, + "loss": 0.2252, + "step": 12240 + }, + { + "epoch": 62.5, + "grad_norm": 0.9982537031173706, + "learning_rate": 7.500000000000001e-06, + "loss": 0.2155, + "step": 12250 + }, + { + "epoch": 62.55102040816327, + "grad_norm": 1.911987066268921, + "learning_rate": 7.489795918367348e-06, + "loss": 0.117, + "step": 12260 + }, + { + "epoch": 62.60204081632653, + "grad_norm": 14.23578929901123, + "learning_rate": 7.479591836734694e-06, + "loss": 0.2181, + "step": 12270 + }, + { + "epoch": 62.6530612244898, + "grad_norm": 11.06711196899414, + "learning_rate": 7.469387755102041e-06, + "loss": 0.3577, + "step": 12280 + }, + { + "epoch": 62.704081632653065, + "grad_norm": 1.1529871225357056, + "learning_rate": 7.459183673469388e-06, + "loss": 0.0949, + "step": 12290 + }, + { + "epoch": 62.755102040816325, + "grad_norm": 0.31703999638557434, + "learning_rate": 7.448979591836736e-06, + "loss": 0.1197, + "step": 12300 + }, + { + "epoch": 62.80612244897959, + "grad_norm": 5.484655380249023, + "learning_rate": 7.4387755102040826e-06, + "loss": 0.2553, + "step": 12310 + }, + { + "epoch": 62.857142857142854, + "grad_norm": 8.561174392700195, + "learning_rate": 7.428571428571429e-06, + "loss": 0.2765, + "step": 12320 + }, + { + "epoch": 62.90816326530612, + "grad_norm": 6.939566135406494, + "learning_rate": 7.418367346938776e-06, + "loss": 0.1403, + "step": 12330 + }, + { + "epoch": 62.95918367346939, + "grad_norm": 3.5760629177093506, + "learning_rate": 7.408163265306123e-06, + "loss": 0.3504, + "step": 12340 + }, + { + "epoch": 63.0, + "eval_accuracy": 0.8772563176895307, + "eval_loss": 0.5889376997947693, + "eval_runtime": 0.9994, + "eval_samples_per_second": 277.173, + "eval_steps_per_second": 35.022, + "step": 12348 + }, + { + "epoch": 63.01020408163265, + "grad_norm": 30.241453170776367, + "learning_rate": 7.39795918367347e-06, + "loss": 0.335, + "step": 12350 + }, + { + "epoch": 63.06122448979592, + "grad_norm": 18.252962112426758, + "learning_rate": 7.387755102040817e-06, + "loss": 0.2433, + "step": 12360 + }, + { + "epoch": 63.11224489795919, + "grad_norm": 1.6258810758590698, + "learning_rate": 7.377551020408163e-06, + "loss": 0.2805, + "step": 12370 + }, + { + "epoch": 63.16326530612245, + "grad_norm": 22.66288948059082, + "learning_rate": 7.367346938775511e-06, + "loss": 0.1351, + "step": 12380 + }, + { + "epoch": 63.214285714285715, + "grad_norm": 23.972553253173828, + "learning_rate": 7.357142857142858e-06, + "loss": 0.1741, + "step": 12390 + }, + { + "epoch": 63.265306122448976, + "grad_norm": 10.380844116210938, + "learning_rate": 7.346938775510205e-06, + "loss": 0.1509, + "step": 12400 + }, + { + "epoch": 63.316326530612244, + "grad_norm": 16.45281219482422, + "learning_rate": 7.3367346938775515e-06, + "loss": 0.2157, + "step": 12410 + }, + { + "epoch": 63.36734693877551, + "grad_norm": 14.863101959228516, + "learning_rate": 7.326530612244899e-06, + "loss": 0.2718, + "step": 12420 + }, + { + "epoch": 63.41836734693877, + "grad_norm": 0.8119106888771057, + "learning_rate": 7.316326530612246e-06, + "loss": 0.1634, + "step": 12430 + }, + { + "epoch": 63.46938775510204, + "grad_norm": 21.098934173583984, + "learning_rate": 7.306122448979592e-06, + "loss": 0.2267, + "step": 12440 + }, + { + "epoch": 63.52040816326531, + "grad_norm": 1.6102977991104126, + "learning_rate": 7.295918367346939e-06, + "loss": 0.4136, + "step": 12450 + }, + { + "epoch": 63.57142857142857, + "grad_norm": 0.4376663565635681, + "learning_rate": 7.285714285714286e-06, + "loss": 0.2111, + "step": 12460 + }, + { + "epoch": 63.62244897959184, + "grad_norm": 2.0043997764587402, + "learning_rate": 7.275510204081633e-06, + "loss": 0.4028, + "step": 12470 + }, + { + "epoch": 63.673469387755105, + "grad_norm": 0.6079874634742737, + "learning_rate": 7.2653061224489805e-06, + "loss": 0.1818, + "step": 12480 + }, + { + "epoch": 63.724489795918366, + "grad_norm": 21.697187423706055, + "learning_rate": 7.255102040816327e-06, + "loss": 0.1508, + "step": 12490 + }, + { + "epoch": 63.775510204081634, + "grad_norm": 20.07520866394043, + "learning_rate": 7.244897959183675e-06, + "loss": 0.3788, + "step": 12500 + }, + { + "epoch": 63.826530612244895, + "grad_norm": 0.9484468698501587, + "learning_rate": 7.234693877551021e-06, + "loss": 0.2407, + "step": 12510 + }, + { + "epoch": 63.87755102040816, + "grad_norm": 22.69029426574707, + "learning_rate": 7.224489795918368e-06, + "loss": 0.4261, + "step": 12520 + }, + { + "epoch": 63.92857142857143, + "grad_norm": 12.349723815917969, + "learning_rate": 7.2142857142857145e-06, + "loss": 0.1916, + "step": 12530 + }, + { + "epoch": 63.97959183673469, + "grad_norm": 3.0168395042419434, + "learning_rate": 7.204081632653061e-06, + "loss": 0.3766, + "step": 12540 + }, + { + "epoch": 64.0, + "eval_accuracy": 0.8592057761732852, + "eval_loss": 0.5246071815490723, + "eval_runtime": 1.0847, + "eval_samples_per_second": 255.359, + "eval_steps_per_second": 32.266, + "step": 12544 + }, + { + "epoch": 64.03061224489795, + "grad_norm": 23.5684757232666, + "learning_rate": 7.193877551020409e-06, + "loss": 0.1266, + "step": 12550 + }, + { + "epoch": 64.08163265306122, + "grad_norm": 46.36762237548828, + "learning_rate": 7.183673469387755e-06, + "loss": 0.243, + "step": 12560 + }, + { + "epoch": 64.13265306122449, + "grad_norm": 2.579113006591797, + "learning_rate": 7.173469387755103e-06, + "loss": 0.3378, + "step": 12570 + }, + { + "epoch": 64.18367346938776, + "grad_norm": 37.33270263671875, + "learning_rate": 7.16326530612245e-06, + "loss": 0.2071, + "step": 12580 + }, + { + "epoch": 64.23469387755102, + "grad_norm": 4.427562713623047, + "learning_rate": 7.153061224489797e-06, + "loss": 0.153, + "step": 12590 + }, + { + "epoch": 64.28571428571429, + "grad_norm": 11.904278755187988, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.1123, + "step": 12600 + }, + { + "epoch": 64.33673469387755, + "grad_norm": 9.346539497375488, + "learning_rate": 7.13265306122449e-06, + "loss": 0.2683, + "step": 12610 + }, + { + "epoch": 64.38775510204081, + "grad_norm": 21.351396560668945, + "learning_rate": 7.122448979591837e-06, + "loss": 0.1731, + "step": 12620 + }, + { + "epoch": 64.43877551020408, + "grad_norm": 3.145725965499878, + "learning_rate": 7.112244897959184e-06, + "loss": 0.0873, + "step": 12630 + }, + { + "epoch": 64.48979591836735, + "grad_norm": 26.72862434387207, + "learning_rate": 7.102040816326531e-06, + "loss": 0.2522, + "step": 12640 + }, + { + "epoch": 64.54081632653062, + "grad_norm": 0.8792725801467896, + "learning_rate": 7.091836734693878e-06, + "loss": 0.3313, + "step": 12650 + }, + { + "epoch": 64.59183673469387, + "grad_norm": 9.251016616821289, + "learning_rate": 7.081632653061226e-06, + "loss": 0.2437, + "step": 12660 + }, + { + "epoch": 64.64285714285714, + "grad_norm": 22.425621032714844, + "learning_rate": 7.0714285714285726e-06, + "loss": 0.2607, + "step": 12670 + }, + { + "epoch": 64.6938775510204, + "grad_norm": 7.240108489990234, + "learning_rate": 7.061224489795919e-06, + "loss": 0.4146, + "step": 12680 + }, + { + "epoch": 64.74489795918367, + "grad_norm": 21.328227996826172, + "learning_rate": 7.051020408163266e-06, + "loss": 0.5971, + "step": 12690 + }, + { + "epoch": 64.79591836734694, + "grad_norm": 6.054476261138916, + "learning_rate": 7.0408163265306125e-06, + "loss": 0.0715, + "step": 12700 + }, + { + "epoch": 64.84693877551021, + "grad_norm": 25.8214111328125, + "learning_rate": 7.03061224489796e-06, + "loss": 0.1869, + "step": 12710 + }, + { + "epoch": 64.89795918367346, + "grad_norm": 15.36678409576416, + "learning_rate": 7.020408163265307e-06, + "loss": 0.166, + "step": 12720 + }, + { + "epoch": 64.94897959183673, + "grad_norm": 32.016143798828125, + "learning_rate": 7.010204081632653e-06, + "loss": 0.1881, + "step": 12730 + }, + { + "epoch": 65.0, + "grad_norm": 30.883359909057617, + "learning_rate": 7e-06, + "loss": 0.1336, + "step": 12740 + }, + { + "epoch": 65.0, + "eval_accuracy": 0.8772563176895307, + "eval_loss": 0.6461602449417114, + "eval_runtime": 1.0394, + "eval_samples_per_second": 266.491, + "eval_steps_per_second": 33.672, + "step": 12740 + }, + { + "epoch": 65.05102040816327, + "grad_norm": 5.947268486022949, + "learning_rate": 6.989795918367348e-06, + "loss": 0.0624, + "step": 12750 + }, + { + "epoch": 65.10204081632654, + "grad_norm": 1.8414580821990967, + "learning_rate": 6.979591836734695e-06, + "loss": 0.2217, + "step": 12760 + }, + { + "epoch": 65.15306122448979, + "grad_norm": 15.210898399353027, + "learning_rate": 6.9693877551020415e-06, + "loss": 0.209, + "step": 12770 + }, + { + "epoch": 65.20408163265306, + "grad_norm": 1.1254956722259521, + "learning_rate": 6.959183673469388e-06, + "loss": 0.4428, + "step": 12780 + }, + { + "epoch": 65.25510204081633, + "grad_norm": 26.08260726928711, + "learning_rate": 6.948979591836736e-06, + "loss": 0.2462, + "step": 12790 + }, + { + "epoch": 65.3061224489796, + "grad_norm": 23.332658767700195, + "learning_rate": 6.938775510204082e-06, + "loss": 0.1654, + "step": 12800 + }, + { + "epoch": 65.35714285714286, + "grad_norm": 18.200082778930664, + "learning_rate": 6.928571428571429e-06, + "loss": 0.1999, + "step": 12810 + }, + { + "epoch": 65.40816326530613, + "grad_norm": 7.355938911437988, + "learning_rate": 6.9183673469387755e-06, + "loss": 0.4258, + "step": 12820 + }, + { + "epoch": 65.45918367346938, + "grad_norm": 29.441251754760742, + "learning_rate": 6.908163265306122e-06, + "loss": 0.2091, + "step": 12830 + }, + { + "epoch": 65.51020408163265, + "grad_norm": 32.40293502807617, + "learning_rate": 6.8979591836734705e-06, + "loss": 0.3265, + "step": 12840 + }, + { + "epoch": 65.56122448979592, + "grad_norm": 29.68283462524414, + "learning_rate": 6.887755102040817e-06, + "loss": 0.1929, + "step": 12850 + }, + { + "epoch": 65.61224489795919, + "grad_norm": 15.24181842803955, + "learning_rate": 6.877551020408164e-06, + "loss": 0.2588, + "step": 12860 + }, + { + "epoch": 65.66326530612245, + "grad_norm": 15.568742752075195, + "learning_rate": 6.867346938775511e-06, + "loss": 0.1526, + "step": 12870 + }, + { + "epoch": 65.71428571428571, + "grad_norm": 13.686704635620117, + "learning_rate": 6.857142857142858e-06, + "loss": 0.227, + "step": 12880 + }, + { + "epoch": 65.76530612244898, + "grad_norm": 31.384748458862305, + "learning_rate": 6.8469387755102046e-06, + "loss": 0.4605, + "step": 12890 + }, + { + "epoch": 65.81632653061224, + "grad_norm": 2.28995943069458, + "learning_rate": 6.836734693877551e-06, + "loss": 0.1354, + "step": 12900 + }, + { + "epoch": 65.86734693877551, + "grad_norm": 20.415807723999023, + "learning_rate": 6.826530612244898e-06, + "loss": 0.2293, + "step": 12910 + }, + { + "epoch": 65.91836734693878, + "grad_norm": 47.58285140991211, + "learning_rate": 6.816326530612245e-06, + "loss": 0.4399, + "step": 12920 + }, + { + "epoch": 65.96938775510205, + "grad_norm": 26.599428176879883, + "learning_rate": 6.806122448979592e-06, + "loss": 0.3275, + "step": 12930 + }, + { + "epoch": 66.0, + "eval_accuracy": 0.8628158844765343, + "eval_loss": 0.5013401508331299, + "eval_runtime": 1.0125, + "eval_samples_per_second": 273.57, + "eval_steps_per_second": 34.567, + "step": 12936 + }, + { + "epoch": 66.0204081632653, + "grad_norm": 2.0271527767181396, + "learning_rate": 6.7959183673469394e-06, + "loss": 0.2295, + "step": 12940 + }, + { + "epoch": 66.07142857142857, + "grad_norm": 10.203573226928711, + "learning_rate": 6.785714285714287e-06, + "loss": 0.3789, + "step": 12950 + }, + { + "epoch": 66.12244897959184, + "grad_norm": 3.814969778060913, + "learning_rate": 6.7755102040816336e-06, + "loss": 0.2151, + "step": 12960 + }, + { + "epoch": 66.1734693877551, + "grad_norm": 30.38828468322754, + "learning_rate": 6.76530612244898e-06, + "loss": 0.2755, + "step": 12970 + }, + { + "epoch": 66.22448979591837, + "grad_norm": 14.487350463867188, + "learning_rate": 6.755102040816327e-06, + "loss": 0.384, + "step": 12980 + }, + { + "epoch": 66.27551020408163, + "grad_norm": 2.6648643016815186, + "learning_rate": 6.7448979591836735e-06, + "loss": 0.0881, + "step": 12990 + }, + { + "epoch": 66.3265306122449, + "grad_norm": 32.701786041259766, + "learning_rate": 6.734693877551021e-06, + "loss": 0.4149, + "step": 13000 + }, + { + "epoch": 66.37755102040816, + "grad_norm": 28.567655563354492, + "learning_rate": 6.724489795918368e-06, + "loss": 0.3084, + "step": 13010 + }, + { + "epoch": 66.42857142857143, + "grad_norm": 0.6487907767295837, + "learning_rate": 6.714285714285714e-06, + "loss": 0.0987, + "step": 13020 + }, + { + "epoch": 66.4795918367347, + "grad_norm": 19.951332092285156, + "learning_rate": 6.704081632653063e-06, + "loss": 0.2143, + "step": 13030 + }, + { + "epoch": 66.53061224489795, + "grad_norm": 3.4577834606170654, + "learning_rate": 6.693877551020409e-06, + "loss": 0.1117, + "step": 13040 + }, + { + "epoch": 66.58163265306122, + "grad_norm": 24.311569213867188, + "learning_rate": 6.683673469387756e-06, + "loss": 0.1493, + "step": 13050 + }, + { + "epoch": 66.63265306122449, + "grad_norm": 0.6273188591003418, + "learning_rate": 6.6734693877551025e-06, + "loss": 0.164, + "step": 13060 + }, + { + "epoch": 66.68367346938776, + "grad_norm": 2.258786201477051, + "learning_rate": 6.663265306122449e-06, + "loss": 0.2118, + "step": 13070 + }, + { + "epoch": 66.73469387755102, + "grad_norm": 44.41456604003906, + "learning_rate": 6.653061224489797e-06, + "loss": 0.2651, + "step": 13080 + }, + { + "epoch": 66.78571428571429, + "grad_norm": 2.3279895782470703, + "learning_rate": 6.642857142857143e-06, + "loss": 0.3493, + "step": 13090 + }, + { + "epoch": 66.83673469387755, + "grad_norm": 5.928819179534912, + "learning_rate": 6.63265306122449e-06, + "loss": 0.2413, + "step": 13100 + }, + { + "epoch": 66.88775510204081, + "grad_norm": 14.05907917022705, + "learning_rate": 6.6224489795918365e-06, + "loss": 0.11, + "step": 13110 + }, + { + "epoch": 66.93877551020408, + "grad_norm": 3.6681485176086426, + "learning_rate": 6.612244897959185e-06, + "loss": 0.0933, + "step": 13120 + }, + { + "epoch": 66.98979591836735, + "grad_norm": 1.1369929313659668, + "learning_rate": 6.6020408163265315e-06, + "loss": 0.3765, + "step": 13130 + }, + { + "epoch": 67.0, + "eval_accuracy": 0.8953068592057761, + "eval_loss": 0.4856545329093933, + "eval_runtime": 1.0052, + "eval_samples_per_second": 275.572, + "eval_steps_per_second": 34.82, + "step": 13132 + }, + { + "epoch": 67.04081632653062, + "grad_norm": 20.271913528442383, + "learning_rate": 6.591836734693878e-06, + "loss": 0.1693, + "step": 13140 + }, + { + "epoch": 67.09183673469387, + "grad_norm": 45.590736389160156, + "learning_rate": 6.581632653061225e-06, + "loss": 0.2676, + "step": 13150 + }, + { + "epoch": 67.14285714285714, + "grad_norm": 23.853425979614258, + "learning_rate": 6.571428571428572e-06, + "loss": 0.2243, + "step": 13160 + }, + { + "epoch": 67.1938775510204, + "grad_norm": 12.5801420211792, + "learning_rate": 6.561224489795919e-06, + "loss": 0.2764, + "step": 13170 + }, + { + "epoch": 67.24489795918367, + "grad_norm": 18.84653663635254, + "learning_rate": 6.5510204081632656e-06, + "loss": 0.0919, + "step": 13180 + }, + { + "epoch": 67.29591836734694, + "grad_norm": 46.19068908691406, + "learning_rate": 6.540816326530612e-06, + "loss": 0.4186, + "step": 13190 + }, + { + "epoch": 67.34693877551021, + "grad_norm": 31.988140106201172, + "learning_rate": 6.530612244897959e-06, + "loss": 0.1621, + "step": 13200 + }, + { + "epoch": 67.39795918367346, + "grad_norm": 25.058948516845703, + "learning_rate": 6.520408163265307e-06, + "loss": 0.3666, + "step": 13210 + }, + { + "epoch": 67.44897959183673, + "grad_norm": 21.009185791015625, + "learning_rate": 6.510204081632654e-06, + "loss": 0.2347, + "step": 13220 + }, + { + "epoch": 67.5, + "grad_norm": 11.084943771362305, + "learning_rate": 6.5000000000000004e-06, + "loss": 0.1314, + "step": 13230 + }, + { + "epoch": 67.55102040816327, + "grad_norm": 9.887314796447754, + "learning_rate": 6.489795918367348e-06, + "loss": 0.37, + "step": 13240 + }, + { + "epoch": 67.60204081632654, + "grad_norm": 17.127517700195312, + "learning_rate": 6.4795918367346946e-06, + "loss": 0.1555, + "step": 13250 + }, + { + "epoch": 67.65306122448979, + "grad_norm": 5.455459117889404, + "learning_rate": 6.469387755102041e-06, + "loss": 0.1799, + "step": 13260 + }, + { + "epoch": 67.70408163265306, + "grad_norm": 38.28322982788086, + "learning_rate": 6.459183673469388e-06, + "loss": 0.2256, + "step": 13270 + }, + { + "epoch": 67.75510204081633, + "grad_norm": 32.398338317871094, + "learning_rate": 6.4489795918367345e-06, + "loss": 0.4517, + "step": 13280 + }, + { + "epoch": 67.8061224489796, + "grad_norm": 3.4224867820739746, + "learning_rate": 6.438775510204082e-06, + "loss": 0.3524, + "step": 13290 + }, + { + "epoch": 67.85714285714286, + "grad_norm": 1.3440306186676025, + "learning_rate": 6.4285714285714295e-06, + "loss": 0.2477, + "step": 13300 + }, + { + "epoch": 67.90816326530613, + "grad_norm": 43.639404296875, + "learning_rate": 6.418367346938776e-06, + "loss": 0.2372, + "step": 13310 + }, + { + "epoch": 67.95918367346938, + "grad_norm": 20.636371612548828, + "learning_rate": 6.408163265306124e-06, + "loss": 0.1622, + "step": 13320 + }, + { + "epoch": 68.0, + "eval_accuracy": 0.8844765342960289, + "eval_loss": 0.4917532503604889, + "eval_runtime": 1.0071, + "eval_samples_per_second": 275.051, + "eval_steps_per_second": 34.754, + "step": 13328 + }, + { + "epoch": 68.01020408163265, + "grad_norm": 0.5114874839782715, + "learning_rate": 6.39795918367347e-06, + "loss": 0.3192, + "step": 13330 + }, + { + "epoch": 68.06122448979592, + "grad_norm": 15.068284034729004, + "learning_rate": 6.387755102040817e-06, + "loss": 0.2615, + "step": 13340 + }, + { + "epoch": 68.11224489795919, + "grad_norm": 15.123217582702637, + "learning_rate": 6.3775510204081635e-06, + "loss": 0.2608, + "step": 13350 + }, + { + "epoch": 68.16326530612245, + "grad_norm": 41.276954650878906, + "learning_rate": 6.36734693877551e-06, + "loss": 0.3798, + "step": 13360 + }, + { + "epoch": 68.21428571428571, + "grad_norm": 35.171321868896484, + "learning_rate": 6.357142857142858e-06, + "loss": 0.2842, + "step": 13370 + }, + { + "epoch": 68.26530612244898, + "grad_norm": 5.952198505401611, + "learning_rate": 6.346938775510204e-06, + "loss": 0.1273, + "step": 13380 + }, + { + "epoch": 68.31632653061224, + "grad_norm": 0.8108131885528564, + "learning_rate": 6.336734693877552e-06, + "loss": 0.1766, + "step": 13390 + }, + { + "epoch": 68.36734693877551, + "grad_norm": 35.16721725463867, + "learning_rate": 6.326530612244899e-06, + "loss": 0.3258, + "step": 13400 + }, + { + "epoch": 68.41836734693878, + "grad_norm": 32.322086334228516, + "learning_rate": 6.316326530612246e-06, + "loss": 0.2416, + "step": 13410 + }, + { + "epoch": 68.46938775510205, + "grad_norm": 27.62706184387207, + "learning_rate": 6.3061224489795925e-06, + "loss": 0.1429, + "step": 13420 + }, + { + "epoch": 68.5204081632653, + "grad_norm": 31.36840057373047, + "learning_rate": 6.295918367346939e-06, + "loss": 0.1854, + "step": 13430 + }, + { + "epoch": 68.57142857142857, + "grad_norm": 26.24332618713379, + "learning_rate": 6.285714285714286e-06, + "loss": 0.1213, + "step": 13440 + }, + { + "epoch": 68.62244897959184, + "grad_norm": 27.90714454650879, + "learning_rate": 6.275510204081633e-06, + "loss": 0.1611, + "step": 13450 + }, + { + "epoch": 68.6734693877551, + "grad_norm": 0.6034018397331238, + "learning_rate": 6.26530612244898e-06, + "loss": 0.181, + "step": 13460 + }, + { + "epoch": 68.72448979591837, + "grad_norm": 17.892732620239258, + "learning_rate": 6.2551020408163266e-06, + "loss": 0.2751, + "step": 13470 + }, + { + "epoch": 68.77551020408163, + "grad_norm": 31.821821212768555, + "learning_rate": 6.244897959183675e-06, + "loss": 0.402, + "step": 13480 + }, + { + "epoch": 68.8265306122449, + "grad_norm": 30.57606315612793, + "learning_rate": 6.2346938775510215e-06, + "loss": 0.1712, + "step": 13490 + }, + { + "epoch": 68.87755102040816, + "grad_norm": 7.244333267211914, + "learning_rate": 6.224489795918368e-06, + "loss": 0.3112, + "step": 13500 + }, + { + "epoch": 68.92857142857143, + "grad_norm": 1.7811192274093628, + "learning_rate": 6.214285714285715e-06, + "loss": 0.2679, + "step": 13510 + }, + { + "epoch": 68.9795918367347, + "grad_norm": 24.882198333740234, + "learning_rate": 6.2040816326530614e-06, + "loss": 0.2291, + "step": 13520 + }, + { + "epoch": 69.0, + "eval_accuracy": 0.8736462093862816, + "eval_loss": 0.5733516812324524, + "eval_runtime": 1.0187, + "eval_samples_per_second": 271.928, + "eval_steps_per_second": 34.359, + "step": 13524 + }, + { + "epoch": 69.03061224489795, + "grad_norm": 38.79179382324219, + "learning_rate": 6.193877551020409e-06, + "loss": 0.3698, + "step": 13530 + }, + { + "epoch": 69.08163265306122, + "grad_norm": 21.131038665771484, + "learning_rate": 6.1836734693877556e-06, + "loss": 0.2493, + "step": 13540 + }, + { + "epoch": 69.13265306122449, + "grad_norm": 4.691605091094971, + "learning_rate": 6.173469387755102e-06, + "loss": 0.1243, + "step": 13550 + }, + { + "epoch": 69.18367346938776, + "grad_norm": 1.8720529079437256, + "learning_rate": 6.163265306122449e-06, + "loss": 0.1552, + "step": 13560 + }, + { + "epoch": 69.23469387755102, + "grad_norm": 4.829433917999268, + "learning_rate": 6.153061224489797e-06, + "loss": 0.2175, + "step": 13570 + }, + { + "epoch": 69.28571428571429, + "grad_norm": 35.22263717651367, + "learning_rate": 6.142857142857144e-06, + "loss": 0.3623, + "step": 13580 + }, + { + "epoch": 69.33673469387755, + "grad_norm": 20.385683059692383, + "learning_rate": 6.1326530612244905e-06, + "loss": 0.1108, + "step": 13590 + }, + { + "epoch": 69.38775510204081, + "grad_norm": 5.461350440979004, + "learning_rate": 6.122448979591837e-06, + "loss": 0.1574, + "step": 13600 + }, + { + "epoch": 69.43877551020408, + "grad_norm": 1.2874220609664917, + "learning_rate": 6.112244897959185e-06, + "loss": 0.5626, + "step": 13610 + }, + { + "epoch": 69.48979591836735, + "grad_norm": 7.270111083984375, + "learning_rate": 6.102040816326531e-06, + "loss": 0.304, + "step": 13620 + }, + { + "epoch": 69.54081632653062, + "grad_norm": 2.0195155143737793, + "learning_rate": 6.091836734693878e-06, + "loss": 0.1614, + "step": 13630 + }, + { + "epoch": 69.59183673469387, + "grad_norm": 19.15982437133789, + "learning_rate": 6.0816326530612245e-06, + "loss": 0.2432, + "step": 13640 + }, + { + "epoch": 69.64285714285714, + "grad_norm": 10.150402069091797, + "learning_rate": 6.071428571428571e-06, + "loss": 0.317, + "step": 13650 + }, + { + "epoch": 69.6938775510204, + "grad_norm": 10.678250312805176, + "learning_rate": 6.0612244897959195e-06, + "loss": 0.1421, + "step": 13660 + }, + { + "epoch": 69.74489795918367, + "grad_norm": 15.709683418273926, + "learning_rate": 6.051020408163266e-06, + "loss": 0.2212, + "step": 13670 + }, + { + "epoch": 69.79591836734694, + "grad_norm": 2.9321069717407227, + "learning_rate": 6.040816326530613e-06, + "loss": 0.1137, + "step": 13680 + }, + { + "epoch": 69.84693877551021, + "grad_norm": 31.105802536010742, + "learning_rate": 6.03061224489796e-06, + "loss": 0.083, + "step": 13690 + }, + { + "epoch": 69.89795918367346, + "grad_norm": 18.391803741455078, + "learning_rate": 6.020408163265307e-06, + "loss": 0.2768, + "step": 13700 + }, + { + "epoch": 69.94897959183673, + "grad_norm": 5.542973518371582, + "learning_rate": 6.0102040816326535e-06, + "loss": 0.1118, + "step": 13710 + }, + { + "epoch": 70.0, + "grad_norm": 41.31367111206055, + "learning_rate": 6e-06, + "loss": 0.1786, + "step": 13720 + }, + { + "epoch": 70.0, + "eval_accuracy": 0.8231046931407943, + "eval_loss": 0.6691077351570129, + "eval_runtime": 1.0069, + "eval_samples_per_second": 275.089, + "eval_steps_per_second": 34.759, + "step": 13720 + }, + { + "epoch": 70.05102040816327, + "grad_norm": 18.903507232666016, + "learning_rate": 5.989795918367347e-06, + "loss": 0.3383, + "step": 13730 + }, + { + "epoch": 70.10204081632654, + "grad_norm": 2.2333953380584717, + "learning_rate": 5.979591836734694e-06, + "loss": 0.152, + "step": 13740 + }, + { + "epoch": 70.15306122448979, + "grad_norm": 11.092477798461914, + "learning_rate": 5.969387755102042e-06, + "loss": 0.3006, + "step": 13750 + }, + { + "epoch": 70.20408163265306, + "grad_norm": 6.052380561828613, + "learning_rate": 5.959183673469388e-06, + "loss": 0.1493, + "step": 13760 + }, + { + "epoch": 70.25510204081633, + "grad_norm": 30.79180908203125, + "learning_rate": 5.948979591836735e-06, + "loss": 0.1802, + "step": 13770 + }, + { + "epoch": 70.3061224489796, + "grad_norm": 11.03248119354248, + "learning_rate": 5.9387755102040825e-06, + "loss": 0.1929, + "step": 13780 + }, + { + "epoch": 70.35714285714286, + "grad_norm": 2.2029080390930176, + "learning_rate": 5.928571428571429e-06, + "loss": 0.1121, + "step": 13790 + }, + { + "epoch": 70.40816326530613, + "grad_norm": 8.066947937011719, + "learning_rate": 5.918367346938776e-06, + "loss": 0.1707, + "step": 13800 + }, + { + "epoch": 70.45918367346938, + "grad_norm": 6.4734392166137695, + "learning_rate": 5.9081632653061224e-06, + "loss": 0.2236, + "step": 13810 + }, + { + "epoch": 70.51020408163265, + "grad_norm": 24.212282180786133, + "learning_rate": 5.89795918367347e-06, + "loss": 0.4107, + "step": 13820 + }, + { + "epoch": 70.56122448979592, + "grad_norm": 1.010794997215271, + "learning_rate": 5.8877551020408166e-06, + "loss": 0.241, + "step": 13830 + }, + { + "epoch": 70.61224489795919, + "grad_norm": 0.8334818482398987, + "learning_rate": 5.877551020408164e-06, + "loss": 0.4166, + "step": 13840 + }, + { + "epoch": 70.66326530612245, + "grad_norm": 17.859638214111328, + "learning_rate": 5.867346938775511e-06, + "loss": 0.3133, + "step": 13850 + }, + { + "epoch": 70.71428571428571, + "grad_norm": 48.436744689941406, + "learning_rate": 5.857142857142858e-06, + "loss": 0.2612, + "step": 13860 + }, + { + "epoch": 70.76530612244898, + "grad_norm": 17.945890426635742, + "learning_rate": 5.846938775510205e-06, + "loss": 0.2198, + "step": 13870 + }, + { + "epoch": 70.81632653061224, + "grad_norm": 0.6416598558425903, + "learning_rate": 5.8367346938775515e-06, + "loss": 0.2373, + "step": 13880 + }, + { + "epoch": 70.86734693877551, + "grad_norm": 0.5166141390800476, + "learning_rate": 5.826530612244898e-06, + "loss": 0.1659, + "step": 13890 + }, + { + "epoch": 70.91836734693878, + "grad_norm": 19.99555015563965, + "learning_rate": 5.816326530612246e-06, + "loss": 0.2624, + "step": 13900 + }, + { + "epoch": 70.96938775510205, + "grad_norm": 50.422977447509766, + "learning_rate": 5.806122448979592e-06, + "loss": 0.3451, + "step": 13910 + }, + { + "epoch": 71.0, + "eval_accuracy": 0.8772563176895307, + "eval_loss": 0.7317641973495483, + "eval_runtime": 1.0054, + "eval_samples_per_second": 275.508, + "eval_steps_per_second": 34.811, + "step": 13916 + }, + { + "epoch": 71.0204081632653, + "grad_norm": 31.94869041442871, + "learning_rate": 5.795918367346939e-06, + "loss": 0.3308, + "step": 13920 + }, + { + "epoch": 71.07142857142857, + "grad_norm": 3.873305559158325, + "learning_rate": 5.785714285714286e-06, + "loss": 0.1405, + "step": 13930 + }, + { + "epoch": 71.12244897959184, + "grad_norm": 3.1949236392974854, + "learning_rate": 5.775510204081634e-06, + "loss": 0.1948, + "step": 13940 + }, + { + "epoch": 71.1734693877551, + "grad_norm": 18.632291793823242, + "learning_rate": 5.7653061224489805e-06, + "loss": 0.1845, + "step": 13950 + }, + { + "epoch": 71.22448979591837, + "grad_norm": 2.5716090202331543, + "learning_rate": 5.755102040816327e-06, + "loss": 0.1698, + "step": 13960 + }, + { + "epoch": 71.27551020408163, + "grad_norm": 1.0562517642974854, + "learning_rate": 5.744897959183674e-06, + "loss": 0.3142, + "step": 13970 + }, + { + "epoch": 71.3265306122449, + "grad_norm": 21.013874053955078, + "learning_rate": 5.73469387755102e-06, + "loss": 0.203, + "step": 13980 + }, + { + "epoch": 71.37755102040816, + "grad_norm": 16.857980728149414, + "learning_rate": 5.724489795918368e-06, + "loss": 0.1206, + "step": 13990 + }, + { + "epoch": 71.42857142857143, + "grad_norm": 27.141582489013672, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.3361, + "step": 14000 + }, + { + "epoch": 71.4795918367347, + "grad_norm": 33.69123458862305, + "learning_rate": 5.704081632653061e-06, + "loss": 0.4568, + "step": 14010 + }, + { + "epoch": 71.53061224489795, + "grad_norm": 0.453238844871521, + "learning_rate": 5.6938775510204095e-06, + "loss": 0.3505, + "step": 14020 + }, + { + "epoch": 71.58163265306122, + "grad_norm": 14.864845275878906, + "learning_rate": 5.683673469387756e-06, + "loss": 0.1685, + "step": 14030 + }, + { + "epoch": 71.63265306122449, + "grad_norm": 12.315058708190918, + "learning_rate": 5.673469387755103e-06, + "loss": 0.2161, + "step": 14040 + }, + { + "epoch": 71.68367346938776, + "grad_norm": 27.893156051635742, + "learning_rate": 5.663265306122449e-06, + "loss": 0.2952, + "step": 14050 + }, + { + "epoch": 71.73469387755102, + "grad_norm": 8.868497848510742, + "learning_rate": 5.653061224489796e-06, + "loss": 0.3527, + "step": 14060 + }, + { + "epoch": 71.78571428571429, + "grad_norm": 32.70676040649414, + "learning_rate": 5.6428571428571435e-06, + "loss": 0.1873, + "step": 14070 + }, + { + "epoch": 71.83673469387755, + "grad_norm": 43.73965072631836, + "learning_rate": 5.63265306122449e-06, + "loss": 0.312, + "step": 14080 + }, + { + "epoch": 71.88775510204081, + "grad_norm": 35.661376953125, + "learning_rate": 5.622448979591837e-06, + "loss": 0.1623, + "step": 14090 + }, + { + "epoch": 71.93877551020408, + "grad_norm": 21.950332641601562, + "learning_rate": 5.6122448979591834e-06, + "loss": 0.1211, + "step": 14100 + }, + { + "epoch": 71.98979591836735, + "grad_norm": 3.769540548324585, + "learning_rate": 5.602040816326531e-06, + "loss": 0.2313, + "step": 14110 + }, + { + "epoch": 72.0, + "eval_accuracy": 0.8700361010830325, + "eval_loss": 0.5041437149047852, + "eval_runtime": 1.0068, + "eval_samples_per_second": 275.128, + "eval_steps_per_second": 34.764, + "step": 14112 + }, + { + "epoch": 72.04081632653062, + "grad_norm": 1.8095431327819824, + "learning_rate": 5.591836734693878e-06, + "loss": 0.3457, + "step": 14120 + }, + { + "epoch": 72.09183673469387, + "grad_norm": 12.303759574890137, + "learning_rate": 5.581632653061225e-06, + "loss": 0.1154, + "step": 14130 + }, + { + "epoch": 72.14285714285714, + "grad_norm": 4.44205904006958, + "learning_rate": 5.571428571428572e-06, + "loss": 0.1237, + "step": 14140 + }, + { + "epoch": 72.1938775510204, + "grad_norm": 0.5412704348564148, + "learning_rate": 5.561224489795919e-06, + "loss": 0.4194, + "step": 14150 + }, + { + "epoch": 72.24489795918367, + "grad_norm": 12.714085578918457, + "learning_rate": 5.551020408163266e-06, + "loss": 0.1265, + "step": 14160 + }, + { + "epoch": 72.29591836734694, + "grad_norm": 35.36760330200195, + "learning_rate": 5.5408163265306125e-06, + "loss": 0.2791, + "step": 14170 + }, + { + "epoch": 72.34693877551021, + "grad_norm": 25.820354461669922, + "learning_rate": 5.530612244897959e-06, + "loss": 0.1145, + "step": 14180 + }, + { + "epoch": 72.39795918367346, + "grad_norm": 28.520578384399414, + "learning_rate": 5.520408163265306e-06, + "loss": 0.2316, + "step": 14190 + }, + { + "epoch": 72.44897959183673, + "grad_norm": 45.79890441894531, + "learning_rate": 5.510204081632653e-06, + "loss": 0.3081, + "step": 14200 + }, + { + "epoch": 72.5, + "grad_norm": 22.07777214050293, + "learning_rate": 5.500000000000001e-06, + "loss": 0.3681, + "step": 14210 + }, + { + "epoch": 72.55102040816327, + "grad_norm": 1.2228648662567139, + "learning_rate": 5.489795918367347e-06, + "loss": 0.25, + "step": 14220 + }, + { + "epoch": 72.60204081632654, + "grad_norm": 31.68996238708496, + "learning_rate": 5.479591836734695e-06, + "loss": 0.4552, + "step": 14230 + }, + { + "epoch": 72.65306122448979, + "grad_norm": 5.81611442565918, + "learning_rate": 5.4693877551020415e-06, + "loss": 0.2011, + "step": 14240 + }, + { + "epoch": 72.70408163265306, + "grad_norm": 3.1671016216278076, + "learning_rate": 5.459183673469388e-06, + "loss": 0.2212, + "step": 14250 + }, + { + "epoch": 72.75510204081633, + "grad_norm": 0.48449960350990295, + "learning_rate": 5.448979591836735e-06, + "loss": 0.1818, + "step": 14260 + }, + { + "epoch": 72.8061224489796, + "grad_norm": 25.856773376464844, + "learning_rate": 5.438775510204081e-06, + "loss": 0.2014, + "step": 14270 + }, + { + "epoch": 72.85714285714286, + "grad_norm": 16.805490493774414, + "learning_rate": 5.428571428571429e-06, + "loss": 0.1758, + "step": 14280 + }, + { + "epoch": 72.90816326530613, + "grad_norm": 4.414918422698975, + "learning_rate": 5.4183673469387755e-06, + "loss": 0.0773, + "step": 14290 + }, + { + "epoch": 72.95918367346938, + "grad_norm": 12.022470474243164, + "learning_rate": 5.408163265306123e-06, + "loss": 0.1984, + "step": 14300 + }, + { + "epoch": 73.0, + "eval_accuracy": 0.7689530685920578, + "eval_loss": 0.6517756581306458, + "eval_runtime": 0.9992, + "eval_samples_per_second": 277.224, + "eval_steps_per_second": 35.028, + "step": 14308 + }, + { + "epoch": 73.01020408163265, + "grad_norm": 6.723094463348389, + "learning_rate": 5.3979591836734705e-06, + "loss": 0.0979, + "step": 14310 + }, + { + "epoch": 73.06122448979592, + "grad_norm": 3.94631028175354, + "learning_rate": 5.387755102040817e-06, + "loss": 0.3635, + "step": 14320 + }, + { + "epoch": 73.11224489795919, + "grad_norm": 10.169487953186035, + "learning_rate": 5.377551020408164e-06, + "loss": 0.1903, + "step": 14330 + }, + { + "epoch": 73.16326530612245, + "grad_norm": 57.7105712890625, + "learning_rate": 5.36734693877551e-06, + "loss": 0.4363, + "step": 14340 + }, + { + "epoch": 73.21428571428571, + "grad_norm": 6.537781715393066, + "learning_rate": 5.357142857142857e-06, + "loss": 0.1082, + "step": 14350 + }, + { + "epoch": 73.26530612244898, + "grad_norm": 5.327415466308594, + "learning_rate": 5.3469387755102045e-06, + "loss": 0.1644, + "step": 14360 + }, + { + "epoch": 73.31632653061224, + "grad_norm": 10.126859664916992, + "learning_rate": 5.336734693877551e-06, + "loss": 0.238, + "step": 14370 + }, + { + "epoch": 73.36734693877551, + "grad_norm": 0.5607475638389587, + "learning_rate": 5.326530612244898e-06, + "loss": 0.1348, + "step": 14380 + }, + { + "epoch": 73.41836734693878, + "grad_norm": 12.572546005249023, + "learning_rate": 5.316326530612246e-06, + "loss": 0.5427, + "step": 14390 + }, + { + "epoch": 73.46938775510205, + "grad_norm": 20.89989471435547, + "learning_rate": 5.306122448979593e-06, + "loss": 0.6631, + "step": 14400 + }, + { + "epoch": 73.5204081632653, + "grad_norm": 1.963832139968872, + "learning_rate": 5.295918367346939e-06, + "loss": 0.087, + "step": 14410 + }, + { + "epoch": 73.57142857142857, + "grad_norm": 28.084333419799805, + "learning_rate": 5.285714285714286e-06, + "loss": 0.2199, + "step": 14420 + }, + { + "epoch": 73.62244897959184, + "grad_norm": 18.32474136352539, + "learning_rate": 5.275510204081633e-06, + "loss": 0.3334, + "step": 14430 + }, + { + "epoch": 73.6734693877551, + "grad_norm": 21.624387741088867, + "learning_rate": 5.26530612244898e-06, + "loss": 0.3067, + "step": 14440 + }, + { + "epoch": 73.72448979591837, + "grad_norm": 29.55394744873047, + "learning_rate": 5.255102040816327e-06, + "loss": 0.1547, + "step": 14450 + }, + { + "epoch": 73.77551020408163, + "grad_norm": 30.49858283996582, + "learning_rate": 5.2448979591836735e-06, + "loss": 0.2556, + "step": 14460 + }, + { + "epoch": 73.8265306122449, + "grad_norm": 0.7240644097328186, + "learning_rate": 5.23469387755102e-06, + "loss": 0.1483, + "step": 14470 + }, + { + "epoch": 73.87755102040816, + "grad_norm": 10.325899124145508, + "learning_rate": 5.2244897959183684e-06, + "loss": 0.0704, + "step": 14480 + }, + { + "epoch": 73.92857142857143, + "grad_norm": 32.770015716552734, + "learning_rate": 5.214285714285715e-06, + "loss": 0.1667, + "step": 14490 + }, + { + "epoch": 73.9795918367347, + "grad_norm": 32.657691955566406, + "learning_rate": 5.204081632653062e-06, + "loss": 0.2345, + "step": 14500 + }, + { + "epoch": 74.0, + "eval_accuracy": 0.8844765342960289, + "eval_loss": 0.5279582738876343, + "eval_runtime": 1.0001, + "eval_samples_per_second": 276.962, + "eval_steps_per_second": 34.995, + "step": 14504 + }, + { + "epoch": 74.03061224489795, + "grad_norm": 22.30213165283203, + "learning_rate": 5.193877551020408e-06, + "loss": 0.1863, + "step": 14510 + }, + { + "epoch": 74.08163265306122, + "grad_norm": 20.714458465576172, + "learning_rate": 5.183673469387756e-06, + "loss": 0.2903, + "step": 14520 + }, + { + "epoch": 74.13265306122449, + "grad_norm": 0.6000948548316956, + "learning_rate": 5.1734693877551025e-06, + "loss": 0.1457, + "step": 14530 + }, + { + "epoch": 74.18367346938776, + "grad_norm": 20.726655960083008, + "learning_rate": 5.163265306122449e-06, + "loss": 0.158, + "step": 14540 + }, + { + "epoch": 74.23469387755102, + "grad_norm": 15.64941692352295, + "learning_rate": 5.153061224489796e-06, + "loss": 0.2832, + "step": 14550 + }, + { + "epoch": 74.28571428571429, + "grad_norm": 8.78612232208252, + "learning_rate": 5.142857142857142e-06, + "loss": 0.1575, + "step": 14560 + }, + { + "epoch": 74.33673469387755, + "grad_norm": 45.87373733520508, + "learning_rate": 5.132653061224491e-06, + "loss": 0.4281, + "step": 14570 + }, + { + "epoch": 74.38775510204081, + "grad_norm": 9.643601417541504, + "learning_rate": 5.122448979591837e-06, + "loss": 0.3971, + "step": 14580 + }, + { + "epoch": 74.43877551020408, + "grad_norm": 1.9270423650741577, + "learning_rate": 5.112244897959184e-06, + "loss": 0.1595, + "step": 14590 + }, + { + "epoch": 74.48979591836735, + "grad_norm": 25.51284408569336, + "learning_rate": 5.1020408163265315e-06, + "loss": 0.0756, + "step": 14600 + }, + { + "epoch": 74.54081632653062, + "grad_norm": 9.79099178314209, + "learning_rate": 5.091836734693878e-06, + "loss": 0.1911, + "step": 14610 + }, + { + "epoch": 74.59183673469387, + "grad_norm": 26.95984649658203, + "learning_rate": 5.081632653061225e-06, + "loss": 0.1043, + "step": 14620 + }, + { + "epoch": 74.64285714285714, + "grad_norm": 27.77216339111328, + "learning_rate": 5.071428571428571e-06, + "loss": 0.3259, + "step": 14630 + }, + { + "epoch": 74.6938775510204, + "grad_norm": 0.5454763174057007, + "learning_rate": 5.061224489795918e-06, + "loss": 0.1797, + "step": 14640 + }, + { + "epoch": 74.74489795918367, + "grad_norm": 0.34584543108940125, + "learning_rate": 5.0510204081632655e-06, + "loss": 0.3321, + "step": 14650 + }, + { + "epoch": 74.79591836734694, + "grad_norm": 29.086713790893555, + "learning_rate": 5.040816326530613e-06, + "loss": 0.2058, + "step": 14660 + }, + { + "epoch": 74.84693877551021, + "grad_norm": 14.277048110961914, + "learning_rate": 5.03061224489796e-06, + "loss": 0.1291, + "step": 14670 + }, + { + "epoch": 74.89795918367346, + "grad_norm": 11.965267181396484, + "learning_rate": 5.020408163265307e-06, + "loss": 0.2885, + "step": 14680 + }, + { + "epoch": 74.94897959183673, + "grad_norm": 48.427459716796875, + "learning_rate": 5.010204081632654e-06, + "loss": 0.2854, + "step": 14690 + }, + { + "epoch": 75.0, + "grad_norm": 2.302154302597046, + "learning_rate": 5e-06, + "loss": 0.0851, + "step": 14700 + }, + { + "epoch": 75.0, + "eval_accuracy": 0.8916967509025271, + "eval_loss": 0.6301760077476501, + "eval_runtime": 1.0876, + "eval_samples_per_second": 254.686, + "eval_steps_per_second": 32.181, + "step": 14700 + }, + { + "epoch": 75.05102040816327, + "grad_norm": 17.548139572143555, + "learning_rate": 4.989795918367347e-06, + "loss": 0.2201, + "step": 14710 + }, + { + "epoch": 75.10204081632654, + "grad_norm": 15.283744812011719, + "learning_rate": 4.979591836734694e-06, + "loss": 0.5509, + "step": 14720 + }, + { + "epoch": 75.15306122448979, + "grad_norm": 7.129177570343018, + "learning_rate": 4.969387755102041e-06, + "loss": 0.1675, + "step": 14730 + }, + { + "epoch": 75.20408163265306, + "grad_norm": 2.457040309906006, + "learning_rate": 4.959183673469388e-06, + "loss": 0.3166, + "step": 14740 + }, + { + "epoch": 75.25510204081633, + "grad_norm": 27.27569007873535, + "learning_rate": 4.948979591836735e-06, + "loss": 0.2913, + "step": 14750 + }, + { + "epoch": 75.3061224489796, + "grad_norm": 3.5576016902923584, + "learning_rate": 4.938775510204082e-06, + "loss": 0.2552, + "step": 14760 + }, + { + "epoch": 75.35714285714286, + "grad_norm": 0.859001636505127, + "learning_rate": 4.928571428571429e-06, + "loss": 0.4739, + "step": 14770 + }, + { + "epoch": 75.40816326530613, + "grad_norm": 22.442140579223633, + "learning_rate": 4.918367346938776e-06, + "loss": 0.6096, + "step": 14780 + }, + { + "epoch": 75.45918367346938, + "grad_norm": 4.478810787200928, + "learning_rate": 4.908163265306123e-06, + "loss": 0.2171, + "step": 14790 + }, + { + "epoch": 75.51020408163265, + "grad_norm": 40.68708038330078, + "learning_rate": 4.897959183673469e-06, + "loss": 0.2607, + "step": 14800 + }, + { + "epoch": 75.56122448979592, + "grad_norm": 0.20140103995800018, + "learning_rate": 4.887755102040817e-06, + "loss": 0.2787, + "step": 14810 + }, + { + "epoch": 75.61224489795919, + "grad_norm": 27.775543212890625, + "learning_rate": 4.8775510204081635e-06, + "loss": 0.2055, + "step": 14820 + }, + { + "epoch": 75.66326530612245, + "grad_norm": 5.5859174728393555, + "learning_rate": 4.867346938775511e-06, + "loss": 0.2917, + "step": 14830 + }, + { + "epoch": 75.71428571428571, + "grad_norm": 19.516965866088867, + "learning_rate": 4.857142857142858e-06, + "loss": 0.3068, + "step": 14840 + }, + { + "epoch": 75.76530612244898, + "grad_norm": 6.106418609619141, + "learning_rate": 4.846938775510204e-06, + "loss": 0.4217, + "step": 14850 + }, + { + "epoch": 75.81632653061224, + "grad_norm": 30.099210739135742, + "learning_rate": 4.836734693877552e-06, + "loss": 0.187, + "step": 14860 + }, + { + "epoch": 75.86734693877551, + "grad_norm": 2.3075945377349854, + "learning_rate": 4.826530612244898e-06, + "loss": 0.1541, + "step": 14870 + }, + { + "epoch": 75.91836734693878, + "grad_norm": 3.578822374343872, + "learning_rate": 4.816326530612245e-06, + "loss": 0.1841, + "step": 14880 + }, + { + "epoch": 75.96938775510205, + "grad_norm": 22.67223358154297, + "learning_rate": 4.8061224489795925e-06, + "loss": 0.2234, + "step": 14890 + }, + { + "epoch": 76.0, + "eval_accuracy": 0.8808664259927798, + "eval_loss": 0.4842948019504547, + "eval_runtime": 0.9966, + "eval_samples_per_second": 277.949, + "eval_steps_per_second": 35.12, + "step": 14896 + }, + { + "epoch": 76.0204081632653, + "grad_norm": 12.774593353271484, + "learning_rate": 4.795918367346939e-06, + "loss": 0.4351, + "step": 14900 + }, + { + "epoch": 76.07142857142857, + "grad_norm": 14.880784034729004, + "learning_rate": 4.785714285714287e-06, + "loss": 0.087, + "step": 14910 + }, + { + "epoch": 76.12244897959184, + "grad_norm": 4.729762554168701, + "learning_rate": 4.775510204081633e-06, + "loss": 0.0641, + "step": 14920 + }, + { + "epoch": 76.1734693877551, + "grad_norm": 1.0666556358337402, + "learning_rate": 4.76530612244898e-06, + "loss": 0.1364, + "step": 14930 + }, + { + "epoch": 76.22448979591837, + "grad_norm": 29.946163177490234, + "learning_rate": 4.7551020408163265e-06, + "loss": 0.2539, + "step": 14940 + }, + { + "epoch": 76.27551020408163, + "grad_norm": 22.065452575683594, + "learning_rate": 4.744897959183674e-06, + "loss": 0.2437, + "step": 14950 + }, + { + "epoch": 76.3265306122449, + "grad_norm": 0.8338122367858887, + "learning_rate": 4.734693877551021e-06, + "loss": 0.2664, + "step": 14960 + }, + { + "epoch": 76.37755102040816, + "grad_norm": 7.923938751220703, + "learning_rate": 4.724489795918368e-06, + "loss": 0.1858, + "step": 14970 + }, + { + "epoch": 76.42857142857143, + "grad_norm": 24.35021209716797, + "learning_rate": 4.714285714285715e-06, + "loss": 0.2449, + "step": 14980 + }, + { + "epoch": 76.4795918367347, + "grad_norm": 0.31677842140197754, + "learning_rate": 4.704081632653061e-06, + "loss": 0.1659, + "step": 14990 + }, + { + "epoch": 76.53061224489795, + "grad_norm": 22.837501525878906, + "learning_rate": 4.693877551020409e-06, + "loss": 0.1893, + "step": 15000 + }, + { + "epoch": 76.58163265306122, + "grad_norm": 6.759703159332275, + "learning_rate": 4.6836734693877555e-06, + "loss": 0.198, + "step": 15010 + }, + { + "epoch": 76.63265306122449, + "grad_norm": 0.1937294751405716, + "learning_rate": 4.673469387755102e-06, + "loss": 0.3494, + "step": 15020 + }, + { + "epoch": 76.68367346938776, + "grad_norm": 7.638617038726807, + "learning_rate": 4.663265306122449e-06, + "loss": 0.2424, + "step": 15030 + }, + { + "epoch": 76.73469387755102, + "grad_norm": 17.881322860717773, + "learning_rate": 4.653061224489796e-06, + "loss": 0.3592, + "step": 15040 + }, + { + "epoch": 76.78571428571429, + "grad_norm": 9.466768264770508, + "learning_rate": 4.642857142857144e-06, + "loss": 0.1652, + "step": 15050 + }, + { + "epoch": 76.83673469387755, + "grad_norm": 16.348644256591797, + "learning_rate": 4.63265306122449e-06, + "loss": 0.2283, + "step": 15060 + }, + { + "epoch": 76.88775510204081, + "grad_norm": 2.283270835876465, + "learning_rate": 4.622448979591837e-06, + "loss": 0.2391, + "step": 15070 + }, + { + "epoch": 76.93877551020408, + "grad_norm": 12.172776222229004, + "learning_rate": 4.612244897959184e-06, + "loss": 0.2254, + "step": 15080 + }, + { + "epoch": 76.98979591836735, + "grad_norm": 8.979292869567871, + "learning_rate": 4.602040816326531e-06, + "loss": 0.2266, + "step": 15090 + }, + { + "epoch": 77.0, + "eval_accuracy": 0.8628158844765343, + "eval_loss": 0.4900093078613281, + "eval_runtime": 1.0011, + "eval_samples_per_second": 276.691, + "eval_steps_per_second": 34.961, + "step": 15092 + }, + { + "epoch": 77.04081632653062, + "grad_norm": 5.165127277374268, + "learning_rate": 4.591836734693878e-06, + "loss": 0.4326, + "step": 15100 + }, + { + "epoch": 77.09183673469387, + "grad_norm": 3.1901614665985107, + "learning_rate": 4.5816326530612245e-06, + "loss": 0.1962, + "step": 15110 + }, + { + "epoch": 77.14285714285714, + "grad_norm": 0.5199191570281982, + "learning_rate": 4.571428571428572e-06, + "loss": 0.1939, + "step": 15120 + }, + { + "epoch": 77.1938775510204, + "grad_norm": 32.647342681884766, + "learning_rate": 4.561224489795919e-06, + "loss": 0.1995, + "step": 15130 + }, + { + "epoch": 77.24489795918367, + "grad_norm": 3.1833088397979736, + "learning_rate": 4.551020408163266e-06, + "loss": 0.1963, + "step": 15140 + }, + { + "epoch": 77.29591836734694, + "grad_norm": 5.948197841644287, + "learning_rate": 4.540816326530613e-06, + "loss": 0.2977, + "step": 15150 + }, + { + "epoch": 77.34693877551021, + "grad_norm": 2.8567793369293213, + "learning_rate": 4.530612244897959e-06, + "loss": 0.1921, + "step": 15160 + }, + { + "epoch": 77.39795918367346, + "grad_norm": 33.681854248046875, + "learning_rate": 4.520408163265306e-06, + "loss": 0.266, + "step": 15170 + }, + { + "epoch": 77.44897959183673, + "grad_norm": 13.103612899780273, + "learning_rate": 4.5102040816326535e-06, + "loss": 0.1765, + "step": 15180 + }, + { + "epoch": 77.5, + "grad_norm": 33.5579833984375, + "learning_rate": 4.5e-06, + "loss": 0.6661, + "step": 15190 + }, + { + "epoch": 77.55102040816327, + "grad_norm": 24.5067138671875, + "learning_rate": 4.489795918367348e-06, + "loss": 0.3462, + "step": 15200 + }, + { + "epoch": 77.60204081632654, + "grad_norm": 24.81558609008789, + "learning_rate": 4.479591836734694e-06, + "loss": 0.3137, + "step": 15210 + }, + { + "epoch": 77.65306122448979, + "grad_norm": 1.0147420167922974, + "learning_rate": 4.469387755102041e-06, + "loss": 0.328, + "step": 15220 + }, + { + "epoch": 77.70408163265306, + "grad_norm": 10.42029094696045, + "learning_rate": 4.459183673469388e-06, + "loss": 0.0811, + "step": 15230 + }, + { + "epoch": 77.75510204081633, + "grad_norm": 8.624947547912598, + "learning_rate": 4.448979591836735e-06, + "loss": 0.1752, + "step": 15240 + }, + { + "epoch": 77.8061224489796, + "grad_norm": 22.081663131713867, + "learning_rate": 4.438775510204082e-06, + "loss": 0.3436, + "step": 15250 + }, + { + "epoch": 77.85714285714286, + "grad_norm": 33.639678955078125, + "learning_rate": 4.428571428571429e-06, + "loss": 0.2915, + "step": 15260 + }, + { + "epoch": 77.90816326530613, + "grad_norm": 2.287531614303589, + "learning_rate": 4.418367346938776e-06, + "loss": 0.1991, + "step": 15270 + }, + { + "epoch": 77.95918367346938, + "grad_norm": 32.962825775146484, + "learning_rate": 4.408163265306123e-06, + "loss": 0.2735, + "step": 15280 + }, + { + "epoch": 78.0, + "eval_accuracy": 0.8736462093862816, + "eval_loss": 0.524876594543457, + "eval_runtime": 0.999, + "eval_samples_per_second": 277.275, + "eval_steps_per_second": 35.035, + "step": 15288 + }, + { + "epoch": 78.01020408163265, + "grad_norm": 1.213417410850525, + "learning_rate": 4.39795918367347e-06, + "loss": 0.3544, + "step": 15290 + }, + { + "epoch": 78.06122448979592, + "grad_norm": 43.896305084228516, + "learning_rate": 4.3877551020408165e-06, + "loss": 0.3544, + "step": 15300 + }, + { + "epoch": 78.11224489795919, + "grad_norm": 3.0828168392181396, + "learning_rate": 4.377551020408163e-06, + "loss": 0.1293, + "step": 15310 + }, + { + "epoch": 78.16326530612245, + "grad_norm": 8.468568801879883, + "learning_rate": 4.367346938775511e-06, + "loss": 0.2771, + "step": 15320 + }, + { + "epoch": 78.21428571428571, + "grad_norm": 7.703556060791016, + "learning_rate": 4.357142857142857e-06, + "loss": 0.201, + "step": 15330 + }, + { + "epoch": 78.26530612244898, + "grad_norm": 20.330432891845703, + "learning_rate": 4.346938775510205e-06, + "loss": 0.2232, + "step": 15340 + }, + { + "epoch": 78.31632653061224, + "grad_norm": 16.423688888549805, + "learning_rate": 4.336734693877551e-06, + "loss": 0.0912, + "step": 15350 + }, + { + "epoch": 78.36734693877551, + "grad_norm": 21.406173706054688, + "learning_rate": 4.326530612244899e-06, + "loss": 0.3349, + "step": 15360 + }, + { + "epoch": 78.41836734693878, + "grad_norm": 0.4630574882030487, + "learning_rate": 4.3163265306122455e-06, + "loss": 0.1276, + "step": 15370 + }, + { + "epoch": 78.46938775510205, + "grad_norm": 15.888497352600098, + "learning_rate": 4.306122448979592e-06, + "loss": 0.1449, + "step": 15380 + }, + { + "epoch": 78.5204081632653, + "grad_norm": 24.146495819091797, + "learning_rate": 4.295918367346939e-06, + "loss": 0.2157, + "step": 15390 + }, + { + "epoch": 78.57142857142857, + "grad_norm": 25.034496307373047, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.3132, + "step": 15400 + }, + { + "epoch": 78.62244897959184, + "grad_norm": 10.221307754516602, + "learning_rate": 4.275510204081633e-06, + "loss": 0.3075, + "step": 15410 + }, + { + "epoch": 78.6734693877551, + "grad_norm": 11.892243385314941, + "learning_rate": 4.2653061224489804e-06, + "loss": 0.1132, + "step": 15420 + }, + { + "epoch": 78.72448979591837, + "grad_norm": 1.9759546518325806, + "learning_rate": 4.255102040816327e-06, + "loss": 0.1536, + "step": 15430 + }, + { + "epoch": 78.77551020408163, + "grad_norm": 23.30167007446289, + "learning_rate": 4.244897959183674e-06, + "loss": 0.1548, + "step": 15440 + }, + { + "epoch": 78.8265306122449, + "grad_norm": 3.072357416152954, + "learning_rate": 4.234693877551021e-06, + "loss": 0.1763, + "step": 15450 + }, + { + "epoch": 78.87755102040816, + "grad_norm": 0.7805272936820984, + "learning_rate": 4.224489795918368e-06, + "loss": 0.1927, + "step": 15460 + }, + { + "epoch": 78.92857142857143, + "grad_norm": 14.9546480178833, + "learning_rate": 4.2142857142857145e-06, + "loss": 0.3072, + "step": 15470 + }, + { + "epoch": 78.9795918367347, + "grad_norm": 22.33051109313965, + "learning_rate": 4.204081632653061e-06, + "loss": 0.2442, + "step": 15480 + }, + { + "epoch": 79.0, + "eval_accuracy": 0.8916967509025271, + "eval_loss": 0.5061302781105042, + "eval_runtime": 1.0058, + "eval_samples_per_second": 275.399, + "eval_steps_per_second": 34.798, + "step": 15484 + }, + { + "epoch": 79.03061224489795, + "grad_norm": 18.914520263671875, + "learning_rate": 4.193877551020409e-06, + "loss": 0.0868, + "step": 15490 + }, + { + "epoch": 79.08163265306122, + "grad_norm": 6.228423595428467, + "learning_rate": 4.183673469387755e-06, + "loss": 0.3356, + "step": 15500 + }, + { + "epoch": 79.13265306122449, + "grad_norm": 46.308528900146484, + "learning_rate": 4.173469387755103e-06, + "loss": 0.2509, + "step": 15510 + }, + { + "epoch": 79.18367346938776, + "grad_norm": 2.605499744415283, + "learning_rate": 4.163265306122449e-06, + "loss": 0.09, + "step": 15520 + }, + { + "epoch": 79.23469387755102, + "grad_norm": 14.905548095703125, + "learning_rate": 4.153061224489796e-06, + "loss": 0.1749, + "step": 15530 + }, + { + "epoch": 79.28571428571429, + "grad_norm": 8.328230857849121, + "learning_rate": 4.1428571428571435e-06, + "loss": 0.1318, + "step": 15540 + }, + { + "epoch": 79.33673469387755, + "grad_norm": 35.11759567260742, + "learning_rate": 4.13265306122449e-06, + "loss": 0.2182, + "step": 15550 + }, + { + "epoch": 79.38775510204081, + "grad_norm": 17.75996208190918, + "learning_rate": 4.122448979591837e-06, + "loss": 0.2355, + "step": 15560 + }, + { + "epoch": 79.43877551020408, + "grad_norm": 47.27717590332031, + "learning_rate": 4.112244897959184e-06, + "loss": 0.2948, + "step": 15570 + }, + { + "epoch": 79.48979591836735, + "grad_norm": 1.2012115716934204, + "learning_rate": 4.102040816326531e-06, + "loss": 0.2102, + "step": 15580 + }, + { + "epoch": 79.54081632653062, + "grad_norm": 30.00411033630371, + "learning_rate": 4.091836734693878e-06, + "loss": 0.1818, + "step": 15590 + }, + { + "epoch": 79.59183673469387, + "grad_norm": 3.9917380809783936, + "learning_rate": 4.081632653061225e-06, + "loss": 0.1209, + "step": 15600 + }, + { + "epoch": 79.64285714285714, + "grad_norm": 2.8822126388549805, + "learning_rate": 4.071428571428572e-06, + "loss": 0.1706, + "step": 15610 + }, + { + "epoch": 79.6938775510204, + "grad_norm": 1.6397266387939453, + "learning_rate": 4.061224489795918e-06, + "loss": 0.245, + "step": 15620 + }, + { + "epoch": 79.74489795918367, + "grad_norm": 0.2444908767938614, + "learning_rate": 4.051020408163266e-06, + "loss": 0.2015, + "step": 15630 + }, + { + "epoch": 79.79591836734694, + "grad_norm": 41.925331115722656, + "learning_rate": 4.040816326530612e-06, + "loss": 0.3276, + "step": 15640 + }, + { + "epoch": 79.84693877551021, + "grad_norm": 25.163707733154297, + "learning_rate": 4.03061224489796e-06, + "loss": 0.2702, + "step": 15650 + }, + { + "epoch": 79.89795918367346, + "grad_norm": 2.329144239425659, + "learning_rate": 4.0204081632653065e-06, + "loss": 0.2852, + "step": 15660 + }, + { + "epoch": 79.94897959183673, + "grad_norm": 8.50625228881836, + "learning_rate": 4.010204081632653e-06, + "loss": 0.257, + "step": 15670 + }, + { + "epoch": 80.0, + "grad_norm": 0.3259093463420868, + "learning_rate": 4.000000000000001e-06, + "loss": 0.2246, + "step": 15680 + }, + { + "epoch": 80.0, + "eval_accuracy": 0.8664259927797834, + "eval_loss": 0.4809911251068115, + "eval_runtime": 1.0054, + "eval_samples_per_second": 275.512, + "eval_steps_per_second": 34.812, + "step": 15680 + }, + { + "epoch": 80.05102040816327, + "grad_norm": 21.524263381958008, + "learning_rate": 3.989795918367347e-06, + "loss": 0.2428, + "step": 15690 + }, + { + "epoch": 80.10204081632654, + "grad_norm": 23.943897247314453, + "learning_rate": 3.979591836734694e-06, + "loss": 0.3094, + "step": 15700 + }, + { + "epoch": 80.15306122448979, + "grad_norm": 6.069556713104248, + "learning_rate": 3.969387755102041e-06, + "loss": 0.0477, + "step": 15710 + }, + { + "epoch": 80.20408163265306, + "grad_norm": 4.9373650550842285, + "learning_rate": 3.959183673469388e-06, + "loss": 0.3088, + "step": 15720 + }, + { + "epoch": 80.25510204081633, + "grad_norm": 43.122920989990234, + "learning_rate": 3.9489795918367356e-06, + "loss": 0.2121, + "step": 15730 + }, + { + "epoch": 80.3061224489796, + "grad_norm": 19.9013671875, + "learning_rate": 3.938775510204082e-06, + "loss": 0.2149, + "step": 15740 + }, + { + "epoch": 80.35714285714286, + "grad_norm": 1.47056245803833, + "learning_rate": 3.928571428571429e-06, + "loss": 0.1739, + "step": 15750 + }, + { + "epoch": 80.40816326530613, + "grad_norm": 6.8874688148498535, + "learning_rate": 3.9183673469387755e-06, + "loss": 0.2165, + "step": 15760 + }, + { + "epoch": 80.45918367346938, + "grad_norm": 28.98600959777832, + "learning_rate": 3.908163265306123e-06, + "loss": 0.3437, + "step": 15770 + }, + { + "epoch": 80.51020408163265, + "grad_norm": 1.1088128089904785, + "learning_rate": 3.89795918367347e-06, + "loss": 0.1941, + "step": 15780 + }, + { + "epoch": 80.56122448979592, + "grad_norm": 1.074175238609314, + "learning_rate": 3.887755102040816e-06, + "loss": 0.4111, + "step": 15790 + }, + { + "epoch": 80.61224489795919, + "grad_norm": 21.87729263305664, + "learning_rate": 3.877551020408164e-06, + "loss": 0.1257, + "step": 15800 + }, + { + "epoch": 80.66326530612245, + "grad_norm": 10.955086708068848, + "learning_rate": 3.86734693877551e-06, + "loss": 0.2362, + "step": 15810 + }, + { + "epoch": 80.71428571428571, + "grad_norm": 24.620656967163086, + "learning_rate": 3.857142857142858e-06, + "loss": 0.1977, + "step": 15820 + }, + { + "epoch": 80.76530612244898, + "grad_norm": 11.403874397277832, + "learning_rate": 3.8469387755102045e-06, + "loss": 0.2498, + "step": 15830 + }, + { + "epoch": 80.81632653061224, + "grad_norm": 1.8278330564498901, + "learning_rate": 3.836734693877551e-06, + "loss": 0.2067, + "step": 15840 + }, + { + "epoch": 80.86734693877551, + "grad_norm": 4.380407810211182, + "learning_rate": 3.826530612244898e-06, + "loss": 0.1851, + "step": 15850 + }, + { + "epoch": 80.91836734693878, + "grad_norm": 5.616203308105469, + "learning_rate": 3.816326530612245e-06, + "loss": 0.1859, + "step": 15860 + }, + { + "epoch": 80.96938775510205, + "grad_norm": 7.985535621643066, + "learning_rate": 3.8061224489795923e-06, + "loss": 0.3557, + "step": 15870 + }, + { + "epoch": 81.0, + "eval_accuracy": 0.8122743682310469, + "eval_loss": 0.6420153379440308, + "eval_runtime": 0.9975, + "eval_samples_per_second": 277.699, + "eval_steps_per_second": 35.088, + "step": 15876 + }, + { + "epoch": 81.0204081632653, + "grad_norm": 25.5210018157959, + "learning_rate": 3.795918367346939e-06, + "loss": 0.3854, + "step": 15880 + }, + { + "epoch": 81.07142857142857, + "grad_norm": 1.7337543964385986, + "learning_rate": 3.785714285714286e-06, + "loss": 0.2927, + "step": 15890 + }, + { + "epoch": 81.12244897959184, + "grad_norm": 7.199073791503906, + "learning_rate": 3.7755102040816327e-06, + "loss": 0.1946, + "step": 15900 + }, + { + "epoch": 81.1734693877551, + "grad_norm": 1.877372145652771, + "learning_rate": 3.76530612244898e-06, + "loss": 0.1716, + "step": 15910 + }, + { + "epoch": 81.22448979591837, + "grad_norm": 1.424315094947815, + "learning_rate": 3.7551020408163268e-06, + "loss": 0.1721, + "step": 15920 + }, + { + "epoch": 81.27551020408163, + "grad_norm": 0.08887893706560135, + "learning_rate": 3.744897959183674e-06, + "loss": 0.117, + "step": 15930 + }, + { + "epoch": 81.3265306122449, + "grad_norm": 34.33686447143555, + "learning_rate": 3.7346938775510205e-06, + "loss": 0.4972, + "step": 15940 + }, + { + "epoch": 81.37755102040816, + "grad_norm": 1.4418655633926392, + "learning_rate": 3.724489795918368e-06, + "loss": 0.119, + "step": 15950 + }, + { + "epoch": 81.42857142857143, + "grad_norm": 10.43326473236084, + "learning_rate": 3.7142857142857146e-06, + "loss": 0.1936, + "step": 15960 + }, + { + "epoch": 81.4795918367347, + "grad_norm": 6.831284523010254, + "learning_rate": 3.7040816326530617e-06, + "loss": 0.3134, + "step": 15970 + }, + { + "epoch": 81.53061224489795, + "grad_norm": 8.005436897277832, + "learning_rate": 3.6938775510204083e-06, + "loss": 0.1225, + "step": 15980 + }, + { + "epoch": 81.58163265306122, + "grad_norm": 8.023073196411133, + "learning_rate": 3.6836734693877554e-06, + "loss": 0.3182, + "step": 15990 + }, + { + "epoch": 81.63265306122449, + "grad_norm": 9.110128402709961, + "learning_rate": 3.6734693877551024e-06, + "loss": 0.3201, + "step": 16000 + }, + { + "epoch": 81.68367346938776, + "grad_norm": 0.1990327537059784, + "learning_rate": 3.6632653061224495e-06, + "loss": 0.0982, + "step": 16010 + }, + { + "epoch": 81.73469387755102, + "grad_norm": 34.50798034667969, + "learning_rate": 3.653061224489796e-06, + "loss": 0.1312, + "step": 16020 + }, + { + "epoch": 81.78571428571429, + "grad_norm": 3.944993257522583, + "learning_rate": 3.642857142857143e-06, + "loss": 0.2776, + "step": 16030 + }, + { + "epoch": 81.83673469387755, + "grad_norm": 3.5682852268218994, + "learning_rate": 3.6326530612244903e-06, + "loss": 0.2794, + "step": 16040 + }, + { + "epoch": 81.88775510204081, + "grad_norm": 3.840916872024536, + "learning_rate": 3.6224489795918373e-06, + "loss": 0.2495, + "step": 16050 + }, + { + "epoch": 81.93877551020408, + "grad_norm": 10.819250106811523, + "learning_rate": 3.612244897959184e-06, + "loss": 0.1697, + "step": 16060 + }, + { + "epoch": 81.98979591836735, + "grad_norm": 21.10453224182129, + "learning_rate": 3.6020408163265306e-06, + "loss": 0.2017, + "step": 16070 + }, + { + "epoch": 82.0, + "eval_accuracy": 0.8844765342960289, + "eval_loss": 0.5157850384712219, + "eval_runtime": 0.9992, + "eval_samples_per_second": 277.223, + "eval_steps_per_second": 35.028, + "step": 16072 + }, + { + "epoch": 82.04081632653062, + "grad_norm": 2.5481784343719482, + "learning_rate": 3.5918367346938777e-06, + "loss": 0.1156, + "step": 16080 + }, + { + "epoch": 82.09183673469387, + "grad_norm": 2.878183603286743, + "learning_rate": 3.581632653061225e-06, + "loss": 0.1873, + "step": 16090 + }, + { + "epoch": 82.14285714285714, + "grad_norm": 3.8095788955688477, + "learning_rate": 3.5714285714285718e-06, + "loss": 0.0686, + "step": 16100 + }, + { + "epoch": 82.1938775510204, + "grad_norm": 1.6254216432571411, + "learning_rate": 3.5612244897959184e-06, + "loss": 0.135, + "step": 16110 + }, + { + "epoch": 82.24489795918367, + "grad_norm": 44.448612213134766, + "learning_rate": 3.5510204081632655e-06, + "loss": 0.4995, + "step": 16120 + }, + { + "epoch": 82.29591836734694, + "grad_norm": 3.154118776321411, + "learning_rate": 3.540816326530613e-06, + "loss": 0.1962, + "step": 16130 + }, + { + "epoch": 82.34693877551021, + "grad_norm": 31.657968521118164, + "learning_rate": 3.5306122448979596e-06, + "loss": 0.2069, + "step": 16140 + }, + { + "epoch": 82.39795918367346, + "grad_norm": 20.364303588867188, + "learning_rate": 3.5204081632653062e-06, + "loss": 0.397, + "step": 16150 + }, + { + "epoch": 82.44897959183673, + "grad_norm": 37.04352569580078, + "learning_rate": 3.5102040816326533e-06, + "loss": 0.2006, + "step": 16160 + }, + { + "epoch": 82.5, + "grad_norm": 7.28590202331543, + "learning_rate": 3.5e-06, + "loss": 0.1641, + "step": 16170 + }, + { + "epoch": 82.55102040816327, + "grad_norm": 37.455291748046875, + "learning_rate": 3.4897959183673474e-06, + "loss": 0.2405, + "step": 16180 + }, + { + "epoch": 82.60204081632654, + "grad_norm": 9.498440742492676, + "learning_rate": 3.479591836734694e-06, + "loss": 0.2517, + "step": 16190 + }, + { + "epoch": 82.65306122448979, + "grad_norm": 7.089411735534668, + "learning_rate": 3.469387755102041e-06, + "loss": 0.1783, + "step": 16200 + }, + { + "epoch": 82.70408163265306, + "grad_norm": 46.47587966918945, + "learning_rate": 3.4591836734693878e-06, + "loss": 0.2832, + "step": 16210 + }, + { + "epoch": 82.75510204081633, + "grad_norm": 65.72367858886719, + "learning_rate": 3.4489795918367353e-06, + "loss": 0.392, + "step": 16220 + }, + { + "epoch": 82.8061224489796, + "grad_norm": 21.230134963989258, + "learning_rate": 3.438775510204082e-06, + "loss": 0.1261, + "step": 16230 + }, + { + "epoch": 82.85714285714286, + "grad_norm": 17.535839080810547, + "learning_rate": 3.428571428571429e-06, + "loss": 0.1801, + "step": 16240 + }, + { + "epoch": 82.90816326530613, + "grad_norm": 1.463416337966919, + "learning_rate": 3.4183673469387756e-06, + "loss": 0.1091, + "step": 16250 + }, + { + "epoch": 82.95918367346938, + "grad_norm": 5.50378942489624, + "learning_rate": 3.4081632653061227e-06, + "loss": 0.249, + "step": 16260 + }, + { + "epoch": 83.0, + "eval_accuracy": 0.9025270758122743, + "eval_loss": 0.4363895058631897, + "eval_runtime": 1.0171, + "eval_samples_per_second": 272.333, + "eval_steps_per_second": 34.41, + "step": 16268 + }, + { + "epoch": 83.01020408163265, + "grad_norm": 15.178898811340332, + "learning_rate": 3.3979591836734697e-06, + "loss": 0.1123, + "step": 16270 + }, + { + "epoch": 83.06122448979592, + "grad_norm": 2.882610559463501, + "learning_rate": 3.3877551020408168e-06, + "loss": 0.1501, + "step": 16280 + }, + { + "epoch": 83.11224489795919, + "grad_norm": 4.557912349700928, + "learning_rate": 3.3775510204081634e-06, + "loss": 0.3885, + "step": 16290 + }, + { + "epoch": 83.16326530612245, + "grad_norm": 1.1619328260421753, + "learning_rate": 3.3673469387755105e-06, + "loss": 0.1274, + "step": 16300 + }, + { + "epoch": 83.21428571428571, + "grad_norm": 34.344322204589844, + "learning_rate": 3.357142857142857e-06, + "loss": 0.2945, + "step": 16310 + }, + { + "epoch": 83.26530612244898, + "grad_norm": 0.5423146486282349, + "learning_rate": 3.3469387755102046e-06, + "loss": 0.1279, + "step": 16320 + }, + { + "epoch": 83.31632653061224, + "grad_norm": 2.986043691635132, + "learning_rate": 3.3367346938775513e-06, + "loss": 0.329, + "step": 16330 + }, + { + "epoch": 83.36734693877551, + "grad_norm": 32.15123748779297, + "learning_rate": 3.3265306122448983e-06, + "loss": 0.2952, + "step": 16340 + }, + { + "epoch": 83.41836734693878, + "grad_norm": 24.476350784301758, + "learning_rate": 3.316326530612245e-06, + "loss": 0.2733, + "step": 16350 + }, + { + "epoch": 83.46938775510205, + "grad_norm": 12.141332626342773, + "learning_rate": 3.3061224489795924e-06, + "loss": 0.0771, + "step": 16360 + }, + { + "epoch": 83.5204081632653, + "grad_norm": 27.662635803222656, + "learning_rate": 3.295918367346939e-06, + "loss": 0.2778, + "step": 16370 + }, + { + "epoch": 83.57142857142857, + "grad_norm": 35.881202697753906, + "learning_rate": 3.285714285714286e-06, + "loss": 0.3093, + "step": 16380 + }, + { + "epoch": 83.62244897959184, + "grad_norm": 3.4644417762756348, + "learning_rate": 3.2755102040816328e-06, + "loss": 0.2749, + "step": 16390 + }, + { + "epoch": 83.6734693877551, + "grad_norm": 6.118886947631836, + "learning_rate": 3.2653061224489794e-06, + "loss": 0.1706, + "step": 16400 + }, + { + "epoch": 83.72448979591837, + "grad_norm": 18.26921272277832, + "learning_rate": 3.255102040816327e-06, + "loss": 0.1031, + "step": 16410 + }, + { + "epoch": 83.77551020408163, + "grad_norm": 4.544955253601074, + "learning_rate": 3.244897959183674e-06, + "loss": 0.2469, + "step": 16420 + }, + { + "epoch": 83.8265306122449, + "grad_norm": 10.462955474853516, + "learning_rate": 3.2346938775510206e-06, + "loss": 0.2638, + "step": 16430 + }, + { + "epoch": 83.87755102040816, + "grad_norm": 2.0833020210266113, + "learning_rate": 3.2244897959183672e-06, + "loss": 0.0702, + "step": 16440 + }, + { + "epoch": 83.92857142857143, + "grad_norm": 12.66522216796875, + "learning_rate": 3.2142857142857147e-06, + "loss": 0.1751, + "step": 16450 + }, + { + "epoch": 83.9795918367347, + "grad_norm": 3.683253288269043, + "learning_rate": 3.204081632653062e-06, + "loss": 0.2566, + "step": 16460 + }, + { + "epoch": 84.0, + "eval_accuracy": 0.8736462093862816, + "eval_loss": 0.5507027506828308, + "eval_runtime": 1.0022, + "eval_samples_per_second": 276.381, + "eval_steps_per_second": 34.922, + "step": 16464 + }, + { + "epoch": 84.03061224489795, + "grad_norm": 23.517208099365234, + "learning_rate": 3.1938775510204084e-06, + "loss": 0.1412, + "step": 16470 + }, + { + "epoch": 84.08163265306122, + "grad_norm": 11.195033073425293, + "learning_rate": 3.183673469387755e-06, + "loss": 0.0783, + "step": 16480 + }, + { + "epoch": 84.13265306122449, + "grad_norm": 0.780221700668335, + "learning_rate": 3.173469387755102e-06, + "loss": 0.2326, + "step": 16490 + }, + { + "epoch": 84.18367346938776, + "grad_norm": 24.656831741333008, + "learning_rate": 3.1632653061224496e-06, + "loss": 0.2802, + "step": 16500 + }, + { + "epoch": 84.23469387755102, + "grad_norm": 25.569185256958008, + "learning_rate": 3.1530612244897963e-06, + "loss": 0.3485, + "step": 16510 + }, + { + "epoch": 84.28571428571429, + "grad_norm": 37.62729263305664, + "learning_rate": 3.142857142857143e-06, + "loss": 0.1441, + "step": 16520 + }, + { + "epoch": 84.33673469387755, + "grad_norm": 8.158026695251465, + "learning_rate": 3.13265306122449e-06, + "loss": 0.142, + "step": 16530 + }, + { + "epoch": 84.38775510204081, + "grad_norm": 16.40460205078125, + "learning_rate": 3.1224489795918374e-06, + "loss": 0.1415, + "step": 16540 + }, + { + "epoch": 84.43877551020408, + "grad_norm": 41.52944564819336, + "learning_rate": 3.112244897959184e-06, + "loss": 0.4017, + "step": 16550 + }, + { + "epoch": 84.48979591836735, + "grad_norm": 0.4954560399055481, + "learning_rate": 3.1020408163265307e-06, + "loss": 0.2047, + "step": 16560 + }, + { + "epoch": 84.54081632653062, + "grad_norm": 30.724458694458008, + "learning_rate": 3.0918367346938778e-06, + "loss": 0.2093, + "step": 16570 + }, + { + "epoch": 84.59183673469387, + "grad_norm": 6.768970012664795, + "learning_rate": 3.0816326530612244e-06, + "loss": 0.3113, + "step": 16580 + }, + { + "epoch": 84.64285714285714, + "grad_norm": 7.989703178405762, + "learning_rate": 3.071428571428572e-06, + "loss": 0.1392, + "step": 16590 + }, + { + "epoch": 84.6938775510204, + "grad_norm": 18.151731491088867, + "learning_rate": 3.0612244897959185e-06, + "loss": 0.2743, + "step": 16600 + }, + { + "epoch": 84.74489795918367, + "grad_norm": 0.533332109451294, + "learning_rate": 3.0510204081632656e-06, + "loss": 0.2133, + "step": 16610 + }, + { + "epoch": 84.79591836734694, + "grad_norm": 34.20014953613281, + "learning_rate": 3.0408163265306122e-06, + "loss": 0.189, + "step": 16620 + }, + { + "epoch": 84.84693877551021, + "grad_norm": 1.158712387084961, + "learning_rate": 3.0306122448979597e-06, + "loss": 0.3725, + "step": 16630 + }, + { + "epoch": 84.89795918367346, + "grad_norm": 0.5959182381629944, + "learning_rate": 3.0204081632653064e-06, + "loss": 0.1986, + "step": 16640 + }, + { + "epoch": 84.94897959183673, + "grad_norm": 12.45845890045166, + "learning_rate": 3.0102040816326534e-06, + "loss": 0.1937, + "step": 16650 + }, + { + "epoch": 85.0, + "grad_norm": 7.251813888549805, + "learning_rate": 3e-06, + "loss": 0.1012, + "step": 16660 + }, + { + "epoch": 85.0, + "eval_accuracy": 0.8844765342960289, + "eval_loss": 0.47279682755470276, + "eval_runtime": 0.998, + "eval_samples_per_second": 277.567, + "eval_steps_per_second": 35.072, + "step": 16660 + }, + { + "epoch": 85.05102040816327, + "grad_norm": 11.385180473327637, + "learning_rate": 2.989795918367347e-06, + "loss": 0.1869, + "step": 16670 + }, + { + "epoch": 85.10204081632654, + "grad_norm": 1.974629282951355, + "learning_rate": 2.979591836734694e-06, + "loss": 0.0938, + "step": 16680 + }, + { + "epoch": 85.15306122448979, + "grad_norm": 10.089799880981445, + "learning_rate": 2.9693877551020413e-06, + "loss": 0.1943, + "step": 16690 + }, + { + "epoch": 85.20408163265306, + "grad_norm": 9.131400108337402, + "learning_rate": 2.959183673469388e-06, + "loss": 0.2869, + "step": 16700 + }, + { + "epoch": 85.25510204081633, + "grad_norm": 5.9185662269592285, + "learning_rate": 2.948979591836735e-06, + "loss": 0.2402, + "step": 16710 + }, + { + "epoch": 85.3061224489796, + "grad_norm": 5.765443325042725, + "learning_rate": 2.938775510204082e-06, + "loss": 0.1159, + "step": 16720 + }, + { + "epoch": 85.35714285714286, + "grad_norm": 39.08930206298828, + "learning_rate": 2.928571428571429e-06, + "loss": 0.1996, + "step": 16730 + }, + { + "epoch": 85.40816326530613, + "grad_norm": 0.24387198686599731, + "learning_rate": 2.9183673469387757e-06, + "loss": 0.2449, + "step": 16740 + }, + { + "epoch": 85.45918367346938, + "grad_norm": 1.3607583045959473, + "learning_rate": 2.908163265306123e-06, + "loss": 0.2389, + "step": 16750 + }, + { + "epoch": 85.51020408163265, + "grad_norm": 10.419044494628906, + "learning_rate": 2.8979591836734694e-06, + "loss": 0.087, + "step": 16760 + }, + { + "epoch": 85.56122448979592, + "grad_norm": 35.22848892211914, + "learning_rate": 2.887755102040817e-06, + "loss": 0.4144, + "step": 16770 + }, + { + "epoch": 85.61224489795919, + "grad_norm": 5.821999549865723, + "learning_rate": 2.8775510204081636e-06, + "loss": 0.6079, + "step": 16780 + }, + { + "epoch": 85.66326530612245, + "grad_norm": 0.17407262325286865, + "learning_rate": 2.86734693877551e-06, + "loss": 0.3179, + "step": 16790 + }, + { + "epoch": 85.71428571428571, + "grad_norm": 5.639936447143555, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.298, + "step": 16800 + }, + { + "epoch": 85.76530612244898, + "grad_norm": 9.247851371765137, + "learning_rate": 2.8469387755102047e-06, + "loss": 0.3814, + "step": 16810 + }, + { + "epoch": 85.81632653061224, + "grad_norm": 30.942747116088867, + "learning_rate": 2.8367346938775514e-06, + "loss": 0.3325, + "step": 16820 + }, + { + "epoch": 85.86734693877551, + "grad_norm": 9.413663864135742, + "learning_rate": 2.826530612244898e-06, + "loss": 0.0306, + "step": 16830 + }, + { + "epoch": 85.91836734693878, + "grad_norm": 1.2761138677597046, + "learning_rate": 2.816326530612245e-06, + "loss": 0.0712, + "step": 16840 + }, + { + "epoch": 85.96938775510205, + "grad_norm": 25.14208984375, + "learning_rate": 2.8061224489795917e-06, + "loss": 0.1972, + "step": 16850 + }, + { + "epoch": 86.0, + "eval_accuracy": 0.8808664259927798, + "eval_loss": 0.5746248364448547, + "eval_runtime": 1.0189, + "eval_samples_per_second": 271.869, + "eval_steps_per_second": 34.352, + "step": 16856 + }, + { + "epoch": 86.0204081632653, + "grad_norm": 12.963753700256348, + "learning_rate": 2.795918367346939e-06, + "loss": 0.1492, + "step": 16860 + }, + { + "epoch": 86.07142857142857, + "grad_norm": 27.61210060119629, + "learning_rate": 2.785714285714286e-06, + "loss": 0.3554, + "step": 16870 + }, + { + "epoch": 86.12244897959184, + "grad_norm": 29.644655227661133, + "learning_rate": 2.775510204081633e-06, + "loss": 0.1217, + "step": 16880 + }, + { + "epoch": 86.1734693877551, + "grad_norm": 23.89996910095215, + "learning_rate": 2.7653061224489795e-06, + "loss": 0.1276, + "step": 16890 + }, + { + "epoch": 86.22448979591837, + "grad_norm": 28.514251708984375, + "learning_rate": 2.7551020408163266e-06, + "loss": 0.1826, + "step": 16900 + }, + { + "epoch": 86.27551020408163, + "grad_norm": 8.876848220825195, + "learning_rate": 2.7448979591836737e-06, + "loss": 0.0892, + "step": 16910 + }, + { + "epoch": 86.3265306122449, + "grad_norm": 37.74919509887695, + "learning_rate": 2.7346938775510207e-06, + "loss": 0.3094, + "step": 16920 + }, + { + "epoch": 86.37755102040816, + "grad_norm": 1.1519695520401, + "learning_rate": 2.7244897959183674e-06, + "loss": 0.2463, + "step": 16930 + }, + { + "epoch": 86.42857142857143, + "grad_norm": 28.24508285522461, + "learning_rate": 2.7142857142857144e-06, + "loss": 0.1384, + "step": 16940 + }, + { + "epoch": 86.4795918367347, + "grad_norm": 10.30151081085205, + "learning_rate": 2.7040816326530615e-06, + "loss": 0.1481, + "step": 16950 + }, + { + "epoch": 86.53061224489795, + "grad_norm": 4.71318244934082, + "learning_rate": 2.6938775510204086e-06, + "loss": 0.2336, + "step": 16960 + }, + { + "epoch": 86.58163265306122, + "grad_norm": 6.319276809692383, + "learning_rate": 2.683673469387755e-06, + "loss": 0.5202, + "step": 16970 + }, + { + "epoch": 86.63265306122449, + "grad_norm": 17.20753288269043, + "learning_rate": 2.6734693877551023e-06, + "loss": 0.1363, + "step": 16980 + }, + { + "epoch": 86.68367346938776, + "grad_norm": 30.989686965942383, + "learning_rate": 2.663265306122449e-06, + "loss": 0.3424, + "step": 16990 + }, + { + "epoch": 86.73469387755102, + "grad_norm": 0.9678404331207275, + "learning_rate": 2.6530612244897964e-06, + "loss": 0.3842, + "step": 17000 + }, + { + "epoch": 86.78571428571429, + "grad_norm": 22.190399169921875, + "learning_rate": 2.642857142857143e-06, + "loss": 0.2099, + "step": 17010 + }, + { + "epoch": 86.83673469387755, + "grad_norm": 6.763099193572998, + "learning_rate": 2.63265306122449e-06, + "loss": 0.1056, + "step": 17020 + }, + { + "epoch": 86.88775510204081, + "grad_norm": 28.219539642333984, + "learning_rate": 2.6224489795918367e-06, + "loss": 0.2426, + "step": 17030 + }, + { + "epoch": 86.93877551020408, + "grad_norm": 0.8934134840965271, + "learning_rate": 2.6122448979591842e-06, + "loss": 0.1343, + "step": 17040 + }, + { + "epoch": 86.98979591836735, + "grad_norm": 46.49623107910156, + "learning_rate": 2.602040816326531e-06, + "loss": 0.7922, + "step": 17050 + }, + { + "epoch": 87.0, + "eval_accuracy": 0.8628158844765343, + "eval_loss": 0.5261682271957397, + "eval_runtime": 1.0277, + "eval_samples_per_second": 269.541, + "eval_steps_per_second": 34.057, + "step": 17052 + }, + { + "epoch": 87.04081632653062, + "grad_norm": 0.35685351490974426, + "learning_rate": 2.591836734693878e-06, + "loss": 0.2127, + "step": 17060 + }, + { + "epoch": 87.09183673469387, + "grad_norm": 0.9195706248283386, + "learning_rate": 2.5816326530612246e-06, + "loss": 0.241, + "step": 17070 + }, + { + "epoch": 87.14285714285714, + "grad_norm": 49.179832458496094, + "learning_rate": 2.571428571428571e-06, + "loss": 0.4003, + "step": 17080 + }, + { + "epoch": 87.1938775510204, + "grad_norm": 39.513450622558594, + "learning_rate": 2.5612244897959187e-06, + "loss": 0.2338, + "step": 17090 + }, + { + "epoch": 87.24489795918367, + "grad_norm": 19.85643196105957, + "learning_rate": 2.5510204081632657e-06, + "loss": 0.5628, + "step": 17100 + }, + { + "epoch": 87.29591836734694, + "grad_norm": 22.615419387817383, + "learning_rate": 2.5408163265306124e-06, + "loss": 0.2576, + "step": 17110 + }, + { + "epoch": 87.34693877551021, + "grad_norm": 6.284287452697754, + "learning_rate": 2.530612244897959e-06, + "loss": 0.0865, + "step": 17120 + }, + { + "epoch": 87.39795918367346, + "grad_norm": 0.733860194683075, + "learning_rate": 2.5204081632653065e-06, + "loss": 0.0137, + "step": 17130 + }, + { + "epoch": 87.44897959183673, + "grad_norm": 7.231849670410156, + "learning_rate": 2.5102040816326536e-06, + "loss": 0.1917, + "step": 17140 + }, + { + "epoch": 87.5, + "grad_norm": 0.44974711537361145, + "learning_rate": 2.5e-06, + "loss": 0.1283, + "step": 17150 + }, + { + "epoch": 87.55102040816327, + "grad_norm": 29.02192497253418, + "learning_rate": 2.489795918367347e-06, + "loss": 0.2555, + "step": 17160 + }, + { + "epoch": 87.60204081632654, + "grad_norm": 23.775123596191406, + "learning_rate": 2.479591836734694e-06, + "loss": 0.3265, + "step": 17170 + }, + { + "epoch": 87.65306122448979, + "grad_norm": 29.504384994506836, + "learning_rate": 2.469387755102041e-06, + "loss": 0.2368, + "step": 17180 + }, + { + "epoch": 87.70408163265306, + "grad_norm": 3.761201858520508, + "learning_rate": 2.459183673469388e-06, + "loss": 0.1903, + "step": 17190 + }, + { + "epoch": 87.75510204081633, + "grad_norm": 2.1016347408294678, + "learning_rate": 2.4489795918367347e-06, + "loss": 0.1154, + "step": 17200 + }, + { + "epoch": 87.8061224489796, + "grad_norm": 26.894195556640625, + "learning_rate": 2.4387755102040817e-06, + "loss": 0.2784, + "step": 17210 + }, + { + "epoch": 87.85714285714286, + "grad_norm": 3.392759323120117, + "learning_rate": 2.428571428571429e-06, + "loss": 0.324, + "step": 17220 + }, + { + "epoch": 87.90816326530613, + "grad_norm": 3.55159854888916, + "learning_rate": 2.418367346938776e-06, + "loss": 0.1331, + "step": 17230 + }, + { + "epoch": 87.95918367346938, + "grad_norm": 4.807551383972168, + "learning_rate": 2.4081632653061225e-06, + "loss": 0.1229, + "step": 17240 + }, + { + "epoch": 88.0, + "eval_accuracy": 0.8844765342960289, + "eval_loss": 0.6292526125907898, + "eval_runtime": 1.011, + "eval_samples_per_second": 273.979, + "eval_steps_per_second": 34.618, + "step": 17248 + }, + { + "epoch": 88.01020408163265, + "grad_norm": 0.5412343740463257, + "learning_rate": 2.3979591836734696e-06, + "loss": 0.2111, + "step": 17250 + }, + { + "epoch": 88.06122448979592, + "grad_norm": 16.970882415771484, + "learning_rate": 2.3877551020408166e-06, + "loss": 0.1973, + "step": 17260 + }, + { + "epoch": 88.11224489795919, + "grad_norm": 22.15845489501953, + "learning_rate": 2.3775510204081633e-06, + "loss": 0.1096, + "step": 17270 + }, + { + "epoch": 88.16326530612245, + "grad_norm": 14.04793643951416, + "learning_rate": 2.3673469387755103e-06, + "loss": 0.1496, + "step": 17280 + }, + { + "epoch": 88.21428571428571, + "grad_norm": 27.746109008789062, + "learning_rate": 2.3571428571428574e-06, + "loss": 0.1301, + "step": 17290 + }, + { + "epoch": 88.26530612244898, + "grad_norm": 33.9559440612793, + "learning_rate": 2.3469387755102044e-06, + "loss": 0.2768, + "step": 17300 + }, + { + "epoch": 88.31632653061224, + "grad_norm": 10.501508712768555, + "learning_rate": 2.336734693877551e-06, + "loss": 0.3422, + "step": 17310 + }, + { + "epoch": 88.36734693877551, + "grad_norm": 1.2863095998764038, + "learning_rate": 2.326530612244898e-06, + "loss": 0.2606, + "step": 17320 + }, + { + "epoch": 88.41836734693878, + "grad_norm": 13.803751945495605, + "learning_rate": 2.316326530612245e-06, + "loss": 0.1704, + "step": 17330 + }, + { + "epoch": 88.46938775510205, + "grad_norm": 2.0353987216949463, + "learning_rate": 2.306122448979592e-06, + "loss": 0.7119, + "step": 17340 + }, + { + "epoch": 88.5204081632653, + "grad_norm": 2.4765756130218506, + "learning_rate": 2.295918367346939e-06, + "loss": 0.2162, + "step": 17350 + }, + { + "epoch": 88.57142857142857, + "grad_norm": 24.268665313720703, + "learning_rate": 2.285714285714286e-06, + "loss": 0.1102, + "step": 17360 + }, + { + "epoch": 88.62244897959184, + "grad_norm": 36.384212493896484, + "learning_rate": 2.275510204081633e-06, + "loss": 0.3281, + "step": 17370 + }, + { + "epoch": 88.6734693877551, + "grad_norm": 1.235011339187622, + "learning_rate": 2.2653061224489797e-06, + "loss": 0.3715, + "step": 17380 + }, + { + "epoch": 88.72448979591837, + "grad_norm": 27.002023696899414, + "learning_rate": 2.2551020408163267e-06, + "loss": 0.2962, + "step": 17390 + }, + { + "epoch": 88.77551020408163, + "grad_norm": 0.3913618326187134, + "learning_rate": 2.244897959183674e-06, + "loss": 0.2142, + "step": 17400 + }, + { + "epoch": 88.8265306122449, + "grad_norm": 0.9241206049919128, + "learning_rate": 2.2346938775510204e-06, + "loss": 0.1781, + "step": 17410 + }, + { + "epoch": 88.87755102040816, + "grad_norm": 12.835947036743164, + "learning_rate": 2.2244897959183675e-06, + "loss": 0.3585, + "step": 17420 + }, + { + "epoch": 88.92857142857143, + "grad_norm": 0.17079252004623413, + "learning_rate": 2.2142857142857146e-06, + "loss": 0.0749, + "step": 17430 + }, + { + "epoch": 88.9795918367347, + "grad_norm": 0.8527528047561646, + "learning_rate": 2.2040816326530616e-06, + "loss": 0.0248, + "step": 17440 + }, + { + "epoch": 89.0, + "eval_accuracy": 0.8880866425992779, + "eval_loss": 0.6192805171012878, + "eval_runtime": 1.008, + "eval_samples_per_second": 274.807, + "eval_steps_per_second": 34.723, + "step": 17444 + }, + { + "epoch": 89.03061224489795, + "grad_norm": 28.320755004882812, + "learning_rate": 2.1938775510204083e-06, + "loss": 0.2473, + "step": 17450 + }, + { + "epoch": 89.08163265306122, + "grad_norm": 2.1770079135894775, + "learning_rate": 2.1836734693877553e-06, + "loss": 0.2011, + "step": 17460 + }, + { + "epoch": 89.13265306122449, + "grad_norm": 13.582273483276367, + "learning_rate": 2.1734693877551024e-06, + "loss": 0.4687, + "step": 17470 + }, + { + "epoch": 89.18367346938776, + "grad_norm": 13.654045104980469, + "learning_rate": 2.1632653061224495e-06, + "loss": 0.1745, + "step": 17480 + }, + { + "epoch": 89.23469387755102, + "grad_norm": 1.2029261589050293, + "learning_rate": 2.153061224489796e-06, + "loss": 0.3699, + "step": 17490 + }, + { + "epoch": 89.28571428571429, + "grad_norm": 3.5786337852478027, + "learning_rate": 2.1428571428571427e-06, + "loss": 0.2779, + "step": 17500 + }, + { + "epoch": 89.33673469387755, + "grad_norm": 2.0610575675964355, + "learning_rate": 2.1326530612244902e-06, + "loss": 0.1861, + "step": 17510 + }, + { + "epoch": 89.38775510204081, + "grad_norm": 0.6334846615791321, + "learning_rate": 2.122448979591837e-06, + "loss": 0.1324, + "step": 17520 + }, + { + "epoch": 89.43877551020408, + "grad_norm": 14.662342071533203, + "learning_rate": 2.112244897959184e-06, + "loss": 0.277, + "step": 17530 + }, + { + "epoch": 89.48979591836735, + "grad_norm": 1.0749694108963013, + "learning_rate": 2.1020408163265306e-06, + "loss": 0.1239, + "step": 17540 + }, + { + "epoch": 89.54081632653062, + "grad_norm": 42.486698150634766, + "learning_rate": 2.0918367346938776e-06, + "loss": 0.1603, + "step": 17550 + }, + { + "epoch": 89.59183673469387, + "grad_norm": 0.9968705177307129, + "learning_rate": 2.0816326530612247e-06, + "loss": 0.0585, + "step": 17560 + }, + { + "epoch": 89.64285714285714, + "grad_norm": 16.23180389404297, + "learning_rate": 2.0714285714285717e-06, + "loss": 0.1786, + "step": 17570 + }, + { + "epoch": 89.6938775510204, + "grad_norm": 15.472877502441406, + "learning_rate": 2.0612244897959184e-06, + "loss": 0.1067, + "step": 17580 + }, + { + "epoch": 89.74489795918367, + "grad_norm": 2.471834659576416, + "learning_rate": 2.0510204081632654e-06, + "loss": 0.1473, + "step": 17590 + }, + { + "epoch": 89.79591836734694, + "grad_norm": 15.856159210205078, + "learning_rate": 2.0408163265306125e-06, + "loss": 0.1022, + "step": 17600 + }, + { + "epoch": 89.84693877551021, + "grad_norm": 19.345705032348633, + "learning_rate": 2.030612244897959e-06, + "loss": 0.3282, + "step": 17610 + }, + { + "epoch": 89.89795918367346, + "grad_norm": 19.667593002319336, + "learning_rate": 2.020408163265306e-06, + "loss": 0.3853, + "step": 17620 + }, + { + "epoch": 89.94897959183673, + "grad_norm": 0.3212265372276306, + "learning_rate": 2.0102040816326533e-06, + "loss": 0.209, + "step": 17630 + }, + { + "epoch": 90.0, + "grad_norm": 14.387212753295898, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0925, + "step": 17640 + }, + { + "epoch": 90.0, + "eval_accuracy": 0.8700361010830325, + "eval_loss": 0.4754519760608673, + "eval_runtime": 1.0125, + "eval_samples_per_second": 273.589, + "eval_steps_per_second": 34.569, + "step": 17640 + }, + { + "epoch": 90.05102040816327, + "grad_norm": 7.858551502227783, + "learning_rate": 1.989795918367347e-06, + "loss": 0.2129, + "step": 17650 + }, + { + "epoch": 90.10204081632654, + "grad_norm": 1.2059929370880127, + "learning_rate": 1.979591836734694e-06, + "loss": 0.2523, + "step": 17660 + }, + { + "epoch": 90.15306122448979, + "grad_norm": 1.7275913953781128, + "learning_rate": 1.969387755102041e-06, + "loss": 0.2467, + "step": 17670 + }, + { + "epoch": 90.20408163265306, + "grad_norm": 38.21623611450195, + "learning_rate": 1.9591836734693877e-06, + "loss": 0.3007, + "step": 17680 + }, + { + "epoch": 90.25510204081633, + "grad_norm": 9.708069801330566, + "learning_rate": 1.948979591836735e-06, + "loss": 0.1311, + "step": 17690 + }, + { + "epoch": 90.3061224489796, + "grad_norm": 25.080371856689453, + "learning_rate": 1.938775510204082e-06, + "loss": 0.0554, + "step": 17700 + }, + { + "epoch": 90.35714285714286, + "grad_norm": 36.623619079589844, + "learning_rate": 1.928571428571429e-06, + "loss": 0.3716, + "step": 17710 + }, + { + "epoch": 90.40816326530613, + "grad_norm": 1.103722095489502, + "learning_rate": 1.9183673469387756e-06, + "loss": 0.1114, + "step": 17720 + }, + { + "epoch": 90.45918367346938, + "grad_norm": 8.962408065795898, + "learning_rate": 1.9081632653061226e-06, + "loss": 0.0412, + "step": 17730 + }, + { + "epoch": 90.51020408163265, + "grad_norm": 1.9563815593719482, + "learning_rate": 1.8979591836734695e-06, + "loss": 0.2965, + "step": 17740 + }, + { + "epoch": 90.56122448979592, + "grad_norm": 1.6661403179168701, + "learning_rate": 1.8877551020408163e-06, + "loss": 0.1251, + "step": 17750 + }, + { + "epoch": 90.61224489795919, + "grad_norm": 4.153669834136963, + "learning_rate": 1.8775510204081634e-06, + "loss": 0.1293, + "step": 17760 + }, + { + "epoch": 90.66326530612245, + "grad_norm": 1.759790062904358, + "learning_rate": 1.8673469387755102e-06, + "loss": 0.3679, + "step": 17770 + }, + { + "epoch": 90.71428571428571, + "grad_norm": 41.29878616333008, + "learning_rate": 1.8571428571428573e-06, + "loss": 0.2852, + "step": 17780 + }, + { + "epoch": 90.76530612244898, + "grad_norm": 0.7316180467605591, + "learning_rate": 1.8469387755102042e-06, + "loss": 0.1709, + "step": 17790 + }, + { + "epoch": 90.81632653061224, + "grad_norm": 12.635185241699219, + "learning_rate": 1.8367346938775512e-06, + "loss": 0.2007, + "step": 17800 + }, + { + "epoch": 90.86734693877551, + "grad_norm": 1.3366835117340088, + "learning_rate": 1.826530612244898e-06, + "loss": 0.1843, + "step": 17810 + }, + { + "epoch": 90.91836734693878, + "grad_norm": 32.194828033447266, + "learning_rate": 1.8163265306122451e-06, + "loss": 0.2808, + "step": 17820 + }, + { + "epoch": 90.96938775510205, + "grad_norm": 46.601688385009766, + "learning_rate": 1.806122448979592e-06, + "loss": 0.1968, + "step": 17830 + }, + { + "epoch": 91.0, + "eval_accuracy": 0.8700361010830325, + "eval_loss": 0.5528404712677002, + "eval_runtime": 1.0259, + "eval_samples_per_second": 270.019, + "eval_steps_per_second": 34.118, + "step": 17836 + }, + { + "epoch": 91.0204081632653, + "grad_norm": 27.904356002807617, + "learning_rate": 1.7959183673469388e-06, + "loss": 0.1043, + "step": 17840 + }, + { + "epoch": 91.07142857142857, + "grad_norm": 12.28512954711914, + "learning_rate": 1.7857142857142859e-06, + "loss": 0.1217, + "step": 17850 + }, + { + "epoch": 91.12244897959184, + "grad_norm": 26.008310317993164, + "learning_rate": 1.7755102040816327e-06, + "loss": 0.3206, + "step": 17860 + }, + { + "epoch": 91.1734693877551, + "grad_norm": 38.30603790283203, + "learning_rate": 1.7653061224489798e-06, + "loss": 0.1656, + "step": 17870 + }, + { + "epoch": 91.22448979591837, + "grad_norm": 16.18921661376953, + "learning_rate": 1.7551020408163267e-06, + "loss": 0.3669, + "step": 17880 + }, + { + "epoch": 91.27551020408163, + "grad_norm": 13.811570167541504, + "learning_rate": 1.7448979591836737e-06, + "loss": 0.2863, + "step": 17890 + }, + { + "epoch": 91.3265306122449, + "grad_norm": 41.77347946166992, + "learning_rate": 1.7346938775510206e-06, + "loss": 0.2969, + "step": 17900 + }, + { + "epoch": 91.37755102040816, + "grad_norm": 0.12460380792617798, + "learning_rate": 1.7244897959183676e-06, + "loss": 0.0412, + "step": 17910 + }, + { + "epoch": 91.42857142857143, + "grad_norm": 1.255277395248413, + "learning_rate": 1.7142857142857145e-06, + "loss": 0.1302, + "step": 17920 + }, + { + "epoch": 91.4795918367347, + "grad_norm": 0.5951548218727112, + "learning_rate": 1.7040816326530613e-06, + "loss": 0.2502, + "step": 17930 + }, + { + "epoch": 91.53061224489795, + "grad_norm": 22.74351692199707, + "learning_rate": 1.6938775510204084e-06, + "loss": 0.1801, + "step": 17940 + }, + { + "epoch": 91.58163265306122, + "grad_norm": 1.810703992843628, + "learning_rate": 1.6836734693877552e-06, + "loss": 0.0783, + "step": 17950 + }, + { + "epoch": 91.63265306122449, + "grad_norm": 16.341957092285156, + "learning_rate": 1.6734693877551023e-06, + "loss": 0.1056, + "step": 17960 + }, + { + "epoch": 91.68367346938776, + "grad_norm": 32.91526412963867, + "learning_rate": 1.6632653061224492e-06, + "loss": 0.3162, + "step": 17970 + }, + { + "epoch": 91.73469387755102, + "grad_norm": 14.67264175415039, + "learning_rate": 1.6530612244897962e-06, + "loss": 0.1149, + "step": 17980 + }, + { + "epoch": 91.78571428571429, + "grad_norm": 5.475572109222412, + "learning_rate": 1.642857142857143e-06, + "loss": 0.0868, + "step": 17990 + }, + { + "epoch": 91.83673469387755, + "grad_norm": 47.4354248046875, + "learning_rate": 1.6326530612244897e-06, + "loss": 0.6311, + "step": 18000 + }, + { + "epoch": 91.88775510204081, + "grad_norm": 41.562313079833984, + "learning_rate": 1.622448979591837e-06, + "loss": 0.2466, + "step": 18010 + }, + { + "epoch": 91.93877551020408, + "grad_norm": 0.28112781047821045, + "learning_rate": 1.6122448979591836e-06, + "loss": 0.1907, + "step": 18020 + }, + { + "epoch": 91.98979591836735, + "grad_norm": 7.19261360168457, + "learning_rate": 1.602040816326531e-06, + "loss": 0.1694, + "step": 18030 + }, + { + "epoch": 92.0, + "eval_accuracy": 0.8953068592057761, + "eval_loss": 0.4338068664073944, + "eval_runtime": 1.1037, + "eval_samples_per_second": 250.985, + "eval_steps_per_second": 31.713, + "step": 18032 + }, + { + "epoch": 92.04081632653062, + "grad_norm": 0.43761497735977173, + "learning_rate": 1.5918367346938775e-06, + "loss": 0.1164, + "step": 18040 + }, + { + "epoch": 92.09183673469387, + "grad_norm": 42.277122497558594, + "learning_rate": 1.5816326530612248e-06, + "loss": 0.3001, + "step": 18050 + }, + { + "epoch": 92.14285714285714, + "grad_norm": 0.1179385706782341, + "learning_rate": 1.5714285714285714e-06, + "loss": 0.2197, + "step": 18060 + }, + { + "epoch": 92.1938775510204, + "grad_norm": 11.542866706848145, + "learning_rate": 1.5612244897959187e-06, + "loss": 0.2909, + "step": 18070 + }, + { + "epoch": 92.24489795918367, + "grad_norm": 1.2789950370788574, + "learning_rate": 1.5510204081632654e-06, + "loss": 0.2554, + "step": 18080 + }, + { + "epoch": 92.29591836734694, + "grad_norm": 19.425533294677734, + "learning_rate": 1.5408163265306122e-06, + "loss": 0.3152, + "step": 18090 + }, + { + "epoch": 92.34693877551021, + "grad_norm": 34.185455322265625, + "learning_rate": 1.5306122448979593e-06, + "loss": 0.2152, + "step": 18100 + }, + { + "epoch": 92.39795918367346, + "grad_norm": 11.890663146972656, + "learning_rate": 1.5204081632653061e-06, + "loss": 0.3514, + "step": 18110 + }, + { + "epoch": 92.44897959183673, + "grad_norm": 2.1180689334869385, + "learning_rate": 1.5102040816326532e-06, + "loss": 0.1918, + "step": 18120 + }, + { + "epoch": 92.5, + "grad_norm": 15.912369728088379, + "learning_rate": 1.5e-06, + "loss": 0.2589, + "step": 18130 + }, + { + "epoch": 92.55102040816327, + "grad_norm": 3.3726866245269775, + "learning_rate": 1.489795918367347e-06, + "loss": 0.2786, + "step": 18140 + }, + { + "epoch": 92.60204081632654, + "grad_norm": 31.400827407836914, + "learning_rate": 1.479591836734694e-06, + "loss": 0.2618, + "step": 18150 + }, + { + "epoch": 92.65306122448979, + "grad_norm": 10.679868698120117, + "learning_rate": 1.469387755102041e-06, + "loss": 0.2371, + "step": 18160 + }, + { + "epoch": 92.70408163265306, + "grad_norm": 1.3537969589233398, + "learning_rate": 1.4591836734693879e-06, + "loss": 0.1366, + "step": 18170 + }, + { + "epoch": 92.75510204081633, + "grad_norm": 33.548553466796875, + "learning_rate": 1.4489795918367347e-06, + "loss": 0.2752, + "step": 18180 + }, + { + "epoch": 92.8061224489796, + "grad_norm": 24.411346435546875, + "learning_rate": 1.4387755102040818e-06, + "loss": 0.4567, + "step": 18190 + }, + { + "epoch": 92.85714285714286, + "grad_norm": 3.608860492706299, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.192, + "step": 18200 + }, + { + "epoch": 92.90816326530613, + "grad_norm": 12.382462501525879, + "learning_rate": 1.4183673469387757e-06, + "loss": 0.2702, + "step": 18210 + }, + { + "epoch": 92.95918367346938, + "grad_norm": 32.972591400146484, + "learning_rate": 1.4081632653061225e-06, + "loss": 0.2083, + "step": 18220 + }, + { + "epoch": 93.0, + "eval_accuracy": 0.8808664259927798, + "eval_loss": 1.1285901069641113, + "eval_runtime": 1.0172, + "eval_samples_per_second": 272.303, + "eval_steps_per_second": 34.406, + "step": 18228 + }, + { + "epoch": 93.01020408163265, + "grad_norm": 26.126474380493164, + "learning_rate": 1.3979591836734696e-06, + "loss": 0.6115, + "step": 18230 + }, + { + "epoch": 93.06122448979592, + "grad_norm": 13.585494041442871, + "learning_rate": 1.3877551020408165e-06, + "loss": 0.293, + "step": 18240 + }, + { + "epoch": 93.11224489795919, + "grad_norm": 30.562759399414062, + "learning_rate": 1.3775510204081633e-06, + "loss": 0.3055, + "step": 18250 + }, + { + "epoch": 93.16326530612245, + "grad_norm": 7.133956432342529, + "learning_rate": 1.3673469387755104e-06, + "loss": 0.1278, + "step": 18260 + }, + { + "epoch": 93.21428571428571, + "grad_norm": 19.81590461730957, + "learning_rate": 1.3571428571428572e-06, + "loss": 0.2476, + "step": 18270 + }, + { + "epoch": 93.26530612244898, + "grad_norm": 8.866189956665039, + "learning_rate": 1.3469387755102043e-06, + "loss": 0.299, + "step": 18280 + }, + { + "epoch": 93.31632653061224, + "grad_norm": 11.5589017868042, + "learning_rate": 1.3367346938775511e-06, + "loss": 0.2597, + "step": 18290 + }, + { + "epoch": 93.36734693877551, + "grad_norm": 8.648991584777832, + "learning_rate": 1.3265306122448982e-06, + "loss": 0.1339, + "step": 18300 + }, + { + "epoch": 93.41836734693878, + "grad_norm": 11.000016212463379, + "learning_rate": 1.316326530612245e-06, + "loss": 0.1786, + "step": 18310 + }, + { + "epoch": 93.46938775510205, + "grad_norm": 0.12376013398170471, + "learning_rate": 1.3061224489795921e-06, + "loss": 0.2248, + "step": 18320 + }, + { + "epoch": 93.5204081632653, + "grad_norm": 7.646286487579346, + "learning_rate": 1.295918367346939e-06, + "loss": 0.168, + "step": 18330 + }, + { + "epoch": 93.57142857142857, + "grad_norm": 20.714702606201172, + "learning_rate": 1.2857142857142856e-06, + "loss": 0.2933, + "step": 18340 + }, + { + "epoch": 93.62244897959184, + "grad_norm": 1.1556758880615234, + "learning_rate": 1.2755102040816329e-06, + "loss": 0.3014, + "step": 18350 + }, + { + "epoch": 93.6734693877551, + "grad_norm": 5.045319080352783, + "learning_rate": 1.2653061224489795e-06, + "loss": 0.1799, + "step": 18360 + }, + { + "epoch": 93.72448979591837, + "grad_norm": 10.567604064941406, + "learning_rate": 1.2551020408163268e-06, + "loss": 0.1117, + "step": 18370 + }, + { + "epoch": 93.77551020408163, + "grad_norm": 34.76498031616211, + "learning_rate": 1.2448979591836734e-06, + "loss": 0.3669, + "step": 18380 + }, + { + "epoch": 93.8265306122449, + "grad_norm": 0.39042800664901733, + "learning_rate": 1.2346938775510205e-06, + "loss": 0.1084, + "step": 18390 + }, + { + "epoch": 93.87755102040816, + "grad_norm": 1.0814882516860962, + "learning_rate": 1.2244897959183673e-06, + "loss": 0.112, + "step": 18400 + }, + { + "epoch": 93.92857142857143, + "grad_norm": 0.7276666164398193, + "learning_rate": 1.2142857142857144e-06, + "loss": 0.0641, + "step": 18410 + }, + { + "epoch": 93.9795918367347, + "grad_norm": 54.44386672973633, + "learning_rate": 1.2040816326530612e-06, + "loss": 0.3666, + "step": 18420 + }, + { + "epoch": 94.0, + "eval_accuracy": 0.8267148014440433, + "eval_loss": 0.6878820657730103, + "eval_runtime": 1.0825, + "eval_samples_per_second": 255.901, + "eval_steps_per_second": 32.334, + "step": 18424 + }, + { + "epoch": 94.03061224489795, + "grad_norm": 1.2125033140182495, + "learning_rate": 1.1938775510204083e-06, + "loss": 0.165, + "step": 18430 + }, + { + "epoch": 94.08163265306122, + "grad_norm": 17.543792724609375, + "learning_rate": 1.1836734693877552e-06, + "loss": 0.1585, + "step": 18440 + }, + { + "epoch": 94.13265306122449, + "grad_norm": 44.09790802001953, + "learning_rate": 1.1734693877551022e-06, + "loss": 0.2914, + "step": 18450 + }, + { + "epoch": 94.18367346938776, + "grad_norm": 28.964426040649414, + "learning_rate": 1.163265306122449e-06, + "loss": 0.2918, + "step": 18460 + }, + { + "epoch": 94.23469387755102, + "grad_norm": 29.72899627685547, + "learning_rate": 1.153061224489796e-06, + "loss": 0.4414, + "step": 18470 + }, + { + "epoch": 94.28571428571429, + "grad_norm": 9.558052062988281, + "learning_rate": 1.142857142857143e-06, + "loss": 0.0891, + "step": 18480 + }, + { + "epoch": 94.33673469387755, + "grad_norm": 26.460769653320312, + "learning_rate": 1.1326530612244898e-06, + "loss": 0.1868, + "step": 18490 + }, + { + "epoch": 94.38775510204081, + "grad_norm": 5.038394927978516, + "learning_rate": 1.122448979591837e-06, + "loss": 0.1816, + "step": 18500 + }, + { + "epoch": 94.43877551020408, + "grad_norm": 10.404385566711426, + "learning_rate": 1.1122448979591838e-06, + "loss": 0.1307, + "step": 18510 + }, + { + "epoch": 94.48979591836735, + "grad_norm": 1.5355292558670044, + "learning_rate": 1.1020408163265308e-06, + "loss": 0.1956, + "step": 18520 + }, + { + "epoch": 94.54081632653062, + "grad_norm": 2.196859359741211, + "learning_rate": 1.0918367346938777e-06, + "loss": 0.1143, + "step": 18530 + }, + { + "epoch": 94.59183673469387, + "grad_norm": 8.2168550491333, + "learning_rate": 1.0816326530612247e-06, + "loss": 0.215, + "step": 18540 + }, + { + "epoch": 94.64285714285714, + "grad_norm": 16.13858413696289, + "learning_rate": 1.0714285714285714e-06, + "loss": 0.1547, + "step": 18550 + }, + { + "epoch": 94.6938775510204, + "grad_norm": 5.680481433868408, + "learning_rate": 1.0612244897959184e-06, + "loss": 0.2431, + "step": 18560 + }, + { + "epoch": 94.74489795918367, + "grad_norm": 11.419181823730469, + "learning_rate": 1.0510204081632653e-06, + "loss": 0.1831, + "step": 18570 + }, + { + "epoch": 94.79591836734694, + "grad_norm": 2.1457908153533936, + "learning_rate": 1.0408163265306123e-06, + "loss": 0.2518, + "step": 18580 + }, + { + "epoch": 94.84693877551021, + "grad_norm": 14.09632682800293, + "learning_rate": 1.0306122448979592e-06, + "loss": 0.2727, + "step": 18590 + }, + { + "epoch": 94.89795918367346, + "grad_norm": 0.37104547023773193, + "learning_rate": 1.0204081632653063e-06, + "loss": 0.1408, + "step": 18600 + }, + { + "epoch": 94.94897959183673, + "grad_norm": 38.16410446166992, + "learning_rate": 1.010204081632653e-06, + "loss": 0.1984, + "step": 18610 + }, + { + "epoch": 95.0, + "grad_norm": 1.753860354423523, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.1358, + "step": 18620 + }, + { + "epoch": 95.0, + "eval_accuracy": 0.8880866425992779, + "eval_loss": 0.5071461796760559, + "eval_runtime": 1.0514, + "eval_samples_per_second": 263.47, + "eval_steps_per_second": 33.29, + "step": 18620 + }, + { + "epoch": 95.05102040816327, + "grad_norm": 0.2637849748134613, + "learning_rate": 9.89795918367347e-07, + "loss": 0.0295, + "step": 18630 + }, + { + "epoch": 95.10204081632654, + "grad_norm": 10.043828010559082, + "learning_rate": 9.795918367346939e-07, + "loss": 0.2364, + "step": 18640 + }, + { + "epoch": 95.15306122448979, + "grad_norm": 19.842586517333984, + "learning_rate": 9.69387755102041e-07, + "loss": 0.1724, + "step": 18650 + }, + { + "epoch": 95.20408163265306, + "grad_norm": 1.2304280996322632, + "learning_rate": 9.591836734693878e-07, + "loss": 0.1706, + "step": 18660 + }, + { + "epoch": 95.25510204081633, + "grad_norm": 8.632827758789062, + "learning_rate": 9.489795918367347e-07, + "loss": 0.1381, + "step": 18670 + }, + { + "epoch": 95.3061224489796, + "grad_norm": 4.460491180419922, + "learning_rate": 9.387755102040817e-07, + "loss": 0.2005, + "step": 18680 + }, + { + "epoch": 95.35714285714286, + "grad_norm": 16.073646545410156, + "learning_rate": 9.285714285714287e-07, + "loss": 0.0801, + "step": 18690 + }, + { + "epoch": 95.40816326530613, + "grad_norm": 11.442094802856445, + "learning_rate": 9.183673469387756e-07, + "loss": 0.1978, + "step": 18700 + }, + { + "epoch": 95.45918367346938, + "grad_norm": 31.842926025390625, + "learning_rate": 9.081632653061226e-07, + "loss": 0.4098, + "step": 18710 + }, + { + "epoch": 95.51020408163265, + "grad_norm": 2.456465005874634, + "learning_rate": 8.979591836734694e-07, + "loss": 0.1716, + "step": 18720 + }, + { + "epoch": 95.56122448979592, + "grad_norm": 1.8315030336380005, + "learning_rate": 8.877551020408164e-07, + "loss": 0.2477, + "step": 18730 + }, + { + "epoch": 95.61224489795919, + "grad_norm": 45.02452087402344, + "learning_rate": 8.775510204081633e-07, + "loss": 0.2799, + "step": 18740 + }, + { + "epoch": 95.66326530612245, + "grad_norm": 29.419157028198242, + "learning_rate": 8.673469387755103e-07, + "loss": 0.1828, + "step": 18750 + }, + { + "epoch": 95.71428571428571, + "grad_norm": 40.39503479003906, + "learning_rate": 8.571428571428572e-07, + "loss": 0.2706, + "step": 18760 + }, + { + "epoch": 95.76530612244898, + "grad_norm": 0.40194839239120483, + "learning_rate": 8.469387755102042e-07, + "loss": 0.1603, + "step": 18770 + }, + { + "epoch": 95.81632653061224, + "grad_norm": 38.446529388427734, + "learning_rate": 8.367346938775512e-07, + "loss": 0.2996, + "step": 18780 + }, + { + "epoch": 95.86734693877551, + "grad_norm": 3.734571695327759, + "learning_rate": 8.265306122448981e-07, + "loss": 0.2217, + "step": 18790 + }, + { + "epoch": 95.91836734693878, + "grad_norm": 3.71414852142334, + "learning_rate": 8.163265306122449e-07, + "loss": 0.052, + "step": 18800 + }, + { + "epoch": 95.96938775510205, + "grad_norm": 16.66282081604004, + "learning_rate": 8.061224489795918e-07, + "loss": 0.2247, + "step": 18810 + }, + { + "epoch": 96.0, + "eval_accuracy": 0.851985559566787, + "eval_loss": 0.5940964818000793, + "eval_runtime": 1.001, + "eval_samples_per_second": 276.727, + "eval_steps_per_second": 34.966, + "step": 18816 + }, + { + "epoch": 96.0204081632653, + "grad_norm": 1.963152289390564, + "learning_rate": 7.959183673469388e-07, + "loss": 0.1373, + "step": 18820 + }, + { + "epoch": 96.07142857142857, + "grad_norm": 0.35374435782432556, + "learning_rate": 7.857142857142857e-07, + "loss": 0.0362, + "step": 18830 + }, + { + "epoch": 96.12244897959184, + "grad_norm": 29.8096923828125, + "learning_rate": 7.755102040816327e-07, + "loss": 0.2348, + "step": 18840 + }, + { + "epoch": 96.1734693877551, + "grad_norm": 4.010699272155762, + "learning_rate": 7.653061224489796e-07, + "loss": 0.5075, + "step": 18850 + }, + { + "epoch": 96.22448979591837, + "grad_norm": 2.4223620891571045, + "learning_rate": 7.551020408163266e-07, + "loss": 0.1775, + "step": 18860 + }, + { + "epoch": 96.27551020408163, + "grad_norm": 4.8529462814331055, + "learning_rate": 7.448979591836736e-07, + "loss": 0.3701, + "step": 18870 + }, + { + "epoch": 96.3265306122449, + "grad_norm": 0.5863433480262756, + "learning_rate": 7.346938775510205e-07, + "loss": 0.0738, + "step": 18880 + }, + { + "epoch": 96.37755102040816, + "grad_norm": 16.176774978637695, + "learning_rate": 7.244897959183674e-07, + "loss": 0.048, + "step": 18890 + }, + { + "epoch": 96.42857142857143, + "grad_norm": 8.167757987976074, + "learning_rate": 7.142857142857143e-07, + "loss": 0.2088, + "step": 18900 + }, + { + "epoch": 96.4795918367347, + "grad_norm": 0.6351049542427063, + "learning_rate": 7.040816326530613e-07, + "loss": 0.1351, + "step": 18910 + }, + { + "epoch": 96.53061224489795, + "grad_norm": 25.39814567565918, + "learning_rate": 6.938775510204082e-07, + "loss": 0.1722, + "step": 18920 + }, + { + "epoch": 96.58163265306122, + "grad_norm": 12.256932258605957, + "learning_rate": 6.836734693877552e-07, + "loss": 0.1337, + "step": 18930 + }, + { + "epoch": 96.63265306122449, + "grad_norm": 4.017277240753174, + "learning_rate": 6.734693877551021e-07, + "loss": 0.2842, + "step": 18940 + }, + { + "epoch": 96.68367346938776, + "grad_norm": 16.411767959594727, + "learning_rate": 6.632653061224491e-07, + "loss": 0.1565, + "step": 18950 + }, + { + "epoch": 96.73469387755102, + "grad_norm": 14.98448371887207, + "learning_rate": 6.530612244897961e-07, + "loss": 0.1721, + "step": 18960 + }, + { + "epoch": 96.78571428571429, + "grad_norm": 10.545024871826172, + "learning_rate": 6.428571428571428e-07, + "loss": 0.1638, + "step": 18970 + }, + { + "epoch": 96.83673469387755, + "grad_norm": 10.720052719116211, + "learning_rate": 6.326530612244898e-07, + "loss": 0.2894, + "step": 18980 + }, + { + "epoch": 96.88775510204081, + "grad_norm": 2.4388601779937744, + "learning_rate": 6.224489795918367e-07, + "loss": 0.3709, + "step": 18990 + }, + { + "epoch": 96.93877551020408, + "grad_norm": 3.3613178730010986, + "learning_rate": 6.122448979591837e-07, + "loss": 0.175, + "step": 19000 + }, + { + "epoch": 96.98979591836735, + "grad_norm": 0.7901637554168701, + "learning_rate": 6.020408163265306e-07, + "loss": 0.2682, + "step": 19010 + }, + { + "epoch": 97.0, + "eval_accuracy": 0.8592057761732852, + "eval_loss": 0.5219435691833496, + "eval_runtime": 0.9989, + "eval_samples_per_second": 277.316, + "eval_steps_per_second": 35.04, + "step": 19012 + }, + { + "epoch": 97.04081632653062, + "grad_norm": 12.261013984680176, + "learning_rate": 5.918367346938776e-07, + "loss": 0.2906, + "step": 19020 + }, + { + "epoch": 97.09183673469387, + "grad_norm": 1.7032982110977173, + "learning_rate": 5.816326530612245e-07, + "loss": 0.159, + "step": 19030 + }, + { + "epoch": 97.14285714285714, + "grad_norm": 0.55592942237854, + "learning_rate": 5.714285714285715e-07, + "loss": 0.2551, + "step": 19040 + }, + { + "epoch": 97.1938775510204, + "grad_norm": 4.4312968254089355, + "learning_rate": 5.612244897959184e-07, + "loss": 0.1837, + "step": 19050 + }, + { + "epoch": 97.24489795918367, + "grad_norm": 29.95174789428711, + "learning_rate": 5.510204081632654e-07, + "loss": 0.0967, + "step": 19060 + }, + { + "epoch": 97.29591836734694, + "grad_norm": 7.447702884674072, + "learning_rate": 5.408163265306124e-07, + "loss": 0.2065, + "step": 19070 + }, + { + "epoch": 97.34693877551021, + "grad_norm": 33.5028076171875, + "learning_rate": 5.306122448979592e-07, + "loss": 0.0913, + "step": 19080 + }, + { + "epoch": 97.39795918367346, + "grad_norm": 18.93910026550293, + "learning_rate": 5.204081632653062e-07, + "loss": 0.1063, + "step": 19090 + }, + { + "epoch": 97.44897959183673, + "grad_norm": 18.313825607299805, + "learning_rate": 5.102040816326531e-07, + "loss": 0.1751, + "step": 19100 + }, + { + "epoch": 97.5, + "grad_norm": 44.13178253173828, + "learning_rate": 5.000000000000001e-07, + "loss": 0.3763, + "step": 19110 + }, + { + "epoch": 97.55102040816327, + "grad_norm": 6.703692436218262, + "learning_rate": 4.897959183673469e-07, + "loss": 0.1509, + "step": 19120 + }, + { + "epoch": 97.60204081632654, + "grad_norm": 34.19178771972656, + "learning_rate": 4.795918367346939e-07, + "loss": 0.5041, + "step": 19130 + }, + { + "epoch": 97.65306122448979, + "grad_norm": 0.3778209090232849, + "learning_rate": 4.6938775510204085e-07, + "loss": 0.3616, + "step": 19140 + }, + { + "epoch": 97.70408163265306, + "grad_norm": 45.12972640991211, + "learning_rate": 4.591836734693878e-07, + "loss": 0.2279, + "step": 19150 + }, + { + "epoch": 97.75510204081633, + "grad_norm": 0.23751859366893768, + "learning_rate": 4.489795918367347e-07, + "loss": 0.2344, + "step": 19160 + }, + { + "epoch": 97.8061224489796, + "grad_norm": 30.989025115966797, + "learning_rate": 4.3877551020408166e-07, + "loss": 0.4227, + "step": 19170 + }, + { + "epoch": 97.85714285714286, + "grad_norm": 9.003046035766602, + "learning_rate": 4.285714285714286e-07, + "loss": 0.1233, + "step": 19180 + }, + { + "epoch": 97.90816326530613, + "grad_norm": 30.046829223632812, + "learning_rate": 4.183673469387756e-07, + "loss": 0.2289, + "step": 19190 + }, + { + "epoch": 97.95918367346938, + "grad_norm": 45.24127960205078, + "learning_rate": 4.0816326530612243e-07, + "loss": 0.1762, + "step": 19200 + }, + { + "epoch": 98.0, + "eval_accuracy": 0.851985559566787, + "eval_loss": 0.6928915977478027, + "eval_runtime": 1.0087, + "eval_samples_per_second": 274.605, + "eval_steps_per_second": 34.697, + "step": 19208 + }, + { + "epoch": 98.01020408163265, + "grad_norm": 0.18704822659492493, + "learning_rate": 3.979591836734694e-07, + "loss": 0.1294, + "step": 19210 + }, + { + "epoch": 98.06122448979592, + "grad_norm": 8.705056190490723, + "learning_rate": 3.8775510204081634e-07, + "loss": 0.2828, + "step": 19220 + }, + { + "epoch": 98.11224489795919, + "grad_norm": 30.089208602905273, + "learning_rate": 3.775510204081633e-07, + "loss": 0.2035, + "step": 19230 + }, + { + "epoch": 98.16326530612245, + "grad_norm": 47.732261657714844, + "learning_rate": 3.6734693877551025e-07, + "loss": 0.2625, + "step": 19240 + }, + { + "epoch": 98.21428571428571, + "grad_norm": 40.21767807006836, + "learning_rate": 3.5714285714285716e-07, + "loss": 0.1829, + "step": 19250 + }, + { + "epoch": 98.26530612244898, + "grad_norm": 11.317173957824707, + "learning_rate": 3.469387755102041e-07, + "loss": 0.2994, + "step": 19260 + }, + { + "epoch": 98.31632653061224, + "grad_norm": 4.681125640869141, + "learning_rate": 3.3673469387755107e-07, + "loss": 0.3794, + "step": 19270 + }, + { + "epoch": 98.36734693877551, + "grad_norm": 5.775115489959717, + "learning_rate": 3.2653061224489803e-07, + "loss": 0.199, + "step": 19280 + }, + { + "epoch": 98.41836734693878, + "grad_norm": 7.328399658203125, + "learning_rate": 3.163265306122449e-07, + "loss": 0.1, + "step": 19290 + }, + { + "epoch": 98.46938775510205, + "grad_norm": 10.001800537109375, + "learning_rate": 3.0612244897959183e-07, + "loss": 0.2971, + "step": 19300 + }, + { + "epoch": 98.5204081632653, + "grad_norm": 5.522895812988281, + "learning_rate": 2.959183673469388e-07, + "loss": 0.1368, + "step": 19310 + }, + { + "epoch": 98.57142857142857, + "grad_norm": 1.189679741859436, + "learning_rate": 2.8571428571428575e-07, + "loss": 0.161, + "step": 19320 + }, + { + "epoch": 98.62244897959184, + "grad_norm": 27.806896209716797, + "learning_rate": 2.755102040816327e-07, + "loss": 0.1541, + "step": 19330 + }, + { + "epoch": 98.6734693877551, + "grad_norm": 12.556029319763184, + "learning_rate": 2.653061224489796e-07, + "loss": 0.2224, + "step": 19340 + }, + { + "epoch": 98.72448979591837, + "grad_norm": 6.083463668823242, + "learning_rate": 2.5510204081632656e-07, + "loss": 0.3263, + "step": 19350 + }, + { + "epoch": 98.77551020408163, + "grad_norm": 6.754245758056641, + "learning_rate": 2.4489795918367347e-07, + "loss": 0.2957, + "step": 19360 + }, + { + "epoch": 98.8265306122449, + "grad_norm": 32.86932373046875, + "learning_rate": 2.3469387755102042e-07, + "loss": 0.0831, + "step": 19370 + }, + { + "epoch": 98.87755102040816, + "grad_norm": 15.35261344909668, + "learning_rate": 2.2448979591836735e-07, + "loss": 0.2743, + "step": 19380 + }, + { + "epoch": 98.92857142857143, + "grad_norm": 0.2775244414806366, + "learning_rate": 2.142857142857143e-07, + "loss": 0.2112, + "step": 19390 + }, + { + "epoch": 98.9795918367347, + "grad_norm": 31.919103622436523, + "learning_rate": 2.0408163265306121e-07, + "loss": 0.2368, + "step": 19400 + }, + { + "epoch": 99.0, + "eval_accuracy": 0.8844765342960289, + "eval_loss": 0.5323509573936462, + "eval_runtime": 1.0079, + "eval_samples_per_second": 274.838, + "eval_steps_per_second": 34.727, + "step": 19404 + }, + { + "epoch": 99.03061224489795, + "grad_norm": 22.669710159301758, + "learning_rate": 1.9387755102040817e-07, + "loss": 0.1606, + "step": 19410 + }, + { + "epoch": 99.08163265306122, + "grad_norm": 10.97887134552002, + "learning_rate": 1.8367346938775513e-07, + "loss": 0.3762, + "step": 19420 + }, + { + "epoch": 99.13265306122449, + "grad_norm": 28.65227508544922, + "learning_rate": 1.7346938775510206e-07, + "loss": 0.1669, + "step": 19430 + }, + { + "epoch": 99.18367346938776, + "grad_norm": 1.6867096424102783, + "learning_rate": 1.6326530612244901e-07, + "loss": 0.6833, + "step": 19440 + }, + { + "epoch": 99.23469387755102, + "grad_norm": 34.86143112182617, + "learning_rate": 1.5306122448979592e-07, + "loss": 0.374, + "step": 19450 + }, + { + "epoch": 99.28571428571429, + "grad_norm": 23.0856876373291, + "learning_rate": 1.4285714285714287e-07, + "loss": 0.0832, + "step": 19460 + }, + { + "epoch": 99.33673469387755, + "grad_norm": 10.77336311340332, + "learning_rate": 1.326530612244898e-07, + "loss": 0.347, + "step": 19470 + }, + { + "epoch": 99.38775510204081, + "grad_norm": 10.197904586791992, + "learning_rate": 1.2244897959183673e-07, + "loss": 0.1792, + "step": 19480 + }, + { + "epoch": 99.43877551020408, + "grad_norm": 27.76247787475586, + "learning_rate": 1.1224489795918368e-07, + "loss": 0.2279, + "step": 19490 + }, + { + "epoch": 99.48979591836735, + "grad_norm": 6.777796745300293, + "learning_rate": 1.0204081632653061e-07, + "loss": 0.2408, + "step": 19500 + }, + { + "epoch": 99.54081632653062, + "grad_norm": 3.417832374572754, + "learning_rate": 9.183673469387756e-08, + "loss": 0.0954, + "step": 19510 + }, + { + "epoch": 99.59183673469387, + "grad_norm": 8.597440719604492, + "learning_rate": 8.163265306122451e-08, + "loss": 0.1775, + "step": 19520 + }, + { + "epoch": 99.64285714285714, + "grad_norm": 41.13750457763672, + "learning_rate": 7.142857142857144e-08, + "loss": 0.2322, + "step": 19530 + }, + { + "epoch": 99.6938775510204, + "grad_norm": 24.9324951171875, + "learning_rate": 6.122448979591837e-08, + "loss": 0.3047, + "step": 19540 + }, + { + "epoch": 99.74489795918367, + "grad_norm": 15.776062965393066, + "learning_rate": 5.1020408163265303e-08, + "loss": 0.0964, + "step": 19550 + }, + { + "epoch": 99.79591836734694, + "grad_norm": 1.1919376850128174, + "learning_rate": 4.0816326530612253e-08, + "loss": 0.1048, + "step": 19560 + }, + { + "epoch": 99.84693877551021, + "grad_norm": 4.413532733917236, + "learning_rate": 3.0612244897959183e-08, + "loss": 0.252, + "step": 19570 + }, + { + "epoch": 99.89795918367346, + "grad_norm": 5.540408611297607, + "learning_rate": 2.0408163265306127e-08, + "loss": 0.0535, + "step": 19580 + }, + { + "epoch": 99.94897959183673, + "grad_norm": 7.951544284820557, + "learning_rate": 1.0204081632653063e-08, + "loss": 0.1003, + "step": 19590 + }, + { + "epoch": 100.0, + "grad_norm": 0.22311386466026306, + "learning_rate": 0.0, + "loss": 0.1268, + "step": 19600 + }, + { + "epoch": 100.0, + "eval_accuracy": 0.8880866425992779, + "eval_loss": 0.6160133481025696, + "eval_runtime": 1.0078, + "eval_samples_per_second": 274.864, + "eval_steps_per_second": 34.73, + "step": 19600 + }, + { + "epoch": 100.0, + "step": 19600, + "total_flos": 4.122421290860544e+17, + "train_loss": 0.30076538943894665, + "train_runtime": 888.1447, + "train_samples_per_second": 176.323, + "train_steps_per_second": 22.068 + } + ], + "logging_steps": 10, + "max_steps": 19600, + "num_input_tokens_seen": 0, + "num_train_epochs": 100, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.122421290860544e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}