{ "best_metric": 0.4501282572746277, "best_model_checkpoint": "./vit-lr-poly/checkpoint-800", "epoch": 5.607476635514018, "eval_steps": 100, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "grad_norm": 5.695468902587891, "learning_rate": 9.994398916149883e-05, "loss": 1.3052, "step": 10 }, { "epoch": 0.06, "grad_norm": 3.898674964904785, "learning_rate": 9.98817733173203e-05, "loss": 0.7328, "step": 20 }, { "epoch": 0.09, "grad_norm": 4.696613788604736, "learning_rate": 9.981957686348152e-05, "loss": 0.8478, "step": 30 }, { "epoch": 0.12, "grad_norm": 10.709759712219238, "learning_rate": 9.975739979998252e-05, "loss": 1.0301, "step": 40 }, { "epoch": 0.16, "grad_norm": 4.479094505310059, "learning_rate": 9.969524212682331e-05, "loss": 0.8175, "step": 50 }, { "epoch": 0.19, "grad_norm": 3.9092047214508057, "learning_rate": 9.963310384400386e-05, "loss": 0.8635, "step": 60 }, { "epoch": 0.22, "grad_norm": 3.7493269443511963, "learning_rate": 9.957098495152415e-05, "loss": 0.8305, "step": 70 }, { "epoch": 0.25, "grad_norm": 4.050891399383545, "learning_rate": 9.950888544938422e-05, "loss": 0.6569, "step": 80 }, { "epoch": 0.28, "grad_norm": 4.24185848236084, "learning_rate": 9.944680533758408e-05, "loss": 0.6027, "step": 90 }, { "epoch": 0.31, "grad_norm": 4.011294364929199, "learning_rate": 9.938474461612368e-05, "loss": 0.5905, "step": 100 }, { "epoch": 0.31, "eval_accuracy": 0.7787794729542302, "eval_f1": 0.7699961142525454, "eval_loss": 0.6206506490707397, "eval_precision": 0.7718616052169852, "eval_recall": 0.7787794729542302, "eval_runtime": 36.2352, "eval_samples_per_second": 79.591, "eval_steps_per_second": 9.963, "step": 100 }, { "epoch": 0.34, "grad_norm": 3.816782236099243, "learning_rate": 9.932270328500305e-05, "loss": 0.6752, "step": 110 }, { "epoch": 0.37, "grad_norm": 7.028879165649414, "learning_rate": 9.926068134422221e-05, "loss": 0.638, "step": 120 }, { "epoch": 0.4, "grad_norm": 7.382256984710693, "learning_rate": 9.91986787937811e-05, "loss": 0.6574, "step": 130 }, { "epoch": 0.44, "grad_norm": 1.597434163093567, "learning_rate": 9.91366956336798e-05, "loss": 0.5008, "step": 140 }, { "epoch": 0.47, "grad_norm": 3.9064269065856934, "learning_rate": 9.907473186391824e-05, "loss": 0.57, "step": 150 }, { "epoch": 0.5, "grad_norm": 8.174847602844238, "learning_rate": 9.901278748449648e-05, "loss": 0.5817, "step": 160 }, { "epoch": 0.53, "grad_norm": 2.610459804534912, "learning_rate": 9.895086249541445e-05, "loss": 0.5776, "step": 170 }, { "epoch": 0.56, "grad_norm": 5.772677421569824, "learning_rate": 9.88889568966722e-05, "loss": 0.7565, "step": 180 }, { "epoch": 0.59, "grad_norm": 4.980686187744141, "learning_rate": 9.882707068826973e-05, "loss": 0.599, "step": 190 }, { "epoch": 0.62, "grad_norm": 6.120123863220215, "learning_rate": 9.8765203870207e-05, "loss": 0.5605, "step": 200 }, { "epoch": 0.62, "eval_accuracy": 0.7621359223300971, "eval_f1": 0.71517419238615, "eval_loss": 0.7324841022491455, "eval_precision": 0.7502866979236575, "eval_recall": 0.7621359223300971, "eval_runtime": 36.8774, "eval_samples_per_second": 78.205, "eval_steps_per_second": 9.789, "step": 200 }, { "epoch": 0.65, "grad_norm": 5.4300689697265625, "learning_rate": 9.870335644248407e-05, "loss": 0.6962, "step": 210 }, { "epoch": 0.69, "grad_norm": 7.0893940925598145, "learning_rate": 9.864152840510089e-05, "loss": 0.7061, "step": 220 }, { "epoch": 0.72, "grad_norm": 2.917738437652588, "learning_rate": 9.857971975805747e-05, "loss": 0.3734, "step": 230 }, { "epoch": 0.75, "grad_norm": 2.710495710372925, "learning_rate": 9.851793050135383e-05, "loss": 0.5123, "step": 240 }, { "epoch": 0.78, "grad_norm": 4.768627643585205, "learning_rate": 9.845616063498996e-05, "loss": 0.6632, "step": 250 }, { "epoch": 0.81, "grad_norm": 6.290271282196045, "learning_rate": 9.839441015896587e-05, "loss": 0.5159, "step": 260 }, { "epoch": 0.84, "grad_norm": 3.4417402744293213, "learning_rate": 9.833267907328151e-05, "loss": 0.4535, "step": 270 }, { "epoch": 0.87, "grad_norm": 4.1035990715026855, "learning_rate": 9.827096737793694e-05, "loss": 0.5092, "step": 280 }, { "epoch": 0.9, "grad_norm": 7.296667575836182, "learning_rate": 9.820927507293215e-05, "loss": 0.6068, "step": 290 }, { "epoch": 0.93, "grad_norm": 7.422231197357178, "learning_rate": 9.814760215826709e-05, "loss": 0.7068, "step": 300 }, { "epoch": 0.93, "eval_accuracy": 0.7919556171983356, "eval_f1": 0.7952442316282051, "eval_loss": 0.5868750214576721, "eval_precision": 0.8183606673488866, "eval_recall": 0.7919556171983356, "eval_runtime": 36.032, "eval_samples_per_second": 80.04, "eval_steps_per_second": 10.019, "step": 300 }, { "epoch": 0.97, "grad_norm": 2.943356513977051, "learning_rate": 9.808594863394183e-05, "loss": 0.5773, "step": 310 }, { "epoch": 1.0, "grad_norm": 4.4447174072265625, "learning_rate": 9.802431449995633e-05, "loss": 0.542, "step": 320 }, { "epoch": 1.03, "grad_norm": 4.077796936035156, "learning_rate": 9.796269975631059e-05, "loss": 0.4072, "step": 330 }, { "epoch": 1.06, "grad_norm": 4.858616352081299, "learning_rate": 9.790110440300463e-05, "loss": 0.3614, "step": 340 }, { "epoch": 1.09, "grad_norm": 2.937171459197998, "learning_rate": 9.783952844003844e-05, "loss": 0.4749, "step": 350 }, { "epoch": 1.12, "grad_norm": 2.457247257232666, "learning_rate": 9.777797186741202e-05, "loss": 0.292, "step": 360 }, { "epoch": 1.15, "grad_norm": 2.8374574184417725, "learning_rate": 9.771643468512534e-05, "loss": 0.3613, "step": 370 }, { "epoch": 1.18, "grad_norm": 7.913370132446289, "learning_rate": 9.765491689317844e-05, "loss": 0.4369, "step": 380 }, { "epoch": 1.21, "grad_norm": 5.162067890167236, "learning_rate": 9.759341849157134e-05, "loss": 0.5097, "step": 390 }, { "epoch": 1.25, "grad_norm": 2.6675124168395996, "learning_rate": 9.753193948030396e-05, "loss": 0.3773, "step": 400 }, { "epoch": 1.25, "eval_accuracy": 0.7853675450762829, "eval_f1": 0.7964243734764243, "eval_loss": 0.5412221550941467, "eval_precision": 0.8200331042136985, "eval_recall": 0.7853675450762829, "eval_runtime": 36.3769, "eval_samples_per_second": 79.281, "eval_steps_per_second": 9.924, "step": 400 }, { "epoch": 1.28, "grad_norm": 3.57259464263916, "learning_rate": 9.747047985937637e-05, "loss": 0.3536, "step": 410 }, { "epoch": 1.31, "grad_norm": 3.470672369003296, "learning_rate": 9.740903962878855e-05, "loss": 0.4072, "step": 420 }, { "epoch": 1.34, "grad_norm": 4.211917400360107, "learning_rate": 9.734761878854048e-05, "loss": 0.4524, "step": 430 }, { "epoch": 1.37, "grad_norm": 5.042629241943359, "learning_rate": 9.728621733863219e-05, "loss": 0.3893, "step": 440 }, { "epoch": 1.4, "grad_norm": 5.246495246887207, "learning_rate": 9.722483527906369e-05, "loss": 0.423, "step": 450 }, { "epoch": 1.43, "grad_norm": 4.988239288330078, "learning_rate": 9.716347260983494e-05, "loss": 0.3906, "step": 460 }, { "epoch": 1.46, "grad_norm": 3.558497428894043, "learning_rate": 9.710212933094593e-05, "loss": 0.3468, "step": 470 }, { "epoch": 1.5, "grad_norm": 7.515442848205566, "learning_rate": 9.704080544239672e-05, "loss": 0.4708, "step": 480 }, { "epoch": 1.53, "grad_norm": 4.515080451965332, "learning_rate": 9.697950094418728e-05, "loss": 0.3882, "step": 490 }, { "epoch": 1.56, "grad_norm": 2.9388675689697266, "learning_rate": 9.691821583631758e-05, "loss": 0.3501, "step": 500 }, { "epoch": 1.56, "eval_accuracy": 0.8214285714285714, "eval_f1": 0.8028714396273016, "eval_loss": 0.5547745823860168, "eval_precision": 0.8133141676284655, "eval_recall": 0.8214285714285714, "eval_runtime": 38.4765, "eval_samples_per_second": 74.955, "eval_steps_per_second": 9.382, "step": 500 }, { "epoch": 1.59, "grad_norm": 2.793440341949463, "learning_rate": 9.685695011878768e-05, "loss": 0.4404, "step": 510 }, { "epoch": 1.62, "grad_norm": 7.056702613830566, "learning_rate": 9.679570379159753e-05, "loss": 0.4143, "step": 520 }, { "epoch": 1.65, "grad_norm": 7.398943901062012, "learning_rate": 9.673447685474714e-05, "loss": 0.4094, "step": 530 }, { "epoch": 1.68, "grad_norm": 6.008811950683594, "learning_rate": 9.667326930823652e-05, "loss": 0.2855, "step": 540 }, { "epoch": 1.71, "grad_norm": 4.911908149719238, "learning_rate": 9.661208115206569e-05, "loss": 0.3992, "step": 550 }, { "epoch": 1.74, "grad_norm": 2.747983694076538, "learning_rate": 9.655091238623463e-05, "loss": 0.4206, "step": 560 }, { "epoch": 1.78, "grad_norm": 2.9494051933288574, "learning_rate": 9.64897630107433e-05, "loss": 0.3551, "step": 570 }, { "epoch": 1.81, "grad_norm": 3.394073486328125, "learning_rate": 9.642863302559176e-05, "loss": 0.2764, "step": 580 }, { "epoch": 1.84, "grad_norm": 4.953189373016357, "learning_rate": 9.636752243077999e-05, "loss": 0.413, "step": 590 }, { "epoch": 1.87, "grad_norm": 3.485837697982788, "learning_rate": 9.630643122630798e-05, "loss": 0.31, "step": 600 }, { "epoch": 1.87, "eval_accuracy": 0.7881414701803051, "eval_f1": 0.790614795332927, "eval_loss": 0.6006519794464111, "eval_precision": 0.8344810048759083, "eval_recall": 0.7881414701803051, "eval_runtime": 36.9835, "eval_samples_per_second": 77.981, "eval_steps_per_second": 9.761, "step": 600 }, { "epoch": 1.9, "grad_norm": 4.762577533721924, "learning_rate": 9.624535941217575e-05, "loss": 0.4681, "step": 610 }, { "epoch": 1.93, "grad_norm": 1.4589699506759644, "learning_rate": 9.618430698838328e-05, "loss": 0.4165, "step": 620 }, { "epoch": 1.96, "grad_norm": 3.6753602027893066, "learning_rate": 9.612327395493055e-05, "loss": 0.2908, "step": 630 }, { "epoch": 1.99, "grad_norm": 3.2867565155029297, "learning_rate": 9.606226031181762e-05, "loss": 0.4076, "step": 640 }, { "epoch": 2.02, "grad_norm": 2.701131582260132, "learning_rate": 9.600126605904447e-05, "loss": 0.1856, "step": 650 }, { "epoch": 2.06, "grad_norm": 4.516611099243164, "learning_rate": 9.594029119661107e-05, "loss": 0.1592, "step": 660 }, { "epoch": 2.09, "grad_norm": 3.767346143722534, "learning_rate": 9.587933572451742e-05, "loss": 0.2221, "step": 670 }, { "epoch": 2.12, "grad_norm": 5.093253135681152, "learning_rate": 9.581839964276357e-05, "loss": 0.2349, "step": 680 }, { "epoch": 2.15, "grad_norm": 2.8508188724517822, "learning_rate": 9.575748295134947e-05, "loss": 0.2499, "step": 690 }, { "epoch": 2.18, "grad_norm": 2.7991459369659424, "learning_rate": 9.569658565027513e-05, "loss": 0.1492, "step": 700 }, { "epoch": 2.18, "eval_accuracy": 0.8370319001386962, "eval_f1": 0.8339788445217495, "eval_loss": 0.4844985902309418, "eval_precision": 0.8433140578515553, "eval_recall": 0.8370319001386962, "eval_runtime": 37.1459, "eval_samples_per_second": 77.64, "eval_steps_per_second": 9.718, "step": 700 }, { "epoch": 2.21, "grad_norm": 1.0620613098144531, "learning_rate": 9.563570773954057e-05, "loss": 0.2182, "step": 710 }, { "epoch": 2.24, "grad_norm": 4.515988826751709, "learning_rate": 9.557484921914579e-05, "loss": 0.3229, "step": 720 }, { "epoch": 2.27, "grad_norm": 4.338324069976807, "learning_rate": 9.551401008909074e-05, "loss": 0.2658, "step": 730 }, { "epoch": 2.31, "grad_norm": 1.7240196466445923, "learning_rate": 9.54531903493755e-05, "loss": 0.1616, "step": 740 }, { "epoch": 2.34, "grad_norm": 12.458456039428711, "learning_rate": 9.539239000000001e-05, "loss": 0.238, "step": 750 }, { "epoch": 2.37, "grad_norm": 0.4481103718280792, "learning_rate": 9.53316090409643e-05, "loss": 0.1242, "step": 760 }, { "epoch": 2.4, "grad_norm": 1.5546164512634277, "learning_rate": 9.527084747226832e-05, "loss": 0.3907, "step": 770 }, { "epoch": 2.43, "grad_norm": 2.9556872844696045, "learning_rate": 9.521010529391213e-05, "loss": 0.2527, "step": 780 }, { "epoch": 2.46, "grad_norm": 4.50968599319458, "learning_rate": 9.514938250589572e-05, "loss": 0.2755, "step": 790 }, { "epoch": 2.49, "grad_norm": 2.879509925842285, "learning_rate": 9.508867910821905e-05, "loss": 0.185, "step": 800 }, { "epoch": 2.49, "eval_accuracy": 0.8488210818307905, "eval_f1": 0.8427497163826466, "eval_loss": 0.4501282572746277, "eval_precision": 0.841801048207429, "eval_recall": 0.8488210818307905, "eval_runtime": 36.9446, "eval_samples_per_second": 78.063, "eval_steps_per_second": 9.771, "step": 800 }, { "epoch": 2.52, "grad_norm": 2.9909250736236572, "learning_rate": 9.502799510088217e-05, "loss": 0.2218, "step": 810 }, { "epoch": 2.55, "grad_norm": 6.980457305908203, "learning_rate": 9.496733048388506e-05, "loss": 0.2174, "step": 820 }, { "epoch": 2.59, "grad_norm": 2.7773592472076416, "learning_rate": 9.49066852572277e-05, "loss": 0.1812, "step": 830 }, { "epoch": 2.62, "grad_norm": 6.581889629364014, "learning_rate": 9.484605942091012e-05, "loss": 0.166, "step": 840 }, { "epoch": 2.65, "grad_norm": 0.6430343389511108, "learning_rate": 9.478545297493231e-05, "loss": 0.1123, "step": 850 }, { "epoch": 2.68, "grad_norm": 4.716948509216309, "learning_rate": 9.472486591929429e-05, "loss": 0.1928, "step": 860 }, { "epoch": 2.71, "grad_norm": 5.417154312133789, "learning_rate": 9.466429825399598e-05, "loss": 0.3012, "step": 870 }, { "epoch": 2.74, "grad_norm": 6.211470127105713, "learning_rate": 9.460374997903748e-05, "loss": 0.2583, "step": 880 }, { "epoch": 2.77, "grad_norm": 0.7301052808761597, "learning_rate": 9.454322109441874e-05, "loss": 0.3225, "step": 890 }, { "epoch": 2.8, "grad_norm": 5.008696556091309, "learning_rate": 9.448271160013974e-05, "loss": 0.2438, "step": 900 }, { "epoch": 2.8, "eval_accuracy": 0.8439667128987517, "eval_f1": 0.8337945015001851, "eval_loss": 0.49756672978401184, "eval_precision": 0.8412404392497574, "eval_recall": 0.8439667128987517, "eval_runtime": 37.4296, "eval_samples_per_second": 77.051, "eval_steps_per_second": 9.645, "step": 900 }, { "epoch": 2.83, "grad_norm": 4.861326217651367, "learning_rate": 9.442222149620054e-05, "loss": 0.2704, "step": 910 }, { "epoch": 2.87, "grad_norm": 6.5918049812316895, "learning_rate": 9.436175078260111e-05, "loss": 0.3867, "step": 920 }, { "epoch": 2.9, "grad_norm": 0.6792032122612, "learning_rate": 9.430129945934142e-05, "loss": 0.2872, "step": 930 }, { "epoch": 2.93, "grad_norm": 7.400968074798584, "learning_rate": 9.424086752642153e-05, "loss": 0.234, "step": 940 }, { "epoch": 2.96, "grad_norm": 1.187596321105957, "learning_rate": 9.41804549838414e-05, "loss": 0.1667, "step": 950 }, { "epoch": 2.99, "grad_norm": 3.2632896900177, "learning_rate": 9.412006183160102e-05, "loss": 0.1619, "step": 960 }, { "epoch": 3.02, "grad_norm": 3.960529327392578, "learning_rate": 9.405968806970041e-05, "loss": 0.0815, "step": 970 }, { "epoch": 3.05, "grad_norm": 1.774207592010498, "learning_rate": 9.399933369813957e-05, "loss": 0.0998, "step": 980 }, { "epoch": 3.08, "grad_norm": 0.6511847972869873, "learning_rate": 9.393899871691853e-05, "loss": 0.0799, "step": 990 }, { "epoch": 3.12, "grad_norm": 7.667677879333496, "learning_rate": 9.387868312603721e-05, "loss": 0.0604, "step": 1000 }, { "epoch": 3.12, "eval_accuracy": 0.8408460471567267, "eval_f1": 0.8405195257417607, "eval_loss": 0.5850290060043335, "eval_precision": 0.8425271150900508, "eval_recall": 0.8408460471567267, "eval_runtime": 35.5098, "eval_samples_per_second": 81.217, "eval_steps_per_second": 10.166, "step": 1000 }, { "epoch": 3.15, "grad_norm": 3.683607578277588, "learning_rate": 9.381838692549569e-05, "loss": 0.0586, "step": 1010 }, { "epoch": 3.18, "grad_norm": 3.7742743492126465, "learning_rate": 9.375811011529392e-05, "loss": 0.1661, "step": 1020 }, { "epoch": 3.21, "grad_norm": 3.867095470428467, "learning_rate": 9.36978526954319e-05, "loss": 0.0806, "step": 1030 }, { "epoch": 3.24, "grad_norm": 1.3434181213378906, "learning_rate": 9.363761466590968e-05, "loss": 0.1466, "step": 1040 }, { "epoch": 3.27, "grad_norm": 1.3125090599060059, "learning_rate": 9.357739602672723e-05, "loss": 0.0751, "step": 1050 }, { "epoch": 3.3, "grad_norm": 6.323131561279297, "learning_rate": 9.351719677788455e-05, "loss": 0.0722, "step": 1060 }, { "epoch": 3.33, "grad_norm": 0.1433458775281906, "learning_rate": 9.34570169193816e-05, "loss": 0.1136, "step": 1070 }, { "epoch": 3.36, "grad_norm": 5.401894569396973, "learning_rate": 9.339685645121846e-05, "loss": 0.1496, "step": 1080 }, { "epoch": 3.4, "grad_norm": 3.846517562866211, "learning_rate": 9.333671537339507e-05, "loss": 0.0966, "step": 1090 }, { "epoch": 3.43, "grad_norm": 0.643323540687561, "learning_rate": 9.327659368591143e-05, "loss": 0.0545, "step": 1100 }, { "epoch": 3.43, "eval_accuracy": 0.8491678224687933, "eval_f1": 0.8444584144624515, "eval_loss": 0.5684630274772644, "eval_precision": 0.8476053349150144, "eval_recall": 0.8491678224687933, "eval_runtime": 35.9415, "eval_samples_per_second": 80.242, "eval_steps_per_second": 10.044, "step": 1100 }, { "epoch": 3.46, "grad_norm": 1.1942723989486694, "learning_rate": 9.321649138876758e-05, "loss": 0.0621, "step": 1110 }, { "epoch": 3.49, "grad_norm": 4.568288803100586, "learning_rate": 9.31564084819635e-05, "loss": 0.1283, "step": 1120 }, { "epoch": 3.52, "grad_norm": 5.671831130981445, "learning_rate": 9.309634496549916e-05, "loss": 0.1161, "step": 1130 }, { "epoch": 3.55, "grad_norm": 4.5746917724609375, "learning_rate": 9.303630083937462e-05, "loss": 0.1513, "step": 1140 }, { "epoch": 3.58, "grad_norm": 4.237031936645508, "learning_rate": 9.297627610358985e-05, "loss": 0.0785, "step": 1150 }, { "epoch": 3.61, "grad_norm": 1.3817404508590698, "learning_rate": 9.291627075814483e-05, "loss": 0.1151, "step": 1160 }, { "epoch": 3.64, "grad_norm": 0.11316975206136703, "learning_rate": 9.285628480303956e-05, "loss": 0.0901, "step": 1170 }, { "epoch": 3.68, "grad_norm": 0.21596325933933258, "learning_rate": 9.279631823827408e-05, "loss": 0.1352, "step": 1180 }, { "epoch": 3.71, "grad_norm": 4.8247809410095215, "learning_rate": 9.273637106384838e-05, "loss": 0.0838, "step": 1190 }, { "epoch": 3.74, "grad_norm": 2.2498106956481934, "learning_rate": 9.267644327976243e-05, "loss": 0.0719, "step": 1200 }, { "epoch": 3.74, "eval_accuracy": 0.8522884882108183, "eval_f1": 0.8444890358302154, "eval_loss": 0.6310722231864929, "eval_precision": 0.8490390048196649, "eval_recall": 0.8522884882108183, "eval_runtime": 35.788, "eval_samples_per_second": 80.586, "eval_steps_per_second": 10.087, "step": 1200 }, { "epoch": 3.77, "grad_norm": 4.390284061431885, "learning_rate": 9.261653488601624e-05, "loss": 0.0779, "step": 1210 }, { "epoch": 3.8, "grad_norm": 2.4371981620788574, "learning_rate": 9.255664588260985e-05, "loss": 0.2537, "step": 1220 }, { "epoch": 3.83, "grad_norm": 6.534292221069336, "learning_rate": 9.249677626954318e-05, "loss": 0.1229, "step": 1230 }, { "epoch": 3.86, "grad_norm": 0.5418087244033813, "learning_rate": 9.243692604681632e-05, "loss": 0.0768, "step": 1240 }, { "epoch": 3.89, "grad_norm": 15.01469898223877, "learning_rate": 9.237709521442921e-05, "loss": 0.1121, "step": 1250 }, { "epoch": 3.93, "grad_norm": 0.7105635404586792, "learning_rate": 9.231728377238189e-05, "loss": 0.1304, "step": 1260 }, { "epoch": 3.96, "grad_norm": 2.275129795074463, "learning_rate": 9.225749172067429e-05, "loss": 0.1176, "step": 1270 }, { "epoch": 3.99, "grad_norm": 0.08892109990119934, "learning_rate": 9.21977190593065e-05, "loss": 0.1751, "step": 1280 }, { "epoch": 4.02, "grad_norm": 0.09105070680379868, "learning_rate": 9.213796578827847e-05, "loss": 0.179, "step": 1290 }, { "epoch": 4.05, "grad_norm": 4.883812427520752, "learning_rate": 9.207823190759017e-05, "loss": 0.0809, "step": 1300 }, { "epoch": 4.05, "eval_accuracy": 0.8561026352288488, "eval_f1": 0.8527753193864097, "eval_loss": 0.5320934653282166, "eval_precision": 0.8515337008001435, "eval_recall": 0.8561026352288488, "eval_runtime": 35.6371, "eval_samples_per_second": 80.927, "eval_steps_per_second": 10.13, "step": 1300 }, { "epoch": 4.08, "grad_norm": 0.3753306269645691, "learning_rate": 9.201851741724169e-05, "loss": 0.0181, "step": 1310 }, { "epoch": 4.11, "grad_norm": 0.15603381395339966, "learning_rate": 9.195882231723296e-05, "loss": 0.0306, "step": 1320 }, { "epoch": 4.14, "grad_norm": 0.13865770399570465, "learning_rate": 9.189914660756399e-05, "loss": 0.0212, "step": 1330 }, { "epoch": 4.17, "grad_norm": 0.05446525663137436, "learning_rate": 9.183949028823477e-05, "loss": 0.0186, "step": 1340 }, { "epoch": 4.21, "grad_norm": 0.13288724422454834, "learning_rate": 9.177985335924536e-05, "loss": 0.0805, "step": 1350 }, { "epoch": 4.24, "grad_norm": 0.02726615034043789, "learning_rate": 9.17202358205957e-05, "loss": 0.038, "step": 1360 }, { "epoch": 4.27, "grad_norm": 0.43022477626800537, "learning_rate": 9.16606376722858e-05, "loss": 0.079, "step": 1370 }, { "epoch": 4.3, "grad_norm": 8.176656723022461, "learning_rate": 9.160105891431567e-05, "loss": 0.0821, "step": 1380 }, { "epoch": 4.33, "grad_norm": 10.36427116394043, "learning_rate": 9.154149954668532e-05, "loss": 0.0604, "step": 1390 }, { "epoch": 4.36, "grad_norm": 0.1851787269115448, "learning_rate": 9.14819595693947e-05, "loss": 0.0259, "step": 1400 }, { "epoch": 4.36, "eval_accuracy": 0.8408460471567267, "eval_f1": 0.8288193262531939, "eval_loss": 0.8158000707626343, "eval_precision": 0.8329411191101798, "eval_recall": 0.8408460471567267, "eval_runtime": 35.3881, "eval_samples_per_second": 81.496, "eval_steps_per_second": 10.201, "step": 1400 }, { "epoch": 4.39, "grad_norm": 6.811817169189453, "learning_rate": 9.14224389824439e-05, "loss": 0.1077, "step": 1410 }, { "epoch": 4.42, "grad_norm": 0.7091176509857178, "learning_rate": 9.136293778583283e-05, "loss": 0.0112, "step": 1420 }, { "epoch": 4.45, "grad_norm": 0.18676434457302094, "learning_rate": 9.130345597956153e-05, "loss": 0.0525, "step": 1430 }, { "epoch": 4.49, "grad_norm": 3.2488603591918945, "learning_rate": 9.124399356363002e-05, "loss": 0.127, "step": 1440 }, { "epoch": 4.52, "grad_norm": 7.18580961227417, "learning_rate": 9.118455053803827e-05, "loss": 0.1577, "step": 1450 }, { "epoch": 4.55, "grad_norm": 0.014005272649228573, "learning_rate": 9.112512690278629e-05, "loss": 0.0383, "step": 1460 }, { "epoch": 4.58, "grad_norm": 4.266292095184326, "learning_rate": 9.106572265787405e-05, "loss": 0.0452, "step": 1470 }, { "epoch": 4.61, "grad_norm": 6.448909282684326, "learning_rate": 9.100633780330161e-05, "loss": 0.1252, "step": 1480 }, { "epoch": 4.64, "grad_norm": 0.5100437998771667, "learning_rate": 9.094697233906893e-05, "loss": 0.1767, "step": 1490 }, { "epoch": 4.67, "grad_norm": 0.4025322496891022, "learning_rate": 9.088762626517599e-05, "loss": 0.0586, "step": 1500 }, { "epoch": 4.67, "eval_accuracy": 0.8314840499306518, "eval_f1": 0.8248902198935069, "eval_loss": 0.7027890086174011, "eval_precision": 0.835863783837249, "eval_recall": 0.8314840499306518, "eval_runtime": 36.7655, "eval_samples_per_second": 78.443, "eval_steps_per_second": 9.819, "step": 1500 }, { "epoch": 4.7, "grad_norm": 0.016589829698204994, "learning_rate": 9.082829958162286e-05, "loss": 0.0305, "step": 1510 }, { "epoch": 4.74, "grad_norm": 3.5105388164520264, "learning_rate": 9.076899228840948e-05, "loss": 0.0072, "step": 1520 }, { "epoch": 4.77, "grad_norm": 0.04372234269976616, "learning_rate": 9.070970438553588e-05, "loss": 0.0515, "step": 1530 }, { "epoch": 4.8, "grad_norm": 1.5736526250839233, "learning_rate": 9.0650435873002e-05, "loss": 0.0861, "step": 1540 }, { "epoch": 4.83, "grad_norm": 0.2429836392402649, "learning_rate": 9.059118675080795e-05, "loss": 0.1179, "step": 1550 }, { "epoch": 4.86, "grad_norm": 0.09316762536764145, "learning_rate": 9.053195701895363e-05, "loss": 0.0479, "step": 1560 }, { "epoch": 4.89, "grad_norm": 5.398721218109131, "learning_rate": 9.047274667743907e-05, "loss": 0.0712, "step": 1570 }, { "epoch": 4.92, "grad_norm": 0.16753439605236053, "learning_rate": 9.041355572626431e-05, "loss": 0.0371, "step": 1580 }, { "epoch": 4.95, "grad_norm": 1.2653841972351074, "learning_rate": 9.035438416542931e-05, "loss": 0.0113, "step": 1590 }, { "epoch": 4.98, "grad_norm": 0.04981589317321777, "learning_rate": 9.029523199493405e-05, "loss": 0.0218, "step": 1600 }, { "epoch": 4.98, "eval_accuracy": 0.8380721220527045, "eval_f1": 0.8315617007851284, "eval_loss": 0.8059434294700623, "eval_precision": 0.8379796199992479, "eval_recall": 0.8380721220527045, "eval_runtime": 36.8, "eval_samples_per_second": 78.37, "eval_steps_per_second": 9.81, "step": 1600 }, { "epoch": 5.02, "grad_norm": 0.02359694428741932, "learning_rate": 9.023609921477859e-05, "loss": 0.0477, "step": 1610 }, { "epoch": 5.05, "grad_norm": 0.0039889197796583176, "learning_rate": 9.01769858249629e-05, "loss": 0.0296, "step": 1620 }, { "epoch": 5.08, "grad_norm": 0.005103422328829765, "learning_rate": 9.011789182548695e-05, "loss": 0.0268, "step": 1630 }, { "epoch": 5.11, "grad_norm": 0.03737448528409004, "learning_rate": 9.005881721635077e-05, "loss": 0.0261, "step": 1640 }, { "epoch": 5.14, "grad_norm": 0.11978302896022797, "learning_rate": 8.999976199755439e-05, "loss": 0.047, "step": 1650 }, { "epoch": 5.17, "grad_norm": 0.2578228712081909, "learning_rate": 8.994072616909775e-05, "loss": 0.0055, "step": 1660 }, { "epoch": 5.2, "grad_norm": 0.010554115287959576, "learning_rate": 8.988170973098087e-05, "loss": 0.0138, "step": 1670 }, { "epoch": 5.23, "grad_norm": 0.31181907653808594, "learning_rate": 8.982271268320377e-05, "loss": 0.0359, "step": 1680 }, { "epoch": 5.26, "grad_norm": 0.14737311005592346, "learning_rate": 8.976373502576646e-05, "loss": 0.0229, "step": 1690 }, { "epoch": 5.3, "grad_norm": 0.08029153198003769, "learning_rate": 8.970477675866888e-05, "loss": 0.0108, "step": 1700 }, { "epoch": 5.3, "eval_accuracy": 0.8474341192787794, "eval_f1": 0.8469746641885382, "eval_loss": 0.7947927117347717, "eval_precision": 0.8483617060803095, "eval_recall": 0.8474341192787794, "eval_runtime": 36.6238, "eval_samples_per_second": 78.747, "eval_steps_per_second": 9.857, "step": 1700 }, { "epoch": 5.33, "grad_norm": 0.0705069750547409, "learning_rate": 8.964583788191108e-05, "loss": 0.0242, "step": 1710 }, { "epoch": 5.36, "grad_norm": 0.6708030700683594, "learning_rate": 8.958691839549306e-05, "loss": 0.1067, "step": 1720 }, { "epoch": 5.39, "grad_norm": 0.2017916589975357, "learning_rate": 8.952801829941481e-05, "loss": 0.0229, "step": 1730 }, { "epoch": 5.42, "grad_norm": 1.3122460842132568, "learning_rate": 8.94691375936763e-05, "loss": 0.0172, "step": 1740 }, { "epoch": 5.45, "grad_norm": 0.001811229856684804, "learning_rate": 8.941027627827759e-05, "loss": 0.0169, "step": 1750 }, { "epoch": 5.48, "grad_norm": 0.027441857382655144, "learning_rate": 8.935143435321863e-05, "loss": 0.0333, "step": 1760 }, { "epoch": 5.51, "grad_norm": 0.015237187966704369, "learning_rate": 8.929261181849943e-05, "loss": 0.0804, "step": 1770 }, { "epoch": 5.55, "grad_norm": 0.010735694319009781, "learning_rate": 8.923380867412002e-05, "loss": 0.0556, "step": 1780 }, { "epoch": 5.58, "grad_norm": 4.350671768188477, "learning_rate": 8.917502492008036e-05, "loss": 0.0199, "step": 1790 }, { "epoch": 5.61, "grad_norm": 8.591043472290039, "learning_rate": 8.911626055638046e-05, "loss": 0.1129, "step": 1800 }, { "epoch": 5.61, "eval_accuracy": 0.8425797503467406, "eval_f1": 0.8430512515932435, "eval_loss": 0.8089113831520081, "eval_precision": 0.8492235808576796, "eval_recall": 0.8425797503467406, "eval_runtime": 36.2744, "eval_samples_per_second": 79.505, "eval_steps_per_second": 9.952, "step": 1800 }, { "epoch": 5.61, "step": 1800, "total_flos": 2.2287694956200755e+18, "train_loss": 0.26335394654423, "train_runtime": 1291.437, "train_samples_per_second": 397.077, "train_steps_per_second": 24.856 } ], "logging_steps": 10, "max_steps": 32100, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 100, "total_flos": 2.2287694956200755e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }