{ "best_metric": 0.9778142974527526, "best_model_checkpoint": "teacher-status-van-tiny-256-1-2/checkpoint-703", "epoch": 29.55223880597015, "eval_steps": 500, "global_step": 990, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.3, "learning_rate": 5.050505050505051e-06, "loss": 0.6865, "step": 10 }, { "epoch": 0.6, "learning_rate": 1.0101010101010101e-05, "loss": 0.683, "step": 20 }, { "epoch": 0.9, "learning_rate": 1.5151515151515153e-05, "loss": 0.6722, "step": 30 }, { "epoch": 0.99, "eval_accuracy": 0.640083945435467, "eval_f1_score": 0.780550223928343, "eval_loss": 0.6498541831970215, "eval_precision": 0.640083945435467, "eval_recall": 1.0, "eval_runtime": 6.6964, "eval_samples_per_second": 142.315, "eval_steps_per_second": 1.195, "step": 33 }, { "epoch": 1.19, "learning_rate": 2.0202020202020203e-05, "loss": 0.646, "step": 40 }, { "epoch": 1.49, "learning_rate": 2.5252525252525256e-05, "loss": 0.6078, "step": 50 }, { "epoch": 1.79, "learning_rate": 3.0303030303030306e-05, "loss": 0.5431, "step": 60 }, { "epoch": 2.0, "eval_accuracy": 0.7817418677859391, "eval_f1_score": 0.8531073446327684, "eval_loss": 0.4163793921470642, "eval_precision": 0.749379652605459, "eval_recall": 0.9901639344262295, "eval_runtime": 6.7638, "eval_samples_per_second": 140.898, "eval_steps_per_second": 1.183, "step": 67 }, { "epoch": 2.09, "learning_rate": 3.535353535353535e-05, "loss": 0.497, "step": 70 }, { "epoch": 2.39, "learning_rate": 4.0404040404040405e-05, "loss": 0.4483, "step": 80 }, { "epoch": 2.69, "learning_rate": 4.545454545454546e-05, "loss": 0.4193, "step": 90 }, { "epoch": 2.99, "learning_rate": 4.994388327721661e-05, "loss": 0.393, "step": 100 }, { "epoch": 2.99, "eval_accuracy": 0.887722980062959, "eval_f1_score": 0.9078380706287683, "eval_loss": 0.2832907438278198, "eval_precision": 0.956442831215971, "eval_recall": 0.8639344262295082, "eval_runtime": 6.6321, "eval_samples_per_second": 143.695, "eval_steps_per_second": 1.206, "step": 100 }, { "epoch": 3.28, "learning_rate": 4.938271604938271e-05, "loss": 0.3729, "step": 110 }, { "epoch": 3.58, "learning_rate": 4.882154882154882e-05, "loss": 0.3616, "step": 120 }, { "epoch": 3.88, "learning_rate": 4.8260381593714935e-05, "loss": 0.354, "step": 130 }, { "epoch": 4.0, "eval_accuracy": 0.9275970619097587, "eval_f1_score": 0.9435813573180704, "eval_loss": 0.19304586946964264, "eval_precision": 0.9412724306688418, "eval_recall": 0.9459016393442623, "eval_runtime": 6.5785, "eval_samples_per_second": 144.865, "eval_steps_per_second": 1.216, "step": 134 }, { "epoch": 4.18, "learning_rate": 4.7699214365881036e-05, "loss": 0.3214, "step": 140 }, { "epoch": 4.48, "learning_rate": 4.713804713804714e-05, "loss": 0.3272, "step": 150 }, { "epoch": 4.78, "learning_rate": 4.6576879910213244e-05, "loss": 0.3007, "step": 160 }, { "epoch": 4.99, "eval_accuracy": 0.9370409233997902, "eval_f1_score": 0.9510603588907015, "eval_loss": 0.15851934254169464, "eval_precision": 0.9464285714285714, "eval_recall": 0.9557377049180328, "eval_runtime": 6.4566, "eval_samples_per_second": 147.6, "eval_steps_per_second": 1.239, "step": 167 }, { "epoch": 5.07, "learning_rate": 4.601571268237935e-05, "loss": 0.3006, "step": 170 }, { "epoch": 5.37, "learning_rate": 4.545454545454546e-05, "loss": 0.2935, "step": 180 }, { "epoch": 5.67, "learning_rate": 4.4893378226711566e-05, "loss": 0.3092, "step": 190 }, { "epoch": 5.97, "learning_rate": 4.433221099887767e-05, "loss": 0.2898, "step": 200 }, { "epoch": 6.0, "eval_accuracy": 0.9464847848898216, "eval_f1_score": 0.9580936729663106, "eval_loss": 0.144499272108078, "eval_precision": 0.9604612850082372, "eval_recall": 0.9557377049180328, "eval_runtime": 6.0258, "eval_samples_per_second": 158.154, "eval_steps_per_second": 1.328, "step": 201 }, { "epoch": 6.27, "learning_rate": 4.3771043771043774e-05, "loss": 0.2696, "step": 210 }, { "epoch": 6.57, "learning_rate": 4.3209876543209875e-05, "loss": 0.3025, "step": 220 }, { "epoch": 6.87, "learning_rate": 4.264870931537598e-05, "loss": 0.2824, "step": 230 }, { "epoch": 6.99, "eval_accuracy": 0.9464847848898216, "eval_f1_score": 0.9579554822753504, "eval_loss": 0.13527622818946838, "eval_precision": 0.9635157545605307, "eval_recall": 0.9524590163934427, "eval_runtime": 6.0159, "eval_samples_per_second": 158.414, "eval_steps_per_second": 1.33, "step": 234 }, { "epoch": 7.16, "learning_rate": 4.208754208754209e-05, "loss": 0.2782, "step": 240 }, { "epoch": 7.46, "learning_rate": 4.15263748597082e-05, "loss": 0.2629, "step": 250 }, { "epoch": 7.76, "learning_rate": 4.0965207631874305e-05, "loss": 0.2763, "step": 260 }, { "epoch": 8.0, "eval_accuracy": 0.9485834207764953, "eval_f1_score": 0.960323886639676, "eval_loss": 0.13593612611293793, "eval_precision": 0.9488, "eval_recall": 0.9721311475409836, "eval_runtime": 6.3017, "eval_samples_per_second": 151.228, "eval_steps_per_second": 1.269, "step": 268 }, { "epoch": 8.06, "learning_rate": 4.0404040404040405e-05, "loss": 0.2698, "step": 270 }, { "epoch": 8.36, "learning_rate": 3.984287317620651e-05, "loss": 0.2621, "step": 280 }, { "epoch": 8.66, "learning_rate": 3.9281705948372613e-05, "loss": 0.2631, "step": 290 }, { "epoch": 8.96, "learning_rate": 3.872053872053872e-05, "loss": 0.2473, "step": 300 }, { "epoch": 8.99, "eval_accuracy": 0.9569779643231899, "eval_f1_score": 0.9664209664209664, "eval_loss": 0.12131528556346893, "eval_precision": 0.9656301145662848, "eval_recall": 0.9672131147540983, "eval_runtime": 6.691, "eval_samples_per_second": 142.431, "eval_steps_per_second": 1.196, "step": 301 }, { "epoch": 9.25, "learning_rate": 3.815937149270483e-05, "loss": 0.2339, "step": 310 }, { "epoch": 9.55, "learning_rate": 3.7598204264870936e-05, "loss": 0.249, "step": 320 }, { "epoch": 9.85, "learning_rate": 3.7037037037037037e-05, "loss": 0.2598, "step": 330 }, { "epoch": 10.0, "eval_accuracy": 0.9569779643231899, "eval_f1_score": 0.966530612244898, "eval_loss": 0.10907502472400665, "eval_precision": 0.9626016260162602, "eval_recall": 0.9704918032786886, "eval_runtime": 6.9942, "eval_samples_per_second": 136.256, "eval_steps_per_second": 1.144, "step": 335 }, { "epoch": 10.15, "learning_rate": 3.6475869809203144e-05, "loss": 0.2497, "step": 340 }, { "epoch": 10.45, "learning_rate": 3.5914702581369245e-05, "loss": 0.253, "step": 350 }, { "epoch": 10.75, "learning_rate": 3.535353535353535e-05, "loss": 0.2476, "step": 360 }, { "epoch": 10.99, "eval_accuracy": 0.9632738719832109, "eval_f1_score": 0.9714285714285714, "eval_loss": 0.1040654331445694, "eval_precision": 0.967479674796748, "eval_recall": 0.9754098360655737, "eval_runtime": 6.6817, "eval_samples_per_second": 142.629, "eval_steps_per_second": 1.197, "step": 368 }, { "epoch": 11.04, "learning_rate": 3.4792368125701466e-05, "loss": 0.2314, "step": 370 }, { "epoch": 11.34, "learning_rate": 3.423120089786757e-05, "loss": 0.2422, "step": 380 }, { "epoch": 11.64, "learning_rate": 3.3670033670033675e-05, "loss": 0.2406, "step": 390 }, { "epoch": 11.94, "learning_rate": 3.3108866442199775e-05, "loss": 0.2376, "step": 400 }, { "epoch": 12.0, "eval_accuracy": 0.9601259181532005, "eval_f1_score": 0.9686468646864688, "eval_loss": 0.09974055737257004, "eval_precision": 0.9750830564784053, "eval_recall": 0.9622950819672131, "eval_runtime": 6.6227, "eval_samples_per_second": 143.899, "eval_steps_per_second": 1.208, "step": 402 }, { "epoch": 12.24, "learning_rate": 3.254769921436588e-05, "loss": 0.2282, "step": 410 }, { "epoch": 12.54, "learning_rate": 3.198653198653199e-05, "loss": 0.2264, "step": 420 }, { "epoch": 12.84, "learning_rate": 3.14253647586981e-05, "loss": 0.2402, "step": 430 }, { "epoch": 12.99, "eval_accuracy": 0.9622245540398741, "eval_f1_score": 0.9703947368421052, "eval_loss": 0.09718549996614456, "eval_precision": 0.9735973597359736, "eval_recall": 0.9672131147540983, "eval_runtime": 6.1102, "eval_samples_per_second": 155.969, "eval_steps_per_second": 1.309, "step": 435 }, { "epoch": 13.13, "learning_rate": 3.08641975308642e-05, "loss": 0.2216, "step": 440 }, { "epoch": 13.43, "learning_rate": 3.0303030303030306e-05, "loss": 0.2361, "step": 450 }, { "epoch": 13.73, "learning_rate": 2.9741863075196406e-05, "loss": 0.2324, "step": 460 }, { "epoch": 14.0, "eval_accuracy": 0.9664218258132214, "eval_f1_score": 0.973941368078176, "eval_loss": 0.09498707950115204, "eval_precision": 0.9676375404530745, "eval_recall": 0.980327868852459, "eval_runtime": 6.0066, "eval_samples_per_second": 158.659, "eval_steps_per_second": 1.332, "step": 469 }, { "epoch": 14.03, "learning_rate": 2.9180695847362517e-05, "loss": 0.2223, "step": 470 }, { "epoch": 14.33, "learning_rate": 2.8619528619528618e-05, "loss": 0.213, "step": 480 }, { "epoch": 14.63, "learning_rate": 2.8058361391694725e-05, "loss": 0.2206, "step": 490 }, { "epoch": 14.93, "learning_rate": 2.7497194163860833e-05, "loss": 0.2256, "step": 500 }, { "epoch": 14.99, "eval_accuracy": 0.9706190975865687, "eval_f1_score": 0.9770114942528735, "eval_loss": 0.09094734489917755, "eval_precision": 0.9786184210526315, "eval_recall": 0.9754098360655737, "eval_runtime": 5.9668, "eval_samples_per_second": 159.718, "eval_steps_per_second": 1.341, "step": 502 }, { "epoch": 15.22, "learning_rate": 2.6936026936026937e-05, "loss": 0.209, "step": 510 }, { "epoch": 15.52, "learning_rate": 2.6374859708193044e-05, "loss": 0.2254, "step": 520 }, { "epoch": 15.82, "learning_rate": 2.581369248035915e-05, "loss": 0.21, "step": 530 }, { "epoch": 16.0, "eval_accuracy": 0.9622245540398741, "eval_f1_score": 0.970345963756178, "eval_loss": 0.09220422059297562, "eval_precision": 0.9751655629139073, "eval_recall": 0.9655737704918033, "eval_runtime": 6.5986, "eval_samples_per_second": 144.424, "eval_steps_per_second": 1.212, "step": 536 }, { "epoch": 16.12, "learning_rate": 2.5252525252525256e-05, "loss": 0.2114, "step": 540 }, { "epoch": 16.42, "learning_rate": 2.4691358024691357e-05, "loss": 0.1963, "step": 550 }, { "epoch": 16.72, "learning_rate": 2.4130190796857467e-05, "loss": 0.217, "step": 560 }, { "epoch": 16.99, "eval_accuracy": 0.9611752360965372, "eval_f1_score": 0.9695473251028807, "eval_loss": 0.09329694509506226, "eval_precision": 0.9735537190082645, "eval_recall": 0.9655737704918033, "eval_runtime": 6.5505, "eval_samples_per_second": 145.485, "eval_steps_per_second": 1.221, "step": 569 }, { "epoch": 17.01, "learning_rate": 2.356902356902357e-05, "loss": 0.219, "step": 570 }, { "epoch": 17.31, "learning_rate": 2.3007856341189676e-05, "loss": 0.2154, "step": 580 }, { "epoch": 17.61, "learning_rate": 2.2446689113355783e-05, "loss": 0.2157, "step": 590 }, { "epoch": 17.91, "learning_rate": 2.1885521885521887e-05, "loss": 0.2092, "step": 600 }, { "epoch": 18.0, "eval_accuracy": 0.9664218258132214, "eval_f1_score": 0.9738134206219312, "eval_loss": 0.08909059315919876, "eval_precision": 0.9722222222222222, "eval_recall": 0.9754098360655737, "eval_runtime": 5.9927, "eval_samples_per_second": 159.027, "eval_steps_per_second": 1.335, "step": 603 }, { "epoch": 18.21, "learning_rate": 2.132435465768799e-05, "loss": 0.1979, "step": 610 }, { "epoch": 18.51, "learning_rate": 2.07631874298541e-05, "loss": 0.1975, "step": 620 }, { "epoch": 18.81, "learning_rate": 2.0202020202020203e-05, "loss": 0.2063, "step": 630 }, { "epoch": 18.99, "eval_accuracy": 0.9653725078698846, "eval_f1_score": 0.972972972972973, "eval_loss": 0.09130384773015976, "eval_precision": 0.972176759410802, "eval_recall": 0.9737704918032787, "eval_runtime": 5.9748, "eval_samples_per_second": 159.504, "eval_steps_per_second": 1.339, "step": 636 }, { "epoch": 19.1, "learning_rate": 1.9640852974186307e-05, "loss": 0.215, "step": 640 }, { "epoch": 19.4, "learning_rate": 1.9079685746352414e-05, "loss": 0.214, "step": 650 }, { "epoch": 19.7, "learning_rate": 1.8518518518518518e-05, "loss": 0.2072, "step": 660 }, { "epoch": 20.0, "learning_rate": 1.7957351290684622e-05, "loss": 0.2217, "step": 670 }, { "epoch": 20.0, "eval_accuracy": 0.9643231899265478, "eval_f1_score": 0.971993410214168, "eval_loss": 0.09169190376996994, "eval_precision": 0.9768211920529801, "eval_recall": 0.9672131147540983, "eval_runtime": 6.2954, "eval_samples_per_second": 151.381, "eval_steps_per_second": 1.271, "step": 670 }, { "epoch": 20.3, "learning_rate": 1.7396184062850733e-05, "loss": 0.1996, "step": 680 }, { "epoch": 20.6, "learning_rate": 1.6835016835016837e-05, "loss": 0.1968, "step": 690 }, { "epoch": 20.9, "learning_rate": 1.627384960718294e-05, "loss": 0.1952, "step": 700 }, { "epoch": 20.99, "eval_accuracy": 0.9716684155299056, "eval_f1_score": 0.9778142974527526, "eval_loss": 0.0859055444598198, "eval_precision": 0.9802306425041186, "eval_recall": 0.9754098360655737, "eval_runtime": 6.6363, "eval_samples_per_second": 143.604, "eval_steps_per_second": 1.205, "step": 703 }, { "epoch": 21.19, "learning_rate": 1.571268237934905e-05, "loss": 0.2051, "step": 710 }, { "epoch": 21.49, "learning_rate": 1.5151515151515153e-05, "loss": 0.1927, "step": 720 }, { "epoch": 21.79, "learning_rate": 1.4590347923681259e-05, "loss": 0.2068, "step": 730 }, { "epoch": 22.0, "eval_accuracy": 0.968520461699895, "eval_f1_score": 0.9754500818330606, "eval_loss": 0.09068847447633743, "eval_precision": 0.9738562091503268, "eval_recall": 0.9770491803278688, "eval_runtime": 6.1893, "eval_samples_per_second": 153.975, "eval_steps_per_second": 1.293, "step": 737 }, { "epoch": 22.09, "learning_rate": 1.4029180695847363e-05, "loss": 0.1957, "step": 740 }, { "epoch": 22.39, "learning_rate": 1.3468013468013468e-05, "loss": 0.2021, "step": 750 }, { "epoch": 22.69, "learning_rate": 1.2906846240179574e-05, "loss": 0.202, "step": 760 }, { "epoch": 22.99, "learning_rate": 1.2345679012345678e-05, "loss": 0.1914, "step": 770 }, { "epoch": 22.99, "eval_accuracy": 0.9695697796432319, "eval_f1_score": 0.9762878168438266, "eval_loss": 0.08471482992172241, "eval_precision": 0.9738988580750407, "eval_recall": 0.978688524590164, "eval_runtime": 5.9499, "eval_samples_per_second": 160.172, "eval_steps_per_second": 1.345, "step": 770 }, { "epoch": 23.28, "learning_rate": 1.1784511784511786e-05, "loss": 0.1937, "step": 780 }, { "epoch": 23.58, "learning_rate": 1.1223344556677892e-05, "loss": 0.1876, "step": 790 }, { "epoch": 23.88, "learning_rate": 1.0662177328843996e-05, "loss": 0.1961, "step": 800 }, { "epoch": 24.0, "eval_accuracy": 0.968520461699895, "eval_f1_score": 0.9754500818330606, "eval_loss": 0.08703567832708359, "eval_precision": 0.9738562091503268, "eval_recall": 0.9770491803278688, "eval_runtime": 6.0627, "eval_samples_per_second": 157.191, "eval_steps_per_second": 1.32, "step": 804 }, { "epoch": 24.18, "learning_rate": 1.0101010101010101e-05, "loss": 0.1817, "step": 810 }, { "epoch": 24.48, "learning_rate": 9.539842873176207e-06, "loss": 0.1869, "step": 820 }, { "epoch": 24.78, "learning_rate": 8.978675645342311e-06, "loss": 0.1911, "step": 830 }, { "epoch": 24.99, "eval_accuracy": 0.9664218258132214, "eval_f1_score": 0.9738562091503268, "eval_loss": 0.08837948739528656, "eval_precision": 0.9706840390879479, "eval_recall": 0.9770491803278688, "eval_runtime": 6.585, "eval_samples_per_second": 144.722, "eval_steps_per_second": 1.215, "step": 837 }, { "epoch": 25.07, "learning_rate": 8.417508417508419e-06, "loss": 0.1904, "step": 840 }, { "epoch": 25.37, "learning_rate": 7.856341189674524e-06, "loss": 0.1831, "step": 850 }, { "epoch": 25.67, "learning_rate": 7.295173961840629e-06, "loss": 0.184, "step": 860 }, { "epoch": 25.97, "learning_rate": 6.734006734006734e-06, "loss": 0.1961, "step": 870 }, { "epoch": 26.0, "eval_accuracy": 0.968520461699895, "eval_f1_score": 0.9753694581280787, "eval_loss": 0.08696460723876953, "eval_precision": 0.9769736842105263, "eval_recall": 0.9737704918032787, "eval_runtime": 6.5592, "eval_samples_per_second": 145.293, "eval_steps_per_second": 1.22, "step": 871 }, { "epoch": 26.27, "learning_rate": 6.172839506172839e-06, "loss": 0.1988, "step": 880 }, { "epoch": 26.57, "learning_rate": 5.611672278338946e-06, "loss": 0.1796, "step": 890 }, { "epoch": 26.87, "learning_rate": 5.050505050505051e-06, "loss": 0.1978, "step": 900 }, { "epoch": 26.99, "eval_accuracy": 0.968520461699895, "eval_f1_score": 0.9754098360655737, "eval_loss": 0.08714743703603745, "eval_precision": 0.9754098360655737, "eval_recall": 0.9754098360655737, "eval_runtime": 5.8888, "eval_samples_per_second": 161.832, "eval_steps_per_second": 1.359, "step": 904 }, { "epoch": 27.16, "learning_rate": 4.489337822671156e-06, "loss": 0.2099, "step": 910 }, { "epoch": 27.46, "learning_rate": 3.928170594837262e-06, "loss": 0.1867, "step": 920 }, { "epoch": 27.76, "learning_rate": 3.367003367003367e-06, "loss": 0.1854, "step": 930 }, { "epoch": 28.0, "eval_accuracy": 0.968520461699895, "eval_f1_score": 0.9754500818330606, "eval_loss": 0.08582841604948044, "eval_precision": 0.9738562091503268, "eval_recall": 0.9770491803278688, "eval_runtime": 5.9622, "eval_samples_per_second": 159.839, "eval_steps_per_second": 1.342, "step": 938 }, { "epoch": 28.06, "learning_rate": 2.805836139169473e-06, "loss": 0.1931, "step": 940 }, { "epoch": 28.36, "learning_rate": 2.244668911335578e-06, "loss": 0.1957, "step": 950 }, { "epoch": 28.66, "learning_rate": 1.6835016835016836e-06, "loss": 0.1849, "step": 960 }, { "epoch": 28.96, "learning_rate": 1.122334455667789e-06, "loss": 0.1733, "step": 970 }, { "epoch": 28.99, "eval_accuracy": 0.968520461699895, "eval_f1_score": 0.9753694581280787, "eval_loss": 0.08600697666406631, "eval_precision": 0.9769736842105263, "eval_recall": 0.9737704918032787, "eval_runtime": 6.5847, "eval_samples_per_second": 144.729, "eval_steps_per_second": 1.215, "step": 971 }, { "epoch": 29.25, "learning_rate": 5.611672278338944e-07, "loss": 0.202, "step": 980 }, { "epoch": 29.55, "learning_rate": 0.0, "loss": 0.1762, "step": 990 }, { "epoch": 29.55, "eval_accuracy": 0.9664218258132214, "eval_f1_score": 0.9737704918032787, "eval_loss": 0.0858435109257698, "eval_precision": 0.9737704918032787, "eval_recall": 0.9737704918032787, "eval_runtime": 6.517, "eval_samples_per_second": 146.233, "eval_steps_per_second": 1.228, "step": 990 }, { "epoch": 29.55, "step": 990, "total_flos": 1.1503239286004122e+18, "train_loss": 0.2626483961789295, "train_runtime": 6214.0737, "train_samples_per_second": 41.379, "train_steps_per_second": 0.159 } ], "logging_steps": 10, "max_steps": 990, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "total_flos": 1.1503239286004122e+18, "train_batch_size": 128, "trial_name": null, "trial_params": null }