{ "best_metric": 0.08559587597846985, "best_model_checkpoint": "./vit-base-chest-xray/checkpoint-500", "epoch": 4.0, "eval_steps": 100, "global_step": 3060, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013071895424836602, "grad_norm": 1.1339586973190308, "learning_rate": 0.0001993464052287582, "loss": 0.4681, "step": 10 }, { "epoch": 0.026143790849673203, "grad_norm": 3.4900288581848145, "learning_rate": 0.00019869281045751635, "loss": 0.2909, "step": 20 }, { "epoch": 0.0392156862745098, "grad_norm": 4.287875652313232, "learning_rate": 0.00019803921568627454, "loss": 0.2672, "step": 30 }, { "epoch": 0.05228758169934641, "grad_norm": 0.31267789006233215, "learning_rate": 0.0001973856209150327, "loss": 0.1288, "step": 40 }, { "epoch": 0.06535947712418301, "grad_norm": 0.20445282757282257, "learning_rate": 0.00019673202614379085, "loss": 0.2329, "step": 50 }, { "epoch": 0.0784313725490196, "grad_norm": 1.763299584388733, "learning_rate": 0.000196078431372549, "loss": 0.1666, "step": 60 }, { "epoch": 0.0915032679738562, "grad_norm": 4.610985279083252, "learning_rate": 0.0001954248366013072, "loss": 0.1897, "step": 70 }, { "epoch": 0.10457516339869281, "grad_norm": 0.2082081139087677, "learning_rate": 0.00019477124183006535, "loss": 0.1638, "step": 80 }, { "epoch": 0.11764705882352941, "grad_norm": 2.0618653297424316, "learning_rate": 0.00019411764705882354, "loss": 0.1161, "step": 90 }, { "epoch": 0.13071895424836602, "grad_norm": 2.996493101119995, "learning_rate": 0.0001934640522875817, "loss": 0.1891, "step": 100 }, { "epoch": 0.13071895424836602, "eval_accuracy": 0.9665236051502146, "eval_loss": 0.10277536511421204, "eval_runtime": 24.9127, "eval_samples_per_second": 46.763, "eval_steps_per_second": 5.86, "step": 100 }, { "epoch": 0.1437908496732026, "grad_norm": 1.4748905897140503, "learning_rate": 0.00019281045751633988, "loss": 0.181, "step": 110 }, { "epoch": 0.1568627450980392, "grad_norm": 2.811959743499756, "learning_rate": 0.00019215686274509807, "loss": 0.2343, "step": 120 }, { "epoch": 0.16993464052287582, "grad_norm": 1.9155172109603882, "learning_rate": 0.00019150326797385623, "loss": 0.1792, "step": 130 }, { "epoch": 0.1830065359477124, "grad_norm": 0.3125442564487457, "learning_rate": 0.0001908496732026144, "loss": 0.1898, "step": 140 }, { "epoch": 0.19607843137254902, "grad_norm": 0.10497033596038818, "learning_rate": 0.00019019607843137254, "loss": 0.0737, "step": 150 }, { "epoch": 0.20915032679738563, "grad_norm": 0.12402518093585968, "learning_rate": 0.00018954248366013073, "loss": 0.0448, "step": 160 }, { "epoch": 0.2222222222222222, "grad_norm": 3.1576356887817383, "learning_rate": 0.00018888888888888888, "loss": 0.2302, "step": 170 }, { "epoch": 0.23529411764705882, "grad_norm": 0.15159478783607483, "learning_rate": 0.00018823529411764707, "loss": 0.094, "step": 180 }, { "epoch": 0.24836601307189543, "grad_norm": 0.0732378214597702, "learning_rate": 0.00018758169934640523, "loss": 0.1228, "step": 190 }, { "epoch": 0.26143790849673204, "grad_norm": 3.1713297367095947, "learning_rate": 0.0001869281045751634, "loss": 0.2123, "step": 200 }, { "epoch": 0.26143790849673204, "eval_accuracy": 0.9562231759656652, "eval_loss": 0.12540382146835327, "eval_runtime": 15.5185, "eval_samples_per_second": 75.072, "eval_steps_per_second": 9.408, "step": 200 }, { "epoch": 0.27450980392156865, "grad_norm": 0.42199578881263733, "learning_rate": 0.00018633986928104577, "loss": 0.1039, "step": 210 }, { "epoch": 0.2875816993464052, "grad_norm": 1.0121151208877563, "learning_rate": 0.00018568627450980392, "loss": 0.146, "step": 220 }, { "epoch": 0.3006535947712418, "grad_norm": 0.16740572452545166, "learning_rate": 0.0001850326797385621, "loss": 0.1232, "step": 230 }, { "epoch": 0.3137254901960784, "grad_norm": 0.7064481973648071, "learning_rate": 0.0001843790849673203, "loss": 0.2105, "step": 240 }, { "epoch": 0.32679738562091504, "grad_norm": 0.23291558027267456, "learning_rate": 0.00018372549019607842, "loss": 0.1186, "step": 250 }, { "epoch": 0.33986928104575165, "grad_norm": 0.3717576861381531, "learning_rate": 0.0001830718954248366, "loss": 0.1671, "step": 260 }, { "epoch": 0.35294117647058826, "grad_norm": 2.2546300888061523, "learning_rate": 0.00018241830065359477, "loss": 0.1799, "step": 270 }, { "epoch": 0.3660130718954248, "grad_norm": 1.6236902475357056, "learning_rate": 0.00018176470588235295, "loss": 0.5579, "step": 280 }, { "epoch": 0.3790849673202614, "grad_norm": 0.9936646819114685, "learning_rate": 0.0001811111111111111, "loss": 0.1809, "step": 290 }, { "epoch": 0.39215686274509803, "grad_norm": 4.444874286651611, "learning_rate": 0.0001804575163398693, "loss": 0.0536, "step": 300 }, { "epoch": 0.39215686274509803, "eval_accuracy": 0.9690987124463519, "eval_loss": 0.11418166011571884, "eval_runtime": 14.394, "eval_samples_per_second": 80.937, "eval_steps_per_second": 10.143, "step": 300 }, { "epoch": 0.40522875816993464, "grad_norm": 0.05227786302566528, "learning_rate": 0.00017980392156862745, "loss": 0.0896, "step": 310 }, { "epoch": 0.41830065359477125, "grad_norm": 0.06666386872529984, "learning_rate": 0.00017915032679738564, "loss": 0.0992, "step": 320 }, { "epoch": 0.43137254901960786, "grad_norm": 0.2057139277458191, "learning_rate": 0.0001784967320261438, "loss": 0.1225, "step": 330 }, { "epoch": 0.4444444444444444, "grad_norm": 1.088228702545166, "learning_rate": 0.00017784313725490195, "loss": 0.0892, "step": 340 }, { "epoch": 0.45751633986928103, "grad_norm": 0.7964250445365906, "learning_rate": 0.00017718954248366014, "loss": 0.1447, "step": 350 }, { "epoch": 0.47058823529411764, "grad_norm": 0.7459284663200378, "learning_rate": 0.0001765359477124183, "loss": 0.1589, "step": 360 }, { "epoch": 0.48366013071895425, "grad_norm": 0.40795373916625977, "learning_rate": 0.00017588235294117648, "loss": 0.2556, "step": 370 }, { "epoch": 0.49673202614379086, "grad_norm": 2.7890219688415527, "learning_rate": 0.00017522875816993464, "loss": 0.1933, "step": 380 }, { "epoch": 0.5098039215686274, "grad_norm": 1.0104824304580688, "learning_rate": 0.00017457516339869283, "loss": 0.1709, "step": 390 }, { "epoch": 0.5228758169934641, "grad_norm": 0.8147939443588257, "learning_rate": 0.00017392156862745098, "loss": 0.0799, "step": 400 }, { "epoch": 0.5228758169934641, "eval_accuracy": 0.9648068669527897, "eval_loss": 0.1173361986875534, "eval_runtime": 14.4858, "eval_samples_per_second": 80.423, "eval_steps_per_second": 10.079, "step": 400 }, { "epoch": 0.5359477124183006, "grad_norm": 0.25372856855392456, "learning_rate": 0.00017326797385620917, "loss": 0.1718, "step": 410 }, { "epoch": 0.5490196078431373, "grad_norm": 0.6395456194877625, "learning_rate": 0.00017261437908496733, "loss": 0.1003, "step": 420 }, { "epoch": 0.5620915032679739, "grad_norm": 1.1706054210662842, "learning_rate": 0.0001719607843137255, "loss": 0.2023, "step": 430 }, { "epoch": 0.5751633986928104, "grad_norm": 1.8857362270355225, "learning_rate": 0.00017130718954248367, "loss": 0.1288, "step": 440 }, { "epoch": 0.5882352941176471, "grad_norm": 0.07465291023254395, "learning_rate": 0.00017065359477124183, "loss": 0.0458, "step": 450 }, { "epoch": 0.6013071895424836, "grad_norm": 0.05449380353093147, "learning_rate": 0.00017, "loss": 0.112, "step": 460 }, { "epoch": 0.6143790849673203, "grad_norm": 2.8383517265319824, "learning_rate": 0.00016934640522875817, "loss": 0.1434, "step": 470 }, { "epoch": 0.6274509803921569, "grad_norm": 2.2419204711914062, "learning_rate": 0.00016869281045751636, "loss": 0.1108, "step": 480 }, { "epoch": 0.6405228758169934, "grad_norm": 1.1179159879684448, "learning_rate": 0.0001680392156862745, "loss": 0.0392, "step": 490 }, { "epoch": 0.6535947712418301, "grad_norm": 1.6863864660263062, "learning_rate": 0.0001673856209150327, "loss": 0.0537, "step": 500 }, { "epoch": 0.6535947712418301, "eval_accuracy": 0.9742489270386266, "eval_loss": 0.08559587597846985, "eval_runtime": 15.4, "eval_samples_per_second": 75.649, "eval_steps_per_second": 9.481, "step": 500 }, { "epoch": 0.6666666666666666, "grad_norm": 2.383035659790039, "learning_rate": 0.00016673202614379086, "loss": 0.2613, "step": 510 }, { "epoch": 0.6797385620915033, "grad_norm": 0.652946412563324, "learning_rate": 0.00016607843137254904, "loss": 0.1499, "step": 520 }, { "epoch": 0.6928104575163399, "grad_norm": 1.1430766582489014, "learning_rate": 0.0001654248366013072, "loss": 0.082, "step": 530 }, { "epoch": 0.7058823529411765, "grad_norm": 0.6167986989021301, "learning_rate": 0.00016477124183006536, "loss": 0.017, "step": 540 }, { "epoch": 0.7189542483660131, "grad_norm": 0.06140501797199249, "learning_rate": 0.00016411764705882354, "loss": 0.0802, "step": 550 }, { "epoch": 0.7320261437908496, "grad_norm": 0.047364696860313416, "learning_rate": 0.0001634640522875817, "loss": 0.1183, "step": 560 }, { "epoch": 0.7450980392156863, "grad_norm": 0.4764581024646759, "learning_rate": 0.00016281045751633989, "loss": 0.0451, "step": 570 }, { "epoch": 0.7581699346405228, "grad_norm": 0.05738784372806549, "learning_rate": 0.00016215686274509804, "loss": 0.0137, "step": 580 }, { "epoch": 0.7712418300653595, "grad_norm": 2.4237992763519287, "learning_rate": 0.00016150326797385623, "loss": 0.0707, "step": 590 }, { "epoch": 0.7843137254901961, "grad_norm": 0.10018886625766754, "learning_rate": 0.00016084967320261439, "loss": 0.0911, "step": 600 }, { "epoch": 0.7843137254901961, "eval_accuracy": 0.9424892703862661, "eval_loss": 0.20054155588150024, "eval_runtime": 14.614, "eval_samples_per_second": 79.718, "eval_steps_per_second": 9.99, "step": 600 }, { "epoch": 0.7973856209150327, "grad_norm": 0.5596518516540527, "learning_rate": 0.00016019607843137257, "loss": 0.0513, "step": 610 }, { "epoch": 0.8104575163398693, "grad_norm": 0.052388403564691544, "learning_rate": 0.00015954248366013073, "loss": 0.0463, "step": 620 }, { "epoch": 0.8235294117647058, "grad_norm": 0.567359983921051, "learning_rate": 0.0001588888888888889, "loss": 0.0966, "step": 630 }, { "epoch": 0.8366013071895425, "grad_norm": 2.3820440769195557, "learning_rate": 0.00015823529411764707, "loss": 0.0887, "step": 640 }, { "epoch": 0.8496732026143791, "grad_norm": 0.05958991125226021, "learning_rate": 0.00015758169934640523, "loss": 0.1156, "step": 650 }, { "epoch": 0.8627450980392157, "grad_norm": 0.09057465195655823, "learning_rate": 0.00015692810457516342, "loss": 0.0545, "step": 660 }, { "epoch": 0.8758169934640523, "grad_norm": 0.6337572932243347, "learning_rate": 0.00015627450980392157, "loss": 0.0771, "step": 670 }, { "epoch": 0.8888888888888888, "grad_norm": 0.7565958499908447, "learning_rate": 0.00015562091503267976, "loss": 0.1179, "step": 680 }, { "epoch": 0.9019607843137255, "grad_norm": 3.6693177223205566, "learning_rate": 0.00015496732026143792, "loss": 0.2026, "step": 690 }, { "epoch": 0.9150326797385621, "grad_norm": 0.13483676314353943, "learning_rate": 0.0001543137254901961, "loss": 0.1027, "step": 700 }, { "epoch": 0.9150326797385621, "eval_accuracy": 0.9708154506437768, "eval_loss": 0.08694365620613098, "eval_runtime": 14.2568, "eval_samples_per_second": 81.715, "eval_steps_per_second": 10.241, "step": 700 }, { "epoch": 0.9281045751633987, "grad_norm": 0.8454955220222473, "learning_rate": 0.00015366013071895426, "loss": 0.0757, "step": 710 }, { "epoch": 0.9411764705882353, "grad_norm": 4.228348255157471, "learning_rate": 0.00015300653594771242, "loss": 0.0729, "step": 720 }, { "epoch": 0.954248366013072, "grad_norm": 0.35448652505874634, "learning_rate": 0.00015235294117647057, "loss": 0.0605, "step": 730 }, { "epoch": 0.9673202614379085, "grad_norm": 0.04271356761455536, "learning_rate": 0.00015169934640522876, "loss": 0.0747, "step": 740 }, { "epoch": 0.9803921568627451, "grad_norm": 0.11471935361623764, "learning_rate": 0.00015104575163398694, "loss": 0.0345, "step": 750 }, { "epoch": 0.9934640522875817, "grad_norm": 0.3417186439037323, "learning_rate": 0.0001503921568627451, "loss": 0.1179, "step": 760 }, { "epoch": 1.0065359477124183, "grad_norm": 0.07955777645111084, "learning_rate": 0.0001497385620915033, "loss": 0.0091, "step": 770 }, { "epoch": 1.0196078431372548, "grad_norm": 0.07078742980957031, "learning_rate": 0.00014908496732026145, "loss": 0.0229, "step": 780 }, { "epoch": 1.0326797385620916, "grad_norm": 0.11356745660305023, "learning_rate": 0.00014843137254901963, "loss": 0.0351, "step": 790 }, { "epoch": 1.0457516339869282, "grad_norm": 1.0611521005630493, "learning_rate": 0.0001477777777777778, "loss": 0.1011, "step": 800 }, { "epoch": 1.0457516339869282, "eval_accuracy": 0.9630901287553648, "eval_loss": 0.10628381371498108, "eval_runtime": 14.4937, "eval_samples_per_second": 80.38, "eval_steps_per_second": 10.073, "step": 800 }, { "epoch": 1.0588235294117647, "grad_norm": 0.5016999244689941, "learning_rate": 0.00014712418300653597, "loss": 0.0779, "step": 810 }, { "epoch": 1.0718954248366013, "grad_norm": 0.2654229402542114, "learning_rate": 0.0001464705882352941, "loss": 0.1233, "step": 820 }, { "epoch": 1.0849673202614378, "grad_norm": 0.16079501807689667, "learning_rate": 0.0001458169934640523, "loss": 0.07, "step": 830 }, { "epoch": 1.0980392156862746, "grad_norm": 0.08982982486486435, "learning_rate": 0.00014516339869281045, "loss": 0.0525, "step": 840 }, { "epoch": 1.1111111111111112, "grad_norm": 0.029190940782427788, "learning_rate": 0.00014450980392156863, "loss": 0.0844, "step": 850 }, { "epoch": 1.1241830065359477, "grad_norm": 0.035515930503606796, "learning_rate": 0.0001438562091503268, "loss": 0.0231, "step": 860 }, { "epoch": 1.1372549019607843, "grad_norm": 0.19514258205890656, "learning_rate": 0.00014320261437908498, "loss": 0.0435, "step": 870 }, { "epoch": 1.1503267973856208, "grad_norm": 0.04219866171479225, "learning_rate": 0.00014254901960784316, "loss": 0.0517, "step": 880 }, { "epoch": 1.1633986928104576, "grad_norm": 0.0360812209546566, "learning_rate": 0.00014189542483660132, "loss": 0.0311, "step": 890 }, { "epoch": 1.1764705882352942, "grad_norm": 0.4207318425178528, "learning_rate": 0.0001412418300653595, "loss": 0.0717, "step": 900 }, { "epoch": 1.1764705882352942, "eval_accuracy": 0.9587982832618026, "eval_loss": 0.14242781698703766, "eval_runtime": 15.8491, "eval_samples_per_second": 73.506, "eval_steps_per_second": 9.212, "step": 900 }, { "epoch": 1.1895424836601307, "grad_norm": 2.1643173694610596, "learning_rate": 0.00014058823529411763, "loss": 0.0269, "step": 910 }, { "epoch": 1.2026143790849673, "grad_norm": 0.03318187966942787, "learning_rate": 0.00013993464052287582, "loss": 0.0272, "step": 920 }, { "epoch": 1.215686274509804, "grad_norm": 0.7758973240852356, "learning_rate": 0.00013928104575163398, "loss": 0.0633, "step": 930 }, { "epoch": 1.2287581699346406, "grad_norm": 1.6854296922683716, "learning_rate": 0.00013862745098039216, "loss": 0.0678, "step": 940 }, { "epoch": 1.2418300653594772, "grad_norm": 3.3588085174560547, "learning_rate": 0.00013797385620915032, "loss": 0.1088, "step": 950 }, { "epoch": 1.2549019607843137, "grad_norm": 0.2561059892177582, "learning_rate": 0.0001373202614379085, "loss": 0.0657, "step": 960 }, { "epoch": 1.2679738562091503, "grad_norm": 1.3019442558288574, "learning_rate": 0.00013666666666666666, "loss": 0.0405, "step": 970 }, { "epoch": 1.2810457516339868, "grad_norm": 0.23876748979091644, "learning_rate": 0.00013601307189542485, "loss": 0.0277, "step": 980 }, { "epoch": 1.2941176470588236, "grad_norm": 0.21840398013591766, "learning_rate": 0.00013535947712418303, "loss": 0.0356, "step": 990 }, { "epoch": 1.3071895424836601, "grad_norm": 0.05377864092588425, "learning_rate": 0.0001347058823529412, "loss": 0.0605, "step": 1000 }, { "epoch": 1.3071895424836601, "eval_accuracy": 0.9648068669527897, "eval_loss": 0.1524645835161209, "eval_runtime": 14.9355, "eval_samples_per_second": 78.002, "eval_steps_per_second": 9.775, "step": 1000 }, { "epoch": 1.3202614379084967, "grad_norm": 0.13815192878246307, "learning_rate": 0.00013405228758169935, "loss": 0.0715, "step": 1010 }, { "epoch": 1.3333333333333333, "grad_norm": 3.1915783882141113, "learning_rate": 0.0001333986928104575, "loss": 0.0762, "step": 1020 }, { "epoch": 1.34640522875817, "grad_norm": 0.12432447075843811, "learning_rate": 0.0001327450980392157, "loss": 0.0514, "step": 1030 }, { "epoch": 1.3594771241830066, "grad_norm": 0.029387619346380234, "learning_rate": 0.00013209150326797385, "loss": 0.0094, "step": 1040 }, { "epoch": 1.3725490196078431, "grad_norm": 0.02788469009101391, "learning_rate": 0.00013143790849673204, "loss": 0.0128, "step": 1050 }, { "epoch": 1.3856209150326797, "grad_norm": 0.38828742504119873, "learning_rate": 0.0001307843137254902, "loss": 0.1181, "step": 1060 }, { "epoch": 1.3986928104575163, "grad_norm": 0.029526783153414726, "learning_rate": 0.00013013071895424838, "loss": 0.0701, "step": 1070 }, { "epoch": 1.4117647058823528, "grad_norm": 8.863903999328613, "learning_rate": 0.00012947712418300654, "loss": 0.0349, "step": 1080 }, { "epoch": 1.4248366013071896, "grad_norm": 0.05002022534608841, "learning_rate": 0.00012882352941176472, "loss": 0.0111, "step": 1090 }, { "epoch": 1.4379084967320261, "grad_norm": 0.04968211427330971, "learning_rate": 0.00012816993464052288, "loss": 0.0573, "step": 1100 }, { "epoch": 1.4379084967320261, "eval_accuracy": 0.9699570815450643, "eval_loss": 0.09700144827365875, "eval_runtime": 14.9347, "eval_samples_per_second": 78.006, "eval_steps_per_second": 9.776, "step": 1100 }, { "epoch": 1.4509803921568627, "grad_norm": 0.021018337458372116, "learning_rate": 0.00012751633986928104, "loss": 0.0338, "step": 1110 }, { "epoch": 1.4640522875816995, "grad_norm": 0.05960996448993683, "learning_rate": 0.00012686274509803922, "loss": 0.0577, "step": 1120 }, { "epoch": 1.477124183006536, "grad_norm": 0.02618427947163582, "learning_rate": 0.00012620915032679738, "loss": 0.055, "step": 1130 }, { "epoch": 1.4901960784313726, "grad_norm": 0.05656788498163223, "learning_rate": 0.00012555555555555557, "loss": 0.1561, "step": 1140 }, { "epoch": 1.5032679738562091, "grad_norm": 0.046731386333703995, "learning_rate": 0.00012490196078431372, "loss": 0.0314, "step": 1150 }, { "epoch": 1.5163398692810457, "grad_norm": 0.06606756895780563, "learning_rate": 0.0001242483660130719, "loss": 0.0359, "step": 1160 }, { "epoch": 1.5294117647058822, "grad_norm": 0.065465547144413, "learning_rate": 0.00012359477124183007, "loss": 0.0312, "step": 1170 }, { "epoch": 1.5424836601307188, "grad_norm": 0.05767370015382767, "learning_rate": 0.00012294117647058825, "loss": 0.0548, "step": 1180 }, { "epoch": 1.5555555555555556, "grad_norm": 0.02790243923664093, "learning_rate": 0.0001222875816993464, "loss": 0.0483, "step": 1190 }, { "epoch": 1.5686274509803921, "grad_norm": 0.03534754365682602, "learning_rate": 0.00012163398692810457, "loss": 0.024, "step": 1200 }, { "epoch": 1.5686274509803921, "eval_accuracy": 0.975107296137339, "eval_loss": 0.08669988811016083, "eval_runtime": 16.5277, "eval_samples_per_second": 70.488, "eval_steps_per_second": 8.834, "step": 1200 }, { "epoch": 1.581699346405229, "grad_norm": 0.03731567785143852, "learning_rate": 0.00012098039215686274, "loss": 0.0143, "step": 1210 }, { "epoch": 1.5947712418300655, "grad_norm": 0.05173814296722412, "learning_rate": 0.00012032679738562091, "loss": 0.0096, "step": 1220 }, { "epoch": 1.607843137254902, "grad_norm": 0.07455731183290482, "learning_rate": 0.00011967320261437908, "loss": 0.0251, "step": 1230 }, { "epoch": 1.6209150326797386, "grad_norm": 0.13736297190189362, "learning_rate": 0.00011901960784313725, "loss": 0.011, "step": 1240 }, { "epoch": 1.6339869281045751, "grad_norm": 2.5091629028320312, "learning_rate": 0.00011836601307189544, "loss": 0.0372, "step": 1250 }, { "epoch": 1.6470588235294117, "grad_norm": 0.048637136816978455, "learning_rate": 0.00011771241830065361, "loss": 0.0373, "step": 1260 }, { "epoch": 1.6601307189542482, "grad_norm": 0.019804753363132477, "learning_rate": 0.00011705882352941178, "loss": 0.0171, "step": 1270 }, { "epoch": 1.673202614379085, "grad_norm": 2.910708427429199, "learning_rate": 0.00011640522875816995, "loss": 0.0118, "step": 1280 }, { "epoch": 1.6862745098039216, "grad_norm": 0.02344467304646969, "learning_rate": 0.00011575163398692812, "loss": 0.0259, "step": 1290 }, { "epoch": 1.6993464052287581, "grad_norm": 0.0272055696696043, "learning_rate": 0.00011509803921568627, "loss": 0.0056, "step": 1300 }, { "epoch": 1.6993464052287581, "eval_accuracy": 0.9759656652360515, "eval_loss": 0.08883309364318848, "eval_runtime": 14.3413, "eval_samples_per_second": 81.234, "eval_steps_per_second": 10.18, "step": 1300 }, { "epoch": 1.712418300653595, "grad_norm": 0.12058671563863754, "learning_rate": 0.00011444444444444444, "loss": 0.0078, "step": 1310 }, { "epoch": 1.7254901960784315, "grad_norm": 0.012682071886956692, "learning_rate": 0.00011379084967320261, "loss": 0.014, "step": 1320 }, { "epoch": 1.738562091503268, "grad_norm": 0.011953895911574364, "learning_rate": 0.00011313725490196078, "loss": 0.0195, "step": 1330 }, { "epoch": 1.7516339869281046, "grad_norm": 4.999667167663574, "learning_rate": 0.00011248366013071895, "loss": 0.0472, "step": 1340 }, { "epoch": 1.7647058823529411, "grad_norm": 0.011710714548826218, "learning_rate": 0.00011183006535947713, "loss": 0.0014, "step": 1350 }, { "epoch": 1.7777777777777777, "grad_norm": 2.8777430057525635, "learning_rate": 0.0001111764705882353, "loss": 0.0422, "step": 1360 }, { "epoch": 1.7908496732026142, "grad_norm": 0.15594060719013214, "learning_rate": 0.00011052287581699348, "loss": 0.0046, "step": 1370 }, { "epoch": 1.803921568627451, "grad_norm": 0.01740155927836895, "learning_rate": 0.00010986928104575165, "loss": 0.015, "step": 1380 }, { "epoch": 1.8169934640522876, "grad_norm": 0.010606693103909492, "learning_rate": 0.0001092156862745098, "loss": 0.022, "step": 1390 }, { "epoch": 1.8300653594771243, "grad_norm": 0.02794831059873104, "learning_rate": 0.00010856209150326797, "loss": 0.0051, "step": 1400 }, { "epoch": 1.8300653594771243, "eval_accuracy": 0.976824034334764, "eval_loss": 0.10543067008256912, "eval_runtime": 16.6018, "eval_samples_per_second": 70.173, "eval_steps_per_second": 8.794, "step": 1400 }, { "epoch": 1.843137254901961, "grad_norm": 0.038915157318115234, "learning_rate": 0.00010790849673202614, "loss": 0.0068, "step": 1410 }, { "epoch": 1.8562091503267975, "grad_norm": 0.01105284970253706, "learning_rate": 0.00010725490196078431, "loss": 0.0011, "step": 1420 }, { "epoch": 1.869281045751634, "grad_norm": 0.00978641677647829, "learning_rate": 0.00010660130718954248, "loss": 0.0286, "step": 1430 }, { "epoch": 1.8823529411764706, "grad_norm": 2.092639923095703, "learning_rate": 0.00010594771241830066, "loss": 0.0618, "step": 1440 }, { "epoch": 1.8954248366013071, "grad_norm": 0.11766193807125092, "learning_rate": 0.00010529411764705883, "loss": 0.0048, "step": 1450 }, { "epoch": 1.9084967320261437, "grad_norm": 0.013952341862022877, "learning_rate": 0.000104640522875817, "loss": 0.0041, "step": 1460 }, { "epoch": 1.9215686274509802, "grad_norm": 0.019551385194063187, "learning_rate": 0.00010398692810457517, "loss": 0.0297, "step": 1470 }, { "epoch": 1.934640522875817, "grad_norm": 0.020047994330525398, "learning_rate": 0.00010333333333333334, "loss": 0.0029, "step": 1480 }, { "epoch": 1.9477124183006536, "grad_norm": 1.58188796043396, "learning_rate": 0.0001026797385620915, "loss": 0.0436, "step": 1490 }, { "epoch": 1.9607843137254903, "grad_norm": 5.982904434204102, "learning_rate": 0.00010202614379084967, "loss": 0.063, "step": 1500 }, { "epoch": 1.9607843137254903, "eval_accuracy": 0.9570815450643777, "eval_loss": 0.18960346281528473, "eval_runtime": 14.9121, "eval_samples_per_second": 78.124, "eval_steps_per_second": 9.791, "step": 1500 }, { "epoch": 1.973856209150327, "grad_norm": 0.1691681146621704, "learning_rate": 0.00010137254901960784, "loss": 0.0505, "step": 1510 }, { "epoch": 1.9869281045751634, "grad_norm": 0.16387327015399933, "learning_rate": 0.00010071895424836601, "loss": 0.0139, "step": 1520 }, { "epoch": 2.0, "grad_norm": 0.01866912469267845, "learning_rate": 0.00010006535947712419, "loss": 0.0318, "step": 1530 }, { "epoch": 2.0130718954248366, "grad_norm": 0.43450427055358887, "learning_rate": 9.941176470588236e-05, "loss": 0.0491, "step": 1540 }, { "epoch": 2.026143790849673, "grad_norm": 0.12364810705184937, "learning_rate": 9.875816993464053e-05, "loss": 0.0289, "step": 1550 }, { "epoch": 2.0392156862745097, "grad_norm": 0.008522446267306805, "learning_rate": 9.810457516339869e-05, "loss": 0.0225, "step": 1560 }, { "epoch": 2.052287581699346, "grad_norm": 0.047463927417993546, "learning_rate": 9.745098039215686e-05, "loss": 0.0038, "step": 1570 }, { "epoch": 2.065359477124183, "grad_norm": 0.008024625480175018, "learning_rate": 9.679738562091504e-05, "loss": 0.0014, "step": 1580 }, { "epoch": 2.0784313725490198, "grad_norm": 0.008238660171627998, "learning_rate": 9.614379084967322e-05, "loss": 0.0084, "step": 1590 }, { "epoch": 2.0915032679738563, "grad_norm": 0.007536654360592365, "learning_rate": 9.549019607843139e-05, "loss": 0.002, "step": 1600 }, { "epoch": 2.0915032679738563, "eval_accuracy": 0.9587982832618026, "eval_loss": 0.18859457969665527, "eval_runtime": 15.741, "eval_samples_per_second": 74.01, "eval_steps_per_second": 9.275, "step": 1600 }, { "epoch": 2.104575163398693, "grad_norm": 0.012532038614153862, "learning_rate": 9.483660130718954e-05, "loss": 0.0028, "step": 1610 }, { "epoch": 2.1176470588235294, "grad_norm": 0.006995632313191891, "learning_rate": 9.418300653594772e-05, "loss": 0.0012, "step": 1620 }, { "epoch": 2.130718954248366, "grad_norm": 0.007673881947994232, "learning_rate": 9.352941176470589e-05, "loss": 0.0008, "step": 1630 }, { "epoch": 2.1437908496732025, "grad_norm": 0.008022445254027843, "learning_rate": 9.287581699346406e-05, "loss": 0.0625, "step": 1640 }, { "epoch": 2.156862745098039, "grad_norm": 0.008978744968771935, "learning_rate": 9.222222222222223e-05, "loss": 0.0328, "step": 1650 }, { "epoch": 2.1699346405228757, "grad_norm": 0.008565125055611134, "learning_rate": 9.156862745098039e-05, "loss": 0.0429, "step": 1660 }, { "epoch": 2.183006535947712, "grad_norm": 0.009053260087966919, "learning_rate": 9.091503267973856e-05, "loss": 0.0227, "step": 1670 }, { "epoch": 2.196078431372549, "grad_norm": 0.008361272513866425, "learning_rate": 9.026143790849673e-05, "loss": 0.0011, "step": 1680 }, { "epoch": 2.2091503267973858, "grad_norm": 0.008024726063013077, "learning_rate": 8.96078431372549e-05, "loss": 0.001, "step": 1690 }, { "epoch": 2.2222222222222223, "grad_norm": 0.00964928325265646, "learning_rate": 8.895424836601307e-05, "loss": 0.005, "step": 1700 }, { "epoch": 2.2222222222222223, "eval_accuracy": 0.9733905579399141, "eval_loss": 0.11838679015636444, "eval_runtime": 15.5963, "eval_samples_per_second": 74.697, "eval_steps_per_second": 9.361, "step": 1700 }, { "epoch": 2.235294117647059, "grad_norm": 0.00880915205925703, "learning_rate": 8.830065359477125e-05, "loss": 0.0011, "step": 1710 }, { "epoch": 2.2483660130718954, "grad_norm": 0.655288577079773, "learning_rate": 8.764705882352942e-05, "loss": 0.0019, "step": 1720 }, { "epoch": 2.261437908496732, "grad_norm": 0.0070320917293429375, "learning_rate": 8.699346405228759e-05, "loss": 0.0009, "step": 1730 }, { "epoch": 2.2745098039215685, "grad_norm": 0.006975400261580944, "learning_rate": 8.633986928104576e-05, "loss": 0.0017, "step": 1740 }, { "epoch": 2.287581699346405, "grad_norm": 0.007141157519072294, "learning_rate": 8.568627450980392e-05, "loss": 0.001, "step": 1750 }, { "epoch": 2.3006535947712417, "grad_norm": 0.007537210825830698, "learning_rate": 8.503267973856209e-05, "loss": 0.0012, "step": 1760 }, { "epoch": 2.313725490196078, "grad_norm": 0.764282763004303, "learning_rate": 8.437908496732026e-05, "loss": 0.009, "step": 1770 }, { "epoch": 2.326797385620915, "grad_norm": 0.009738568216562271, "learning_rate": 8.372549019607843e-05, "loss": 0.0899, "step": 1780 }, { "epoch": 2.3398692810457518, "grad_norm": 0.05287899821996689, "learning_rate": 8.30718954248366e-05, "loss": 0.0027, "step": 1790 }, { "epoch": 2.3529411764705883, "grad_norm": 0.0178971029818058, "learning_rate": 8.241830065359478e-05, "loss": 0.0083, "step": 1800 }, { "epoch": 2.3529411764705883, "eval_accuracy": 0.9759656652360515, "eval_loss": 0.10839741677045822, "eval_runtime": 14.7686, "eval_samples_per_second": 78.884, "eval_steps_per_second": 9.886, "step": 1800 }, { "epoch": 2.366013071895425, "grad_norm": 0.01301596313714981, "learning_rate": 8.176470588235295e-05, "loss": 0.0016, "step": 1810 }, { "epoch": 2.3790849673202614, "grad_norm": 0.01324276439845562, "learning_rate": 8.111111111111112e-05, "loss": 0.0135, "step": 1820 }, { "epoch": 2.392156862745098, "grad_norm": 0.013326325453817844, "learning_rate": 8.045751633986929e-05, "loss": 0.0334, "step": 1830 }, { "epoch": 2.4052287581699345, "grad_norm": 0.011139987967908382, "learning_rate": 7.980392156862746e-05, "loss": 0.0014, "step": 1840 }, { "epoch": 2.418300653594771, "grad_norm": 0.0077317566610872746, "learning_rate": 7.915032679738562e-05, "loss": 0.0011, "step": 1850 }, { "epoch": 2.431372549019608, "grad_norm": 0.03215256333351135, "learning_rate": 7.849673202614379e-05, "loss": 0.0354, "step": 1860 }, { "epoch": 2.4444444444444446, "grad_norm": 0.009134942665696144, "learning_rate": 7.784313725490196e-05, "loss": 0.0018, "step": 1870 }, { "epoch": 2.457516339869281, "grad_norm": 0.008007184602320194, "learning_rate": 7.718954248366013e-05, "loss": 0.001, "step": 1880 }, { "epoch": 2.4705882352941178, "grad_norm": 0.015561921522021294, "learning_rate": 7.653594771241829e-05, "loss": 0.0009, "step": 1890 }, { "epoch": 2.4836601307189543, "grad_norm": 0.04056130349636078, "learning_rate": 7.588235294117648e-05, "loss": 0.0013, "step": 1900 }, { "epoch": 2.4836601307189543, "eval_accuracy": 0.9776824034334763, "eval_loss": 0.0902954488992691, "eval_runtime": 14.9792, "eval_samples_per_second": 77.774, "eval_steps_per_second": 9.747, "step": 1900 }, { "epoch": 2.496732026143791, "grad_norm": 0.00714436499401927, "learning_rate": 7.522875816993465e-05, "loss": 0.0434, "step": 1910 }, { "epoch": 2.5098039215686274, "grad_norm": 0.009256112389266491, "learning_rate": 7.457516339869282e-05, "loss": 0.001, "step": 1920 }, { "epoch": 2.522875816993464, "grad_norm": 0.016420746222138405, "learning_rate": 7.392156862745099e-05, "loss": 0.0282, "step": 1930 }, { "epoch": 2.5359477124183005, "grad_norm": 0.013019098900258541, "learning_rate": 7.326797385620915e-05, "loss": 0.0017, "step": 1940 }, { "epoch": 2.549019607843137, "grad_norm": 3.201817274093628, "learning_rate": 7.261437908496732e-05, "loss": 0.0274, "step": 1950 }, { "epoch": 2.5620915032679736, "grad_norm": 0.009792659431695938, "learning_rate": 7.196078431372549e-05, "loss": 0.0011, "step": 1960 }, { "epoch": 2.57516339869281, "grad_norm": 0.010890874080359936, "learning_rate": 7.130718954248366e-05, "loss": 0.0329, "step": 1970 }, { "epoch": 2.588235294117647, "grad_norm": 0.02582782320678234, "learning_rate": 7.065359477124184e-05, "loss": 0.0024, "step": 1980 }, { "epoch": 2.6013071895424837, "grad_norm": 4.812190055847168, "learning_rate": 7e-05, "loss": 0.0134, "step": 1990 }, { "epoch": 2.6143790849673203, "grad_norm": 0.05194231495261192, "learning_rate": 6.934640522875817e-05, "loss": 0.0298, "step": 2000 }, { "epoch": 2.6143790849673203, "eval_accuracy": 0.9733905579399141, "eval_loss": 0.10234156996011734, "eval_runtime": 15.3135, "eval_samples_per_second": 76.077, "eval_steps_per_second": 9.534, "step": 2000 }, { "epoch": 2.627450980392157, "grad_norm": 0.01054445281624794, "learning_rate": 6.869281045751634e-05, "loss": 0.0021, "step": 2010 }, { "epoch": 2.6405228758169934, "grad_norm": 0.010878126136958599, "learning_rate": 6.803921568627452e-05, "loss": 0.0015, "step": 2020 }, { "epoch": 2.65359477124183, "grad_norm": 0.032522134482860565, "learning_rate": 6.73856209150327e-05, "loss": 0.035, "step": 2030 }, { "epoch": 2.6666666666666665, "grad_norm": 0.02457094006240368, "learning_rate": 6.673202614379085e-05, "loss": 0.0032, "step": 2040 }, { "epoch": 2.6797385620915035, "grad_norm": 0.014886660501360893, "learning_rate": 6.607843137254902e-05, "loss": 0.003, "step": 2050 }, { "epoch": 2.69281045751634, "grad_norm": 0.020414328202605247, "learning_rate": 6.54248366013072e-05, "loss": 0.0017, "step": 2060 }, { "epoch": 2.7058823529411766, "grad_norm": 0.009341062046587467, "learning_rate": 6.477124183006537e-05, "loss": 0.0013, "step": 2070 }, { "epoch": 2.718954248366013, "grad_norm": 0.0061281765811145306, "learning_rate": 6.411764705882354e-05, "loss": 0.0009, "step": 2080 }, { "epoch": 2.7320261437908497, "grad_norm": 0.009815551340579987, "learning_rate": 6.34640522875817e-05, "loss": 0.0009, "step": 2090 }, { "epoch": 2.7450980392156863, "grad_norm": 0.0064156195148825645, "learning_rate": 6.281045751633987e-05, "loss": 0.0008, "step": 2100 }, { "epoch": 2.7450980392156863, "eval_accuracy": 0.976824034334764, "eval_loss": 0.110395647585392, "eval_runtime": 15.8536, "eval_samples_per_second": 73.485, "eval_steps_per_second": 9.209, "step": 2100 }, { "epoch": 2.758169934640523, "grad_norm": 0.009114363230764866, "learning_rate": 6.215686274509804e-05, "loss": 0.0008, "step": 2110 }, { "epoch": 2.7712418300653594, "grad_norm": 0.012108026072382927, "learning_rate": 6.150326797385621e-05, "loss": 0.0425, "step": 2120 }, { "epoch": 2.784313725490196, "grad_norm": 0.02400955744087696, "learning_rate": 6.0849673202614375e-05, "loss": 0.0011, "step": 2130 }, { "epoch": 2.7973856209150325, "grad_norm": 0.006131911184638739, "learning_rate": 6.0196078431372546e-05, "loss": 0.001, "step": 2140 }, { "epoch": 2.810457516339869, "grad_norm": 0.01375108677893877, "learning_rate": 5.9542483660130724e-05, "loss": 0.0014, "step": 2150 }, { "epoch": 2.8235294117647056, "grad_norm": 0.27725741267204285, "learning_rate": 5.8888888888888896e-05, "loss": 0.001, "step": 2160 }, { "epoch": 2.8366013071895426, "grad_norm": 0.005803197622299194, "learning_rate": 5.823529411764707e-05, "loss": 0.0012, "step": 2170 }, { "epoch": 2.849673202614379, "grad_norm": 0.005719684064388275, "learning_rate": 5.7581699346405225e-05, "loss": 0.0007, "step": 2180 }, { "epoch": 2.8627450980392157, "grad_norm": 0.049487028270959854, "learning_rate": 5.69281045751634e-05, "loss": 0.0013, "step": 2190 }, { "epoch": 2.8758169934640523, "grad_norm": 0.005333769600838423, "learning_rate": 5.627450980392157e-05, "loss": 0.0011, "step": 2200 }, { "epoch": 2.8758169934640523, "eval_accuracy": 0.9785407725321889, "eval_loss": 0.11278601735830307, "eval_runtime": 15.3235, "eval_samples_per_second": 76.027, "eval_steps_per_second": 9.528, "step": 2200 }, { "epoch": 2.888888888888889, "grad_norm": 0.013878793455660343, "learning_rate": 5.5620915032679746e-05, "loss": 0.0191, "step": 2210 }, { "epoch": 2.9019607843137254, "grad_norm": 0.005247410852462053, "learning_rate": 5.496732026143792e-05, "loss": 0.0008, "step": 2220 }, { "epoch": 2.915032679738562, "grad_norm": 0.010724911466240883, "learning_rate": 5.4313725490196076e-05, "loss": 0.0006, "step": 2230 }, { "epoch": 2.928104575163399, "grad_norm": 0.014705362729728222, "learning_rate": 5.366013071895425e-05, "loss": 0.0006, "step": 2240 }, { "epoch": 2.9411764705882355, "grad_norm": 0.007574844174087048, "learning_rate": 5.300653594771242e-05, "loss": 0.0007, "step": 2250 }, { "epoch": 2.954248366013072, "grad_norm": 0.004934164695441723, "learning_rate": 5.235294117647059e-05, "loss": 0.0015, "step": 2260 }, { "epoch": 2.9673202614379086, "grad_norm": 0.005061435047537088, "learning_rate": 5.169934640522877e-05, "loss": 0.0006, "step": 2270 }, { "epoch": 2.980392156862745, "grad_norm": 0.005057324655354023, "learning_rate": 5.104575163398693e-05, "loss": 0.0006, "step": 2280 }, { "epoch": 2.9934640522875817, "grad_norm": 0.004673287738114595, "learning_rate": 5.045751633986928e-05, "loss": 0.0127, "step": 2290 }, { "epoch": 3.0065359477124183, "grad_norm": 0.004593541845679283, "learning_rate": 4.980392156862745e-05, "loss": 0.0006, "step": 2300 }, { "epoch": 3.0065359477124183, "eval_accuracy": 0.9733905579399141, "eval_loss": 0.139481782913208, "eval_runtime": 14.0319, "eval_samples_per_second": 83.025, "eval_steps_per_second": 10.405, "step": 2300 }, { "epoch": 3.019607843137255, "grad_norm": 0.004796142224222422, "learning_rate": 4.915032679738562e-05, "loss": 0.0006, "step": 2310 }, { "epoch": 3.0326797385620914, "grad_norm": 0.005047705490142107, "learning_rate": 4.8496732026143794e-05, "loss": 0.0006, "step": 2320 }, { "epoch": 3.045751633986928, "grad_norm": 0.004708459600806236, "learning_rate": 4.7843137254901966e-05, "loss": 0.0005, "step": 2330 }, { "epoch": 3.0588235294117645, "grad_norm": 0.004462133627384901, "learning_rate": 4.718954248366013e-05, "loss": 0.0006, "step": 2340 }, { "epoch": 3.0718954248366015, "grad_norm": 0.004772055894136429, "learning_rate": 4.65359477124183e-05, "loss": 0.0009, "step": 2350 }, { "epoch": 3.084967320261438, "grad_norm": 0.004368507768958807, "learning_rate": 4.588235294117647e-05, "loss": 0.0005, "step": 2360 }, { "epoch": 3.0980392156862746, "grad_norm": 0.004448315594345331, "learning_rate": 4.5228758169934645e-05, "loss": 0.0005, "step": 2370 }, { "epoch": 3.111111111111111, "grad_norm": 0.004565922077745199, "learning_rate": 4.4575163398692816e-05, "loss": 0.0173, "step": 2380 }, { "epoch": 3.1241830065359477, "grad_norm": 0.004459939897060394, "learning_rate": 4.392156862745098e-05, "loss": 0.0069, "step": 2390 }, { "epoch": 3.1372549019607843, "grad_norm": 0.004509753547608852, "learning_rate": 4.326797385620915e-05, "loss": 0.0059, "step": 2400 }, { "epoch": 3.1372549019607843, "eval_accuracy": 0.9725321888412017, "eval_loss": 0.14194443821907043, "eval_runtime": 13.8686, "eval_samples_per_second": 84.002, "eval_steps_per_second": 10.527, "step": 2400 }, { "epoch": 3.150326797385621, "grad_norm": 0.004385701846331358, "learning_rate": 4.2614379084967324e-05, "loss": 0.0005, "step": 2410 }, { "epoch": 3.1633986928104574, "grad_norm": 0.5638505220413208, "learning_rate": 4.1960784313725496e-05, "loss": 0.0234, "step": 2420 }, { "epoch": 3.176470588235294, "grad_norm": 0.004813672509044409, "learning_rate": 4.130718954248366e-05, "loss": 0.0005, "step": 2430 }, { "epoch": 3.189542483660131, "grad_norm": 0.0054300627671182156, "learning_rate": 4.065359477124183e-05, "loss": 0.004, "step": 2440 }, { "epoch": 3.2026143790849675, "grad_norm": 0.004265918862074614, "learning_rate": 4e-05, "loss": 0.0005, "step": 2450 }, { "epoch": 3.215686274509804, "grad_norm": 0.005545614752918482, "learning_rate": 3.934640522875817e-05, "loss": 0.0005, "step": 2460 }, { "epoch": 3.2287581699346406, "grad_norm": 0.004381006117910147, "learning_rate": 3.8692810457516346e-05, "loss": 0.0005, "step": 2470 }, { "epoch": 3.241830065359477, "grad_norm": 0.004227633588016033, "learning_rate": 3.803921568627451e-05, "loss": 0.0005, "step": 2480 }, { "epoch": 3.2549019607843137, "grad_norm": 0.016775546595454216, "learning_rate": 3.738562091503268e-05, "loss": 0.0005, "step": 2490 }, { "epoch": 3.2679738562091503, "grad_norm": 0.024223702028393745, "learning_rate": 3.6732026143790854e-05, "loss": 0.0005, "step": 2500 }, { "epoch": 3.2679738562091503, "eval_accuracy": 0.9776824034334763, "eval_loss": 0.13354463875293732, "eval_runtime": 14.2248, "eval_samples_per_second": 81.899, "eval_steps_per_second": 10.264, "step": 2500 }, { "epoch": 3.281045751633987, "grad_norm": 0.0041252183727920055, "learning_rate": 3.607843137254902e-05, "loss": 0.0005, "step": 2510 }, { "epoch": 3.2941176470588234, "grad_norm": 0.004066206980496645, "learning_rate": 3.542483660130719e-05, "loss": 0.0008, "step": 2520 }, { "epoch": 3.30718954248366, "grad_norm": 0.004196736495941877, "learning_rate": 3.477124183006536e-05, "loss": 0.0036, "step": 2530 }, { "epoch": 3.3202614379084965, "grad_norm": 0.004051230847835541, "learning_rate": 3.411764705882353e-05, "loss": 0.0005, "step": 2540 }, { "epoch": 3.3333333333333335, "grad_norm": 0.004058164078742266, "learning_rate": 3.34640522875817e-05, "loss": 0.0175, "step": 2550 }, { "epoch": 3.34640522875817, "grad_norm": 0.0038333344273269176, "learning_rate": 3.281045751633987e-05, "loss": 0.0005, "step": 2560 }, { "epoch": 3.3594771241830066, "grad_norm": 0.004068716894835234, "learning_rate": 3.215686274509804e-05, "loss": 0.0005, "step": 2570 }, { "epoch": 3.372549019607843, "grad_norm": 0.004003152251243591, "learning_rate": 3.150326797385621e-05, "loss": 0.001, "step": 2580 }, { "epoch": 3.3856209150326797, "grad_norm": 0.0039999475702643394, "learning_rate": 3.0849673202614384e-05, "loss": 0.0005, "step": 2590 }, { "epoch": 3.3986928104575163, "grad_norm": 0.003976929467171431, "learning_rate": 3.019607843137255e-05, "loss": 0.0005, "step": 2600 }, { "epoch": 3.3986928104575163, "eval_accuracy": 0.976824034334764, "eval_loss": 0.12492600083351135, "eval_runtime": 13.7805, "eval_samples_per_second": 84.54, "eval_steps_per_second": 10.595, "step": 2600 }, { "epoch": 3.411764705882353, "grad_norm": 0.0038215164095163345, "learning_rate": 2.954248366013072e-05, "loss": 0.0004, "step": 2610 }, { "epoch": 3.4248366013071894, "grad_norm": 0.0037740464322268963, "learning_rate": 2.8888888888888888e-05, "loss": 0.0004, "step": 2620 }, { "epoch": 3.4379084967320264, "grad_norm": 0.003947110380977392, "learning_rate": 2.823529411764706e-05, "loss": 0.0005, "step": 2630 }, { "epoch": 3.450980392156863, "grad_norm": 0.003939367830753326, "learning_rate": 2.758169934640523e-05, "loss": 0.0004, "step": 2640 }, { "epoch": 3.4640522875816995, "grad_norm": 0.0037511338014155626, "learning_rate": 2.69281045751634e-05, "loss": 0.0004, "step": 2650 }, { "epoch": 3.477124183006536, "grad_norm": 0.004019754473119974, "learning_rate": 2.627450980392157e-05, "loss": 0.0367, "step": 2660 }, { "epoch": 3.4901960784313726, "grad_norm": 0.009803508408367634, "learning_rate": 2.562091503267974e-05, "loss": 0.0005, "step": 2670 }, { "epoch": 3.503267973856209, "grad_norm": 0.013077986426651478, "learning_rate": 2.496732026143791e-05, "loss": 0.0007, "step": 2680 }, { "epoch": 3.5163398692810457, "grad_norm": 0.007836658507585526, "learning_rate": 2.431372549019608e-05, "loss": 0.0008, "step": 2690 }, { "epoch": 3.5294117647058822, "grad_norm": 0.004085828084498644, "learning_rate": 2.366013071895425e-05, "loss": 0.0007, "step": 2700 }, { "epoch": 3.5294117647058822, "eval_accuracy": 0.9776824034334763, "eval_loss": 0.1157020702958107, "eval_runtime": 14.8164, "eval_samples_per_second": 78.629, "eval_steps_per_second": 9.854, "step": 2700 }, { "epoch": 3.542483660130719, "grad_norm": 0.004049411043524742, "learning_rate": 2.3006535947712418e-05, "loss": 0.0005, "step": 2710 }, { "epoch": 3.5555555555555554, "grad_norm": 0.015368789434432983, "learning_rate": 2.235294117647059e-05, "loss": 0.0006, "step": 2720 }, { "epoch": 3.568627450980392, "grad_norm": 0.003844345221295953, "learning_rate": 2.169934640522876e-05, "loss": 0.0008, "step": 2730 }, { "epoch": 3.581699346405229, "grad_norm": 0.004043512977659702, "learning_rate": 2.104575163398693e-05, "loss": 0.0005, "step": 2740 }, { "epoch": 3.5947712418300655, "grad_norm": 0.003938615787774324, "learning_rate": 2.0392156862745097e-05, "loss": 0.0005, "step": 2750 }, { "epoch": 3.607843137254902, "grad_norm": 0.008323022164404392, "learning_rate": 1.973856209150327e-05, "loss": 0.0005, "step": 2760 }, { "epoch": 3.6209150326797386, "grad_norm": 0.004000243730843067, "learning_rate": 1.9084967320261437e-05, "loss": 0.0005, "step": 2770 }, { "epoch": 3.633986928104575, "grad_norm": 0.004011669661849737, "learning_rate": 1.843137254901961e-05, "loss": 0.0005, "step": 2780 }, { "epoch": 3.6470588235294117, "grad_norm": 0.01243202667683363, "learning_rate": 1.777777777777778e-05, "loss": 0.0139, "step": 2790 }, { "epoch": 3.6601307189542482, "grad_norm": 0.0176347978413105, "learning_rate": 1.7124183006535948e-05, "loss": 0.0005, "step": 2800 }, { "epoch": 3.6601307189542482, "eval_accuracy": 0.9785407725321889, "eval_loss": 0.12019691616296768, "eval_runtime": 14.517, "eval_samples_per_second": 80.251, "eval_steps_per_second": 10.057, "step": 2800 }, { "epoch": 3.6732026143790852, "grad_norm": 0.0037740224506706, "learning_rate": 1.647058823529412e-05, "loss": 0.0005, "step": 2810 }, { "epoch": 3.686274509803922, "grad_norm": 0.004195007495582104, "learning_rate": 1.5816993464052288e-05, "loss": 0.0008, "step": 2820 }, { "epoch": 3.6993464052287583, "grad_norm": 0.0038069712463766336, "learning_rate": 1.5163398692810458e-05, "loss": 0.0007, "step": 2830 }, { "epoch": 3.712418300653595, "grad_norm": 0.010378316044807434, "learning_rate": 1.4509803921568629e-05, "loss": 0.0006, "step": 2840 }, { "epoch": 3.7254901960784315, "grad_norm": 0.003901832504197955, "learning_rate": 1.3856209150326799e-05, "loss": 0.0005, "step": 2850 }, { "epoch": 3.738562091503268, "grad_norm": 0.003902917029336095, "learning_rate": 1.3202614379084969e-05, "loss": 0.0005, "step": 2860 }, { "epoch": 3.7516339869281046, "grad_norm": 0.004125463310629129, "learning_rate": 1.2549019607843138e-05, "loss": 0.001, "step": 2870 }, { "epoch": 3.764705882352941, "grad_norm": 0.003765885718166828, "learning_rate": 1.1895424836601307e-05, "loss": 0.0006, "step": 2880 }, { "epoch": 3.7777777777777777, "grad_norm": 0.0036050116177648306, "learning_rate": 1.1241830065359478e-05, "loss": 0.001, "step": 2890 }, { "epoch": 3.7908496732026142, "grad_norm": 0.003705250099301338, "learning_rate": 1.0588235294117648e-05, "loss": 0.001, "step": 2900 }, { "epoch": 3.7908496732026142, "eval_accuracy": 0.9776824034334763, "eval_loss": 0.12394391000270844, "eval_runtime": 14.3108, "eval_samples_per_second": 81.407, "eval_steps_per_second": 10.202, "step": 2900 }, { "epoch": 3.803921568627451, "grad_norm": 0.01087539829313755, "learning_rate": 9.934640522875818e-06, "loss": 0.0005, "step": 2910 }, { "epoch": 3.8169934640522873, "grad_norm": 0.004286112263798714, "learning_rate": 9.281045751633987e-06, "loss": 0.0004, "step": 2920 }, { "epoch": 3.8300653594771243, "grad_norm": 0.005555053241550922, "learning_rate": 8.627450980392157e-06, "loss": 0.0006, "step": 2930 }, { "epoch": 3.843137254901961, "grad_norm": 0.05337900295853615, "learning_rate": 7.973856209150327e-06, "loss": 0.0007, "step": 2940 }, { "epoch": 3.8562091503267975, "grad_norm": 0.0040720016695559025, "learning_rate": 7.320261437908498e-06, "loss": 0.0004, "step": 2950 }, { "epoch": 3.869281045751634, "grad_norm": 0.0036330316215753555, "learning_rate": 6.666666666666667e-06, "loss": 0.0006, "step": 2960 }, { "epoch": 3.8823529411764706, "grad_norm": 0.00944702047854662, "learning_rate": 6.013071895424837e-06, "loss": 0.0009, "step": 2970 }, { "epoch": 3.895424836601307, "grad_norm": 0.0034994047600775957, "learning_rate": 5.359477124183006e-06, "loss": 0.0006, "step": 2980 }, { "epoch": 3.9084967320261437, "grad_norm": 0.030153293162584305, "learning_rate": 4.705882352941177e-06, "loss": 0.0007, "step": 2990 }, { "epoch": 3.9215686274509802, "grad_norm": 0.0036808131262660027, "learning_rate": 4.052287581699347e-06, "loss": 0.0004, "step": 3000 }, { "epoch": 3.9215686274509802, "eval_accuracy": 0.976824034334764, "eval_loss": 0.12309978902339935, "eval_runtime": 13.6907, "eval_samples_per_second": 85.094, "eval_steps_per_second": 10.664, "step": 3000 }, { "epoch": 3.9346405228758172, "grad_norm": 0.0034610480070114136, "learning_rate": 3.398692810457516e-06, "loss": 0.0005, "step": 3010 }, { "epoch": 3.947712418300654, "grad_norm": 0.0036175919231027365, "learning_rate": 2.7450980392156863e-06, "loss": 0.0005, "step": 3020 }, { "epoch": 3.9607843137254903, "grad_norm": 0.003583298297598958, "learning_rate": 2.091503267973856e-06, "loss": 0.0005, "step": 3030 }, { "epoch": 3.973856209150327, "grad_norm": 0.003492480143904686, "learning_rate": 1.4379084967320261e-06, "loss": 0.0005, "step": 3040 }, { "epoch": 3.9869281045751634, "grad_norm": 0.003486211644485593, "learning_rate": 7.843137254901962e-07, "loss": 0.0004, "step": 3050 }, { "epoch": 4.0, "grad_norm": 0.0035035167820751667, "learning_rate": 1.30718954248366e-07, "loss": 0.0006, "step": 3060 }, { "epoch": 4.0, "step": 3060, "total_flos": 3.7909081319458406e+18, "train_loss": 0.047429592626662374, "train_runtime": 1590.6048, "train_samples_per_second": 30.756, "train_steps_per_second": 1.924 } ], "logging_steps": 10, "max_steps": 3060, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "total_flos": 3.7909081319458406e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }