diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6727 @@ +{ + "best_metric": 0.23809599876403809, + "best_model_checkpoint": "./trained-gender/checkpoint-10000", + "epoch": 2.0287716709701216, + "eval_steps": 1000, + "global_step": 11000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 0.000199907783105865, + "loss": 0.681, + "step": 10 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019981556621172998, + "loss": 0.6802, + "step": 20 + }, + { + "epoch": 0.01, + "learning_rate": 0.000199723349317595, + "loss": 0.6835, + "step": 30 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019963113242345997, + "loss": 0.6822, + "step": 40 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019953891552932499, + "loss": 0.6896, + "step": 50 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019944669863518997, + "loss": 0.6792, + "step": 60 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019935448174105498, + "loss": 0.6828, + "step": 70 + }, + { + "epoch": 0.01, + "learning_rate": 0.00019926226484691997, + "loss": 0.6648, + "step": 80 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019917004795278495, + "loss": 0.6673, + "step": 90 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019907783105864993, + "loss": 0.6628, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019898561416451495, + "loss": 0.6715, + "step": 110 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019889339727037993, + "loss": 0.6828, + "step": 120 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019880118037624494, + "loss": 0.6602, + "step": 130 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019870896348210993, + "loss": 0.6653, + "step": 140 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019861674658797494, + "loss": 0.6628, + "step": 150 + }, + { + "epoch": 0.03, + "learning_rate": 0.00019852452969383992, + "loss": 0.643, + "step": 160 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001984323127997049, + "loss": 0.64, + "step": 170 + }, + { + "epoch": 0.03, + "learning_rate": 0.0001983400959055699, + "loss": 0.6225, + "step": 180 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001982478790114349, + "loss": 0.6459, + "step": 190 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001981556621172999, + "loss": 0.6327, + "step": 200 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001980634452231649, + "loss": 0.6477, + "step": 210 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019797122832902988, + "loss": 0.6269, + "step": 220 + }, + { + "epoch": 0.04, + "learning_rate": 0.0001978790114348949, + "loss": 0.6123, + "step": 230 + }, + { + "epoch": 0.04, + "learning_rate": 0.00019778679454075988, + "loss": 0.616, + "step": 240 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019769457764662486, + "loss": 0.6306, + "step": 250 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019760236075248985, + "loss": 0.556, + "step": 260 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019751014385835486, + "loss": 0.6079, + "step": 270 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019741792696421984, + "loss": 0.564, + "step": 280 + }, + { + "epoch": 0.05, + "learning_rate": 0.00019732571007008486, + "loss": 0.569, + "step": 290 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019723349317594984, + "loss": 0.5822, + "step": 300 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019714127628181485, + "loss": 0.5743, + "step": 310 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019704905938767984, + "loss": 0.5493, + "step": 320 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019695684249354482, + "loss": 0.5691, + "step": 330 + }, + { + "epoch": 0.06, + "learning_rate": 0.0001968646255994098, + "loss": 0.5501, + "step": 340 + }, + { + "epoch": 0.06, + "learning_rate": 0.00019677240870527482, + "loss": 0.5496, + "step": 350 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001966801918111398, + "loss": 0.5383, + "step": 360 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001965879749170048, + "loss": 0.5355, + "step": 370 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001964957580228698, + "loss": 0.5576, + "step": 380 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001964035411287348, + "loss": 0.5555, + "step": 390 + }, + { + "epoch": 0.07, + "learning_rate": 0.0001963113242345998, + "loss": 0.5702, + "step": 400 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019621910734046478, + "loss": 0.6111, + "step": 410 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019612689044632976, + "loss": 0.5336, + "step": 420 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019603467355219477, + "loss": 0.5369, + "step": 430 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019594245665805976, + "loss": 0.5608, + "step": 440 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019585023976392477, + "loss": 0.5255, + "step": 450 + }, + { + "epoch": 0.08, + "learning_rate": 0.00019575802286978975, + "loss": 0.4979, + "step": 460 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019566580597565476, + "loss": 0.5375, + "step": 470 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019557358908151975, + "loss": 0.5485, + "step": 480 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019548137218738473, + "loss": 0.5324, + "step": 490 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019538915529324972, + "loss": 0.4639, + "step": 500 + }, + { + "epoch": 0.09, + "learning_rate": 0.00019529693839911473, + "loss": 0.4846, + "step": 510 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019520472150497971, + "loss": 0.541, + "step": 520 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019511250461084473, + "loss": 0.5626, + "step": 530 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001950202877167097, + "loss": 0.5714, + "step": 540 + }, + { + "epoch": 0.1, + "learning_rate": 0.00019492807082257472, + "loss": 0.5184, + "step": 550 + }, + { + "epoch": 0.1, + "learning_rate": 0.0001948358539284397, + "loss": 0.5706, + "step": 560 + }, + { + "epoch": 0.11, + "learning_rate": 0.0001947436370343047, + "loss": 0.5423, + "step": 570 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019465142014016967, + "loss": 0.5032, + "step": 580 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019455920324603469, + "loss": 0.4187, + "step": 590 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019446698635189967, + "loss": 0.4939, + "step": 600 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019437476945776468, + "loss": 0.4286, + "step": 610 + }, + { + "epoch": 0.11, + "learning_rate": 0.00019428255256362967, + "loss": 0.4813, + "step": 620 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019419033566949468, + "loss": 0.5191, + "step": 630 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019409811877535966, + "loss": 0.4069, + "step": 640 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019400590188122465, + "loss": 0.4806, + "step": 650 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019391368498708963, + "loss": 0.4167, + "step": 660 + }, + { + "epoch": 0.12, + "learning_rate": 0.00019382146809295462, + "loss": 0.4626, + "step": 670 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019372925119881963, + "loss": 0.4348, + "step": 680 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019363703430468464, + "loss": 0.5163, + "step": 690 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019354481741054962, + "loss": 0.4216, + "step": 700 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001934526005164146, + "loss": 0.4174, + "step": 710 + }, + { + "epoch": 0.13, + "learning_rate": 0.00019336038362227962, + "loss": 0.3745, + "step": 720 + }, + { + "epoch": 0.13, + "learning_rate": 0.0001932681667281446, + "loss": 0.4898, + "step": 730 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001931759498340096, + "loss": 0.4293, + "step": 740 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019308373293987457, + "loss": 0.4105, + "step": 750 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019299151604573958, + "loss": 0.4906, + "step": 760 + }, + { + "epoch": 0.14, + "learning_rate": 0.0001928992991516046, + "loss": 0.4998, + "step": 770 + }, + { + "epoch": 0.14, + "learning_rate": 0.00019280708225746958, + "loss": 0.4471, + "step": 780 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019271486536333456, + "loss": 0.4673, + "step": 790 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019262264846919958, + "loss": 0.4972, + "step": 800 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019253043157506456, + "loss": 0.5052, + "step": 810 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019243821468092954, + "loss": 0.4974, + "step": 820 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019234599778679453, + "loss": 0.4504, + "step": 830 + }, + { + "epoch": 0.15, + "learning_rate": 0.00019225378089265954, + "loss": 0.4322, + "step": 840 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019216156399852455, + "loss": 0.5118, + "step": 850 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019206934710438954, + "loss": 0.4815, + "step": 860 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019197713021025452, + "loss": 0.4534, + "step": 870 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019188491331611953, + "loss": 0.4236, + "step": 880 + }, + { + "epoch": 0.16, + "learning_rate": 0.00019179269642198452, + "loss": 0.4925, + "step": 890 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001917004795278495, + "loss": 0.5201, + "step": 900 + }, + { + "epoch": 0.17, + "learning_rate": 0.00019160826263371449, + "loss": 0.4272, + "step": 910 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001915160457395795, + "loss": 0.4909, + "step": 920 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001914238288454445, + "loss": 0.4485, + "step": 930 + }, + { + "epoch": 0.17, + "learning_rate": 0.0001913316119513095, + "loss": 0.4188, + "step": 940 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019123939505717448, + "loss": 0.437, + "step": 950 + }, + { + "epoch": 0.18, + "learning_rate": 0.0001911471781630395, + "loss": 0.4208, + "step": 960 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019105496126890447, + "loss": 0.5295, + "step": 970 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019096274437476946, + "loss": 0.39, + "step": 980 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019087052748063444, + "loss": 0.4634, + "step": 990 + }, + { + "epoch": 0.18, + "learning_rate": 0.00019077831058649945, + "loss": 0.4277, + "step": 1000 + }, + { + "epoch": 0.18, + "eval_accuracy": 0.8089282453898119, + "eval_loss": 0.4054337739944458, + "eval_runtime": 99.6522, + "eval_samples_per_second": 109.922, + "eval_steps_per_second": 13.748, + "step": 1000 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019068609369236446, + "loss": 0.4296, + "step": 1010 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019059387679822945, + "loss": 0.4035, + "step": 1020 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019050165990409443, + "loss": 0.3379, + "step": 1030 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019040944300995944, + "loss": 0.451, + "step": 1040 + }, + { + "epoch": 0.19, + "learning_rate": 0.00019031722611582443, + "loss": 0.4277, + "step": 1050 + }, + { + "epoch": 0.2, + "learning_rate": 0.00019022500922168941, + "loss": 0.5039, + "step": 1060 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001901327923275544, + "loss": 0.4543, + "step": 1070 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001900405754334194, + "loss": 0.3792, + "step": 1080 + }, + { + "epoch": 0.2, + "learning_rate": 0.00018994835853928442, + "loss": 0.4495, + "step": 1090 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001898561416451494, + "loss": 0.445, + "step": 1100 + }, + { + "epoch": 0.2, + "learning_rate": 0.0001897639247510144, + "loss": 0.4661, + "step": 1110 + }, + { + "epoch": 0.21, + "learning_rate": 0.0001896717078568794, + "loss": 0.4948, + "step": 1120 + }, + { + "epoch": 0.21, + "learning_rate": 0.00018957949096274439, + "loss": 0.4696, + "step": 1130 + }, + { + "epoch": 0.21, + "learning_rate": 0.00018948727406860937, + "loss": 0.3564, + "step": 1140 + }, + { + "epoch": 0.21, + "learning_rate": 0.00018939505717447435, + "loss": 0.3959, + "step": 1150 + }, + { + "epoch": 0.21, + "learning_rate": 0.00018930284028033937, + "loss": 0.3899, + "step": 1160 + }, + { + "epoch": 0.22, + "learning_rate": 0.00018921062338620438, + "loss": 0.3641, + "step": 1170 + }, + { + "epoch": 0.22, + "learning_rate": 0.00018911840649206936, + "loss": 0.4144, + "step": 1180 + }, + { + "epoch": 0.22, + "learning_rate": 0.00018902618959793435, + "loss": 0.3547, + "step": 1190 + }, + { + "epoch": 0.22, + "learning_rate": 0.00018893397270379936, + "loss": 0.5267, + "step": 1200 + }, + { + "epoch": 0.22, + "learning_rate": 0.00018884175580966434, + "loss": 0.4542, + "step": 1210 + }, + { + "epoch": 0.23, + "learning_rate": 0.00018874953891552933, + "loss": 0.4595, + "step": 1220 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001886573220213943, + "loss": 0.4909, + "step": 1230 + }, + { + "epoch": 0.23, + "learning_rate": 0.00018856510512725932, + "loss": 0.4425, + "step": 1240 + }, + { + "epoch": 0.23, + "learning_rate": 0.00018847288823312433, + "loss": 0.393, + "step": 1250 + }, + { + "epoch": 0.23, + "learning_rate": 0.00018838067133898932, + "loss": 0.4167, + "step": 1260 + }, + { + "epoch": 0.23, + "learning_rate": 0.0001882884544448543, + "loss": 0.3037, + "step": 1270 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001881962375507193, + "loss": 0.4309, + "step": 1280 + }, + { + "epoch": 0.24, + "learning_rate": 0.0001881040206565843, + "loss": 0.4805, + "step": 1290 + }, + { + "epoch": 0.24, + "learning_rate": 0.00018801180376244928, + "loss": 0.4452, + "step": 1300 + }, + { + "epoch": 0.24, + "learning_rate": 0.00018791958686831427, + "loss": 0.3939, + "step": 1310 + }, + { + "epoch": 0.24, + "learning_rate": 0.00018782736997417928, + "loss": 0.3899, + "step": 1320 + }, + { + "epoch": 0.25, + "learning_rate": 0.0001877351530800443, + "loss": 0.4246, + "step": 1330 + }, + { + "epoch": 0.25, + "learning_rate": 0.00018764293618590928, + "loss": 0.4555, + "step": 1340 + }, + { + "epoch": 0.25, + "learning_rate": 0.00018755071929177426, + "loss": 0.3036, + "step": 1350 + }, + { + "epoch": 0.25, + "learning_rate": 0.00018745850239763924, + "loss": 0.4121, + "step": 1360 + }, + { + "epoch": 0.25, + "learning_rate": 0.00018736628550350426, + "loss": 0.4736, + "step": 1370 + }, + { + "epoch": 0.25, + "learning_rate": 0.00018727406860936924, + "loss": 0.4223, + "step": 1380 + }, + { + "epoch": 0.26, + "learning_rate": 0.00018718185171523422, + "loss": 0.3058, + "step": 1390 + }, + { + "epoch": 0.26, + "learning_rate": 0.00018708963482109924, + "loss": 0.4031, + "step": 1400 + }, + { + "epoch": 0.26, + "learning_rate": 0.00018699741792696425, + "loss": 0.3931, + "step": 1410 + }, + { + "epoch": 0.26, + "learning_rate": 0.00018690520103282923, + "loss": 0.5029, + "step": 1420 + }, + { + "epoch": 0.26, + "learning_rate": 0.00018681298413869422, + "loss": 0.5434, + "step": 1430 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001867207672445592, + "loss": 0.4353, + "step": 1440 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001866285503504242, + "loss": 0.4171, + "step": 1450 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001865363334562892, + "loss": 0.4074, + "step": 1460 + }, + { + "epoch": 0.27, + "learning_rate": 0.00018644411656215418, + "loss": 0.3915, + "step": 1470 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001863518996680192, + "loss": 0.3376, + "step": 1480 + }, + { + "epoch": 0.27, + "learning_rate": 0.0001862596827738842, + "loss": 0.3333, + "step": 1490 + }, + { + "epoch": 0.28, + "learning_rate": 0.0001861674658797492, + "loss": 0.4467, + "step": 1500 + }, + { + "epoch": 0.28, + "learning_rate": 0.00018607524898561417, + "loss": 0.435, + "step": 1510 + }, + { + "epoch": 0.28, + "learning_rate": 0.00018598303209147916, + "loss": 0.3931, + "step": 1520 + }, + { + "epoch": 0.28, + "learning_rate": 0.00018589081519734417, + "loss": 0.3659, + "step": 1530 + }, + { + "epoch": 0.28, + "learning_rate": 0.00018579859830320915, + "loss": 0.4438, + "step": 1540 + }, + { + "epoch": 0.29, + "learning_rate": 0.00018570638140907414, + "loss": 0.3943, + "step": 1550 + }, + { + "epoch": 0.29, + "learning_rate": 0.00018561416451493915, + "loss": 0.4003, + "step": 1560 + }, + { + "epoch": 0.29, + "learning_rate": 0.00018552194762080416, + "loss": 0.3227, + "step": 1570 + }, + { + "epoch": 0.29, + "learning_rate": 0.00018542973072666915, + "loss": 0.4422, + "step": 1580 + }, + { + "epoch": 0.29, + "learning_rate": 0.00018533751383253413, + "loss": 0.3905, + "step": 1590 + }, + { + "epoch": 0.3, + "learning_rate": 0.00018524529693839911, + "loss": 0.3801, + "step": 1600 + }, + { + "epoch": 0.3, + "learning_rate": 0.00018515308004426413, + "loss": 0.3582, + "step": 1610 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001850608631501291, + "loss": 0.4174, + "step": 1620 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001849686462559941, + "loss": 0.3648, + "step": 1630 + }, + { + "epoch": 0.3, + "learning_rate": 0.0001848764293618591, + "loss": 0.4227, + "step": 1640 + }, + { + "epoch": 0.3, + "learning_rate": 0.00018478421246772412, + "loss": 0.3653, + "step": 1650 + }, + { + "epoch": 0.31, + "learning_rate": 0.0001846919955735891, + "loss": 0.3624, + "step": 1660 + }, + { + "epoch": 0.31, + "learning_rate": 0.00018459977867945409, + "loss": 0.3214, + "step": 1670 + }, + { + "epoch": 0.31, + "learning_rate": 0.00018450756178531907, + "loss": 0.3415, + "step": 1680 + }, + { + "epoch": 0.31, + "learning_rate": 0.00018441534489118408, + "loss": 0.3219, + "step": 1690 + }, + { + "epoch": 0.31, + "learning_rate": 0.00018432312799704907, + "loss": 0.3679, + "step": 1700 + }, + { + "epoch": 0.32, + "learning_rate": 0.00018423091110291405, + "loss": 0.3729, + "step": 1710 + }, + { + "epoch": 0.32, + "learning_rate": 0.00018413869420877906, + "loss": 0.4188, + "step": 1720 + }, + { + "epoch": 0.32, + "learning_rate": 0.00018404647731464407, + "loss": 0.3648, + "step": 1730 + }, + { + "epoch": 0.32, + "learning_rate": 0.00018395426042050906, + "loss": 0.4181, + "step": 1740 + }, + { + "epoch": 0.32, + "learning_rate": 0.00018386204352637404, + "loss": 0.315, + "step": 1750 + }, + { + "epoch": 0.32, + "learning_rate": 0.00018376982663223903, + "loss": 0.3348, + "step": 1760 + }, + { + "epoch": 0.33, + "learning_rate": 0.00018367760973810404, + "loss": 0.4496, + "step": 1770 + }, + { + "epoch": 0.33, + "learning_rate": 0.00018358539284396902, + "loss": 0.3835, + "step": 1780 + }, + { + "epoch": 0.33, + "learning_rate": 0.000183493175949834, + "loss": 0.3148, + "step": 1790 + }, + { + "epoch": 0.33, + "learning_rate": 0.00018340095905569902, + "loss": 0.441, + "step": 1800 + }, + { + "epoch": 0.33, + "learning_rate": 0.00018330874216156403, + "loss": 0.3282, + "step": 1810 + }, + { + "epoch": 0.34, + "learning_rate": 0.00018321652526742901, + "loss": 0.4019, + "step": 1820 + }, + { + "epoch": 0.34, + "learning_rate": 0.000183124308373294, + "loss": 0.3623, + "step": 1830 + }, + { + "epoch": 0.34, + "learning_rate": 0.00018303209147915898, + "loss": 0.3729, + "step": 1840 + }, + { + "epoch": 0.34, + "learning_rate": 0.000182939874585024, + "loss": 0.2986, + "step": 1850 + }, + { + "epoch": 0.34, + "learning_rate": 0.00018284765769088898, + "loss": 0.372, + "step": 1860 + }, + { + "epoch": 0.34, + "learning_rate": 0.00018275544079675396, + "loss": 0.3837, + "step": 1870 + }, + { + "epoch": 0.35, + "learning_rate": 0.00018266322390261898, + "loss": 0.2998, + "step": 1880 + }, + { + "epoch": 0.35, + "learning_rate": 0.000182571007008484, + "loss": 0.3775, + "step": 1890 + }, + { + "epoch": 0.35, + "learning_rate": 0.00018247879011434897, + "loss": 0.2937, + "step": 1900 + }, + { + "epoch": 0.35, + "learning_rate": 0.00018238657322021396, + "loss": 0.2826, + "step": 1910 + }, + { + "epoch": 0.35, + "learning_rate": 0.00018229435632607894, + "loss": 0.4867, + "step": 1920 + }, + { + "epoch": 0.36, + "learning_rate": 0.00018220213943194392, + "loss": 0.3199, + "step": 1930 + }, + { + "epoch": 0.36, + "learning_rate": 0.00018210992253780894, + "loss": 0.3392, + "step": 1940 + }, + { + "epoch": 0.36, + "learning_rate": 0.00018201770564367392, + "loss": 0.3835, + "step": 1950 + }, + { + "epoch": 0.36, + "learning_rate": 0.00018192548874953893, + "loss": 0.3517, + "step": 1960 + }, + { + "epoch": 0.36, + "learning_rate": 0.00018183327185540392, + "loss": 0.3842, + "step": 1970 + }, + { + "epoch": 0.37, + "learning_rate": 0.00018174105496126893, + "loss": 0.3184, + "step": 1980 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001816488380671339, + "loss": 0.3106, + "step": 1990 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001815566211729989, + "loss": 0.315, + "step": 2000 + }, + { + "epoch": 0.37, + "eval_accuracy": 0.8317509585539529, + "eval_loss": 0.3487056791782379, + "eval_runtime": 98.9474, + "eval_samples_per_second": 110.705, + "eval_steps_per_second": 13.846, + "step": 2000 + }, + { + "epoch": 0.37, + "learning_rate": 0.00018146440427886388, + "loss": 0.4269, + "step": 2010 + }, + { + "epoch": 0.37, + "learning_rate": 0.0001813721873847289, + "loss": 0.4103, + "step": 2020 + }, + { + "epoch": 0.37, + "learning_rate": 0.00018127997049059388, + "loss": 0.3383, + "step": 2030 + }, + { + "epoch": 0.38, + "learning_rate": 0.0001811877535964589, + "loss": 0.3569, + "step": 2040 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018109553670232387, + "loss": 0.295, + "step": 2050 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018100331980818888, + "loss": 0.3816, + "step": 2060 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018091110291405387, + "loss": 0.3156, + "step": 2070 + }, + { + "epoch": 0.38, + "learning_rate": 0.00018081888601991885, + "loss": 0.3574, + "step": 2080 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018072666912578384, + "loss": 0.3496, + "step": 2090 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018063445223164885, + "loss": 0.3897, + "step": 2100 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018054223533751383, + "loss": 0.4285, + "step": 2110 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018045001844337885, + "loss": 0.461, + "step": 2120 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018035780154924383, + "loss": 0.3399, + "step": 2130 + }, + { + "epoch": 0.39, + "learning_rate": 0.00018026558465510884, + "loss": 0.3455, + "step": 2140 + }, + { + "epoch": 0.4, + "learning_rate": 0.00018017336776097383, + "loss": 0.3925, + "step": 2150 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001800811508668388, + "loss": 0.3985, + "step": 2160 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001799889339727038, + "loss": 0.3067, + "step": 2170 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001798967170785688, + "loss": 0.3036, + "step": 2180 + }, + { + "epoch": 0.4, + "learning_rate": 0.0001798045001844338, + "loss": 0.3976, + "step": 2190 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001797122832902988, + "loss": 0.3375, + "step": 2200 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017962006639616379, + "loss": 0.4176, + "step": 2210 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001795278495020288, + "loss": 0.3729, + "step": 2220 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017943563260789378, + "loss": 0.4237, + "step": 2230 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017934341571375877, + "loss": 0.3436, + "step": 2240 + }, + { + "epoch": 0.41, + "learning_rate": 0.00017925119881962375, + "loss": 0.2412, + "step": 2250 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017915898192548876, + "loss": 0.3455, + "step": 2260 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017906676503135375, + "loss": 0.3741, + "step": 2270 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017897454813721876, + "loss": 0.2836, + "step": 2280 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017888233124308374, + "loss": 0.2703, + "step": 2290 + }, + { + "epoch": 0.42, + "learning_rate": 0.00017879011434894875, + "loss": 0.3872, + "step": 2300 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017869789745481374, + "loss": 0.3514, + "step": 2310 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017860568056067872, + "loss": 0.3548, + "step": 2320 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001785134636665437, + "loss": 0.4352, + "step": 2330 + }, + { + "epoch": 0.43, + "learning_rate": 0.00017842124677240872, + "loss": 0.2912, + "step": 2340 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001783290298782737, + "loss": 0.3551, + "step": 2350 + }, + { + "epoch": 0.44, + "learning_rate": 0.00017823681298413871, + "loss": 0.4348, + "step": 2360 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001781445960900037, + "loss": 0.4094, + "step": 2370 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001780523791958687, + "loss": 0.3368, + "step": 2380 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001779601623017337, + "loss": 0.3732, + "step": 2390 + }, + { + "epoch": 0.44, + "learning_rate": 0.00017786794540759868, + "loss": 0.2987, + "step": 2400 + }, + { + "epoch": 0.44, + "learning_rate": 0.00017777572851346366, + "loss": 0.341, + "step": 2410 + }, + { + "epoch": 0.45, + "learning_rate": 0.00017768351161932868, + "loss": 0.3953, + "step": 2420 + }, + { + "epoch": 0.45, + "learning_rate": 0.00017759129472519366, + "loss": 0.4143, + "step": 2430 + }, + { + "epoch": 0.45, + "learning_rate": 0.00017749907783105864, + "loss": 0.317, + "step": 2440 + }, + { + "epoch": 0.45, + "learning_rate": 0.00017740686093692366, + "loss": 0.2584, + "step": 2450 + }, + { + "epoch": 0.45, + "learning_rate": 0.00017731464404278867, + "loss": 0.3006, + "step": 2460 + }, + { + "epoch": 0.46, + "learning_rate": 0.00017722242714865365, + "loss": 0.4184, + "step": 2470 + }, + { + "epoch": 0.46, + "learning_rate": 0.00017713021025451864, + "loss": 0.3758, + "step": 2480 + }, + { + "epoch": 0.46, + "learning_rate": 0.00017703799336038362, + "loss": 0.4137, + "step": 2490 + }, + { + "epoch": 0.46, + "learning_rate": 0.00017694577646624863, + "loss": 0.3617, + "step": 2500 + }, + { + "epoch": 0.46, + "learning_rate": 0.00017685355957211362, + "loss": 0.3775, + "step": 2510 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001767613426779786, + "loss": 0.3578, + "step": 2520 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001766691257838436, + "loss": 0.3031, + "step": 2530 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001765769088897086, + "loss": 0.335, + "step": 2540 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001764846919955736, + "loss": 0.3057, + "step": 2550 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001763924751014386, + "loss": 0.4072, + "step": 2560 + }, + { + "epoch": 0.47, + "learning_rate": 0.00017630025820730358, + "loss": 0.4231, + "step": 2570 + }, + { + "epoch": 0.48, + "learning_rate": 0.00017620804131316856, + "loss": 0.3353, + "step": 2580 + }, + { + "epoch": 0.48, + "learning_rate": 0.00017611582441903357, + "loss": 0.3978, + "step": 2590 + }, + { + "epoch": 0.48, + "learning_rate": 0.00017602360752489856, + "loss": 0.2996, + "step": 2600 + }, + { + "epoch": 0.48, + "learning_rate": 0.00017593139063076357, + "loss": 0.3473, + "step": 2610 + }, + { + "epoch": 0.48, + "learning_rate": 0.00017583917373662855, + "loss": 0.3141, + "step": 2620 + }, + { + "epoch": 0.49, + "learning_rate": 0.00017574695684249356, + "loss": 0.3099, + "step": 2630 + }, + { + "epoch": 0.49, + "learning_rate": 0.00017565473994835855, + "loss": 0.3627, + "step": 2640 + }, + { + "epoch": 0.49, + "learning_rate": 0.00017556252305422353, + "loss": 0.2984, + "step": 2650 + }, + { + "epoch": 0.49, + "learning_rate": 0.00017547030616008852, + "loss": 0.308, + "step": 2660 + }, + { + "epoch": 0.49, + "learning_rate": 0.00017537808926595353, + "loss": 0.3109, + "step": 2670 + }, + { + "epoch": 0.49, + "learning_rate": 0.00017528587237181851, + "loss": 0.3344, + "step": 2680 + }, + { + "epoch": 0.5, + "learning_rate": 0.00017519365547768353, + "loss": 0.348, + "step": 2690 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001751014385835485, + "loss": 0.3095, + "step": 2700 + }, + { + "epoch": 0.5, + "learning_rate": 0.00017500922168941352, + "loss": 0.3603, + "step": 2710 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001749170047952785, + "loss": 0.339, + "step": 2720 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001748247879011435, + "loss": 0.359, + "step": 2730 + }, + { + "epoch": 0.51, + "learning_rate": 0.00017473257100700847, + "loss": 0.339, + "step": 2740 + }, + { + "epoch": 0.51, + "learning_rate": 0.00017464035411287349, + "loss": 0.3681, + "step": 2750 + }, + { + "epoch": 0.51, + "learning_rate": 0.00017454813721873847, + "loss": 0.2514, + "step": 2760 + }, + { + "epoch": 0.51, + "learning_rate": 0.00017445592032460348, + "loss": 0.3092, + "step": 2770 + }, + { + "epoch": 0.51, + "learning_rate": 0.00017436370343046847, + "loss": 0.2572, + "step": 2780 + }, + { + "epoch": 0.51, + "learning_rate": 0.00017427148653633348, + "loss": 0.4704, + "step": 2790 + }, + { + "epoch": 0.52, + "learning_rate": 0.00017417926964219846, + "loss": 0.2768, + "step": 2800 + }, + { + "epoch": 0.52, + "learning_rate": 0.00017408705274806345, + "loss": 0.3077, + "step": 2810 + }, + { + "epoch": 0.52, + "learning_rate": 0.00017399483585392843, + "loss": 0.4038, + "step": 2820 + }, + { + "epoch": 0.52, + "learning_rate": 0.00017390261895979344, + "loss": 0.2926, + "step": 2830 + }, + { + "epoch": 0.52, + "learning_rate": 0.00017381040206565843, + "loss": 0.3433, + "step": 2840 + }, + { + "epoch": 0.53, + "learning_rate": 0.00017371818517152344, + "loss": 0.4099, + "step": 2850 + }, + { + "epoch": 0.53, + "learning_rate": 0.00017362596827738842, + "loss": 0.3401, + "step": 2860 + }, + { + "epoch": 0.53, + "learning_rate": 0.00017353375138325343, + "loss": 0.3805, + "step": 2870 + }, + { + "epoch": 0.53, + "learning_rate": 0.00017344153448911842, + "loss": 0.3354, + "step": 2880 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001733493175949834, + "loss": 0.2953, + "step": 2890 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001732571007008484, + "loss": 0.3217, + "step": 2900 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001731648838067134, + "loss": 0.289, + "step": 2910 + }, + { + "epoch": 0.54, + "learning_rate": 0.00017307266691257838, + "loss": 0.4819, + "step": 2920 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001729804500184434, + "loss": 0.288, + "step": 2930 + }, + { + "epoch": 0.54, + "learning_rate": 0.00017288823312430838, + "loss": 0.4555, + "step": 2940 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001727960162301734, + "loss": 0.327, + "step": 2950 + }, + { + "epoch": 0.55, + "learning_rate": 0.00017270379933603838, + "loss": 0.337, + "step": 2960 + }, + { + "epoch": 0.55, + "learning_rate": 0.00017261158244190336, + "loss": 0.2624, + "step": 2970 + }, + { + "epoch": 0.55, + "learning_rate": 0.00017251936554776834, + "loss": 0.2694, + "step": 2980 + }, + { + "epoch": 0.55, + "learning_rate": 0.00017242714865363336, + "loss": 0.4381, + "step": 2990 + }, + { + "epoch": 0.55, + "learning_rate": 0.00017233493175949834, + "loss": 0.3082, + "step": 3000 + }, + { + "epoch": 0.55, + "eval_accuracy": 0.863337593573124, + "eval_loss": 0.30519211292266846, + "eval_runtime": 99.0719, + "eval_samples_per_second": 110.566, + "eval_steps_per_second": 13.828, + "step": 3000 + }, + { + "epoch": 0.56, + "learning_rate": 0.00017224271486536335, + "loss": 0.3275, + "step": 3010 + }, + { + "epoch": 0.56, + "learning_rate": 0.00017215049797122834, + "loss": 0.3314, + "step": 3020 + }, + { + "epoch": 0.56, + "learning_rate": 0.00017205828107709335, + "loss": 0.289, + "step": 3030 + }, + { + "epoch": 0.56, + "learning_rate": 0.00017196606418295833, + "loss": 0.3257, + "step": 3040 + }, + { + "epoch": 0.56, + "learning_rate": 0.00017187384728882332, + "loss": 0.3443, + "step": 3050 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001717816303946883, + "loss": 0.3118, + "step": 3060 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001716894135005533, + "loss": 0.4607, + "step": 3070 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001715971966064183, + "loss": 0.2485, + "step": 3080 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001715049797122833, + "loss": 0.3853, + "step": 3090 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001714127628181483, + "loss": 0.3685, + "step": 3100 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001713205459240133, + "loss": 0.3942, + "step": 3110 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001712283290298783, + "loss": 0.3057, + "step": 3120 + }, + { + "epoch": 0.58, + "learning_rate": 0.00017113611213574327, + "loss": 0.2269, + "step": 3130 + }, + { + "epoch": 0.58, + "learning_rate": 0.00017104389524160826, + "loss": 0.3431, + "step": 3140 + }, + { + "epoch": 0.58, + "learning_rate": 0.00017095167834747327, + "loss": 0.2295, + "step": 3150 + }, + { + "epoch": 0.58, + "learning_rate": 0.00017085946145333825, + "loss": 0.3489, + "step": 3160 + }, + { + "epoch": 0.58, + "learning_rate": 0.00017076724455920326, + "loss": 0.3596, + "step": 3170 + }, + { + "epoch": 0.59, + "learning_rate": 0.00017067502766506825, + "loss": 0.4055, + "step": 3180 + }, + { + "epoch": 0.59, + "learning_rate": 0.00017058281077093323, + "loss": 0.4416, + "step": 3190 + }, + { + "epoch": 0.59, + "learning_rate": 0.00017049059387679825, + "loss": 0.2621, + "step": 3200 + }, + { + "epoch": 0.59, + "learning_rate": 0.00017039837698266323, + "loss": 0.346, + "step": 3210 + }, + { + "epoch": 0.59, + "learning_rate": 0.00017030616008852821, + "loss": 0.3386, + "step": 3220 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001702139431943932, + "loss": 0.3263, + "step": 3230 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001701217263002582, + "loss": 0.3014, + "step": 3240 + }, + { + "epoch": 0.6, + "learning_rate": 0.00017002950940612322, + "loss": 0.3152, + "step": 3250 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001699372925119882, + "loss": 0.2802, + "step": 3260 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001698450756178532, + "loss": 0.3908, + "step": 3270 + }, + { + "epoch": 0.6, + "learning_rate": 0.0001697528587237182, + "loss": 0.2139, + "step": 3280 + }, + { + "epoch": 0.61, + "learning_rate": 0.00016966064182958319, + "loss": 0.3051, + "step": 3290 + }, + { + "epoch": 0.61, + "learning_rate": 0.00016956842493544817, + "loss": 0.3304, + "step": 3300 + }, + { + "epoch": 0.61, + "learning_rate": 0.00016947620804131316, + "loss": 0.4109, + "step": 3310 + }, + { + "epoch": 0.61, + "learning_rate": 0.00016938399114717817, + "loss": 0.298, + "step": 3320 + }, + { + "epoch": 0.61, + "learning_rate": 0.00016929177425304318, + "loss": 0.3299, + "step": 3330 + }, + { + "epoch": 0.62, + "learning_rate": 0.00016919955735890816, + "loss": 0.3102, + "step": 3340 + }, + { + "epoch": 0.62, + "learning_rate": 0.00016910734046477315, + "loss": 0.2846, + "step": 3350 + }, + { + "epoch": 0.62, + "learning_rate": 0.00016901512357063816, + "loss": 0.329, + "step": 3360 + }, + { + "epoch": 0.62, + "learning_rate": 0.00016892290667650314, + "loss": 0.2862, + "step": 3370 + }, + { + "epoch": 0.62, + "learning_rate": 0.00016883068978236813, + "loss": 0.2984, + "step": 3380 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001687384728882331, + "loss": 0.3246, + "step": 3390 + }, + { + "epoch": 0.63, + "learning_rate": 0.00016864625599409812, + "loss": 0.2689, + "step": 3400 + }, + { + "epoch": 0.63, + "learning_rate": 0.00016855403909996313, + "loss": 0.2825, + "step": 3410 + }, + { + "epoch": 0.63, + "learning_rate": 0.00016846182220582812, + "loss": 0.3375, + "step": 3420 + }, + { + "epoch": 0.63, + "learning_rate": 0.0001683696053116931, + "loss": 0.3298, + "step": 3430 + }, + { + "epoch": 0.63, + "learning_rate": 0.00016827738841755812, + "loss": 0.334, + "step": 3440 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001681851715234231, + "loss": 0.3414, + "step": 3450 + }, + { + "epoch": 0.64, + "learning_rate": 0.00016809295462928808, + "loss": 0.2998, + "step": 3460 + }, + { + "epoch": 0.64, + "learning_rate": 0.00016800073773515307, + "loss": 0.3031, + "step": 3470 + }, + { + "epoch": 0.64, + "learning_rate": 0.00016790852084101808, + "loss": 0.2044, + "step": 3480 + }, + { + "epoch": 0.64, + "learning_rate": 0.0001678163039468831, + "loss": 0.4127, + "step": 3490 + }, + { + "epoch": 0.65, + "learning_rate": 0.00016772408705274808, + "loss": 0.2921, + "step": 3500 + }, + { + "epoch": 0.65, + "learning_rate": 0.00016763187015861306, + "loss": 0.3712, + "step": 3510 + }, + { + "epoch": 0.65, + "learning_rate": 0.00016753965326447807, + "loss": 0.342, + "step": 3520 + }, + { + "epoch": 0.65, + "learning_rate": 0.00016744743637034306, + "loss": 0.3056, + "step": 3530 + }, + { + "epoch": 0.65, + "learning_rate": 0.00016735521947620804, + "loss": 0.3342, + "step": 3540 + }, + { + "epoch": 0.65, + "learning_rate": 0.00016726300258207302, + "loss": 0.3005, + "step": 3550 + }, + { + "epoch": 0.66, + "learning_rate": 0.00016717078568793804, + "loss": 0.2978, + "step": 3560 + }, + { + "epoch": 0.66, + "learning_rate": 0.00016707856879380305, + "loss": 0.415, + "step": 3570 + }, + { + "epoch": 0.66, + "learning_rate": 0.00016698635189966803, + "loss": 0.3048, + "step": 3580 + }, + { + "epoch": 0.66, + "learning_rate": 0.00016689413500553302, + "loss": 0.3362, + "step": 3590 + }, + { + "epoch": 0.66, + "learning_rate": 0.00016680191811139803, + "loss": 0.3437, + "step": 3600 + }, + { + "epoch": 0.67, + "learning_rate": 0.000166709701217263, + "loss": 0.3877, + "step": 3610 + }, + { + "epoch": 0.67, + "learning_rate": 0.000166617484323128, + "loss": 0.2679, + "step": 3620 + }, + { + "epoch": 0.67, + "learning_rate": 0.00016652526742899298, + "loss": 0.2978, + "step": 3630 + }, + { + "epoch": 0.67, + "learning_rate": 0.000166433050534858, + "loss": 0.3177, + "step": 3640 + }, + { + "epoch": 0.67, + "learning_rate": 0.000166340833640723, + "loss": 0.2971, + "step": 3650 + }, + { + "epoch": 0.68, + "learning_rate": 0.000166248616746588, + "loss": 0.3507, + "step": 3660 + }, + { + "epoch": 0.68, + "learning_rate": 0.00016615639985245297, + "loss": 0.3602, + "step": 3670 + }, + { + "epoch": 0.68, + "learning_rate": 0.00016606418295831798, + "loss": 0.2991, + "step": 3680 + }, + { + "epoch": 0.68, + "learning_rate": 0.00016597196606418297, + "loss": 0.3303, + "step": 3690 + }, + { + "epoch": 0.68, + "learning_rate": 0.00016587974917004795, + "loss": 0.2405, + "step": 3700 + }, + { + "epoch": 0.68, + "learning_rate": 0.00016578753227591294, + "loss": 0.3534, + "step": 3710 + }, + { + "epoch": 0.69, + "learning_rate": 0.00016569531538177795, + "loss": 0.3164, + "step": 3720 + }, + { + "epoch": 0.69, + "learning_rate": 0.00016560309848764296, + "loss": 0.2857, + "step": 3730 + }, + { + "epoch": 0.69, + "learning_rate": 0.00016551088159350795, + "loss": 0.3527, + "step": 3740 + }, + { + "epoch": 0.69, + "learning_rate": 0.00016541866469937293, + "loss": 0.2585, + "step": 3750 + }, + { + "epoch": 0.69, + "learning_rate": 0.00016532644780523794, + "loss": 0.3003, + "step": 3760 + }, + { + "epoch": 0.7, + "learning_rate": 0.00016523423091110293, + "loss": 0.3127, + "step": 3770 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001651420140169679, + "loss": 0.325, + "step": 3780 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001650497971228329, + "loss": 0.3222, + "step": 3790 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001649575802286979, + "loss": 0.2263, + "step": 3800 + }, + { + "epoch": 0.7, + "learning_rate": 0.00016486536333456292, + "loss": 0.2938, + "step": 3810 + }, + { + "epoch": 0.7, + "learning_rate": 0.0001647731464404279, + "loss": 0.2571, + "step": 3820 + }, + { + "epoch": 0.71, + "learning_rate": 0.00016468092954629289, + "loss": 0.2619, + "step": 3830 + }, + { + "epoch": 0.71, + "learning_rate": 0.00016458871265215787, + "loss": 0.3952, + "step": 3840 + }, + { + "epoch": 0.71, + "learning_rate": 0.00016449649575802288, + "loss": 0.4201, + "step": 3850 + }, + { + "epoch": 0.71, + "learning_rate": 0.00016440427886388787, + "loss": 0.3919, + "step": 3860 + }, + { + "epoch": 0.71, + "learning_rate": 0.00016431206196975285, + "loss": 0.316, + "step": 3870 + }, + { + "epoch": 0.72, + "learning_rate": 0.00016421984507561786, + "loss": 0.3487, + "step": 3880 + }, + { + "epoch": 0.72, + "learning_rate": 0.00016412762818148287, + "loss": 0.3164, + "step": 3890 + }, + { + "epoch": 0.72, + "learning_rate": 0.00016403541128734786, + "loss": 0.311, + "step": 3900 + }, + { + "epoch": 0.72, + "learning_rate": 0.00016394319439321284, + "loss": 0.4413, + "step": 3910 + }, + { + "epoch": 0.72, + "learning_rate": 0.00016385097749907783, + "loss": 0.3052, + "step": 3920 + }, + { + "epoch": 0.72, + "learning_rate": 0.00016375876060494284, + "loss": 0.3408, + "step": 3930 + }, + { + "epoch": 0.73, + "learning_rate": 0.00016366654371080782, + "loss": 0.235, + "step": 3940 + }, + { + "epoch": 0.73, + "learning_rate": 0.0001635743268166728, + "loss": 0.2828, + "step": 3950 + }, + { + "epoch": 0.73, + "learning_rate": 0.00016348210992253782, + "loss": 0.3549, + "step": 3960 + }, + { + "epoch": 0.73, + "learning_rate": 0.00016338989302840283, + "loss": 0.305, + "step": 3970 + }, + { + "epoch": 0.73, + "learning_rate": 0.00016329767613426782, + "loss": 0.22, + "step": 3980 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001632054592401328, + "loss": 0.3241, + "step": 3990 + }, + { + "epoch": 0.74, + "learning_rate": 0.00016311324234599778, + "loss": 0.3235, + "step": 4000 + }, + { + "epoch": 0.74, + "eval_accuracy": 0.868358590469235, + "eval_loss": 0.28994396328926086, + "eval_runtime": 99.0495, + "eval_samples_per_second": 110.591, + "eval_steps_per_second": 13.831, + "step": 4000 + }, + { + "epoch": 0.74, + "learning_rate": 0.0001630210254518628, + "loss": 0.3307, + "step": 4010 + }, + { + "epoch": 0.74, + "learning_rate": 0.00016292880855772778, + "loss": 0.342, + "step": 4020 + }, + { + "epoch": 0.74, + "learning_rate": 0.00016283659166359276, + "loss": 0.3057, + "step": 4030 + }, + { + "epoch": 0.75, + "learning_rate": 0.00016274437476945778, + "loss": 0.3428, + "step": 4040 + }, + { + "epoch": 0.75, + "learning_rate": 0.0001626521578753228, + "loss": 0.3025, + "step": 4050 + }, + { + "epoch": 0.75, + "learning_rate": 0.00016255994098118777, + "loss": 0.3565, + "step": 4060 + }, + { + "epoch": 0.75, + "learning_rate": 0.00016246772408705276, + "loss": 0.2313, + "step": 4070 + }, + { + "epoch": 0.75, + "learning_rate": 0.00016237550719291774, + "loss": 0.2808, + "step": 4080 + }, + { + "epoch": 0.75, + "learning_rate": 0.00016228329029878275, + "loss": 0.2751, + "step": 4090 + }, + { + "epoch": 0.76, + "learning_rate": 0.00016219107340464774, + "loss": 0.3921, + "step": 4100 + }, + { + "epoch": 0.76, + "learning_rate": 0.00016209885651051272, + "loss": 0.2999, + "step": 4110 + }, + { + "epoch": 0.76, + "learning_rate": 0.00016200663961637773, + "loss": 0.2514, + "step": 4120 + }, + { + "epoch": 0.76, + "learning_rate": 0.00016191442272224274, + "loss": 0.2381, + "step": 4130 + }, + { + "epoch": 0.76, + "learning_rate": 0.00016182220582810773, + "loss": 0.3691, + "step": 4140 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001617299889339727, + "loss": 0.2547, + "step": 4150 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001616377720398377, + "loss": 0.2251, + "step": 4160 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001615455551457027, + "loss": 0.3225, + "step": 4170 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001614533382515677, + "loss": 0.403, + "step": 4180 + }, + { + "epoch": 0.77, + "learning_rate": 0.00016136112135743268, + "loss": 0.2988, + "step": 4190 + }, + { + "epoch": 0.77, + "learning_rate": 0.0001612689044632977, + "loss": 0.3268, + "step": 4200 + }, + { + "epoch": 0.78, + "learning_rate": 0.0001611766875691627, + "loss": 0.2629, + "step": 4210 + }, + { + "epoch": 0.78, + "learning_rate": 0.00016108447067502768, + "loss": 0.2314, + "step": 4220 + }, + { + "epoch": 0.78, + "learning_rate": 0.00016099225378089267, + "loss": 0.2813, + "step": 4230 + }, + { + "epoch": 0.78, + "learning_rate": 0.00016090003688675765, + "loss": 0.2538, + "step": 4240 + }, + { + "epoch": 0.78, + "learning_rate": 0.00016080781999262267, + "loss": 0.3281, + "step": 4250 + }, + { + "epoch": 0.79, + "learning_rate": 0.00016071560309848765, + "loss": 0.324, + "step": 4260 + }, + { + "epoch": 0.79, + "learning_rate": 0.00016062338620435263, + "loss": 0.2894, + "step": 4270 + }, + { + "epoch": 0.79, + "learning_rate": 0.00016053116931021765, + "loss": 0.3562, + "step": 4280 + }, + { + "epoch": 0.79, + "learning_rate": 0.00016043895241608266, + "loss": 0.2743, + "step": 4290 + }, + { + "epoch": 0.79, + "learning_rate": 0.00016034673552194764, + "loss": 0.4109, + "step": 4300 + }, + { + "epoch": 0.79, + "learning_rate": 0.00016025451862781263, + "loss": 0.3324, + "step": 4310 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001601623017336776, + "loss": 0.2997, + "step": 4320 + }, + { + "epoch": 0.8, + "learning_rate": 0.00016007008483954262, + "loss": 0.342, + "step": 4330 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001599778679454076, + "loss": 0.3475, + "step": 4340 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001598856510512726, + "loss": 0.2972, + "step": 4350 + }, + { + "epoch": 0.8, + "learning_rate": 0.0001597934341571376, + "loss": 0.3234, + "step": 4360 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001597012172630026, + "loss": 0.3133, + "step": 4370 + }, + { + "epoch": 0.81, + "learning_rate": 0.0001596090003688676, + "loss": 0.3302, + "step": 4380 + }, + { + "epoch": 0.81, + "learning_rate": 0.00015951678347473258, + "loss": 0.3424, + "step": 4390 + }, + { + "epoch": 0.81, + "learning_rate": 0.00015942456658059757, + "loss": 0.2909, + "step": 4400 + }, + { + "epoch": 0.81, + "learning_rate": 0.00015933234968646258, + "loss": 0.433, + "step": 4410 + }, + { + "epoch": 0.82, + "learning_rate": 0.00015924013279232756, + "loss": 0.2782, + "step": 4420 + }, + { + "epoch": 0.82, + "learning_rate": 0.00015914791589819255, + "loss": 0.316, + "step": 4430 + }, + { + "epoch": 0.82, + "learning_rate": 0.00015905569900405756, + "loss": 0.3129, + "step": 4440 + }, + { + "epoch": 0.82, + "learning_rate": 0.00015896348210992254, + "loss": 0.326, + "step": 4450 + }, + { + "epoch": 0.82, + "learning_rate": 0.00015887126521578755, + "loss": 0.2647, + "step": 4460 + }, + { + "epoch": 0.82, + "learning_rate": 0.00015877904832165254, + "loss": 0.316, + "step": 4470 + }, + { + "epoch": 0.83, + "learning_rate": 0.00015868683142751752, + "loss": 0.3266, + "step": 4480 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001585946145333825, + "loss": 0.3605, + "step": 4490 + }, + { + "epoch": 0.83, + "learning_rate": 0.00015850239763924752, + "loss": 0.3225, + "step": 4500 + }, + { + "epoch": 0.83, + "learning_rate": 0.0001584101807451125, + "loss": 0.3426, + "step": 4510 + }, + { + "epoch": 0.83, + "learning_rate": 0.00015831796385097752, + "loss": 0.322, + "step": 4520 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001582257469568425, + "loss": 0.2688, + "step": 4530 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001581335300627075, + "loss": 0.3182, + "step": 4540 + }, + { + "epoch": 0.84, + "learning_rate": 0.0001580413131685725, + "loss": 0.2599, + "step": 4550 + }, + { + "epoch": 0.84, + "learning_rate": 0.00015794909627443748, + "loss": 0.2918, + "step": 4560 + }, + { + "epoch": 0.84, + "learning_rate": 0.00015785687938030246, + "loss": 0.3931, + "step": 4570 + }, + { + "epoch": 0.84, + "learning_rate": 0.00015776466248616748, + "loss": 0.3272, + "step": 4580 + }, + { + "epoch": 0.85, + "learning_rate": 0.00015767244559203246, + "loss": 0.2132, + "step": 4590 + }, + { + "epoch": 0.85, + "learning_rate": 0.00015758022869789747, + "loss": 0.2428, + "step": 4600 + }, + { + "epoch": 0.85, + "learning_rate": 0.00015748801180376246, + "loss": 0.4144, + "step": 4610 + }, + { + "epoch": 0.85, + "learning_rate": 0.00015739579490962747, + "loss": 0.2609, + "step": 4620 + }, + { + "epoch": 0.85, + "learning_rate": 0.00015730357801549245, + "loss": 0.284, + "step": 4630 + }, + { + "epoch": 0.86, + "learning_rate": 0.00015721136112135744, + "loss": 0.315, + "step": 4640 + }, + { + "epoch": 0.86, + "learning_rate": 0.00015711914422722242, + "loss": 0.3503, + "step": 4650 + }, + { + "epoch": 0.86, + "learning_rate": 0.00015702692733308743, + "loss": 0.2572, + "step": 4660 + }, + { + "epoch": 0.86, + "learning_rate": 0.00015693471043895242, + "loss": 0.2936, + "step": 4670 + }, + { + "epoch": 0.86, + "learning_rate": 0.00015684249354481743, + "loss": 0.2479, + "step": 4680 + }, + { + "epoch": 0.86, + "learning_rate": 0.0001567502766506824, + "loss": 0.3408, + "step": 4690 + }, + { + "epoch": 0.87, + "learning_rate": 0.00015665805975654742, + "loss": 0.358, + "step": 4700 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001565658428624124, + "loss": 0.3405, + "step": 4710 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001564736259682774, + "loss": 0.2231, + "step": 4720 + }, + { + "epoch": 0.87, + "learning_rate": 0.00015638140907414238, + "loss": 0.3411, + "step": 4730 + }, + { + "epoch": 0.87, + "learning_rate": 0.0001562891921800074, + "loss": 0.2874, + "step": 4740 + }, + { + "epoch": 0.88, + "learning_rate": 0.00015619697528587237, + "loss": 0.3631, + "step": 4750 + }, + { + "epoch": 0.88, + "learning_rate": 0.00015610475839173736, + "loss": 0.2846, + "step": 4760 + }, + { + "epoch": 0.88, + "learning_rate": 0.00015601254149760237, + "loss": 0.3823, + "step": 4770 + }, + { + "epoch": 0.88, + "learning_rate": 0.00015592032460346738, + "loss": 0.3304, + "step": 4780 + }, + { + "epoch": 0.88, + "learning_rate": 0.00015582810770933237, + "loss": 0.2658, + "step": 4790 + }, + { + "epoch": 0.89, + "learning_rate": 0.00015573589081519735, + "loss": 0.2804, + "step": 4800 + }, + { + "epoch": 0.89, + "learning_rate": 0.00015564367392106233, + "loss": 0.2625, + "step": 4810 + }, + { + "epoch": 0.89, + "learning_rate": 0.00015555145702692735, + "loss": 0.2694, + "step": 4820 + }, + { + "epoch": 0.89, + "learning_rate": 0.00015545924013279233, + "loss": 0.4037, + "step": 4830 + }, + { + "epoch": 0.89, + "learning_rate": 0.00015536702323865731, + "loss": 0.2782, + "step": 4840 + }, + { + "epoch": 0.89, + "learning_rate": 0.00015527480634452233, + "loss": 0.2277, + "step": 4850 + }, + { + "epoch": 0.9, + "learning_rate": 0.00015518258945038734, + "loss": 0.3049, + "step": 4860 + }, + { + "epoch": 0.9, + "learning_rate": 0.00015509037255625232, + "loss": 0.315, + "step": 4870 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001549981556621173, + "loss": 0.2892, + "step": 4880 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001549059387679823, + "loss": 0.2593, + "step": 4890 + }, + { + "epoch": 0.9, + "learning_rate": 0.0001548137218738473, + "loss": 0.2331, + "step": 4900 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001547215049797123, + "loss": 0.3328, + "step": 4910 + }, + { + "epoch": 0.91, + "learning_rate": 0.00015462928808557727, + "loss": 0.2698, + "step": 4920 + }, + { + "epoch": 0.91, + "learning_rate": 0.00015453707119144228, + "loss": 0.2729, + "step": 4930 + }, + { + "epoch": 0.91, + "learning_rate": 0.0001544448542973073, + "loss": 0.2428, + "step": 4940 + }, + { + "epoch": 0.91, + "learning_rate": 0.00015435263740317228, + "loss": 0.3144, + "step": 4950 + }, + { + "epoch": 0.91, + "learning_rate": 0.00015426042050903726, + "loss": 0.2702, + "step": 4960 + }, + { + "epoch": 0.92, + "learning_rate": 0.00015416820361490225, + "loss": 0.2847, + "step": 4970 + }, + { + "epoch": 0.92, + "learning_rate": 0.00015407598672076726, + "loss": 0.2186, + "step": 4980 + }, + { + "epoch": 0.92, + "learning_rate": 0.00015398376982663224, + "loss": 0.3029, + "step": 4990 + }, + { + "epoch": 0.92, + "learning_rate": 0.00015389155293249723, + "loss": 0.2505, + "step": 5000 + }, + { + "epoch": 0.92, + "eval_accuracy": 0.8784918751141135, + "eval_loss": 0.26932471990585327, + "eval_runtime": 99.0676, + "eval_samples_per_second": 110.571, + "eval_steps_per_second": 13.829, + "step": 5000 + }, + { + "epoch": 0.92, + "learning_rate": 0.00015379933603836224, + "loss": 0.2593, + "step": 5010 + }, + { + "epoch": 0.93, + "learning_rate": 0.00015370711914422725, + "loss": 0.3717, + "step": 5020 + }, + { + "epoch": 0.93, + "learning_rate": 0.00015361490225009223, + "loss": 0.379, + "step": 5030 + }, + { + "epoch": 0.93, + "learning_rate": 0.00015352268535595722, + "loss": 0.2452, + "step": 5040 + }, + { + "epoch": 0.93, + "learning_rate": 0.0001534304684618222, + "loss": 0.2741, + "step": 5050 + }, + { + "epoch": 0.93, + "learning_rate": 0.0001533382515676872, + "loss": 0.3541, + "step": 5060 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001532460346735522, + "loss": 0.4734, + "step": 5070 + }, + { + "epoch": 0.94, + "learning_rate": 0.00015315381777941718, + "loss": 0.2334, + "step": 5080 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001530616008852822, + "loss": 0.2989, + "step": 5090 + }, + { + "epoch": 0.94, + "learning_rate": 0.00015296938399114718, + "loss": 0.3124, + "step": 5100 + }, + { + "epoch": 0.94, + "learning_rate": 0.0001528771670970122, + "loss": 0.2227, + "step": 5110 + }, + { + "epoch": 0.94, + "learning_rate": 0.00015278495020287718, + "loss": 0.3233, + "step": 5120 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015269273330874216, + "loss": 0.3275, + "step": 5130 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015260051641460714, + "loss": 0.2369, + "step": 5140 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015250829952047216, + "loss": 0.2122, + "step": 5150 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015241608262633714, + "loss": 0.2965, + "step": 5160 + }, + { + "epoch": 0.95, + "learning_rate": 0.00015232386573220215, + "loss": 0.3045, + "step": 5170 + }, + { + "epoch": 0.96, + "learning_rate": 0.00015223164883806714, + "loss": 0.3304, + "step": 5180 + }, + { + "epoch": 0.96, + "learning_rate": 0.00015213943194393215, + "loss": 0.1698, + "step": 5190 + }, + { + "epoch": 0.96, + "learning_rate": 0.00015204721504979713, + "loss": 0.2182, + "step": 5200 + }, + { + "epoch": 0.96, + "learning_rate": 0.00015195499815566212, + "loss": 0.2228, + "step": 5210 + }, + { + "epoch": 0.96, + "learning_rate": 0.0001518627812615271, + "loss": 0.2691, + "step": 5220 + }, + { + "epoch": 0.96, + "learning_rate": 0.0001517705643673921, + "loss": 0.3049, + "step": 5230 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001516783474732571, + "loss": 0.268, + "step": 5240 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001515861305791221, + "loss": 0.2829, + "step": 5250 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001514939136849871, + "loss": 0.2393, + "step": 5260 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001514016967908521, + "loss": 0.2802, + "step": 5270 + }, + { + "epoch": 0.97, + "learning_rate": 0.0001513094798967171, + "loss": 0.2809, + "step": 5280 + }, + { + "epoch": 0.98, + "learning_rate": 0.00015121726300258207, + "loss": 0.349, + "step": 5290 + }, + { + "epoch": 0.98, + "learning_rate": 0.00015112504610844706, + "loss": 0.3409, + "step": 5300 + }, + { + "epoch": 0.98, + "learning_rate": 0.00015103282921431207, + "loss": 0.328, + "step": 5310 + }, + { + "epoch": 0.98, + "learning_rate": 0.00015094061232017705, + "loss": 0.2118, + "step": 5320 + }, + { + "epoch": 0.98, + "learning_rate": 0.00015084839542604207, + "loss": 0.1884, + "step": 5330 + }, + { + "epoch": 0.98, + "learning_rate": 0.00015075617853190705, + "loss": 0.3182, + "step": 5340 + }, + { + "epoch": 0.99, + "learning_rate": 0.00015066396163777206, + "loss": 0.3171, + "step": 5350 + }, + { + "epoch": 0.99, + "learning_rate": 0.00015057174474363705, + "loss": 0.3254, + "step": 5360 + }, + { + "epoch": 0.99, + "learning_rate": 0.00015047952784950203, + "loss": 0.2539, + "step": 5370 + }, + { + "epoch": 0.99, + "learning_rate": 0.00015038731095536701, + "loss": 0.2322, + "step": 5380 + }, + { + "epoch": 0.99, + "learning_rate": 0.00015029509406123203, + "loss": 0.2637, + "step": 5390 + }, + { + "epoch": 1.0, + "learning_rate": 0.000150202877167097, + "loss": 0.3089, + "step": 5400 + }, + { + "epoch": 1.0, + "learning_rate": 0.00015011066027296202, + "loss": 0.2709, + "step": 5410 + }, + { + "epoch": 1.0, + "learning_rate": 0.000150018443378827, + "loss": 0.3575, + "step": 5420 + }, + { + "epoch": 1.0, + "learning_rate": 0.00014992622648469202, + "loss": 0.1982, + "step": 5430 + }, + { + "epoch": 1.0, + "learning_rate": 0.000149834009590557, + "loss": 0.1868, + "step": 5440 + }, + { + "epoch": 1.01, + "learning_rate": 0.000149741792696422, + "loss": 0.2507, + "step": 5450 + }, + { + "epoch": 1.01, + "learning_rate": 0.00014964957580228697, + "loss": 0.212, + "step": 5460 + }, + { + "epoch": 1.01, + "learning_rate": 0.00014955735890815198, + "loss": 0.3149, + "step": 5470 + }, + { + "epoch": 1.01, + "learning_rate": 0.00014946514201401697, + "loss": 0.2096, + "step": 5480 + }, + { + "epoch": 1.01, + "learning_rate": 0.00014937292511988198, + "loss": 0.1928, + "step": 5490 + }, + { + "epoch": 1.01, + "learning_rate": 0.00014928070822574696, + "loss": 0.2239, + "step": 5500 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014918849133161197, + "loss": 0.3118, + "step": 5510 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014909627443747696, + "loss": 0.2, + "step": 5520 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014900405754334194, + "loss": 0.2386, + "step": 5530 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014891184064920693, + "loss": 0.2723, + "step": 5540 + }, + { + "epoch": 1.02, + "learning_rate": 0.00014881962375507194, + "loss": 0.2094, + "step": 5550 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014872740686093692, + "loss": 0.1656, + "step": 5560 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014863518996680193, + "loss": 0.2381, + "step": 5570 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014854297307266692, + "loss": 0.1589, + "step": 5580 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014845075617853193, + "loss": 0.2366, + "step": 5590 + }, + { + "epoch": 1.03, + "learning_rate": 0.00014835853928439692, + "loss": 0.2766, + "step": 5600 + }, + { + "epoch": 1.03, + "learning_rate": 0.0001482663223902619, + "loss": 0.2008, + "step": 5610 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014817410549612688, + "loss": 0.2507, + "step": 5620 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001480818886019919, + "loss": 0.2559, + "step": 5630 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014798967170785688, + "loss": 0.2385, + "step": 5640 + }, + { + "epoch": 1.04, + "learning_rate": 0.0001478974548137219, + "loss": 0.2586, + "step": 5650 + }, + { + "epoch": 1.04, + "learning_rate": 0.00014780523791958688, + "loss": 0.3115, + "step": 5660 + }, + { + "epoch": 1.05, + "learning_rate": 0.0001477130210254519, + "loss": 0.3633, + "step": 5670 + }, + { + "epoch": 1.05, + "learning_rate": 0.00014762080413131687, + "loss": 0.1495, + "step": 5680 + }, + { + "epoch": 1.05, + "learning_rate": 0.00014752858723718186, + "loss": 0.1919, + "step": 5690 + }, + { + "epoch": 1.05, + "learning_rate": 0.00014743637034304684, + "loss": 0.2895, + "step": 5700 + }, + { + "epoch": 1.05, + "learning_rate": 0.00014734415344891183, + "loss": 0.2837, + "step": 5710 + }, + { + "epoch": 1.05, + "learning_rate": 0.00014725193655477684, + "loss": 0.2793, + "step": 5720 + }, + { + "epoch": 1.06, + "learning_rate": 0.00014715971966064185, + "loss": 0.2243, + "step": 5730 + }, + { + "epoch": 1.06, + "learning_rate": 0.00014706750276650683, + "loss": 0.1981, + "step": 5740 + }, + { + "epoch": 1.06, + "learning_rate": 0.00014697528587237182, + "loss": 0.2063, + "step": 5750 + }, + { + "epoch": 1.06, + "learning_rate": 0.00014688306897823683, + "loss": 0.252, + "step": 5760 + }, + { + "epoch": 1.06, + "learning_rate": 0.0001467908520841018, + "loss": 0.2525, + "step": 5770 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001466986351899668, + "loss": 0.3201, + "step": 5780 + }, + { + "epoch": 1.07, + "learning_rate": 0.00014660641829583178, + "loss": 0.2411, + "step": 5790 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001465142014016968, + "loss": 0.3181, + "step": 5800 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001464219845075618, + "loss": 0.2225, + "step": 5810 + }, + { + "epoch": 1.07, + "learning_rate": 0.0001463297676134268, + "loss": 0.4026, + "step": 5820 + }, + { + "epoch": 1.08, + "learning_rate": 0.00014623755071929177, + "loss": 0.3204, + "step": 5830 + }, + { + "epoch": 1.08, + "learning_rate": 0.00014614533382515679, + "loss": 0.2252, + "step": 5840 + }, + { + "epoch": 1.08, + "learning_rate": 0.00014605311693102177, + "loss": 0.2579, + "step": 5850 + }, + { + "epoch": 1.08, + "learning_rate": 0.00014596090003688675, + "loss": 0.2791, + "step": 5860 + }, + { + "epoch": 1.08, + "learning_rate": 0.00014586868314275174, + "loss": 0.2399, + "step": 5870 + }, + { + "epoch": 1.08, + "learning_rate": 0.00014577646624861675, + "loss": 0.2829, + "step": 5880 + }, + { + "epoch": 1.09, + "learning_rate": 0.00014568424935448176, + "loss": 0.1904, + "step": 5890 + }, + { + "epoch": 1.09, + "learning_rate": 0.00014559203246034675, + "loss": 0.234, + "step": 5900 + }, + { + "epoch": 1.09, + "learning_rate": 0.00014549981556621173, + "loss": 0.243, + "step": 5910 + }, + { + "epoch": 1.09, + "learning_rate": 0.00014540759867207674, + "loss": 0.3356, + "step": 5920 + }, + { + "epoch": 1.09, + "learning_rate": 0.00014531538177794173, + "loss": 0.2302, + "step": 5930 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001452231648838067, + "loss": 0.2237, + "step": 5940 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001451309479896717, + "loss": 0.2933, + "step": 5950 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001450387310955367, + "loss": 0.1666, + "step": 5960 + }, + { + "epoch": 1.1, + "learning_rate": 0.00014494651420140172, + "loss": 0.2424, + "step": 5970 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001448542973072667, + "loss": 0.1896, + "step": 5980 + }, + { + "epoch": 1.1, + "learning_rate": 0.0001447620804131317, + "loss": 0.2417, + "step": 5990 + }, + { + "epoch": 1.11, + "learning_rate": 0.0001446698635189967, + "loss": 0.2484, + "step": 6000 + }, + { + "epoch": 1.11, + "eval_accuracy": 0.8888990323169619, + "eval_loss": 0.25469714403152466, + "eval_runtime": 99.1559, + "eval_samples_per_second": 110.472, + "eval_steps_per_second": 13.817, + "step": 6000 + }, + { + "epoch": 1.11, + "learning_rate": 0.00014457764662486168, + "loss": 0.1637, + "step": 6010 + }, + { + "epoch": 1.11, + "learning_rate": 0.00014448542973072667, + "loss": 0.25, + "step": 6020 + }, + { + "epoch": 1.11, + "learning_rate": 0.00014439321283659165, + "loss": 0.2656, + "step": 6030 + }, + { + "epoch": 1.11, + "learning_rate": 0.00014430099594245666, + "loss": 0.231, + "step": 6040 + }, + { + "epoch": 1.12, + "learning_rate": 0.00014420877904832167, + "loss": 0.2561, + "step": 6050 + }, + { + "epoch": 1.12, + "learning_rate": 0.00014411656215418666, + "loss": 0.2932, + "step": 6060 + }, + { + "epoch": 1.12, + "learning_rate": 0.00014402434526005164, + "loss": 0.2187, + "step": 6070 + }, + { + "epoch": 1.12, + "learning_rate": 0.00014393212836591665, + "loss": 0.2097, + "step": 6080 + }, + { + "epoch": 1.12, + "learning_rate": 0.00014383991147178164, + "loss": 0.2195, + "step": 6090 + }, + { + "epoch": 1.13, + "learning_rate": 0.00014374769457764662, + "loss": 0.2479, + "step": 6100 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001436554776835116, + "loss": 0.2269, + "step": 6110 + }, + { + "epoch": 1.13, + "learning_rate": 0.00014356326078937662, + "loss": 0.3276, + "step": 6120 + }, + { + "epoch": 1.13, + "learning_rate": 0.00014347104389524163, + "loss": 0.1641, + "step": 6130 + }, + { + "epoch": 1.13, + "learning_rate": 0.00014337882700110662, + "loss": 0.2639, + "step": 6140 + }, + { + "epoch": 1.13, + "learning_rate": 0.0001432866101069716, + "loss": 0.2498, + "step": 6150 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001431943932128366, + "loss": 0.259, + "step": 6160 + }, + { + "epoch": 1.14, + "learning_rate": 0.0001431021763187016, + "loss": 0.242, + "step": 6170 + }, + { + "epoch": 1.14, + "learning_rate": 0.00014300995942456658, + "loss": 0.3055, + "step": 6180 + }, + { + "epoch": 1.14, + "learning_rate": 0.00014291774253043156, + "loss": 0.2876, + "step": 6190 + }, + { + "epoch": 1.14, + "learning_rate": 0.00014282552563629658, + "loss": 0.2389, + "step": 6200 + }, + { + "epoch": 1.15, + "learning_rate": 0.0001427333087421616, + "loss": 0.2328, + "step": 6210 + }, + { + "epoch": 1.15, + "learning_rate": 0.00014264109184802657, + "loss": 0.2067, + "step": 6220 + }, + { + "epoch": 1.15, + "learning_rate": 0.00014254887495389156, + "loss": 0.2483, + "step": 6230 + }, + { + "epoch": 1.15, + "learning_rate": 0.00014245665805975657, + "loss": 0.1956, + "step": 6240 + }, + { + "epoch": 1.15, + "learning_rate": 0.00014236444116562155, + "loss": 0.1735, + "step": 6250 + }, + { + "epoch": 1.15, + "learning_rate": 0.00014227222427148654, + "loss": 0.2671, + "step": 6260 + }, + { + "epoch": 1.16, + "learning_rate": 0.00014218000737735152, + "loss": 0.191, + "step": 6270 + }, + { + "epoch": 1.16, + "learning_rate": 0.00014208779048321653, + "loss": 0.2442, + "step": 6280 + }, + { + "epoch": 1.16, + "learning_rate": 0.00014199557358908154, + "loss": 0.3221, + "step": 6290 + }, + { + "epoch": 1.16, + "learning_rate": 0.00014190335669494653, + "loss": 0.2409, + "step": 6300 + }, + { + "epoch": 1.16, + "learning_rate": 0.0001418111398008115, + "loss": 0.3028, + "step": 6310 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001417189229066765, + "loss": 0.2569, + "step": 6320 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001416267060125415, + "loss": 0.2352, + "step": 6330 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001415344891184065, + "loss": 0.2273, + "step": 6340 + }, + { + "epoch": 1.17, + "learning_rate": 0.00014144227222427148, + "loss": 0.2696, + "step": 6350 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001413500553301365, + "loss": 0.2499, + "step": 6360 + }, + { + "epoch": 1.17, + "learning_rate": 0.0001412578384360015, + "loss": 0.2661, + "step": 6370 + }, + { + "epoch": 1.18, + "learning_rate": 0.00014116562154186649, + "loss": 0.1747, + "step": 6380 + }, + { + "epoch": 1.18, + "learning_rate": 0.00014107340464773147, + "loss": 0.2103, + "step": 6390 + }, + { + "epoch": 1.18, + "learning_rate": 0.00014098118775359645, + "loss": 0.2952, + "step": 6400 + }, + { + "epoch": 1.18, + "learning_rate": 0.00014088897085946147, + "loss": 0.3027, + "step": 6410 + }, + { + "epoch": 1.18, + "learning_rate": 0.00014079675396532645, + "loss": 0.1631, + "step": 6420 + }, + { + "epoch": 1.19, + "learning_rate": 0.00014070453707119143, + "loss": 0.2349, + "step": 6430 + }, + { + "epoch": 1.19, + "learning_rate": 0.00014061232017705645, + "loss": 0.2795, + "step": 6440 + }, + { + "epoch": 1.19, + "learning_rate": 0.00014052010328292146, + "loss": 0.2568, + "step": 6450 + }, + { + "epoch": 1.19, + "learning_rate": 0.00014042788638878644, + "loss": 0.3058, + "step": 6460 + }, + { + "epoch": 1.19, + "learning_rate": 0.00014033566949465143, + "loss": 0.2498, + "step": 6470 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001402434526005164, + "loss": 0.283, + "step": 6480 + }, + { + "epoch": 1.2, + "learning_rate": 0.00014015123570638142, + "loss": 0.2488, + "step": 6490 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001400590188122464, + "loss": 0.3068, + "step": 6500 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001399668019181114, + "loss": 0.1956, + "step": 6510 + }, + { + "epoch": 1.2, + "learning_rate": 0.0001398745850239764, + "loss": 0.3158, + "step": 6520 + }, + { + "epoch": 1.2, + "learning_rate": 0.00013978236812984141, + "loss": 0.2339, + "step": 6530 + }, + { + "epoch": 1.21, + "learning_rate": 0.0001396901512357064, + "loss": 0.1909, + "step": 6540 + }, + { + "epoch": 1.21, + "learning_rate": 0.00013959793434157138, + "loss": 0.1971, + "step": 6550 + }, + { + "epoch": 1.21, + "learning_rate": 0.00013950571744743637, + "loss": 0.2328, + "step": 6560 + }, + { + "epoch": 1.21, + "learning_rate": 0.00013941350055330138, + "loss": 0.2454, + "step": 6570 + }, + { + "epoch": 1.21, + "learning_rate": 0.00013932128365916636, + "loss": 0.2013, + "step": 6580 + }, + { + "epoch": 1.22, + "learning_rate": 0.00013922906676503135, + "loss": 0.2084, + "step": 6590 + }, + { + "epoch": 1.22, + "learning_rate": 0.00013913684987089636, + "loss": 0.2192, + "step": 6600 + }, + { + "epoch": 1.22, + "learning_rate": 0.00013904463297676137, + "loss": 0.205, + "step": 6610 + }, + { + "epoch": 1.22, + "learning_rate": 0.00013895241608262635, + "loss": 0.271, + "step": 6620 + }, + { + "epoch": 1.22, + "learning_rate": 0.00013886019918849134, + "loss": 0.2771, + "step": 6630 + }, + { + "epoch": 1.22, + "learning_rate": 0.00013876798229435632, + "loss": 0.2482, + "step": 6640 + }, + { + "epoch": 1.23, + "learning_rate": 0.00013867576540022134, + "loss": 0.2578, + "step": 6650 + }, + { + "epoch": 1.23, + "learning_rate": 0.00013858354850608632, + "loss": 0.23, + "step": 6660 + }, + { + "epoch": 1.23, + "learning_rate": 0.0001384913316119513, + "loss": 0.1738, + "step": 6670 + }, + { + "epoch": 1.23, + "learning_rate": 0.00013839911471781632, + "loss": 0.2621, + "step": 6680 + }, + { + "epoch": 1.23, + "learning_rate": 0.00013830689782368133, + "loss": 0.2004, + "step": 6690 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001382146809295463, + "loss": 0.1957, + "step": 6700 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001381224640354113, + "loss": 0.2362, + "step": 6710 + }, + { + "epoch": 1.24, + "learning_rate": 0.00013803024714127628, + "loss": 0.2001, + "step": 6720 + }, + { + "epoch": 1.24, + "learning_rate": 0.0001379380302471413, + "loss": 0.2112, + "step": 6730 + }, + { + "epoch": 1.24, + "learning_rate": 0.00013784581335300628, + "loss": 0.1598, + "step": 6740 + }, + { + "epoch": 1.24, + "learning_rate": 0.00013775359645887126, + "loss": 0.2422, + "step": 6750 + }, + { + "epoch": 1.25, + "learning_rate": 0.00013766137956473627, + "loss": 0.2663, + "step": 6760 + }, + { + "epoch": 1.25, + "learning_rate": 0.00013756916267060128, + "loss": 0.1904, + "step": 6770 + }, + { + "epoch": 1.25, + "learning_rate": 0.00013747694577646627, + "loss": 0.2727, + "step": 6780 + }, + { + "epoch": 1.25, + "learning_rate": 0.00013738472888233125, + "loss": 0.2276, + "step": 6790 + }, + { + "epoch": 1.25, + "learning_rate": 0.00013729251198819624, + "loss": 0.2364, + "step": 6800 + }, + { + "epoch": 1.26, + "learning_rate": 0.00013720029509406125, + "loss": 0.2491, + "step": 6810 + }, + { + "epoch": 1.26, + "learning_rate": 0.00013710807819992623, + "loss": 0.2182, + "step": 6820 + }, + { + "epoch": 1.26, + "learning_rate": 0.00013701586130579122, + "loss": 0.1737, + "step": 6830 + }, + { + "epoch": 1.26, + "learning_rate": 0.00013692364441165623, + "loss": 0.2261, + "step": 6840 + }, + { + "epoch": 1.26, + "learning_rate": 0.00013683142751752124, + "loss": 0.3482, + "step": 6850 + }, + { + "epoch": 1.27, + "learning_rate": 0.00013673921062338622, + "loss": 0.2293, + "step": 6860 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001366469937292512, + "loss": 0.2106, + "step": 6870 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001365547768351162, + "loss": 0.2274, + "step": 6880 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001364625599409812, + "loss": 0.2122, + "step": 6890 + }, + { + "epoch": 1.27, + "learning_rate": 0.0001363703430468462, + "loss": 0.2738, + "step": 6900 + }, + { + "epoch": 1.27, + "learning_rate": 0.00013627812615271117, + "loss": 0.2573, + "step": 6910 + }, + { + "epoch": 1.28, + "learning_rate": 0.00013618590925857619, + "loss": 0.2886, + "step": 6920 + }, + { + "epoch": 1.28, + "learning_rate": 0.0001360936923644412, + "loss": 0.2405, + "step": 6930 + }, + { + "epoch": 1.28, + "learning_rate": 0.00013600147547030618, + "loss": 0.2874, + "step": 6940 + }, + { + "epoch": 1.28, + "learning_rate": 0.00013590925857617117, + "loss": 0.2308, + "step": 6950 + }, + { + "epoch": 1.28, + "learning_rate": 0.00013581704168203615, + "loss": 0.1814, + "step": 6960 + }, + { + "epoch": 1.29, + "learning_rate": 0.00013572482478790113, + "loss": 0.2587, + "step": 6970 + }, + { + "epoch": 1.29, + "learning_rate": 0.00013563260789376615, + "loss": 0.1863, + "step": 6980 + }, + { + "epoch": 1.29, + "learning_rate": 0.00013554039099963113, + "loss": 0.238, + "step": 6990 + }, + { + "epoch": 1.29, + "learning_rate": 0.00013544817410549614, + "loss": 0.1933, + "step": 7000 + }, + { + "epoch": 1.29, + "eval_accuracy": 0.8900858134014972, + "eval_loss": 0.25212499499320984, + "eval_runtime": 99.1283, + "eval_samples_per_second": 110.503, + "eval_steps_per_second": 13.82, + "step": 7000 + }, + { + "epoch": 1.29, + "learning_rate": 0.00013535595721136113, + "loss": 0.3187, + "step": 7010 + }, + { + "epoch": 1.29, + "learning_rate": 0.00013526374031722614, + "loss": 0.2061, + "step": 7020 + }, + { + "epoch": 1.3, + "learning_rate": 0.00013517152342309112, + "loss": 0.1919, + "step": 7030 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001350793065289561, + "loss": 0.2051, + "step": 7040 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001349870896348211, + "loss": 0.1793, + "step": 7050 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001348948727406861, + "loss": 0.1765, + "step": 7060 + }, + { + "epoch": 1.3, + "learning_rate": 0.0001348026558465511, + "loss": 0.267, + "step": 7070 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001347104389524161, + "loss": 0.1876, + "step": 7080 + }, + { + "epoch": 1.31, + "learning_rate": 0.00013461822205828108, + "loss": 0.255, + "step": 7090 + }, + { + "epoch": 1.31, + "learning_rate": 0.0001345260051641461, + "loss": 0.1907, + "step": 7100 + }, + { + "epoch": 1.31, + "learning_rate": 0.00013443378827001108, + "loss": 0.3561, + "step": 7110 + }, + { + "epoch": 1.31, + "learning_rate": 0.00013434157137587606, + "loss": 0.2443, + "step": 7120 + }, + { + "epoch": 1.32, + "learning_rate": 0.00013424935448174105, + "loss": 0.1717, + "step": 7130 + }, + { + "epoch": 1.32, + "learning_rate": 0.00013415713758760606, + "loss": 0.2216, + "step": 7140 + }, + { + "epoch": 1.32, + "learning_rate": 0.00013406492069347104, + "loss": 0.1926, + "step": 7150 + }, + { + "epoch": 1.32, + "learning_rate": 0.00013397270379933603, + "loss": 0.3015, + "step": 7160 + }, + { + "epoch": 1.32, + "learning_rate": 0.00013388048690520104, + "loss": 0.1808, + "step": 7170 + }, + { + "epoch": 1.32, + "learning_rate": 0.00013378827001106605, + "loss": 0.2813, + "step": 7180 + }, + { + "epoch": 1.33, + "learning_rate": 0.00013369605311693104, + "loss": 0.2651, + "step": 7190 + }, + { + "epoch": 1.33, + "learning_rate": 0.00013360383622279602, + "loss": 0.1868, + "step": 7200 + }, + { + "epoch": 1.33, + "learning_rate": 0.000133511619328661, + "loss": 0.2673, + "step": 7210 + }, + { + "epoch": 1.33, + "learning_rate": 0.00013341940243452602, + "loss": 0.1917, + "step": 7220 + }, + { + "epoch": 1.33, + "learning_rate": 0.000133327185540391, + "loss": 0.2866, + "step": 7230 + }, + { + "epoch": 1.34, + "learning_rate": 0.00013323496864625598, + "loss": 0.269, + "step": 7240 + }, + { + "epoch": 1.34, + "learning_rate": 0.000133142751752121, + "loss": 0.205, + "step": 7250 + }, + { + "epoch": 1.34, + "learning_rate": 0.000133050534857986, + "loss": 0.2133, + "step": 7260 + }, + { + "epoch": 1.34, + "learning_rate": 0.000132958317963851, + "loss": 0.2186, + "step": 7270 + }, + { + "epoch": 1.34, + "learning_rate": 0.00013286610106971598, + "loss": 0.2304, + "step": 7280 + }, + { + "epoch": 1.34, + "learning_rate": 0.00013277388417558096, + "loss": 0.189, + "step": 7290 + }, + { + "epoch": 1.35, + "learning_rate": 0.00013268166728144597, + "loss": 0.2829, + "step": 7300 + }, + { + "epoch": 1.35, + "learning_rate": 0.00013258945038731096, + "loss": 0.2703, + "step": 7310 + }, + { + "epoch": 1.35, + "learning_rate": 0.00013249723349317594, + "loss": 0.1833, + "step": 7320 + }, + { + "epoch": 1.35, + "learning_rate": 0.00013240501659904095, + "loss": 0.2662, + "step": 7330 + }, + { + "epoch": 1.35, + "learning_rate": 0.00013231279970490596, + "loss": 0.2284, + "step": 7340 + }, + { + "epoch": 1.36, + "learning_rate": 0.00013222058281077095, + "loss": 0.1519, + "step": 7350 + }, + { + "epoch": 1.36, + "learning_rate": 0.00013212836591663593, + "loss": 0.2817, + "step": 7360 + }, + { + "epoch": 1.36, + "learning_rate": 0.00013203614902250092, + "loss": 0.1787, + "step": 7370 + }, + { + "epoch": 1.36, + "learning_rate": 0.00013194393212836593, + "loss": 0.1595, + "step": 7380 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001318517152342309, + "loss": 0.2201, + "step": 7390 + }, + { + "epoch": 1.36, + "learning_rate": 0.0001317594983400959, + "loss": 0.2606, + "step": 7400 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001316672814459609, + "loss": 0.2759, + "step": 7410 + }, + { + "epoch": 1.37, + "learning_rate": 0.00013157506455182592, + "loss": 0.2757, + "step": 7420 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001314828476576909, + "loss": 0.2708, + "step": 7430 + }, + { + "epoch": 1.37, + "learning_rate": 0.0001313906307635559, + "loss": 0.1542, + "step": 7440 + }, + { + "epoch": 1.37, + "learning_rate": 0.00013129841386942087, + "loss": 0.2149, + "step": 7450 + }, + { + "epoch": 1.38, + "learning_rate": 0.00013120619697528589, + "loss": 0.1677, + "step": 7460 + }, + { + "epoch": 1.38, + "learning_rate": 0.00013111398008115087, + "loss": 0.2352, + "step": 7470 + }, + { + "epoch": 1.38, + "learning_rate": 0.00013102176318701585, + "loss": 0.2292, + "step": 7480 + }, + { + "epoch": 1.38, + "learning_rate": 0.00013092954629288087, + "loss": 0.2387, + "step": 7490 + }, + { + "epoch": 1.38, + "learning_rate": 0.00013083732939874588, + "loss": 0.2278, + "step": 7500 + }, + { + "epoch": 1.39, + "learning_rate": 0.00013074511250461086, + "loss": 0.1767, + "step": 7510 + }, + { + "epoch": 1.39, + "learning_rate": 0.00013065289561047585, + "loss": 0.1616, + "step": 7520 + }, + { + "epoch": 1.39, + "learning_rate": 0.00013056067871634083, + "loss": 0.2363, + "step": 7530 + }, + { + "epoch": 1.39, + "learning_rate": 0.00013046846182220584, + "loss": 0.1996, + "step": 7540 + }, + { + "epoch": 1.39, + "learning_rate": 0.00013037624492807083, + "loss": 0.2386, + "step": 7550 + }, + { + "epoch": 1.39, + "learning_rate": 0.0001302840280339358, + "loss": 0.1635, + "step": 7560 + }, + { + "epoch": 1.4, + "learning_rate": 0.00013019181113980082, + "loss": 0.1882, + "step": 7570 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001300995942456658, + "loss": 0.2306, + "step": 7580 + }, + { + "epoch": 1.4, + "learning_rate": 0.00013000737735153082, + "loss": 0.3108, + "step": 7590 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001299151604573958, + "loss": 0.1904, + "step": 7600 + }, + { + "epoch": 1.4, + "learning_rate": 0.0001298229435632608, + "loss": 0.3081, + "step": 7610 + }, + { + "epoch": 1.41, + "learning_rate": 0.00012973072666912577, + "loss": 0.1683, + "step": 7620 + }, + { + "epoch": 1.41, + "learning_rate": 0.00012963850977499078, + "loss": 0.2353, + "step": 7630 + }, + { + "epoch": 1.41, + "learning_rate": 0.00012954629288085577, + "loss": 0.1807, + "step": 7640 + }, + { + "epoch": 1.41, + "learning_rate": 0.00012945407598672078, + "loss": 0.1619, + "step": 7650 + }, + { + "epoch": 1.41, + "learning_rate": 0.00012936185909258576, + "loss": 0.1439, + "step": 7660 + }, + { + "epoch": 1.41, + "learning_rate": 0.00012926964219845077, + "loss": 0.2855, + "step": 7670 + }, + { + "epoch": 1.42, + "learning_rate": 0.00012917742530431576, + "loss": 0.2415, + "step": 7680 + }, + { + "epoch": 1.42, + "learning_rate": 0.00012908520841018074, + "loss": 0.112, + "step": 7690 + }, + { + "epoch": 1.42, + "learning_rate": 0.00012899299151604573, + "loss": 0.3052, + "step": 7700 + }, + { + "epoch": 1.42, + "learning_rate": 0.00012890077462191074, + "loss": 0.2934, + "step": 7710 + }, + { + "epoch": 1.42, + "learning_rate": 0.00012880855772777572, + "loss": 0.2106, + "step": 7720 + }, + { + "epoch": 1.43, + "learning_rate": 0.00012871634083364074, + "loss": 0.2402, + "step": 7730 + }, + { + "epoch": 1.43, + "learning_rate": 0.00012862412393950572, + "loss": 0.2989, + "step": 7740 + }, + { + "epoch": 1.43, + "learning_rate": 0.00012853190704537073, + "loss": 0.2562, + "step": 7750 + }, + { + "epoch": 1.43, + "learning_rate": 0.00012843969015123572, + "loss": 0.2612, + "step": 7760 + }, + { + "epoch": 1.43, + "learning_rate": 0.0001283474732571007, + "loss": 0.2899, + "step": 7770 + }, + { + "epoch": 1.43, + "learning_rate": 0.00012825525636296568, + "loss": 0.1914, + "step": 7780 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001281630394688307, + "loss": 0.2035, + "step": 7790 + }, + { + "epoch": 1.44, + "learning_rate": 0.00012807082257469568, + "loss": 0.1966, + "step": 7800 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001279786056805607, + "loss": 0.3944, + "step": 7810 + }, + { + "epoch": 1.44, + "learning_rate": 0.00012788638878642568, + "loss": 0.3371, + "step": 7820 + }, + { + "epoch": 1.44, + "learning_rate": 0.0001277941718922907, + "loss": 0.2058, + "step": 7830 + }, + { + "epoch": 1.45, + "learning_rate": 0.00012770195499815567, + "loss": 0.286, + "step": 7840 + }, + { + "epoch": 1.45, + "learning_rate": 0.00012760973810402066, + "loss": 0.2834, + "step": 7850 + }, + { + "epoch": 1.45, + "learning_rate": 0.00012751752120988564, + "loss": 0.1462, + "step": 7860 + }, + { + "epoch": 1.45, + "learning_rate": 0.00012742530431575065, + "loss": 0.2344, + "step": 7870 + }, + { + "epoch": 1.45, + "learning_rate": 0.00012733308742161564, + "loss": 0.2592, + "step": 7880 + }, + { + "epoch": 1.46, + "learning_rate": 0.00012724087052748065, + "loss": 0.3563, + "step": 7890 + }, + { + "epoch": 1.46, + "learning_rate": 0.00012714865363334563, + "loss": 0.1902, + "step": 7900 + }, + { + "epoch": 1.46, + "learning_rate": 0.00012705643673921064, + "loss": 0.2049, + "step": 7910 + }, + { + "epoch": 1.46, + "learning_rate": 0.00012696421984507563, + "loss": 0.2537, + "step": 7920 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001268720029509406, + "loss": 0.2205, + "step": 7930 + }, + { + "epoch": 1.46, + "learning_rate": 0.0001267797860568056, + "loss": 0.1966, + "step": 7940 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001266875691626706, + "loss": 0.2639, + "step": 7950 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001265953522685356, + "loss": 0.2864, + "step": 7960 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001265031353744006, + "loss": 0.2161, + "step": 7970 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001264109184802656, + "loss": 0.2435, + "step": 7980 + }, + { + "epoch": 1.47, + "learning_rate": 0.0001263187015861306, + "loss": 0.3763, + "step": 7990 + }, + { + "epoch": 1.48, + "learning_rate": 0.00012622648469199559, + "loss": 0.1497, + "step": 8000 + }, + { + "epoch": 1.48, + "eval_accuracy": 0.8929158298338506, + "eval_loss": 0.24426758289337158, + "eval_runtime": 99.0537, + "eval_samples_per_second": 110.586, + "eval_steps_per_second": 13.831, + "step": 8000 + }, + { + "epoch": 1.48, + "learning_rate": 0.00012613426779786057, + "loss": 0.2817, + "step": 8010 + }, + { + "epoch": 1.48, + "learning_rate": 0.00012604205090372555, + "loss": 0.1709, + "step": 8020 + }, + { + "epoch": 1.48, + "learning_rate": 0.00012594983400959057, + "loss": 0.2355, + "step": 8030 + }, + { + "epoch": 1.48, + "learning_rate": 0.00012585761711545555, + "loss": 0.1934, + "step": 8040 + }, + { + "epoch": 1.48, + "learning_rate": 0.00012576540022132056, + "loss": 0.1798, + "step": 8050 + }, + { + "epoch": 1.49, + "learning_rate": 0.00012567318332718555, + "loss": 0.2171, + "step": 8060 + }, + { + "epoch": 1.49, + "learning_rate": 0.00012558096643305056, + "loss": 0.1634, + "step": 8070 + }, + { + "epoch": 1.49, + "learning_rate": 0.00012548874953891554, + "loss": 0.2284, + "step": 8080 + }, + { + "epoch": 1.49, + "learning_rate": 0.00012539653264478053, + "loss": 0.1464, + "step": 8090 + }, + { + "epoch": 1.49, + "learning_rate": 0.0001253043157506455, + "loss": 0.1662, + "step": 8100 + }, + { + "epoch": 1.5, + "learning_rate": 0.00012521209885651052, + "loss": 0.2556, + "step": 8110 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001251198819623755, + "loss": 0.2933, + "step": 8120 + }, + { + "epoch": 1.5, + "learning_rate": 0.00012502766506824052, + "loss": 0.268, + "step": 8130 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001249354481741055, + "loss": 0.2616, + "step": 8140 + }, + { + "epoch": 1.5, + "learning_rate": 0.00012484323127997051, + "loss": 0.2024, + "step": 8150 + }, + { + "epoch": 1.5, + "learning_rate": 0.0001247510143858355, + "loss": 0.2202, + "step": 8160 + }, + { + "epoch": 1.51, + "learning_rate": 0.00012465879749170048, + "loss": 0.2795, + "step": 8170 + }, + { + "epoch": 1.51, + "learning_rate": 0.00012456658059756547, + "loss": 0.1465, + "step": 8180 + }, + { + "epoch": 1.51, + "learning_rate": 0.00012447436370343048, + "loss": 0.2327, + "step": 8190 + }, + { + "epoch": 1.51, + "learning_rate": 0.00012438214680929546, + "loss": 0.2233, + "step": 8200 + }, + { + "epoch": 1.51, + "learning_rate": 0.00012428992991516047, + "loss": 0.2079, + "step": 8210 + }, + { + "epoch": 1.52, + "learning_rate": 0.00012419771302102546, + "loss": 0.2067, + "step": 8220 + }, + { + "epoch": 1.52, + "learning_rate": 0.00012410549612689044, + "loss": 0.231, + "step": 8230 + }, + { + "epoch": 1.52, + "learning_rate": 0.00012401327923275546, + "loss": 0.3321, + "step": 8240 + }, + { + "epoch": 1.52, + "learning_rate": 0.00012392106233862044, + "loss": 0.2292, + "step": 8250 + }, + { + "epoch": 1.52, + "learning_rate": 0.00012382884544448542, + "loss": 0.2285, + "step": 8260 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001237366285503504, + "loss": 0.145, + "step": 8270 + }, + { + "epoch": 1.53, + "learning_rate": 0.00012364441165621542, + "loss": 0.2573, + "step": 8280 + }, + { + "epoch": 1.53, + "learning_rate": 0.00012355219476208043, + "loss": 0.2345, + "step": 8290 + }, + { + "epoch": 1.53, + "learning_rate": 0.00012345997786794542, + "loss": 0.2883, + "step": 8300 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001233677609738104, + "loss": 0.2041, + "step": 8310 + }, + { + "epoch": 1.53, + "learning_rate": 0.0001232755440796754, + "loss": 0.2447, + "step": 8320 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001231833271855404, + "loss": 0.1614, + "step": 8330 + }, + { + "epoch": 1.54, + "learning_rate": 0.00012309111029140538, + "loss": 0.1595, + "step": 8340 + }, + { + "epoch": 1.54, + "learning_rate": 0.00012299889339727036, + "loss": 0.1773, + "step": 8350 + }, + { + "epoch": 1.54, + "learning_rate": 0.00012290667650313538, + "loss": 0.2009, + "step": 8360 + }, + { + "epoch": 1.54, + "learning_rate": 0.0001228144596090004, + "loss": 0.134, + "step": 8370 + }, + { + "epoch": 1.55, + "learning_rate": 0.00012272224271486537, + "loss": 0.2702, + "step": 8380 + }, + { + "epoch": 1.55, + "learning_rate": 0.00012263002582073036, + "loss": 0.1946, + "step": 8390 + }, + { + "epoch": 1.55, + "learning_rate": 0.00012253780892659537, + "loss": 0.2947, + "step": 8400 + }, + { + "epoch": 1.55, + "learning_rate": 0.00012244559203246035, + "loss": 0.3152, + "step": 8410 + }, + { + "epoch": 1.55, + "learning_rate": 0.00012235337513832534, + "loss": 0.2391, + "step": 8420 + }, + { + "epoch": 1.55, + "learning_rate": 0.00012226115824419032, + "loss": 0.2751, + "step": 8430 + }, + { + "epoch": 1.56, + "learning_rate": 0.00012216894135005533, + "loss": 0.204, + "step": 8440 + }, + { + "epoch": 1.56, + "learning_rate": 0.00012207672445592034, + "loss": 0.2294, + "step": 8450 + }, + { + "epoch": 1.56, + "learning_rate": 0.00012198450756178532, + "loss": 0.1751, + "step": 8460 + }, + { + "epoch": 1.56, + "learning_rate": 0.00012189229066765031, + "loss": 0.185, + "step": 8470 + }, + { + "epoch": 1.56, + "learning_rate": 0.00012180007377351532, + "loss": 0.2549, + "step": 8480 + }, + { + "epoch": 1.57, + "learning_rate": 0.00012170785687938031, + "loss": 0.2544, + "step": 8490 + }, + { + "epoch": 1.57, + "learning_rate": 0.0001216156399852453, + "loss": 0.212, + "step": 8500 + }, + { + "epoch": 1.57, + "learning_rate": 0.00012152342309111029, + "loss": 0.2575, + "step": 8510 + }, + { + "epoch": 1.57, + "learning_rate": 0.0001214312061969753, + "loss": 0.2937, + "step": 8520 + }, + { + "epoch": 1.57, + "learning_rate": 0.00012133898930284029, + "loss": 0.1906, + "step": 8530 + }, + { + "epoch": 1.58, + "learning_rate": 0.00012124677240870527, + "loss": 0.2207, + "step": 8540 + }, + { + "epoch": 1.58, + "learning_rate": 0.00012115455551457027, + "loss": 0.2573, + "step": 8550 + }, + { + "epoch": 1.58, + "learning_rate": 0.00012106233862043528, + "loss": 0.2415, + "step": 8560 + }, + { + "epoch": 1.58, + "learning_rate": 0.00012097012172630027, + "loss": 0.2171, + "step": 8570 + }, + { + "epoch": 1.58, + "learning_rate": 0.00012087790483216525, + "loss": 0.1182, + "step": 8580 + }, + { + "epoch": 1.58, + "learning_rate": 0.00012078568793803025, + "loss": 0.166, + "step": 8590 + }, + { + "epoch": 1.59, + "learning_rate": 0.00012069347104389526, + "loss": 0.2243, + "step": 8600 + }, + { + "epoch": 1.59, + "learning_rate": 0.00012060125414976024, + "loss": 0.2082, + "step": 8610 + }, + { + "epoch": 1.59, + "learning_rate": 0.00012050903725562523, + "loss": 0.2802, + "step": 8620 + }, + { + "epoch": 1.59, + "learning_rate": 0.00012041682036149023, + "loss": 0.2671, + "step": 8630 + }, + { + "epoch": 1.59, + "learning_rate": 0.00012032460346735524, + "loss": 0.1579, + "step": 8640 + }, + { + "epoch": 1.6, + "learning_rate": 0.00012023238657322022, + "loss": 0.2012, + "step": 8650 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001201401696790852, + "loss": 0.1878, + "step": 8660 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001200479527849502, + "loss": 0.2192, + "step": 8670 + }, + { + "epoch": 1.6, + "learning_rate": 0.00011995573589081522, + "loss": 0.2217, + "step": 8680 + }, + { + "epoch": 1.6, + "learning_rate": 0.0001198635189966802, + "loss": 0.1999, + "step": 8690 + }, + { + "epoch": 1.6, + "learning_rate": 0.00011977130210254518, + "loss": 0.2146, + "step": 8700 + }, + { + "epoch": 1.61, + "learning_rate": 0.00011967908520841018, + "loss": 0.1185, + "step": 8710 + }, + { + "epoch": 1.61, + "learning_rate": 0.0001195868683142752, + "loss": 0.1502, + "step": 8720 + }, + { + "epoch": 1.61, + "learning_rate": 0.00011949465142014018, + "loss": 0.2598, + "step": 8730 + }, + { + "epoch": 1.61, + "learning_rate": 0.00011940243452600516, + "loss": 0.2108, + "step": 8740 + }, + { + "epoch": 1.61, + "learning_rate": 0.00011931021763187016, + "loss": 0.1722, + "step": 8750 + }, + { + "epoch": 1.62, + "learning_rate": 0.00011921800073773517, + "loss": 0.1995, + "step": 8760 + }, + { + "epoch": 1.62, + "learning_rate": 0.00011912578384360016, + "loss": 0.2303, + "step": 8770 + }, + { + "epoch": 1.62, + "learning_rate": 0.00011903356694946514, + "loss": 0.1352, + "step": 8780 + }, + { + "epoch": 1.62, + "learning_rate": 0.00011894135005533014, + "loss": 0.2972, + "step": 8790 + }, + { + "epoch": 1.62, + "learning_rate": 0.00011884913316119515, + "loss": 0.2021, + "step": 8800 + }, + { + "epoch": 1.62, + "learning_rate": 0.00011875691626706014, + "loss": 0.2159, + "step": 8810 + }, + { + "epoch": 1.63, + "learning_rate": 0.00011866469937292512, + "loss": 0.2628, + "step": 8820 + }, + { + "epoch": 1.63, + "learning_rate": 0.00011857248247879012, + "loss": 0.2672, + "step": 8830 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001184802655846551, + "loss": 0.2086, + "step": 8840 + }, + { + "epoch": 1.63, + "learning_rate": 0.00011838804869052011, + "loss": 0.2027, + "step": 8850 + }, + { + "epoch": 1.63, + "learning_rate": 0.0001182958317963851, + "loss": 0.2272, + "step": 8860 + }, + { + "epoch": 1.64, + "learning_rate": 0.0001182036149022501, + "loss": 0.284, + "step": 8870 + }, + { + "epoch": 1.64, + "learning_rate": 0.00011811139800811508, + "loss": 0.2287, + "step": 8880 + }, + { + "epoch": 1.64, + "learning_rate": 0.00011801918111398009, + "loss": 0.2576, + "step": 8890 + }, + { + "epoch": 1.64, + "learning_rate": 0.00011792696421984508, + "loss": 0.1977, + "step": 8900 + }, + { + "epoch": 1.64, + "learning_rate": 0.00011783474732571007, + "loss": 0.2285, + "step": 8910 + }, + { + "epoch": 1.65, + "learning_rate": 0.00011774253043157506, + "loss": 0.2527, + "step": 8920 + }, + { + "epoch": 1.65, + "learning_rate": 0.00011765031353744007, + "loss": 0.1921, + "step": 8930 + }, + { + "epoch": 1.65, + "learning_rate": 0.00011755809664330505, + "loss": 0.1627, + "step": 8940 + }, + { + "epoch": 1.65, + "learning_rate": 0.00011746587974917005, + "loss": 0.1663, + "step": 8950 + }, + { + "epoch": 1.65, + "learning_rate": 0.00011737366285503504, + "loss": 0.2219, + "step": 8960 + }, + { + "epoch": 1.65, + "learning_rate": 0.00011728144596090005, + "loss": 0.2183, + "step": 8970 + }, + { + "epoch": 1.66, + "learning_rate": 0.00011718922906676503, + "loss": 0.1422, + "step": 8980 + }, + { + "epoch": 1.66, + "learning_rate": 0.00011709701217263003, + "loss": 0.3236, + "step": 8990 + }, + { + "epoch": 1.66, + "learning_rate": 0.00011700479527849502, + "loss": 0.326, + "step": 9000 + }, + { + "epoch": 1.66, + "eval_accuracy": 0.8958371371188607, + "eval_loss": 0.24060481786727905, + "eval_runtime": 99.1063, + "eval_samples_per_second": 110.528, + "eval_steps_per_second": 13.824, + "step": 9000 + }, + { + "epoch": 1.66, + "learning_rate": 0.00011691257838436003, + "loss": 0.2334, + "step": 9010 + }, + { + "epoch": 1.66, + "learning_rate": 0.00011682036149022501, + "loss": 0.2106, + "step": 9020 + }, + { + "epoch": 1.67, + "learning_rate": 0.00011672814459609001, + "loss": 0.1898, + "step": 9030 + }, + { + "epoch": 1.67, + "learning_rate": 0.000116635927701955, + "loss": 0.2647, + "step": 9040 + }, + { + "epoch": 1.67, + "learning_rate": 0.00011654371080782, + "loss": 0.2088, + "step": 9050 + }, + { + "epoch": 1.67, + "learning_rate": 0.00011645149391368499, + "loss": 0.2472, + "step": 9060 + }, + { + "epoch": 1.67, + "learning_rate": 0.00011635927701954999, + "loss": 0.1796, + "step": 9070 + }, + { + "epoch": 1.67, + "learning_rate": 0.00011626706012541497, + "loss": 0.1225, + "step": 9080 + }, + { + "epoch": 1.68, + "learning_rate": 0.00011617484323127998, + "loss": 0.3306, + "step": 9090 + }, + { + "epoch": 1.68, + "learning_rate": 0.00011608262633714497, + "loss": 0.2479, + "step": 9100 + }, + { + "epoch": 1.68, + "learning_rate": 0.00011599040944300997, + "loss": 0.2157, + "step": 9110 + }, + { + "epoch": 1.68, + "learning_rate": 0.00011589819254887495, + "loss": 0.1434, + "step": 9120 + }, + { + "epoch": 1.68, + "learning_rate": 0.00011580597565473996, + "loss": 0.1893, + "step": 9130 + }, + { + "epoch": 1.69, + "learning_rate": 0.00011571375876060495, + "loss": 0.2089, + "step": 9140 + }, + { + "epoch": 1.69, + "learning_rate": 0.00011562154186646994, + "loss": 0.1933, + "step": 9150 + }, + { + "epoch": 1.69, + "learning_rate": 0.00011552932497233493, + "loss": 0.2579, + "step": 9160 + }, + { + "epoch": 1.69, + "learning_rate": 0.00011543710807819994, + "loss": 0.2344, + "step": 9170 + }, + { + "epoch": 1.69, + "learning_rate": 0.00011534489118406492, + "loss": 0.1639, + "step": 9180 + }, + { + "epoch": 1.69, + "learning_rate": 0.00011525267428992992, + "loss": 0.2719, + "step": 9190 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001151604573957949, + "loss": 0.2147, + "step": 9200 + }, + { + "epoch": 1.7, + "learning_rate": 0.00011506824050165992, + "loss": 0.2561, + "step": 9210 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001149760236075249, + "loss": 0.2279, + "step": 9220 + }, + { + "epoch": 1.7, + "learning_rate": 0.0001148838067133899, + "loss": 0.2453, + "step": 9230 + }, + { + "epoch": 1.7, + "learning_rate": 0.00011479158981925489, + "loss": 0.2262, + "step": 9240 + }, + { + "epoch": 1.71, + "learning_rate": 0.0001146993729251199, + "loss": 0.2004, + "step": 9250 + }, + { + "epoch": 1.71, + "learning_rate": 0.00011460715603098488, + "loss": 0.1751, + "step": 9260 + }, + { + "epoch": 1.71, + "learning_rate": 0.00011451493913684988, + "loss": 0.26, + "step": 9270 + }, + { + "epoch": 1.71, + "learning_rate": 0.00011442272224271486, + "loss": 0.1794, + "step": 9280 + }, + { + "epoch": 1.71, + "learning_rate": 0.00011433050534857987, + "loss": 0.2668, + "step": 9290 + }, + { + "epoch": 1.72, + "learning_rate": 0.00011423828845444486, + "loss": 0.2006, + "step": 9300 + }, + { + "epoch": 1.72, + "learning_rate": 0.00011414607156030986, + "loss": 0.2378, + "step": 9310 + }, + { + "epoch": 1.72, + "learning_rate": 0.00011405385466617484, + "loss": 0.1551, + "step": 9320 + }, + { + "epoch": 1.72, + "learning_rate": 0.00011396163777203985, + "loss": 0.1739, + "step": 9330 + }, + { + "epoch": 1.72, + "learning_rate": 0.00011386942087790484, + "loss": 0.1707, + "step": 9340 + }, + { + "epoch": 1.72, + "learning_rate": 0.00011377720398376984, + "loss": 0.2313, + "step": 9350 + }, + { + "epoch": 1.73, + "learning_rate": 0.00011368498708963482, + "loss": 0.1719, + "step": 9360 + }, + { + "epoch": 1.73, + "learning_rate": 0.00011359277019549983, + "loss": 0.2656, + "step": 9370 + }, + { + "epoch": 1.73, + "learning_rate": 0.00011350055330136482, + "loss": 0.1628, + "step": 9380 + }, + { + "epoch": 1.73, + "learning_rate": 0.00011340833640722981, + "loss": 0.2269, + "step": 9390 + }, + { + "epoch": 1.73, + "learning_rate": 0.0001133161195130948, + "loss": 0.2695, + "step": 9400 + }, + { + "epoch": 1.74, + "learning_rate": 0.00011322390261895981, + "loss": 0.2848, + "step": 9410 + }, + { + "epoch": 1.74, + "learning_rate": 0.0001131316857248248, + "loss": 0.2628, + "step": 9420 + }, + { + "epoch": 1.74, + "learning_rate": 0.00011303946883068979, + "loss": 0.2679, + "step": 9430 + }, + { + "epoch": 1.74, + "learning_rate": 0.00011294725193655478, + "loss": 0.151, + "step": 9440 + }, + { + "epoch": 1.74, + "learning_rate": 0.00011285503504241979, + "loss": 0.2359, + "step": 9450 + }, + { + "epoch": 1.74, + "learning_rate": 0.00011276281814828477, + "loss": 0.1497, + "step": 9460 + }, + { + "epoch": 1.75, + "learning_rate": 0.00011267060125414977, + "loss": 0.2328, + "step": 9470 + }, + { + "epoch": 1.75, + "learning_rate": 0.00011257838436001475, + "loss": 0.1297, + "step": 9480 + }, + { + "epoch": 1.75, + "learning_rate": 0.00011248616746587974, + "loss": 0.2342, + "step": 9490 + }, + { + "epoch": 1.75, + "learning_rate": 0.00011239395057174475, + "loss": 0.2353, + "step": 9500 + }, + { + "epoch": 1.75, + "learning_rate": 0.00011230173367760975, + "loss": 0.2787, + "step": 9510 + }, + { + "epoch": 1.76, + "learning_rate": 0.00011220951678347473, + "loss": 0.2113, + "step": 9520 + }, + { + "epoch": 1.76, + "learning_rate": 0.00011211729988933972, + "loss": 0.2745, + "step": 9530 + }, + { + "epoch": 1.76, + "learning_rate": 0.00011202508299520473, + "loss": 0.1921, + "step": 9540 + }, + { + "epoch": 1.76, + "learning_rate": 0.00011193286610106973, + "loss": 0.179, + "step": 9550 + }, + { + "epoch": 1.76, + "learning_rate": 0.00011184064920693471, + "loss": 0.1999, + "step": 9560 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001117484323127997, + "loss": 0.1753, + "step": 9570 + }, + { + "epoch": 1.77, + "learning_rate": 0.00011165621541866471, + "loss": 0.2441, + "step": 9580 + }, + { + "epoch": 1.77, + "learning_rate": 0.0001115639985245297, + "loss": 0.2039, + "step": 9590 + }, + { + "epoch": 1.77, + "learning_rate": 0.00011147178163039469, + "loss": 0.179, + "step": 9600 + }, + { + "epoch": 1.77, + "learning_rate": 0.00011137956473625967, + "loss": 0.2161, + "step": 9610 + }, + { + "epoch": 1.77, + "learning_rate": 0.00011128734784212469, + "loss": 0.1772, + "step": 9620 + }, + { + "epoch": 1.78, + "learning_rate": 0.00011119513094798968, + "loss": 0.2605, + "step": 9630 + }, + { + "epoch": 1.78, + "learning_rate": 0.00011110291405385467, + "loss": 0.2228, + "step": 9640 + }, + { + "epoch": 1.78, + "learning_rate": 0.00011101069715971965, + "loss": 0.223, + "step": 9650 + }, + { + "epoch": 1.78, + "learning_rate": 0.00011091848026558466, + "loss": 0.2553, + "step": 9660 + }, + { + "epoch": 1.78, + "learning_rate": 0.00011082626337144966, + "loss": 0.2314, + "step": 9670 + }, + { + "epoch": 1.79, + "learning_rate": 0.00011073404647731465, + "loss": 0.2686, + "step": 9680 + }, + { + "epoch": 1.79, + "learning_rate": 0.00011064182958317963, + "loss": 0.2472, + "step": 9690 + }, + { + "epoch": 1.79, + "learning_rate": 0.00011054961268904464, + "loss": 0.2428, + "step": 9700 + }, + { + "epoch": 1.79, + "learning_rate": 0.00011045739579490964, + "loss": 0.136, + "step": 9710 + }, + { + "epoch": 1.79, + "learning_rate": 0.00011036517890077462, + "loss": 0.1727, + "step": 9720 + }, + { + "epoch": 1.79, + "learning_rate": 0.00011027296200663961, + "loss": 0.1694, + "step": 9730 + }, + { + "epoch": 1.8, + "learning_rate": 0.00011018074511250462, + "loss": 0.1562, + "step": 9740 + }, + { + "epoch": 1.8, + "learning_rate": 0.00011008852821836962, + "loss": 0.2279, + "step": 9750 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001099963113242346, + "loss": 0.1468, + "step": 9760 + }, + { + "epoch": 1.8, + "learning_rate": 0.00010990409443009959, + "loss": 0.2378, + "step": 9770 + }, + { + "epoch": 1.8, + "learning_rate": 0.0001098118775359646, + "loss": 0.1784, + "step": 9780 + }, + { + "epoch": 1.81, + "learning_rate": 0.0001097196606418296, + "loss": 0.2022, + "step": 9790 + }, + { + "epoch": 1.81, + "learning_rate": 0.00010962744374769458, + "loss": 0.159, + "step": 9800 + }, + { + "epoch": 1.81, + "learning_rate": 0.00010953522685355957, + "loss": 0.2206, + "step": 9810 + }, + { + "epoch": 1.81, + "learning_rate": 0.00010944300995942458, + "loss": 0.1881, + "step": 9820 + }, + { + "epoch": 1.81, + "learning_rate": 0.00010935079306528957, + "loss": 0.1979, + "step": 9830 + }, + { + "epoch": 1.81, + "learning_rate": 0.00010925857617115456, + "loss": 0.2833, + "step": 9840 + }, + { + "epoch": 1.82, + "learning_rate": 0.00010916635927701954, + "loss": 0.2631, + "step": 9850 + }, + { + "epoch": 1.82, + "learning_rate": 0.00010907414238288456, + "loss": 0.1653, + "step": 9860 + }, + { + "epoch": 1.82, + "learning_rate": 0.00010898192548874955, + "loss": 0.1997, + "step": 9870 + }, + { + "epoch": 1.82, + "learning_rate": 0.00010888970859461454, + "loss": 0.121, + "step": 9880 + }, + { + "epoch": 1.82, + "learning_rate": 0.00010879749170047952, + "loss": 0.2092, + "step": 9890 + }, + { + "epoch": 1.83, + "learning_rate": 0.00010870527480634453, + "loss": 0.1775, + "step": 9900 + }, + { + "epoch": 1.83, + "learning_rate": 0.00010861305791220953, + "loss": 0.2089, + "step": 9910 + }, + { + "epoch": 1.83, + "learning_rate": 0.00010852084101807452, + "loss": 0.2334, + "step": 9920 + }, + { + "epoch": 1.83, + "learning_rate": 0.0001084286241239395, + "loss": 0.1916, + "step": 9930 + }, + { + "epoch": 1.83, + "learning_rate": 0.00010833640722980451, + "loss": 0.2291, + "step": 9940 + }, + { + "epoch": 1.84, + "learning_rate": 0.00010824419033566951, + "loss": 0.2331, + "step": 9950 + }, + { + "epoch": 1.84, + "learning_rate": 0.0001081519734415345, + "loss": 0.1836, + "step": 9960 + }, + { + "epoch": 1.84, + "learning_rate": 0.00010805975654739948, + "loss": 0.2869, + "step": 9970 + }, + { + "epoch": 1.84, + "learning_rate": 0.00010796753965326449, + "loss": 0.1925, + "step": 9980 + }, + { + "epoch": 1.84, + "learning_rate": 0.00010787532275912949, + "loss": 0.1442, + "step": 9990 + }, + { + "epoch": 1.84, + "learning_rate": 0.00010778310586499447, + "loss": 0.215, + "step": 10000 + }, + { + "epoch": 1.84, + "eval_accuracy": 0.9006755523096586, + "eval_loss": 0.23809599876403809, + "eval_runtime": 99.4319, + "eval_samples_per_second": 110.166, + "eval_steps_per_second": 13.778, + "step": 10000 + }, + { + "epoch": 1.85, + "learning_rate": 0.00010769088897085946, + "loss": 0.1787, + "step": 10010 + }, + { + "epoch": 1.85, + "learning_rate": 0.00010759867207672447, + "loss": 0.2511, + "step": 10020 + }, + { + "epoch": 1.85, + "learning_rate": 0.00010750645518258947, + "loss": 0.2396, + "step": 10030 + }, + { + "epoch": 1.85, + "learning_rate": 0.00010741423828845445, + "loss": 0.1376, + "step": 10040 + }, + { + "epoch": 1.85, + "learning_rate": 0.00010732202139431944, + "loss": 0.2806, + "step": 10050 + }, + { + "epoch": 1.86, + "learning_rate": 0.00010722980450018445, + "loss": 0.238, + "step": 10060 + }, + { + "epoch": 1.86, + "learning_rate": 0.00010713758760604944, + "loss": 0.2295, + "step": 10070 + }, + { + "epoch": 1.86, + "learning_rate": 0.00010704537071191443, + "loss": 0.2549, + "step": 10080 + }, + { + "epoch": 1.86, + "learning_rate": 0.00010695315381777941, + "loss": 0.1697, + "step": 10090 + }, + { + "epoch": 1.86, + "learning_rate": 0.00010686093692364441, + "loss": 0.2492, + "step": 10100 + }, + { + "epoch": 1.86, + "learning_rate": 0.00010676872002950942, + "loss": 0.2332, + "step": 10110 + }, + { + "epoch": 1.87, + "learning_rate": 0.00010667650313537441, + "loss": 0.2462, + "step": 10120 + }, + { + "epoch": 1.87, + "learning_rate": 0.00010658428624123939, + "loss": 0.1817, + "step": 10130 + }, + { + "epoch": 1.87, + "learning_rate": 0.00010649206934710439, + "loss": 0.1883, + "step": 10140 + }, + { + "epoch": 1.87, + "learning_rate": 0.0001063998524529694, + "loss": 0.1145, + "step": 10150 + }, + { + "epoch": 1.87, + "learning_rate": 0.00010630763555883439, + "loss": 0.1207, + "step": 10160 + }, + { + "epoch": 1.88, + "learning_rate": 0.00010621541866469937, + "loss": 0.1852, + "step": 10170 + }, + { + "epoch": 1.88, + "learning_rate": 0.00010612320177056437, + "loss": 0.2175, + "step": 10180 + }, + { + "epoch": 1.88, + "learning_rate": 0.00010603098487642938, + "loss": 0.1844, + "step": 10190 + }, + { + "epoch": 1.88, + "learning_rate": 0.00010593876798229436, + "loss": 0.2802, + "step": 10200 + }, + { + "epoch": 1.88, + "learning_rate": 0.00010584655108815935, + "loss": 0.2221, + "step": 10210 + }, + { + "epoch": 1.88, + "learning_rate": 0.00010575433419402435, + "loss": 0.2463, + "step": 10220 + }, + { + "epoch": 1.89, + "learning_rate": 0.00010566211729988936, + "loss": 0.1879, + "step": 10230 + }, + { + "epoch": 1.89, + "learning_rate": 0.00010556990040575434, + "loss": 0.1723, + "step": 10240 + }, + { + "epoch": 1.89, + "learning_rate": 0.00010547768351161933, + "loss": 0.1791, + "step": 10250 + }, + { + "epoch": 1.89, + "learning_rate": 0.00010538546661748432, + "loss": 0.2838, + "step": 10260 + }, + { + "epoch": 1.89, + "learning_rate": 0.00010529324972334934, + "loss": 0.2493, + "step": 10270 + }, + { + "epoch": 1.9, + "learning_rate": 0.00010520103282921432, + "loss": 0.1853, + "step": 10280 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001051088159350793, + "loss": 0.2794, + "step": 10290 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001050165990409443, + "loss": 0.2217, + "step": 10300 + }, + { + "epoch": 1.9, + "learning_rate": 0.00010492438214680931, + "loss": 0.1742, + "step": 10310 + }, + { + "epoch": 1.9, + "learning_rate": 0.0001048321652526743, + "loss": 0.2308, + "step": 10320 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010473994835853928, + "loss": 0.1497, + "step": 10330 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010464773146440428, + "loss": 0.2586, + "step": 10340 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010455551457026929, + "loss": 0.2479, + "step": 10350 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010446329767613428, + "loss": 0.1498, + "step": 10360 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010437108078199926, + "loss": 0.1545, + "step": 10370 + }, + { + "epoch": 1.91, + "learning_rate": 0.00010427886388786426, + "loss": 0.1987, + "step": 10380 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010418664699372927, + "loss": 0.1942, + "step": 10390 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010409443009959426, + "loss": 0.1845, + "step": 10400 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010400221320545924, + "loss": 0.1511, + "step": 10410 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010390999631132424, + "loss": 0.198, + "step": 10420 + }, + { + "epoch": 1.92, + "learning_rate": 0.00010381777941718925, + "loss": 0.2414, + "step": 10430 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010372556252305423, + "loss": 0.169, + "step": 10440 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010363334562891922, + "loss": 0.2181, + "step": 10450 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010354112873478422, + "loss": 0.1829, + "step": 10460 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010344891184064923, + "loss": 0.1543, + "step": 10470 + }, + { + "epoch": 1.93, + "learning_rate": 0.00010335669494651421, + "loss": 0.2585, + "step": 10480 + }, + { + "epoch": 1.93, + "learning_rate": 0.0001032644780523792, + "loss": 0.1768, + "step": 10490 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001031722611582442, + "loss": 0.2628, + "step": 10500 + }, + { + "epoch": 1.94, + "learning_rate": 0.0001030800442641092, + "loss": 0.2547, + "step": 10510 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010298782736997419, + "loss": 0.2313, + "step": 10520 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010289561047583917, + "loss": 0.1581, + "step": 10530 + }, + { + "epoch": 1.94, + "learning_rate": 0.00010280339358170417, + "loss": 0.1884, + "step": 10540 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010271117668756917, + "loss": 0.2104, + "step": 10550 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010261895979343417, + "loss": 0.1989, + "step": 10560 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010252674289929915, + "loss": 0.213, + "step": 10570 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010243452600516415, + "loss": 0.1792, + "step": 10580 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010234230911102915, + "loss": 0.1808, + "step": 10590 + }, + { + "epoch": 1.95, + "learning_rate": 0.00010225009221689415, + "loss": 0.1969, + "step": 10600 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010215787532275913, + "loss": 0.2469, + "step": 10610 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010206565842862413, + "loss": 0.1471, + "step": 10620 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010197344153448913, + "loss": 0.2731, + "step": 10630 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010188122464035413, + "loss": 0.1954, + "step": 10640 + }, + { + "epoch": 1.96, + "learning_rate": 0.00010178900774621911, + "loss": 0.1555, + "step": 10650 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010169679085208411, + "loss": 0.1597, + "step": 10660 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001016045739579491, + "loss": 0.1966, + "step": 10670 + }, + { + "epoch": 1.97, + "learning_rate": 0.0001015123570638141, + "loss": 0.183, + "step": 10680 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010142014016967909, + "loss": 0.2784, + "step": 10690 + }, + { + "epoch": 1.97, + "learning_rate": 0.00010132792327554407, + "loss": 0.3295, + "step": 10700 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010123570638140908, + "loss": 0.2518, + "step": 10710 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010114348948727408, + "loss": 0.2137, + "step": 10720 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010105127259313907, + "loss": 0.1548, + "step": 10730 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010095905569900405, + "loss": 0.3132, + "step": 10740 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010086683880486905, + "loss": 0.2445, + "step": 10750 + }, + { + "epoch": 1.98, + "learning_rate": 0.00010077462191073406, + "loss": 0.2612, + "step": 10760 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010068240501659904, + "loss": 0.2073, + "step": 10770 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010059018812246403, + "loss": 0.2407, + "step": 10780 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010049797122832903, + "loss": 0.139, + "step": 10790 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010040575433419404, + "loss": 0.2008, + "step": 10800 + }, + { + "epoch": 1.99, + "learning_rate": 0.00010031353744005902, + "loss": 0.3201, + "step": 10810 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010022132054592401, + "loss": 0.2554, + "step": 10820 + }, + { + "epoch": 2.0, + "learning_rate": 0.000100129103651789, + "loss": 0.2992, + "step": 10830 + }, + { + "epoch": 2.0, + "learning_rate": 0.00010003688675765402, + "loss": 0.2177, + "step": 10840 + }, + { + "epoch": 2.0, + "learning_rate": 9.9944669863519e-05, + "loss": 0.1616, + "step": 10850 + }, + { + "epoch": 2.0, + "learning_rate": 9.985245296938399e-05, + "loss": 0.1123, + "step": 10860 + }, + { + "epoch": 2.0, + "learning_rate": 9.9760236075249e-05, + "loss": 0.1704, + "step": 10870 + }, + { + "epoch": 2.01, + "learning_rate": 9.966801918111398e-05, + "loss": 0.1256, + "step": 10880 + }, + { + "epoch": 2.01, + "learning_rate": 9.957580228697898e-05, + "loss": 0.1458, + "step": 10890 + }, + { + "epoch": 2.01, + "learning_rate": 9.948358539284396e-05, + "loss": 0.2031, + "step": 10900 + }, + { + "epoch": 2.01, + "learning_rate": 9.939136849870898e-05, + "loss": 0.204, + "step": 10910 + }, + { + "epoch": 2.01, + "learning_rate": 9.929915160457396e-05, + "loss": 0.2048, + "step": 10920 + }, + { + "epoch": 2.02, + "learning_rate": 9.920693471043896e-05, + "loss": 0.1439, + "step": 10930 + }, + { + "epoch": 2.02, + "learning_rate": 9.911471781630394e-05, + "loss": 0.1381, + "step": 10940 + }, + { + "epoch": 2.02, + "learning_rate": 9.902250092216895e-05, + "loss": 0.0731, + "step": 10950 + }, + { + "epoch": 2.02, + "learning_rate": 9.893028402803394e-05, + "loss": 0.2045, + "step": 10960 + }, + { + "epoch": 2.02, + "learning_rate": 9.883806713389894e-05, + "loss": 0.1455, + "step": 10970 + }, + { + "epoch": 2.03, + "learning_rate": 9.874585023976392e-05, + "loss": 0.1463, + "step": 10980 + }, + { + "epoch": 2.03, + "learning_rate": 9.865363334562893e-05, + "loss": 0.12, + "step": 10990 + }, + { + "epoch": 2.03, + "learning_rate": 9.856141645149392e-05, + "loss": 0.2035, + "step": 11000 + }, + { + "epoch": 2.03, + "eval_accuracy": 0.8985758626985576, + "eval_loss": 0.24368144571781158, + "eval_runtime": 99.1817, + "eval_samples_per_second": 110.444, + "eval_steps_per_second": 13.813, + "step": 11000 + }, + { + "epoch": 2.03, + "step": 11000, + "total_flos": 3.7370912368449946e+18, + "train_loss": 0.29799577762321994, + "train_runtime": 2883.713, + "train_samples_per_second": 120.323, + "train_steps_per_second": 7.521 + } + ], + "logging_steps": 10, + "max_steps": 21688, + "num_train_epochs": 4, + "save_steps": 1000, + "total_flos": 3.7370912368449946e+18, + "trial_name": null, + "trial_params": null +}