{ "best_metric": 0.5081329345703125, "best_model_checkpoint": "finetuned-cards-blackjack/checkpoint-2800", "epoch": 7.0, "eval_steps": 100, "global_step": 2891, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 3.301379680633545, "learning_rate": 0.00019930819785541338, "loss": 1.3967, "step": 10 }, { "epoch": 0.05, "grad_norm": 3.8909592628479004, "learning_rate": 0.00019861639571082672, "loss": 1.3469, "step": 20 }, { "epoch": 0.07, "grad_norm": 3.4435207843780518, "learning_rate": 0.00019792459356624006, "loss": 1.321, "step": 30 }, { "epoch": 0.1, "grad_norm": 4.026681900024414, "learning_rate": 0.0001972327914216534, "loss": 1.0812, "step": 40 }, { "epoch": 0.12, "grad_norm": 5.660863399505615, "learning_rate": 0.00019654098927706677, "loss": 1.2255, "step": 50 }, { "epoch": 0.15, "grad_norm": 4.201864719390869, "learning_rate": 0.00019584918713248014, "loss": 1.2845, "step": 60 }, { "epoch": 0.17, "grad_norm": 3.0525405406951904, "learning_rate": 0.00019515738498789345, "loss": 1.3223, "step": 70 }, { "epoch": 0.19, "grad_norm": 4.4655351638793945, "learning_rate": 0.00019446558284330682, "loss": 1.444, "step": 80 }, { "epoch": 0.22, "grad_norm": 5.199573993682861, "learning_rate": 0.00019377378069872016, "loss": 1.4201, "step": 90 }, { "epoch": 0.24, "grad_norm": 4.03210973739624, "learning_rate": 0.00019308197855413353, "loss": 1.3563, "step": 100 }, { "epoch": 0.24, "eval_accuracy": 0.6749571183533448, "eval_loss": 1.1494646072387695, "eval_runtime": 6.2584, "eval_samples_per_second": 186.311, "eval_steps_per_second": 23.329, "step": 100 }, { "epoch": 0.27, "grad_norm": 4.278244495391846, "learning_rate": 0.00019239017640954688, "loss": 1.3488, "step": 110 }, { "epoch": 0.29, "grad_norm": 3.920788049697876, "learning_rate": 0.00019169837426496022, "loss": 1.451, "step": 120 }, { "epoch": 0.31, "grad_norm": 3.261601686477661, "learning_rate": 0.00019100657212037359, "loss": 1.2141, "step": 130 }, { "epoch": 0.34, "grad_norm": 5.404760837554932, "learning_rate": 0.00019031476997578695, "loss": 1.5746, "step": 140 }, { "epoch": 0.36, "grad_norm": 2.8406295776367188, "learning_rate": 0.0001896229678312003, "loss": 1.2777, "step": 150 }, { "epoch": 0.39, "grad_norm": 3.7745213508605957, "learning_rate": 0.00018893116568661364, "loss": 1.2075, "step": 160 }, { "epoch": 0.41, "grad_norm": 3.8692853450775146, "learning_rate": 0.00018823936354202698, "loss": 1.2082, "step": 170 }, { "epoch": 0.44, "grad_norm": 3.2764315605163574, "learning_rate": 0.00018754756139744035, "loss": 1.1009, "step": 180 }, { "epoch": 0.46, "grad_norm": 2.6145033836364746, "learning_rate": 0.0001868557592528537, "loss": 1.2236, "step": 190 }, { "epoch": 0.48, "grad_norm": 4.715363502502441, "learning_rate": 0.00018616395710826703, "loss": 1.3393, "step": 200 }, { "epoch": 0.48, "eval_accuracy": 0.7204116638078902, "eval_loss": 1.0388233661651611, "eval_runtime": 6.0054, "eval_samples_per_second": 194.159, "eval_steps_per_second": 24.311, "step": 200 }, { "epoch": 0.51, "grad_norm": 3.9448986053466797, "learning_rate": 0.0001854721549636804, "loss": 1.1597, "step": 210 }, { "epoch": 0.53, "grad_norm": 3.357956886291504, "learning_rate": 0.00018478035281909374, "loss": 1.1734, "step": 220 }, { "epoch": 0.56, "grad_norm": 5.5605244636535645, "learning_rate": 0.0001840885506745071, "loss": 1.165, "step": 230 }, { "epoch": 0.58, "grad_norm": 4.48176908493042, "learning_rate": 0.00018339674852992045, "loss": 1.4025, "step": 240 }, { "epoch": 0.61, "grad_norm": 3.6814768314361572, "learning_rate": 0.0001827049463853338, "loss": 1.198, "step": 250 }, { "epoch": 0.63, "grad_norm": 4.112949848175049, "learning_rate": 0.00018201314424074716, "loss": 1.1062, "step": 260 }, { "epoch": 0.65, "grad_norm": 5.755402565002441, "learning_rate": 0.0001813213420961605, "loss": 1.144, "step": 270 }, { "epoch": 0.68, "grad_norm": 3.6004717350006104, "learning_rate": 0.00018062953995157384, "loss": 1.2527, "step": 280 }, { "epoch": 0.7, "grad_norm": 2.4746742248535156, "learning_rate": 0.0001799377378069872, "loss": 1.1316, "step": 290 }, { "epoch": 0.73, "grad_norm": 2.231992483139038, "learning_rate": 0.00017924593566240055, "loss": 1.2033, "step": 300 }, { "epoch": 0.73, "eval_accuracy": 0.7547169811320755, "eval_loss": 0.9323562979698181, "eval_runtime": 7.9011, "eval_samples_per_second": 147.574, "eval_steps_per_second": 18.478, "step": 300 }, { "epoch": 0.75, "grad_norm": 4.073417663574219, "learning_rate": 0.00017855413351781392, "loss": 1.3702, "step": 310 }, { "epoch": 0.77, "grad_norm": 5.040902137756348, "learning_rate": 0.00017786233137322726, "loss": 1.1423, "step": 320 }, { "epoch": 0.8, "grad_norm": 3.7068464756011963, "learning_rate": 0.0001771705292286406, "loss": 1.0609, "step": 330 }, { "epoch": 0.82, "grad_norm": 4.864231586456299, "learning_rate": 0.00017647872708405397, "loss": 1.201, "step": 340 }, { "epoch": 0.85, "grad_norm": 6.007138252258301, "learning_rate": 0.00017578692493946732, "loss": 1.0687, "step": 350 }, { "epoch": 0.87, "grad_norm": 4.0837225914001465, "learning_rate": 0.00017509512279488069, "loss": 1.1311, "step": 360 }, { "epoch": 0.9, "grad_norm": 4.566812992095947, "learning_rate": 0.00017440332065029403, "loss": 1.3071, "step": 370 }, { "epoch": 0.92, "grad_norm": 3.3199901580810547, "learning_rate": 0.00017371151850570737, "loss": 1.0246, "step": 380 }, { "epoch": 0.94, "grad_norm": 2.4883534908294678, "learning_rate": 0.00017301971636112074, "loss": 1.0215, "step": 390 }, { "epoch": 0.97, "grad_norm": 5.232284069061279, "learning_rate": 0.00017232791421653408, "loss": 0.9672, "step": 400 }, { "epoch": 0.97, "eval_accuracy": 0.7658662092624356, "eval_loss": 0.8557726144790649, "eval_runtime": 6.2462, "eval_samples_per_second": 186.674, "eval_steps_per_second": 23.374, "step": 400 }, { "epoch": 0.99, "grad_norm": 3.8225362300872803, "learning_rate": 0.00017163611207194742, "loss": 1.0908, "step": 410 }, { "epoch": 1.02, "grad_norm": 4.098091125488281, "learning_rate": 0.0001709443099273608, "loss": 0.8621, "step": 420 }, { "epoch": 1.04, "grad_norm": 4.027368068695068, "learning_rate": 0.00017025250778277413, "loss": 0.9868, "step": 430 }, { "epoch": 1.07, "grad_norm": 4.375247478485107, "learning_rate": 0.0001695607056381875, "loss": 1.0179, "step": 440 }, { "epoch": 1.09, "grad_norm": 8.204839706420898, "learning_rate": 0.00016886890349360084, "loss": 0.902, "step": 450 }, { "epoch": 1.11, "grad_norm": 3.1056785583496094, "learning_rate": 0.00016817710134901418, "loss": 0.9873, "step": 460 }, { "epoch": 1.14, "grad_norm": 6.811554908752441, "learning_rate": 0.00016748529920442755, "loss": 0.9035, "step": 470 }, { "epoch": 1.16, "grad_norm": 4.715181350708008, "learning_rate": 0.0001667934970598409, "loss": 1.0024, "step": 480 }, { "epoch": 1.19, "grad_norm": 5.355204105377197, "learning_rate": 0.00016610169491525423, "loss": 1.0384, "step": 490 }, { "epoch": 1.21, "grad_norm": 8.26843547821045, "learning_rate": 0.0001654098927706676, "loss": 0.8674, "step": 500 }, { "epoch": 1.21, "eval_accuracy": 0.7615780445969125, "eval_loss": 0.8456417322158813, "eval_runtime": 5.9836, "eval_samples_per_second": 194.865, "eval_steps_per_second": 24.4, "step": 500 }, { "epoch": 1.23, "grad_norm": 3.612718343734741, "learning_rate": 0.00016471809062608094, "loss": 1.035, "step": 510 }, { "epoch": 1.26, "grad_norm": 3.2531259059906006, "learning_rate": 0.0001640262884814943, "loss": 0.9591, "step": 520 }, { "epoch": 1.28, "grad_norm": 2.1132917404174805, "learning_rate": 0.00016333448633690765, "loss": 0.7013, "step": 530 }, { "epoch": 1.31, "grad_norm": 5.840766906738281, "learning_rate": 0.000162642684192321, "loss": 1.1066, "step": 540 }, { "epoch": 1.33, "grad_norm": 2.8128092288970947, "learning_rate": 0.00016195088204773436, "loss": 0.8851, "step": 550 }, { "epoch": 1.36, "grad_norm": 5.935888290405273, "learning_rate": 0.0001612590799031477, "loss": 0.9738, "step": 560 }, { "epoch": 1.38, "grad_norm": 4.2558488845825195, "learning_rate": 0.00016056727775856107, "loss": 1.1094, "step": 570 }, { "epoch": 1.4, "grad_norm": 3.7361583709716797, "learning_rate": 0.0001598754756139744, "loss": 1.0376, "step": 580 }, { "epoch": 1.43, "grad_norm": 3.6672043800354004, "learning_rate": 0.00015918367346938776, "loss": 0.9765, "step": 590 }, { "epoch": 1.45, "grad_norm": 2.8976941108703613, "learning_rate": 0.00015849187132480113, "loss": 0.8277, "step": 600 }, { "epoch": 1.45, "eval_accuracy": 0.7958833619210978, "eval_loss": 0.7562589049339294, "eval_runtime": 6.7504, "eval_samples_per_second": 172.731, "eval_steps_per_second": 21.628, "step": 600 }, { "epoch": 1.48, "grad_norm": 4.665554046630859, "learning_rate": 0.00015780006918021447, "loss": 0.8139, "step": 610 }, { "epoch": 1.5, "grad_norm": 4.166018486022949, "learning_rate": 0.0001571082670356278, "loss": 1.1314, "step": 620 }, { "epoch": 1.53, "grad_norm": 3.610258102416992, "learning_rate": 0.00015641646489104115, "loss": 0.9497, "step": 630 }, { "epoch": 1.55, "grad_norm": 4.610332489013672, "learning_rate": 0.00015572466274645452, "loss": 1.0767, "step": 640 }, { "epoch": 1.57, "grad_norm": 3.796252965927124, "learning_rate": 0.0001550328606018679, "loss": 0.8486, "step": 650 }, { "epoch": 1.6, "grad_norm": 3.9809694290161133, "learning_rate": 0.00015434105845728123, "loss": 0.9211, "step": 660 }, { "epoch": 1.62, "grad_norm": 2.5232605934143066, "learning_rate": 0.00015364925631269457, "loss": 0.8843, "step": 670 }, { "epoch": 1.65, "grad_norm": 4.975670337677002, "learning_rate": 0.00015295745416810794, "loss": 0.9494, "step": 680 }, { "epoch": 1.67, "grad_norm": 5.420626163482666, "learning_rate": 0.00015226565202352128, "loss": 0.9786, "step": 690 }, { "epoch": 1.69, "grad_norm": 3.364365339279175, "learning_rate": 0.00015157384987893465, "loss": 0.8703, "step": 700 }, { "epoch": 1.69, "eval_accuracy": 0.7538593481989708, "eval_loss": 0.8465284109115601, "eval_runtime": 6.2814, "eval_samples_per_second": 185.628, "eval_steps_per_second": 23.243, "step": 700 }, { "epoch": 1.72, "grad_norm": 3.51340913772583, "learning_rate": 0.00015088204773434796, "loss": 0.9032, "step": 710 }, { "epoch": 1.74, "grad_norm": 3.7203245162963867, "learning_rate": 0.00015019024558976133, "loss": 0.7729, "step": 720 }, { "epoch": 1.77, "grad_norm": 3.596214771270752, "learning_rate": 0.0001494984434451747, "loss": 0.8151, "step": 730 }, { "epoch": 1.79, "grad_norm": 3.2724595069885254, "learning_rate": 0.00014880664130058804, "loss": 0.8064, "step": 740 }, { "epoch": 1.82, "grad_norm": 3.5748846530914307, "learning_rate": 0.00014811483915600139, "loss": 0.8419, "step": 750 }, { "epoch": 1.84, "grad_norm": 5.998478412628174, "learning_rate": 0.00014742303701141473, "loss": 0.8494, "step": 760 }, { "epoch": 1.86, "grad_norm": 3.545043706893921, "learning_rate": 0.0001467312348668281, "loss": 0.7695, "step": 770 }, { "epoch": 1.89, "grad_norm": 3.9944069385528564, "learning_rate": 0.00014603943272224146, "loss": 0.9405, "step": 780 }, { "epoch": 1.91, "grad_norm": 5.435621738433838, "learning_rate": 0.00014534763057765478, "loss": 0.8863, "step": 790 }, { "epoch": 1.94, "grad_norm": 7.365724086761475, "learning_rate": 0.00014465582843306815, "loss": 0.893, "step": 800 }, { "epoch": 1.94, "eval_accuracy": 0.8001715265866209, "eval_loss": 0.688121497631073, "eval_runtime": 6.0254, "eval_samples_per_second": 193.514, "eval_steps_per_second": 24.231, "step": 800 }, { "epoch": 1.96, "grad_norm": 6.192987442016602, "learning_rate": 0.0001439640262884815, "loss": 0.8819, "step": 810 }, { "epoch": 1.99, "grad_norm": 3.021066188812256, "learning_rate": 0.00014327222414389486, "loss": 0.6413, "step": 820 }, { "epoch": 2.01, "grad_norm": 4.522083759307861, "learning_rate": 0.0001425804219993082, "loss": 0.9185, "step": 830 }, { "epoch": 2.03, "grad_norm": 3.089639186859131, "learning_rate": 0.00014188861985472154, "loss": 0.7384, "step": 840 }, { "epoch": 2.06, "grad_norm": 4.491950988769531, "learning_rate": 0.0001411968177101349, "loss": 0.765, "step": 850 }, { "epoch": 2.08, "grad_norm": 3.618821144104004, "learning_rate": 0.00014050501556554828, "loss": 0.8237, "step": 860 }, { "epoch": 2.11, "grad_norm": 4.773688793182373, "learning_rate": 0.00013981321342096162, "loss": 0.7171, "step": 870 }, { "epoch": 2.13, "grad_norm": 1.607408881187439, "learning_rate": 0.00013912141127637496, "loss": 0.6438, "step": 880 }, { "epoch": 2.15, "grad_norm": 4.511462211608887, "learning_rate": 0.0001384296091317883, "loss": 0.7983, "step": 890 }, { "epoch": 2.18, "grad_norm": 4.259463787078857, "learning_rate": 0.00013773780698720167, "loss": 0.9454, "step": 900 }, { "epoch": 2.18, "eval_accuracy": 0.8027444253859348, "eval_loss": 0.7210972905158997, "eval_runtime": 5.9658, "eval_samples_per_second": 195.449, "eval_steps_per_second": 24.473, "step": 900 }, { "epoch": 2.2, "grad_norm": 3.810264825820923, "learning_rate": 0.00013704600484261504, "loss": 0.7729, "step": 910 }, { "epoch": 2.23, "grad_norm": 5.475677967071533, "learning_rate": 0.00013635420269802835, "loss": 0.7531, "step": 920 }, { "epoch": 2.25, "grad_norm": 2.9276745319366455, "learning_rate": 0.00013566240055344172, "loss": 0.725, "step": 930 }, { "epoch": 2.28, "grad_norm": 4.840962886810303, "learning_rate": 0.00013497059840885506, "loss": 0.6938, "step": 940 }, { "epoch": 2.3, "grad_norm": 5.3595194816589355, "learning_rate": 0.00013427879626426843, "loss": 0.6863, "step": 950 }, { "epoch": 2.32, "grad_norm": 7.755936145782471, "learning_rate": 0.00013358699411968177, "loss": 0.7146, "step": 960 }, { "epoch": 2.35, "grad_norm": 3.4426372051239014, "learning_rate": 0.00013289519197509512, "loss": 0.7144, "step": 970 }, { "epoch": 2.37, "grad_norm": 4.554823398590088, "learning_rate": 0.00013220338983050849, "loss": 0.6512, "step": 980 }, { "epoch": 2.4, "grad_norm": 2.8689632415771484, "learning_rate": 0.00013151158768592183, "loss": 0.7966, "step": 990 }, { "epoch": 2.42, "grad_norm": 3.4381957054138184, "learning_rate": 0.0001308197855413352, "loss": 0.8109, "step": 1000 }, { "epoch": 2.42, "eval_accuracy": 0.8284734133790738, "eval_loss": 0.6368530988693237, "eval_runtime": 6.2777, "eval_samples_per_second": 185.738, "eval_steps_per_second": 23.257, "step": 1000 }, { "epoch": 2.45, "grad_norm": 2.984152317047119, "learning_rate": 0.00013012798339674854, "loss": 0.6477, "step": 1010 }, { "epoch": 2.47, "grad_norm": 5.486266613006592, "learning_rate": 0.00012943618125216188, "loss": 0.6834, "step": 1020 }, { "epoch": 2.49, "grad_norm": 2.2987334728240967, "learning_rate": 0.00012874437910757525, "loss": 0.6415, "step": 1030 }, { "epoch": 2.52, "grad_norm": 7.007256507873535, "learning_rate": 0.0001280525769629886, "loss": 0.6464, "step": 1040 }, { "epoch": 2.54, "grad_norm": 1.817421555519104, "learning_rate": 0.00012736077481840193, "loss": 0.774, "step": 1050 }, { "epoch": 2.57, "grad_norm": 4.492701530456543, "learning_rate": 0.0001266689726738153, "loss": 0.9331, "step": 1060 }, { "epoch": 2.59, "grad_norm": 3.884744644165039, "learning_rate": 0.00012597717052922864, "loss": 0.6107, "step": 1070 }, { "epoch": 2.62, "grad_norm": 4.274733066558838, "learning_rate": 0.000125285368384642, "loss": 0.6268, "step": 1080 }, { "epoch": 2.64, "grad_norm": 4.432763576507568, "learning_rate": 0.00012459356624005535, "loss": 0.6326, "step": 1090 }, { "epoch": 2.66, "grad_norm": 4.16074275970459, "learning_rate": 0.0001239017640954687, "loss": 0.8762, "step": 1100 }, { "epoch": 2.66, "eval_accuracy": 0.839622641509434, "eval_loss": 0.6335619688034058, "eval_runtime": 6.1128, "eval_samples_per_second": 190.748, "eval_steps_per_second": 23.884, "step": 1100 }, { "epoch": 2.69, "grad_norm": 4.018909931182861, "learning_rate": 0.00012320996195088206, "loss": 0.8039, "step": 1110 }, { "epoch": 2.71, "grad_norm": 8.111551284790039, "learning_rate": 0.0001225181598062954, "loss": 0.6436, "step": 1120 }, { "epoch": 2.74, "grad_norm": 4.22373628616333, "learning_rate": 0.00012182635766170876, "loss": 0.6228, "step": 1130 }, { "epoch": 2.76, "grad_norm": 4.817978858947754, "learning_rate": 0.00012113455551712211, "loss": 0.7047, "step": 1140 }, { "epoch": 2.78, "grad_norm": 5.471624851226807, "learning_rate": 0.00012044275337253545, "loss": 0.8293, "step": 1150 }, { "epoch": 2.81, "grad_norm": 3.491068124771118, "learning_rate": 0.00011975095122794881, "loss": 0.7128, "step": 1160 }, { "epoch": 2.83, "grad_norm": 4.463800430297852, "learning_rate": 0.00011905914908336215, "loss": 0.7569, "step": 1170 }, { "epoch": 2.86, "grad_norm": 2.7582342624664307, "learning_rate": 0.00011836734693877552, "loss": 0.6774, "step": 1180 }, { "epoch": 2.88, "grad_norm": 4.606247901916504, "learning_rate": 0.00011767554479418887, "loss": 0.7384, "step": 1190 }, { "epoch": 2.91, "grad_norm": 4.3657660484313965, "learning_rate": 0.00011698374264960222, "loss": 0.8034, "step": 1200 }, { "epoch": 2.91, "eval_accuracy": 0.8164665523156089, "eval_loss": 0.657957911491394, "eval_runtime": 6.0796, "eval_samples_per_second": 191.79, "eval_steps_per_second": 24.015, "step": 1200 }, { "epoch": 2.93, "grad_norm": 3.4329655170440674, "learning_rate": 0.00011629194050501557, "loss": 0.733, "step": 1210 }, { "epoch": 2.95, "grad_norm": 4.565850257873535, "learning_rate": 0.00011560013836042894, "loss": 0.7215, "step": 1220 }, { "epoch": 2.98, "grad_norm": 3.225835084915161, "learning_rate": 0.00011490833621584227, "loss": 0.6895, "step": 1230 }, { "epoch": 3.0, "grad_norm": 5.188159942626953, "learning_rate": 0.00011421653407125564, "loss": 0.6821, "step": 1240 }, { "epoch": 3.03, "grad_norm": 1.3620127439498901, "learning_rate": 0.00011352473192666896, "loss": 0.5604, "step": 1250 }, { "epoch": 3.05, "grad_norm": 3.446960687637329, "learning_rate": 0.00011283292978208233, "loss": 0.5554, "step": 1260 }, { "epoch": 3.08, "grad_norm": 5.016846656799316, "learning_rate": 0.00011214112763749569, "loss": 0.7156, "step": 1270 }, { "epoch": 3.1, "grad_norm": 5.600184440612793, "learning_rate": 0.00011144932549290903, "loss": 0.745, "step": 1280 }, { "epoch": 3.12, "grad_norm": 3.7640082836151123, "learning_rate": 0.00011075752334832239, "loss": 0.6316, "step": 1290 }, { "epoch": 3.15, "grad_norm": 4.837277889251709, "learning_rate": 0.00011006572120373573, "loss": 0.5833, "step": 1300 }, { "epoch": 3.15, "eval_accuracy": 0.8439108061749572, "eval_loss": 0.5827564597129822, "eval_runtime": 6.3009, "eval_samples_per_second": 185.052, "eval_steps_per_second": 23.171, "step": 1300 }, { "epoch": 3.17, "grad_norm": 5.6047797203063965, "learning_rate": 0.00010937391905914908, "loss": 0.7184, "step": 1310 }, { "epoch": 3.2, "grad_norm": 4.177833080291748, "learning_rate": 0.0001087512971290211, "loss": 0.7094, "step": 1320 }, { "epoch": 3.22, "grad_norm": 2.0531811714172363, "learning_rate": 0.00010805949498443447, "loss": 0.6679, "step": 1330 }, { "epoch": 3.24, "grad_norm": 2.919313430786133, "learning_rate": 0.0001073676928398478, "loss": 0.4368, "step": 1340 }, { "epoch": 3.27, "grad_norm": 5.47185754776001, "learning_rate": 0.00010667589069526116, "loss": 0.5666, "step": 1350 }, { "epoch": 3.29, "grad_norm": 5.082462310791016, "learning_rate": 0.00010598408855067452, "loss": 0.7758, "step": 1360 }, { "epoch": 3.32, "grad_norm": 3.3282408714294434, "learning_rate": 0.00010529228640608786, "loss": 0.6055, "step": 1370 }, { "epoch": 3.34, "grad_norm": 5.19661808013916, "learning_rate": 0.00010460048426150121, "loss": 0.6678, "step": 1380 }, { "epoch": 3.37, "grad_norm": 2.5423412322998047, "learning_rate": 0.00010390868211691456, "loss": 0.5333, "step": 1390 }, { "epoch": 3.39, "grad_norm": 9.068185806274414, "learning_rate": 0.00010321687997232792, "loss": 0.8811, "step": 1400 }, { "epoch": 3.39, "eval_accuracy": 0.8259005145797599, "eval_loss": 0.6564387679100037, "eval_runtime": 5.9782, "eval_samples_per_second": 195.041, "eval_steps_per_second": 24.422, "step": 1400 }, { "epoch": 3.41, "grad_norm": 5.794499397277832, "learning_rate": 0.00010252507782774128, "loss": 0.8535, "step": 1410 }, { "epoch": 3.44, "grad_norm": 4.731594562530518, "learning_rate": 0.00010183327568315462, "loss": 0.6556, "step": 1420 }, { "epoch": 3.46, "grad_norm": 3.823868751525879, "learning_rate": 0.00010114147353856798, "loss": 0.687, "step": 1430 }, { "epoch": 3.49, "grad_norm": 7.72351598739624, "learning_rate": 0.00010044967139398133, "loss": 0.6479, "step": 1440 }, { "epoch": 3.51, "grad_norm": 5.026217937469482, "learning_rate": 9.975786924939467e-05, "loss": 0.64, "step": 1450 }, { "epoch": 3.54, "grad_norm": 2.873476028442383, "learning_rate": 9.906606710480803e-05, "loss": 0.6692, "step": 1460 }, { "epoch": 3.56, "grad_norm": 3.9772098064422607, "learning_rate": 9.837426496022138e-05, "loss": 0.8156, "step": 1470 }, { "epoch": 3.58, "grad_norm": 5.044854164123535, "learning_rate": 9.768246281563474e-05, "loss": 0.7261, "step": 1480 }, { "epoch": 3.61, "grad_norm": 2.8663127422332764, "learning_rate": 9.699066067104808e-05, "loss": 0.7608, "step": 1490 }, { "epoch": 3.63, "grad_norm": 7.623239040374756, "learning_rate": 9.629885852646143e-05, "loss": 0.5639, "step": 1500 }, { "epoch": 3.63, "eval_accuracy": 0.8439108061749572, "eval_loss": 0.5736597180366516, "eval_runtime": 6.1394, "eval_samples_per_second": 189.92, "eval_steps_per_second": 23.781, "step": 1500 }, { "epoch": 3.66, "grad_norm": 4.9297966957092285, "learning_rate": 9.560705638187479e-05, "loss": 0.6672, "step": 1510 }, { "epoch": 3.68, "grad_norm": 2.0875468254089355, "learning_rate": 9.491525423728815e-05, "loss": 0.5335, "step": 1520 }, { "epoch": 3.7, "grad_norm": 10.350793838500977, "learning_rate": 9.422345209270149e-05, "loss": 0.7163, "step": 1530 }, { "epoch": 3.73, "grad_norm": 4.162230014801025, "learning_rate": 9.353164994811484e-05, "loss": 0.7528, "step": 1540 }, { "epoch": 3.75, "grad_norm": 5.249913215637207, "learning_rate": 9.28398478035282e-05, "loss": 0.7946, "step": 1550 }, { "epoch": 3.78, "grad_norm": 3.5188651084899902, "learning_rate": 9.214804565894155e-05, "loss": 0.7108, "step": 1560 }, { "epoch": 3.8, "grad_norm": 5.497143268585205, "learning_rate": 9.14562435143549e-05, "loss": 0.8685, "step": 1570 }, { "epoch": 3.83, "grad_norm": 4.383511066436768, "learning_rate": 9.076444136976825e-05, "loss": 0.7093, "step": 1580 }, { "epoch": 3.85, "grad_norm": 4.953444957733154, "learning_rate": 9.00726392251816e-05, "loss": 0.7253, "step": 1590 }, { "epoch": 3.87, "grad_norm": 3.61757230758667, "learning_rate": 8.938083708059496e-05, "loss": 0.639, "step": 1600 }, { "epoch": 3.87, "eval_accuracy": 0.8379073756432247, "eval_loss": 0.560886561870575, "eval_runtime": 5.9489, "eval_samples_per_second": 196.002, "eval_steps_per_second": 24.542, "step": 1600 }, { "epoch": 3.9, "grad_norm": 5.650317668914795, "learning_rate": 8.868903493600831e-05, "loss": 0.8287, "step": 1610 }, { "epoch": 3.92, "grad_norm": 6.012249946594238, "learning_rate": 8.799723279142166e-05, "loss": 0.6976, "step": 1620 }, { "epoch": 3.95, "grad_norm": 5.186240196228027, "learning_rate": 8.730543064683501e-05, "loss": 0.7452, "step": 1630 }, { "epoch": 3.97, "grad_norm": 5.2669572830200195, "learning_rate": 8.661362850224835e-05, "loss": 0.8586, "step": 1640 }, { "epoch": 4.0, "grad_norm": 2.4105117321014404, "learning_rate": 8.592182635766172e-05, "loss": 0.6194, "step": 1650 }, { "epoch": 4.02, "grad_norm": 1.2886375188827515, "learning_rate": 8.523002421307506e-05, "loss": 0.4969, "step": 1660 }, { "epoch": 4.04, "grad_norm": 1.915207862854004, "learning_rate": 8.453822206848842e-05, "loss": 0.571, "step": 1670 }, { "epoch": 4.07, "grad_norm": 3.7422375679016113, "learning_rate": 8.384641992390176e-05, "loss": 0.6791, "step": 1680 }, { "epoch": 4.09, "grad_norm": 5.4421467781066895, "learning_rate": 8.315461777931513e-05, "loss": 0.6829, "step": 1690 }, { "epoch": 4.12, "grad_norm": 1.9872852563858032, "learning_rate": 8.246281563472847e-05, "loss": 0.6455, "step": 1700 }, { "epoch": 4.12, "eval_accuracy": 0.8370497427101201, "eval_loss": 0.5820054411888123, "eval_runtime": 6.2231, "eval_samples_per_second": 187.366, "eval_steps_per_second": 23.461, "step": 1700 }, { "epoch": 4.14, "grad_norm": 3.1257195472717285, "learning_rate": 8.177101349014182e-05, "loss": 0.6619, "step": 1710 }, { "epoch": 4.16, "grad_norm": 3.6743292808532715, "learning_rate": 8.107921134555517e-05, "loss": 0.8357, "step": 1720 }, { "epoch": 4.19, "grad_norm": 3.7856836318969727, "learning_rate": 8.038740920096852e-05, "loss": 0.6017, "step": 1730 }, { "epoch": 4.21, "grad_norm": 4.6526970863342285, "learning_rate": 7.969560705638188e-05, "loss": 0.6805, "step": 1740 }, { "epoch": 4.24, "grad_norm": 3.4002761840820312, "learning_rate": 7.900380491179523e-05, "loss": 0.5558, "step": 1750 }, { "epoch": 4.26, "grad_norm": 3.9795327186584473, "learning_rate": 7.831200276720859e-05, "loss": 0.6921, "step": 1760 }, { "epoch": 4.29, "grad_norm": 3.5085155963897705, "learning_rate": 7.762020062262193e-05, "loss": 0.5476, "step": 1770 }, { "epoch": 4.31, "grad_norm": 5.0314412117004395, "learning_rate": 7.69283984780353e-05, "loss": 0.8566, "step": 1780 }, { "epoch": 4.33, "grad_norm": 2.536855697631836, "learning_rate": 7.623659633344864e-05, "loss": 0.5743, "step": 1790 }, { "epoch": 4.36, "grad_norm": 4.995050430297852, "learning_rate": 7.5544794188862e-05, "loss": 0.5402, "step": 1800 }, { "epoch": 4.36, "eval_accuracy": 0.8344768439108061, "eval_loss": 0.5796906352043152, "eval_runtime": 6.1279, "eval_samples_per_second": 190.278, "eval_steps_per_second": 23.826, "step": 1800 }, { "epoch": 4.38, "grad_norm": 4.143467903137207, "learning_rate": 7.485299204427533e-05, "loss": 0.6715, "step": 1810 }, { "epoch": 4.41, "grad_norm": 1.8152028322219849, "learning_rate": 7.416118989968869e-05, "loss": 0.6965, "step": 1820 }, { "epoch": 4.43, "grad_norm": 3.699620485305786, "learning_rate": 7.346938775510205e-05, "loss": 0.5758, "step": 1830 }, { "epoch": 4.46, "grad_norm": 2.2266180515289307, "learning_rate": 7.27775856105154e-05, "loss": 0.6802, "step": 1840 }, { "epoch": 4.48, "grad_norm": 4.586669445037842, "learning_rate": 7.208578346592874e-05, "loss": 0.5885, "step": 1850 }, { "epoch": 4.5, "grad_norm": 4.72069787979126, "learning_rate": 7.13939813213421e-05, "loss": 0.6404, "step": 1860 }, { "epoch": 4.53, "grad_norm": 5.436990261077881, "learning_rate": 7.070217917675545e-05, "loss": 0.7781, "step": 1870 }, { "epoch": 4.55, "grad_norm": 4.715204238891602, "learning_rate": 7.001037703216881e-05, "loss": 0.7109, "step": 1880 }, { "epoch": 4.58, "grad_norm": 3.261801242828369, "learning_rate": 6.931857488758215e-05, "loss": 0.5707, "step": 1890 }, { "epoch": 4.6, "grad_norm": 3.516954183578491, "learning_rate": 6.86267727429955e-05, "loss": 0.5311, "step": 1900 }, { "epoch": 4.6, "eval_accuracy": 0.8456260720411664, "eval_loss": 0.55106520652771, "eval_runtime": 6.3501, "eval_samples_per_second": 183.618, "eval_steps_per_second": 22.992, "step": 1900 }, { "epoch": 4.62, "grad_norm": 4.697694778442383, "learning_rate": 6.793497059840886e-05, "loss": 0.6169, "step": 1910 }, { "epoch": 4.65, "grad_norm": 4.627555847167969, "learning_rate": 6.724316845382221e-05, "loss": 0.649, "step": 1920 }, { "epoch": 4.67, "grad_norm": 3.16441011428833, "learning_rate": 6.662054652369423e-05, "loss": 0.6538, "step": 1930 }, { "epoch": 4.7, "grad_norm": 3.6413562297821045, "learning_rate": 6.592874437910757e-05, "loss": 0.5963, "step": 1940 }, { "epoch": 4.72, "grad_norm": 4.7628960609436035, "learning_rate": 6.523694223452093e-05, "loss": 0.5106, "step": 1950 }, { "epoch": 4.75, "grad_norm": 3.3812217712402344, "learning_rate": 6.454514008993428e-05, "loss": 0.6908, "step": 1960 }, { "epoch": 4.77, "grad_norm": 3.9284725189208984, "learning_rate": 6.385333794534764e-05, "loss": 0.4863, "step": 1970 }, { "epoch": 4.79, "grad_norm": 3.633194923400879, "learning_rate": 6.316153580076098e-05, "loss": 0.4383, "step": 1980 }, { "epoch": 4.82, "grad_norm": 6.324495792388916, "learning_rate": 6.246973365617433e-05, "loss": 0.8217, "step": 1990 }, { "epoch": 4.84, "grad_norm": 5.055554389953613, "learning_rate": 6.177793151158769e-05, "loss": 0.5734, "step": 2000 }, { "epoch": 4.84, "eval_accuracy": 0.8507718696397941, "eval_loss": 0.5443547964096069, "eval_runtime": 5.9845, "eval_samples_per_second": 194.837, "eval_steps_per_second": 24.396, "step": 2000 }, { "epoch": 4.87, "grad_norm": 3.0936367511749268, "learning_rate": 6.108612936700104e-05, "loss": 0.6563, "step": 2010 }, { "epoch": 4.89, "grad_norm": 3.478715181350708, "learning_rate": 6.039432722241439e-05, "loss": 0.595, "step": 2020 }, { "epoch": 4.92, "grad_norm": 3.4817001819610596, "learning_rate": 5.970252507782774e-05, "loss": 0.3782, "step": 2030 }, { "epoch": 4.94, "grad_norm": 6.603343963623047, "learning_rate": 5.901072293324109e-05, "loss": 0.5735, "step": 2040 }, { "epoch": 4.96, "grad_norm": 1.6107616424560547, "learning_rate": 5.831892078865445e-05, "loss": 0.6045, "step": 2050 }, { "epoch": 4.99, "grad_norm": 4.367840766906738, "learning_rate": 5.76271186440678e-05, "loss": 0.5533, "step": 2060 }, { "epoch": 5.01, "grad_norm": 6.6618266105651855, "learning_rate": 5.6935316499481154e-05, "loss": 0.4114, "step": 2070 }, { "epoch": 5.04, "grad_norm": 7.852776050567627, "learning_rate": 5.62435143548945e-05, "loss": 0.642, "step": 2080 }, { "epoch": 5.06, "grad_norm": 5.769771099090576, "learning_rate": 5.555171221030785e-05, "loss": 0.5934, "step": 2090 }, { "epoch": 5.08, "grad_norm": 2.4635396003723145, "learning_rate": 5.485991006572121e-05, "loss": 0.5206, "step": 2100 }, { "epoch": 5.08, "eval_accuracy": 0.8636363636363636, "eval_loss": 0.5326434969902039, "eval_runtime": 6.1606, "eval_samples_per_second": 189.267, "eval_steps_per_second": 23.699, "step": 2100 }, { "epoch": 5.11, "grad_norm": 4.699713706970215, "learning_rate": 5.416810792113456e-05, "loss": 0.6311, "step": 2110 }, { "epoch": 5.13, "grad_norm": 3.2119288444519043, "learning_rate": 5.347630577654791e-05, "loss": 0.4781, "step": 2120 }, { "epoch": 5.16, "grad_norm": 5.083879470825195, "learning_rate": 5.278450363196126e-05, "loss": 0.5123, "step": 2130 }, { "epoch": 5.18, "grad_norm": 3.2444283962249756, "learning_rate": 5.209270148737462e-05, "loss": 0.39, "step": 2140 }, { "epoch": 5.21, "grad_norm": 2.9540326595306396, "learning_rate": 5.140089934278797e-05, "loss": 0.5081, "step": 2150 }, { "epoch": 5.23, "grad_norm": 4.055675029754639, "learning_rate": 5.0709097198201316e-05, "loss": 0.5884, "step": 2160 }, { "epoch": 5.25, "grad_norm": 2.7214150428771973, "learning_rate": 5.0017295053614664e-05, "loss": 0.5241, "step": 2170 }, { "epoch": 5.28, "grad_norm": 1.249835729598999, "learning_rate": 4.932549290902802e-05, "loss": 0.3994, "step": 2180 }, { "epoch": 5.3, "grad_norm": 5.9494829177856445, "learning_rate": 4.863369076444137e-05, "loss": 0.5295, "step": 2190 }, { "epoch": 5.33, "grad_norm": 3.318251371383667, "learning_rate": 4.794188861985472e-05, "loss": 0.6272, "step": 2200 }, { "epoch": 5.33, "eval_accuracy": 0.8524871355060034, "eval_loss": 0.5477628707885742, "eval_runtime": 6.3218, "eval_samples_per_second": 184.442, "eval_steps_per_second": 23.095, "step": 2200 }, { "epoch": 5.35, "grad_norm": 4.580955505371094, "learning_rate": 4.725008647526807e-05, "loss": 0.5204, "step": 2210 }, { "epoch": 5.38, "grad_norm": 1.7544004917144775, "learning_rate": 4.6558284330681426e-05, "loss": 0.5388, "step": 2220 }, { "epoch": 5.4, "grad_norm": 4.454672336578369, "learning_rate": 4.586648218609478e-05, "loss": 0.4773, "step": 2230 }, { "epoch": 5.42, "grad_norm": 7.039458274841309, "learning_rate": 4.517468004150813e-05, "loss": 0.5821, "step": 2240 }, { "epoch": 5.45, "grad_norm": 4.6715006828308105, "learning_rate": 4.4482877896921485e-05, "loss": 0.5881, "step": 2250 }, { "epoch": 5.47, "grad_norm": 3.3161721229553223, "learning_rate": 4.379107575233484e-05, "loss": 0.5587, "step": 2260 }, { "epoch": 5.5, "grad_norm": 5.245398044586182, "learning_rate": 4.309927360774819e-05, "loss": 0.4973, "step": 2270 }, { "epoch": 5.52, "grad_norm": 3.1016721725463867, "learning_rate": 4.2407471463161536e-05, "loss": 0.4004, "step": 2280 }, { "epoch": 5.54, "grad_norm": 4.883015155792236, "learning_rate": 4.171566931857489e-05, "loss": 0.4267, "step": 2290 }, { "epoch": 5.57, "grad_norm": 4.550380229949951, "learning_rate": 4.102386717398824e-05, "loss": 0.5124, "step": 2300 }, { "epoch": 5.57, "eval_accuracy": 0.8687821612349914, "eval_loss": 0.5295912623405457, "eval_runtime": 6.0432, "eval_samples_per_second": 192.944, "eval_steps_per_second": 24.159, "step": 2300 }, { "epoch": 5.59, "grad_norm": 3.2469303607940674, "learning_rate": 4.0332065029401595e-05, "loss": 0.4733, "step": 2310 }, { "epoch": 5.62, "grad_norm": 5.656401634216309, "learning_rate": 3.964026288481494e-05, "loss": 0.5978, "step": 2320 }, { "epoch": 5.64, "grad_norm": 2.7476541996002197, "learning_rate": 3.89484607402283e-05, "loss": 0.3513, "step": 2330 }, { "epoch": 5.67, "grad_norm": 4.047815322875977, "learning_rate": 3.825665859564165e-05, "loss": 0.5287, "step": 2340 }, { "epoch": 5.69, "grad_norm": 3.4885923862457275, "learning_rate": 3.7564856451055e-05, "loss": 0.4946, "step": 2350 }, { "epoch": 5.71, "grad_norm": 7.513520240783691, "learning_rate": 3.687305430646835e-05, "loss": 0.6233, "step": 2360 }, { "epoch": 5.74, "grad_norm": 2.3985989093780518, "learning_rate": 3.61812521618817e-05, "loss": 0.5149, "step": 2370 }, { "epoch": 5.76, "grad_norm": 5.046018123626709, "learning_rate": 3.5489450017295054e-05, "loss": 0.4948, "step": 2380 }, { "epoch": 5.79, "grad_norm": 2.6082875728607178, "learning_rate": 3.479764787270841e-05, "loss": 0.6084, "step": 2390 }, { "epoch": 5.81, "grad_norm": 2.541283369064331, "learning_rate": 3.410584572812176e-05, "loss": 0.5659, "step": 2400 }, { "epoch": 5.81, "eval_accuracy": 0.8704974271012007, "eval_loss": 0.5180826783180237, "eval_runtime": 6.1391, "eval_samples_per_second": 189.929, "eval_steps_per_second": 23.782, "step": 2400 }, { "epoch": 5.84, "grad_norm": 2.853994846343994, "learning_rate": 3.341404358353511e-05, "loss": 0.6081, "step": 2410 }, { "epoch": 5.86, "grad_norm": 4.628828525543213, "learning_rate": 3.272224143894847e-05, "loss": 0.4588, "step": 2420 }, { "epoch": 5.88, "grad_norm": 3.1006319522857666, "learning_rate": 3.2030439294361816e-05, "loss": 0.4341, "step": 2430 }, { "epoch": 5.91, "grad_norm": 2.395719528198242, "learning_rate": 3.133863714977517e-05, "loss": 0.442, "step": 2440 }, { "epoch": 5.93, "grad_norm": 3.238839864730835, "learning_rate": 3.064683500518852e-05, "loss": 0.4359, "step": 2450 }, { "epoch": 5.96, "grad_norm": 5.706843852996826, "learning_rate": 2.9955032860601867e-05, "loss": 0.5139, "step": 2460 }, { "epoch": 5.98, "grad_norm": 6.059083461761475, "learning_rate": 2.9263230716015223e-05, "loss": 0.4459, "step": 2470 }, { "epoch": 6.0, "grad_norm": 4.164783954620361, "learning_rate": 2.857142857142857e-05, "loss": 0.5037, "step": 2480 }, { "epoch": 6.03, "grad_norm": 3.230203151702881, "learning_rate": 2.7879626426841926e-05, "loss": 0.4225, "step": 2490 }, { "epoch": 6.05, "grad_norm": 5.467190742492676, "learning_rate": 2.7187824282255274e-05, "loss": 0.4212, "step": 2500 }, { "epoch": 6.05, "eval_accuracy": 0.8610634648370498, "eval_loss": 0.5200443267822266, "eval_runtime": 6.2608, "eval_samples_per_second": 186.239, "eval_steps_per_second": 23.32, "step": 2500 }, { "epoch": 6.08, "grad_norm": 3.7668442726135254, "learning_rate": 2.649602213766863e-05, "loss": 0.4042, "step": 2510 }, { "epoch": 6.1, "grad_norm": 3.094477415084839, "learning_rate": 2.580421999308198e-05, "loss": 0.4338, "step": 2520 }, { "epoch": 6.13, "grad_norm": 5.538024425506592, "learning_rate": 2.5112417848495333e-05, "loss": 0.3269, "step": 2530 }, { "epoch": 6.15, "grad_norm": 5.658746719360352, "learning_rate": 2.4420615703908685e-05, "loss": 0.4719, "step": 2540 }, { "epoch": 6.17, "grad_norm": 1.6886987686157227, "learning_rate": 2.3728813559322036e-05, "loss": 0.395, "step": 2550 }, { "epoch": 6.2, "grad_norm": 3.538180112838745, "learning_rate": 2.3037011414735388e-05, "loss": 0.2877, "step": 2560 }, { "epoch": 6.22, "grad_norm": 2.9912898540496826, "learning_rate": 2.234520927014874e-05, "loss": 0.4797, "step": 2570 }, { "epoch": 6.25, "grad_norm": 2.68037748336792, "learning_rate": 2.1653407125562088e-05, "loss": 0.5114, "step": 2580 }, { "epoch": 6.27, "grad_norm": 5.079796314239502, "learning_rate": 2.096160498097544e-05, "loss": 0.3604, "step": 2590 }, { "epoch": 6.3, "grad_norm": 3.052543878555298, "learning_rate": 2.026980283638879e-05, "loss": 0.4338, "step": 2600 }, { "epoch": 6.3, "eval_accuracy": 0.8730703259005146, "eval_loss": 0.5135151743888855, "eval_runtime": 5.9846, "eval_samples_per_second": 194.834, "eval_steps_per_second": 24.396, "step": 2600 }, { "epoch": 6.32, "grad_norm": 5.780861854553223, "learning_rate": 1.9578000691802147e-05, "loss": 0.3725, "step": 2610 }, { "epoch": 6.34, "grad_norm": 4.87053108215332, "learning_rate": 1.88861985472155e-05, "loss": 0.2491, "step": 2620 }, { "epoch": 6.37, "grad_norm": 2.2995293140411377, "learning_rate": 1.819439640262885e-05, "loss": 0.2911, "step": 2630 }, { "epoch": 6.39, "grad_norm": 1.6383118629455566, "learning_rate": 1.7502594258042202e-05, "loss": 0.2562, "step": 2640 }, { "epoch": 6.42, "grad_norm": 4.9596991539001465, "learning_rate": 1.6810792113455554e-05, "loss": 0.5795, "step": 2650 }, { "epoch": 6.44, "grad_norm": 2.922712802886963, "learning_rate": 1.6118989968868905e-05, "loss": 0.421, "step": 2660 }, { "epoch": 6.46, "grad_norm": 2.0401623249053955, "learning_rate": 1.5427187824282254e-05, "loss": 0.4283, "step": 2670 }, { "epoch": 6.49, "grad_norm": 0.9165148735046387, "learning_rate": 1.4735385679695607e-05, "loss": 0.4512, "step": 2680 }, { "epoch": 6.51, "grad_norm": 4.587483882904053, "learning_rate": 1.4043583535108959e-05, "loss": 0.4664, "step": 2690 }, { "epoch": 6.54, "grad_norm": 4.216481685638428, "learning_rate": 1.335178139052231e-05, "loss": 0.3407, "step": 2700 }, { "epoch": 6.54, "eval_accuracy": 0.87221269296741, "eval_loss": 0.5147121548652649, "eval_runtime": 6.1635, "eval_samples_per_second": 189.179, "eval_steps_per_second": 23.688, "step": 2700 }, { "epoch": 6.56, "grad_norm": 1.7551047801971436, "learning_rate": 1.2659979245935664e-05, "loss": 0.4725, "step": 2710 }, { "epoch": 6.59, "grad_norm": 4.851523399353027, "learning_rate": 1.1968177101349016e-05, "loss": 0.4639, "step": 2720 }, { "epoch": 6.61, "grad_norm": 6.040704727172852, "learning_rate": 1.1276374956762366e-05, "loss": 0.3146, "step": 2730 }, { "epoch": 6.63, "grad_norm": 1.6925532817840576, "learning_rate": 1.0584572812175717e-05, "loss": 0.3665, "step": 2740 }, { "epoch": 6.66, "grad_norm": 2.9491493701934814, "learning_rate": 9.89277066758907e-06, "loss": 0.467, "step": 2750 }, { "epoch": 6.68, "grad_norm": 2.1744699478149414, "learning_rate": 9.200968523002422e-06, "loss": 0.3542, "step": 2760 }, { "epoch": 6.71, "grad_norm": 3.170931577682495, "learning_rate": 8.509166378415774e-06, "loss": 0.5874, "step": 2770 }, { "epoch": 6.73, "grad_norm": 3.2446773052215576, "learning_rate": 7.817364233829124e-06, "loss": 0.3705, "step": 2780 }, { "epoch": 6.76, "grad_norm": 3.8055498600006104, "learning_rate": 7.125562089242477e-06, "loss": 0.3164, "step": 2790 }, { "epoch": 6.78, "grad_norm": 2.3979437351226807, "learning_rate": 6.4337599446558285e-06, "loss": 0.4043, "step": 2800 }, { "epoch": 6.78, "eval_accuracy": 0.869639794168096, "eval_loss": 0.5081329345703125, "eval_runtime": 6.6143, "eval_samples_per_second": 176.285, "eval_steps_per_second": 22.073, "step": 2800 }, { "epoch": 6.8, "grad_norm": 1.7395985126495361, "learning_rate": 5.74195780006918e-06, "loss": 0.3624, "step": 2810 }, { "epoch": 6.83, "grad_norm": 2.924905300140381, "learning_rate": 5.050155655482532e-06, "loss": 0.4046, "step": 2820 }, { "epoch": 6.85, "grad_norm": 11.709400177001953, "learning_rate": 4.358353510895884e-06, "loss": 0.4807, "step": 2830 }, { "epoch": 6.88, "grad_norm": 6.416582107543945, "learning_rate": 3.666551366309236e-06, "loss": 0.4782, "step": 2840 }, { "epoch": 6.9, "grad_norm": 6.1391448974609375, "learning_rate": 2.9747492217225875e-06, "loss": 0.4852, "step": 2850 }, { "epoch": 6.92, "grad_norm": 3.525520086288452, "learning_rate": 2.2829470771359392e-06, "loss": 0.4282, "step": 2860 }, { "epoch": 6.95, "grad_norm": 1.4197200536727905, "learning_rate": 1.591144932549291e-06, "loss": 0.4337, "step": 2870 }, { "epoch": 6.97, "grad_norm": 4.016748905181885, "learning_rate": 8.993427879626428e-07, "loss": 0.3915, "step": 2880 }, { "epoch": 7.0, "grad_norm": 2.1515309810638428, "learning_rate": 2.0754064337599448e-07, "loss": 0.4095, "step": 2890 }, { "epoch": 7.0, "step": 2891, "total_flos": 3.5833623598425784e+18, "train_loss": 0.7298227465279536, "train_runtime": 1041.6701, "train_samples_per_second": 44.372, "train_steps_per_second": 2.775 } ], "logging_steps": 10, "max_steps": 2891, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 100, "total_flos": 3.5833623598425784e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }