{ "best_metric": 0.6351959109306335, "best_model_checkpoint": "Action_agent/checkpoint-4100", "epoch": 20.0, "eval_steps": 100, "global_step": 6280, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 1.1194300651550293, "learning_rate": 9.97611464968153e-06, "loss": 2.2988, "step": 15 }, { "epoch": 0.1, "grad_norm": 1.3563270568847656, "learning_rate": 9.952229299363057e-06, "loss": 2.2782, "step": 30 }, { "epoch": 0.14, "grad_norm": 1.2270524501800537, "learning_rate": 9.928343949044586e-06, "loss": 2.2582, "step": 45 }, { "epoch": 0.19, "grad_norm": 1.1236611604690552, "learning_rate": 9.904458598726116e-06, "loss": 2.2436, "step": 60 }, { "epoch": 0.24, "grad_norm": 1.2250008583068848, "learning_rate": 9.880573248407644e-06, "loss": 2.2234, "step": 75 }, { "epoch": 0.29, "grad_norm": 1.230263590812683, "learning_rate": 9.856687898089172e-06, "loss": 2.1987, "step": 90 }, { "epoch": 0.32, "eval_accuracy": 0.3914285714285714, "eval_loss": 2.1640379428863525, "eval_runtime": 20.4497, "eval_samples_per_second": 51.346, "eval_steps_per_second": 6.455, "step": 100 }, { "epoch": 0.33, "grad_norm": 1.2624423503875732, "learning_rate": 9.832802547770702e-06, "loss": 2.1866, "step": 105 }, { "epoch": 0.38, "grad_norm": 1.2295231819152832, "learning_rate": 9.80891719745223e-06, "loss": 2.1503, "step": 120 }, { "epoch": 0.43, "grad_norm": 1.6050071716308594, "learning_rate": 9.78503184713376e-06, "loss": 2.1169, "step": 135 }, { "epoch": 0.48, "grad_norm": 1.3574703931808472, "learning_rate": 9.761146496815288e-06, "loss": 2.0902, "step": 150 }, { "epoch": 0.53, "grad_norm": 1.575221300125122, "learning_rate": 9.737261146496816e-06, "loss": 2.052, "step": 165 }, { "epoch": 0.57, "grad_norm": 1.443302035331726, "learning_rate": 9.713375796178345e-06, "loss": 2.0154, "step": 180 }, { "epoch": 0.62, "grad_norm": 1.4323551654815674, "learning_rate": 9.689490445859873e-06, "loss": 1.9807, "step": 195 }, { "epoch": 0.64, "eval_accuracy": 0.6142857142857143, "eval_loss": 1.9168660640716553, "eval_runtime": 13.8079, "eval_samples_per_second": 76.043, "eval_steps_per_second": 9.56, "step": 200 }, { "epoch": 0.67, "grad_norm": 1.587117314338684, "learning_rate": 9.665605095541401e-06, "loss": 1.9042, "step": 210 }, { "epoch": 0.72, "grad_norm": 1.4769114255905151, "learning_rate": 9.641719745222931e-06, "loss": 1.8832, "step": 225 }, { "epoch": 0.76, "grad_norm": 1.5509356260299683, "learning_rate": 9.617834394904459e-06, "loss": 1.8388, "step": 240 }, { "epoch": 0.81, "grad_norm": 1.6834059953689575, "learning_rate": 9.593949044585989e-06, "loss": 1.7997, "step": 255 }, { "epoch": 0.86, "grad_norm": 1.6854455471038818, "learning_rate": 9.570063694267517e-06, "loss": 1.7652, "step": 270 }, { "epoch": 0.91, "grad_norm": 1.7652379274368286, "learning_rate": 9.546178343949045e-06, "loss": 1.7315, "step": 285 }, { "epoch": 0.96, "grad_norm": 1.6533325910568237, "learning_rate": 9.522292993630574e-06, "loss": 1.6738, "step": 300 }, { "epoch": 0.96, "eval_accuracy": 0.72, "eval_loss": 1.6147921085357666, "eval_runtime": 14.0649, "eval_samples_per_second": 74.654, "eval_steps_per_second": 9.385, "step": 300 }, { "epoch": 1.0, "grad_norm": 1.9673418998718262, "learning_rate": 9.498407643312102e-06, "loss": 1.6181, "step": 315 }, { "epoch": 1.05, "grad_norm": 1.8408273458480835, "learning_rate": 9.47452229299363e-06, "loss": 1.601, "step": 330 }, { "epoch": 1.1, "grad_norm": 1.7604966163635254, "learning_rate": 9.45063694267516e-06, "loss": 1.5844, "step": 345 }, { "epoch": 1.15, "grad_norm": 1.9570591449737549, "learning_rate": 9.426751592356688e-06, "loss": 1.5673, "step": 360 }, { "epoch": 1.19, "grad_norm": 1.7698423862457275, "learning_rate": 9.402866242038218e-06, "loss": 1.5021, "step": 375 }, { "epoch": 1.24, "grad_norm": 2.1795897483825684, "learning_rate": 9.378980891719746e-06, "loss": 1.4828, "step": 390 }, { "epoch": 1.27, "eval_accuracy": 0.7704761904761904, "eval_loss": 1.3860931396484375, "eval_runtime": 14.2522, "eval_samples_per_second": 73.673, "eval_steps_per_second": 9.262, "step": 400 }, { "epoch": 1.29, "grad_norm": 1.9033102989196777, "learning_rate": 9.355095541401275e-06, "loss": 1.4153, "step": 405 }, { "epoch": 1.34, "grad_norm": 2.0485243797302246, "learning_rate": 9.331210191082803e-06, "loss": 1.3602, "step": 420 }, { "epoch": 1.39, "grad_norm": 2.0998847484588623, "learning_rate": 9.307324840764333e-06, "loss": 1.3972, "step": 435 }, { "epoch": 1.43, "grad_norm": 2.2247631549835205, "learning_rate": 9.283439490445861e-06, "loss": 1.3401, "step": 450 }, { "epoch": 1.48, "grad_norm": 2.1834170818328857, "learning_rate": 9.259554140127389e-06, "loss": 1.2974, "step": 465 }, { "epoch": 1.53, "grad_norm": 2.6833572387695312, "learning_rate": 9.235668789808919e-06, "loss": 1.2803, "step": 480 }, { "epoch": 1.58, "grad_norm": 2.371852397918701, "learning_rate": 9.211783439490447e-06, "loss": 1.2768, "step": 495 }, { "epoch": 1.59, "eval_accuracy": 0.7590476190476191, "eval_loss": 1.2411518096923828, "eval_runtime": 14.0673, "eval_samples_per_second": 74.641, "eval_steps_per_second": 9.383, "step": 500 }, { "epoch": 1.62, "grad_norm": 2.595338821411133, "learning_rate": 9.187898089171975e-06, "loss": 1.2409, "step": 510 }, { "epoch": 1.67, "grad_norm": 1.8707057237625122, "learning_rate": 9.164012738853504e-06, "loss": 1.279, "step": 525 }, { "epoch": 1.72, "grad_norm": 6.527611255645752, "learning_rate": 9.140127388535032e-06, "loss": 1.2422, "step": 540 }, { "epoch": 1.77, "grad_norm": 2.4771523475646973, "learning_rate": 9.116242038216562e-06, "loss": 1.2176, "step": 555 }, { "epoch": 1.82, "grad_norm": 2.560882329940796, "learning_rate": 9.09235668789809e-06, "loss": 1.1891, "step": 570 }, { "epoch": 1.86, "grad_norm": 2.966686487197876, "learning_rate": 9.068471337579618e-06, "loss": 1.201, "step": 585 }, { "epoch": 1.91, "grad_norm": 2.8730111122131348, "learning_rate": 9.044585987261148e-06, "loss": 1.1759, "step": 600 }, { "epoch": 1.91, "eval_accuracy": 0.7914285714285715, "eval_loss": 1.116868257522583, "eval_runtime": 13.8703, "eval_samples_per_second": 75.701, "eval_steps_per_second": 9.517, "step": 600 }, { "epoch": 1.96, "grad_norm": 2.659374237060547, "learning_rate": 9.020700636942676e-06, "loss": 1.1196, "step": 615 }, { "epoch": 2.01, "grad_norm": 2.438401699066162, "learning_rate": 8.996815286624204e-06, "loss": 1.1535, "step": 630 }, { "epoch": 2.05, "grad_norm": 2.551445245742798, "learning_rate": 8.972929936305733e-06, "loss": 1.1448, "step": 645 }, { "epoch": 2.1, "grad_norm": 2.738217353820801, "learning_rate": 8.949044585987261e-06, "loss": 1.0833, "step": 660 }, { "epoch": 2.15, "grad_norm": 2.850508689880371, "learning_rate": 8.925159235668791e-06, "loss": 1.0304, "step": 675 }, { "epoch": 2.2, "grad_norm": 1.8277325630187988, "learning_rate": 8.901273885350319e-06, "loss": 1.0314, "step": 690 }, { "epoch": 2.23, "eval_accuracy": 0.7761904761904762, "eval_loss": 1.0599384307861328, "eval_runtime": 13.8327, "eval_samples_per_second": 75.907, "eval_steps_per_second": 9.543, "step": 700 }, { "epoch": 2.25, "grad_norm": 3.3399343490600586, "learning_rate": 8.877388535031847e-06, "loss": 1.0278, "step": 705 }, { "epoch": 2.29, "grad_norm": 2.1914594173431396, "learning_rate": 8.853503184713377e-06, "loss": 1.0493, "step": 720 }, { "epoch": 2.34, "grad_norm": 4.2753682136535645, "learning_rate": 8.829617834394906e-06, "loss": 1.0377, "step": 735 }, { "epoch": 2.39, "grad_norm": 2.1339023113250732, "learning_rate": 8.805732484076433e-06, "loss": 1.0431, "step": 750 }, { "epoch": 2.44, "grad_norm": 2.3232874870300293, "learning_rate": 8.781847133757962e-06, "loss": 0.9801, "step": 765 }, { "epoch": 2.48, "grad_norm": 2.304199695587158, "learning_rate": 8.757961783439492e-06, "loss": 0.9999, "step": 780 }, { "epoch": 2.53, "grad_norm": 2.624868154525757, "learning_rate": 8.734076433121018e-06, "loss": 0.9702, "step": 795 }, { "epoch": 2.55, "eval_accuracy": 0.8104761904761905, "eval_loss": 0.9639754295349121, "eval_runtime": 13.7909, "eval_samples_per_second": 76.137, "eval_steps_per_second": 9.572, "step": 800 }, { "epoch": 2.58, "grad_norm": 3.725691080093384, "learning_rate": 8.710191082802548e-06, "loss": 0.9801, "step": 810 }, { "epoch": 2.63, "grad_norm": 3.1988706588745117, "learning_rate": 8.686305732484078e-06, "loss": 0.976, "step": 825 }, { "epoch": 2.68, "grad_norm": 3.571751356124878, "learning_rate": 8.662420382165606e-06, "loss": 0.9098, "step": 840 }, { "epoch": 2.72, "grad_norm": 3.5634512901306152, "learning_rate": 8.638535031847134e-06, "loss": 0.894, "step": 855 }, { "epoch": 2.77, "grad_norm": 3.1144227981567383, "learning_rate": 8.614649681528664e-06, "loss": 0.9718, "step": 870 }, { "epoch": 2.82, "grad_norm": 3.4376463890075684, "learning_rate": 8.590764331210192e-06, "loss": 0.9247, "step": 885 }, { "epoch": 2.87, "grad_norm": 3.0620603561401367, "learning_rate": 8.566878980891721e-06, "loss": 0.9559, "step": 900 }, { "epoch": 2.87, "eval_accuracy": 0.8076190476190476, "eval_loss": 0.9138039946556091, "eval_runtime": 13.8645, "eval_samples_per_second": 75.733, "eval_steps_per_second": 9.521, "step": 900 }, { "epoch": 2.91, "grad_norm": 3.61348557472229, "learning_rate": 8.54299363057325e-06, "loss": 0.8909, "step": 915 }, { "epoch": 2.96, "grad_norm": 3.5213446617126465, "learning_rate": 8.519108280254777e-06, "loss": 0.8482, "step": 930 }, { "epoch": 3.01, "grad_norm": 3.8047091960906982, "learning_rate": 8.495222929936307e-06, "loss": 0.9267, "step": 945 }, { "epoch": 3.06, "grad_norm": 4.242620468139648, "learning_rate": 8.471337579617835e-06, "loss": 0.8823, "step": 960 }, { "epoch": 3.11, "grad_norm": 3.3440823554992676, "learning_rate": 8.447452229299363e-06, "loss": 0.8623, "step": 975 }, { "epoch": 3.15, "grad_norm": 4.062607288360596, "learning_rate": 8.423566878980893e-06, "loss": 0.858, "step": 990 }, { "epoch": 3.18, "eval_accuracy": 0.8247619047619048, "eval_loss": 0.860478401184082, "eval_runtime": 14.0508, "eval_samples_per_second": 74.729, "eval_steps_per_second": 9.394, "step": 1000 }, { "epoch": 3.2, "grad_norm": 2.4473769664764404, "learning_rate": 8.39968152866242e-06, "loss": 0.8473, "step": 1005 }, { "epoch": 3.25, "grad_norm": 5.5681023597717285, "learning_rate": 8.37579617834395e-06, "loss": 0.865, "step": 1020 }, { "epoch": 3.3, "grad_norm": 2.966853618621826, "learning_rate": 8.351910828025478e-06, "loss": 0.8306, "step": 1035 }, { "epoch": 3.34, "grad_norm": 5.360078811645508, "learning_rate": 8.328025477707006e-06, "loss": 0.7947, "step": 1050 }, { "epoch": 3.39, "grad_norm": 5.492262840270996, "learning_rate": 8.304140127388536e-06, "loss": 0.8343, "step": 1065 }, { "epoch": 3.44, "grad_norm": 2.8821961879730225, "learning_rate": 8.280254777070064e-06, "loss": 0.7892, "step": 1080 }, { "epoch": 3.49, "grad_norm": 3.5130903720855713, "learning_rate": 8.256369426751592e-06, "loss": 0.7858, "step": 1095 }, { "epoch": 3.5, "eval_accuracy": 0.8371428571428572, "eval_loss": 0.8164299726486206, "eval_runtime": 14.1175, "eval_samples_per_second": 74.376, "eval_steps_per_second": 9.35, "step": 1100 }, { "epoch": 3.54, "grad_norm": 4.600940704345703, "learning_rate": 8.232484076433122e-06, "loss": 0.8238, "step": 1110 }, { "epoch": 3.58, "grad_norm": 2.8502960205078125, "learning_rate": 8.208598726114651e-06, "loss": 0.8122, "step": 1125 }, { "epoch": 3.63, "grad_norm": 3.0440945625305176, "learning_rate": 8.18471337579618e-06, "loss": 0.7492, "step": 1140 }, { "epoch": 3.68, "grad_norm": 3.4030416011810303, "learning_rate": 8.160828025477707e-06, "loss": 0.7699, "step": 1155 }, { "epoch": 3.73, "grad_norm": 6.065947532653809, "learning_rate": 8.136942675159237e-06, "loss": 0.7903, "step": 1170 }, { "epoch": 3.77, "grad_norm": 3.8829727172851562, "learning_rate": 8.113057324840765e-06, "loss": 0.7977, "step": 1185 }, { "epoch": 3.82, "grad_norm": 3.397552013397217, "learning_rate": 8.089171974522295e-06, "loss": 0.7898, "step": 1200 }, { "epoch": 3.82, "eval_accuracy": 0.8333333333333334, "eval_loss": 0.7916920781135559, "eval_runtime": 14.238, "eval_samples_per_second": 73.746, "eval_steps_per_second": 9.271, "step": 1200 }, { "epoch": 3.87, "grad_norm": 3.440532684326172, "learning_rate": 8.065286624203823e-06, "loss": 0.7507, "step": 1215 }, { "epoch": 3.92, "grad_norm": 2.4461238384246826, "learning_rate": 8.04140127388535e-06, "loss": 0.7454, "step": 1230 }, { "epoch": 3.96, "grad_norm": 2.5467453002929688, "learning_rate": 8.01751592356688e-06, "loss": 0.7682, "step": 1245 }, { "epoch": 4.01, "grad_norm": 5.4643025398254395, "learning_rate": 7.993630573248408e-06, "loss": 0.7136, "step": 1260 }, { "epoch": 4.06, "grad_norm": 4.570471286773682, "learning_rate": 7.969745222929936e-06, "loss": 0.6882, "step": 1275 }, { "epoch": 4.11, "grad_norm": 2.5436484813690186, "learning_rate": 7.945859872611466e-06, "loss": 0.6909, "step": 1290 }, { "epoch": 4.14, "eval_accuracy": 0.8038095238095239, "eval_loss": 0.7995317578315735, "eval_runtime": 14.274, "eval_samples_per_second": 73.56, "eval_steps_per_second": 9.248, "step": 1300 }, { "epoch": 4.16, "grad_norm": 5.001487731933594, "learning_rate": 7.921974522292994e-06, "loss": 0.7197, "step": 1305 }, { "epoch": 4.2, "grad_norm": 3.342618465423584, "learning_rate": 7.898089171974524e-06, "loss": 0.6634, "step": 1320 }, { "epoch": 4.25, "grad_norm": 4.680523872375488, "learning_rate": 7.874203821656052e-06, "loss": 0.6952, "step": 1335 }, { "epoch": 4.3, "grad_norm": 4.881319522857666, "learning_rate": 7.85031847133758e-06, "loss": 0.7134, "step": 1350 }, { "epoch": 4.35, "grad_norm": 5.452937126159668, "learning_rate": 7.82643312101911e-06, "loss": 0.7078, "step": 1365 }, { "epoch": 4.39, "grad_norm": 3.7072432041168213, "learning_rate": 7.802547770700637e-06, "loss": 0.6691, "step": 1380 }, { "epoch": 4.44, "grad_norm": 4.637836456298828, "learning_rate": 7.778662420382165e-06, "loss": 0.6619, "step": 1395 }, { "epoch": 4.46, "eval_accuracy": 0.7828571428571428, "eval_loss": 0.8194388151168823, "eval_runtime": 14.1277, "eval_samples_per_second": 74.322, "eval_steps_per_second": 9.343, "step": 1400 }, { "epoch": 4.49, "grad_norm": 4.957500457763672, "learning_rate": 7.754777070063695e-06, "loss": 0.6907, "step": 1410 }, { "epoch": 4.54, "grad_norm": 6.341672420501709, "learning_rate": 7.730891719745223e-06, "loss": 0.7597, "step": 1425 }, { "epoch": 4.59, "grad_norm": 5.114958763122559, "learning_rate": 7.707006369426753e-06, "loss": 0.6816, "step": 1440 }, { "epoch": 4.63, "grad_norm": 4.16578483581543, "learning_rate": 7.68312101910828e-06, "loss": 0.7146, "step": 1455 }, { "epoch": 4.68, "grad_norm": 4.3378071784973145, "learning_rate": 7.659235668789809e-06, "loss": 0.666, "step": 1470 }, { "epoch": 4.73, "grad_norm": 3.4012575149536133, "learning_rate": 7.635350318471338e-06, "loss": 0.654, "step": 1485 }, { "epoch": 4.78, "grad_norm": 4.172365188598633, "learning_rate": 7.611464968152867e-06, "loss": 0.6457, "step": 1500 }, { "epoch": 4.78, "eval_accuracy": 0.8085714285714286, "eval_loss": 0.7536157369613647, "eval_runtime": 14.3149, "eval_samples_per_second": 73.35, "eval_steps_per_second": 9.221, "step": 1500 }, { "epoch": 4.82, "grad_norm": 4.036574840545654, "learning_rate": 7.587579617834395e-06, "loss": 0.6452, "step": 1515 }, { "epoch": 4.87, "grad_norm": 3.928567409515381, "learning_rate": 7.563694267515924e-06, "loss": 0.6374, "step": 1530 }, { "epoch": 4.92, "grad_norm": 4.106697082519531, "learning_rate": 7.539808917197453e-06, "loss": 0.7006, "step": 1545 }, { "epoch": 4.97, "grad_norm": 4.1516242027282715, "learning_rate": 7.515923566878982e-06, "loss": 0.5827, "step": 1560 }, { "epoch": 5.02, "grad_norm": 4.836935043334961, "learning_rate": 7.49203821656051e-06, "loss": 0.5722, "step": 1575 }, { "epoch": 5.06, "grad_norm": 3.066807985305786, "learning_rate": 7.468152866242039e-06, "loss": 0.6155, "step": 1590 }, { "epoch": 5.1, "eval_accuracy": 0.8257142857142857, "eval_loss": 0.7212250828742981, "eval_runtime": 14.0875, "eval_samples_per_second": 74.534, "eval_steps_per_second": 9.37, "step": 1600 }, { "epoch": 5.11, "grad_norm": 4.183410167694092, "learning_rate": 7.4442675159235675e-06, "loss": 0.5963, "step": 1605 }, { "epoch": 5.16, "grad_norm": 3.687758207321167, "learning_rate": 7.4203821656050955e-06, "loss": 0.6106, "step": 1620 }, { "epoch": 5.21, "grad_norm": 3.360952138900757, "learning_rate": 7.396496815286624e-06, "loss": 0.6153, "step": 1635 }, { "epoch": 5.25, "grad_norm": 6.3952507972717285, "learning_rate": 7.372611464968153e-06, "loss": 0.6328, "step": 1650 }, { "epoch": 5.3, "grad_norm": 3.9628074169158936, "learning_rate": 7.348726114649683e-06, "loss": 0.6233, "step": 1665 }, { "epoch": 5.35, "grad_norm": 5.332892417907715, "learning_rate": 7.32484076433121e-06, "loss": 0.6638, "step": 1680 }, { "epoch": 5.4, "grad_norm": 4.033740043640137, "learning_rate": 7.300955414012739e-06, "loss": 0.5511, "step": 1695 }, { "epoch": 5.41, "eval_accuracy": 0.8095238095238095, "eval_loss": 0.7273786067962646, "eval_runtime": 14.0255, "eval_samples_per_second": 74.864, "eval_steps_per_second": 9.411, "step": 1700 }, { "epoch": 5.45, "grad_norm": 2.9152443408966064, "learning_rate": 7.2770700636942685e-06, "loss": 0.5926, "step": 1710 }, { "epoch": 5.49, "grad_norm": 4.412004470825195, "learning_rate": 7.253184713375797e-06, "loss": 0.575, "step": 1725 }, { "epoch": 5.54, "grad_norm": 4.271115779876709, "learning_rate": 7.2292993630573245e-06, "loss": 0.5522, "step": 1740 }, { "epoch": 5.59, "grad_norm": 5.693962574005127, "learning_rate": 7.205414012738854e-06, "loss": 0.5918, "step": 1755 }, { "epoch": 5.64, "grad_norm": 2.9795901775360107, "learning_rate": 7.181528662420383e-06, "loss": 0.5804, "step": 1770 }, { "epoch": 5.68, "grad_norm": 4.588832855224609, "learning_rate": 7.157643312101912e-06, "loss": 0.6135, "step": 1785 }, { "epoch": 5.73, "grad_norm": 5.463626861572266, "learning_rate": 7.13375796178344e-06, "loss": 0.5486, "step": 1800 }, { "epoch": 5.73, "eval_accuracy": 0.8285714285714286, "eval_loss": 0.7047600150108337, "eval_runtime": 14.2281, "eval_samples_per_second": 73.798, "eval_steps_per_second": 9.277, "step": 1800 }, { "epoch": 5.78, "grad_norm": 4.97993278503418, "learning_rate": 7.109872611464969e-06, "loss": 0.5619, "step": 1815 }, { "epoch": 5.83, "grad_norm": 2.6514835357666016, "learning_rate": 7.085987261146498e-06, "loss": 0.5108, "step": 1830 }, { "epoch": 5.88, "grad_norm": 3.859992742538452, "learning_rate": 7.0621019108280264e-06, "loss": 0.586, "step": 1845 }, { "epoch": 5.92, "grad_norm": 4.4486494064331055, "learning_rate": 7.0382165605095544e-06, "loss": 0.5856, "step": 1860 }, { "epoch": 5.97, "grad_norm": 4.080160617828369, "learning_rate": 7.014331210191083e-06, "loss": 0.5792, "step": 1875 }, { "epoch": 6.02, "grad_norm": 5.243125915527344, "learning_rate": 6.990445859872612e-06, "loss": 0.5679, "step": 1890 }, { "epoch": 6.05, "eval_accuracy": 0.8180952380952381, "eval_loss": 0.712360143661499, "eval_runtime": 14.0568, "eval_samples_per_second": 74.697, "eval_steps_per_second": 9.39, "step": 1900 }, { "epoch": 6.07, "grad_norm": 3.062335252761841, "learning_rate": 6.966560509554141e-06, "loss": 0.515, "step": 1905 }, { "epoch": 6.11, "grad_norm": 2.6931862831115723, "learning_rate": 6.942675159235669e-06, "loss": 0.5465, "step": 1920 }, { "epoch": 6.16, "grad_norm": 6.212809085845947, "learning_rate": 6.918789808917198e-06, "loss": 0.549, "step": 1935 }, { "epoch": 6.21, "grad_norm": 2.02023983001709, "learning_rate": 6.894904458598727e-06, "loss": 0.5534, "step": 1950 }, { "epoch": 6.26, "grad_norm": 4.084249496459961, "learning_rate": 6.8710191082802555e-06, "loss": 0.5061, "step": 1965 }, { "epoch": 6.31, "grad_norm": 9.472137451171875, "learning_rate": 6.8471337579617835e-06, "loss": 0.5017, "step": 1980 }, { "epoch": 6.35, "grad_norm": 3.419933319091797, "learning_rate": 6.823248407643312e-06, "loss": 0.4914, "step": 1995 }, { "epoch": 6.37, "eval_accuracy": 0.800952380952381, "eval_loss": 0.7276927828788757, "eval_runtime": 14.1066, "eval_samples_per_second": 74.433, "eval_steps_per_second": 9.357, "step": 2000 }, { "epoch": 6.4, "grad_norm": 2.971423387527466, "learning_rate": 6.799363057324841e-06, "loss": 0.5342, "step": 2010 }, { "epoch": 6.45, "grad_norm": 3.118852376937866, "learning_rate": 6.77547770700637e-06, "loss": 0.5055, "step": 2025 }, { "epoch": 6.5, "grad_norm": 3.238327741622925, "learning_rate": 6.751592356687898e-06, "loss": 0.5287, "step": 2040 }, { "epoch": 6.54, "grad_norm": 4.170012950897217, "learning_rate": 6.727707006369427e-06, "loss": 0.5199, "step": 2055 }, { "epoch": 6.59, "grad_norm": 6.786019325256348, "learning_rate": 6.7038216560509565e-06, "loss": 0.5561, "step": 2070 }, { "epoch": 6.64, "grad_norm": 4.666538715362549, "learning_rate": 6.679936305732485e-06, "loss": 0.4712, "step": 2085 }, { "epoch": 6.69, "grad_norm": 3.341716766357422, "learning_rate": 6.6560509554140125e-06, "loss": 0.525, "step": 2100 }, { "epoch": 6.69, "eval_accuracy": 0.8123809523809524, "eval_loss": 0.6971268653869629, "eval_runtime": 14.2262, "eval_samples_per_second": 73.808, "eval_steps_per_second": 9.279, "step": 2100 }, { "epoch": 6.74, "grad_norm": 8.099006652832031, "learning_rate": 6.632165605095542e-06, "loss": 0.5809, "step": 2115 }, { "epoch": 6.78, "grad_norm": 4.491795063018799, "learning_rate": 6.608280254777071e-06, "loss": 0.4867, "step": 2130 }, { "epoch": 6.83, "grad_norm": 2.734052896499634, "learning_rate": 6.5843949044586e-06, "loss": 0.4816, "step": 2145 }, { "epoch": 6.88, "grad_norm": 6.541310787200928, "learning_rate": 6.560509554140128e-06, "loss": 0.4877, "step": 2160 }, { "epoch": 6.93, "grad_norm": 4.009987831115723, "learning_rate": 6.536624203821657e-06, "loss": 0.5303, "step": 2175 }, { "epoch": 6.97, "grad_norm": 5.541009902954102, "learning_rate": 6.5127388535031856e-06, "loss": 0.5081, "step": 2190 }, { "epoch": 7.01, "eval_accuracy": 0.8161904761904762, "eval_loss": 0.6869356632232666, "eval_runtime": 14.0402, "eval_samples_per_second": 74.785, "eval_steps_per_second": 9.402, "step": 2200 }, { "epoch": 7.02, "grad_norm": 3.370013475418091, "learning_rate": 6.488853503184714e-06, "loss": 0.441, "step": 2205 }, { "epoch": 7.07, "grad_norm": 3.942737340927124, "learning_rate": 6.464968152866242e-06, "loss": 0.4422, "step": 2220 }, { "epoch": 7.12, "grad_norm": 4.051229476928711, "learning_rate": 6.441082802547771e-06, "loss": 0.4439, "step": 2235 }, { "epoch": 7.17, "grad_norm": 4.835233211517334, "learning_rate": 6.4171974522293e-06, "loss": 0.4789, "step": 2250 }, { "epoch": 7.21, "grad_norm": 5.995761871337891, "learning_rate": 6.393312101910829e-06, "loss": 0.4776, "step": 2265 }, { "epoch": 7.26, "grad_norm": 3.5410878658294678, "learning_rate": 6.369426751592357e-06, "loss": 0.5077, "step": 2280 }, { "epoch": 7.31, "grad_norm": 5.137345790863037, "learning_rate": 6.345541401273886e-06, "loss": 0.5072, "step": 2295 }, { "epoch": 7.32, "eval_accuracy": 0.8076190476190476, "eval_loss": 0.6836773157119751, "eval_runtime": 14.3682, "eval_samples_per_second": 73.078, "eval_steps_per_second": 9.187, "step": 2300 }, { "epoch": 7.36, "grad_norm": 7.055994987487793, "learning_rate": 6.321656050955415e-06, "loss": 0.4419, "step": 2310 }, { "epoch": 7.4, "grad_norm": 5.709652423858643, "learning_rate": 6.2977707006369435e-06, "loss": 0.485, "step": 2325 }, { "epoch": 7.45, "grad_norm": 3.7861547470092773, "learning_rate": 6.2738853503184715e-06, "loss": 0.4473, "step": 2340 }, { "epoch": 7.5, "grad_norm": 5.255743503570557, "learning_rate": 6.25e-06, "loss": 0.4558, "step": 2355 }, { "epoch": 7.55, "grad_norm": 3.864764928817749, "learning_rate": 6.226114649681529e-06, "loss": 0.4522, "step": 2370 }, { "epoch": 7.6, "grad_norm": 3.3340399265289307, "learning_rate": 6.202229299363057e-06, "loss": 0.4358, "step": 2385 }, { "epoch": 7.64, "grad_norm": 6.069746971130371, "learning_rate": 6.178343949044586e-06, "loss": 0.4702, "step": 2400 }, { "epoch": 7.64, "eval_accuracy": 0.8152380952380952, "eval_loss": 0.6736096739768982, "eval_runtime": 14.301, "eval_samples_per_second": 73.421, "eval_steps_per_second": 9.23, "step": 2400 }, { "epoch": 7.69, "grad_norm": 8.816964149475098, "learning_rate": 6.154458598726115e-06, "loss": 0.4847, "step": 2415 }, { "epoch": 7.74, "grad_norm": 2.001786708831787, "learning_rate": 6.1305732484076445e-06, "loss": 0.5597, "step": 2430 }, { "epoch": 7.79, "grad_norm": 4.528175354003906, "learning_rate": 6.106687898089172e-06, "loss": 0.4268, "step": 2445 }, { "epoch": 7.83, "grad_norm": 5.561005115509033, "learning_rate": 6.0828025477707005e-06, "loss": 0.4001, "step": 2460 }, { "epoch": 7.88, "grad_norm": 5.51016902923584, "learning_rate": 6.05891719745223e-06, "loss": 0.4159, "step": 2475 }, { "epoch": 7.93, "grad_norm": 2.6793384552001953, "learning_rate": 6.035031847133759e-06, "loss": 0.4303, "step": 2490 }, { "epoch": 7.96, "eval_accuracy": 0.8104761904761905, "eval_loss": 0.6692745685577393, "eval_runtime": 14.1903, "eval_samples_per_second": 73.994, "eval_steps_per_second": 9.302, "step": 2500 }, { "epoch": 7.98, "grad_norm": 3.1929242610931396, "learning_rate": 6.011146496815287e-06, "loss": 0.43, "step": 2505 }, { "epoch": 8.03, "grad_norm": 3.281541585922241, "learning_rate": 5.987261146496816e-06, "loss": 0.4189, "step": 2520 }, { "epoch": 8.07, "grad_norm": 4.396710395812988, "learning_rate": 5.963375796178345e-06, "loss": 0.4211, "step": 2535 }, { "epoch": 8.12, "grad_norm": 2.817596435546875, "learning_rate": 5.9394904458598736e-06, "loss": 0.4409, "step": 2550 }, { "epoch": 8.17, "grad_norm": 5.4012250900268555, "learning_rate": 5.9156050955414016e-06, "loss": 0.4407, "step": 2565 }, { "epoch": 8.22, "grad_norm": 8.333674430847168, "learning_rate": 5.89171974522293e-06, "loss": 0.4267, "step": 2580 }, { "epoch": 8.26, "grad_norm": 3.2275092601776123, "learning_rate": 5.867834394904459e-06, "loss": 0.3916, "step": 2595 }, { "epoch": 8.28, "eval_accuracy": 0.8238095238095238, "eval_loss": 0.6487377882003784, "eval_runtime": 13.7782, "eval_samples_per_second": 76.208, "eval_steps_per_second": 9.58, "step": 2600 }, { "epoch": 8.31, "grad_norm": 6.641006946563721, "learning_rate": 5.843949044585988e-06, "loss": 0.4032, "step": 2610 }, { "epoch": 8.36, "grad_norm": 7.206479072570801, "learning_rate": 5.820063694267516e-06, "loss": 0.4515, "step": 2625 }, { "epoch": 8.41, "grad_norm": 8.875749588012695, "learning_rate": 5.796178343949045e-06, "loss": 0.4438, "step": 2640 }, { "epoch": 8.46, "grad_norm": 8.141197204589844, "learning_rate": 5.772292993630574e-06, "loss": 0.439, "step": 2655 }, { "epoch": 8.5, "grad_norm": 5.922451019287109, "learning_rate": 5.748407643312103e-06, "loss": 0.4322, "step": 2670 }, { "epoch": 8.55, "grad_norm": 5.907478332519531, "learning_rate": 5.724522292993631e-06, "loss": 0.5118, "step": 2685 }, { "epoch": 8.6, "grad_norm": 7.069815635681152, "learning_rate": 5.7006369426751594e-06, "loss": 0.4002, "step": 2700 }, { "epoch": 8.6, "eval_accuracy": 0.8161904761904762, "eval_loss": 0.6660885214805603, "eval_runtime": 14.4979, "eval_samples_per_second": 72.424, "eval_steps_per_second": 9.105, "step": 2700 }, { "epoch": 8.65, "grad_norm": 5.684536457061768, "learning_rate": 5.676751592356688e-06, "loss": 0.4734, "step": 2715 }, { "epoch": 8.69, "grad_norm": 6.170379161834717, "learning_rate": 5.652866242038217e-06, "loss": 0.3783, "step": 2730 }, { "epoch": 8.74, "grad_norm": 4.068530082702637, "learning_rate": 5.628980891719745e-06, "loss": 0.4746, "step": 2745 }, { "epoch": 8.79, "grad_norm": 4.95632791519165, "learning_rate": 5.605095541401274e-06, "loss": 0.3979, "step": 2760 }, { "epoch": 8.84, "grad_norm": 2.6091508865356445, "learning_rate": 5.581210191082803e-06, "loss": 0.4066, "step": 2775 }, { "epoch": 8.89, "grad_norm": 4.2863030433654785, "learning_rate": 5.5573248407643325e-06, "loss": 0.3965, "step": 2790 }, { "epoch": 8.92, "eval_accuracy": 0.8142857142857143, "eval_loss": 0.6611486673355103, "eval_runtime": 14.1692, "eval_samples_per_second": 74.104, "eval_steps_per_second": 9.316, "step": 2800 }, { "epoch": 8.93, "grad_norm": 4.6702561378479, "learning_rate": 5.53343949044586e-06, "loss": 0.4176, "step": 2805 }, { "epoch": 8.98, "grad_norm": 7.670042037963867, "learning_rate": 5.5095541401273885e-06, "loss": 0.4, "step": 2820 }, { "epoch": 9.03, "grad_norm": 5.234184741973877, "learning_rate": 5.485668789808918e-06, "loss": 0.4021, "step": 2835 }, { "epoch": 9.08, "grad_norm": 5.475564479827881, "learning_rate": 5.461783439490447e-06, "loss": 0.3984, "step": 2850 }, { "epoch": 9.12, "grad_norm": 3.48297119140625, "learning_rate": 5.437898089171974e-06, "loss": 0.4107, "step": 2865 }, { "epoch": 9.17, "grad_norm": 5.547008514404297, "learning_rate": 5.414012738853504e-06, "loss": 0.4205, "step": 2880 }, { "epoch": 9.22, "grad_norm": 5.729915142059326, "learning_rate": 5.390127388535033e-06, "loss": 0.3946, "step": 2895 }, { "epoch": 9.24, "eval_accuracy": 0.8142857142857143, "eval_loss": 0.652323842048645, "eval_runtime": 14.2667, "eval_samples_per_second": 73.598, "eval_steps_per_second": 9.252, "step": 2900 }, { "epoch": 9.27, "grad_norm": 3.7174131870269775, "learning_rate": 5.3662420382165615e-06, "loss": 0.4326, "step": 2910 }, { "epoch": 9.32, "grad_norm": 4.2326130867004395, "learning_rate": 5.3423566878980895e-06, "loss": 0.3826, "step": 2925 }, { "epoch": 9.36, "grad_norm": 6.576159954071045, "learning_rate": 5.318471337579618e-06, "loss": 0.4053, "step": 2940 }, { "epoch": 9.41, "grad_norm": 3.004859685897827, "learning_rate": 5.294585987261147e-06, "loss": 0.3833, "step": 2955 }, { "epoch": 9.46, "grad_norm": 4.646060943603516, "learning_rate": 5.270700636942676e-06, "loss": 0.3586, "step": 2970 }, { "epoch": 9.51, "grad_norm": 3.8182735443115234, "learning_rate": 5.246815286624204e-06, "loss": 0.3463, "step": 2985 }, { "epoch": 9.55, "grad_norm": 4.330494403839111, "learning_rate": 5.222929936305733e-06, "loss": 0.3794, "step": 3000 }, { "epoch": 9.55, "eval_accuracy": 0.8047619047619048, "eval_loss": 0.6615909934043884, "eval_runtime": 13.9408, "eval_samples_per_second": 75.318, "eval_steps_per_second": 9.469, "step": 3000 }, { "epoch": 9.6, "grad_norm": 5.949002742767334, "learning_rate": 5.199044585987262e-06, "loss": 0.3572, "step": 3015 }, { "epoch": 9.65, "grad_norm": 4.592197418212891, "learning_rate": 5.175159235668791e-06, "loss": 0.3428, "step": 3030 }, { "epoch": 9.7, "grad_norm": 6.483883857727051, "learning_rate": 5.151273885350319e-06, "loss": 0.4247, "step": 3045 }, { "epoch": 9.75, "grad_norm": 4.930222988128662, "learning_rate": 5.1273885350318474e-06, "loss": 0.467, "step": 3060 }, { "epoch": 9.79, "grad_norm": 4.973721504211426, "learning_rate": 5.103503184713376e-06, "loss": 0.3293, "step": 3075 }, { "epoch": 9.84, "grad_norm": 3.4401309490203857, "learning_rate": 5.079617834394905e-06, "loss": 0.3257, "step": 3090 }, { "epoch": 9.87, "eval_accuracy": 0.8028571428571428, "eval_loss": 0.6717351675033569, "eval_runtime": 14.2902, "eval_samples_per_second": 73.477, "eval_steps_per_second": 9.237, "step": 3100 }, { "epoch": 9.89, "grad_norm": 3.4736809730529785, "learning_rate": 5.055732484076433e-06, "loss": 0.3339, "step": 3105 }, { "epoch": 9.94, "grad_norm": 9.58438777923584, "learning_rate": 5.031847133757962e-06, "loss": 0.3268, "step": 3120 }, { "epoch": 9.98, "grad_norm": 5.36261510848999, "learning_rate": 5.007961783439491e-06, "loss": 0.3418, "step": 3135 }, { "epoch": 10.03, "grad_norm": 4.828367710113525, "learning_rate": 4.98407643312102e-06, "loss": 0.4032, "step": 3150 }, { "epoch": 10.08, "grad_norm": 3.9634041786193848, "learning_rate": 4.960191082802548e-06, "loss": 0.3709, "step": 3165 }, { "epoch": 10.13, "grad_norm": 4.182370185852051, "learning_rate": 4.9363057324840765e-06, "loss": 0.4119, "step": 3180 }, { "epoch": 10.18, "grad_norm": 2.144193410873413, "learning_rate": 4.912420382165605e-06, "loss": 0.4175, "step": 3195 }, { "epoch": 10.19, "eval_accuracy": 0.8057142857142857, "eval_loss": 0.6530217528343201, "eval_runtime": 14.3184, "eval_samples_per_second": 73.332, "eval_steps_per_second": 9.219, "step": 3200 }, { "epoch": 10.22, "grad_norm": 6.0914716720581055, "learning_rate": 4.888535031847134e-06, "loss": 0.3614, "step": 3210 }, { "epoch": 10.27, "grad_norm": 7.094061851501465, "learning_rate": 4.864649681528662e-06, "loss": 0.4015, "step": 3225 }, { "epoch": 10.32, "grad_norm": 6.06875467300415, "learning_rate": 4.840764331210192e-06, "loss": 0.3543, "step": 3240 }, { "epoch": 10.37, "grad_norm": 5.071672439575195, "learning_rate": 4.81687898089172e-06, "loss": 0.3798, "step": 3255 }, { "epoch": 10.41, "grad_norm": 3.41487979888916, "learning_rate": 4.792993630573249e-06, "loss": 0.3878, "step": 3270 }, { "epoch": 10.46, "grad_norm": 6.576408386230469, "learning_rate": 4.7691082802547775e-06, "loss": 0.4392, "step": 3285 }, { "epoch": 10.51, "grad_norm": 4.69280481338501, "learning_rate": 4.745222929936306e-06, "loss": 0.3559, "step": 3300 }, { "epoch": 10.51, "eval_accuracy": 0.7885714285714286, "eval_loss": 0.6882754564285278, "eval_runtime": 14.2678, "eval_samples_per_second": 73.592, "eval_steps_per_second": 9.252, "step": 3300 }, { "epoch": 10.56, "grad_norm": 3.288024663925171, "learning_rate": 4.721337579617834e-06, "loss": 0.36, "step": 3315 }, { "epoch": 10.61, "grad_norm": 5.169658660888672, "learning_rate": 4.697452229299363e-06, "loss": 0.3983, "step": 3330 }, { "epoch": 10.65, "grad_norm": 6.143041610717773, "learning_rate": 4.673566878980892e-06, "loss": 0.3467, "step": 3345 }, { "epoch": 10.7, "grad_norm": 4.744012355804443, "learning_rate": 4.649681528662421e-06, "loss": 0.3497, "step": 3360 }, { "epoch": 10.75, "grad_norm": 3.161777973175049, "learning_rate": 4.625796178343949e-06, "loss": 0.3649, "step": 3375 }, { "epoch": 10.8, "grad_norm": 5.027173042297363, "learning_rate": 4.601910828025479e-06, "loss": 0.3824, "step": 3390 }, { "epoch": 10.83, "eval_accuracy": 0.8, "eval_loss": 0.6610695719718933, "eval_runtime": 14.447, "eval_samples_per_second": 72.68, "eval_steps_per_second": 9.137, "step": 3400 }, { "epoch": 10.84, "grad_norm": 4.828523635864258, "learning_rate": 4.5780254777070066e-06, "loss": 0.3232, "step": 3405 }, { "epoch": 10.89, "grad_norm": 6.017237186431885, "learning_rate": 4.554140127388535e-06, "loss": 0.3464, "step": 3420 }, { "epoch": 10.94, "grad_norm": 4.829287528991699, "learning_rate": 4.530254777070064e-06, "loss": 0.3401, "step": 3435 }, { "epoch": 10.99, "grad_norm": 4.1853485107421875, "learning_rate": 4.506369426751593e-06, "loss": 0.3005, "step": 3450 }, { "epoch": 11.04, "grad_norm": 10.277836799621582, "learning_rate": 4.482484076433121e-06, "loss": 0.3473, "step": 3465 }, { "epoch": 11.08, "grad_norm": 3.646463394165039, "learning_rate": 4.45859872611465e-06, "loss": 0.2844, "step": 3480 }, { "epoch": 11.13, "grad_norm": 5.018289089202881, "learning_rate": 4.434713375796179e-06, "loss": 0.3589, "step": 3495 }, { "epoch": 11.15, "eval_accuracy": 0.8019047619047619, "eval_loss": 0.665899932384491, "eval_runtime": 13.9371, "eval_samples_per_second": 75.339, "eval_steps_per_second": 9.471, "step": 3500 }, { "epoch": 11.18, "grad_norm": 5.166079044342041, "learning_rate": 4.410828025477708e-06, "loss": 0.3608, "step": 3510 }, { "epoch": 11.23, "grad_norm": 3.933661937713623, "learning_rate": 4.386942675159236e-06, "loss": 0.323, "step": 3525 }, { "epoch": 11.27, "grad_norm": 6.12958288192749, "learning_rate": 4.3630573248407645e-06, "loss": 0.4135, "step": 3540 }, { "epoch": 11.32, "grad_norm": 8.609708786010742, "learning_rate": 4.339171974522293e-06, "loss": 0.3714, "step": 3555 }, { "epoch": 11.37, "grad_norm": 5.927934646606445, "learning_rate": 4.315286624203822e-06, "loss": 0.3629, "step": 3570 }, { "epoch": 11.42, "grad_norm": 3.802996873855591, "learning_rate": 4.29140127388535e-06, "loss": 0.3184, "step": 3585 }, { "epoch": 11.46, "grad_norm": 6.791957378387451, "learning_rate": 4.26751592356688e-06, "loss": 0.3299, "step": 3600 }, { "epoch": 11.46, "eval_accuracy": 0.7961904761904762, "eval_loss": 0.681881844997406, "eval_runtime": 14.3394, "eval_samples_per_second": 73.225, "eval_steps_per_second": 9.205, "step": 3600 }, { "epoch": 11.51, "grad_norm": 6.895874977111816, "learning_rate": 4.243630573248408e-06, "loss": 0.3421, "step": 3615 }, { "epoch": 11.56, "grad_norm": 2.5626707077026367, "learning_rate": 4.219745222929937e-06, "loss": 0.333, "step": 3630 }, { "epoch": 11.61, "grad_norm": 6.374701023101807, "learning_rate": 4.1958598726114655e-06, "loss": 0.3841, "step": 3645 }, { "epoch": 11.66, "grad_norm": 3.2361741065979004, "learning_rate": 4.171974522292994e-06, "loss": 0.3102, "step": 3660 }, { "epoch": 11.7, "grad_norm": 4.866982936859131, "learning_rate": 4.148089171974522e-06, "loss": 0.3589, "step": 3675 }, { "epoch": 11.75, "grad_norm": 8.816326141357422, "learning_rate": 4.124203821656051e-06, "loss": 0.3736, "step": 3690 }, { "epoch": 11.78, "eval_accuracy": 0.8114285714285714, "eval_loss": 0.6405251026153564, "eval_runtime": 14.9862, "eval_samples_per_second": 70.064, "eval_steps_per_second": 8.808, "step": 3700 }, { "epoch": 11.8, "grad_norm": 5.2895121574401855, "learning_rate": 4.10031847133758e-06, "loss": 0.354, "step": 3705 }, { "epoch": 11.85, "grad_norm": 7.5266499519348145, "learning_rate": 4.076433121019109e-06, "loss": 0.3051, "step": 3720 }, { "epoch": 11.89, "grad_norm": 4.967874050140381, "learning_rate": 4.052547770700637e-06, "loss": 0.3428, "step": 3735 }, { "epoch": 11.94, "grad_norm": 4.581750392913818, "learning_rate": 4.0286624203821666e-06, "loss": 0.3441, "step": 3750 }, { "epoch": 11.99, "grad_norm": 3.4178411960601807, "learning_rate": 4.0047770700636946e-06, "loss": 0.3154, "step": 3765 }, { "epoch": 12.04, "grad_norm": 7.328088760375977, "learning_rate": 3.980891719745223e-06, "loss": 0.3493, "step": 3780 }, { "epoch": 12.09, "grad_norm": 24.022367477416992, "learning_rate": 3.957006369426752e-06, "loss": 0.3576, "step": 3795 }, { "epoch": 12.1, "eval_accuracy": 0.7961904761904762, "eval_loss": 0.6725224256515503, "eval_runtime": 14.2063, "eval_samples_per_second": 73.911, "eval_steps_per_second": 9.292, "step": 3800 }, { "epoch": 12.13, "grad_norm": 2.5664048194885254, "learning_rate": 3.933121019108281e-06, "loss": 0.3368, "step": 3810 }, { "epoch": 12.18, "grad_norm": 7.625591278076172, "learning_rate": 3.909235668789809e-06, "loss": 0.3103, "step": 3825 }, { "epoch": 12.23, "grad_norm": 3.343187093734741, "learning_rate": 3.885350318471338e-06, "loss": 0.3318, "step": 3840 }, { "epoch": 12.28, "grad_norm": 7.195040702819824, "learning_rate": 3.861464968152867e-06, "loss": 0.2984, "step": 3855 }, { "epoch": 12.32, "grad_norm": 5.504820346832275, "learning_rate": 3.837579617834396e-06, "loss": 0.2789, "step": 3870 }, { "epoch": 12.37, "grad_norm": 7.319380760192871, "learning_rate": 3.813694267515924e-06, "loss": 0.289, "step": 3885 }, { "epoch": 12.42, "grad_norm": 8.737920761108398, "learning_rate": 3.789808917197453e-06, "loss": 0.3454, "step": 3900 }, { "epoch": 12.42, "eval_accuracy": 0.7942857142857143, "eval_loss": 0.7025014162063599, "eval_runtime": 13.9336, "eval_samples_per_second": 75.358, "eval_steps_per_second": 9.474, "step": 3900 }, { "epoch": 12.47, "grad_norm": 6.828339576721191, "learning_rate": 3.7659235668789813e-06, "loss": 0.3705, "step": 3915 }, { "epoch": 12.52, "grad_norm": 8.453927993774414, "learning_rate": 3.7420382165605097e-06, "loss": 0.3016, "step": 3930 }, { "epoch": 12.56, "grad_norm": 4.459524631500244, "learning_rate": 3.7181528662420386e-06, "loss": 0.2653, "step": 3945 }, { "epoch": 12.61, "grad_norm": 1.677200436592102, "learning_rate": 3.694267515923567e-06, "loss": 0.3191, "step": 3960 }, { "epoch": 12.66, "grad_norm": 6.538283824920654, "learning_rate": 3.670382165605096e-06, "loss": 0.3162, "step": 3975 }, { "epoch": 12.71, "grad_norm": 6.386500358581543, "learning_rate": 3.6464968152866242e-06, "loss": 0.3049, "step": 3990 }, { "epoch": 12.74, "eval_accuracy": 0.8133333333333334, "eval_loss": 0.6439189314842224, "eval_runtime": 14.144, "eval_samples_per_second": 74.237, "eval_steps_per_second": 9.333, "step": 4000 }, { "epoch": 12.75, "grad_norm": 10.936306953430176, "learning_rate": 3.622611464968153e-06, "loss": 0.2914, "step": 4005 }, { "epoch": 12.8, "grad_norm": 7.365331649780273, "learning_rate": 3.5987261146496815e-06, "loss": 0.2969, "step": 4020 }, { "epoch": 12.85, "grad_norm": 3.8908514976501465, "learning_rate": 3.5748407643312103e-06, "loss": 0.3224, "step": 4035 }, { "epoch": 12.9, "grad_norm": 5.096860885620117, "learning_rate": 3.5509554140127388e-06, "loss": 0.346, "step": 4050 }, { "epoch": 12.95, "grad_norm": 9.797178268432617, "learning_rate": 3.527070063694268e-06, "loss": 0.3392, "step": 4065 }, { "epoch": 12.99, "grad_norm": 1.396338939666748, "learning_rate": 3.5031847133757964e-06, "loss": 0.3111, "step": 4080 }, { "epoch": 13.04, "grad_norm": 5.88714599609375, "learning_rate": 3.4792993630573253e-06, "loss": 0.3363, "step": 4095 }, { "epoch": 13.06, "eval_accuracy": 0.8142857142857143, "eval_loss": 0.6351959109306335, "eval_runtime": 14.1319, "eval_samples_per_second": 74.3, "eval_steps_per_second": 9.341, "step": 4100 }, { "epoch": 13.09, "grad_norm": 3.2820403575897217, "learning_rate": 3.4554140127388537e-06, "loss": 0.3142, "step": 4110 }, { "epoch": 13.14, "grad_norm": 7.780394077301025, "learning_rate": 3.4315286624203825e-06, "loss": 0.355, "step": 4125 }, { "epoch": 13.18, "grad_norm": 4.955718517303467, "learning_rate": 3.407643312101911e-06, "loss": 0.2956, "step": 4140 }, { "epoch": 13.23, "grad_norm": 3.0316593647003174, "learning_rate": 3.38375796178344e-06, "loss": 0.2811, "step": 4155 }, { "epoch": 13.28, "grad_norm": 7.823929786682129, "learning_rate": 3.3598726114649682e-06, "loss": 0.3386, "step": 4170 }, { "epoch": 13.33, "grad_norm": 6.2089457511901855, "learning_rate": 3.335987261146497e-06, "loss": 0.3011, "step": 4185 }, { "epoch": 13.38, "grad_norm": 4.865994453430176, "learning_rate": 3.3121019108280255e-06, "loss": 0.3273, "step": 4200 }, { "epoch": 13.38, "eval_accuracy": 0.7885714285714286, "eval_loss": 0.6794772148132324, "eval_runtime": 13.9608, "eval_samples_per_second": 75.21, "eval_steps_per_second": 9.455, "step": 4200 }, { "epoch": 13.42, "grad_norm": 8.2437105178833, "learning_rate": 3.2882165605095543e-06, "loss": 0.342, "step": 4215 }, { "epoch": 13.47, "grad_norm": 6.45313835144043, "learning_rate": 3.2643312101910827e-06, "loss": 0.2899, "step": 4230 }, { "epoch": 13.52, "grad_norm": 5.616313457489014, "learning_rate": 3.240445859872612e-06, "loss": 0.2714, "step": 4245 }, { "epoch": 13.57, "grad_norm": 6.7722554206848145, "learning_rate": 3.2165605095541404e-06, "loss": 0.3199, "step": 4260 }, { "epoch": 13.61, "grad_norm": 3.373429775238037, "learning_rate": 3.1926751592356693e-06, "loss": 0.309, "step": 4275 }, { "epoch": 13.66, "grad_norm": 6.058035373687744, "learning_rate": 3.1687898089171977e-06, "loss": 0.283, "step": 4290 }, { "epoch": 13.69, "eval_accuracy": 0.8, "eval_loss": 0.6704856157302856, "eval_runtime": 14.0603, "eval_samples_per_second": 74.678, "eval_steps_per_second": 9.388, "step": 4300 }, { "epoch": 13.71, "grad_norm": 11.277039527893066, "learning_rate": 3.1449044585987265e-06, "loss": 0.3114, "step": 4305 }, { "epoch": 13.76, "grad_norm": 6.542344093322754, "learning_rate": 3.121019108280255e-06, "loss": 0.2487, "step": 4320 }, { "epoch": 13.81, "grad_norm": 4.342966556549072, "learning_rate": 3.097133757961784e-06, "loss": 0.3076, "step": 4335 }, { "epoch": 13.85, "grad_norm": 1.972347617149353, "learning_rate": 3.0732484076433122e-06, "loss": 0.2782, "step": 4350 }, { "epoch": 13.9, "grad_norm": 5.91991662979126, "learning_rate": 3.049363057324841e-06, "loss": 0.3133, "step": 4365 }, { "epoch": 13.95, "grad_norm": 5.045269012451172, "learning_rate": 3.0254777070063695e-06, "loss": 0.2368, "step": 4380 }, { "epoch": 14.0, "grad_norm": 6.446601867675781, "learning_rate": 3.0015923566878983e-06, "loss": 0.2607, "step": 4395 }, { "epoch": 14.01, "eval_accuracy": 0.7914285714285715, "eval_loss": 0.6731985211372375, "eval_runtime": 14.0854, "eval_samples_per_second": 74.545, "eval_steps_per_second": 9.371, "step": 4400 }, { "epoch": 14.04, "grad_norm": 3.5464470386505127, "learning_rate": 2.9777070063694267e-06, "loss": 0.2699, "step": 4410 }, { "epoch": 14.09, "grad_norm": 3.760664701461792, "learning_rate": 2.953821656050956e-06, "loss": 0.2393, "step": 4425 }, { "epoch": 14.14, "grad_norm": 7.939091205596924, "learning_rate": 2.9299363057324844e-06, "loss": 0.3154, "step": 4440 }, { "epoch": 14.19, "grad_norm": 5.330219745635986, "learning_rate": 2.9060509554140133e-06, "loss": 0.2955, "step": 4455 }, { "epoch": 14.24, "grad_norm": 4.532066345214844, "learning_rate": 2.8821656050955417e-06, "loss": 0.3213, "step": 4470 }, { "epoch": 14.28, "grad_norm": 9.095784187316895, "learning_rate": 2.8582802547770705e-06, "loss": 0.2958, "step": 4485 }, { "epoch": 14.33, "grad_norm": 8.94389820098877, "learning_rate": 2.834394904458599e-06, "loss": 0.3174, "step": 4500 }, { "epoch": 14.33, "eval_accuracy": 0.8047619047619048, "eval_loss": 0.6691258549690247, "eval_runtime": 13.9708, "eval_samples_per_second": 75.157, "eval_steps_per_second": 9.448, "step": 4500 }, { "epoch": 14.38, "grad_norm": 7.957052230834961, "learning_rate": 2.810509554140128e-06, "loss": 0.3231, "step": 4515 }, { "epoch": 14.43, "grad_norm": 4.776412487030029, "learning_rate": 2.786624203821656e-06, "loss": 0.3328, "step": 4530 }, { "epoch": 14.47, "grad_norm": 7.478918552398682, "learning_rate": 2.762738853503185e-06, "loss": 0.3091, "step": 4545 }, { "epoch": 14.52, "grad_norm": 5.6832990646362305, "learning_rate": 2.7388535031847135e-06, "loss": 0.2731, "step": 4560 }, { "epoch": 14.57, "grad_norm": 6.8369951248168945, "learning_rate": 2.7149681528662423e-06, "loss": 0.296, "step": 4575 }, { "epoch": 14.62, "grad_norm": 9.0847806930542, "learning_rate": 2.6910828025477707e-06, "loss": 0.3189, "step": 4590 }, { "epoch": 14.65, "eval_accuracy": 0.8038095238095239, "eval_loss": 0.6601914763450623, "eval_runtime": 14.1296, "eval_samples_per_second": 74.312, "eval_steps_per_second": 9.342, "step": 4600 }, { "epoch": 14.67, "grad_norm": 3.0164129734039307, "learning_rate": 2.6671974522293e-06, "loss": 0.2682, "step": 4605 }, { "epoch": 14.71, "grad_norm": 23.393142700195312, "learning_rate": 2.6433121019108284e-06, "loss": 0.282, "step": 4620 }, { "epoch": 14.76, "grad_norm": 2.419762372970581, "learning_rate": 2.6194267515923573e-06, "loss": 0.2954, "step": 4635 }, { "epoch": 14.81, "grad_norm": 2.771768093109131, "learning_rate": 2.5955414012738857e-06, "loss": 0.3182, "step": 4650 }, { "epoch": 14.86, "grad_norm": 4.787622928619385, "learning_rate": 2.5716560509554145e-06, "loss": 0.3212, "step": 4665 }, { "epoch": 14.9, "grad_norm": 5.049059867858887, "learning_rate": 2.547770700636943e-06, "loss": 0.2473, "step": 4680 }, { "epoch": 14.95, "grad_norm": 7.5213398933410645, "learning_rate": 2.5238853503184718e-06, "loss": 0.2862, "step": 4695 }, { "epoch": 14.97, "eval_accuracy": 0.7933333333333333, "eval_loss": 0.6800631880760193, "eval_runtime": 14.0526, "eval_samples_per_second": 74.719, "eval_steps_per_second": 9.393, "step": 4700 }, { "epoch": 15.0, "grad_norm": 7.287510395050049, "learning_rate": 2.5e-06, "loss": 0.3077, "step": 4710 }, { "epoch": 15.05, "grad_norm": 6.637314319610596, "learning_rate": 2.476114649681529e-06, "loss": 0.2756, "step": 4725 }, { "epoch": 15.1, "grad_norm": 3.2501256465911865, "learning_rate": 2.4522292993630575e-06, "loss": 0.2493, "step": 4740 }, { "epoch": 15.14, "grad_norm": 5.583963871002197, "learning_rate": 2.4283439490445863e-06, "loss": 0.2417, "step": 4755 }, { "epoch": 15.19, "grad_norm": 0.8829357624053955, "learning_rate": 2.4044585987261147e-06, "loss": 0.2662, "step": 4770 }, { "epoch": 15.24, "grad_norm": 5.202518463134766, "learning_rate": 2.3805732484076436e-06, "loss": 0.2753, "step": 4785 }, { "epoch": 15.29, "grad_norm": 6.174129486083984, "learning_rate": 2.356687898089172e-06, "loss": 0.2895, "step": 4800 }, { "epoch": 15.29, "eval_accuracy": 0.8038095238095239, "eval_loss": 0.6579437255859375, "eval_runtime": 14.0203, "eval_samples_per_second": 74.892, "eval_steps_per_second": 9.415, "step": 4800 }, { "epoch": 15.33, "grad_norm": 4.062588691711426, "learning_rate": 2.332802547770701e-06, "loss": 0.2899, "step": 4815 }, { "epoch": 15.38, "grad_norm": 8.927217483520508, "learning_rate": 2.3089171974522297e-06, "loss": 0.2973, "step": 4830 }, { "epoch": 15.43, "grad_norm": 4.327577590942383, "learning_rate": 2.285031847133758e-06, "loss": 0.3196, "step": 4845 }, { "epoch": 15.48, "grad_norm": 7.842390537261963, "learning_rate": 2.261146496815287e-06, "loss": 0.3559, "step": 4860 }, { "epoch": 15.53, "grad_norm": 2.109755754470825, "learning_rate": 2.2372611464968154e-06, "loss": 0.249, "step": 4875 }, { "epoch": 15.57, "grad_norm": 4.3923420906066895, "learning_rate": 2.213375796178344e-06, "loss": 0.263, "step": 4890 }, { "epoch": 15.61, "eval_accuracy": 0.8, "eval_loss": 0.6687941551208496, "eval_runtime": 13.924, "eval_samples_per_second": 75.409, "eval_steps_per_second": 9.48, "step": 4900 }, { "epoch": 15.62, "grad_norm": 11.293251991271973, "learning_rate": 2.189490445859873e-06, "loss": 0.2603, "step": 4905 }, { "epoch": 15.67, "grad_norm": 4.060614109039307, "learning_rate": 2.1656050955414015e-06, "loss": 0.3092, "step": 4920 }, { "epoch": 15.72, "grad_norm": 7.007171154022217, "learning_rate": 2.1417197452229303e-06, "loss": 0.2563, "step": 4935 }, { "epoch": 15.76, "grad_norm": 4.375155925750732, "learning_rate": 2.1178343949044587e-06, "loss": 0.2675, "step": 4950 }, { "epoch": 15.81, "grad_norm": 8.038476943969727, "learning_rate": 2.0939490445859876e-06, "loss": 0.3577, "step": 4965 }, { "epoch": 15.86, "grad_norm": 9.939512252807617, "learning_rate": 2.070063694267516e-06, "loss": 0.2964, "step": 4980 }, { "epoch": 15.91, "grad_norm": 5.272069931030273, "learning_rate": 2.046178343949045e-06, "loss": 0.3214, "step": 4995 }, { "epoch": 15.92, "eval_accuracy": 0.8057142857142857, "eval_loss": 0.6546884775161743, "eval_runtime": 14.1631, "eval_samples_per_second": 74.137, "eval_steps_per_second": 9.32, "step": 5000 }, { "epoch": 15.96, "grad_norm": 1.2013658285140991, "learning_rate": 2.0222929936305737e-06, "loss": 0.2633, "step": 5010 }, { "epoch": 16.0, "grad_norm": 8.995028495788574, "learning_rate": 1.998407643312102e-06, "loss": 0.2928, "step": 5025 }, { "epoch": 16.05, "grad_norm": 8.143821716308594, "learning_rate": 1.974522292993631e-06, "loss": 0.3392, "step": 5040 }, { "epoch": 16.1, "grad_norm": 4.234311103820801, "learning_rate": 1.9506369426751593e-06, "loss": 0.2818, "step": 5055 }, { "epoch": 16.15, "grad_norm": 2.7548768520355225, "learning_rate": 1.926751592356688e-06, "loss": 0.2542, "step": 5070 }, { "epoch": 16.19, "grad_norm": 8.989328384399414, "learning_rate": 1.9028662420382168e-06, "loss": 0.251, "step": 5085 }, { "epoch": 16.24, "grad_norm": 7.30033540725708, "learning_rate": 1.8789808917197455e-06, "loss": 0.2867, "step": 5100 }, { "epoch": 16.24, "eval_accuracy": 0.7923809523809524, "eval_loss": 0.6775221824645996, "eval_runtime": 14.1819, "eval_samples_per_second": 74.038, "eval_steps_per_second": 9.308, "step": 5100 }, { "epoch": 16.29, "grad_norm": 5.032180309295654, "learning_rate": 1.8550955414012739e-06, "loss": 0.3202, "step": 5115 }, { "epoch": 16.34, "grad_norm": 7.76137113571167, "learning_rate": 1.8312101910828025e-06, "loss": 0.308, "step": 5130 }, { "epoch": 16.39, "grad_norm": 4.482850551605225, "learning_rate": 1.8073248407643311e-06, "loss": 0.3087, "step": 5145 }, { "epoch": 16.43, "grad_norm": 5.670340538024902, "learning_rate": 1.78343949044586e-06, "loss": 0.2621, "step": 5160 }, { "epoch": 16.48, "grad_norm": 4.9566216468811035, "learning_rate": 1.7595541401273886e-06, "loss": 0.3127, "step": 5175 }, { "epoch": 16.53, "grad_norm": 3.655395984649658, "learning_rate": 1.7356687898089172e-06, "loss": 0.2242, "step": 5190 }, { "epoch": 16.56, "eval_accuracy": 0.8085714285714286, "eval_loss": 0.6378137469291687, "eval_runtime": 13.9378, "eval_samples_per_second": 75.335, "eval_steps_per_second": 9.471, "step": 5200 }, { "epoch": 16.58, "grad_norm": 2.2976183891296387, "learning_rate": 1.7117834394904459e-06, "loss": 0.2682, "step": 5205 }, { "epoch": 16.62, "grad_norm": 3.8970346450805664, "learning_rate": 1.6878980891719745e-06, "loss": 0.2673, "step": 5220 }, { "epoch": 16.67, "grad_norm": 2.6978113651275635, "learning_rate": 1.6640127388535031e-06, "loss": 0.2508, "step": 5235 }, { "epoch": 16.72, "grad_norm": 9.089079856872559, "learning_rate": 1.640127388535032e-06, "loss": 0.2523, "step": 5250 }, { "epoch": 16.77, "grad_norm": 3.596012830734253, "learning_rate": 1.6162420382165606e-06, "loss": 0.2849, "step": 5265 }, { "epoch": 16.82, "grad_norm": 3.2140870094299316, "learning_rate": 1.5923566878980892e-06, "loss": 0.2513, "step": 5280 }, { "epoch": 16.86, "grad_norm": 6.678956031799316, "learning_rate": 1.5684713375796179e-06, "loss": 0.2839, "step": 5295 }, { "epoch": 16.88, "eval_accuracy": 0.799047619047619, "eval_loss": 0.6760995984077454, "eval_runtime": 14.0394, "eval_samples_per_second": 74.789, "eval_steps_per_second": 9.402, "step": 5300 }, { "epoch": 16.91, "grad_norm": 4.08018684387207, "learning_rate": 1.5445859872611465e-06, "loss": 0.3049, "step": 5310 }, { "epoch": 16.96, "grad_norm": 9.095474243164062, "learning_rate": 1.5207006369426751e-06, "loss": 0.2526, "step": 5325 }, { "epoch": 17.01, "grad_norm": 3.758715867996216, "learning_rate": 1.496815286624204e-06, "loss": 0.2515, "step": 5340 }, { "epoch": 17.05, "grad_norm": 5.71665096282959, "learning_rate": 1.4729299363057326e-06, "loss": 0.2906, "step": 5355 }, { "epoch": 17.1, "grad_norm": 4.952653884887695, "learning_rate": 1.4490445859872612e-06, "loss": 0.2915, "step": 5370 }, { "epoch": 17.15, "grad_norm": 7.454645156860352, "learning_rate": 1.4251592356687899e-06, "loss": 0.2878, "step": 5385 }, { "epoch": 17.2, "grad_norm": 2.2225170135498047, "learning_rate": 1.4012738853503185e-06, "loss": 0.2424, "step": 5400 }, { "epoch": 17.2, "eval_accuracy": 0.8123809523809524, "eval_loss": 0.6385903358459473, "eval_runtime": 13.8843, "eval_samples_per_second": 75.625, "eval_steps_per_second": 9.507, "step": 5400 }, { "epoch": 17.25, "grad_norm": 4.7702765464782715, "learning_rate": 1.3773885350318471e-06, "loss": 0.2845, "step": 5415 }, { "epoch": 17.29, "grad_norm": 4.871506214141846, "learning_rate": 1.353503184713376e-06, "loss": 0.2517, "step": 5430 }, { "epoch": 17.34, "grad_norm": 2.186405897140503, "learning_rate": 1.3296178343949046e-06, "loss": 0.2052, "step": 5445 }, { "epoch": 17.39, "grad_norm": 7.742133140563965, "learning_rate": 1.3057324840764332e-06, "loss": 0.2932, "step": 5460 }, { "epoch": 17.44, "grad_norm": 4.161474704742432, "learning_rate": 1.2818471337579619e-06, "loss": 0.2783, "step": 5475 }, { "epoch": 17.48, "grad_norm": 3.773857593536377, "learning_rate": 1.2579617834394905e-06, "loss": 0.2666, "step": 5490 }, { "epoch": 17.52, "eval_accuracy": 0.8133333333333334, "eval_loss": 0.6492887139320374, "eval_runtime": 13.7993, "eval_samples_per_second": 76.091, "eval_steps_per_second": 9.566, "step": 5500 }, { "epoch": 17.53, "grad_norm": 3.4864587783813477, "learning_rate": 1.2340764331210191e-06, "loss": 0.2598, "step": 5505 }, { "epoch": 17.58, "grad_norm": 10.541982650756836, "learning_rate": 1.210191082802548e-06, "loss": 0.278, "step": 5520 }, { "epoch": 17.63, "grad_norm": 3.506603240966797, "learning_rate": 1.1863057324840766e-06, "loss": 0.2818, "step": 5535 }, { "epoch": 17.68, "grad_norm": 3.9437873363494873, "learning_rate": 1.1624203821656052e-06, "loss": 0.2264, "step": 5550 }, { "epoch": 17.72, "grad_norm": 10.139039993286133, "learning_rate": 1.1385350318471339e-06, "loss": 0.3066, "step": 5565 }, { "epoch": 17.77, "grad_norm": 4.62479829788208, "learning_rate": 1.1146496815286625e-06, "loss": 0.2531, "step": 5580 }, { "epoch": 17.82, "grad_norm": 7.675441741943359, "learning_rate": 1.0907643312101911e-06, "loss": 0.2259, "step": 5595 }, { "epoch": 17.83, "eval_accuracy": 0.8047619047619048, "eval_loss": 0.6514009237289429, "eval_runtime": 14.2419, "eval_samples_per_second": 73.726, "eval_steps_per_second": 9.268, "step": 5600 }, { "epoch": 17.87, "grad_norm": 3.4506428241729736, "learning_rate": 1.06687898089172e-06, "loss": 0.2672, "step": 5610 }, { "epoch": 17.91, "grad_norm": 2.7243833541870117, "learning_rate": 1.0429936305732486e-06, "loss": 0.2652, "step": 5625 }, { "epoch": 17.96, "grad_norm": 4.135616302490234, "learning_rate": 1.0191082802547772e-06, "loss": 0.2011, "step": 5640 }, { "epoch": 18.01, "grad_norm": 7.302999496459961, "learning_rate": 9.952229299363059e-07, "loss": 0.2782, "step": 5655 }, { "epoch": 18.06, "grad_norm": 7.608941555023193, "learning_rate": 9.713375796178345e-07, "loss": 0.2871, "step": 5670 }, { "epoch": 18.11, "grad_norm": 7.69070291519165, "learning_rate": 9.474522292993632e-07, "loss": 0.2782, "step": 5685 }, { "epoch": 18.15, "grad_norm": 4.241176605224609, "learning_rate": 9.235668789808917e-07, "loss": 0.2533, "step": 5700 }, { "epoch": 18.15, "eval_accuracy": 0.8, "eval_loss": 0.6676008105278015, "eval_runtime": 14.3329, "eval_samples_per_second": 73.258, "eval_steps_per_second": 9.21, "step": 5700 }, { "epoch": 18.2, "grad_norm": 5.718116283416748, "learning_rate": 8.996815286624204e-07, "loss": 0.2683, "step": 5715 }, { "epoch": 18.25, "grad_norm": 7.374713897705078, "learning_rate": 8.757961783439491e-07, "loss": 0.265, "step": 5730 }, { "epoch": 18.3, "grad_norm": 1.2615108489990234, "learning_rate": 8.519108280254777e-07, "loss": 0.2169, "step": 5745 }, { "epoch": 18.34, "grad_norm": 9.361245155334473, "learning_rate": 8.280254777070064e-07, "loss": 0.2921, "step": 5760 }, { "epoch": 18.39, "grad_norm": 6.853837490081787, "learning_rate": 8.041401273885351e-07, "loss": 0.2734, "step": 5775 }, { "epoch": 18.44, "grad_norm": 4.097062110900879, "learning_rate": 7.802547770700637e-07, "loss": 0.2697, "step": 5790 }, { "epoch": 18.47, "eval_accuracy": 0.800952380952381, "eval_loss": 0.6705303192138672, "eval_runtime": 14.6447, "eval_samples_per_second": 71.698, "eval_steps_per_second": 9.014, "step": 5800 }, { "epoch": 18.49, "grad_norm": 4.86583948135376, "learning_rate": 7.563694267515924e-07, "loss": 0.269, "step": 5805 }, { "epoch": 18.54, "grad_norm": 7.498669624328613, "learning_rate": 7.324840764331211e-07, "loss": 0.223, "step": 5820 }, { "epoch": 18.58, "grad_norm": 4.731110572814941, "learning_rate": 7.085987261146497e-07, "loss": 0.2215, "step": 5835 }, { "epoch": 18.63, "grad_norm": 4.388888359069824, "learning_rate": 6.847133757961784e-07, "loss": 0.275, "step": 5850 }, { "epoch": 18.68, "grad_norm": 7.220559120178223, "learning_rate": 6.608280254777071e-07, "loss": 0.252, "step": 5865 }, { "epoch": 18.73, "grad_norm": 6.225268363952637, "learning_rate": 6.369426751592357e-07, "loss": 0.2532, "step": 5880 }, { "epoch": 18.77, "grad_norm": 5.6799702644348145, "learning_rate": 6.130573248407644e-07, "loss": 0.2558, "step": 5895 }, { "epoch": 18.79, "eval_accuracy": 0.8076190476190476, "eval_loss": 0.6749628782272339, "eval_runtime": 14.118, "eval_samples_per_second": 74.373, "eval_steps_per_second": 9.35, "step": 5900 }, { "epoch": 18.82, "grad_norm": 1.1455104351043701, "learning_rate": 5.89171974522293e-07, "loss": 0.2455, "step": 5910 }, { "epoch": 18.87, "grad_norm": 0.5723968744277954, "learning_rate": 5.652866242038217e-07, "loss": 0.2393, "step": 5925 }, { "epoch": 18.92, "grad_norm": 6.428089141845703, "learning_rate": 5.414012738853504e-07, "loss": 0.2649, "step": 5940 }, { "epoch": 18.96, "grad_norm": 4.993350028991699, "learning_rate": 5.17515923566879e-07, "loss": 0.2689, "step": 5955 }, { "epoch": 19.01, "grad_norm": 8.829191207885742, "learning_rate": 4.936305732484077e-07, "loss": 0.3045, "step": 5970 }, { "epoch": 19.06, "grad_norm": 8.624272346496582, "learning_rate": 4.6974522292993636e-07, "loss": 0.229, "step": 5985 }, { "epoch": 19.11, "grad_norm": 1.7336400747299194, "learning_rate": 4.45859872611465e-07, "loss": 0.2469, "step": 6000 }, { "epoch": 19.11, "eval_accuracy": 0.799047619047619, "eval_loss": 0.6750813722610474, "eval_runtime": 14.571, "eval_samples_per_second": 72.061, "eval_steps_per_second": 9.059, "step": 6000 }, { "epoch": 19.16, "grad_norm": 7.821689605712891, "learning_rate": 4.219745222929936e-07, "loss": 0.2518, "step": 6015 }, { "epoch": 19.2, "grad_norm": 4.377063751220703, "learning_rate": 3.980891719745223e-07, "loss": 0.2744, "step": 6030 }, { "epoch": 19.25, "grad_norm": 3.8449316024780273, "learning_rate": 3.74203821656051e-07, "loss": 0.2222, "step": 6045 }, { "epoch": 19.3, "grad_norm": 9.11525821685791, "learning_rate": 3.503184713375796e-07, "loss": 0.2353, "step": 6060 }, { "epoch": 19.35, "grad_norm": 2.008242607116699, "learning_rate": 3.264331210191083e-07, "loss": 0.2803, "step": 6075 }, { "epoch": 19.39, "grad_norm": 3.9407541751861572, "learning_rate": 3.02547770700637e-07, "loss": 0.284, "step": 6090 }, { "epoch": 19.43, "eval_accuracy": 0.7980952380952381, "eval_loss": 0.6737999320030212, "eval_runtime": 14.3198, "eval_samples_per_second": 73.325, "eval_steps_per_second": 9.218, "step": 6100 }, { "epoch": 19.44, "grad_norm": 4.276314735412598, "learning_rate": 2.786624203821656e-07, "loss": 0.282, "step": 6105 }, { "epoch": 19.49, "grad_norm": 4.9115729331970215, "learning_rate": 2.547770700636943e-07, "loss": 0.2635, "step": 6120 }, { "epoch": 19.54, "grad_norm": 2.512660026550293, "learning_rate": 2.3089171974522294e-07, "loss": 0.2404, "step": 6135 }, { "epoch": 19.59, "grad_norm": 4.966971397399902, "learning_rate": 2.070063694267516e-07, "loss": 0.2979, "step": 6150 }, { "epoch": 19.63, "grad_norm": 4.7076029777526855, "learning_rate": 1.8312101910828028e-07, "loss": 0.2563, "step": 6165 }, { "epoch": 19.68, "grad_norm": 9.003276824951172, "learning_rate": 1.5923566878980893e-07, "loss": 0.2283, "step": 6180 }, { "epoch": 19.73, "grad_norm": 3.8437254428863525, "learning_rate": 1.353503184713376e-07, "loss": 0.2534, "step": 6195 }, { "epoch": 19.75, "eval_accuracy": 0.8019047619047619, "eval_loss": 0.675845742225647, "eval_runtime": 14.3373, "eval_samples_per_second": 73.235, "eval_steps_per_second": 9.207, "step": 6200 }, { "epoch": 19.78, "grad_norm": 4.477542400360107, "learning_rate": 1.1146496815286625e-07, "loss": 0.2368, "step": 6210 }, { "epoch": 19.82, "grad_norm": 9.546680450439453, "learning_rate": 8.75796178343949e-08, "loss": 0.2796, "step": 6225 }, { "epoch": 19.87, "grad_norm": 7.132513999938965, "learning_rate": 6.369426751592358e-08, "loss": 0.2658, "step": 6240 }, { "epoch": 19.92, "grad_norm": 5.739116191864014, "learning_rate": 3.9808917197452233e-08, "loss": 0.3468, "step": 6255 }, { "epoch": 19.97, "grad_norm": 7.056791305541992, "learning_rate": 1.5923566878980894e-08, "loss": 0.2755, "step": 6270 }, { "epoch": 20.0, "step": 6280, "total_flos": 1.555375746295849e+19, "train_loss": 0.5583157776647313, "train_runtime": 5878.7322, "train_samples_per_second": 34.14, "train_steps_per_second": 1.068 } ], "logging_steps": 15, "max_steps": 6280, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 100, "total_flos": 1.555375746295849e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }