diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,1771 +1,3513 @@ { - "best_metric": 0.6309406161308289, - "best_model_checkpoint": "Action_agent/checkpoint-2600", - "epoch": 10.0, + "best_metric": 0.6351959109306335, + "best_model_checkpoint": "Action_agent/checkpoint-4100", + "epoch": 20.0, "eval_steps": 100, - "global_step": 3140, + "global_step": 6280, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, - "grad_norm": 6.1902642250061035, - "learning_rate": 9.952229299363057e-06, - "loss": 2.3093, + "grad_norm": 1.1194300651550293, + "learning_rate": 9.97611464968153e-06, + "loss": 2.2988, "step": 15 }, { "epoch": 0.1, - "grad_norm": 8.609545707702637, - "learning_rate": 9.904458598726116e-06, - "loss": 2.2798, + "grad_norm": 1.3563270568847656, + "learning_rate": 9.952229299363057e-06, + "loss": 2.2782, "step": 30 }, { "epoch": 0.14, - "grad_norm": 8.198623657226562, - "learning_rate": 9.856687898089172e-06, - "loss": 2.2163, + "grad_norm": 1.2270524501800537, + "learning_rate": 9.928343949044586e-06, + "loss": 2.2582, "step": 45 }, { "epoch": 0.19, - "grad_norm": 7.1882829666137695, - "learning_rate": 9.80891719745223e-06, - "loss": 2.1529, + "grad_norm": 1.1236611604690552, + "learning_rate": 9.904458598726116e-06, + "loss": 2.2436, "step": 60 }, { "epoch": 0.24, - "grad_norm": 8.259012222290039, - "learning_rate": 9.761146496815288e-06, - "loss": 2.114, + "grad_norm": 1.2250008583068848, + "learning_rate": 9.880573248407644e-06, + "loss": 2.2234, "step": 75 }, { "epoch": 0.29, - "grad_norm": 9.213942527770996, - "learning_rate": 9.713375796178345e-06, - "loss": 2.039, + "grad_norm": 1.230263590812683, + "learning_rate": 9.856687898089172e-06, + "loss": 2.1987, "step": 90 }, { "epoch": 0.32, - "eval_accuracy": 0.4847619047619048, - "eval_loss": 1.7706103324890137, - "eval_runtime": 17.9171, - "eval_samples_per_second": 58.603, - "eval_steps_per_second": 7.367, + "eval_accuracy": 0.3914285714285714, + "eval_loss": 2.1640379428863525, + "eval_runtime": 20.4497, + "eval_samples_per_second": 51.346, + "eval_steps_per_second": 6.455, "step": 100 }, { "epoch": 0.33, - "grad_norm": 8.229316711425781, - "learning_rate": 9.665605095541401e-06, - "loss": 1.9431, + "grad_norm": 1.2624423503875732, + "learning_rate": 9.832802547770702e-06, + "loss": 2.1866, "step": 105 }, { "epoch": 0.38, - "grad_norm": 9.092002868652344, - "learning_rate": 9.617834394904459e-06, - "loss": 1.7759, + "grad_norm": 1.2295231819152832, + "learning_rate": 9.80891719745223e-06, + "loss": 2.1503, "step": 120 }, { "epoch": 0.43, - "grad_norm": 12.155420303344727, - "learning_rate": 9.570063694267517e-06, - "loss": 1.7233, + "grad_norm": 1.6050071716308594, + "learning_rate": 9.78503184713376e-06, + "loss": 2.1169, "step": 135 }, { "epoch": 0.48, - "grad_norm": 15.371952056884766, - "learning_rate": 9.522292993630574e-06, - "loss": 1.6441, + "grad_norm": 1.3574703931808472, + "learning_rate": 9.761146496815288e-06, + "loss": 2.0902, "step": 150 }, { "epoch": 0.53, - "grad_norm": 15.928441047668457, - "learning_rate": 9.47452229299363e-06, - "loss": 1.5518, + "grad_norm": 1.575221300125122, + "learning_rate": 9.737261146496816e-06, + "loss": 2.052, "step": 165 }, { "epoch": 0.57, - "grad_norm": 14.13963794708252, - "learning_rate": 9.426751592356688e-06, - "loss": 1.4362, + "grad_norm": 1.443302035331726, + "learning_rate": 9.713375796178345e-06, + "loss": 2.0154, "step": 180 }, { "epoch": 0.62, - "grad_norm": 22.86189842224121, - "learning_rate": 9.378980891719746e-06, - "loss": 1.3695, + "grad_norm": 1.4323551654815674, + "learning_rate": 9.689490445859873e-06, + "loss": 1.9807, "step": 195 }, { "epoch": 0.64, - "eval_accuracy": 0.6457142857142857, - "eval_loss": 1.0885976552963257, - "eval_runtime": 13.391, - "eval_samples_per_second": 78.411, - "eval_steps_per_second": 9.857, + "eval_accuracy": 0.6142857142857143, + "eval_loss": 1.9168660640716553, + "eval_runtime": 13.8079, + "eval_samples_per_second": 76.043, + "eval_steps_per_second": 9.56, "step": 200 }, { "epoch": 0.67, - "grad_norm": 18.267860412597656, - "learning_rate": 9.331210191082803e-06, - "loss": 1.2723, + "grad_norm": 1.587117314338684, + "learning_rate": 9.665605095541401e-06, + "loss": 1.9042, "step": 210 }, { "epoch": 0.72, - "grad_norm": 17.083765029907227, - "learning_rate": 9.283439490445861e-06, - "loss": 1.2886, + "grad_norm": 1.4769114255905151, + "learning_rate": 9.641719745222931e-06, + "loss": 1.8832, "step": 225 }, { "epoch": 0.76, - "grad_norm": 13.116181373596191, - "learning_rate": 9.235668789808919e-06, - "loss": 1.1825, + "grad_norm": 1.5509356260299683, + "learning_rate": 9.617834394904459e-06, + "loss": 1.8388, "step": 240 }, { "epoch": 0.81, - "grad_norm": 15.386600494384766, - "learning_rate": 9.187898089171975e-06, - "loss": 1.2512, + "grad_norm": 1.6834059953689575, + "learning_rate": 9.593949044585989e-06, + "loss": 1.7997, "step": 255 }, { "epoch": 0.86, - "grad_norm": 36.78362274169922, - "learning_rate": 9.140127388535032e-06, - "loss": 1.1629, + "grad_norm": 1.6854455471038818, + "learning_rate": 9.570063694267517e-06, + "loss": 1.7652, "step": 270 }, { "epoch": 0.91, - "grad_norm": 16.107894897460938, - "learning_rate": 9.09235668789809e-06, - "loss": 1.0824, + "grad_norm": 1.7652379274368286, + "learning_rate": 9.546178343949045e-06, + "loss": 1.7315, "step": 285 }, { "epoch": 0.96, - "grad_norm": 15.064929962158203, - "learning_rate": 9.044585987261148e-06, - "loss": 1.099, + "grad_norm": 1.6533325910568237, + "learning_rate": 9.522292993630574e-06, + "loss": 1.6738, "step": 300 }, { "epoch": 0.96, - "eval_accuracy": 0.68, - "eval_loss": 0.9092212915420532, - "eval_runtime": 14.0444, - "eval_samples_per_second": 74.763, - "eval_steps_per_second": 9.399, + "eval_accuracy": 0.72, + "eval_loss": 1.6147921085357666, + "eval_runtime": 14.0649, + "eval_samples_per_second": 74.654, + "eval_steps_per_second": 9.385, "step": 300 }, { "epoch": 1.0, - "grad_norm": 18.469070434570312, - "learning_rate": 8.996815286624204e-06, - "loss": 1.0649, + "grad_norm": 1.9673418998718262, + "learning_rate": 9.498407643312102e-06, + "loss": 1.6181, "step": 315 }, { "epoch": 1.05, - "grad_norm": 17.13619613647461, - "learning_rate": 8.949044585987261e-06, - "loss": 1.1011, + "grad_norm": 1.8408273458480835, + "learning_rate": 9.47452229299363e-06, + "loss": 1.601, "step": 330 }, { "epoch": 1.1, - "grad_norm": 18.0528621673584, - "learning_rate": 8.901273885350319e-06, - "loss": 1.0787, + "grad_norm": 1.7604966163635254, + "learning_rate": 9.45063694267516e-06, + "loss": 1.5844, "step": 345 }, { "epoch": 1.15, - "grad_norm": 14.162128448486328, - "learning_rate": 8.853503184713377e-06, - "loss": 1.0927, + "grad_norm": 1.9570591449737549, + "learning_rate": 9.426751592356688e-06, + "loss": 1.5673, "step": 360 }, { "epoch": 1.19, - "grad_norm": 21.933330535888672, - "learning_rate": 8.805732484076433e-06, - "loss": 1.087, + "grad_norm": 1.7698423862457275, + "learning_rate": 9.402866242038218e-06, + "loss": 1.5021, "step": 375 }, { "epoch": 1.24, - "grad_norm": 16.677528381347656, - "learning_rate": 8.757961783439492e-06, - "loss": 1.0011, + "grad_norm": 2.1795897483825684, + "learning_rate": 9.378980891719746e-06, + "loss": 1.4828, "step": 390 }, { "epoch": 1.27, - "eval_accuracy": 0.7171428571428572, - "eval_loss": 0.8182899951934814, - "eval_runtime": 13.5557, - "eval_samples_per_second": 77.458, - "eval_steps_per_second": 9.738, + "eval_accuracy": 0.7704761904761904, + "eval_loss": 1.3860931396484375, + "eval_runtime": 14.2522, + "eval_samples_per_second": 73.673, + "eval_steps_per_second": 9.262, "step": 400 }, { "epoch": 1.29, - "grad_norm": 13.870976448059082, - "learning_rate": 8.710191082802548e-06, - "loss": 0.9318, + "grad_norm": 1.9033102989196777, + "learning_rate": 9.355095541401275e-06, + "loss": 1.4153, "step": 405 }, { "epoch": 1.34, - "grad_norm": 14.700624465942383, - "learning_rate": 8.662420382165606e-06, - "loss": 0.8846, + "grad_norm": 2.0485243797302246, + "learning_rate": 9.331210191082803e-06, + "loss": 1.3602, "step": 420 }, { "epoch": 1.39, - "grad_norm": 17.18898582458496, - "learning_rate": 8.614649681528664e-06, - "loss": 1.0089, + "grad_norm": 2.0998847484588623, + "learning_rate": 9.307324840764333e-06, + "loss": 1.3972, "step": 435 }, { "epoch": 1.43, - "grad_norm": 18.89067840576172, - "learning_rate": 8.566878980891721e-06, - "loss": 0.9356, + "grad_norm": 2.2247631549835205, + "learning_rate": 9.283439490445861e-06, + "loss": 1.3401, "step": 450 }, { "epoch": 1.48, - "grad_norm": 15.696223258972168, - "learning_rate": 8.519108280254777e-06, - "loss": 0.8215, + "grad_norm": 2.1834170818328857, + "learning_rate": 9.259554140127389e-06, + "loss": 1.2974, "step": 465 }, { "epoch": 1.53, - "grad_norm": 36.806602478027344, - "learning_rate": 8.471337579617835e-06, - "loss": 0.9668, + "grad_norm": 2.6833572387695312, + "learning_rate": 9.235668789808919e-06, + "loss": 1.2803, "step": 480 }, { "epoch": 1.58, - "grad_norm": 13.624329566955566, - "learning_rate": 8.423566878980893e-06, - "loss": 0.8437, + "grad_norm": 2.371852397918701, + "learning_rate": 9.211783439490447e-06, + "loss": 1.2768, "step": 495 }, { "epoch": 1.59, - "eval_accuracy": 0.719047619047619, - "eval_loss": 0.7674332857131958, - "eval_runtime": 13.5992, - "eval_samples_per_second": 77.211, - "eval_steps_per_second": 9.706, + "eval_accuracy": 0.7590476190476191, + "eval_loss": 1.2411518096923828, + "eval_runtime": 14.0673, + "eval_samples_per_second": 74.641, + "eval_steps_per_second": 9.383, "step": 500 }, { "epoch": 1.62, - "grad_norm": 15.652780532836914, - "learning_rate": 8.37579617834395e-06, - "loss": 0.8169, + "grad_norm": 2.595338821411133, + "learning_rate": 9.187898089171975e-06, + "loss": 1.2409, "step": 510 }, { "epoch": 1.67, - "grad_norm": 13.216598510742188, - "learning_rate": 8.328025477707006e-06, - "loss": 0.9283, + "grad_norm": 1.8707057237625122, + "learning_rate": 9.164012738853504e-06, + "loss": 1.279, "step": 525 }, { "epoch": 1.72, - "grad_norm": 19.80782127380371, - "learning_rate": 8.280254777070064e-06, - "loss": 0.8614, + "grad_norm": 6.527611255645752, + "learning_rate": 9.140127388535032e-06, + "loss": 1.2422, "step": 540 }, { "epoch": 1.77, - "grad_norm": 18.636619567871094, - "learning_rate": 8.232484076433122e-06, - "loss": 0.8656, + "grad_norm": 2.4771523475646973, + "learning_rate": 9.116242038216562e-06, + "loss": 1.2176, "step": 555 }, { "epoch": 1.82, - "grad_norm": 17.433523178100586, - "learning_rate": 8.18471337579618e-06, - "loss": 0.8313, + "grad_norm": 2.560882329940796, + "learning_rate": 9.09235668789809e-06, + "loss": 1.1891, "step": 570 }, { "epoch": 1.86, - "grad_norm": 14.271307945251465, - "learning_rate": 8.136942675159237e-06, - "loss": 0.8857, + "grad_norm": 2.966686487197876, + "learning_rate": 9.068471337579618e-06, + "loss": 1.201, "step": 585 }, { "epoch": 1.91, - "grad_norm": 16.59528923034668, - "learning_rate": 8.089171974522295e-06, - "loss": 0.8613, + "grad_norm": 2.8730111122131348, + "learning_rate": 9.044585987261148e-06, + "loss": 1.1759, "step": 600 }, { "epoch": 1.91, - "eval_accuracy": 0.7409523809523809, - "eval_loss": 0.7168479561805725, - "eval_runtime": 13.4058, - "eval_samples_per_second": 78.324, - "eval_steps_per_second": 9.846, + "eval_accuracy": 0.7914285714285715, + "eval_loss": 1.116868257522583, + "eval_runtime": 13.8703, + "eval_samples_per_second": 75.701, + "eval_steps_per_second": 9.517, "step": 600 }, { "epoch": 1.96, - "grad_norm": 13.598006248474121, - "learning_rate": 8.04140127388535e-06, - "loss": 0.8694, + "grad_norm": 2.659374237060547, + "learning_rate": 9.020700636942676e-06, + "loss": 1.1196, "step": 615 }, { "epoch": 2.01, - "grad_norm": 13.958739280700684, - "learning_rate": 7.993630573248408e-06, - "loss": 0.9166, + "grad_norm": 2.438401699066162, + "learning_rate": 8.996815286624204e-06, + "loss": 1.1535, "step": 630 }, { "epoch": 2.05, - "grad_norm": 13.80545425415039, - "learning_rate": 7.945859872611466e-06, - "loss": 0.8781, + "grad_norm": 2.551445245742798, + "learning_rate": 8.972929936305733e-06, + "loss": 1.1448, "step": 645 }, { "epoch": 2.1, - "grad_norm": 14.67716121673584, - "learning_rate": 7.898089171974524e-06, - "loss": 0.7684, + "grad_norm": 2.738217353820801, + "learning_rate": 8.949044585987261e-06, + "loss": 1.0833, "step": 660 }, { "epoch": 2.15, - "grad_norm": 18.161645889282227, - "learning_rate": 7.85031847133758e-06, - "loss": 0.7923, + "grad_norm": 2.850508689880371, + "learning_rate": 8.925159235668791e-06, + "loss": 1.0304, "step": 675 }, { "epoch": 2.2, - "grad_norm": 11.349693298339844, - "learning_rate": 7.802547770700637e-06, - "loss": 0.7427, + "grad_norm": 1.8277325630187988, + "learning_rate": 8.901273885350319e-06, + "loss": 1.0314, "step": 690 }, { "epoch": 2.23, - "eval_accuracy": 0.7352380952380952, - "eval_loss": 0.7270055413246155, - "eval_runtime": 13.6582, - "eval_samples_per_second": 76.877, - "eval_steps_per_second": 9.665, + "eval_accuracy": 0.7761904761904762, + "eval_loss": 1.0599384307861328, + "eval_runtime": 13.8327, + "eval_samples_per_second": 75.907, + "eval_steps_per_second": 9.543, "step": 700 }, { "epoch": 2.25, - "grad_norm": 16.357065200805664, - "learning_rate": 7.754777070063695e-06, - "loss": 0.7645, + "grad_norm": 3.3399343490600586, + "learning_rate": 8.877388535031847e-06, + "loss": 1.0278, "step": 705 }, { "epoch": 2.29, - "grad_norm": 15.481508255004883, - "learning_rate": 7.707006369426753e-06, - "loss": 0.7973, + "grad_norm": 2.1914594173431396, + "learning_rate": 8.853503184713377e-06, + "loss": 1.0493, "step": 720 }, { "epoch": 2.34, - "grad_norm": 16.786632537841797, - "learning_rate": 7.659235668789809e-06, - "loss": 0.8256, + "grad_norm": 4.2753682136535645, + "learning_rate": 8.829617834394906e-06, + "loss": 1.0377, "step": 735 }, { "epoch": 2.39, - "grad_norm": 11.738802909851074, - "learning_rate": 7.611464968152867e-06, - "loss": 0.8307, + "grad_norm": 2.1339023113250732, + "learning_rate": 8.805732484076433e-06, + "loss": 1.0431, "step": 750 }, { "epoch": 2.44, - "grad_norm": 13.264825820922852, - "learning_rate": 7.563694267515924e-06, - "loss": 0.7431, + "grad_norm": 2.3232874870300293, + "learning_rate": 8.781847133757962e-06, + "loss": 0.9801, "step": 765 }, { "epoch": 2.48, - "grad_norm": 15.430547714233398, - "learning_rate": 7.515923566878982e-06, - "loss": 0.7867, + "grad_norm": 2.304199695587158, + "learning_rate": 8.757961783439492e-06, + "loss": 0.9999, "step": 780 }, { "epoch": 2.53, - "grad_norm": 15.555388450622559, - "learning_rate": 7.468152866242039e-06, - "loss": 0.693, + "grad_norm": 2.624868154525757, + "learning_rate": 8.734076433121018e-06, + "loss": 0.9702, "step": 795 }, { "epoch": 2.55, - "eval_accuracy": 0.7676190476190476, - "eval_loss": 0.6801217198371887, - "eval_runtime": 13.4462, - "eval_samples_per_second": 78.089, - "eval_steps_per_second": 9.817, + "eval_accuracy": 0.8104761904761905, + "eval_loss": 0.9639754295349121, + "eval_runtime": 13.7909, + "eval_samples_per_second": 76.137, + "eval_steps_per_second": 9.572, "step": 800 }, { "epoch": 2.58, - "grad_norm": 16.23971176147461, - "learning_rate": 7.4203821656050955e-06, - "loss": 0.7791, + "grad_norm": 3.725691080093384, + "learning_rate": 8.710191082802548e-06, + "loss": 0.9801, "step": 810 }, { "epoch": 2.63, - "grad_norm": 22.803543090820312, - "learning_rate": 7.372611464968153e-06, - "loss": 0.7564, + "grad_norm": 3.1988706588745117, + "learning_rate": 8.686305732484078e-06, + "loss": 0.976, "step": 825 }, { "epoch": 2.68, - "grad_norm": 15.14857292175293, - "learning_rate": 7.32484076433121e-06, - "loss": 0.6895, + "grad_norm": 3.571751356124878, + "learning_rate": 8.662420382165606e-06, + "loss": 0.9098, "step": 840 }, { "epoch": 2.72, - "grad_norm": 18.52122688293457, - "learning_rate": 7.2770700636942685e-06, - "loss": 0.7016, + "grad_norm": 3.5634512901306152, + "learning_rate": 8.638535031847134e-06, + "loss": 0.894, "step": 855 }, { "epoch": 2.77, - "grad_norm": 11.38332748413086, - "learning_rate": 7.2292993630573245e-06, - "loss": 0.8174, + "grad_norm": 3.1144227981567383, + "learning_rate": 8.614649681528664e-06, + "loss": 0.9718, "step": 870 }, { "epoch": 2.82, - "grad_norm": 22.539424896240234, - "learning_rate": 7.181528662420383e-06, - "loss": 0.7147, + "grad_norm": 3.4376463890075684, + "learning_rate": 8.590764331210192e-06, + "loss": 0.9247, "step": 885 }, { "epoch": 2.87, - "grad_norm": 15.064950942993164, - "learning_rate": 7.13375796178344e-06, - "loss": 0.7789, + "grad_norm": 3.0620603561401367, + "learning_rate": 8.566878980891721e-06, + "loss": 0.9559, "step": 900 }, { "epoch": 2.87, - "eval_accuracy": 0.7590476190476191, - "eval_loss": 0.6831705570220947, - "eval_runtime": 13.5379, - "eval_samples_per_second": 77.56, - "eval_steps_per_second": 9.75, + "eval_accuracy": 0.8076190476190476, + "eval_loss": 0.9138039946556091, + "eval_runtime": 13.8645, + "eval_samples_per_second": 75.733, + "eval_steps_per_second": 9.521, "step": 900 }, { "epoch": 2.91, - "grad_norm": 17.286598205566406, - "learning_rate": 7.085987261146498e-06, - "loss": 0.6743, + "grad_norm": 3.61348557472229, + "learning_rate": 8.54299363057325e-06, + "loss": 0.8909, "step": 915 }, { "epoch": 2.96, - "grad_norm": 19.45290756225586, - "learning_rate": 7.0382165605095544e-06, - "loss": 0.6263, + "grad_norm": 3.5213446617126465, + "learning_rate": 8.519108280254777e-06, + "loss": 0.8482, "step": 930 }, { "epoch": 3.01, - "grad_norm": 26.688581466674805, - "learning_rate": 6.990445859872612e-06, - "loss": 0.7736, + "grad_norm": 3.8047091960906982, + "learning_rate": 8.495222929936307e-06, + "loss": 0.9267, "step": 945 }, { "epoch": 3.06, - "grad_norm": 22.443763732910156, - "learning_rate": 6.942675159235669e-06, - "loss": 0.6392, + "grad_norm": 4.242620468139648, + "learning_rate": 8.471337579617835e-06, + "loss": 0.8823, "step": 960 }, { "epoch": 3.11, - "grad_norm": 18.81976318359375, - "learning_rate": 6.894904458598727e-06, - "loss": 0.7653, + "grad_norm": 3.3440823554992676, + "learning_rate": 8.447452229299363e-06, + "loss": 0.8623, "step": 975 }, { "epoch": 3.15, - "grad_norm": 12.312933921813965, - "learning_rate": 6.8471337579617835e-06, - "loss": 0.6863, + "grad_norm": 4.062607288360596, + "learning_rate": 8.423566878980893e-06, + "loss": 0.858, "step": 990 }, { "epoch": 3.18, - "eval_accuracy": 0.7752380952380953, - "eval_loss": 0.665543794631958, - "eval_runtime": 13.4956, - "eval_samples_per_second": 77.803, - "eval_steps_per_second": 9.781, + "eval_accuracy": 0.8247619047619048, + "eval_loss": 0.860478401184082, + "eval_runtime": 14.0508, + "eval_samples_per_second": 74.729, + "eval_steps_per_second": 9.394, "step": 1000 }, { "epoch": 3.2, - "grad_norm": 15.371318817138672, - "learning_rate": 6.799363057324841e-06, - "loss": 0.7106, + "grad_norm": 2.4473769664764404, + "learning_rate": 8.39968152866242e-06, + "loss": 0.8473, "step": 1005 }, { "epoch": 3.25, - "grad_norm": 18.258623123168945, - "learning_rate": 6.751592356687898e-06, - "loss": 0.7305, + "grad_norm": 5.5681023597717285, + "learning_rate": 8.37579617834395e-06, + "loss": 0.865, "step": 1020 }, { "epoch": 3.3, - "grad_norm": 16.52337074279785, - "learning_rate": 6.7038216560509565e-06, - "loss": 0.6947, + "grad_norm": 2.966853618621826, + "learning_rate": 8.351910828025478e-06, + "loss": 0.8306, "step": 1035 }, { "epoch": 3.34, - "grad_norm": 18.67824363708496, - "learning_rate": 6.6560509554140125e-06, - "loss": 0.6669, + "grad_norm": 5.360078811645508, + "learning_rate": 8.328025477707006e-06, + "loss": 0.7947, "step": 1050 }, { "epoch": 3.39, - "grad_norm": 16.26685905456543, - "learning_rate": 6.608280254777071e-06, - "loss": 0.6801, + "grad_norm": 5.492262840270996, + "learning_rate": 8.304140127388536e-06, + "loss": 0.8343, "step": 1065 }, { "epoch": 3.44, - "grad_norm": 13.744972229003906, - "learning_rate": 6.560509554140128e-06, - "loss": 0.6035, + "grad_norm": 2.8821961879730225, + "learning_rate": 8.280254777070064e-06, + "loss": 0.7892, "step": 1080 }, { "epoch": 3.49, - "grad_norm": 12.479057312011719, - "learning_rate": 6.5127388535031856e-06, - "loss": 0.6437, + "grad_norm": 3.5130903720855713, + "learning_rate": 8.256369426751592e-06, + "loss": 0.7858, "step": 1095 }, { "epoch": 3.5, - "eval_accuracy": 0.7771428571428571, - "eval_loss": 0.6382023692131042, - "eval_runtime": 13.3473, - "eval_samples_per_second": 78.667, - "eval_steps_per_second": 9.89, + "eval_accuracy": 0.8371428571428572, + "eval_loss": 0.8164299726486206, + "eval_runtime": 14.1175, + "eval_samples_per_second": 74.376, + "eval_steps_per_second": 9.35, "step": 1100 }, { "epoch": 3.54, - "grad_norm": 14.826581954956055, - "learning_rate": 6.464968152866242e-06, - "loss": 0.7309, + "grad_norm": 4.600940704345703, + "learning_rate": 8.232484076433122e-06, + "loss": 0.8238, "step": 1110 }, { "epoch": 3.58, - "grad_norm": 12.955341339111328, - "learning_rate": 6.4171974522293e-06, - "loss": 0.6864, + "grad_norm": 2.8502960205078125, + "learning_rate": 8.208598726114651e-06, + "loss": 0.8122, "step": 1125 }, { "epoch": 3.63, - "grad_norm": 14.903204917907715, - "learning_rate": 6.369426751592357e-06, - "loss": 0.6711, + "grad_norm": 3.0440945625305176, + "learning_rate": 8.18471337579618e-06, + "loss": 0.7492, "step": 1140 }, { "epoch": 3.68, - "grad_norm": 15.349693298339844, - "learning_rate": 6.321656050955415e-06, - "loss": 0.6362, + "grad_norm": 3.4030416011810303, + "learning_rate": 8.160828025477707e-06, + "loss": 0.7699, "step": 1155 }, { "epoch": 3.73, - "grad_norm": 25.346343994140625, - "learning_rate": 6.2738853503184715e-06, - "loss": 0.6359, + "grad_norm": 6.065947532653809, + "learning_rate": 8.136942675159237e-06, + "loss": 0.7903, "step": 1170 }, { "epoch": 3.77, - "grad_norm": 12.536116600036621, - "learning_rate": 6.226114649681529e-06, - "loss": 0.6991, + "grad_norm": 3.8829727172851562, + "learning_rate": 8.113057324840765e-06, + "loss": 0.7977, "step": 1185 }, { "epoch": 3.82, - "grad_norm": 19.788801193237305, - "learning_rate": 6.178343949044586e-06, - "loss": 0.6741, + "grad_norm": 3.397552013397217, + "learning_rate": 8.089171974522295e-06, + "loss": 0.7898, "step": 1200 }, { "epoch": 3.82, - "eval_accuracy": 0.7790476190476191, - "eval_loss": 0.6445861458778381, - "eval_runtime": 13.6114, - "eval_samples_per_second": 77.141, - "eval_steps_per_second": 9.698, + "eval_accuracy": 0.8333333333333334, + "eval_loss": 0.7916920781135559, + "eval_runtime": 14.238, + "eval_samples_per_second": 73.746, + "eval_steps_per_second": 9.271, "step": 1200 }, { "epoch": 3.87, - "grad_norm": 16.279836654663086, - "learning_rate": 6.1305732484076445e-06, - "loss": 0.6977, + "grad_norm": 3.440532684326172, + "learning_rate": 8.065286624203823e-06, + "loss": 0.7507, "step": 1215 }, { "epoch": 3.92, - "grad_norm": 18.798139572143555, - "learning_rate": 6.0828025477707005e-06, - "loss": 0.653, + "grad_norm": 2.4461238384246826, + "learning_rate": 8.04140127388535e-06, + "loss": 0.7454, "step": 1230 }, { "epoch": 3.96, - "grad_norm": 17.142087936401367, - "learning_rate": 6.035031847133759e-06, - "loss": 0.6778, + "grad_norm": 2.5467453002929688, + "learning_rate": 8.01751592356688e-06, + "loss": 0.7682, "step": 1245 }, { "epoch": 4.01, - "grad_norm": 17.632762908935547, - "learning_rate": 5.987261146496816e-06, - "loss": 0.5343, + "grad_norm": 5.4643025398254395, + "learning_rate": 7.993630573248408e-06, + "loss": 0.7136, "step": 1260 }, { "epoch": 4.06, - "grad_norm": 14.896882057189941, - "learning_rate": 5.9394904458598736e-06, - "loss": 0.5694, + "grad_norm": 4.570471286773682, + "learning_rate": 7.969745222929936e-06, + "loss": 0.6882, "step": 1275 }, { "epoch": 4.11, - "grad_norm": 19.62409019470215, - "learning_rate": 5.89171974522293e-06, - "loss": 0.5871, + "grad_norm": 2.5436484813690186, + "learning_rate": 7.945859872611466e-06, + "loss": 0.6909, "step": 1290 }, { "epoch": 4.14, - "eval_accuracy": 0.7838095238095238, - "eval_loss": 0.6551438570022583, - "eval_runtime": 13.8432, - "eval_samples_per_second": 75.849, - "eval_steps_per_second": 9.535, + "eval_accuracy": 0.8038095238095239, + "eval_loss": 0.7995317578315735, + "eval_runtime": 14.274, + "eval_samples_per_second": 73.56, + "eval_steps_per_second": 9.248, "step": 1300 }, { "epoch": 4.16, - "grad_norm": 16.879796981811523, - "learning_rate": 5.843949044585988e-06, - "loss": 0.6344, + "grad_norm": 5.001487731933594, + "learning_rate": 7.921974522292994e-06, + "loss": 0.7197, "step": 1305 }, { "epoch": 4.2, - "grad_norm": 18.603700637817383, - "learning_rate": 5.796178343949045e-06, - "loss": 0.5768, + "grad_norm": 3.342618465423584, + "learning_rate": 7.898089171974524e-06, + "loss": 0.6634, "step": 1320 }, { "epoch": 4.25, - "grad_norm": 16.433502197265625, - "learning_rate": 5.748407643312103e-06, - "loss": 0.5884, + "grad_norm": 4.680523872375488, + "learning_rate": 7.874203821656052e-06, + "loss": 0.6952, "step": 1335 }, { "epoch": 4.3, - "grad_norm": 17.979280471801758, - "learning_rate": 5.7006369426751594e-06, - "loss": 0.6167, + "grad_norm": 4.881319522857666, + "learning_rate": 7.85031847133758e-06, + "loss": 0.7134, "step": 1350 }, { "epoch": 4.35, - "grad_norm": 20.778549194335938, - "learning_rate": 5.652866242038217e-06, - "loss": 0.6594, + "grad_norm": 5.452937126159668, + "learning_rate": 7.82643312101911e-06, + "loss": 0.7078, "step": 1365 }, { "epoch": 4.39, - "grad_norm": 14.834670066833496, - "learning_rate": 5.605095541401274e-06, - "loss": 0.6214, + "grad_norm": 3.7072432041168213, + "learning_rate": 7.802547770700637e-06, + "loss": 0.6691, "step": 1380 }, { "epoch": 4.44, - "grad_norm": 19.214466094970703, - "learning_rate": 5.5573248407643325e-06, - "loss": 0.6051, + "grad_norm": 4.637836456298828, + "learning_rate": 7.778662420382165e-06, + "loss": 0.6619, "step": 1395 }, { "epoch": 4.46, - "eval_accuracy": 0.7638095238095238, - "eval_loss": 0.6970483660697937, - "eval_runtime": 20.5305, - "eval_samples_per_second": 51.143, - "eval_steps_per_second": 6.429, + "eval_accuracy": 0.7828571428571428, + "eval_loss": 0.8194388151168823, + "eval_runtime": 14.1277, + "eval_samples_per_second": 74.322, + "eval_steps_per_second": 9.343, "step": 1400 }, { "epoch": 4.49, - "grad_norm": 16.332500457763672, - "learning_rate": 5.5095541401273885e-06, - "loss": 0.5996, + "grad_norm": 4.957500457763672, + "learning_rate": 7.754777070063695e-06, + "loss": 0.6907, "step": 1410 }, { "epoch": 4.54, - "grad_norm": 16.794343948364258, - "learning_rate": 5.461783439490447e-06, - "loss": 0.702, + "grad_norm": 6.341672420501709, + "learning_rate": 7.730891719745223e-06, + "loss": 0.7597, "step": 1425 }, { "epoch": 4.59, - "grad_norm": 21.159442901611328, - "learning_rate": 5.414012738853504e-06, - "loss": 0.5742, + "grad_norm": 5.114958763122559, + "learning_rate": 7.707006369426753e-06, + "loss": 0.6816, "step": 1440 }, { "epoch": 4.63, - "grad_norm": 26.400766372680664, - "learning_rate": 5.3662420382165615e-06, - "loss": 0.6288, + "grad_norm": 4.16578483581543, + "learning_rate": 7.68312101910828e-06, + "loss": 0.7146, "step": 1455 }, { "epoch": 4.68, - "grad_norm": 19.17631721496582, - "learning_rate": 5.318471337579618e-06, - "loss": 0.5819, + "grad_norm": 4.3378071784973145, + "learning_rate": 7.659235668789809e-06, + "loss": 0.666, "step": 1470 }, { "epoch": 4.73, - "grad_norm": 18.10342025756836, - "learning_rate": 5.270700636942676e-06, - "loss": 0.5842, + "grad_norm": 3.4012575149536133, + "learning_rate": 7.635350318471338e-06, + "loss": 0.654, "step": 1485 }, { "epoch": 4.78, - "grad_norm": 21.941911697387695, - "learning_rate": 5.222929936305733e-06, - "loss": 0.5175, + "grad_norm": 4.172365188598633, + "learning_rate": 7.611464968152867e-06, + "loss": 0.6457, "step": 1500 }, { "epoch": 4.78, - "eval_accuracy": 0.7790476190476191, - "eval_loss": 0.6552723050117493, - "eval_runtime": 13.5024, - "eval_samples_per_second": 77.764, - "eval_steps_per_second": 9.776, + "eval_accuracy": 0.8085714285714286, + "eval_loss": 0.7536157369613647, + "eval_runtime": 14.3149, + "eval_samples_per_second": 73.35, + "eval_steps_per_second": 9.221, "step": 1500 }, { "epoch": 4.82, - "grad_norm": 24.317623138427734, - "learning_rate": 5.175159235668791e-06, - "loss": 0.5984, + "grad_norm": 4.036574840545654, + "learning_rate": 7.587579617834395e-06, + "loss": 0.6452, "step": 1515 }, { "epoch": 4.87, - "grad_norm": 14.877484321594238, - "learning_rate": 5.1273885350318474e-06, - "loss": 0.6142, + "grad_norm": 3.928567409515381, + "learning_rate": 7.563694267515924e-06, + "loss": 0.6374, "step": 1530 }, { "epoch": 4.92, - "grad_norm": 20.296701431274414, - "learning_rate": 5.079617834394905e-06, - "loss": 0.719, + "grad_norm": 4.106697082519531, + "learning_rate": 7.539808917197453e-06, + "loss": 0.7006, "step": 1545 }, { "epoch": 4.97, - "grad_norm": 20.335296630859375, - "learning_rate": 5.031847133757962e-06, - "loss": 0.5651, + "grad_norm": 4.1516242027282715, + "learning_rate": 7.515923566878982e-06, + "loss": 0.5827, "step": 1560 }, { "epoch": 5.02, - "grad_norm": 17.09543228149414, - "learning_rate": 4.98407643312102e-06, - "loss": 0.4632, + "grad_norm": 4.836935043334961, + "learning_rate": 7.49203821656051e-06, + "loss": 0.5722, "step": 1575 }, { "epoch": 5.06, - "grad_norm": 15.416642189025879, - "learning_rate": 4.9363057324840765e-06, - "loss": 0.5795, + "grad_norm": 3.066807985305786, + "learning_rate": 7.468152866242039e-06, + "loss": 0.6155, "step": 1590 }, { "epoch": 5.1, - "eval_accuracy": 0.7771428571428571, - "eval_loss": 0.6666560173034668, - "eval_runtime": 14.1067, - "eval_samples_per_second": 74.433, - "eval_steps_per_second": 9.357, + "eval_accuracy": 0.8257142857142857, + "eval_loss": 0.7212250828742981, + "eval_runtime": 14.0875, + "eval_samples_per_second": 74.534, + "eval_steps_per_second": 9.37, "step": 1600 }, { "epoch": 5.11, - "grad_norm": 12.152099609375, - "learning_rate": 4.888535031847134e-06, - "loss": 0.6119, + "grad_norm": 4.183410167694092, + "learning_rate": 7.4442675159235675e-06, + "loss": 0.5963, "step": 1605 }, { "epoch": 5.16, - "grad_norm": 11.709696769714355, - "learning_rate": 4.840764331210192e-06, - "loss": 0.5521, + "grad_norm": 3.687758207321167, + "learning_rate": 7.4203821656050955e-06, + "loss": 0.6106, "step": 1620 }, { "epoch": 5.21, - "grad_norm": 12.4248685836792, - "learning_rate": 4.792993630573249e-06, - "loss": 0.586, + "grad_norm": 3.360952138900757, + "learning_rate": 7.396496815286624e-06, + "loss": 0.6153, "step": 1635 }, { "epoch": 5.25, - "grad_norm": 22.69182777404785, - "learning_rate": 4.745222929936306e-06, - "loss": 0.5848, + "grad_norm": 6.3952507972717285, + "learning_rate": 7.372611464968153e-06, + "loss": 0.6328, "step": 1650 }, { "epoch": 5.3, - "grad_norm": 15.92928409576416, - "learning_rate": 4.697452229299363e-06, - "loss": 0.5922, + "grad_norm": 3.9628074169158936, + "learning_rate": 7.348726114649683e-06, + "loss": 0.6233, "step": 1665 }, { "epoch": 5.35, - "grad_norm": 25.377580642700195, - "learning_rate": 4.649681528662421e-06, - "loss": 0.6579, + "grad_norm": 5.332892417907715, + "learning_rate": 7.32484076433121e-06, + "loss": 0.6638, "step": 1680 }, { "epoch": 5.4, - "grad_norm": 12.89096450805664, - "learning_rate": 4.601910828025479e-06, - "loss": 0.4919, + "grad_norm": 4.033740043640137, + "learning_rate": 7.300955414012739e-06, + "loss": 0.5511, "step": 1695 }, { "epoch": 5.41, - "eval_accuracy": 0.7904761904761904, - "eval_loss": 0.6316953897476196, - "eval_runtime": 13.547, - "eval_samples_per_second": 77.508, - "eval_steps_per_second": 9.744, + "eval_accuracy": 0.8095238095238095, + "eval_loss": 0.7273786067962646, + "eval_runtime": 14.0255, + "eval_samples_per_second": 74.864, + "eval_steps_per_second": 9.411, "step": 1700 }, { "epoch": 5.45, - "grad_norm": 13.04831314086914, - "learning_rate": 4.554140127388535e-06, - "loss": 0.5459, + "grad_norm": 2.9152443408966064, + "learning_rate": 7.2770700636942685e-06, + "loss": 0.5926, "step": 1710 }, { "epoch": 5.49, - "grad_norm": 14.792088508605957, - "learning_rate": 4.506369426751593e-06, - "loss": 0.4729, + "grad_norm": 4.412004470825195, + "learning_rate": 7.253184713375797e-06, + "loss": 0.575, "step": 1725 }, { "epoch": 5.54, - "grad_norm": 20.434284210205078, - "learning_rate": 4.45859872611465e-06, - "loss": 0.5285, + "grad_norm": 4.271115779876709, + "learning_rate": 7.2292993630573245e-06, + "loss": 0.5522, "step": 1740 }, { "epoch": 5.59, - "grad_norm": 16.0216064453125, - "learning_rate": 4.410828025477708e-06, - "loss": 0.5891, + "grad_norm": 5.693962574005127, + "learning_rate": 7.205414012738854e-06, + "loss": 0.5918, "step": 1755 }, { "epoch": 5.64, - "grad_norm": 14.537184715270996, - "learning_rate": 4.3630573248407645e-06, - "loss": 0.6203, + "grad_norm": 2.9795901775360107, + "learning_rate": 7.181528662420383e-06, + "loss": 0.5804, "step": 1770 }, { "epoch": 5.68, - "grad_norm": 16.755977630615234, - "learning_rate": 4.315286624203822e-06, - "loss": 0.5832, + "grad_norm": 4.588832855224609, + "learning_rate": 7.157643312101912e-06, + "loss": 0.6135, "step": 1785 }, { "epoch": 5.73, - "grad_norm": 18.05998992919922, - "learning_rate": 4.26751592356688e-06, - "loss": 0.4986, + "grad_norm": 5.463626861572266, + "learning_rate": 7.13375796178344e-06, + "loss": 0.5486, "step": 1800 }, { "epoch": 5.73, - "eval_accuracy": 0.780952380952381, - "eval_loss": 0.6485886573791504, - "eval_runtime": 13.712, - "eval_samples_per_second": 76.575, - "eval_steps_per_second": 9.627, + "eval_accuracy": 0.8285714285714286, + "eval_loss": 0.7047600150108337, + "eval_runtime": 14.2281, + "eval_samples_per_second": 73.798, + "eval_steps_per_second": 9.277, "step": 1800 }, { "epoch": 5.78, - "grad_norm": 13.940254211425781, - "learning_rate": 4.219745222929937e-06, - "loss": 0.5582, + "grad_norm": 4.97993278503418, + "learning_rate": 7.109872611464969e-06, + "loss": 0.5619, "step": 1815 }, { "epoch": 5.83, - "grad_norm": 13.54953670501709, - "learning_rate": 4.171974522292994e-06, - "loss": 0.5189, + "grad_norm": 2.6514835357666016, + "learning_rate": 7.085987261146498e-06, + "loss": 0.5108, "step": 1830 }, { "epoch": 5.88, - "grad_norm": 19.552183151245117, - "learning_rate": 4.124203821656051e-06, - "loss": 0.6037, + "grad_norm": 3.859992742538452, + "learning_rate": 7.0621019108280264e-06, + "loss": 0.586, "step": 1845 }, { "epoch": 5.92, - "grad_norm": 13.757224082946777, - "learning_rate": 4.076433121019109e-06, - "loss": 0.5537, + "grad_norm": 4.4486494064331055, + "learning_rate": 7.0382165605095544e-06, + "loss": 0.5856, "step": 1860 }, { "epoch": 5.97, - "grad_norm": 24.593406677246094, - "learning_rate": 4.0286624203821666e-06, - "loss": 0.5527, + "grad_norm": 4.080160617828369, + "learning_rate": 7.014331210191083e-06, + "loss": 0.5792, "step": 1875 }, { "epoch": 6.02, - "grad_norm": 22.236400604248047, - "learning_rate": 3.980891719745223e-06, - "loss": 0.5104, + "grad_norm": 5.243125915527344, + "learning_rate": 6.990445859872612e-06, + "loss": 0.5679, "step": 1890 }, { "epoch": 6.05, - "eval_accuracy": 0.7742857142857142, - "eval_loss": 0.6699539422988892, - "eval_runtime": 13.5651, - "eval_samples_per_second": 77.405, - "eval_steps_per_second": 9.731, + "eval_accuracy": 0.8180952380952381, + "eval_loss": 0.712360143661499, + "eval_runtime": 14.0568, + "eval_samples_per_second": 74.697, + "eval_steps_per_second": 9.39, "step": 1900 }, { "epoch": 6.07, - "grad_norm": 15.87308120727539, - "learning_rate": 3.933121019108281e-06, - "loss": 0.5268, + "grad_norm": 3.062335252761841, + "learning_rate": 6.966560509554141e-06, + "loss": 0.515, "step": 1905 }, { "epoch": 6.11, - "grad_norm": 13.48481273651123, - "learning_rate": 3.885350318471338e-06, - "loss": 0.5421, + "grad_norm": 2.6931862831115723, + "learning_rate": 6.942675159235669e-06, + "loss": 0.5465, "step": 1920 }, { "epoch": 6.16, - "grad_norm": 13.895825386047363, - "learning_rate": 3.837579617834396e-06, - "loss": 0.6139, + "grad_norm": 6.212809085845947, + "learning_rate": 6.918789808917198e-06, + "loss": 0.549, "step": 1935 }, { "epoch": 6.21, - "grad_norm": 14.655675888061523, - "learning_rate": 3.789808917197453e-06, - "loss": 0.495, + "grad_norm": 2.02023983001709, + "learning_rate": 6.894904458598727e-06, + "loss": 0.5534, "step": 1950 }, { "epoch": 6.26, - "grad_norm": 21.782032012939453, - "learning_rate": 3.7420382165605097e-06, - "loss": 0.513, + "grad_norm": 4.084249496459961, + "learning_rate": 6.8710191082802555e-06, + "loss": 0.5061, "step": 1965 }, { "epoch": 6.31, - "grad_norm": 16.350772857666016, - "learning_rate": 3.694267515923567e-06, - "loss": 0.5182, + "grad_norm": 9.472137451171875, + "learning_rate": 6.8471337579617835e-06, + "loss": 0.5017, "step": 1980 }, { "epoch": 6.35, - "grad_norm": 12.87532901763916, - "learning_rate": 3.6464968152866242e-06, - "loss": 0.4919, + "grad_norm": 3.419933319091797, + "learning_rate": 6.823248407643312e-06, + "loss": 0.4914, "step": 1995 }, { "epoch": 6.37, - "eval_accuracy": 0.7819047619047619, - "eval_loss": 0.6527658700942993, - "eval_runtime": 13.9166, - "eval_samples_per_second": 75.449, - "eval_steps_per_second": 9.485, + "eval_accuracy": 0.800952380952381, + "eval_loss": 0.7276927828788757, + "eval_runtime": 14.1066, + "eval_samples_per_second": 74.433, + "eval_steps_per_second": 9.357, "step": 2000 }, { "epoch": 6.4, - "grad_norm": 12.642027854919434, - "learning_rate": 3.5987261146496815e-06, - "loss": 0.5212, + "grad_norm": 2.971423387527466, + "learning_rate": 6.799363057324841e-06, + "loss": 0.5342, "step": 2010 }, { "epoch": 6.45, - "grad_norm": 13.786490440368652, - "learning_rate": 3.5509554140127388e-06, - "loss": 0.5004, + "grad_norm": 3.118852376937866, + "learning_rate": 6.77547770700637e-06, + "loss": 0.5055, "step": 2025 }, { "epoch": 6.5, - "grad_norm": 28.24700927734375, - "learning_rate": 3.5031847133757964e-06, - "loss": 0.539, + "grad_norm": 3.238327741622925, + "learning_rate": 6.751592356687898e-06, + "loss": 0.5287, "step": 2040 }, { "epoch": 6.54, - "grad_norm": 10.891915321350098, - "learning_rate": 3.4554140127388537e-06, - "loss": 0.5316, + "grad_norm": 4.170012950897217, + "learning_rate": 6.727707006369427e-06, + "loss": 0.5199, "step": 2055 }, { "epoch": 6.59, - "grad_norm": 21.343164443969727, - "learning_rate": 3.407643312101911e-06, - "loss": 0.5497, + "grad_norm": 6.786019325256348, + "learning_rate": 6.7038216560509565e-06, + "loss": 0.5561, "step": 2070 }, { "epoch": 6.64, - "grad_norm": 15.246662139892578, - "learning_rate": 3.3598726114649682e-06, - "loss": 0.5212, + "grad_norm": 4.666538715362549, + "learning_rate": 6.679936305732485e-06, + "loss": 0.4712, "step": 2085 }, { "epoch": 6.69, - "grad_norm": 18.424856185913086, - "learning_rate": 3.3121019108280255e-06, - "loss": 0.5144, + "grad_norm": 3.341716766357422, + "learning_rate": 6.6560509554140125e-06, + "loss": 0.525, "step": 2100 }, { "epoch": 6.69, - "eval_accuracy": 0.7876190476190477, - "eval_loss": 0.6354712843894958, - "eval_runtime": 13.6508, - "eval_samples_per_second": 76.919, - "eval_steps_per_second": 9.67, + "eval_accuracy": 0.8123809523809524, + "eval_loss": 0.6971268653869629, + "eval_runtime": 14.2262, + "eval_samples_per_second": 73.808, + "eval_steps_per_second": 9.279, "step": 2100 }, { "epoch": 6.74, - "grad_norm": 15.570305824279785, - "learning_rate": 3.2643312101910827e-06, - "loss": 0.5892, + "grad_norm": 8.099006652832031, + "learning_rate": 6.632165605095542e-06, + "loss": 0.5809, "step": 2115 }, { "epoch": 6.78, - "grad_norm": 16.673995971679688, - "learning_rate": 3.2165605095541404e-06, - "loss": 0.5079, + "grad_norm": 4.491795063018799, + "learning_rate": 6.608280254777071e-06, + "loss": 0.4867, "step": 2130 }, { "epoch": 6.83, - "grad_norm": 17.703060150146484, - "learning_rate": 3.1687898089171977e-06, - "loss": 0.496, + "grad_norm": 2.734052896499634, + "learning_rate": 6.5843949044586e-06, + "loss": 0.4816, "step": 2145 }, { "epoch": 6.88, - "grad_norm": 14.203299522399902, - "learning_rate": 3.121019108280255e-06, - "loss": 0.5223, + "grad_norm": 6.541310787200928, + "learning_rate": 6.560509554140128e-06, + "loss": 0.4877, "step": 2160 }, { "epoch": 6.93, - "grad_norm": 14.10352897644043, - "learning_rate": 3.0732484076433122e-06, - "loss": 0.521, + "grad_norm": 4.009987831115723, + "learning_rate": 6.536624203821657e-06, + "loss": 0.5303, "step": 2175 }, { "epoch": 6.97, - "grad_norm": 13.882482528686523, - "learning_rate": 3.0254777070063695e-06, - "loss": 0.5554, + "grad_norm": 5.541009902954102, + "learning_rate": 6.5127388535031856e-06, + "loss": 0.5081, "step": 2190 }, { "epoch": 7.01, - "eval_accuracy": 0.7771428571428571, - "eval_loss": 0.6552413702011108, - "eval_runtime": 13.0208, - "eval_samples_per_second": 80.64, - "eval_steps_per_second": 10.138, + "eval_accuracy": 0.8161904761904762, + "eval_loss": 0.6869356632232666, + "eval_runtime": 14.0402, + "eval_samples_per_second": 74.785, + "eval_steps_per_second": 9.402, "step": 2200 }, { "epoch": 7.02, - "grad_norm": 12.480643272399902, - "learning_rate": 2.9777070063694267e-06, - "loss": 0.4216, + "grad_norm": 3.370013475418091, + "learning_rate": 6.488853503184714e-06, + "loss": 0.441, "step": 2205 }, { "epoch": 7.07, - "grad_norm": 14.39759349822998, - "learning_rate": 2.9299363057324844e-06, - "loss": 0.4888, + "grad_norm": 3.942737340927124, + "learning_rate": 6.464968152866242e-06, + "loss": 0.4422, "step": 2220 }, { "epoch": 7.12, - "grad_norm": 17.724123001098633, - "learning_rate": 2.8821656050955417e-06, - "loss": 0.4579, + "grad_norm": 4.051229476928711, + "learning_rate": 6.441082802547771e-06, + "loss": 0.4439, "step": 2235 }, { "epoch": 7.17, - "grad_norm": 14.149361610412598, - "learning_rate": 2.834394904458599e-06, - "loss": 0.5295, + "grad_norm": 4.835233211517334, + "learning_rate": 6.4171974522293e-06, + "loss": 0.4789, "step": 2250 }, { "epoch": 7.21, - "grad_norm": 18.39142608642578, - "learning_rate": 2.786624203821656e-06, - "loss": 0.4918, + "grad_norm": 5.995761871337891, + "learning_rate": 6.393312101910829e-06, + "loss": 0.4776, "step": 2265 }, { "epoch": 7.26, - "grad_norm": 21.38290023803711, - "learning_rate": 2.7388535031847135e-06, - "loss": 0.5542, + "grad_norm": 3.5410878658294678, + "learning_rate": 6.369426751592357e-06, + "loss": 0.5077, "step": 2280 }, { "epoch": 7.31, - "grad_norm": 21.44352912902832, - "learning_rate": 2.6910828025477707e-06, - "loss": 0.5389, + "grad_norm": 5.137345790863037, + "learning_rate": 6.345541401273886e-06, + "loss": 0.5072, "step": 2295 }, { "epoch": 7.32, - "eval_accuracy": 0.7876190476190477, - "eval_loss": 0.6360692381858826, - "eval_runtime": 13.6127, - "eval_samples_per_second": 77.134, - "eval_steps_per_second": 9.697, + "eval_accuracy": 0.8076190476190476, + "eval_loss": 0.6836773157119751, + "eval_runtime": 14.3682, + "eval_samples_per_second": 73.078, + "eval_steps_per_second": 9.187, "step": 2300 }, { "epoch": 7.36, - "grad_norm": 14.326496124267578, - "learning_rate": 2.6433121019108284e-06, - "loss": 0.4785, + "grad_norm": 7.055994987487793, + "learning_rate": 6.321656050955415e-06, + "loss": 0.4419, "step": 2310 }, { "epoch": 7.4, - "grad_norm": 17.98026466369629, - "learning_rate": 2.5955414012738857e-06, - "loss": 0.5193, + "grad_norm": 5.709652423858643, + "learning_rate": 6.2977707006369435e-06, + "loss": 0.485, "step": 2325 }, { "epoch": 7.45, - "grad_norm": 11.728538513183594, - "learning_rate": 2.547770700636943e-06, - "loss": 0.4371, + "grad_norm": 3.7861547470092773, + "learning_rate": 6.2738853503184715e-06, + "loss": 0.4473, "step": 2340 }, { "epoch": 7.5, - "grad_norm": 17.007251739501953, - "learning_rate": 2.5e-06, - "loss": 0.4969, + "grad_norm": 5.255743503570557, + "learning_rate": 6.25e-06, + "loss": 0.4558, "step": 2355 }, { "epoch": 7.55, - "grad_norm": 15.3156099319458, - "learning_rate": 2.4522292993630575e-06, - "loss": 0.5297, + "grad_norm": 3.864764928817749, + "learning_rate": 6.226114649681529e-06, + "loss": 0.4522, "step": 2370 }, { "epoch": 7.6, - "grad_norm": 13.694135665893555, - "learning_rate": 2.4044585987261147e-06, - "loss": 0.4651, + "grad_norm": 3.3340399265289307, + "learning_rate": 6.202229299363057e-06, + "loss": 0.4358, "step": 2385 }, { "epoch": 7.64, - "grad_norm": 16.395017623901367, - "learning_rate": 2.356687898089172e-06, - "loss": 0.5751, + "grad_norm": 6.069746971130371, + "learning_rate": 6.178343949044586e-06, + "loss": 0.4702, "step": 2400 }, { "epoch": 7.64, - "eval_accuracy": 0.7904761904761904, - "eval_loss": 0.6376513838768005, - "eval_runtime": 13.653, - "eval_samples_per_second": 76.906, - "eval_steps_per_second": 9.668, + "eval_accuracy": 0.8152380952380952, + "eval_loss": 0.6736096739768982, + "eval_runtime": 14.301, + "eval_samples_per_second": 73.421, + "eval_steps_per_second": 9.23, "step": 2400 }, { "epoch": 7.69, - "grad_norm": 21.47723388671875, - "learning_rate": 2.3089171974522297e-06, - "loss": 0.4762, + "grad_norm": 8.816964149475098, + "learning_rate": 6.154458598726115e-06, + "loss": 0.4847, "step": 2415 }, { "epoch": 7.74, - "grad_norm": 17.56719970703125, - "learning_rate": 2.261146496815287e-06, - "loss": 0.6217, + "grad_norm": 2.001786708831787, + "learning_rate": 6.1305732484076445e-06, + "loss": 0.5597, "step": 2430 }, { "epoch": 7.79, - "grad_norm": 12.036867141723633, - "learning_rate": 2.213375796178344e-06, - "loss": 0.4727, + "grad_norm": 4.528175354003906, + "learning_rate": 6.106687898089172e-06, + "loss": 0.4268, "step": 2445 }, { "epoch": 7.83, - "grad_norm": 18.971595764160156, - "learning_rate": 2.1656050955414015e-06, - "loss": 0.4601, + "grad_norm": 5.561005115509033, + "learning_rate": 6.0828025477707005e-06, + "loss": 0.4001, "step": 2460 }, { "epoch": 7.88, - "grad_norm": 18.308382034301758, - "learning_rate": 2.1178343949044587e-06, - "loss": 0.5086, + "grad_norm": 5.51016902923584, + "learning_rate": 6.05891719745223e-06, + "loss": 0.4159, "step": 2475 }, { "epoch": 7.93, - "grad_norm": 13.486546516418457, - "learning_rate": 2.070063694267516e-06, - "loss": 0.4743, + "grad_norm": 2.6793384552001953, + "learning_rate": 6.035031847133759e-06, + "loss": 0.4303, "step": 2490 }, { "epoch": 7.96, - "eval_accuracy": 0.7866666666666666, - "eval_loss": 0.6417487859725952, - "eval_runtime": 13.707, - "eval_samples_per_second": 76.603, - "eval_steps_per_second": 9.63, + "eval_accuracy": 0.8104761904761905, + "eval_loss": 0.6692745685577393, + "eval_runtime": 14.1903, + "eval_samples_per_second": 73.994, + "eval_steps_per_second": 9.302, "step": 2500 }, { "epoch": 7.98, - "grad_norm": 12.4083833694458, - "learning_rate": 2.0222929936305737e-06, - "loss": 0.453, + "grad_norm": 3.1929242610931396, + "learning_rate": 6.011146496815287e-06, + "loss": 0.43, "step": 2505 }, { "epoch": 8.03, - "grad_norm": 10.967087745666504, - "learning_rate": 1.974522292993631e-06, - "loss": 0.4937, + "grad_norm": 3.281541585922241, + "learning_rate": 5.987261146496816e-06, + "loss": 0.4189, "step": 2520 }, { "epoch": 8.07, - "grad_norm": 11.663314819335938, - "learning_rate": 1.926751592356688e-06, - "loss": 0.5115, + "grad_norm": 4.396710395812988, + "learning_rate": 5.963375796178345e-06, + "loss": 0.4211, "step": 2535 }, { "epoch": 8.12, - "grad_norm": 10.820151329040527, - "learning_rate": 1.8789808917197455e-06, - "loss": 0.4281, + "grad_norm": 2.817596435546875, + "learning_rate": 5.9394904458598736e-06, + "loss": 0.4409, "step": 2550 }, { "epoch": 8.17, - "grad_norm": 15.378673553466797, - "learning_rate": 1.8312101910828025e-06, - "loss": 0.5341, + "grad_norm": 5.4012250900268555, + "learning_rate": 5.9156050955414016e-06, + "loss": 0.4407, "step": 2565 }, { "epoch": 8.22, - "grad_norm": 18.212982177734375, - "learning_rate": 1.78343949044586e-06, - "loss": 0.5331, + "grad_norm": 8.333674430847168, + "learning_rate": 5.89171974522293e-06, + "loss": 0.4267, "step": 2580 }, { "epoch": 8.26, - "grad_norm": 25.97978401184082, - "learning_rate": 1.7356687898089172e-06, - "loss": 0.4519, + "grad_norm": 3.2275092601776123, + "learning_rate": 5.867834394904459e-06, + "loss": 0.3916, "step": 2595 }, { "epoch": 8.28, - "eval_accuracy": 0.7895238095238095, - "eval_loss": 0.6309406161308289, - "eval_runtime": 13.8301, - "eval_samples_per_second": 75.921, - "eval_steps_per_second": 9.544, + "eval_accuracy": 0.8238095238095238, + "eval_loss": 0.6487377882003784, + "eval_runtime": 13.7782, + "eval_samples_per_second": 76.208, + "eval_steps_per_second": 9.58, "step": 2600 }, { "epoch": 8.31, - "grad_norm": 16.035921096801758, - "learning_rate": 1.6878980891719745e-06, - "loss": 0.4434, + "grad_norm": 6.641006946563721, + "learning_rate": 5.843949044585988e-06, + "loss": 0.4032, "step": 2610 }, { "epoch": 8.36, - "grad_norm": 33.498626708984375, - "learning_rate": 1.640127388535032e-06, - "loss": 0.5621, + "grad_norm": 7.206479072570801, + "learning_rate": 5.820063694267516e-06, + "loss": 0.4515, "step": 2625 }, { "epoch": 8.41, - "grad_norm": 22.457271575927734, - "learning_rate": 1.5923566878980892e-06, - "loss": 0.4843, + "grad_norm": 8.875749588012695, + "learning_rate": 5.796178343949045e-06, + "loss": 0.4438, "step": 2640 }, { "epoch": 8.46, - "grad_norm": 20.040433883666992, - "learning_rate": 1.5445859872611465e-06, - "loss": 0.5306, + "grad_norm": 8.141197204589844, + "learning_rate": 5.772292993630574e-06, + "loss": 0.439, "step": 2655 }, { "epoch": 8.5, - "grad_norm": 25.173227310180664, - "learning_rate": 1.496815286624204e-06, - "loss": 0.5078, + "grad_norm": 5.922451019287109, + "learning_rate": 5.748407643312103e-06, + "loss": 0.4322, "step": 2670 }, { "epoch": 8.55, - "grad_norm": 14.426128387451172, - "learning_rate": 1.4490445859872612e-06, - "loss": 0.6197, + "grad_norm": 5.907478332519531, + "learning_rate": 5.724522292993631e-06, + "loss": 0.5118, "step": 2685 }, { "epoch": 8.6, - "grad_norm": 21.540132522583008, - "learning_rate": 1.4012738853503185e-06, - "loss": 0.5058, + "grad_norm": 7.069815635681152, + "learning_rate": 5.7006369426751594e-06, + "loss": 0.4002, "step": 2700 }, { "epoch": 8.6, - "eval_accuracy": 0.7866666666666666, - "eval_loss": 0.6453108787536621, - "eval_runtime": 13.9306, - "eval_samples_per_second": 75.373, - "eval_steps_per_second": 9.476, + "eval_accuracy": 0.8161904761904762, + "eval_loss": 0.6660885214805603, + "eval_runtime": 14.4979, + "eval_samples_per_second": 72.424, + "eval_steps_per_second": 9.105, "step": 2700 }, { "epoch": 8.65, - "grad_norm": 17.211627960205078, - "learning_rate": 1.353503184713376e-06, - "loss": 0.6111, + "grad_norm": 5.684536457061768, + "learning_rate": 5.676751592356688e-06, + "loss": 0.4734, "step": 2715 }, { "epoch": 8.69, - "grad_norm": 12.991823196411133, - "learning_rate": 1.3057324840764332e-06, - "loss": 0.4392, + "grad_norm": 6.170379161834717, + "learning_rate": 5.652866242038217e-06, + "loss": 0.3783, "step": 2730 }, { "epoch": 8.74, - "grad_norm": 10.215910911560059, - "learning_rate": 1.2579617834394905e-06, - "loss": 0.5505, + "grad_norm": 4.068530082702637, + "learning_rate": 5.628980891719745e-06, + "loss": 0.4746, "step": 2745 }, { "epoch": 8.79, - "grad_norm": 16.872520446777344, - "learning_rate": 1.210191082802548e-06, - "loss": 0.4304, + "grad_norm": 4.95632791519165, + "learning_rate": 5.605095541401274e-06, + "loss": 0.3979, "step": 2760 }, { "epoch": 8.84, - "grad_norm": 14.673178672790527, - "learning_rate": 1.1624203821656052e-06, - "loss": 0.4886, + "grad_norm": 2.6091508865356445, + "learning_rate": 5.581210191082803e-06, + "loss": 0.4066, "step": 2775 }, { "epoch": 8.89, - "grad_norm": 17.11809730529785, - "learning_rate": 1.1146496815286625e-06, - "loss": 0.4754, + "grad_norm": 4.2863030433654785, + "learning_rate": 5.5573248407643325e-06, + "loss": 0.3965, "step": 2790 }, { "epoch": 8.92, - "eval_accuracy": 0.7904761904761904, - "eval_loss": 0.6414105296134949, - "eval_runtime": 13.6725, - "eval_samples_per_second": 76.797, - "eval_steps_per_second": 9.654, + "eval_accuracy": 0.8142857142857143, + "eval_loss": 0.6611486673355103, + "eval_runtime": 14.1692, + "eval_samples_per_second": 74.104, + "eval_steps_per_second": 9.316, "step": 2800 }, { "epoch": 8.93, - "grad_norm": 10.194275856018066, - "learning_rate": 1.06687898089172e-06, - "loss": 0.5041, + "grad_norm": 4.6702561378479, + "learning_rate": 5.53343949044586e-06, + "loss": 0.4176, "step": 2805 }, { "epoch": 8.98, - "grad_norm": 24.478836059570312, - "learning_rate": 1.0191082802547772e-06, - "loss": 0.5016, + "grad_norm": 7.670042037963867, + "learning_rate": 5.5095541401273885e-06, + "loss": 0.4, "step": 2820 }, { "epoch": 9.03, - "grad_norm": 16.150724411010742, - "learning_rate": 9.713375796178345e-07, - "loss": 0.486, + "grad_norm": 5.234184741973877, + "learning_rate": 5.485668789808918e-06, + "loss": 0.4021, "step": 2835 }, { "epoch": 9.08, - "grad_norm": 15.390515327453613, - "learning_rate": 9.235668789808917e-07, - "loss": 0.5024, + "grad_norm": 5.475564479827881, + "learning_rate": 5.461783439490447e-06, + "loss": 0.3984, "step": 2850 }, { "epoch": 9.12, - "grad_norm": 20.307998657226562, - "learning_rate": 8.757961783439491e-07, - "loss": 0.4982, + "grad_norm": 3.48297119140625, + "learning_rate": 5.437898089171974e-06, + "loss": 0.4107, "step": 2865 }, { "epoch": 9.17, - "grad_norm": 18.18573760986328, - "learning_rate": 8.280254777070064e-07, - "loss": 0.5054, + "grad_norm": 5.547008514404297, + "learning_rate": 5.414012738853504e-06, + "loss": 0.4205, "step": 2880 }, { "epoch": 9.22, - "grad_norm": 15.57632827758789, - "learning_rate": 7.802547770700637e-07, - "loss": 0.4637, + "grad_norm": 5.729915142059326, + "learning_rate": 5.390127388535033e-06, + "loss": 0.3946, "step": 2895 }, { "epoch": 9.24, - "eval_accuracy": 0.7904761904761904, - "eval_loss": 0.6329751014709473, - "eval_runtime": 13.6535, - "eval_samples_per_second": 76.904, - "eval_steps_per_second": 9.668, + "eval_accuracy": 0.8142857142857143, + "eval_loss": 0.652323842048645, + "eval_runtime": 14.2667, + "eval_samples_per_second": 73.598, + "eval_steps_per_second": 9.252, "step": 2900 }, { "epoch": 9.27, - "grad_norm": 11.808470726013184, - "learning_rate": 7.324840764331211e-07, - "loss": 0.5804, + "grad_norm": 3.7174131870269775, + "learning_rate": 5.3662420382165615e-06, + "loss": 0.4326, "step": 2910 }, { "epoch": 9.32, - "grad_norm": 19.781538009643555, - "learning_rate": 6.847133757961784e-07, - "loss": 0.5309, + "grad_norm": 4.2326130867004395, + "learning_rate": 5.3423566878980895e-06, + "loss": 0.3826, "step": 2925 }, { "epoch": 9.36, - "grad_norm": 15.966341972351074, - "learning_rate": 6.369426751592357e-07, - "loss": 0.536, + "grad_norm": 6.576159954071045, + "learning_rate": 5.318471337579618e-06, + "loss": 0.4053, "step": 2940 }, { "epoch": 9.41, - "grad_norm": 11.989510536193848, - "learning_rate": 5.89171974522293e-07, - "loss": 0.4474, + "grad_norm": 3.004859685897827, + "learning_rate": 5.294585987261147e-06, + "loss": 0.3833, "step": 2955 }, { "epoch": 9.46, - "grad_norm": 13.803847312927246, - "learning_rate": 5.414012738853504e-07, - "loss": 0.4868, + "grad_norm": 4.646060943603516, + "learning_rate": 5.270700636942676e-06, + "loss": 0.3586, "step": 2970 }, { "epoch": 9.51, - "grad_norm": 16.266407012939453, - "learning_rate": 4.936305732484077e-07, - "loss": 0.4453, + "grad_norm": 3.8182735443115234, + "learning_rate": 5.246815286624204e-06, + "loss": 0.3463, "step": 2985 }, { "epoch": 9.55, - "grad_norm": 17.96660804748535, - "learning_rate": 4.45859872611465e-07, - "loss": 0.5028, + "grad_norm": 4.330494403839111, + "learning_rate": 5.222929936305733e-06, + "loss": 0.3794, "step": 3000 }, { "epoch": 9.55, - "eval_accuracy": 0.7857142857142857, - "eval_loss": 0.641762375831604, - "eval_runtime": 13.8591, - "eval_samples_per_second": 75.763, - "eval_steps_per_second": 9.524, + "eval_accuracy": 0.8047619047619048, + "eval_loss": 0.6615909934043884, + "eval_runtime": 13.9408, + "eval_samples_per_second": 75.318, + "eval_steps_per_second": 9.469, "step": 3000 }, { "epoch": 9.6, - "grad_norm": 15.89122200012207, - "learning_rate": 3.980891719745223e-07, - "loss": 0.4376, + "grad_norm": 5.949002742767334, + "learning_rate": 5.199044585987262e-06, + "loss": 0.3572, "step": 3015 }, { "epoch": 9.65, - "grad_norm": 18.61841583251953, - "learning_rate": 3.503184713375796e-07, - "loss": 0.3695, + "grad_norm": 4.592197418212891, + "learning_rate": 5.175159235668791e-06, + "loss": 0.3428, "step": 3030 }, { "epoch": 9.7, - "grad_norm": 19.915699005126953, - "learning_rate": 3.02547770700637e-07, - "loss": 0.4777, + "grad_norm": 6.483883857727051, + "learning_rate": 5.151273885350319e-06, + "loss": 0.4247, "step": 3045 }, { "epoch": 9.75, - "grad_norm": 22.503381729125977, - "learning_rate": 2.547770700636943e-07, - "loss": 0.6298, + "grad_norm": 4.930222988128662, + "learning_rate": 5.1273885350318474e-06, + "loss": 0.467, "step": 3060 }, { "epoch": 9.79, - "grad_norm": 16.984233856201172, - "learning_rate": 2.070063694267516e-07, - "loss": 0.4051, + "grad_norm": 4.973721504211426, + "learning_rate": 5.103503184713376e-06, + "loss": 0.3293, "step": 3075 }, { "epoch": 9.84, - "grad_norm": 19.879077911376953, - "learning_rate": 1.5923566878980893e-07, - "loss": 0.4227, + "grad_norm": 3.4401309490203857, + "learning_rate": 5.079617834394905e-06, + "loss": 0.3257, "step": 3090 }, { "epoch": 9.87, - "eval_accuracy": 0.7914285714285715, - "eval_loss": 0.6412155628204346, - "eval_runtime": 13.7603, - "eval_samples_per_second": 76.307, - "eval_steps_per_second": 9.593, + "eval_accuracy": 0.8028571428571428, + "eval_loss": 0.6717351675033569, + "eval_runtime": 14.2902, + "eval_samples_per_second": 73.477, + "eval_steps_per_second": 9.237, "step": 3100 }, { "epoch": 9.89, - "grad_norm": 18.370866775512695, - "learning_rate": 1.1146496815286625e-07, - "loss": 0.4079, + "grad_norm": 3.4736809730529785, + "learning_rate": 5.055732484076433e-06, + "loss": 0.3339, "step": 3105 }, { "epoch": 9.94, - "grad_norm": 13.279521942138672, - "learning_rate": 6.369426751592358e-08, - "loss": 0.4035, + "grad_norm": 9.58438777923584, + "learning_rate": 5.031847133757962e-06, + "loss": 0.3268, "step": 3120 }, { "epoch": 9.98, - "grad_norm": 16.93092155456543, - "learning_rate": 1.5923566878980894e-08, - "loss": 0.4605, + "grad_norm": 5.36261510848999, + "learning_rate": 5.007961783439491e-06, + "loss": 0.3418, "step": 3135 }, { - "epoch": 10.0, - "step": 3140, - "total_flos": 3.265548125287219e+18, - "train_loss": 0.7155859537944672, - "train_runtime": 2626.1064, - "train_samples_per_second": 38.212, - "train_steps_per_second": 1.196 + "epoch": 10.03, + "grad_norm": 4.828367710113525, + "learning_rate": 4.98407643312102e-06, + "loss": 0.4032, + "step": 3150 + }, + { + "epoch": 10.08, + "grad_norm": 3.9634041786193848, + "learning_rate": 4.960191082802548e-06, + "loss": 0.3709, + "step": 3165 + }, + { + "epoch": 10.13, + "grad_norm": 4.182370185852051, + "learning_rate": 4.9363057324840765e-06, + "loss": 0.4119, + "step": 3180 + }, + { + "epoch": 10.18, + "grad_norm": 2.144193410873413, + "learning_rate": 4.912420382165605e-06, + "loss": 0.4175, + "step": 3195 + }, + { + "epoch": 10.19, + "eval_accuracy": 0.8057142857142857, + "eval_loss": 0.6530217528343201, + "eval_runtime": 14.3184, + "eval_samples_per_second": 73.332, + "eval_steps_per_second": 9.219, + "step": 3200 + }, + { + "epoch": 10.22, + "grad_norm": 6.0914716720581055, + "learning_rate": 4.888535031847134e-06, + "loss": 0.3614, + "step": 3210 + }, + { + "epoch": 10.27, + "grad_norm": 7.094061851501465, + "learning_rate": 4.864649681528662e-06, + "loss": 0.4015, + "step": 3225 + }, + { + "epoch": 10.32, + "grad_norm": 6.06875467300415, + "learning_rate": 4.840764331210192e-06, + "loss": 0.3543, + "step": 3240 + }, + { + "epoch": 10.37, + "grad_norm": 5.071672439575195, + "learning_rate": 4.81687898089172e-06, + "loss": 0.3798, + "step": 3255 + }, + { + "epoch": 10.41, + "grad_norm": 3.41487979888916, + "learning_rate": 4.792993630573249e-06, + "loss": 0.3878, + "step": 3270 + }, + { + "epoch": 10.46, + "grad_norm": 6.576408386230469, + "learning_rate": 4.7691082802547775e-06, + "loss": 0.4392, + "step": 3285 + }, + { + "epoch": 10.51, + "grad_norm": 4.69280481338501, + "learning_rate": 4.745222929936306e-06, + "loss": 0.3559, + "step": 3300 + }, + { + "epoch": 10.51, + "eval_accuracy": 0.7885714285714286, + "eval_loss": 0.6882754564285278, + "eval_runtime": 14.2678, + "eval_samples_per_second": 73.592, + "eval_steps_per_second": 9.252, + "step": 3300 + }, + { + "epoch": 10.56, + "grad_norm": 3.288024663925171, + "learning_rate": 4.721337579617834e-06, + "loss": 0.36, + "step": 3315 + }, + { + "epoch": 10.61, + "grad_norm": 5.169658660888672, + "learning_rate": 4.697452229299363e-06, + "loss": 0.3983, + "step": 3330 + }, + { + "epoch": 10.65, + "grad_norm": 6.143041610717773, + "learning_rate": 4.673566878980892e-06, + "loss": 0.3467, + "step": 3345 + }, + { + "epoch": 10.7, + "grad_norm": 4.744012355804443, + "learning_rate": 4.649681528662421e-06, + "loss": 0.3497, + "step": 3360 + }, + { + "epoch": 10.75, + "grad_norm": 3.161777973175049, + "learning_rate": 4.625796178343949e-06, + "loss": 0.3649, + "step": 3375 + }, + { + "epoch": 10.8, + "grad_norm": 5.027173042297363, + "learning_rate": 4.601910828025479e-06, + "loss": 0.3824, + "step": 3390 + }, + { + "epoch": 10.83, + "eval_accuracy": 0.8, + "eval_loss": 0.6610695719718933, + "eval_runtime": 14.447, + "eval_samples_per_second": 72.68, + "eval_steps_per_second": 9.137, + "step": 3400 + }, + { + "epoch": 10.84, + "grad_norm": 4.828523635864258, + "learning_rate": 4.5780254777070066e-06, + "loss": 0.3232, + "step": 3405 + }, + { + "epoch": 10.89, + "grad_norm": 6.017237186431885, + "learning_rate": 4.554140127388535e-06, + "loss": 0.3464, + "step": 3420 + }, + { + "epoch": 10.94, + "grad_norm": 4.829287528991699, + "learning_rate": 4.530254777070064e-06, + "loss": 0.3401, + "step": 3435 + }, + { + "epoch": 10.99, + "grad_norm": 4.1853485107421875, + "learning_rate": 4.506369426751593e-06, + "loss": 0.3005, + "step": 3450 + }, + { + "epoch": 11.04, + "grad_norm": 10.277836799621582, + "learning_rate": 4.482484076433121e-06, + "loss": 0.3473, + "step": 3465 + }, + { + "epoch": 11.08, + "grad_norm": 3.646463394165039, + "learning_rate": 4.45859872611465e-06, + "loss": 0.2844, + "step": 3480 + }, + { + "epoch": 11.13, + "grad_norm": 5.018289089202881, + "learning_rate": 4.434713375796179e-06, + "loss": 0.3589, + "step": 3495 + }, + { + "epoch": 11.15, + "eval_accuracy": 0.8019047619047619, + "eval_loss": 0.665899932384491, + "eval_runtime": 13.9371, + "eval_samples_per_second": 75.339, + "eval_steps_per_second": 9.471, + "step": 3500 + }, + { + "epoch": 11.18, + "grad_norm": 5.166079044342041, + "learning_rate": 4.410828025477708e-06, + "loss": 0.3608, + "step": 3510 + }, + { + "epoch": 11.23, + "grad_norm": 3.933661937713623, + "learning_rate": 4.386942675159236e-06, + "loss": 0.323, + "step": 3525 + }, + { + "epoch": 11.27, + "grad_norm": 6.12958288192749, + "learning_rate": 4.3630573248407645e-06, + "loss": 0.4135, + "step": 3540 + }, + { + "epoch": 11.32, + "grad_norm": 8.609708786010742, + "learning_rate": 4.339171974522293e-06, + "loss": 0.3714, + "step": 3555 + }, + { + "epoch": 11.37, + "grad_norm": 5.927934646606445, + "learning_rate": 4.315286624203822e-06, + "loss": 0.3629, + "step": 3570 + }, + { + "epoch": 11.42, + "grad_norm": 3.802996873855591, + "learning_rate": 4.29140127388535e-06, + "loss": 0.3184, + "step": 3585 + }, + { + "epoch": 11.46, + "grad_norm": 6.791957378387451, + "learning_rate": 4.26751592356688e-06, + "loss": 0.3299, + "step": 3600 + }, + { + "epoch": 11.46, + "eval_accuracy": 0.7961904761904762, + "eval_loss": 0.681881844997406, + "eval_runtime": 14.3394, + "eval_samples_per_second": 73.225, + "eval_steps_per_second": 9.205, + "step": 3600 + }, + { + "epoch": 11.51, + "grad_norm": 6.895874977111816, + "learning_rate": 4.243630573248408e-06, + "loss": 0.3421, + "step": 3615 + }, + { + "epoch": 11.56, + "grad_norm": 2.5626707077026367, + "learning_rate": 4.219745222929937e-06, + "loss": 0.333, + "step": 3630 + }, + { + "epoch": 11.61, + "grad_norm": 6.374701023101807, + "learning_rate": 4.1958598726114655e-06, + "loss": 0.3841, + "step": 3645 + }, + { + "epoch": 11.66, + "grad_norm": 3.2361741065979004, + "learning_rate": 4.171974522292994e-06, + "loss": 0.3102, + "step": 3660 + }, + { + "epoch": 11.7, + "grad_norm": 4.866982936859131, + "learning_rate": 4.148089171974522e-06, + "loss": 0.3589, + "step": 3675 + }, + { + "epoch": 11.75, + "grad_norm": 8.816326141357422, + "learning_rate": 4.124203821656051e-06, + "loss": 0.3736, + "step": 3690 + }, + { + "epoch": 11.78, + "eval_accuracy": 0.8114285714285714, + "eval_loss": 0.6405251026153564, + "eval_runtime": 14.9862, + "eval_samples_per_second": 70.064, + "eval_steps_per_second": 8.808, + "step": 3700 + }, + { + "epoch": 11.8, + "grad_norm": 5.2895121574401855, + "learning_rate": 4.10031847133758e-06, + "loss": 0.354, + "step": 3705 + }, + { + "epoch": 11.85, + "grad_norm": 7.5266499519348145, + "learning_rate": 4.076433121019109e-06, + "loss": 0.3051, + "step": 3720 + }, + { + "epoch": 11.89, + "grad_norm": 4.967874050140381, + "learning_rate": 4.052547770700637e-06, + "loss": 0.3428, + "step": 3735 + }, + { + "epoch": 11.94, + "grad_norm": 4.581750392913818, + "learning_rate": 4.0286624203821666e-06, + "loss": 0.3441, + "step": 3750 + }, + { + "epoch": 11.99, + "grad_norm": 3.4178411960601807, + "learning_rate": 4.0047770700636946e-06, + "loss": 0.3154, + "step": 3765 + }, + { + "epoch": 12.04, + "grad_norm": 7.328088760375977, + "learning_rate": 3.980891719745223e-06, + "loss": 0.3493, + "step": 3780 + }, + { + "epoch": 12.09, + "grad_norm": 24.022367477416992, + "learning_rate": 3.957006369426752e-06, + "loss": 0.3576, + "step": 3795 + }, + { + "epoch": 12.1, + "eval_accuracy": 0.7961904761904762, + "eval_loss": 0.6725224256515503, + "eval_runtime": 14.2063, + "eval_samples_per_second": 73.911, + "eval_steps_per_second": 9.292, + "step": 3800 + }, + { + "epoch": 12.13, + "grad_norm": 2.5664048194885254, + "learning_rate": 3.933121019108281e-06, + "loss": 0.3368, + "step": 3810 + }, + { + "epoch": 12.18, + "grad_norm": 7.625591278076172, + "learning_rate": 3.909235668789809e-06, + "loss": 0.3103, + "step": 3825 + }, + { + "epoch": 12.23, + "grad_norm": 3.343187093734741, + "learning_rate": 3.885350318471338e-06, + "loss": 0.3318, + "step": 3840 + }, + { + "epoch": 12.28, + "grad_norm": 7.195040702819824, + "learning_rate": 3.861464968152867e-06, + "loss": 0.2984, + "step": 3855 + }, + { + "epoch": 12.32, + "grad_norm": 5.504820346832275, + "learning_rate": 3.837579617834396e-06, + "loss": 0.2789, + "step": 3870 + }, + { + "epoch": 12.37, + "grad_norm": 7.319380760192871, + "learning_rate": 3.813694267515924e-06, + "loss": 0.289, + "step": 3885 + }, + { + "epoch": 12.42, + "grad_norm": 8.737920761108398, + "learning_rate": 3.789808917197453e-06, + "loss": 0.3454, + "step": 3900 + }, + { + "epoch": 12.42, + "eval_accuracy": 0.7942857142857143, + "eval_loss": 0.7025014162063599, + "eval_runtime": 13.9336, + "eval_samples_per_second": 75.358, + "eval_steps_per_second": 9.474, + "step": 3900 + }, + { + "epoch": 12.47, + "grad_norm": 6.828339576721191, + "learning_rate": 3.7659235668789813e-06, + "loss": 0.3705, + "step": 3915 + }, + { + "epoch": 12.52, + "grad_norm": 8.453927993774414, + "learning_rate": 3.7420382165605097e-06, + "loss": 0.3016, + "step": 3930 + }, + { + "epoch": 12.56, + "grad_norm": 4.459524631500244, + "learning_rate": 3.7181528662420386e-06, + "loss": 0.2653, + "step": 3945 + }, + { + "epoch": 12.61, + "grad_norm": 1.677200436592102, + "learning_rate": 3.694267515923567e-06, + "loss": 0.3191, + "step": 3960 + }, + { + "epoch": 12.66, + "grad_norm": 6.538283824920654, + "learning_rate": 3.670382165605096e-06, + "loss": 0.3162, + "step": 3975 + }, + { + "epoch": 12.71, + "grad_norm": 6.386500358581543, + "learning_rate": 3.6464968152866242e-06, + "loss": 0.3049, + "step": 3990 + }, + { + "epoch": 12.74, + "eval_accuracy": 0.8133333333333334, + "eval_loss": 0.6439189314842224, + "eval_runtime": 14.144, + "eval_samples_per_second": 74.237, + "eval_steps_per_second": 9.333, + "step": 4000 + }, + { + "epoch": 12.75, + "grad_norm": 10.936306953430176, + "learning_rate": 3.622611464968153e-06, + "loss": 0.2914, + "step": 4005 + }, + { + "epoch": 12.8, + "grad_norm": 7.365331649780273, + "learning_rate": 3.5987261146496815e-06, + "loss": 0.2969, + "step": 4020 + }, + { + "epoch": 12.85, + "grad_norm": 3.8908514976501465, + "learning_rate": 3.5748407643312103e-06, + "loss": 0.3224, + "step": 4035 + }, + { + "epoch": 12.9, + "grad_norm": 5.096860885620117, + "learning_rate": 3.5509554140127388e-06, + "loss": 0.346, + "step": 4050 + }, + { + "epoch": 12.95, + "grad_norm": 9.797178268432617, + "learning_rate": 3.527070063694268e-06, + "loss": 0.3392, + "step": 4065 + }, + { + "epoch": 12.99, + "grad_norm": 1.396338939666748, + "learning_rate": 3.5031847133757964e-06, + "loss": 0.3111, + "step": 4080 + }, + { + "epoch": 13.04, + "grad_norm": 5.88714599609375, + "learning_rate": 3.4792993630573253e-06, + "loss": 0.3363, + "step": 4095 + }, + { + "epoch": 13.06, + "eval_accuracy": 0.8142857142857143, + "eval_loss": 0.6351959109306335, + "eval_runtime": 14.1319, + "eval_samples_per_second": 74.3, + "eval_steps_per_second": 9.341, + "step": 4100 + }, + { + "epoch": 13.09, + "grad_norm": 3.2820403575897217, + "learning_rate": 3.4554140127388537e-06, + "loss": 0.3142, + "step": 4110 + }, + { + "epoch": 13.14, + "grad_norm": 7.780394077301025, + "learning_rate": 3.4315286624203825e-06, + "loss": 0.355, + "step": 4125 + }, + { + "epoch": 13.18, + "grad_norm": 4.955718517303467, + "learning_rate": 3.407643312101911e-06, + "loss": 0.2956, + "step": 4140 + }, + { + "epoch": 13.23, + "grad_norm": 3.0316593647003174, + "learning_rate": 3.38375796178344e-06, + "loss": 0.2811, + "step": 4155 + }, + { + "epoch": 13.28, + "grad_norm": 7.823929786682129, + "learning_rate": 3.3598726114649682e-06, + "loss": 0.3386, + "step": 4170 + }, + { + "epoch": 13.33, + "grad_norm": 6.2089457511901855, + "learning_rate": 3.335987261146497e-06, + "loss": 0.3011, + "step": 4185 + }, + { + "epoch": 13.38, + "grad_norm": 4.865994453430176, + "learning_rate": 3.3121019108280255e-06, + "loss": 0.3273, + "step": 4200 + }, + { + "epoch": 13.38, + "eval_accuracy": 0.7885714285714286, + "eval_loss": 0.6794772148132324, + "eval_runtime": 13.9608, + "eval_samples_per_second": 75.21, + "eval_steps_per_second": 9.455, + "step": 4200 + }, + { + "epoch": 13.42, + "grad_norm": 8.2437105178833, + "learning_rate": 3.2882165605095543e-06, + "loss": 0.342, + "step": 4215 + }, + { + "epoch": 13.47, + "grad_norm": 6.45313835144043, + "learning_rate": 3.2643312101910827e-06, + "loss": 0.2899, + "step": 4230 + }, + { + "epoch": 13.52, + "grad_norm": 5.616313457489014, + "learning_rate": 3.240445859872612e-06, + "loss": 0.2714, + "step": 4245 + }, + { + "epoch": 13.57, + "grad_norm": 6.7722554206848145, + "learning_rate": 3.2165605095541404e-06, + "loss": 0.3199, + "step": 4260 + }, + { + "epoch": 13.61, + "grad_norm": 3.373429775238037, + "learning_rate": 3.1926751592356693e-06, + "loss": 0.309, + "step": 4275 + }, + { + "epoch": 13.66, + "grad_norm": 6.058035373687744, + "learning_rate": 3.1687898089171977e-06, + "loss": 0.283, + "step": 4290 + }, + { + "epoch": 13.69, + "eval_accuracy": 0.8, + "eval_loss": 0.6704856157302856, + "eval_runtime": 14.0603, + "eval_samples_per_second": 74.678, + "eval_steps_per_second": 9.388, + "step": 4300 + }, + { + "epoch": 13.71, + "grad_norm": 11.277039527893066, + "learning_rate": 3.1449044585987265e-06, + "loss": 0.3114, + "step": 4305 + }, + { + "epoch": 13.76, + "grad_norm": 6.542344093322754, + "learning_rate": 3.121019108280255e-06, + "loss": 0.2487, + "step": 4320 + }, + { + "epoch": 13.81, + "grad_norm": 4.342966556549072, + "learning_rate": 3.097133757961784e-06, + "loss": 0.3076, + "step": 4335 + }, + { + "epoch": 13.85, + "grad_norm": 1.972347617149353, + "learning_rate": 3.0732484076433122e-06, + "loss": 0.2782, + "step": 4350 + }, + { + "epoch": 13.9, + "grad_norm": 5.91991662979126, + "learning_rate": 3.049363057324841e-06, + "loss": 0.3133, + "step": 4365 + }, + { + "epoch": 13.95, + "grad_norm": 5.045269012451172, + "learning_rate": 3.0254777070063695e-06, + "loss": 0.2368, + "step": 4380 + }, + { + "epoch": 14.0, + "grad_norm": 6.446601867675781, + "learning_rate": 3.0015923566878983e-06, + "loss": 0.2607, + "step": 4395 + }, + { + "epoch": 14.01, + "eval_accuracy": 0.7914285714285715, + "eval_loss": 0.6731985211372375, + "eval_runtime": 14.0854, + "eval_samples_per_second": 74.545, + "eval_steps_per_second": 9.371, + "step": 4400 + }, + { + "epoch": 14.04, + "grad_norm": 3.5464470386505127, + "learning_rate": 2.9777070063694267e-06, + "loss": 0.2699, + "step": 4410 + }, + { + "epoch": 14.09, + "grad_norm": 3.760664701461792, + "learning_rate": 2.953821656050956e-06, + "loss": 0.2393, + "step": 4425 + }, + { + "epoch": 14.14, + "grad_norm": 7.939091205596924, + "learning_rate": 2.9299363057324844e-06, + "loss": 0.3154, + "step": 4440 + }, + { + "epoch": 14.19, + "grad_norm": 5.330219745635986, + "learning_rate": 2.9060509554140133e-06, + "loss": 0.2955, + "step": 4455 + }, + { + "epoch": 14.24, + "grad_norm": 4.532066345214844, + "learning_rate": 2.8821656050955417e-06, + "loss": 0.3213, + "step": 4470 + }, + { + "epoch": 14.28, + "grad_norm": 9.095784187316895, + "learning_rate": 2.8582802547770705e-06, + "loss": 0.2958, + "step": 4485 + }, + { + "epoch": 14.33, + "grad_norm": 8.94389820098877, + "learning_rate": 2.834394904458599e-06, + "loss": 0.3174, + "step": 4500 + }, + { + "epoch": 14.33, + "eval_accuracy": 0.8047619047619048, + "eval_loss": 0.6691258549690247, + "eval_runtime": 13.9708, + "eval_samples_per_second": 75.157, + "eval_steps_per_second": 9.448, + "step": 4500 + }, + { + "epoch": 14.38, + "grad_norm": 7.957052230834961, + "learning_rate": 2.810509554140128e-06, + "loss": 0.3231, + "step": 4515 + }, + { + "epoch": 14.43, + "grad_norm": 4.776412487030029, + "learning_rate": 2.786624203821656e-06, + "loss": 0.3328, + "step": 4530 + }, + { + "epoch": 14.47, + "grad_norm": 7.478918552398682, + "learning_rate": 2.762738853503185e-06, + "loss": 0.3091, + "step": 4545 + }, + { + "epoch": 14.52, + "grad_norm": 5.6832990646362305, + "learning_rate": 2.7388535031847135e-06, + "loss": 0.2731, + "step": 4560 + }, + { + "epoch": 14.57, + "grad_norm": 6.8369951248168945, + "learning_rate": 2.7149681528662423e-06, + "loss": 0.296, + "step": 4575 + }, + { + "epoch": 14.62, + "grad_norm": 9.0847806930542, + "learning_rate": 2.6910828025477707e-06, + "loss": 0.3189, + "step": 4590 + }, + { + "epoch": 14.65, + "eval_accuracy": 0.8038095238095239, + "eval_loss": 0.6601914763450623, + "eval_runtime": 14.1296, + "eval_samples_per_second": 74.312, + "eval_steps_per_second": 9.342, + "step": 4600 + }, + { + "epoch": 14.67, + "grad_norm": 3.0164129734039307, + "learning_rate": 2.6671974522293e-06, + "loss": 0.2682, + "step": 4605 + }, + { + "epoch": 14.71, + "grad_norm": 23.393142700195312, + "learning_rate": 2.6433121019108284e-06, + "loss": 0.282, + "step": 4620 + }, + { + "epoch": 14.76, + "grad_norm": 2.419762372970581, + "learning_rate": 2.6194267515923573e-06, + "loss": 0.2954, + "step": 4635 + }, + { + "epoch": 14.81, + "grad_norm": 2.771768093109131, + "learning_rate": 2.5955414012738857e-06, + "loss": 0.3182, + "step": 4650 + }, + { + "epoch": 14.86, + "grad_norm": 4.787622928619385, + "learning_rate": 2.5716560509554145e-06, + "loss": 0.3212, + "step": 4665 + }, + { + "epoch": 14.9, + "grad_norm": 5.049059867858887, + "learning_rate": 2.547770700636943e-06, + "loss": 0.2473, + "step": 4680 + }, + { + "epoch": 14.95, + "grad_norm": 7.5213398933410645, + "learning_rate": 2.5238853503184718e-06, + "loss": 0.2862, + "step": 4695 + }, + { + "epoch": 14.97, + "eval_accuracy": 0.7933333333333333, + "eval_loss": 0.6800631880760193, + "eval_runtime": 14.0526, + "eval_samples_per_second": 74.719, + "eval_steps_per_second": 9.393, + "step": 4700 + }, + { + "epoch": 15.0, + "grad_norm": 7.287510395050049, + "learning_rate": 2.5e-06, + "loss": 0.3077, + "step": 4710 + }, + { + "epoch": 15.05, + "grad_norm": 6.637314319610596, + "learning_rate": 2.476114649681529e-06, + "loss": 0.2756, + "step": 4725 + }, + { + "epoch": 15.1, + "grad_norm": 3.2501256465911865, + "learning_rate": 2.4522292993630575e-06, + "loss": 0.2493, + "step": 4740 + }, + { + "epoch": 15.14, + "grad_norm": 5.583963871002197, + "learning_rate": 2.4283439490445863e-06, + "loss": 0.2417, + "step": 4755 + }, + { + "epoch": 15.19, + "grad_norm": 0.8829357624053955, + "learning_rate": 2.4044585987261147e-06, + "loss": 0.2662, + "step": 4770 + }, + { + "epoch": 15.24, + "grad_norm": 5.202518463134766, + "learning_rate": 2.3805732484076436e-06, + "loss": 0.2753, + "step": 4785 + }, + { + "epoch": 15.29, + "grad_norm": 6.174129486083984, + "learning_rate": 2.356687898089172e-06, + "loss": 0.2895, + "step": 4800 + }, + { + "epoch": 15.29, + "eval_accuracy": 0.8038095238095239, + "eval_loss": 0.6579437255859375, + "eval_runtime": 14.0203, + "eval_samples_per_second": 74.892, + "eval_steps_per_second": 9.415, + "step": 4800 + }, + { + "epoch": 15.33, + "grad_norm": 4.062588691711426, + "learning_rate": 2.332802547770701e-06, + "loss": 0.2899, + "step": 4815 + }, + { + "epoch": 15.38, + "grad_norm": 8.927217483520508, + "learning_rate": 2.3089171974522297e-06, + "loss": 0.2973, + "step": 4830 + }, + { + "epoch": 15.43, + "grad_norm": 4.327577590942383, + "learning_rate": 2.285031847133758e-06, + "loss": 0.3196, + "step": 4845 + }, + { + "epoch": 15.48, + "grad_norm": 7.842390537261963, + "learning_rate": 2.261146496815287e-06, + "loss": 0.3559, + "step": 4860 + }, + { + "epoch": 15.53, + "grad_norm": 2.109755754470825, + "learning_rate": 2.2372611464968154e-06, + "loss": 0.249, + "step": 4875 + }, + { + "epoch": 15.57, + "grad_norm": 4.3923420906066895, + "learning_rate": 2.213375796178344e-06, + "loss": 0.263, + "step": 4890 + }, + { + "epoch": 15.61, + "eval_accuracy": 0.8, + "eval_loss": 0.6687941551208496, + "eval_runtime": 13.924, + "eval_samples_per_second": 75.409, + "eval_steps_per_second": 9.48, + "step": 4900 + }, + { + "epoch": 15.62, + "grad_norm": 11.293251991271973, + "learning_rate": 2.189490445859873e-06, + "loss": 0.2603, + "step": 4905 + }, + { + "epoch": 15.67, + "grad_norm": 4.060614109039307, + "learning_rate": 2.1656050955414015e-06, + "loss": 0.3092, + "step": 4920 + }, + { + "epoch": 15.72, + "grad_norm": 7.007171154022217, + "learning_rate": 2.1417197452229303e-06, + "loss": 0.2563, + "step": 4935 + }, + { + "epoch": 15.76, + "grad_norm": 4.375155925750732, + "learning_rate": 2.1178343949044587e-06, + "loss": 0.2675, + "step": 4950 + }, + { + "epoch": 15.81, + "grad_norm": 8.038476943969727, + "learning_rate": 2.0939490445859876e-06, + "loss": 0.3577, + "step": 4965 + }, + { + "epoch": 15.86, + "grad_norm": 9.939512252807617, + "learning_rate": 2.070063694267516e-06, + "loss": 0.2964, + "step": 4980 + }, + { + "epoch": 15.91, + "grad_norm": 5.272069931030273, + "learning_rate": 2.046178343949045e-06, + "loss": 0.3214, + "step": 4995 + }, + { + "epoch": 15.92, + "eval_accuracy": 0.8057142857142857, + "eval_loss": 0.6546884775161743, + "eval_runtime": 14.1631, + "eval_samples_per_second": 74.137, + "eval_steps_per_second": 9.32, + "step": 5000 + }, + { + "epoch": 15.96, + "grad_norm": 1.2013658285140991, + "learning_rate": 2.0222929936305737e-06, + "loss": 0.2633, + "step": 5010 + }, + { + "epoch": 16.0, + "grad_norm": 8.995028495788574, + "learning_rate": 1.998407643312102e-06, + "loss": 0.2928, + "step": 5025 + }, + { + "epoch": 16.05, + "grad_norm": 8.143821716308594, + "learning_rate": 1.974522292993631e-06, + "loss": 0.3392, + "step": 5040 + }, + { + "epoch": 16.1, + "grad_norm": 4.234311103820801, + "learning_rate": 1.9506369426751593e-06, + "loss": 0.2818, + "step": 5055 + }, + { + "epoch": 16.15, + "grad_norm": 2.7548768520355225, + "learning_rate": 1.926751592356688e-06, + "loss": 0.2542, + "step": 5070 + }, + { + "epoch": 16.19, + "grad_norm": 8.989328384399414, + "learning_rate": 1.9028662420382168e-06, + "loss": 0.251, + "step": 5085 + }, + { + "epoch": 16.24, + "grad_norm": 7.30033540725708, + "learning_rate": 1.8789808917197455e-06, + "loss": 0.2867, + "step": 5100 + }, + { + "epoch": 16.24, + "eval_accuracy": 0.7923809523809524, + "eval_loss": 0.6775221824645996, + "eval_runtime": 14.1819, + "eval_samples_per_second": 74.038, + "eval_steps_per_second": 9.308, + "step": 5100 + }, + { + "epoch": 16.29, + "grad_norm": 5.032180309295654, + "learning_rate": 1.8550955414012739e-06, + "loss": 0.3202, + "step": 5115 + }, + { + "epoch": 16.34, + "grad_norm": 7.76137113571167, + "learning_rate": 1.8312101910828025e-06, + "loss": 0.308, + "step": 5130 + }, + { + "epoch": 16.39, + "grad_norm": 4.482850551605225, + "learning_rate": 1.8073248407643311e-06, + "loss": 0.3087, + "step": 5145 + }, + { + "epoch": 16.43, + "grad_norm": 5.670340538024902, + "learning_rate": 1.78343949044586e-06, + "loss": 0.2621, + "step": 5160 + }, + { + "epoch": 16.48, + "grad_norm": 4.9566216468811035, + "learning_rate": 1.7595541401273886e-06, + "loss": 0.3127, + "step": 5175 + }, + { + "epoch": 16.53, + "grad_norm": 3.655395984649658, + "learning_rate": 1.7356687898089172e-06, + "loss": 0.2242, + "step": 5190 + }, + { + "epoch": 16.56, + "eval_accuracy": 0.8085714285714286, + "eval_loss": 0.6378137469291687, + "eval_runtime": 13.9378, + "eval_samples_per_second": 75.335, + "eval_steps_per_second": 9.471, + "step": 5200 + }, + { + "epoch": 16.58, + "grad_norm": 2.2976183891296387, + "learning_rate": 1.7117834394904459e-06, + "loss": 0.2682, + "step": 5205 + }, + { + "epoch": 16.62, + "grad_norm": 3.8970346450805664, + "learning_rate": 1.6878980891719745e-06, + "loss": 0.2673, + "step": 5220 + }, + { + "epoch": 16.67, + "grad_norm": 2.6978113651275635, + "learning_rate": 1.6640127388535031e-06, + "loss": 0.2508, + "step": 5235 + }, + { + "epoch": 16.72, + "grad_norm": 9.089079856872559, + "learning_rate": 1.640127388535032e-06, + "loss": 0.2523, + "step": 5250 + }, + { + "epoch": 16.77, + "grad_norm": 3.596012830734253, + "learning_rate": 1.6162420382165606e-06, + "loss": 0.2849, + "step": 5265 + }, + { + "epoch": 16.82, + "grad_norm": 3.2140870094299316, + "learning_rate": 1.5923566878980892e-06, + "loss": 0.2513, + "step": 5280 + }, + { + "epoch": 16.86, + "grad_norm": 6.678956031799316, + "learning_rate": 1.5684713375796179e-06, + "loss": 0.2839, + "step": 5295 + }, + { + "epoch": 16.88, + "eval_accuracy": 0.799047619047619, + "eval_loss": 0.6760995984077454, + "eval_runtime": 14.0394, + "eval_samples_per_second": 74.789, + "eval_steps_per_second": 9.402, + "step": 5300 + }, + { + "epoch": 16.91, + "grad_norm": 4.08018684387207, + "learning_rate": 1.5445859872611465e-06, + "loss": 0.3049, + "step": 5310 + }, + { + "epoch": 16.96, + "grad_norm": 9.095474243164062, + "learning_rate": 1.5207006369426751e-06, + "loss": 0.2526, + "step": 5325 + }, + { + "epoch": 17.01, + "grad_norm": 3.758715867996216, + "learning_rate": 1.496815286624204e-06, + "loss": 0.2515, + "step": 5340 + }, + { + "epoch": 17.05, + "grad_norm": 5.71665096282959, + "learning_rate": 1.4729299363057326e-06, + "loss": 0.2906, + "step": 5355 + }, + { + "epoch": 17.1, + "grad_norm": 4.952653884887695, + "learning_rate": 1.4490445859872612e-06, + "loss": 0.2915, + "step": 5370 + }, + { + "epoch": 17.15, + "grad_norm": 7.454645156860352, + "learning_rate": 1.4251592356687899e-06, + "loss": 0.2878, + "step": 5385 + }, + { + "epoch": 17.2, + "grad_norm": 2.2225170135498047, + "learning_rate": 1.4012738853503185e-06, + "loss": 0.2424, + "step": 5400 + }, + { + "epoch": 17.2, + "eval_accuracy": 0.8123809523809524, + "eval_loss": 0.6385903358459473, + "eval_runtime": 13.8843, + "eval_samples_per_second": 75.625, + "eval_steps_per_second": 9.507, + "step": 5400 + }, + { + "epoch": 17.25, + "grad_norm": 4.7702765464782715, + "learning_rate": 1.3773885350318471e-06, + "loss": 0.2845, + "step": 5415 + }, + { + "epoch": 17.29, + "grad_norm": 4.871506214141846, + "learning_rate": 1.353503184713376e-06, + "loss": 0.2517, + "step": 5430 + }, + { + "epoch": 17.34, + "grad_norm": 2.186405897140503, + "learning_rate": 1.3296178343949046e-06, + "loss": 0.2052, + "step": 5445 + }, + { + "epoch": 17.39, + "grad_norm": 7.742133140563965, + "learning_rate": 1.3057324840764332e-06, + "loss": 0.2932, + "step": 5460 + }, + { + "epoch": 17.44, + "grad_norm": 4.161474704742432, + "learning_rate": 1.2818471337579619e-06, + "loss": 0.2783, + "step": 5475 + }, + { + "epoch": 17.48, + "grad_norm": 3.773857593536377, + "learning_rate": 1.2579617834394905e-06, + "loss": 0.2666, + "step": 5490 + }, + { + "epoch": 17.52, + "eval_accuracy": 0.8133333333333334, + "eval_loss": 0.6492887139320374, + "eval_runtime": 13.7993, + "eval_samples_per_second": 76.091, + "eval_steps_per_second": 9.566, + "step": 5500 + }, + { + "epoch": 17.53, + "grad_norm": 3.4864587783813477, + "learning_rate": 1.2340764331210191e-06, + "loss": 0.2598, + "step": 5505 + }, + { + "epoch": 17.58, + "grad_norm": 10.541982650756836, + "learning_rate": 1.210191082802548e-06, + "loss": 0.278, + "step": 5520 + }, + { + "epoch": 17.63, + "grad_norm": 3.506603240966797, + "learning_rate": 1.1863057324840766e-06, + "loss": 0.2818, + "step": 5535 + }, + { + "epoch": 17.68, + "grad_norm": 3.9437873363494873, + "learning_rate": 1.1624203821656052e-06, + "loss": 0.2264, + "step": 5550 + }, + { + "epoch": 17.72, + "grad_norm": 10.139039993286133, + "learning_rate": 1.1385350318471339e-06, + "loss": 0.3066, + "step": 5565 + }, + { + "epoch": 17.77, + "grad_norm": 4.62479829788208, + "learning_rate": 1.1146496815286625e-06, + "loss": 0.2531, + "step": 5580 + }, + { + "epoch": 17.82, + "grad_norm": 7.675441741943359, + "learning_rate": 1.0907643312101911e-06, + "loss": 0.2259, + "step": 5595 + }, + { + "epoch": 17.83, + "eval_accuracy": 0.8047619047619048, + "eval_loss": 0.6514009237289429, + "eval_runtime": 14.2419, + "eval_samples_per_second": 73.726, + "eval_steps_per_second": 9.268, + "step": 5600 + }, + { + "epoch": 17.87, + "grad_norm": 3.4506428241729736, + "learning_rate": 1.06687898089172e-06, + "loss": 0.2672, + "step": 5610 + }, + { + "epoch": 17.91, + "grad_norm": 2.7243833541870117, + "learning_rate": 1.0429936305732486e-06, + "loss": 0.2652, + "step": 5625 + }, + { + "epoch": 17.96, + "grad_norm": 4.135616302490234, + "learning_rate": 1.0191082802547772e-06, + "loss": 0.2011, + "step": 5640 + }, + { + "epoch": 18.01, + "grad_norm": 7.302999496459961, + "learning_rate": 9.952229299363059e-07, + "loss": 0.2782, + "step": 5655 + }, + { + "epoch": 18.06, + "grad_norm": 7.608941555023193, + "learning_rate": 9.713375796178345e-07, + "loss": 0.2871, + "step": 5670 + }, + { + "epoch": 18.11, + "grad_norm": 7.69070291519165, + "learning_rate": 9.474522292993632e-07, + "loss": 0.2782, + "step": 5685 + }, + { + "epoch": 18.15, + "grad_norm": 4.241176605224609, + "learning_rate": 9.235668789808917e-07, + "loss": 0.2533, + "step": 5700 + }, + { + "epoch": 18.15, + "eval_accuracy": 0.8, + "eval_loss": 0.6676008105278015, + "eval_runtime": 14.3329, + "eval_samples_per_second": 73.258, + "eval_steps_per_second": 9.21, + "step": 5700 + }, + { + "epoch": 18.2, + "grad_norm": 5.718116283416748, + "learning_rate": 8.996815286624204e-07, + "loss": 0.2683, + "step": 5715 + }, + { + "epoch": 18.25, + "grad_norm": 7.374713897705078, + "learning_rate": 8.757961783439491e-07, + "loss": 0.265, + "step": 5730 + }, + { + "epoch": 18.3, + "grad_norm": 1.2615108489990234, + "learning_rate": 8.519108280254777e-07, + "loss": 0.2169, + "step": 5745 + }, + { + "epoch": 18.34, + "grad_norm": 9.361245155334473, + "learning_rate": 8.280254777070064e-07, + "loss": 0.2921, + "step": 5760 + }, + { + "epoch": 18.39, + "grad_norm": 6.853837490081787, + "learning_rate": 8.041401273885351e-07, + "loss": 0.2734, + "step": 5775 + }, + { + "epoch": 18.44, + "grad_norm": 4.097062110900879, + "learning_rate": 7.802547770700637e-07, + "loss": 0.2697, + "step": 5790 + }, + { + "epoch": 18.47, + "eval_accuracy": 0.800952380952381, + "eval_loss": 0.6705303192138672, + "eval_runtime": 14.6447, + "eval_samples_per_second": 71.698, + "eval_steps_per_second": 9.014, + "step": 5800 + }, + { + "epoch": 18.49, + "grad_norm": 4.86583948135376, + "learning_rate": 7.563694267515924e-07, + "loss": 0.269, + "step": 5805 + }, + { + "epoch": 18.54, + "grad_norm": 7.498669624328613, + "learning_rate": 7.324840764331211e-07, + "loss": 0.223, + "step": 5820 + }, + { + "epoch": 18.58, + "grad_norm": 4.731110572814941, + "learning_rate": 7.085987261146497e-07, + "loss": 0.2215, + "step": 5835 + }, + { + "epoch": 18.63, + "grad_norm": 4.388888359069824, + "learning_rate": 6.847133757961784e-07, + "loss": 0.275, + "step": 5850 + }, + { + "epoch": 18.68, + "grad_norm": 7.220559120178223, + "learning_rate": 6.608280254777071e-07, + "loss": 0.252, + "step": 5865 + }, + { + "epoch": 18.73, + "grad_norm": 6.225268363952637, + "learning_rate": 6.369426751592357e-07, + "loss": 0.2532, + "step": 5880 + }, + { + "epoch": 18.77, + "grad_norm": 5.6799702644348145, + "learning_rate": 6.130573248407644e-07, + "loss": 0.2558, + "step": 5895 + }, + { + "epoch": 18.79, + "eval_accuracy": 0.8076190476190476, + "eval_loss": 0.6749628782272339, + "eval_runtime": 14.118, + "eval_samples_per_second": 74.373, + "eval_steps_per_second": 9.35, + "step": 5900 + }, + { + "epoch": 18.82, + "grad_norm": 1.1455104351043701, + "learning_rate": 5.89171974522293e-07, + "loss": 0.2455, + "step": 5910 + }, + { + "epoch": 18.87, + "grad_norm": 0.5723968744277954, + "learning_rate": 5.652866242038217e-07, + "loss": 0.2393, + "step": 5925 + }, + { + "epoch": 18.92, + "grad_norm": 6.428089141845703, + "learning_rate": 5.414012738853504e-07, + "loss": 0.2649, + "step": 5940 + }, + { + "epoch": 18.96, + "grad_norm": 4.993350028991699, + "learning_rate": 5.17515923566879e-07, + "loss": 0.2689, + "step": 5955 + }, + { + "epoch": 19.01, + "grad_norm": 8.829191207885742, + "learning_rate": 4.936305732484077e-07, + "loss": 0.3045, + "step": 5970 + }, + { + "epoch": 19.06, + "grad_norm": 8.624272346496582, + "learning_rate": 4.6974522292993636e-07, + "loss": 0.229, + "step": 5985 + }, + { + "epoch": 19.11, + "grad_norm": 1.7336400747299194, + "learning_rate": 4.45859872611465e-07, + "loss": 0.2469, + "step": 6000 + }, + { + "epoch": 19.11, + "eval_accuracy": 0.799047619047619, + "eval_loss": 0.6750813722610474, + "eval_runtime": 14.571, + "eval_samples_per_second": 72.061, + "eval_steps_per_second": 9.059, + "step": 6000 + }, + { + "epoch": 19.16, + "grad_norm": 7.821689605712891, + "learning_rate": 4.219745222929936e-07, + "loss": 0.2518, + "step": 6015 + }, + { + "epoch": 19.2, + "grad_norm": 4.377063751220703, + "learning_rate": 3.980891719745223e-07, + "loss": 0.2744, + "step": 6030 + }, + { + "epoch": 19.25, + "grad_norm": 3.8449316024780273, + "learning_rate": 3.74203821656051e-07, + "loss": 0.2222, + "step": 6045 + }, + { + "epoch": 19.3, + "grad_norm": 9.11525821685791, + "learning_rate": 3.503184713375796e-07, + "loss": 0.2353, + "step": 6060 + }, + { + "epoch": 19.35, + "grad_norm": 2.008242607116699, + "learning_rate": 3.264331210191083e-07, + "loss": 0.2803, + "step": 6075 + }, + { + "epoch": 19.39, + "grad_norm": 3.9407541751861572, + "learning_rate": 3.02547770700637e-07, + "loss": 0.284, + "step": 6090 + }, + { + "epoch": 19.43, + "eval_accuracy": 0.7980952380952381, + "eval_loss": 0.6737999320030212, + "eval_runtime": 14.3198, + "eval_samples_per_second": 73.325, + "eval_steps_per_second": 9.218, + "step": 6100 + }, + { + "epoch": 19.44, + "grad_norm": 4.276314735412598, + "learning_rate": 2.786624203821656e-07, + "loss": 0.282, + "step": 6105 + }, + { + "epoch": 19.49, + "grad_norm": 4.9115729331970215, + "learning_rate": 2.547770700636943e-07, + "loss": 0.2635, + "step": 6120 + }, + { + "epoch": 19.54, + "grad_norm": 2.512660026550293, + "learning_rate": 2.3089171974522294e-07, + "loss": 0.2404, + "step": 6135 + }, + { + "epoch": 19.59, + "grad_norm": 4.966971397399902, + "learning_rate": 2.070063694267516e-07, + "loss": 0.2979, + "step": 6150 + }, + { + "epoch": 19.63, + "grad_norm": 4.7076029777526855, + "learning_rate": 1.8312101910828028e-07, + "loss": 0.2563, + "step": 6165 + }, + { + "epoch": 19.68, + "grad_norm": 9.003276824951172, + "learning_rate": 1.5923566878980893e-07, + "loss": 0.2283, + "step": 6180 + }, + { + "epoch": 19.73, + "grad_norm": 3.8437254428863525, + "learning_rate": 1.353503184713376e-07, + "loss": 0.2534, + "step": 6195 + }, + { + "epoch": 19.75, + "eval_accuracy": 0.8019047619047619, + "eval_loss": 0.675845742225647, + "eval_runtime": 14.3373, + "eval_samples_per_second": 73.235, + "eval_steps_per_second": 9.207, + "step": 6200 + }, + { + "epoch": 19.78, + "grad_norm": 4.477542400360107, + "learning_rate": 1.1146496815286625e-07, + "loss": 0.2368, + "step": 6210 + }, + { + "epoch": 19.82, + "grad_norm": 9.546680450439453, + "learning_rate": 8.75796178343949e-08, + "loss": 0.2796, + "step": 6225 + }, + { + "epoch": 19.87, + "grad_norm": 7.132513999938965, + "learning_rate": 6.369426751592358e-08, + "loss": 0.2658, + "step": 6240 + }, + { + "epoch": 19.92, + "grad_norm": 5.739116191864014, + "learning_rate": 3.9808917197452233e-08, + "loss": 0.3468, + "step": 6255 + }, + { + "epoch": 19.97, + "grad_norm": 7.056791305541992, + "learning_rate": 1.5923566878980894e-08, + "loss": 0.2755, + "step": 6270 + }, + { + "epoch": 20.0, + "step": 6280, + "total_flos": 1.555375746295849e+19, + "train_loss": 0.5583157776647313, + "train_runtime": 5878.7322, + "train_samples_per_second": 34.14, + "train_steps_per_second": 1.068 } ], "logging_steps": 15, - "max_steps": 3140, + "max_steps": 6280, "num_input_tokens_seen": 0, - "num_train_epochs": 10, + "num_train_epochs": 20, "save_steps": 100, - "total_flos": 3.265548125287219e+18, + "total_flos": 1.555375746295849e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null