diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,901 +1,4623 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 86.66666666666667, + "epoch": 148.8, "eval_steps": 500, - "global_step": 130, + "global_step": 4650, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 1.0, - "eval_accuracy": 0.0, - "eval_loss": 2.030040740966797, - "eval_runtime": 0.2932, - "eval_samples_per_second": 27.282, - "eval_steps_per_second": 3.41, - "step": 1 + "epoch": 0.32, + "grad_norm": 3.6344285011291504, + "learning_rate": 1.0752688172043011e-06, + "loss": 3.6013, + "step": 10 + }, + { + "epoch": 0.64, + "grad_norm": 3.322525978088379, + "learning_rate": 2.1505376344086023e-06, + "loss": 3.6033, + "step": 20 + }, + { + "epoch": 0.96, + "grad_norm": 3.9304728507995605, + "learning_rate": 3.225806451612903e-06, + "loss": 3.5919, + "step": 30 + }, + { + "epoch": 0.992, + "eval_accuracy": 0.03153153153153153, + "eval_loss": 3.5735630989074707, + "eval_runtime": 10.6737, + "eval_samples_per_second": 41.598, + "eval_steps_per_second": 1.312, + "step": 31 + }, + { + "epoch": 1.28, + "grad_norm": 4.976346492767334, + "learning_rate": 4.3010752688172045e-06, + "loss": 3.5714, + "step": 40 + }, + { + "epoch": 1.6, + "grad_norm": 6.2756218910217285, + "learning_rate": 5.376344086021506e-06, + "loss": 3.5632, + "step": 50 + }, + { + "epoch": 1.92, + "grad_norm": 6.759893894195557, + "learning_rate": 6.451612903225806e-06, + "loss": 3.5392, + "step": 60 + }, + { + "epoch": 1.984, + "eval_accuracy": 0.0472972972972973, + "eval_loss": 3.4989891052246094, + "eval_runtime": 9.3519, + "eval_samples_per_second": 47.477, + "eval_steps_per_second": 1.497, + "step": 62 + }, + { + "epoch": 2.24, + "grad_norm": 7.38181209564209, + "learning_rate": 7.526881720430108e-06, + "loss": 3.5077, + "step": 70 + }, + { + "epoch": 2.56, + "grad_norm": 8.292854309082031, + "learning_rate": 8.602150537634409e-06, + "loss": 3.4781, + "step": 80 + }, + { + "epoch": 2.88, + "grad_norm": 10.34473705291748, + "learning_rate": 9.67741935483871e-06, + "loss": 3.4591, + "step": 90 + }, + { + "epoch": 2.976, + "eval_accuracy": 0.11036036036036036, + "eval_loss": 3.3992254734039307, + "eval_runtime": 9.265, + "eval_samples_per_second": 47.922, + "eval_steps_per_second": 1.511, + "step": 93 + }, + { + "epoch": 3.2, + "grad_norm": 14.919889450073242, + "learning_rate": 1.0752688172043012e-05, + "loss": 3.425, + "step": 100 }, { - "epoch": 2.0, - "eval_accuracy": 0.0, - "eval_loss": 2.020792007446289, - "eval_runtime": 0.2487, - "eval_samples_per_second": 32.169, - "eval_steps_per_second": 4.021, - "step": 3 + "epoch": 3.52, + "grad_norm": 13.331893920898438, + "learning_rate": 1.1827956989247313e-05, + "loss": 3.3924, + "step": 110 }, { - "epoch": 3.0, - "eval_accuracy": 0.0, - "eval_loss": 1.996970534324646, - "eval_runtime": 0.2271, - "eval_samples_per_second": 35.23, - "eval_steps_per_second": 4.404, - "step": 5 + "epoch": 3.84, + "grad_norm": 10.675400733947754, + "learning_rate": 1.2903225806451613e-05, + "loss": 3.3361, + "step": 120 }, { "epoch": 4.0, - "eval_accuracy": 0.125, - "eval_loss": 1.9852917194366455, - "eval_runtime": 0.2294, - "eval_samples_per_second": 34.875, - "eval_steps_per_second": 4.359, - "step": 6 - }, - { - "epoch": 5.0, - "eval_accuracy": 0.125, - "eval_loss": 1.9666370153427124, - "eval_runtime": 0.23, - "eval_samples_per_second": 34.788, - "eval_steps_per_second": 4.348, - "step": 7 - }, - { - "epoch": 6.0, - "eval_accuracy": 0.25, - "eval_loss": 1.9215028285980225, - "eval_runtime": 0.2245, - "eval_samples_per_second": 35.632, - "eval_steps_per_second": 4.454, - "step": 9 - }, - { - "epoch": 6.666666666666667, - "grad_norm": 14.784972190856934, - "learning_rate": 3.846153846153846e-05, - "loss": 1.024, - "step": 10 + "eval_accuracy": 0.25225225225225223, + "eval_loss": 3.2525722980499268, + "eval_runtime": 9.2756, + "eval_samples_per_second": 47.868, + "eval_steps_per_second": 1.509, + "step": 125 + }, + { + "epoch": 4.16, + "grad_norm": 14.011612892150879, + "learning_rate": 1.3978494623655914e-05, + "loss": 3.2768, + "step": 130 + }, + { + "epoch": 4.48, + "grad_norm": 12.37881851196289, + "learning_rate": 1.5053763440860215e-05, + "loss": 3.2526, + "step": 140 + }, + { + "epoch": 4.8, + "grad_norm": 11.936776161193848, + "learning_rate": 1.6129032258064517e-05, + "loss": 3.2066, + "step": 150 + }, + { + "epoch": 4.992, + "eval_accuracy": 0.3761261261261261, + "eval_loss": 3.085122585296631, + "eval_runtime": 9.2654, + "eval_samples_per_second": 47.92, + "eval_steps_per_second": 1.511, + "step": 156 + }, + { + "epoch": 5.12, + "grad_norm": 15.466531753540039, + "learning_rate": 1.7204301075268818e-05, + "loss": 3.1437, + "step": 160 + }, + { + "epoch": 5.44, + "grad_norm": 12.19348430633545, + "learning_rate": 1.827956989247312e-05, + "loss": 3.0506, + "step": 170 + }, + { + "epoch": 5.76, + "grad_norm": 18.181127548217773, + "learning_rate": 1.935483870967742e-05, + "loss": 3.0024, + "step": 180 + }, + { + "epoch": 5.984, + "eval_accuracy": 0.4436936936936937, + "eval_loss": 2.837559938430786, + "eval_runtime": 9.2717, + "eval_samples_per_second": 47.888, + "eval_steps_per_second": 1.51, + "step": 187 + }, + { + "epoch": 6.08, + "grad_norm": 11.255278587341309, + "learning_rate": 2.0430107526881722e-05, + "loss": 2.926, + "step": 190 + }, + { + "epoch": 6.4, + "grad_norm": 27.72000503540039, + "learning_rate": 2.1505376344086024e-05, + "loss": 2.8533, + "step": 200 + }, + { + "epoch": 6.72, + "grad_norm": 29.05733299255371, + "learning_rate": 2.258064516129032e-05, + "loss": 2.8094, + "step": 210 + }, + { + "epoch": 6.976, + "eval_accuracy": 0.49099099099099097, + "eval_loss": 2.631969928741455, + "eval_runtime": 9.3148, + "eval_samples_per_second": 47.666, + "eval_steps_per_second": 1.503, + "step": 218 + }, + { + "epoch": 7.04, + "grad_norm": 14.886007308959961, + "learning_rate": 2.3655913978494626e-05, + "loss": 2.7366, + "step": 220 + }, + { + "epoch": 7.36, + "grad_norm": 16.759021759033203, + "learning_rate": 2.4731182795698928e-05, + "loss": 2.6044, + "step": 230 + }, + { + "epoch": 7.68, + "grad_norm": 83.9926528930664, + "learning_rate": 2.5806451612903226e-05, + "loss": 2.5626, + "step": 240 }, { - "epoch": 7.0, - "eval_accuracy": 0.125, - "eval_loss": 1.8757238388061523, - "eval_runtime": 0.2259, - "eval_samples_per_second": 35.411, - "eval_steps_per_second": 4.426, - "step": 11 + "epoch": 8.0, + "grad_norm": 101.79939270019531, + "learning_rate": 2.6881720430107527e-05, + "loss": 2.509, + "step": 250 }, { "epoch": 8.0, - "eval_accuracy": 0.125, - "eval_loss": 1.8579578399658203, - "eval_runtime": 0.2276, - "eval_samples_per_second": 35.149, - "eval_steps_per_second": 4.394, - "step": 12 + "eval_accuracy": 0.536036036036036, + "eval_loss": 2.376469850540161, + "eval_runtime": 9.414, + "eval_samples_per_second": 47.164, + "eval_steps_per_second": 1.487, + "step": 250 + }, + { + "epoch": 8.32, + "grad_norm": 28.594270706176758, + "learning_rate": 2.7956989247311828e-05, + "loss": 2.4039, + "step": 260 + }, + { + "epoch": 8.64, + "grad_norm": 34.03124237060547, + "learning_rate": 2.9032258064516133e-05, + "loss": 2.2813, + "step": 270 + }, + { + "epoch": 8.96, + "grad_norm": 13.96226978302002, + "learning_rate": 3.010752688172043e-05, + "loss": 2.2526, + "step": 280 + }, + { + "epoch": 8.992, + "eval_accuracy": 0.5923423423423423, + "eval_loss": 2.040027141571045, + "eval_runtime": 9.1841, + "eval_samples_per_second": 48.344, + "eval_steps_per_second": 1.524, + "step": 281 + }, + { + "epoch": 9.28, + "grad_norm": 29.609779357910156, + "learning_rate": 3.118279569892473e-05, + "loss": 2.0909, + "step": 290 + }, + { + "epoch": 9.6, + "grad_norm": 29.265300750732422, + "learning_rate": 3.2258064516129034e-05, + "loss": 2.0485, + "step": 300 + }, + { + "epoch": 9.92, + "grad_norm": 32.22190856933594, + "learning_rate": 3.3333333333333335e-05, + "loss": 1.9442, + "step": 310 + }, + { + "epoch": 9.984, + "eval_accuracy": 0.6396396396396397, + "eval_loss": 1.7939640283584595, + "eval_runtime": 9.3505, + "eval_samples_per_second": 47.484, + "eval_steps_per_second": 1.497, + "step": 312 + }, + { + "epoch": 10.24, + "grad_norm": 21.0910701751709, + "learning_rate": 3.4408602150537636e-05, + "loss": 1.9181, + "step": 320 + }, + { + "epoch": 10.56, + "grad_norm": 45.26454544067383, + "learning_rate": 3.548387096774194e-05, + "loss": 1.7878, + "step": 330 + }, + { + "epoch": 10.88, + "grad_norm": 35.63391876220703, + "learning_rate": 3.655913978494624e-05, + "loss": 1.7672, + "step": 340 + }, + { + "epoch": 10.975999999999999, + "eval_accuracy": 0.6824324324324325, + "eval_loss": 1.5892232656478882, + "eval_runtime": 9.2159, + "eval_samples_per_second": 48.177, + "eval_steps_per_second": 1.519, + "step": 343 }, { - "epoch": 9.0, - "eval_accuracy": 0.125, - "eval_loss": 1.8413233757019043, - "eval_runtime": 0.232, - "eval_samples_per_second": 34.49, - "eval_steps_per_second": 4.311, - "step": 13 + "epoch": 11.2, + "grad_norm": 14.121517181396484, + "learning_rate": 3.763440860215054e-05, + "loss": 1.6401, + "step": 350 }, { - "epoch": 10.0, - "eval_accuracy": 0.375, - "eval_loss": 1.7954258918762207, - "eval_runtime": 0.2267, - "eval_samples_per_second": 35.29, - "eval_steps_per_second": 4.411, - "step": 15 + "epoch": 11.52, + "grad_norm": 17.638029098510742, + "learning_rate": 3.870967741935484e-05, + "loss": 1.6047, + "step": 360 }, { - "epoch": 11.0, - "eval_accuracy": 0.5, - "eval_loss": 1.7509543895721436, - "eval_runtime": 0.2337, - "eval_samples_per_second": 34.23, - "eval_steps_per_second": 4.279, - "step": 17 + "epoch": 11.84, + "grad_norm": 28.33017921447754, + "learning_rate": 3.978494623655914e-05, + "loss": 1.5273, + "step": 370 }, { "epoch": 12.0, - "eval_accuracy": 0.625, - "eval_loss": 1.7309346199035645, - "eval_runtime": 0.2261, - "eval_samples_per_second": 35.381, - "eval_steps_per_second": 4.423, - "step": 18 - }, - { - "epoch": 13.0, - "eval_accuracy": 0.625, - "eval_loss": 1.7131527662277222, - "eval_runtime": 0.2297, - "eval_samples_per_second": 34.829, - "eval_steps_per_second": 4.354, - "step": 19 - }, - { - "epoch": 13.333333333333334, - "grad_norm": 9.231935501098633, - "learning_rate": 4.700854700854701e-05, - "loss": 0.8487, - "step": 20 + "eval_accuracy": 0.7184684684684685, + "eval_loss": 1.3499852418899536, + "eval_runtime": 9.247, + "eval_samples_per_second": 48.016, + "eval_steps_per_second": 1.514, + "step": 375 + }, + { + "epoch": 12.16, + "grad_norm": 24.126209259033203, + "learning_rate": 4.0860215053763444e-05, + "loss": 1.4728, + "step": 380 + }, + { + "epoch": 12.48, + "grad_norm": 27.138376235961914, + "learning_rate": 4.1935483870967746e-05, + "loss": 1.4202, + "step": 390 + }, + { + "epoch": 12.8, + "grad_norm": 37.29874038696289, + "learning_rate": 4.301075268817205e-05, + "loss": 1.3854, + "step": 400 + }, + { + "epoch": 12.992, + "eval_accuracy": 0.7162162162162162, + "eval_loss": 1.224295735359192, + "eval_runtime": 9.3269, + "eval_samples_per_second": 47.604, + "eval_steps_per_second": 1.501, + "step": 406 + }, + { + "epoch": 13.12, + "grad_norm": 23.704998016357422, + "learning_rate": 4.408602150537635e-05, + "loss": 1.3024, + "step": 410 + }, + { + "epoch": 13.44, + "grad_norm": 31.681236267089844, + "learning_rate": 4.516129032258064e-05, + "loss": 1.2981, + "step": 420 + }, + { + "epoch": 13.76, + "grad_norm": 26.51857566833496, + "learning_rate": 4.6236559139784944e-05, + "loss": 1.197, + "step": 430 }, { - "epoch": 14.0, - "eval_accuracy": 0.625, - "eval_loss": 1.6768202781677246, - "eval_runtime": 0.2263, - "eval_samples_per_second": 35.354, - "eval_steps_per_second": 4.419, - "step": 21 + "epoch": 13.984, + "eval_accuracy": 0.7387387387387387, + "eval_loss": 1.1022223234176636, + "eval_runtime": 9.3715, + "eval_samples_per_second": 47.378, + "eval_steps_per_second": 1.494, + "step": 437 }, { - "epoch": 15.0, - "eval_accuracy": 0.625, - "eval_loss": 1.64021635055542, - "eval_runtime": 0.2376, - "eval_samples_per_second": 33.673, - "eval_steps_per_second": 4.209, - "step": 23 + "epoch": 14.08, + "grad_norm": 19.252714157104492, + "learning_rate": 4.731182795698925e-05, + "loss": 1.148, + "step": 440 + }, + { + "epoch": 14.4, + "grad_norm": 39.495262145996094, + "learning_rate": 4.8387096774193554e-05, + "loss": 1.1297, + "step": 450 + }, + { + "epoch": 14.72, + "grad_norm": 35.67964172363281, + "learning_rate": 4.9462365591397855e-05, + "loss": 1.1114, + "step": 460 + }, + { + "epoch": 14.975999999999999, + "eval_accuracy": 0.7612612612612613, + "eval_loss": 1.0137522220611572, + "eval_runtime": 9.3185, + "eval_samples_per_second": 47.647, + "eval_steps_per_second": 1.502, + "step": 468 + }, + { + "epoch": 15.04, + "grad_norm": 13.771783828735352, + "learning_rate": 4.994026284348865e-05, + "loss": 1.0861, + "step": 470 + }, + { + "epoch": 15.36, + "grad_norm": 17.045921325683594, + "learning_rate": 4.982078853046595e-05, + "loss": 1.0129, + "step": 480 + }, + { + "epoch": 15.68, + "grad_norm": 30.740581512451172, + "learning_rate": 4.9701314217443254e-05, + "loss": 1.0051, + "step": 490 }, { "epoch": 16.0, - "eval_accuracy": 0.625, - "eval_loss": 1.6197324991226196, - "eval_runtime": 0.2274, - "eval_samples_per_second": 35.183, - "eval_steps_per_second": 4.398, - "step": 24 + "grad_norm": 29.636211395263672, + "learning_rate": 4.9581839904420555e-05, + "loss": 0.9364, + "step": 500 }, { - "epoch": 17.0, - "eval_accuracy": 0.625, - "eval_loss": 1.5951614379882812, - "eval_runtime": 0.2246, - "eval_samples_per_second": 35.616, - "eval_steps_per_second": 4.452, - "step": 25 + "epoch": 16.0, + "eval_accuracy": 0.7747747747747747, + "eval_loss": 0.9164464473724365, + "eval_runtime": 9.2849, + "eval_samples_per_second": 47.819, + "eval_steps_per_second": 1.508, + "step": 500 }, { - "epoch": 18.0, - "eval_accuracy": 0.625, - "eval_loss": 1.525948166847229, - "eval_runtime": 0.229, - "eval_samples_per_second": 34.929, - "eval_steps_per_second": 4.366, - "step": 27 + "epoch": 16.32, + "grad_norm": 15.4671049118042, + "learning_rate": 4.9462365591397855e-05, + "loss": 0.8783, + "step": 510 }, { - "epoch": 19.0, - "eval_accuracy": 0.625, - "eval_loss": 1.4598863124847412, - "eval_runtime": 0.228, - "eval_samples_per_second": 35.095, - "eval_steps_per_second": 4.387, - "step": 29 + "epoch": 16.64, + "grad_norm": 17.964916229248047, + "learning_rate": 4.9342891278375156e-05, + "loss": 0.8561, + "step": 520 }, { - "epoch": 20.0, - "grad_norm": 15.795075416564941, - "learning_rate": 4.2735042735042735e-05, - "loss": 0.6549, - "step": 30 + "epoch": 16.96, + "grad_norm": 24.931062698364258, + "learning_rate": 4.9223416965352456e-05, + "loss": 0.8755, + "step": 530 + }, + { + "epoch": 16.992, + "eval_accuracy": 0.7522522522522522, + "eval_loss": 0.9057530164718628, + "eval_runtime": 9.2393, + "eval_samples_per_second": 48.055, + "eval_steps_per_second": 1.515, + "step": 531 + }, + { + "epoch": 17.28, + "grad_norm": 24.417903900146484, + "learning_rate": 4.910394265232976e-05, + "loss": 0.8487, + "step": 540 + }, + { + "epoch": 17.6, + "grad_norm": 25.21699333190918, + "learning_rate": 4.898446833930705e-05, + "loss": 0.8157, + "step": 550 + }, + { + "epoch": 17.92, + "grad_norm": 17.92466926574707, + "learning_rate": 4.886499402628435e-05, + "loss": 0.7473, + "step": 560 + }, + { + "epoch": 17.984, + "eval_accuracy": 0.7927927927927928, + "eval_loss": 0.804523766040802, + "eval_runtime": 9.2619, + "eval_samples_per_second": 47.938, + "eval_steps_per_second": 1.512, + "step": 562 + }, + { + "epoch": 18.24, + "grad_norm": 25.38798713684082, + "learning_rate": 4.874551971326165e-05, + "loss": 0.7434, + "step": 570 + }, + { + "epoch": 18.56, + "grad_norm": 22.557018280029297, + "learning_rate": 4.862604540023895e-05, + "loss": 0.7647, + "step": 580 + }, + { + "epoch": 18.88, + "grad_norm": 15.224944114685059, + "learning_rate": 4.850657108721625e-05, + "loss": 0.7189, + "step": 590 + }, + { + "epoch": 18.976, + "eval_accuracy": 0.7882882882882883, + "eval_loss": 0.77347731590271, + "eval_runtime": 9.2397, + "eval_samples_per_second": 48.053, + "eval_steps_per_second": 1.515, + "step": 593 + }, + { + "epoch": 19.2, + "grad_norm": 36.273311614990234, + "learning_rate": 4.8387096774193554e-05, + "loss": 0.6816, + "step": 600 + }, + { + "epoch": 19.52, + "grad_norm": 19.850025177001953, + "learning_rate": 4.8267622461170854e-05, + "loss": 0.6405, + "step": 610 + }, + { + "epoch": 19.84, + "grad_norm": 27.864734649658203, + "learning_rate": 4.814814814814815e-05, + "loss": 0.6461, + "step": 620 }, { "epoch": 20.0, - "eval_accuracy": 0.625, - "eval_loss": 1.4526441097259521, - "eval_runtime": 0.2272, - "eval_samples_per_second": 35.213, - "eval_steps_per_second": 4.402, - "step": 30 + "eval_accuracy": 0.8198198198198198, + "eval_loss": 0.6875730156898499, + "eval_runtime": 9.2749, + "eval_samples_per_second": 47.871, + "eval_steps_per_second": 1.509, + "step": 625 }, { - "epoch": 21.0, - "eval_accuracy": 0.625, - "eval_loss": 1.4458982944488525, - "eval_runtime": 0.227, - "eval_samples_per_second": 35.238, - "eval_steps_per_second": 4.405, - "step": 31 + "epoch": 20.16, + "grad_norm": 19.48863410949707, + "learning_rate": 4.802867383512545e-05, + "loss": 0.6304, + "step": 630 + }, + { + "epoch": 20.48, + "grad_norm": 20.945470809936523, + "learning_rate": 4.790919952210275e-05, + "loss": 0.535, + "step": 640 + }, + { + "epoch": 20.8, + "grad_norm": 24.0327205657959, + "learning_rate": 4.778972520908005e-05, + "loss": 0.6041, + "step": 650 + }, + { + "epoch": 20.992, + "eval_accuracy": 0.7972972972972973, + "eval_loss": 0.7211897373199463, + "eval_runtime": 9.4517, + "eval_samples_per_second": 46.976, + "eval_steps_per_second": 1.481, + "step": 656 }, { - "epoch": 22.0, - "eval_accuracy": 0.625, - "eval_loss": 1.4222255945205688, - "eval_runtime": 0.2296, - "eval_samples_per_second": 34.847, - "eval_steps_per_second": 4.356, - "step": 33 + "epoch": 21.12, + "grad_norm": 34.93315505981445, + "learning_rate": 4.767025089605735e-05, + "loss": 0.5494, + "step": 660 }, { - "epoch": 23.0, - "eval_accuracy": 0.625, - "eval_loss": 1.4135932922363281, - "eval_runtime": 0.23, - "eval_samples_per_second": 34.776, - "eval_steps_per_second": 4.347, - "step": 35 + "epoch": 21.44, + "grad_norm": 29.807035446166992, + "learning_rate": 4.755077658303465e-05, + "loss": 0.5279, + "step": 670 + }, + { + "epoch": 21.76, + "grad_norm": 17.350618362426758, + "learning_rate": 4.743130227001195e-05, + "loss": 0.5016, + "step": 680 + }, + { + "epoch": 21.984, + "eval_accuracy": 0.8198198198198198, + "eval_loss": 0.6610586047172546, + "eval_runtime": 9.2263, + "eval_samples_per_second": 48.123, + "eval_steps_per_second": 1.517, + "step": 687 + }, + { + "epoch": 22.08, + "grad_norm": 16.8582706451416, + "learning_rate": 4.731182795698925e-05, + "loss": 0.5321, + "step": 690 + }, + { + "epoch": 22.4, + "grad_norm": 22.713829040527344, + "learning_rate": 4.7192353643966546e-05, + "loss": 0.523, + "step": 700 + }, + { + "epoch": 22.72, + "grad_norm": 11.667048454284668, + "learning_rate": 4.707287933094385e-05, + "loss": 0.4996, + "step": 710 + }, + { + "epoch": 22.976, + "eval_accuracy": 0.8153153153153153, + "eval_loss": 0.6110232472419739, + "eval_runtime": 9.2454, + "eval_samples_per_second": 48.024, + "eval_steps_per_second": 1.514, + "step": 718 + }, + { + "epoch": 23.04, + "grad_norm": 22.822635650634766, + "learning_rate": 4.695340501792115e-05, + "loss": 0.5179, + "step": 720 + }, + { + "epoch": 23.36, + "grad_norm": 18.14311981201172, + "learning_rate": 4.683393070489845e-05, + "loss": 0.4879, + "step": 730 + }, + { + "epoch": 23.68, + "grad_norm": 20.045101165771484, + "learning_rate": 4.671445639187575e-05, + "loss": 0.4288, + "step": 740 }, { "epoch": 24.0, - "eval_accuracy": 0.625, - "eval_loss": 1.4238104820251465, - "eval_runtime": 0.2362, - "eval_samples_per_second": 33.867, - "eval_steps_per_second": 4.233, - "step": 36 - }, - { - "epoch": 25.0, - "eval_accuracy": 0.625, - "eval_loss": 1.4286460876464844, - "eval_runtime": 0.2278, - "eval_samples_per_second": 35.125, - "eval_steps_per_second": 4.391, - "step": 37 - }, - { - "epoch": 26.0, - "eval_accuracy": 0.625, - "eval_loss": 1.42312753200531, - "eval_runtime": 0.2304, - "eval_samples_per_second": 34.725, - "eval_steps_per_second": 4.341, - "step": 39 - }, - { - "epoch": 26.666666666666668, - "grad_norm": 7.896138668060303, - "learning_rate": 3.846153846153846e-05, - "loss": 0.479, - "step": 40 + "grad_norm": 16.844778060913086, + "learning_rate": 4.659498207885305e-05, + "loss": 0.4825, + "step": 750 + }, + { + "epoch": 24.0, + "eval_accuracy": 0.8063063063063063, + "eval_loss": 0.6475990414619446, + "eval_runtime": 9.3325, + "eval_samples_per_second": 47.576, + "eval_steps_per_second": 1.5, + "step": 750 + }, + { + "epoch": 24.32, + "grad_norm": 20.355302810668945, + "learning_rate": 4.647550776583035e-05, + "loss": 0.423, + "step": 760 + }, + { + "epoch": 24.64, + "grad_norm": 14.827963829040527, + "learning_rate": 4.635603345280765e-05, + "loss": 0.4105, + "step": 770 + }, + { + "epoch": 24.96, + "grad_norm": 22.720539093017578, + "learning_rate": 4.6236559139784944e-05, + "loss": 0.434, + "step": 780 + }, + { + "epoch": 24.992, + "eval_accuracy": 0.8040540540540541, + "eval_loss": 0.6792961359024048, + "eval_runtime": 9.238, + "eval_samples_per_second": 48.062, + "eval_steps_per_second": 1.515, + "step": 781 + }, + { + "epoch": 25.28, + "grad_norm": 18.849292755126953, + "learning_rate": 4.6117084826762245e-05, + "loss": 0.3863, + "step": 790 }, { - "epoch": 27.0, - "eval_accuracy": 0.625, - "eval_loss": 1.3963532447814941, - "eval_runtime": 0.225, - "eval_samples_per_second": 35.554, - "eval_steps_per_second": 4.444, - "step": 41 + "epoch": 25.6, + "grad_norm": 10.710004806518555, + "learning_rate": 4.5997610513739546e-05, + "loss": 0.4023, + "step": 800 + }, + { + "epoch": 25.92, + "grad_norm": 17.30459976196289, + "learning_rate": 4.5878136200716846e-05, + "loss": 0.4296, + "step": 810 + }, + { + "epoch": 25.984, + "eval_accuracy": 0.8018018018018018, + "eval_loss": 0.6015193462371826, + "eval_runtime": 9.4018, + "eval_samples_per_second": 47.225, + "eval_steps_per_second": 1.489, + "step": 812 + }, + { + "epoch": 26.24, + "grad_norm": 23.36750030517578, + "learning_rate": 4.575866188769415e-05, + "loss": 0.3688, + "step": 820 + }, + { + "epoch": 26.56, + "grad_norm": 25.308101654052734, + "learning_rate": 4.563918757467145e-05, + "loss": 0.4054, + "step": 830 + }, + { + "epoch": 26.88, + "grad_norm": 37.7635383605957, + "learning_rate": 4.551971326164875e-05, + "loss": 0.36, + "step": 840 + }, + { + "epoch": 26.976, + "eval_accuracy": 0.8063063063063063, + "eval_loss": 0.6614833474159241, + "eval_runtime": 9.3569, + "eval_samples_per_second": 47.451, + "eval_steps_per_second": 1.496, + "step": 843 + }, + { + "epoch": 27.2, + "grad_norm": 16.466156005859375, + "learning_rate": 4.540023894862604e-05, + "loss": 0.3464, + "step": 850 + }, + { + "epoch": 27.52, + "grad_norm": 21.421571731567383, + "learning_rate": 4.528076463560334e-05, + "loss": 0.3376, + "step": 860 + }, + { + "epoch": 27.84, + "grad_norm": 15.365056037902832, + "learning_rate": 4.516129032258064e-05, + "loss": 0.3646, + "step": 870 }, { "epoch": 28.0, - "eval_accuracy": 0.625, - "eval_loss": 1.375662088394165, - "eval_runtime": 0.2285, - "eval_samples_per_second": 35.009, - "eval_steps_per_second": 4.376, - "step": 42 + "eval_accuracy": 0.8220720720720721, + "eval_loss": 0.6059438586235046, + "eval_runtime": 9.3484, + "eval_samples_per_second": 47.495, + "eval_steps_per_second": 1.498, + "step": 875 }, { - "epoch": 29.0, - "eval_accuracy": 0.625, - "eval_loss": 1.350050687789917, - "eval_runtime": 0.2264, - "eval_samples_per_second": 35.335, - "eval_steps_per_second": 4.417, - "step": 43 + "epoch": 28.16, + "grad_norm": 23.176517486572266, + "learning_rate": 4.5041816009557944e-05, + "loss": 0.3498, + "step": 880 }, { - "epoch": 30.0, - "eval_accuracy": 0.625, - "eval_loss": 1.277871012687683, - "eval_runtime": 0.2258, - "eval_samples_per_second": 35.433, - "eval_steps_per_second": 4.429, - "step": 45 + "epoch": 28.48, + "grad_norm": 18.081356048583984, + "learning_rate": 4.4922341696535244e-05, + "loss": 0.3433, + "step": 890 }, { - "epoch": 31.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2359542846679688, - "eval_runtime": 0.2324, - "eval_samples_per_second": 34.427, - "eval_steps_per_second": 4.303, - "step": 47 + "epoch": 28.8, + "grad_norm": 17.244199752807617, + "learning_rate": 4.4802867383512545e-05, + "loss": 0.3542, + "step": 900 }, { - "epoch": 32.0, - "eval_accuracy": 0.625, - "eval_loss": 1.218457579612732, - "eval_runtime": 0.2295, - "eval_samples_per_second": 34.863, - "eval_steps_per_second": 4.358, - "step": 48 - }, - { - "epoch": 33.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1920298337936401, - "eval_runtime": 0.2285, - "eval_samples_per_second": 35.005, - "eval_steps_per_second": 4.376, - "step": 49 - }, - { - "epoch": 33.333333333333336, - "grad_norm": 39.576210021972656, - "learning_rate": 3.418803418803419e-05, - "loss": 0.3504, - "step": 50 + "epoch": 28.992, + "eval_accuracy": 0.7927927927927928, + "eval_loss": 0.6972593069076538, + "eval_runtime": 9.4382, + "eval_samples_per_second": 47.043, + "eval_steps_per_second": 1.483, + "step": 906 }, { - "epoch": 34.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1325857639312744, - "eval_runtime": 0.2278, - "eval_samples_per_second": 35.122, - "eval_steps_per_second": 4.39, - "step": 51 + "epoch": 29.12, + "grad_norm": 18.359115600585938, + "learning_rate": 4.4683393070489845e-05, + "loss": 0.331, + "step": 910 }, { - "epoch": 35.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1017863750457764, - "eval_runtime": 0.2263, - "eval_samples_per_second": 35.352, - "eval_steps_per_second": 4.419, - "step": 53 + "epoch": 29.44, + "grad_norm": 22.027292251586914, + "learning_rate": 4.4563918757467146e-05, + "loss": 0.3245, + "step": 920 }, { - "epoch": 36.0, - "eval_accuracy": 0.625, - "eval_loss": 1.0970098972320557, - "eval_runtime": 0.2308, - "eval_samples_per_second": 34.656, - "eval_steps_per_second": 4.332, - "step": 54 + "epoch": 29.76, + "grad_norm": 23.590885162353516, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.3091, + "step": 930 }, { - "epoch": 37.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1029536724090576, - "eval_runtime": 0.2758, - "eval_samples_per_second": 29.01, - "eval_steps_per_second": 3.626, - "step": 55 + "epoch": 29.984, + "eval_accuracy": 0.8265765765765766, + "eval_loss": 0.6400348544120789, + "eval_runtime": 9.2623, + "eval_samples_per_second": 47.936, + "eval_steps_per_second": 1.511, + "step": 937 }, { - "epoch": 38.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1377930641174316, - "eval_runtime": 0.226, - "eval_samples_per_second": 35.391, - "eval_steps_per_second": 4.424, - "step": 57 + "epoch": 30.08, + "grad_norm": 10.66484260559082, + "learning_rate": 4.432497013142175e-05, + "loss": 0.2961, + "step": 940 }, { - "epoch": 39.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1719943284988403, - "eval_runtime": 0.2317, - "eval_samples_per_second": 34.529, - "eval_steps_per_second": 4.316, - "step": 59 + "epoch": 30.4, + "grad_norm": 16.36039924621582, + "learning_rate": 4.420549581839905e-05, + "loss": 0.2871, + "step": 950 }, { - "epoch": 40.0, - "grad_norm": 31.4616756439209, - "learning_rate": 2.9914529914529915e-05, - "loss": 0.2864, - "step": 60 + "epoch": 30.72, + "grad_norm": 17.09930419921875, + "learning_rate": 4.408602150537635e-05, + "loss": 0.2774, + "step": 960 }, { - "epoch": 40.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1867077350616455, - "eval_runtime": 0.2264, - "eval_samples_per_second": 35.336, - "eval_steps_per_second": 4.417, - "step": 60 + "epoch": 30.976, + "eval_accuracy": 0.8265765765765766, + "eval_loss": 0.5798152685165405, + "eval_runtime": 9.2742, + "eval_samples_per_second": 47.875, + "eval_steps_per_second": 1.51, + "step": 968 }, { - "epoch": 41.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1960291862487793, - "eval_runtime": 0.2297, - "eval_samples_per_second": 34.825, - "eval_steps_per_second": 4.353, - "step": 61 + "epoch": 31.04, + "grad_norm": 10.91196060180664, + "learning_rate": 4.396654719235365e-05, + "loss": 0.2825, + "step": 970 }, { - "epoch": 42.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1959271430969238, - "eval_runtime": 0.2272, - "eval_samples_per_second": 35.217, - "eval_steps_per_second": 4.402, - "step": 63 + "epoch": 31.36, + "grad_norm": 18.016260147094727, + "learning_rate": 4.384707287933095e-05, + "loss": 0.2692, + "step": 980 }, { - "epoch": 43.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1726518869400024, - "eval_runtime": 0.2279, - "eval_samples_per_second": 35.11, - "eval_steps_per_second": 4.389, - "step": 65 + "epoch": 31.68, + "grad_norm": 24.916288375854492, + "learning_rate": 4.372759856630825e-05, + "loss": 0.2859, + "step": 990 }, { - "epoch": 44.0, - "eval_accuracy": 0.625, - "eval_loss": 1.165253758430481, - "eval_runtime": 0.227, - "eval_samples_per_second": 35.239, - "eval_steps_per_second": 4.405, - "step": 66 - }, - { - "epoch": 45.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1643680334091187, - "eval_runtime": 0.2339, - "eval_samples_per_second": 34.204, - "eval_steps_per_second": 4.276, - "step": 67 - }, - { - "epoch": 46.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1808971166610718, - "eval_runtime": 0.2293, - "eval_samples_per_second": 34.886, - "eval_steps_per_second": 4.361, - "step": 69 - }, - { - "epoch": 46.666666666666664, - "grad_norm": 18.457019805908203, - "learning_rate": 2.564102564102564e-05, - "loss": 0.2357, - "step": 70 + "epoch": 32.0, + "grad_norm": 22.427671432495117, + "learning_rate": 4.360812425328555e-05, + "loss": 0.3166, + "step": 1000 }, { - "epoch": 47.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1901675462722778, - "eval_runtime": 0.2261, - "eval_samples_per_second": 35.381, - "eval_steps_per_second": 4.423, - "step": 71 + "epoch": 32.0, + "eval_accuracy": 0.8333333333333334, + "eval_loss": 0.6134085655212402, + "eval_runtime": 9.5628, + "eval_samples_per_second": 46.43, + "eval_steps_per_second": 1.464, + "step": 1000 }, { - "epoch": 48.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1872090101242065, - "eval_runtime": 0.2272, - "eval_samples_per_second": 35.206, - "eval_steps_per_second": 4.401, - "step": 72 + "epoch": 32.32, + "grad_norm": 15.972029685974121, + "learning_rate": 4.3488649940262845e-05, + "loss": 0.242, + "step": 1010 }, { - "epoch": 49.0, - "eval_accuracy": 0.625, - "eval_loss": 1.189396858215332, - "eval_runtime": 0.2263, - "eval_samples_per_second": 35.345, - "eval_steps_per_second": 4.418, - "step": 73 + "epoch": 32.64, + "grad_norm": 12.7739896774292, + "learning_rate": 4.3369175627240145e-05, + "loss": 0.2501, + "step": 1020 }, { - "epoch": 50.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1982437372207642, - "eval_runtime": 0.2428, - "eval_samples_per_second": 32.942, - "eval_steps_per_second": 4.118, - "step": 75 + "epoch": 32.96, + "grad_norm": 31.982995986938477, + "learning_rate": 4.3249701314217446e-05, + "loss": 0.2878, + "step": 1030 }, { - "epoch": 51.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2417709827423096, - "eval_runtime": 0.2272, - "eval_samples_per_second": 35.217, - "eval_steps_per_second": 4.402, - "step": 77 + "epoch": 32.992, + "eval_accuracy": 0.8063063063063063, + "eval_loss": 0.6353267431259155, + "eval_runtime": 9.3031, + "eval_samples_per_second": 47.726, + "eval_steps_per_second": 1.505, + "step": 1031 }, { - "epoch": 52.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2575104236602783, - "eval_runtime": 0.2279, - "eval_samples_per_second": 35.099, - "eval_steps_per_second": 4.387, - "step": 78 - }, - { - "epoch": 53.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2707685232162476, - "eval_runtime": 0.2281, - "eval_samples_per_second": 35.068, - "eval_steps_per_second": 4.384, - "step": 79 - }, - { - "epoch": 53.333333333333336, - "grad_norm": 4.976161479949951, - "learning_rate": 2.1367521367521368e-05, - "loss": 0.1561, - "step": 80 + "epoch": 33.28, + "grad_norm": 16.256759643554688, + "learning_rate": 4.3130227001194746e-05, + "loss": 0.255, + "step": 1040 }, { - "epoch": 54.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2665772438049316, - "eval_runtime": 0.2284, - "eval_samples_per_second": 35.019, - "eval_steps_per_second": 4.377, - "step": 81 + "epoch": 33.6, + "grad_norm": 12.84078311920166, + "learning_rate": 4.301075268817205e-05, + "loss": 0.2319, + "step": 1050 }, { - "epoch": 55.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2240548133850098, - "eval_runtime": 0.2281, - "eval_samples_per_second": 35.073, - "eval_steps_per_second": 4.384, - "step": 83 + "epoch": 33.92, + "grad_norm": 21.021757125854492, + "learning_rate": 4.289127837514935e-05, + "loss": 0.2529, + "step": 1060 }, { - "epoch": 56.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2088531255722046, - "eval_runtime": 0.227, - "eval_samples_per_second": 35.239, - "eval_steps_per_second": 4.405, - "step": 84 + "epoch": 33.984, + "eval_accuracy": 0.8243243243243243, + "eval_loss": 0.6627711653709412, + "eval_runtime": 9.3278, + "eval_samples_per_second": 47.6, + "eval_steps_per_second": 1.501, + "step": 1062 }, { - "epoch": 57.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1913509368896484, - "eval_runtime": 0.2251, - "eval_samples_per_second": 35.546, - "eval_steps_per_second": 4.443, - "step": 85 + "epoch": 34.24, + "grad_norm": 17.96116065979004, + "learning_rate": 4.277180406212665e-05, + "loss": 0.2474, + "step": 1070 }, { - "epoch": 58.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1558996438980103, - "eval_runtime": 0.231, - "eval_samples_per_second": 34.632, - "eval_steps_per_second": 4.329, - "step": 87 + "epoch": 34.56, + "grad_norm": 18.702285766601562, + "learning_rate": 4.265232974910394e-05, + "loss": 0.208, + "step": 1080 }, { - "epoch": 59.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1386878490447998, - "eval_runtime": 0.2302, - "eval_samples_per_second": 34.748, - "eval_steps_per_second": 4.344, - "step": 89 + "epoch": 34.88, + "grad_norm": 20.62737274169922, + "learning_rate": 4.253285543608124e-05, + "loss": 0.2601, + "step": 1090 }, { - "epoch": 60.0, - "grad_norm": 19.57244300842285, - "learning_rate": 1.7094017094017095e-05, - "loss": 0.1453, - "step": 90 + "epoch": 34.976, + "eval_accuracy": 0.8040540540540541, + "eval_loss": 0.6367062330245972, + "eval_runtime": 9.3069, + "eval_samples_per_second": 47.707, + "eval_steps_per_second": 1.504, + "step": 1093 }, { - "epoch": 60.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1336504220962524, - "eval_runtime": 0.2275, - "eval_samples_per_second": 35.169, - "eval_steps_per_second": 4.396, - "step": 90 + "epoch": 35.2, + "grad_norm": 16.595108032226562, + "learning_rate": 4.241338112305854e-05, + "loss": 0.2344, + "step": 1100 }, { - "epoch": 61.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1289597749710083, - "eval_runtime": 0.2329, - "eval_samples_per_second": 34.356, - "eval_steps_per_second": 4.295, - "step": 91 + "epoch": 35.52, + "grad_norm": 23.414844512939453, + "learning_rate": 4.2293906810035844e-05, + "loss": 0.2383, + "step": 1110 }, { - "epoch": 62.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1369203329086304, - "eval_runtime": 0.2293, - "eval_samples_per_second": 34.894, - "eval_steps_per_second": 4.362, - "step": 93 + "epoch": 35.84, + "grad_norm": 13.001286506652832, + "learning_rate": 4.2174432497013144e-05, + "loss": 0.2208, + "step": 1120 }, { - "epoch": 63.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1438777446746826, - "eval_runtime": 0.2263, - "eval_samples_per_second": 35.359, - "eval_steps_per_second": 4.42, - "step": 95 + "epoch": 36.0, + "eval_accuracy": 0.8288288288288288, + "eval_loss": 0.6312922835350037, + "eval_runtime": 9.351, + "eval_samples_per_second": 47.482, + "eval_steps_per_second": 1.497, + "step": 1125 }, { - "epoch": 64.0, - "eval_accuracy": 0.625, - "eval_loss": 1.14479660987854, - "eval_runtime": 0.2307, - "eval_samples_per_second": 34.673, - "eval_steps_per_second": 4.334, - "step": 96 - }, - { - "epoch": 65.0, - "eval_accuracy": 0.625, - "eval_loss": 1.153009057044983, - "eval_runtime": 0.2302, - "eval_samples_per_second": 34.748, - "eval_steps_per_second": 4.344, - "step": 97 - }, - { - "epoch": 66.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1718435287475586, - "eval_runtime": 0.2286, - "eval_samples_per_second": 35.003, - "eval_steps_per_second": 4.375, - "step": 99 - }, - { - "epoch": 66.66666666666667, - "grad_norm": 5.6247944831848145, - "learning_rate": 1.282051282051282e-05, - "loss": 0.1271, - "step": 100 + "epoch": 36.16, + "grad_norm": 19.46753692626953, + "learning_rate": 4.2054958183990445e-05, + "loss": 0.1937, + "step": 1130 }, { - "epoch": 67.0, - "eval_accuracy": 0.625, - "eval_loss": 1.1965450048446655, - "eval_runtime": 0.2246, - "eval_samples_per_second": 35.611, - "eval_steps_per_second": 4.451, - "step": 101 + "epoch": 36.48, + "grad_norm": 18.740930557250977, + "learning_rate": 4.1935483870967746e-05, + "loss": 0.2337, + "step": 1140 }, { - "epoch": 68.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2091799974441528, - "eval_runtime": 0.2266, - "eval_samples_per_second": 35.297, - "eval_steps_per_second": 4.412, - "step": 102 + "epoch": 36.8, + "grad_norm": 16.52130889892578, + "learning_rate": 4.1816009557945046e-05, + "loss": 0.2342, + "step": 1150 }, { - "epoch": 69.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2176190614700317, - "eval_runtime": 0.226, - "eval_samples_per_second": 35.4, - "eval_steps_per_second": 4.425, - "step": 103 + "epoch": 36.992, + "eval_accuracy": 0.8378378378378378, + "eval_loss": 0.5969250798225403, + "eval_runtime": 9.5016, + "eval_samples_per_second": 46.729, + "eval_steps_per_second": 1.473, + "step": 1156 }, { - "epoch": 70.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2336797714233398, - "eval_runtime": 0.2275, - "eval_samples_per_second": 35.172, - "eval_steps_per_second": 4.397, - "step": 105 + "epoch": 37.12, + "grad_norm": 20.486103057861328, + "learning_rate": 4.169653524492234e-05, + "loss": 0.2097, + "step": 1160 }, { - "epoch": 71.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2376230955123901, - "eval_runtime": 0.2255, - "eval_samples_per_second": 35.481, - "eval_steps_per_second": 4.435, - "step": 107 + "epoch": 37.44, + "grad_norm": 27.214502334594727, + "learning_rate": 4.157706093189964e-05, + "loss": 0.2257, + "step": 1170 }, { - "epoch": 72.0, - "eval_accuracy": 0.625, - "eval_loss": 1.238419532775879, - "eval_runtime": 0.2258, - "eval_samples_per_second": 35.428, - "eval_steps_per_second": 4.429, - "step": 108 - }, - { - "epoch": 73.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2378357648849487, - "eval_runtime": 0.2254, - "eval_samples_per_second": 35.491, - "eval_steps_per_second": 4.436, - "step": 109 - }, - { - "epoch": 73.33333333333333, - "grad_norm": 16.670150756835938, - "learning_rate": 8.547008547008548e-06, - "loss": 0.1153, - "step": 110 + "epoch": 37.76, + "grad_norm": 13.391148567199707, + "learning_rate": 4.145758661887694e-05, + "loss": 0.2122, + "step": 1180 }, { - "epoch": 74.0, - "eval_accuracy": 0.625, - "eval_loss": 1.238478183746338, - "eval_runtime": 0.2784, - "eval_samples_per_second": 28.737, - "eval_steps_per_second": 3.592, - "step": 111 + "epoch": 37.984, + "eval_accuracy": 0.8198198198198198, + "eval_loss": 0.6390910148620605, + "eval_runtime": 9.1793, + "eval_samples_per_second": 48.37, + "eval_steps_per_second": 1.525, + "step": 1187 }, { - "epoch": 75.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2316068410873413, - "eval_runtime": 0.2262, - "eval_samples_per_second": 35.368, - "eval_steps_per_second": 4.421, - "step": 113 + "epoch": 38.08, + "grad_norm": 14.237875938415527, + "learning_rate": 4.133811230585424e-05, + "loss": 0.1959, + "step": 1190 }, { - "epoch": 76.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2274171113967896, - "eval_runtime": 0.226, - "eval_samples_per_second": 35.392, - "eval_steps_per_second": 4.424, - "step": 114 + "epoch": 38.4, + "grad_norm": 15.39142894744873, + "learning_rate": 4.121863799283154e-05, + "loss": 0.2144, + "step": 1200 }, { - "epoch": 77.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2251871824264526, - "eval_runtime": 0.2311, - "eval_samples_per_second": 34.618, - "eval_steps_per_second": 4.327, - "step": 115 + "epoch": 38.72, + "grad_norm": 20.199790954589844, + "learning_rate": 4.109916367980884e-05, + "loss": 0.1791, + "step": 1210 }, { - "epoch": 78.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2195590734481812, - "eval_runtime": 0.2344, - "eval_samples_per_second": 34.126, - "eval_steps_per_second": 4.266, - "step": 117 + "epoch": 38.976, + "eval_accuracy": 0.8108108108108109, + "eval_loss": 0.6770870685577393, + "eval_runtime": 9.2464, + "eval_samples_per_second": 48.018, + "eval_steps_per_second": 1.514, + "step": 1218 }, { - "epoch": 79.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2144805192947388, - "eval_runtime": 0.2276, - "eval_samples_per_second": 35.145, - "eval_steps_per_second": 4.393, - "step": 119 + "epoch": 39.04, + "grad_norm": 8.080174446105957, + "learning_rate": 4.0979689366786144e-05, + "loss": 0.2185, + "step": 1220 }, { - "epoch": 80.0, - "grad_norm": 18.376235961914062, - "learning_rate": 4.273504273504274e-06, - "loss": 0.0882, - "step": 120 + "epoch": 39.36, + "grad_norm": 35.202110290527344, + "learning_rate": 4.0860215053763444e-05, + "loss": 0.2096, + "step": 1230 }, { - "epoch": 80.0, - "eval_accuracy": 0.625, - "eval_loss": 1.213006615638733, - "eval_runtime": 0.2356, - "eval_samples_per_second": 33.955, - "eval_steps_per_second": 4.244, - "step": 120 + "epoch": 39.68, + "grad_norm": 14.537675857543945, + "learning_rate": 4.074074074074074e-05, + "loss": 0.2078, + "step": 1240 }, { - "epoch": 81.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2117276191711426, - "eval_runtime": 0.2323, - "eval_samples_per_second": 34.443, - "eval_steps_per_second": 4.305, - "step": 121 + "epoch": 40.0, + "grad_norm": 14.412237167358398, + "learning_rate": 4.062126642771804e-05, + "loss": 0.2113, + "step": 1250 }, { - "epoch": 82.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2097160816192627, - "eval_runtime": 0.2308, - "eval_samples_per_second": 34.655, - "eval_steps_per_second": 4.332, - "step": 123 + "epoch": 40.0, + "eval_accuracy": 0.8085585585585585, + "eval_loss": 0.7035388946533203, + "eval_runtime": 9.3229, + "eval_samples_per_second": 47.625, + "eval_steps_per_second": 1.502, + "step": 1250 }, { - "epoch": 83.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2075334787368774, - "eval_runtime": 0.2252, - "eval_samples_per_second": 35.532, - "eval_steps_per_second": 4.441, - "step": 125 + "epoch": 40.32, + "grad_norm": 12.491999626159668, + "learning_rate": 4.050179211469534e-05, + "loss": 0.1753, + "step": 1260 }, { - "epoch": 84.0, - "eval_accuracy": 0.625, - "eval_loss": 1.205414056777954, - "eval_runtime": 0.2269, - "eval_samples_per_second": 35.264, - "eval_steps_per_second": 4.408, - "step": 126 - }, - { - "epoch": 85.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2038739919662476, - "eval_runtime": 0.2445, - "eval_samples_per_second": 32.718, - "eval_steps_per_second": 4.09, - "step": 127 - }, - { - "epoch": 86.0, - "eval_accuracy": 0.625, - "eval_loss": 1.2025001049041748, - "eval_runtime": 0.244, - "eval_samples_per_second": 32.782, - "eval_steps_per_second": 4.098, - "step": 129 - }, - { - "epoch": 86.66666666666667, - "grad_norm": 6.174490928649902, - "learning_rate": 0.0, - "loss": 0.0987, - "step": 130 + "epoch": 40.64, + "grad_norm": 16.06132698059082, + "learning_rate": 4.038231780167264e-05, + "loss": 0.1666, + "step": 1270 }, { - "epoch": 86.66666666666667, - "eval_accuracy": 0.625, - "eval_loss": 1.2021381855010986, - "eval_runtime": 0.2265, - "eval_samples_per_second": 35.317, - "eval_steps_per_second": 4.415, - "step": 130 + "epoch": 40.96, + "grad_norm": 23.15831184387207, + "learning_rate": 4.026284348864994e-05, + "loss": 0.1703, + "step": 1280 + }, + { + "epoch": 40.992, + "eval_accuracy": 0.8153153153153153, + "eval_loss": 0.7095688581466675, + "eval_runtime": 9.4284, + "eval_samples_per_second": 47.092, + "eval_steps_per_second": 1.485, + "step": 1281 + }, + { + "epoch": 41.28, + "grad_norm": 20.108991622924805, + "learning_rate": 4.014336917562724e-05, + "loss": 0.1522, + "step": 1290 + }, + { + "epoch": 41.6, + "grad_norm": 24.87325096130371, + "learning_rate": 4.002389486260454e-05, + "loss": 0.194, + "step": 1300 + }, + { + "epoch": 41.92, + "grad_norm": 15.418107032775879, + "learning_rate": 3.990442054958184e-05, + "loss": 0.1751, + "step": 1310 + }, + { + "epoch": 41.984, + "eval_accuracy": 0.8445945945945946, + "eval_loss": 0.596449613571167, + "eval_runtime": 9.1487, + "eval_samples_per_second": 48.532, + "eval_steps_per_second": 1.53, + "step": 1312 + }, + { + "epoch": 42.24, + "grad_norm": 6.889113903045654, + "learning_rate": 3.978494623655914e-05, + "loss": 0.1619, + "step": 1320 + }, + { + "epoch": 42.56, + "grad_norm": 16.027725219726562, + "learning_rate": 3.9665471923536444e-05, + "loss": 0.1663, + "step": 1330 + }, + { + "epoch": 42.88, + "grad_norm": 24.438074111938477, + "learning_rate": 3.9545997610513744e-05, + "loss": 0.1889, + "step": 1340 + }, + { + "epoch": 42.976, + "eval_accuracy": 0.8445945945945946, + "eval_loss": 0.6606948971748352, + "eval_runtime": 9.3714, + "eval_samples_per_second": 47.378, + "eval_steps_per_second": 1.494, + "step": 1343 + }, + { + "epoch": 43.2, + "grad_norm": 16.80255889892578, + "learning_rate": 3.9426523297491045e-05, + "loss": 0.1968, + "step": 1350 + }, + { + "epoch": 43.52, + "grad_norm": 16.66586685180664, + "learning_rate": 3.9307048984468345e-05, + "loss": 0.1971, + "step": 1360 + }, + { + "epoch": 43.84, + "grad_norm": 13.07772445678711, + "learning_rate": 3.9187574671445646e-05, + "loss": 0.1791, + "step": 1370 + }, + { + "epoch": 44.0, + "eval_accuracy": 0.8243243243243243, + "eval_loss": 0.6999737024307251, + "eval_runtime": 9.2212, + "eval_samples_per_second": 48.15, + "eval_steps_per_second": 1.518, + "step": 1375 + }, + { + "epoch": 44.16, + "grad_norm": 11.09182071685791, + "learning_rate": 3.906810035842295e-05, + "loss": 0.1932, + "step": 1380 + }, + { + "epoch": 44.48, + "grad_norm": 12.857522964477539, + "learning_rate": 3.894862604540024e-05, + "loss": 0.1753, + "step": 1390 + }, + { + "epoch": 44.8, + "grad_norm": 13.396793365478516, + "learning_rate": 3.882915173237754e-05, + "loss": 0.1372, + "step": 1400 + }, + { + "epoch": 44.992, + "eval_accuracy": 0.8243243243243243, + "eval_loss": 0.6866002678871155, + "eval_runtime": 9.2589, + "eval_samples_per_second": 47.954, + "eval_steps_per_second": 1.512, + "step": 1406 + }, + { + "epoch": 45.12, + "grad_norm": 13.315982818603516, + "learning_rate": 3.870967741935484e-05, + "loss": 0.169, + "step": 1410 + }, + { + "epoch": 45.44, + "grad_norm": 15.068939208984375, + "learning_rate": 3.859020310633214e-05, + "loss": 0.1658, + "step": 1420 + }, + { + "epoch": 45.76, + "grad_norm": 12.234659194946289, + "learning_rate": 3.847072879330944e-05, + "loss": 0.1785, + "step": 1430 + }, + { + "epoch": 45.984, + "eval_accuracy": 0.8265765765765766, + "eval_loss": 0.6620847582817078, + "eval_runtime": 9.1604, + "eval_samples_per_second": 48.469, + "eval_steps_per_second": 1.528, + "step": 1437 + }, + { + "epoch": 46.08, + "grad_norm": 15.436200141906738, + "learning_rate": 3.8351254480286743e-05, + "loss": 0.159, + "step": 1440 + }, + { + "epoch": 46.4, + "grad_norm": 10.734991073608398, + "learning_rate": 3.8231780167264044e-05, + "loss": 0.1294, + "step": 1450 + }, + { + "epoch": 46.72, + "grad_norm": 13.887167930603027, + "learning_rate": 3.8112305854241345e-05, + "loss": 0.1469, + "step": 1460 + }, + { + "epoch": 46.976, + "eval_accuracy": 0.8265765765765766, + "eval_loss": 0.6390572190284729, + "eval_runtime": 9.1369, + "eval_samples_per_second": 48.594, + "eval_steps_per_second": 1.532, + "step": 1468 + }, + { + "epoch": 47.04, + "grad_norm": 16.58067512512207, + "learning_rate": 3.799283154121864e-05, + "loss": 0.162, + "step": 1470 + }, + { + "epoch": 47.36, + "grad_norm": 11.742984771728516, + "learning_rate": 3.787335722819594e-05, + "loss": 0.1564, + "step": 1480 + }, + { + "epoch": 47.68, + "grad_norm": 13.77131175994873, + "learning_rate": 3.775388291517324e-05, + "loss": 0.1489, + "step": 1490 + }, + { + "epoch": 48.0, + "grad_norm": 19.7138671875, + "learning_rate": 3.763440860215054e-05, + "loss": 0.1628, + "step": 1500 + }, + { + "epoch": 48.0, + "eval_accuracy": 0.8355855855855856, + "eval_loss": 0.6623378396034241, + "eval_runtime": 9.2787, + "eval_samples_per_second": 47.852, + "eval_steps_per_second": 1.509, + "step": 1500 + }, + { + "epoch": 48.32, + "grad_norm": 19.31183433532715, + "learning_rate": 3.751493428912784e-05, + "loss": 0.1831, + "step": 1510 + }, + { + "epoch": 48.64, + "grad_norm": 10.484661102294922, + "learning_rate": 3.739545997610514e-05, + "loss": 0.1434, + "step": 1520 + }, + { + "epoch": 48.96, + "grad_norm": 14.728410720825195, + "learning_rate": 3.727598566308244e-05, + "loss": 0.1425, + "step": 1530 + }, + { + "epoch": 48.992, + "eval_accuracy": 0.8288288288288288, + "eval_loss": 0.6442891955375671, + "eval_runtime": 9.2746, + "eval_samples_per_second": 47.873, + "eval_steps_per_second": 1.509, + "step": 1531 + }, + { + "epoch": 49.28, + "grad_norm": 14.076499938964844, + "learning_rate": 3.715651135005974e-05, + "loss": 0.1476, + "step": 1540 + }, + { + "epoch": 49.6, + "grad_norm": 29.458377838134766, + "learning_rate": 3.7037037037037037e-05, + "loss": 0.1621, + "step": 1550 + }, + { + "epoch": 49.92, + "grad_norm": 11.280054092407227, + "learning_rate": 3.691756272401434e-05, + "loss": 0.1727, + "step": 1560 + }, + { + "epoch": 49.984, + "eval_accuracy": 0.8445945945945946, + "eval_loss": 0.6360700726509094, + "eval_runtime": 9.1824, + "eval_samples_per_second": 48.353, + "eval_steps_per_second": 1.525, + "step": 1562 + }, + { + "epoch": 50.24, + "grad_norm": 14.625510215759277, + "learning_rate": 3.679808841099164e-05, + "loss": 0.137, + "step": 1570 + }, + { + "epoch": 50.56, + "grad_norm": 15.493489265441895, + "learning_rate": 3.667861409796894e-05, + "loss": 0.1319, + "step": 1580 + }, + { + "epoch": 50.88, + "grad_norm": 8.854424476623535, + "learning_rate": 3.655913978494624e-05, + "loss": 0.1442, + "step": 1590 + }, + { + "epoch": 50.976, + "eval_accuracy": 0.8490990990990991, + "eval_loss": 0.6397358179092407, + "eval_runtime": 9.322, + "eval_samples_per_second": 47.629, + "eval_steps_per_second": 1.502, + "step": 1593 + }, + { + "epoch": 51.2, + "grad_norm": 10.89920425415039, + "learning_rate": 3.643966547192354e-05, + "loss": 0.1272, + "step": 1600 + }, + { + "epoch": 51.52, + "grad_norm": 11.777154922485352, + "learning_rate": 3.632019115890084e-05, + "loss": 0.1019, + "step": 1610 + }, + { + "epoch": 51.84, + "grad_norm": 10.171144485473633, + "learning_rate": 3.6200716845878134e-05, + "loss": 0.1386, + "step": 1620 + }, + { + "epoch": 52.0, + "eval_accuracy": 0.8423423423423423, + "eval_loss": 0.6835172772407532, + "eval_runtime": 9.2477, + "eval_samples_per_second": 48.012, + "eval_steps_per_second": 1.514, + "step": 1625 + }, + { + "epoch": 52.16, + "grad_norm": 16.571006774902344, + "learning_rate": 3.6081242532855435e-05, + "loss": 0.1235, + "step": 1630 + }, + { + "epoch": 52.48, + "grad_norm": 6.069274425506592, + "learning_rate": 3.5961768219832735e-05, + "loss": 0.1715, + "step": 1640 + }, + { + "epoch": 52.8, + "grad_norm": 21.08418083190918, + "learning_rate": 3.5842293906810036e-05, + "loss": 0.1564, + "step": 1650 + }, + { + "epoch": 52.992, + "eval_accuracy": 0.8265765765765766, + "eval_loss": 0.7071699500083923, + "eval_runtime": 9.4809, + "eval_samples_per_second": 46.831, + "eval_steps_per_second": 1.477, + "step": 1656 + }, + { + "epoch": 53.12, + "grad_norm": 15.996872901916504, + "learning_rate": 3.5722819593787336e-05, + "loss": 0.1518, + "step": 1660 + }, + { + "epoch": 53.44, + "grad_norm": 11.845250129699707, + "learning_rate": 3.560334528076464e-05, + "loss": 0.1328, + "step": 1670 + }, + { + "epoch": 53.76, + "grad_norm": 7.606972694396973, + "learning_rate": 3.548387096774194e-05, + "loss": 0.1151, + "step": 1680 + }, + { + "epoch": 53.984, + "eval_accuracy": 0.831081081081081, + "eval_loss": 0.6834642291069031, + "eval_runtime": 9.5012, + "eval_samples_per_second": 46.731, + "eval_steps_per_second": 1.473, + "step": 1687 + }, + { + "epoch": 54.08, + "grad_norm": 10.345972061157227, + "learning_rate": 3.536439665471924e-05, + "loss": 0.1107, + "step": 1690 + }, + { + "epoch": 54.4, + "grad_norm": 11.632840156555176, + "learning_rate": 3.524492234169653e-05, + "loss": 0.1504, + "step": 1700 + }, + { + "epoch": 54.72, + "grad_norm": 10.270337104797363, + "learning_rate": 3.512544802867383e-05, + "loss": 0.1446, + "step": 1710 + }, + { + "epoch": 54.976, + "eval_accuracy": 0.8198198198198198, + "eval_loss": 0.7347476482391357, + "eval_runtime": 9.0983, + "eval_samples_per_second": 48.8, + "eval_steps_per_second": 1.539, + "step": 1718 + }, + { + "epoch": 55.04, + "grad_norm": 9.00815486907959, + "learning_rate": 3.500597371565113e-05, + "loss": 0.116, + "step": 1720 + }, + { + "epoch": 55.36, + "grad_norm": 3.8150763511657715, + "learning_rate": 3.4886499402628434e-05, + "loss": 0.1329, + "step": 1730 + }, + { + "epoch": 55.68, + "grad_norm": 10.648646354675293, + "learning_rate": 3.4767025089605734e-05, + "loss": 0.1214, + "step": 1740 + }, + { + "epoch": 56.0, + "grad_norm": 13.604965209960938, + "learning_rate": 3.4647550776583035e-05, + "loss": 0.1353, + "step": 1750 + }, + { + "epoch": 56.0, + "eval_accuracy": 0.8400900900900901, + "eval_loss": 0.6935142278671265, + "eval_runtime": 9.1929, + "eval_samples_per_second": 48.298, + "eval_steps_per_second": 1.523, + "step": 1750 + }, + { + "epoch": 56.32, + "grad_norm": 16.767179489135742, + "learning_rate": 3.4528076463560336e-05, + "loss": 0.1196, + "step": 1760 + }, + { + "epoch": 56.64, + "grad_norm": 9.230161666870117, + "learning_rate": 3.4408602150537636e-05, + "loss": 0.125, + "step": 1770 + }, + { + "epoch": 56.96, + "grad_norm": 17.74578857421875, + "learning_rate": 3.428912783751494e-05, + "loss": 0.13, + "step": 1780 + }, + { + "epoch": 56.992, + "eval_accuracy": 0.8198198198198198, + "eval_loss": 0.7337386608123779, + "eval_runtime": 9.5019, + "eval_samples_per_second": 46.728, + "eval_steps_per_second": 1.473, + "step": 1781 + }, + { + "epoch": 57.28, + "grad_norm": 8.565692901611328, + "learning_rate": 3.416965352449224e-05, + "loss": 0.112, + "step": 1790 + }, + { + "epoch": 57.6, + "grad_norm": 9.264266014099121, + "learning_rate": 3.405017921146954e-05, + "loss": 0.1194, + "step": 1800 + }, + { + "epoch": 57.92, + "grad_norm": 10.091086387634277, + "learning_rate": 3.393070489844684e-05, + "loss": 0.1312, + "step": 1810 + }, + { + "epoch": 57.984, + "eval_accuracy": 0.831081081081081, + "eval_loss": 0.6625228524208069, + "eval_runtime": 9.3477, + "eval_samples_per_second": 47.498, + "eval_steps_per_second": 1.498, + "step": 1812 + }, + { + "epoch": 58.24, + "grad_norm": 11.256436347961426, + "learning_rate": 3.381123058542414e-05, + "loss": 0.1376, + "step": 1820 + }, + { + "epoch": 58.56, + "grad_norm": 11.114587783813477, + "learning_rate": 3.369175627240144e-05, + "loss": 0.1232, + "step": 1830 + }, + { + "epoch": 58.88, + "grad_norm": 16.502622604370117, + "learning_rate": 3.357228195937874e-05, + "loss": 0.1201, + "step": 1840 + }, + { + "epoch": 58.976, + "eval_accuracy": 0.8243243243243243, + "eval_loss": 0.6955697536468506, + "eval_runtime": 9.3809, + "eval_samples_per_second": 47.33, + "eval_steps_per_second": 1.492, + "step": 1843 + }, + { + "epoch": 59.2, + "grad_norm": 15.201342582702637, + "learning_rate": 3.3452807646356034e-05, + "loss": 0.1237, + "step": 1850 + }, + { + "epoch": 59.52, + "grad_norm": 9.248207092285156, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.0885, + "step": 1860 + }, + { + "epoch": 59.84, + "grad_norm": 3.726451873779297, + "learning_rate": 3.3213859020310636e-05, + "loss": 0.1411, + "step": 1870 + }, + { + "epoch": 60.0, + "eval_accuracy": 0.8243243243243243, + "eval_loss": 0.7289673686027527, + "eval_runtime": 9.594, + "eval_samples_per_second": 46.279, + "eval_steps_per_second": 1.459, + "step": 1875 + }, + { + "epoch": 60.16, + "grad_norm": 14.892931938171387, + "learning_rate": 3.3094384707287936e-05, + "loss": 0.1327, + "step": 1880 + }, + { + "epoch": 60.48, + "grad_norm": 13.88679313659668, + "learning_rate": 3.297491039426524e-05, + "loss": 0.1031, + "step": 1890 + }, + { + "epoch": 60.8, + "grad_norm": 10.291778564453125, + "learning_rate": 3.285543608124254e-05, + "loss": 0.1116, + "step": 1900 + }, + { + "epoch": 60.992, + "eval_accuracy": 0.8355855855855856, + "eval_loss": 0.7051534056663513, + "eval_runtime": 9.235, + "eval_samples_per_second": 48.078, + "eval_steps_per_second": 1.516, + "step": 1906 + }, + { + "epoch": 61.12, + "grad_norm": 10.923468589782715, + "learning_rate": 3.273596176821984e-05, + "loss": 0.0994, + "step": 1910 + }, + { + "epoch": 61.44, + "grad_norm": 23.09986114501953, + "learning_rate": 3.261648745519714e-05, + "loss": 0.1304, + "step": 1920 + }, + { + "epoch": 61.76, + "grad_norm": 13.718985557556152, + "learning_rate": 3.249701314217443e-05, + "loss": 0.1251, + "step": 1930 + }, + { + "epoch": 61.984, + "eval_accuracy": 0.831081081081081, + "eval_loss": 0.6914505362510681, + "eval_runtime": 9.385, + "eval_samples_per_second": 47.31, + "eval_steps_per_second": 1.492, + "step": 1937 + }, + { + "epoch": 62.08, + "grad_norm": 11.085161209106445, + "learning_rate": 3.237753882915173e-05, + "loss": 0.1472, + "step": 1940 + }, + { + "epoch": 62.4, + "grad_norm": 7.979318618774414, + "learning_rate": 3.2258064516129034e-05, + "loss": 0.1042, + "step": 1950 + }, + { + "epoch": 62.72, + "grad_norm": 7.216133117675781, + "learning_rate": 3.2138590203106334e-05, + "loss": 0.1101, + "step": 1960 + }, + { + "epoch": 62.976, + "eval_accuracy": 0.8378378378378378, + "eval_loss": 0.6457498669624329, + "eval_runtime": 9.3484, + "eval_samples_per_second": 47.495, + "eval_steps_per_second": 1.498, + "step": 1968 + }, + { + "epoch": 63.04, + "grad_norm": 16.771268844604492, + "learning_rate": 3.2019115890083635e-05, + "loss": 0.1168, + "step": 1970 + }, + { + "epoch": 63.36, + "grad_norm": 42.29957962036133, + "learning_rate": 3.1899641577060935e-05, + "loss": 0.1213, + "step": 1980 + }, + { + "epoch": 63.68, + "grad_norm": 15.948644638061523, + "learning_rate": 3.1780167264038236e-05, + "loss": 0.0991, + "step": 1990 + }, + { + "epoch": 64.0, + "grad_norm": 6.990386962890625, + "learning_rate": 3.1660692951015537e-05, + "loss": 0.0883, + "step": 2000 + }, + { + "epoch": 64.0, + "eval_accuracy": 0.8378378378378378, + "eval_loss": 0.6553166508674622, + "eval_runtime": 9.3629, + "eval_samples_per_second": 47.421, + "eval_steps_per_second": 1.495, + "step": 2000 + }, + { + "epoch": 64.32, + "grad_norm": 10.886075019836426, + "learning_rate": 3.154121863799283e-05, + "loss": 0.1273, + "step": 2010 + }, + { + "epoch": 64.64, + "grad_norm": 7.905044078826904, + "learning_rate": 3.142174432497013e-05, + "loss": 0.1052, + "step": 2020 + }, + { + "epoch": 64.96, + "grad_norm": 4.7507734298706055, + "learning_rate": 3.130227001194743e-05, + "loss": 0.1225, + "step": 2030 + }, + { + "epoch": 64.992, + "eval_accuracy": 0.8400900900900901, + "eval_loss": 0.6454054713249207, + "eval_runtime": 9.374, + "eval_samples_per_second": 47.365, + "eval_steps_per_second": 1.493, + "step": 2031 + }, + { + "epoch": 65.28, + "grad_norm": 13.874588012695312, + "learning_rate": 3.118279569892473e-05, + "loss": 0.096, + "step": 2040 + }, + { + "epoch": 65.6, + "grad_norm": 13.141563415527344, + "learning_rate": 3.106332138590203e-05, + "loss": 0.1033, + "step": 2050 + }, + { + "epoch": 65.92, + "grad_norm": 9.248854637145996, + "learning_rate": 3.0943847072879333e-05, + "loss": 0.1135, + "step": 2060 + }, + { + "epoch": 65.984, + "eval_accuracy": 0.8513513513513513, + "eval_loss": 0.6616309881210327, + "eval_runtime": 9.2232, + "eval_samples_per_second": 48.14, + "eval_steps_per_second": 1.518, + "step": 2062 + }, + { + "epoch": 66.24, + "grad_norm": 8.99073600769043, + "learning_rate": 3.0824372759856634e-05, + "loss": 0.1237, + "step": 2070 + }, + { + "epoch": 66.56, + "grad_norm": 14.106173515319824, + "learning_rate": 3.070489844683393e-05, + "loss": 0.12, + "step": 2080 + }, + { + "epoch": 66.88, + "grad_norm": 17.03522300720215, + "learning_rate": 3.058542413381123e-05, + "loss": 0.1009, + "step": 2090 + }, + { + "epoch": 66.976, + "eval_accuracy": 0.8536036036036037, + "eval_loss": 0.6374746561050415, + "eval_runtime": 9.2564, + "eval_samples_per_second": 47.967, + "eval_steps_per_second": 1.512, + "step": 2093 + }, + { + "epoch": 67.2, + "grad_norm": 15.714427947998047, + "learning_rate": 3.046594982078853e-05, + "loss": 0.1002, + "step": 2100 + }, + { + "epoch": 67.52, + "grad_norm": 8.479767799377441, + "learning_rate": 3.034647550776583e-05, + "loss": 0.0863, + "step": 2110 + }, + { + "epoch": 67.84, + "grad_norm": 20.34163475036621, + "learning_rate": 3.022700119474313e-05, + "loss": 0.1027, + "step": 2120 + }, + { + "epoch": 68.0, + "eval_accuracy": 0.8265765765765766, + "eval_loss": 0.6754458546638489, + "eval_runtime": 9.3779, + "eval_samples_per_second": 47.346, + "eval_steps_per_second": 1.493, + "step": 2125 + }, + { + "epoch": 68.16, + "grad_norm": 6.332460403442383, + "learning_rate": 3.010752688172043e-05, + "loss": 0.0972, + "step": 2130 + }, + { + "epoch": 68.48, + "grad_norm": 19.568702697753906, + "learning_rate": 2.998805256869773e-05, + "loss": 0.0891, + "step": 2140 + }, + { + "epoch": 68.8, + "grad_norm": 13.815423011779785, + "learning_rate": 2.9868578255675032e-05, + "loss": 0.0925, + "step": 2150 + }, + { + "epoch": 68.992, + "eval_accuracy": 0.8175675675675675, + "eval_loss": 0.7497189044952393, + "eval_runtime": 9.2003, + "eval_samples_per_second": 48.259, + "eval_steps_per_second": 1.522, + "step": 2156 + }, + { + "epoch": 69.12, + "grad_norm": 12.500412940979004, + "learning_rate": 2.974910394265233e-05, + "loss": 0.0903, + "step": 2160 + }, + { + "epoch": 69.44, + "grad_norm": 16.62474250793457, + "learning_rate": 2.962962962962963e-05, + "loss": 0.099, + "step": 2170 + }, + { + "epoch": 69.76, + "grad_norm": 5.302067279815674, + "learning_rate": 2.951015531660693e-05, + "loss": 0.0878, + "step": 2180 + }, + { + "epoch": 69.984, + "eval_accuracy": 0.8490990990990991, + "eval_loss": 0.6572933197021484, + "eval_runtime": 9.2909, + "eval_samples_per_second": 47.789, + "eval_steps_per_second": 1.507, + "step": 2187 + }, + { + "epoch": 70.08, + "grad_norm": 5.19707727432251, + "learning_rate": 2.939068100358423e-05, + "loss": 0.0701, + "step": 2190 + }, + { + "epoch": 70.4, + "grad_norm": 18.371549606323242, + "learning_rate": 2.9271206690561532e-05, + "loss": 0.0992, + "step": 2200 + }, + { + "epoch": 70.72, + "grad_norm": 13.812338829040527, + "learning_rate": 2.9151732377538832e-05, + "loss": 0.1093, + "step": 2210 + }, + { + "epoch": 70.976, + "eval_accuracy": 0.8355855855855856, + "eval_loss": 0.701532781124115, + "eval_runtime": 9.2401, + "eval_samples_per_second": 48.051, + "eval_steps_per_second": 1.515, + "step": 2218 + }, + { + "epoch": 71.04, + "grad_norm": 7.2299580574035645, + "learning_rate": 2.9032258064516133e-05, + "loss": 0.0814, + "step": 2220 + }, + { + "epoch": 71.36, + "grad_norm": 9.509716987609863, + "learning_rate": 2.8912783751493434e-05, + "loss": 0.0957, + "step": 2230 + }, + { + "epoch": 71.68, + "grad_norm": 21.156930923461914, + "learning_rate": 2.8793309438470727e-05, + "loss": 0.1, + "step": 2240 + }, + { + "epoch": 72.0, + "grad_norm": 9.478891372680664, + "learning_rate": 2.8673835125448028e-05, + "loss": 0.1024, + "step": 2250 + }, + { + "epoch": 72.0, + "eval_accuracy": 0.8445945945945946, + "eval_loss": 0.6907037496566772, + "eval_runtime": 9.2757, + "eval_samples_per_second": 47.867, + "eval_steps_per_second": 1.509, + "step": 2250 + }, + { + "epoch": 72.32, + "grad_norm": 20.353267669677734, + "learning_rate": 2.855436081242533e-05, + "loss": 0.1022, + "step": 2260 + }, + { + "epoch": 72.64, + "grad_norm": 13.162679672241211, + "learning_rate": 2.843488649940263e-05, + "loss": 0.1155, + "step": 2270 + }, + { + "epoch": 72.96, + "grad_norm": 11.615581512451172, + "learning_rate": 2.831541218637993e-05, + "loss": 0.0934, + "step": 2280 + }, + { + "epoch": 72.992, + "eval_accuracy": 0.8355855855855856, + "eval_loss": 0.7059099078178406, + "eval_runtime": 9.2867, + "eval_samples_per_second": 47.81, + "eval_steps_per_second": 1.508, + "step": 2281 + }, + { + "epoch": 73.28, + "grad_norm": 22.532894134521484, + "learning_rate": 2.819593787335723e-05, + "loss": 0.0751, + "step": 2290 + }, + { + "epoch": 73.6, + "grad_norm": 17.28580093383789, + "learning_rate": 2.807646356033453e-05, + "loss": 0.1037, + "step": 2300 + }, + { + "epoch": 73.92, + "grad_norm": 9.774279594421387, + "learning_rate": 2.7956989247311828e-05, + "loss": 0.103, + "step": 2310 + }, + { + "epoch": 73.984, + "eval_accuracy": 0.8355855855855856, + "eval_loss": 0.715863823890686, + "eval_runtime": 9.2962, + "eval_samples_per_second": 47.762, + "eval_steps_per_second": 1.506, + "step": 2312 + }, + { + "epoch": 74.24, + "grad_norm": 11.014548301696777, + "learning_rate": 2.783751493428913e-05, + "loss": 0.0769, + "step": 2320 + }, + { + "epoch": 74.56, + "grad_norm": 12.803121566772461, + "learning_rate": 2.771804062126643e-05, + "loss": 0.0728, + "step": 2330 + }, + { + "epoch": 74.88, + "grad_norm": 11.47424602508545, + "learning_rate": 2.759856630824373e-05, + "loss": 0.0974, + "step": 2340 + }, + { + "epoch": 74.976, + "eval_accuracy": 0.8265765765765766, + "eval_loss": 0.7323743104934692, + "eval_runtime": 9.3389, + "eval_samples_per_second": 47.543, + "eval_steps_per_second": 1.499, + "step": 2343 + }, + { + "epoch": 75.2, + "grad_norm": 14.975933074951172, + "learning_rate": 2.747909199522103e-05, + "loss": 0.0854, + "step": 2350 + }, + { + "epoch": 75.52, + "grad_norm": 10.35051441192627, + "learning_rate": 2.735961768219833e-05, + "loss": 0.0975, + "step": 2360 + }, + { + "epoch": 75.84, + "grad_norm": 22.704221725463867, + "learning_rate": 2.7240143369175632e-05, + "loss": 0.1049, + "step": 2370 + }, + { + "epoch": 76.0, + "eval_accuracy": 0.831081081081081, + "eval_loss": 0.7397353649139404, + "eval_runtime": 9.3333, + "eval_samples_per_second": 47.572, + "eval_steps_per_second": 1.5, + "step": 2375 + }, + { + "epoch": 76.16, + "grad_norm": 5.02980375289917, + "learning_rate": 2.7120669056152932e-05, + "loss": 0.1032, + "step": 2380 + }, + { + "epoch": 76.48, + "grad_norm": 17.770723342895508, + "learning_rate": 2.7001194743130226e-05, + "loss": 0.0938, + "step": 2390 + }, + { + "epoch": 76.8, + "grad_norm": 11.298961639404297, + "learning_rate": 2.6881720430107527e-05, + "loss": 0.097, + "step": 2400 + }, + { + "epoch": 76.992, + "eval_accuracy": 0.8175675675675675, + "eval_loss": 0.7529385089874268, + "eval_runtime": 9.1812, + "eval_samples_per_second": 48.36, + "eval_steps_per_second": 1.525, + "step": 2406 + }, + { + "epoch": 77.12, + "grad_norm": 10.798528671264648, + "learning_rate": 2.6762246117084827e-05, + "loss": 0.0675, + "step": 2410 + }, + { + "epoch": 77.44, + "grad_norm": 10.555415153503418, + "learning_rate": 2.6642771804062128e-05, + "loss": 0.0894, + "step": 2420 + }, + { + "epoch": 77.76, + "grad_norm": 5.5223612785339355, + "learning_rate": 2.652329749103943e-05, + "loss": 0.0816, + "step": 2430 + }, + { + "epoch": 77.984, + "eval_accuracy": 0.8423423423423423, + "eval_loss": 0.7174968719482422, + "eval_runtime": 9.2611, + "eval_samples_per_second": 47.943, + "eval_steps_per_second": 1.512, + "step": 2437 + }, + { + "epoch": 78.08, + "grad_norm": 7.4646382331848145, + "learning_rate": 2.640382317801673e-05, + "loss": 0.0791, + "step": 2440 + }, + { + "epoch": 78.4, + "grad_norm": 3.034799337387085, + "learning_rate": 2.628434886499403e-05, + "loss": 0.0722, + "step": 2450 + }, + { + "epoch": 78.72, + "grad_norm": 10.572321891784668, + "learning_rate": 2.616487455197133e-05, + "loss": 0.0902, + "step": 2460 + }, + { + "epoch": 78.976, + "eval_accuracy": 0.8288288288288288, + "eval_loss": 0.7745317816734314, + "eval_runtime": 9.4539, + "eval_samples_per_second": 46.965, + "eval_steps_per_second": 1.481, + "step": 2468 + }, + { + "epoch": 79.04, + "grad_norm": 10.482852935791016, + "learning_rate": 2.6045400238948624e-05, + "loss": 0.0787, + "step": 2470 + }, + { + "epoch": 79.36, + "grad_norm": 8.270842552185059, + "learning_rate": 2.5925925925925925e-05, + "loss": 0.078, + "step": 2480 + }, + { + "epoch": 79.68, + "grad_norm": 7.286362171173096, + "learning_rate": 2.5806451612903226e-05, + "loss": 0.0676, + "step": 2490 + }, + { + "epoch": 80.0, + "grad_norm": 15.592032432556152, + "learning_rate": 2.5686977299880526e-05, + "loss": 0.0827, + "step": 2500 + }, + { + "epoch": 80.0, + "eval_accuracy": 0.8423423423423423, + "eval_loss": 0.701685905456543, + "eval_runtime": 9.2982, + "eval_samples_per_second": 47.751, + "eval_steps_per_second": 1.506, + "step": 2500 + }, + { + "epoch": 80.32, + "grad_norm": 4.7289814949035645, + "learning_rate": 2.5567502986857827e-05, + "loss": 0.0594, + "step": 2510 + }, + { + "epoch": 80.64, + "grad_norm": 10.917496681213379, + "learning_rate": 2.5448028673835127e-05, + "loss": 0.073, + "step": 2520 + }, + { + "epoch": 80.96, + "grad_norm": 9.24520492553711, + "learning_rate": 2.5328554360812428e-05, + "loss": 0.0818, + "step": 2530 + }, + { + "epoch": 80.992, + "eval_accuracy": 0.8243243243243243, + "eval_loss": 0.7711703777313232, + "eval_runtime": 9.3451, + "eval_samples_per_second": 47.511, + "eval_steps_per_second": 1.498, + "step": 2531 + }, + { + "epoch": 81.28, + "grad_norm": 7.302574634552002, + "learning_rate": 2.5209080047789725e-05, + "loss": 0.074, + "step": 2540 + }, + { + "epoch": 81.6, + "grad_norm": 6.845890998840332, + "learning_rate": 2.5089605734767026e-05, + "loss": 0.0649, + "step": 2550 + }, + { + "epoch": 81.92, + "grad_norm": 13.442338943481445, + "learning_rate": 2.4970131421744326e-05, + "loss": 0.076, + "step": 2560 + }, + { + "epoch": 81.984, + "eval_accuracy": 0.8423423423423423, + "eval_loss": 0.7341312766075134, + "eval_runtime": 9.2594, + "eval_samples_per_second": 47.951, + "eval_steps_per_second": 1.512, + "step": 2562 + }, + { + "epoch": 82.24, + "grad_norm": 11.702536582946777, + "learning_rate": 2.4850657108721627e-05, + "loss": 0.086, + "step": 2570 + }, + { + "epoch": 82.56, + "grad_norm": 10.26176643371582, + "learning_rate": 2.4731182795698928e-05, + "loss": 0.079, + "step": 2580 + }, + { + "epoch": 82.88, + "grad_norm": 6.915826320648193, + "learning_rate": 2.4611708482676228e-05, + "loss": 0.0837, + "step": 2590 + }, + { + "epoch": 82.976, + "eval_accuracy": 0.8490990990990991, + "eval_loss": 0.7242028713226318, + "eval_runtime": 9.2096, + "eval_samples_per_second": 48.211, + "eval_steps_per_second": 1.52, + "step": 2593 + }, + { + "epoch": 83.2, + "grad_norm": 11.702289581298828, + "learning_rate": 2.4492234169653525e-05, + "loss": 0.0889, + "step": 2600 + }, + { + "epoch": 83.52, + "grad_norm": 19.560651779174805, + "learning_rate": 2.4372759856630826e-05, + "loss": 0.0814, + "step": 2610 + }, + { + "epoch": 83.84, + "grad_norm": 5.7238874435424805, + "learning_rate": 2.4253285543608127e-05, + "loss": 0.0743, + "step": 2620 + }, + { + "epoch": 84.0, + "eval_accuracy": 0.8445945945945946, + "eval_loss": 0.6999171376228333, + "eval_runtime": 9.5395, + "eval_samples_per_second": 46.543, + "eval_steps_per_second": 1.468, + "step": 2625 + }, + { + "epoch": 84.16, + "grad_norm": 13.587686538696289, + "learning_rate": 2.4133811230585427e-05, + "loss": 0.0888, + "step": 2630 + }, + { + "epoch": 84.48, + "grad_norm": 5.998437881469727, + "learning_rate": 2.4014336917562724e-05, + "loss": 0.0689, + "step": 2640 + }, + { + "epoch": 84.8, + "grad_norm": 12.043107986450195, + "learning_rate": 2.3894862604540025e-05, + "loss": 0.0552, + "step": 2650 + }, + { + "epoch": 84.992, + "eval_accuracy": 0.8400900900900901, + "eval_loss": 0.6875202059745789, + "eval_runtime": 9.3028, + "eval_samples_per_second": 47.727, + "eval_steps_per_second": 1.505, + "step": 2656 + }, + { + "epoch": 85.12, + "grad_norm": 18.890920639038086, + "learning_rate": 2.3775388291517326e-05, + "loss": 0.101, + "step": 2660 + }, + { + "epoch": 85.44, + "grad_norm": 17.034523010253906, + "learning_rate": 2.3655913978494626e-05, + "loss": 0.0996, + "step": 2670 + }, + { + "epoch": 85.76, + "grad_norm": 16.122133255004883, + "learning_rate": 2.3536439665471923e-05, + "loss": 0.0762, + "step": 2680 + }, + { + "epoch": 85.984, + "eval_accuracy": 0.8581081081081081, + "eval_loss": 0.6743137836456299, + "eval_runtime": 9.3345, + "eval_samples_per_second": 47.566, + "eval_steps_per_second": 1.5, + "step": 2687 + }, + { + "epoch": 86.08, + "grad_norm": 26.113550186157227, + "learning_rate": 2.3416965352449224e-05, + "loss": 0.0827, + "step": 2690 + }, + { + "epoch": 86.4, + "grad_norm": 15.370433807373047, + "learning_rate": 2.3297491039426525e-05, + "loss": 0.0855, + "step": 2700 + }, + { + "epoch": 86.72, + "grad_norm": 7.994245529174805, + "learning_rate": 2.3178016726403825e-05, + "loss": 0.0742, + "step": 2710 + }, + { + "epoch": 86.976, + "eval_accuracy": 0.8445945945945946, + "eval_loss": 0.7026851177215576, + "eval_runtime": 9.4608, + "eval_samples_per_second": 46.931, + "eval_steps_per_second": 1.48, + "step": 2718 + }, + { + "epoch": 87.04, + "grad_norm": 9.935162544250488, + "learning_rate": 2.3058542413381122e-05, + "loss": 0.08, + "step": 2720 + }, + { + "epoch": 87.36, + "grad_norm": 9.019417762756348, + "learning_rate": 2.2939068100358423e-05, + "loss": 0.0784, + "step": 2730 + }, + { + "epoch": 87.68, + "grad_norm": 15.763988494873047, + "learning_rate": 2.2819593787335724e-05, + "loss": 0.073, + "step": 2740 + }, + { + "epoch": 88.0, + "grad_norm": 13.025769233703613, + "learning_rate": 2.270011947431302e-05, + "loss": 0.0708, + "step": 2750 + }, + { + "epoch": 88.0, + "eval_accuracy": 0.8355855855855856, + "eval_loss": 0.7366507649421692, + "eval_runtime": 9.2515, + "eval_samples_per_second": 47.992, + "eval_steps_per_second": 1.513, + "step": 2750 + }, + { + "epoch": 88.32, + "grad_norm": 5.605508804321289, + "learning_rate": 2.258064516129032e-05, + "loss": 0.0684, + "step": 2760 + }, + { + "epoch": 88.64, + "grad_norm": 11.24120044708252, + "learning_rate": 2.2461170848267622e-05, + "loss": 0.0747, + "step": 2770 + }, + { + "epoch": 88.96, + "grad_norm": 12.07764720916748, + "learning_rate": 2.2341696535244923e-05, + "loss": 0.086, + "step": 2780 + }, + { + "epoch": 88.992, + "eval_accuracy": 0.8400900900900901, + "eval_loss": 0.6904602646827698, + "eval_runtime": 9.2419, + "eval_samples_per_second": 48.042, + "eval_steps_per_second": 1.515, + "step": 2781 + }, + { + "epoch": 89.28, + "grad_norm": 14.17684555053711, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.0653, + "step": 2790 + }, + { + "epoch": 89.6, + "grad_norm": 7.844547271728516, + "learning_rate": 2.2102747909199524e-05, + "loss": 0.0679, + "step": 2800 + }, + { + "epoch": 89.92, + "grad_norm": 17.304725646972656, + "learning_rate": 2.1983273596176824e-05, + "loss": 0.0575, + "step": 2810 + }, + { + "epoch": 89.984, + "eval_accuracy": 0.8423423423423423, + "eval_loss": 0.7041297554969788, + "eval_runtime": 9.165, + "eval_samples_per_second": 48.445, + "eval_steps_per_second": 1.528, + "step": 2812 + }, + { + "epoch": 90.24, + "grad_norm": 13.358567237854004, + "learning_rate": 2.1863799283154125e-05, + "loss": 0.0676, + "step": 2820 + }, + { + "epoch": 90.56, + "grad_norm": 15.546914100646973, + "learning_rate": 2.1744324970131422e-05, + "loss": 0.0569, + "step": 2830 + }, + { + "epoch": 90.88, + "grad_norm": 14.0934476852417, + "learning_rate": 2.1624850657108723e-05, + "loss": 0.0733, + "step": 2840 + }, + { + "epoch": 90.976, + "eval_accuracy": 0.8423423423423423, + "eval_loss": 0.6464592814445496, + "eval_runtime": 9.22, + "eval_samples_per_second": 48.156, + "eval_steps_per_second": 1.518, + "step": 2843 + }, + { + "epoch": 91.2, + "grad_norm": 5.8752007484436035, + "learning_rate": 2.1505376344086024e-05, + "loss": 0.0577, + "step": 2850 + }, + { + "epoch": 91.52, + "grad_norm": 6.653694152832031, + "learning_rate": 2.1385902031063324e-05, + "loss": 0.0755, + "step": 2860 + }, + { + "epoch": 91.84, + "grad_norm": 6.564297676086426, + "learning_rate": 2.126642771804062e-05, + "loss": 0.0701, + "step": 2870 + }, + { + "epoch": 92.0, + "eval_accuracy": 0.8400900900900901, + "eval_loss": 0.7065722942352295, + "eval_runtime": 9.308, + "eval_samples_per_second": 47.701, + "eval_steps_per_second": 1.504, + "step": 2875 + }, + { + "epoch": 92.16, + "grad_norm": 6.29132080078125, + "learning_rate": 2.1146953405017922e-05, + "loss": 0.0719, + "step": 2880 + }, + { + "epoch": 92.48, + "grad_norm": 12.695429801940918, + "learning_rate": 2.1027479091995223e-05, + "loss": 0.102, + "step": 2890 + }, + { + "epoch": 92.8, + "grad_norm": 5.133272171020508, + "learning_rate": 2.0908004778972523e-05, + "loss": 0.0782, + "step": 2900 + }, + { + "epoch": 92.992, + "eval_accuracy": 0.8243243243243243, + "eval_loss": 0.6954546570777893, + "eval_runtime": 9.1127, + "eval_samples_per_second": 48.723, + "eval_steps_per_second": 1.536, + "step": 2906 + }, + { + "epoch": 93.12, + "grad_norm": 3.4611129760742188, + "learning_rate": 2.078853046594982e-05, + "loss": 0.0674, + "step": 2910 + }, + { + "epoch": 93.44, + "grad_norm": 11.246231079101562, + "learning_rate": 2.066905615292712e-05, + "loss": 0.0818, + "step": 2920 + }, + { + "epoch": 93.76, + "grad_norm": 5.470245838165283, + "learning_rate": 2.054958183990442e-05, + "loss": 0.0754, + "step": 2930 + }, + { + "epoch": 93.984, + "eval_accuracy": 0.8468468468468469, + "eval_loss": 0.6836279630661011, + "eval_runtime": 9.1935, + "eval_samples_per_second": 48.295, + "eval_steps_per_second": 1.523, + "step": 2937 + }, + { + "epoch": 94.08, + "grad_norm": 21.865903854370117, + "learning_rate": 2.0430107526881722e-05, + "loss": 0.0647, + "step": 2940 + }, + { + "epoch": 94.4, + "grad_norm": 12.577595710754395, + "learning_rate": 2.031063321385902e-05, + "loss": 0.0657, + "step": 2950 + }, + { + "epoch": 94.72, + "grad_norm": 5.114993572235107, + "learning_rate": 2.019115890083632e-05, + "loss": 0.0545, + "step": 2960 + }, + { + "epoch": 94.976, + "eval_accuracy": 0.8288288288288288, + "eval_loss": 0.7289859056472778, + "eval_runtime": 9.4039, + "eval_samples_per_second": 47.215, + "eval_steps_per_second": 1.489, + "step": 2968 + }, + { + "epoch": 95.04, + "grad_norm": 11.622882843017578, + "learning_rate": 2.007168458781362e-05, + "loss": 0.0762, + "step": 2970 + }, + { + "epoch": 95.36, + "grad_norm": 11.010184288024902, + "learning_rate": 1.995221027479092e-05, + "loss": 0.0546, + "step": 2980 + }, + { + "epoch": 95.68, + "grad_norm": 12.926688194274902, + "learning_rate": 1.9832735961768222e-05, + "loss": 0.0712, + "step": 2990 + }, + { + "epoch": 96.0, + "grad_norm": 9.490360260009766, + "learning_rate": 1.9713261648745522e-05, + "loss": 0.0913, + "step": 3000 + }, + { + "epoch": 96.0, + "eval_accuracy": 0.8265765765765766, + "eval_loss": 0.7665247917175293, + "eval_runtime": 9.312, + "eval_samples_per_second": 47.68, + "eval_steps_per_second": 1.503, + "step": 3000 + }, + { + "epoch": 96.32, + "grad_norm": 15.037102699279785, + "learning_rate": 1.9593787335722823e-05, + "loss": 0.0602, + "step": 3010 + }, + { + "epoch": 96.64, + "grad_norm": 9.528903007507324, + "learning_rate": 1.947431302270012e-05, + "loss": 0.0691, + "step": 3020 + }, + { + "epoch": 96.96, + "grad_norm": 15.416064262390137, + "learning_rate": 1.935483870967742e-05, + "loss": 0.0816, + "step": 3030 + }, + { + "epoch": 96.992, + "eval_accuracy": 0.831081081081081, + "eval_loss": 0.7661374807357788, + "eval_runtime": 9.2186, + "eval_samples_per_second": 48.164, + "eval_steps_per_second": 1.519, + "step": 3031 + }, + { + "epoch": 97.28, + "grad_norm": 9.588016510009766, + "learning_rate": 1.923536439665472e-05, + "loss": 0.0699, + "step": 3040 + }, + { + "epoch": 97.6, + "grad_norm": 8.645281791687012, + "learning_rate": 1.9115890083632022e-05, + "loss": 0.0645, + "step": 3050 + }, + { + "epoch": 97.92, + "grad_norm": 9.152593612670898, + "learning_rate": 1.899641577060932e-05, + "loss": 0.0696, + "step": 3060 + }, + { + "epoch": 97.984, + "eval_accuracy": 0.8355855855855856, + "eval_loss": 0.6920885443687439, + "eval_runtime": 9.5718, + "eval_samples_per_second": 46.386, + "eval_steps_per_second": 1.463, + "step": 3062 + }, + { + "epoch": 98.24, + "grad_norm": 11.67238712310791, + "learning_rate": 1.887694145758662e-05, + "loss": 0.0661, + "step": 3070 + }, + { + "epoch": 98.56, + "grad_norm": 9.657654762268066, + "learning_rate": 1.875746714456392e-05, + "loss": 0.078, + "step": 3080 + }, + { + "epoch": 98.88, + "grad_norm": 10.411086082458496, + "learning_rate": 1.863799283154122e-05, + "loss": 0.0627, + "step": 3090 + }, + { + "epoch": 98.976, + "eval_accuracy": 0.8445945945945946, + "eval_loss": 0.7070387005805969, + "eval_runtime": 9.2361, + "eval_samples_per_second": 48.072, + "eval_steps_per_second": 1.516, + "step": 3093 + }, + { + "epoch": 99.2, + "grad_norm": 9.71650505065918, + "learning_rate": 1.8518518518518518e-05, + "loss": 0.0686, + "step": 3100 + }, + { + "epoch": 99.52, + "grad_norm": 6.4877519607543945, + "learning_rate": 1.839904420549582e-05, + "loss": 0.0587, + "step": 3110 + }, + { + "epoch": 99.84, + "grad_norm": 6.228066444396973, + "learning_rate": 1.827956989247312e-05, + "loss": 0.0562, + "step": 3120 + }, + { + "epoch": 100.0, + "eval_accuracy": 0.8400900900900901, + "eval_loss": 0.7441924214363098, + "eval_runtime": 9.3695, + "eval_samples_per_second": 47.388, + "eval_steps_per_second": 1.494, + "step": 3125 + }, + { + "epoch": 100.16, + "grad_norm": 12.939260482788086, + "learning_rate": 1.816009557945042e-05, + "loss": 0.0581, + "step": 3130 + }, + { + "epoch": 100.48, + "grad_norm": 7.985968589782715, + "learning_rate": 1.8040621266427717e-05, + "loss": 0.0618, + "step": 3140 + }, + { + "epoch": 100.8, + "grad_norm": 8.757869720458984, + "learning_rate": 1.7921146953405018e-05, + "loss": 0.0742, + "step": 3150 + }, + { + "epoch": 100.992, + "eval_accuracy": 0.8423423423423423, + "eval_loss": 0.699988842010498, + "eval_runtime": 9.324, + "eval_samples_per_second": 47.619, + "eval_steps_per_second": 1.501, + "step": 3156 + }, + { + "epoch": 101.12, + "grad_norm": 13.87547779083252, + "learning_rate": 1.780167264038232e-05, + "loss": 0.0611, + "step": 3160 + }, + { + "epoch": 101.44, + "grad_norm": 10.741342544555664, + "learning_rate": 1.768219832735962e-05, + "loss": 0.0636, + "step": 3170 + }, + { + "epoch": 101.76, + "grad_norm": 14.43264389038086, + "learning_rate": 1.7562724014336916e-05, + "loss": 0.0545, + "step": 3180 + }, + { + "epoch": 101.984, + "eval_accuracy": 0.8400900900900901, + "eval_loss": 0.7311636209487915, + "eval_runtime": 9.1417, + "eval_samples_per_second": 48.569, + "eval_steps_per_second": 1.531, + "step": 3187 + }, + { + "epoch": 102.08, + "grad_norm": 4.225771903991699, + "learning_rate": 1.7443249701314217e-05, + "loss": 0.0597, + "step": 3190 + }, + { + "epoch": 102.4, + "grad_norm": 9.73902416229248, + "learning_rate": 1.7323775388291518e-05, + "loss": 0.0646, + "step": 3200 + }, + { + "epoch": 102.72, + "grad_norm": 6.130120277404785, + "learning_rate": 1.7204301075268818e-05, + "loss": 0.0635, + "step": 3210 + }, + { + "epoch": 102.976, + "eval_accuracy": 0.8490990990990991, + "eval_loss": 0.7231103181838989, + "eval_runtime": 9.4119, + "eval_samples_per_second": 47.174, + "eval_steps_per_second": 1.487, + "step": 3218 + }, + { + "epoch": 103.04, + "grad_norm": 7.3819451332092285, + "learning_rate": 1.708482676224612e-05, + "loss": 0.0519, + "step": 3220 + }, + { + "epoch": 103.36, + "grad_norm": 11.180880546569824, + "learning_rate": 1.696535244922342e-05, + "loss": 0.0719, + "step": 3230 + }, + { + "epoch": 103.68, + "grad_norm": 11.228340148925781, + "learning_rate": 1.684587813620072e-05, + "loss": 0.0625, + "step": 3240 + }, + { + "epoch": 104.0, + "grad_norm": 7.932921409606934, + "learning_rate": 1.6726403823178017e-05, + "loss": 0.0608, + "step": 3250 + }, + { + "epoch": 104.0, + "eval_accuracy": 0.8333333333333334, + "eval_loss": 0.7332068085670471, + "eval_runtime": 9.1899, + "eval_samples_per_second": 48.314, + "eval_steps_per_second": 1.523, + "step": 3250 + }, + { + "epoch": 104.32, + "grad_norm": 2.481245756149292, + "learning_rate": 1.6606929510155318e-05, + "loss": 0.0753, + "step": 3260 + }, + { + "epoch": 104.64, + "grad_norm": 13.661800384521484, + "learning_rate": 1.648745519713262e-05, + "loss": 0.0717, + "step": 3270 + }, + { + "epoch": 104.96, + "grad_norm": 11.668190002441406, + "learning_rate": 1.636798088410992e-05, + "loss": 0.0769, + "step": 3280 + }, + { + "epoch": 104.992, + "eval_accuracy": 0.8355855855855856, + "eval_loss": 0.7328325510025024, + "eval_runtime": 9.1902, + "eval_samples_per_second": 48.312, + "eval_steps_per_second": 1.523, + "step": 3281 + }, + { + "epoch": 105.28, + "grad_norm": 12.411203384399414, + "learning_rate": 1.6248506571087216e-05, + "loss": 0.0596, + "step": 3290 + }, + { + "epoch": 105.6, + "grad_norm": 7.294625759124756, + "learning_rate": 1.6129032258064517e-05, + "loss": 0.0528, + "step": 3300 + }, + { + "epoch": 105.92, + "grad_norm": 6.775350093841553, + "learning_rate": 1.6009557945041817e-05, + "loss": 0.057, + "step": 3310 + }, + { + "epoch": 105.984, + "eval_accuracy": 0.8378378378378378, + "eval_loss": 0.695361852645874, + "eval_runtime": 9.2092, + "eval_samples_per_second": 48.213, + "eval_steps_per_second": 1.52, + "step": 3312 + }, + { + "epoch": 106.24, + "grad_norm": 9.063501358032227, + "learning_rate": 1.5890083632019118e-05, + "loss": 0.0515, + "step": 3320 + }, + { + "epoch": 106.56, + "grad_norm": 7.814884185791016, + "learning_rate": 1.5770609318996415e-05, + "loss": 0.0465, + "step": 3330 + }, + { + "epoch": 106.88, + "grad_norm": 7.107222080230713, + "learning_rate": 1.5651135005973716e-05, + "loss": 0.0447, + "step": 3340 + }, + { + "epoch": 106.976, + "eval_accuracy": 0.8423423423423423, + "eval_loss": 0.7006074786186218, + "eval_runtime": 9.2139, + "eval_samples_per_second": 48.188, + "eval_steps_per_second": 1.519, + "step": 3343 + }, + { + "epoch": 107.2, + "grad_norm": 9.964568138122559, + "learning_rate": 1.5531660692951016e-05, + "loss": 0.0646, + "step": 3350 + }, + { + "epoch": 107.52, + "grad_norm": 5.107904434204102, + "learning_rate": 1.5412186379928317e-05, + "loss": 0.0513, + "step": 3360 + }, + { + "epoch": 107.84, + "grad_norm": 5.2939453125, + "learning_rate": 1.5292712066905614e-05, + "loss": 0.0629, + "step": 3370 + }, + { + "epoch": 108.0, + "eval_accuracy": 0.8423423423423423, + "eval_loss": 0.7149015069007874, + "eval_runtime": 9.5241, + "eval_samples_per_second": 46.619, + "eval_steps_per_second": 1.47, + "step": 3375 + }, + { + "epoch": 108.16, + "grad_norm": 12.05675220489502, + "learning_rate": 1.5173237753882915e-05, + "loss": 0.0627, + "step": 3380 + }, + { + "epoch": 108.48, + "grad_norm": 5.3016791343688965, + "learning_rate": 1.5053763440860215e-05, + "loss": 0.0502, + "step": 3390 + }, + { + "epoch": 108.8, + "grad_norm": 17.884672164916992, + "learning_rate": 1.4934289127837516e-05, + "loss": 0.0394, + "step": 3400 + }, + { + "epoch": 108.992, + "eval_accuracy": 0.8378378378378378, + "eval_loss": 0.74688321352005, + "eval_runtime": 9.3012, + "eval_samples_per_second": 47.736, + "eval_steps_per_second": 1.505, + "step": 3406 + }, + { + "epoch": 109.12, + "grad_norm": 13.95982551574707, + "learning_rate": 1.4814814814814815e-05, + "loss": 0.0472, + "step": 3410 + }, + { + "epoch": 109.44, + "grad_norm": 7.970043659210205, + "learning_rate": 1.4695340501792116e-05, + "loss": 0.0587, + "step": 3420 + }, + { + "epoch": 109.76, + "grad_norm": 11.51051139831543, + "learning_rate": 1.4575866188769416e-05, + "loss": 0.0602, + "step": 3430 + }, + { + "epoch": 109.984, + "eval_accuracy": 0.8468468468468469, + "eval_loss": 0.7274497151374817, + "eval_runtime": 9.2259, + "eval_samples_per_second": 48.126, + "eval_steps_per_second": 1.517, + "step": 3437 + }, + { + "epoch": 110.08, + "grad_norm": 8.181411743164062, + "learning_rate": 1.4456391875746717e-05, + "loss": 0.062, + "step": 3440 + }, + { + "epoch": 110.4, + "grad_norm": 13.90931224822998, + "learning_rate": 1.4336917562724014e-05, + "loss": 0.0608, + "step": 3450 + }, + { + "epoch": 110.72, + "grad_norm": 8.419418334960938, + "learning_rate": 1.4217443249701315e-05, + "loss": 0.0635, + "step": 3460 + }, + { + "epoch": 110.976, + "eval_accuracy": 0.8445945945945946, + "eval_loss": 0.7494940757751465, + "eval_runtime": 9.4779, + "eval_samples_per_second": 46.846, + "eval_steps_per_second": 1.477, + "step": 3468 + }, + { + "epoch": 111.04, + "grad_norm": 13.364533424377441, + "learning_rate": 1.4097968936678615e-05, + "loss": 0.0781, + "step": 3470 + }, + { + "epoch": 111.36, + "grad_norm": 5.620299339294434, + "learning_rate": 1.3978494623655914e-05, + "loss": 0.0526, + "step": 3480 + }, + { + "epoch": 111.68, + "grad_norm": 14.518438339233398, + "learning_rate": 1.3859020310633215e-05, + "loss": 0.0452, + "step": 3490 + }, + { + "epoch": 112.0, + "grad_norm": 3.248464822769165, + "learning_rate": 1.3739545997610515e-05, + "loss": 0.0565, + "step": 3500 + }, + { + "epoch": 112.0, + "eval_accuracy": 0.8400900900900901, + "eval_loss": 0.78851717710495, + "eval_runtime": 9.2483, + "eval_samples_per_second": 48.009, + "eval_steps_per_second": 1.514, + "step": 3500 + }, + { + "epoch": 112.32, + "grad_norm": 9.410392761230469, + "learning_rate": 1.3620071684587816e-05, + "loss": 0.0411, + "step": 3510 + }, + { + "epoch": 112.64, + "grad_norm": 3.4295830726623535, + "learning_rate": 1.3500597371565113e-05, + "loss": 0.0465, + "step": 3520 + }, + { + "epoch": 112.96, + "grad_norm": 6.670384407043457, + "learning_rate": 1.3381123058542414e-05, + "loss": 0.035, + "step": 3530 + }, + { + "epoch": 112.992, + "eval_accuracy": 0.8468468468468469, + "eval_loss": 0.7178235650062561, + "eval_runtime": 9.1762, + "eval_samples_per_second": 48.386, + "eval_steps_per_second": 1.526, + "step": 3531 + }, + { + "epoch": 113.28, + "grad_norm": 6.2322468757629395, + "learning_rate": 1.3261648745519714e-05, + "loss": 0.0626, + "step": 3540 + }, + { + "epoch": 113.6, + "grad_norm": 5.694119453430176, + "learning_rate": 1.3142174432497015e-05, + "loss": 0.0638, + "step": 3550 + }, + { + "epoch": 113.92, + "grad_norm": 5.017783164978027, + "learning_rate": 1.3022700119474312e-05, + "loss": 0.0604, + "step": 3560 + }, + { + "epoch": 113.984, + "eval_accuracy": 0.8355855855855856, + "eval_loss": 0.7574167251586914, + "eval_runtime": 9.3469, + "eval_samples_per_second": 47.502, + "eval_steps_per_second": 1.498, + "step": 3562 + }, + { + "epoch": 114.24, + "grad_norm": 9.012121200561523, + "learning_rate": 1.2903225806451613e-05, + "loss": 0.0628, + "step": 3570 + }, + { + "epoch": 114.56, + "grad_norm": 6.070770740509033, + "learning_rate": 1.2783751493428913e-05, + "loss": 0.0559, + "step": 3580 + }, + { + "epoch": 114.88, + "grad_norm": 6.4133477210998535, + "learning_rate": 1.2664277180406214e-05, + "loss": 0.0507, + "step": 3590 + }, + { + "epoch": 114.976, + "eval_accuracy": 0.8265765765765766, + "eval_loss": 0.790080726146698, + "eval_runtime": 9.284, + "eval_samples_per_second": 47.824, + "eval_steps_per_second": 1.508, + "step": 3593 + }, + { + "epoch": 115.2, + "grad_norm": 10.017669677734375, + "learning_rate": 1.2544802867383513e-05, + "loss": 0.0737, + "step": 3600 + }, + { + "epoch": 115.52, + "grad_norm": 11.685331344604492, + "learning_rate": 1.2425328554360813e-05, + "loss": 0.0551, + "step": 3610 + }, + { + "epoch": 115.84, + "grad_norm": 6.028275489807129, + "learning_rate": 1.2305854241338114e-05, + "loss": 0.05, + "step": 3620 + }, + { + "epoch": 116.0, + "eval_accuracy": 0.8198198198198198, + "eval_loss": 0.7729606032371521, + "eval_runtime": 9.2626, + "eval_samples_per_second": 47.935, + "eval_steps_per_second": 1.511, + "step": 3625 + }, + { + "epoch": 116.16, + "grad_norm": 20.972734451293945, + "learning_rate": 1.2186379928315413e-05, + "loss": 0.053, + "step": 3630 + }, + { + "epoch": 116.48, + "grad_norm": 21.527067184448242, + "learning_rate": 1.2066905615292714e-05, + "loss": 0.061, + "step": 3640 + }, + { + "epoch": 116.8, + "grad_norm": 2.8793394565582275, + "learning_rate": 1.1947431302270013e-05, + "loss": 0.0465, + "step": 3650 + }, + { + "epoch": 116.992, + "eval_accuracy": 0.8400900900900901, + "eval_loss": 0.7966709136962891, + "eval_runtime": 9.2609, + "eval_samples_per_second": 47.944, + "eval_steps_per_second": 1.512, + "step": 3656 + }, + { + "epoch": 117.12, + "grad_norm": 10.335177421569824, + "learning_rate": 1.1827956989247313e-05, + "loss": 0.057, + "step": 3660 + }, + { + "epoch": 117.44, + "grad_norm": 10.481749534606934, + "learning_rate": 1.1708482676224612e-05, + "loss": 0.0417, + "step": 3670 + }, + { + "epoch": 117.76, + "grad_norm": 8.608804702758789, + "learning_rate": 1.1589008363201913e-05, + "loss": 0.042, + "step": 3680 + }, + { + "epoch": 117.984, + "eval_accuracy": 0.8423423423423423, + "eval_loss": 0.7766792178153992, + "eval_runtime": 9.2816, + "eval_samples_per_second": 47.837, + "eval_steps_per_second": 1.508, + "step": 3687 + }, + { + "epoch": 118.08, + "grad_norm": 10.435688972473145, + "learning_rate": 1.1469534050179212e-05, + "loss": 0.0474, + "step": 3690 + }, + { + "epoch": 118.4, + "grad_norm": 3.1348280906677246, + "learning_rate": 1.135005973715651e-05, + "loss": 0.0528, + "step": 3700 + }, + { + "epoch": 118.72, + "grad_norm": 17.36944007873535, + "learning_rate": 1.1230585424133811e-05, + "loss": 0.0609, + "step": 3710 + }, + { + "epoch": 118.976, + "eval_accuracy": 0.8378378378378378, + "eval_loss": 0.7871721982955933, + "eval_runtime": 9.3483, + "eval_samples_per_second": 47.495, + "eval_steps_per_second": 1.498, + "step": 3718 + }, + { + "epoch": 119.04, + "grad_norm": 5.15559720993042, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.0544, + "step": 3720 + }, + { + "epoch": 119.36, + "grad_norm": 9.22418212890625, + "learning_rate": 1.0991636798088412e-05, + "loss": 0.0358, + "step": 3730 + }, + { + "epoch": 119.68, + "grad_norm": 9.791460037231445, + "learning_rate": 1.0872162485065711e-05, + "loss": 0.0579, + "step": 3740 + }, + { + "epoch": 120.0, + "grad_norm": 8.853157997131348, + "learning_rate": 1.0752688172043012e-05, + "loss": 0.0379, + "step": 3750 + }, + { + "epoch": 120.0, + "eval_accuracy": 0.8513513513513513, + "eval_loss": 0.7684924602508545, + "eval_runtime": 9.2729, + "eval_samples_per_second": 47.882, + "eval_steps_per_second": 1.51, + "step": 3750 + }, + { + "epoch": 120.32, + "grad_norm": 11.307464599609375, + "learning_rate": 1.063321385902031e-05, + "loss": 0.0509, + "step": 3760 + }, + { + "epoch": 120.64, + "grad_norm": 7.135359287261963, + "learning_rate": 1.0513739545997611e-05, + "loss": 0.0359, + "step": 3770 + }, + { + "epoch": 120.96, + "grad_norm": 8.934365272521973, + "learning_rate": 1.039426523297491e-05, + "loss": 0.0579, + "step": 3780 + }, + { + "epoch": 120.992, + "eval_accuracy": 0.8423423423423423, + "eval_loss": 0.7708752751350403, + "eval_runtime": 9.1957, + "eval_samples_per_second": 48.283, + "eval_steps_per_second": 1.522, + "step": 3781 + }, + { + "epoch": 121.28, + "grad_norm": 10.474456787109375, + "learning_rate": 1.027479091995221e-05, + "loss": 0.0457, + "step": 3790 + }, + { + "epoch": 121.6, + "grad_norm": 7.360742092132568, + "learning_rate": 1.015531660692951e-05, + "loss": 0.0542, + "step": 3800 + }, + { + "epoch": 121.92, + "grad_norm": 8.417169570922852, + "learning_rate": 1.003584229390681e-05, + "loss": 0.0471, + "step": 3810 + }, + { + "epoch": 121.984, + "eval_accuracy": 0.8423423423423423, + "eval_loss": 0.7601270079612732, + "eval_runtime": 9.6072, + "eval_samples_per_second": 46.215, + "eval_steps_per_second": 1.457, + "step": 3812 + }, + { + "epoch": 122.24, + "grad_norm": 3.6518542766571045, + "learning_rate": 9.916367980884111e-06, + "loss": 0.0537, + "step": 3820 + }, + { + "epoch": 122.56, + "grad_norm": 1.70374596118927, + "learning_rate": 9.796893667861412e-06, + "loss": 0.0381, + "step": 3830 + }, + { + "epoch": 122.88, + "grad_norm": 6.339093208312988, + "learning_rate": 9.67741935483871e-06, + "loss": 0.0488, + "step": 3840 + }, + { + "epoch": 122.976, + "eval_accuracy": 0.8355855855855856, + "eval_loss": 0.8230583667755127, + "eval_runtime": 9.2102, + "eval_samples_per_second": 48.208, + "eval_steps_per_second": 1.52, + "step": 3843 + }, + { + "epoch": 123.2, + "grad_norm": 7.372219562530518, + "learning_rate": 9.557945041816011e-06, + "loss": 0.0487, + "step": 3850 + }, + { + "epoch": 123.52, + "grad_norm": 10.117424964904785, + "learning_rate": 9.43847072879331e-06, + "loss": 0.0617, + "step": 3860 + }, + { + "epoch": 123.84, + "grad_norm": 8.506390571594238, + "learning_rate": 9.31899641577061e-06, + "loss": 0.0531, + "step": 3870 + }, + { + "epoch": 124.0, + "eval_accuracy": 0.8378378378378378, + "eval_loss": 0.8015686869621277, + "eval_runtime": 9.327, + "eval_samples_per_second": 47.604, + "eval_steps_per_second": 1.501, + "step": 3875 + }, + { + "epoch": 124.16, + "grad_norm": 2.586430788040161, + "learning_rate": 9.19952210274791e-06, + "loss": 0.0484, + "step": 3880 + }, + { + "epoch": 124.48, + "grad_norm": 21.505414962768555, + "learning_rate": 9.08004778972521e-06, + "loss": 0.0432, + "step": 3890 + }, + { + "epoch": 124.8, + "grad_norm": 6.578880786895752, + "learning_rate": 8.960573476702509e-06, + "loss": 0.0446, + "step": 3900 + }, + { + "epoch": 124.992, + "eval_accuracy": 0.8423423423423423, + "eval_loss": 0.7805784940719604, + "eval_runtime": 9.2689, + "eval_samples_per_second": 47.902, + "eval_steps_per_second": 1.51, + "step": 3906 + }, + { + "epoch": 125.12, + "grad_norm": 14.002837181091309, + "learning_rate": 8.84109916367981e-06, + "loss": 0.0499, + "step": 3910 + }, + { + "epoch": 125.44, + "grad_norm": 9.619620323181152, + "learning_rate": 8.721624850657108e-06, + "loss": 0.0635, + "step": 3920 + }, + { + "epoch": 125.76, + "grad_norm": 10.778247833251953, + "learning_rate": 8.602150537634409e-06, + "loss": 0.0479, + "step": 3930 + }, + { + "epoch": 125.984, + "eval_accuracy": 0.8378378378378378, + "eval_loss": 0.7668151259422302, + "eval_runtime": 9.1034, + "eval_samples_per_second": 48.773, + "eval_steps_per_second": 1.538, + "step": 3937 + }, + { + "epoch": 126.08, + "grad_norm": 7.485611438751221, + "learning_rate": 8.48267622461171e-06, + "loss": 0.0602, + "step": 3940 + }, + { + "epoch": 126.4, + "grad_norm": 4.371471405029297, + "learning_rate": 8.363201911589009e-06, + "loss": 0.0607, + "step": 3950 + }, + { + "epoch": 126.72, + "grad_norm": 21.91805648803711, + "learning_rate": 8.24372759856631e-06, + "loss": 0.0525, + "step": 3960 + }, + { + "epoch": 126.976, + "eval_accuracy": 0.8288288288288288, + "eval_loss": 0.7874143719673157, + "eval_runtime": 9.4361, + "eval_samples_per_second": 47.053, + "eval_steps_per_second": 1.484, + "step": 3968 + }, + { + "epoch": 127.04, + "grad_norm": 17.411008834838867, + "learning_rate": 8.124253285543608e-06, + "loss": 0.0577, + "step": 3970 + }, + { + "epoch": 127.36, + "grad_norm": 3.1272220611572266, + "learning_rate": 8.004778972520909e-06, + "loss": 0.0504, + "step": 3980 + }, + { + "epoch": 127.68, + "grad_norm": 2.4175899028778076, + "learning_rate": 7.885304659498208e-06, + "loss": 0.053, + "step": 3990 + }, + { + "epoch": 128.0, + "grad_norm": 10.839136123657227, + "learning_rate": 7.765830346475508e-06, + "loss": 0.0512, + "step": 4000 + }, + { + "epoch": 128.0, + "eval_accuracy": 0.831081081081081, + "eval_loss": 0.7651507258415222, + "eval_runtime": 9.2854, + "eval_samples_per_second": 47.817, + "eval_steps_per_second": 1.508, + "step": 4000 + }, + { + "epoch": 128.32, + "grad_norm": 8.561548233032227, + "learning_rate": 7.646356033452807e-06, + "loss": 0.0456, + "step": 4010 + }, + { + "epoch": 128.64, + "grad_norm": 6.735084533691406, + "learning_rate": 7.526881720430108e-06, + "loss": 0.0403, + "step": 4020 + }, + { + "epoch": 128.96, + "grad_norm": 10.706457138061523, + "learning_rate": 7.4074074074074075e-06, + "loss": 0.0473, + "step": 4030 + }, + { + "epoch": 128.992, + "eval_accuracy": 0.8355855855855856, + "eval_loss": 0.7721133828163147, + "eval_runtime": 9.1988, + "eval_samples_per_second": 48.267, + "eval_steps_per_second": 1.522, + "step": 4031 + }, + { + "epoch": 129.28, + "grad_norm": 5.523827075958252, + "learning_rate": 7.287933094384708e-06, + "loss": 0.0368, + "step": 4040 + }, + { + "epoch": 129.6, + "grad_norm": 7.4960103034973145, + "learning_rate": 7.168458781362007e-06, + "loss": 0.0439, + "step": 4050 + }, + { + "epoch": 129.92, + "grad_norm": 12.937583923339844, + "learning_rate": 7.048984468339308e-06, + "loss": 0.0579, + "step": 4060 + }, + { + "epoch": 129.984, + "eval_accuracy": 0.8355855855855856, + "eval_loss": 0.7606847286224365, + "eval_runtime": 9.3067, + "eval_samples_per_second": 47.708, + "eval_steps_per_second": 1.504, + "step": 4062 + }, + { + "epoch": 130.24, + "grad_norm": 10.424357414245605, + "learning_rate": 6.929510155316607e-06, + "loss": 0.0328, + "step": 4070 + }, + { + "epoch": 130.56, + "grad_norm": 4.310163974761963, + "learning_rate": 6.810035842293908e-06, + "loss": 0.0356, + "step": 4080 + }, + { + "epoch": 130.88, + "grad_norm": 3.945925235748291, + "learning_rate": 6.690561529271207e-06, + "loss": 0.0444, + "step": 4090 + }, + { + "epoch": 130.976, + "eval_accuracy": 0.8355855855855856, + "eval_loss": 0.7916550040245056, + "eval_runtime": 9.2498, + "eval_samples_per_second": 48.001, + "eval_steps_per_second": 1.514, + "step": 4093 + }, + { + "epoch": 131.2, + "grad_norm": 8.806529998779297, + "learning_rate": 6.5710872162485075e-06, + "loss": 0.0383, + "step": 4100 + }, + { + "epoch": 131.52, + "grad_norm": 4.7513322830200195, + "learning_rate": 6.451612903225806e-06, + "loss": 0.0536, + "step": 4110 + }, + { + "epoch": 131.84, + "grad_norm": 8.66804313659668, + "learning_rate": 6.332138590203107e-06, + "loss": 0.0462, + "step": 4120 + }, + { + "epoch": 132.0, + "eval_accuracy": 0.8333333333333334, + "eval_loss": 0.7877137660980225, + "eval_runtime": 9.4541, + "eval_samples_per_second": 46.964, + "eval_steps_per_second": 1.481, + "step": 4125 + }, + { + "epoch": 132.16, + "grad_norm": 4.396764278411865, + "learning_rate": 6.212664277180407e-06, + "loss": 0.0365, + "step": 4130 + }, + { + "epoch": 132.48, + "grad_norm": 5.487627983093262, + "learning_rate": 6.0931899641577065e-06, + "loss": 0.0361, + "step": 4140 + }, + { + "epoch": 132.8, + "grad_norm": 9.7935791015625, + "learning_rate": 5.973715651135006e-06, + "loss": 0.0483, + "step": 4150 + }, + { + "epoch": 132.992, + "eval_accuracy": 0.8400900900900901, + "eval_loss": 0.8122358322143555, + "eval_runtime": 9.4201, + "eval_samples_per_second": 47.133, + "eval_steps_per_second": 1.486, + "step": 4156 + }, + { + "epoch": 133.12, + "grad_norm": 24.21430206298828, + "learning_rate": 5.854241338112306e-06, + "loss": 0.0441, + "step": 4160 + }, + { + "epoch": 133.44, + "grad_norm": 10.01777458190918, + "learning_rate": 5.734767025089606e-06, + "loss": 0.0517, + "step": 4170 + }, + { + "epoch": 133.76, + "grad_norm": 8.408638000488281, + "learning_rate": 5.6152927120669055e-06, + "loss": 0.042, + "step": 4180 + }, + { + "epoch": 133.984, + "eval_accuracy": 0.8378378378378378, + "eval_loss": 0.7956117987632751, + "eval_runtime": 9.2257, + "eval_samples_per_second": 48.126, + "eval_steps_per_second": 1.518, + "step": 4187 + }, + { + "epoch": 134.08, + "grad_norm": 11.456778526306152, + "learning_rate": 5.495818399044206e-06, + "loss": 0.0511, + "step": 4190 + }, + { + "epoch": 134.4, + "grad_norm": 9.832137107849121, + "learning_rate": 5.376344086021506e-06, + "loss": 0.0402, + "step": 4200 + }, + { + "epoch": 134.72, + "grad_norm": 6.971923828125, + "learning_rate": 5.256869772998806e-06, + "loss": 0.0439, + "step": 4210 + }, + { + "epoch": 134.976, + "eval_accuracy": 0.831081081081081, + "eval_loss": 0.8281213641166687, + "eval_runtime": 9.3773, + "eval_samples_per_second": 47.348, + "eval_steps_per_second": 1.493, + "step": 4218 + }, + { + "epoch": 135.04, + "grad_norm": 9.771930694580078, + "learning_rate": 5.137395459976105e-06, + "loss": 0.053, + "step": 4220 + }, + { + "epoch": 135.36, + "grad_norm": 17.273054122924805, + "learning_rate": 5.017921146953405e-06, + "loss": 0.0351, + "step": 4230 + }, + { + "epoch": 135.68, + "grad_norm": 16.49187660217285, + "learning_rate": 4.898446833930706e-06, + "loss": 0.0375, + "step": 4240 + }, + { + "epoch": 136.0, + "grad_norm": 4.234256744384766, + "learning_rate": 4.7789725209080055e-06, + "loss": 0.0458, + "step": 4250 + }, + { + "epoch": 136.0, + "eval_accuracy": 0.8445945945945946, + "eval_loss": 0.7723496556282043, + "eval_runtime": 9.1454, + "eval_samples_per_second": 48.549, + "eval_steps_per_second": 1.531, + "step": 4250 + }, + { + "epoch": 136.32, + "grad_norm": 6.829943656921387, + "learning_rate": 4.659498207885305e-06, + "loss": 0.0605, + "step": 4260 + }, + { + "epoch": 136.64, + "grad_norm": 6.178986072540283, + "learning_rate": 4.540023894862605e-06, + "loss": 0.0387, + "step": 4270 + }, + { + "epoch": 136.96, + "grad_norm": 6.282145023345947, + "learning_rate": 4.420549581839905e-06, + "loss": 0.0307, + "step": 4280 + }, + { + "epoch": 136.992, + "eval_accuracy": 0.8445945945945946, + "eval_loss": 0.7685997486114502, + "eval_runtime": 9.3166, + "eval_samples_per_second": 47.657, + "eval_steps_per_second": 1.503, + "step": 4281 + }, + { + "epoch": 137.28, + "grad_norm": 2.165778636932373, + "learning_rate": 4.3010752688172045e-06, + "loss": 0.0423, + "step": 4290 + }, + { + "epoch": 137.6, + "grad_norm": 2.3030285835266113, + "learning_rate": 4.181600955794504e-06, + "loss": 0.0381, + "step": 4300 + }, + { + "epoch": 137.92, + "grad_norm": 13.007665634155273, + "learning_rate": 4.062126642771804e-06, + "loss": 0.0481, + "step": 4310 + }, + { + "epoch": 137.984, + "eval_accuracy": 0.8378378378378378, + "eval_loss": 0.7834463715553284, + "eval_runtime": 9.1579, + "eval_samples_per_second": 48.483, + "eval_steps_per_second": 1.529, + "step": 4312 + }, + { + "epoch": 138.24, + "grad_norm": 6.034422874450684, + "learning_rate": 3.942652329749104e-06, + "loss": 0.0428, + "step": 4320 + }, + { + "epoch": 138.56, + "grad_norm": 5.156213283538818, + "learning_rate": 3.8231780167264036e-06, + "loss": 0.0524, + "step": 4330 + }, + { + "epoch": 138.88, + "grad_norm": 7.5509443283081055, + "learning_rate": 3.7037037037037037e-06, + "loss": 0.0503, + "step": 4340 + }, + { + "epoch": 138.976, + "eval_accuracy": 0.8378378378378378, + "eval_loss": 0.7986668944358826, + "eval_runtime": 9.2264, + "eval_samples_per_second": 48.123, + "eval_steps_per_second": 1.517, + "step": 4343 + }, + { + "epoch": 139.2, + "grad_norm": 3.5894615650177, + "learning_rate": 3.5842293906810035e-06, + "loss": 0.0367, + "step": 4350 + }, + { + "epoch": 139.52, + "grad_norm": 10.714777946472168, + "learning_rate": 3.4647550776583037e-06, + "loss": 0.0451, + "step": 4360 + }, + { + "epoch": 139.84, + "grad_norm": 20.11847496032715, + "learning_rate": 3.3452807646356034e-06, + "loss": 0.038, + "step": 4370 + }, + { + "epoch": 140.0, + "eval_accuracy": 0.831081081081081, + "eval_loss": 0.8156119585037231, + "eval_runtime": 9.2868, + "eval_samples_per_second": 47.81, + "eval_steps_per_second": 1.508, + "step": 4375 + }, + { + "epoch": 140.16, + "grad_norm": 5.215194225311279, + "learning_rate": 3.225806451612903e-06, + "loss": 0.0475, + "step": 4380 + }, + { + "epoch": 140.48, + "grad_norm": 7.803160667419434, + "learning_rate": 3.1063321385902034e-06, + "loss": 0.0391, + "step": 4390 + }, + { + "epoch": 140.8, + "grad_norm": 1.6925849914550781, + "learning_rate": 2.986857825567503e-06, + "loss": 0.0472, + "step": 4400 + }, + { + "epoch": 140.992, + "eval_accuracy": 0.8355855855855856, + "eval_loss": 0.802955150604248, + "eval_runtime": 9.2568, + "eval_samples_per_second": 47.965, + "eval_steps_per_second": 1.512, + "step": 4406 + }, + { + "epoch": 141.12, + "grad_norm": 6.969939231872559, + "learning_rate": 2.867383512544803e-06, + "loss": 0.0448, + "step": 4410 + }, + { + "epoch": 141.44, + "grad_norm": 5.957333087921143, + "learning_rate": 2.747909199522103e-06, + "loss": 0.0499, + "step": 4420 + }, + { + "epoch": 141.76, + "grad_norm": 6.240506172180176, + "learning_rate": 2.628434886499403e-06, + "loss": 0.0282, + "step": 4430 + }, + { + "epoch": 141.984, + "eval_accuracy": 0.8378378378378378, + "eval_loss": 0.7883952260017395, + "eval_runtime": 9.4327, + "eval_samples_per_second": 47.07, + "eval_steps_per_second": 1.484, + "step": 4437 + }, + { + "epoch": 142.08, + "grad_norm": 3.7007412910461426, + "learning_rate": 2.5089605734767026e-06, + "loss": 0.0417, + "step": 4440 + }, + { + "epoch": 142.4, + "grad_norm": 9.855685234069824, + "learning_rate": 2.3894862604540028e-06, + "loss": 0.0439, + "step": 4450 + }, + { + "epoch": 142.72, + "grad_norm": 3.489447832107544, + "learning_rate": 2.2700119474313025e-06, + "loss": 0.0541, + "step": 4460 + }, + { + "epoch": 142.976, + "eval_accuracy": 0.831081081081081, + "eval_loss": 0.7968920469284058, + "eval_runtime": 9.2527, + "eval_samples_per_second": 47.986, + "eval_steps_per_second": 1.513, + "step": 4468 + }, + { + "epoch": 143.04, + "grad_norm": 3.9319980144500732, + "learning_rate": 2.1505376344086023e-06, + "loss": 0.0368, + "step": 4470 + }, + { + "epoch": 143.36, + "grad_norm": 12.418336868286133, + "learning_rate": 2.031063321385902e-06, + "loss": 0.0279, + "step": 4480 + }, + { + "epoch": 143.68, + "grad_norm": 1.1635448932647705, + "learning_rate": 1.9115890083632018e-06, + "loss": 0.037, + "step": 4490 + }, + { + "epoch": 144.0, + "grad_norm": 6.279101371765137, + "learning_rate": 1.7921146953405017e-06, + "loss": 0.0415, + "step": 4500 + }, + { + "epoch": 144.0, + "eval_accuracy": 0.8333333333333334, + "eval_loss": 0.789898693561554, + "eval_runtime": 9.2202, + "eval_samples_per_second": 48.155, + "eval_steps_per_second": 1.518, + "step": 4500 + }, + { + "epoch": 144.32, + "grad_norm": 1.137380838394165, + "learning_rate": 1.6726403823178017e-06, + "loss": 0.0449, + "step": 4510 + }, + { + "epoch": 144.64, + "grad_norm": 11.844816207885742, + "learning_rate": 1.5531660692951017e-06, + "loss": 0.0322, + "step": 4520 + }, + { + "epoch": 144.96, + "grad_norm": 10.018691062927246, + "learning_rate": 1.4336917562724014e-06, + "loss": 0.0579, + "step": 4530 + }, + { + "epoch": 144.992, + "eval_accuracy": 0.8265765765765766, + "eval_loss": 0.7978772521018982, + "eval_runtime": 9.4369, + "eval_samples_per_second": 47.05, + "eval_steps_per_second": 1.484, + "step": 4531 + }, + { + "epoch": 145.28, + "grad_norm": 10.856122016906738, + "learning_rate": 1.3142174432497014e-06, + "loss": 0.0534, + "step": 4540 + }, + { + "epoch": 145.6, + "grad_norm": 12.98085880279541, + "learning_rate": 1.1947431302270014e-06, + "loss": 0.045, + "step": 4550 + }, + { + "epoch": 145.92, + "grad_norm": 9.042325019836426, + "learning_rate": 1.0752688172043011e-06, + "loss": 0.048, + "step": 4560 + }, + { + "epoch": 145.984, + "eval_accuracy": 0.8288288288288288, + "eval_loss": 0.793506383895874, + "eval_runtime": 9.2712, + "eval_samples_per_second": 47.89, + "eval_steps_per_second": 1.51, + "step": 4562 + }, + { + "epoch": 146.24, + "grad_norm": 6.015984535217285, + "learning_rate": 9.557945041816009e-07, + "loss": 0.0561, + "step": 4570 + }, + { + "epoch": 146.56, + "grad_norm": 13.623724937438965, + "learning_rate": 8.363201911589009e-07, + "loss": 0.0435, + "step": 4580 + }, + { + "epoch": 146.88, + "grad_norm": 7.4142842292785645, + "learning_rate": 7.168458781362007e-07, + "loss": 0.0353, + "step": 4590 + }, + { + "epoch": 146.976, + "eval_accuracy": 0.8288288288288288, + "eval_loss": 0.793285608291626, + "eval_runtime": 9.135, + "eval_samples_per_second": 48.604, + "eval_steps_per_second": 1.533, + "step": 4593 + }, + { + "epoch": 147.2, + "grad_norm": 15.34078598022461, + "learning_rate": 5.973715651135007e-07, + "loss": 0.0294, + "step": 4600 + }, + { + "epoch": 147.52, + "grad_norm": 5.058185577392578, + "learning_rate": 4.778972520908004e-07, + "loss": 0.0447, + "step": 4610 + }, + { + "epoch": 147.84, + "grad_norm": 14.668283462524414, + "learning_rate": 3.5842293906810036e-07, + "loss": 0.0438, + "step": 4620 + }, + { + "epoch": 148.0, + "eval_accuracy": 0.8288288288288288, + "eval_loss": 0.7915613651275635, + "eval_runtime": 9.1582, + "eval_samples_per_second": 48.481, + "eval_steps_per_second": 1.529, + "step": 4625 + }, + { + "epoch": 148.16, + "grad_norm": 8.498198509216309, + "learning_rate": 2.389486260454002e-07, + "loss": 0.0458, + "step": 4630 + }, + { + "epoch": 148.48, + "grad_norm": 3.9683709144592285, + "learning_rate": 1.194743130227001e-07, + "loss": 0.0298, + "step": 4640 + }, + { + "epoch": 148.8, + "grad_norm": 21.15007209777832, + "learning_rate": 0.0, + "loss": 0.0487, + "step": 4650 + }, + { + "epoch": 148.8, + "eval_accuracy": 0.8288288288288288, + "eval_loss": 0.792281985282898, + "eval_runtime": 9.2874, + "eval_samples_per_second": 47.806, + "eval_steps_per_second": 1.507, + "step": 4650 }, { - "epoch": 86.66666666666667, - "step": 130, - "total_flos": 1.574865655328932e+17, - "train_loss": 0.3546037518061124, - "train_runtime": 225.362, - "train_samples_per_second": 41.533, - "train_steps_per_second": 0.577 + "epoch": 148.8, + "step": 4650, + "total_flos": 1.4980450073050055e+19, + "train_loss": 0.38013411539216196, + "train_runtime": 17110.8333, + "train_samples_per_second": 35.03, + "train_steps_per_second": 0.272 } ], "logging_steps": 10, - "max_steps": 130, + "max_steps": 4650, "num_input_tokens_seen": 0, - "num_train_epochs": 130, + "num_train_epochs": 150, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { @@ -909,7 +4631,7 @@ "attributes": {} } }, - "total_flos": 1.574865655328932e+17, + "total_flos": 1.4980450073050055e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null